In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [2]:
import os, glob
# (Optional) show where Colab can see them
print("True.csv hits:", glob.glob('/content/drive/MyDrive/**/True.csv', recursive=True)[:10])
print("Fake.csv hits:", glob.glob('/content/drive/MyDrive/**/Fake.csv', recursive=True)[:10])

# If they are in the *root* of My Drive (what your link suggests), use:
TRUE_PATH = '/content/drive/MyDrive/True.csv'
FAKE_PATH = '/content/drive/MyDrive/Fake.csv'

# If they are in a subfolder, change these accordingly, e.g.:
# TRUE_PATH = '/content/drive/MyDrive/EECS4412pt1/News-_dataset/True.csv'
# FAKE_PATH = '/content/drive/MyDrive/EECS4412pt1/News-_dataset/Fake.csv'

print("TRUE exists? ", os.path.exists(TRUE_PATH), "->", TRUE_PATH)
print("FAKE exists? ", os.path.exists(FAKE_PATH), "->", FAKE_PATH)


True.csv hits: ['/content/drive/MyDrive/True.csv']
Fake.csv hits: ['/content/drive/MyDrive/Fake.csv']
TRUE exists?  True -> /content/drive/MyDrive/True.csv
FAKE exists?  True -> /content/drive/MyDrive/Fake.csv


In [3]:
from pathlib import Path

TRUE_CSV = Path(TRUE_PATH)
FAKE_CSV = Path(FAKE_PATH)

assert TRUE_CSV.exists(), f"Missing: {TRUE_CSV}"
assert FAKE_CSV.exists(), f"Missing: {FAKE_CSV}"

print("Using:", TRUE_CSV, FAKE_CSV)


Using: /content/drive/MyDrive/True.csv /content/drive/MyDrive/Fake.csv


In [4]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

plt.rcParams["figure.dpi"] = 120

ROOT = Path('/content')
FIG_DIR = ROOT / 'figs_colab'
ART_DIR = ROOT / 'artifacts_colab'
FIG_DIR.mkdir(parents=True, exist_ok=True)
ART_DIR.mkdir(parents=True, exist_ok=True)


In [5]:
def choose_text_col(df: pd.DataFrame) -> str:
    for c in ["text", "content", "article", "body", "message", "full_text"]:
        if c in df.columns: return c
    if "title" in df.columns and "text" in df.columns:
        df["__tmp_text__"] = (df["title"].astype(str) + " " + df["text"].astype(str)).str.strip()
        return "__tmp_text__"
    obj_cols = [c for c in df.columns if df[c].dtype == "object"]
    if obj_cols:
        lens = {c: df[c].astype(str).str.len().mean() for c in obj_cols}
        return max(lens, key=lens.get)
    raise ValueError("Could not infer text column")

df_true = pd.read_csv(TRUE_CSV, low_memory=False)
df_fake = pd.read_csv(FAKE_CSV, low_memory=False)

t_true = choose_text_col(df_true)
t_fake = choose_text_col(df_fake)

df_true = df_true.rename(columns={t_true: "text"}).assign(label="true")
df_fake = df_fake.rename(columns={t_fake: "text"}).assign(label="fake")
df = pd.concat([df_true, df_fake], ignore_index=True)

# Add optional columns if missing
for c in ["title", "subject", "date"]:
    if c not in df.columns:
        df[c] = np.nan

# Clean
df["text"] = df["text"].astype(str).str.strip()
df = df.dropna(subset=["text", "label"])
df = df[df["text"].str.len() > 0].reset_index(drop=True)

print("Shape:", df.shape)
df.head(3)


Shape: (44267, 5)


Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True


In [6]:
# Engineer numeric features for distributions/correlations
df["char_len"] = df["text"].str.len()
df["word_count"] = df["text"].str.split().apply(len)
df["exclam_count"] = df["text"].str.count("!")
df["question_count"] = df["text"].str.count(r"\?")
df["digit_count"] = df["text"].str.count(r"\d")

# Parse date if present
dt = pd.to_datetime(df["date"], errors="coerce")
df["year"] = dt.dt.year
df["month"] = dt.dt.month

# Missingness
missing = df.isna().sum().sort_values(ascending=False)
missing_pct = (missing/len(df)*100).round(2)
miss_table = pd.DataFrame({"missing": missing, "missing_%": missing_pct})
display(miss_table.head(20))

# Numeric summary
num = df.select_dtypes(include=np.number)
display(num.describe().T)

# Categorical quick looks
for c in ["subject", "label", "year", "month"]:
    if c in df.columns:
        print(f"\n{c} top-15:")
        print(df[c].value_counts(dropna=False).head(15))


Unnamed: 0,missing,missing_%
month,22851,51.62
year,22851,51.62
title,0,0.0
text,0,0.0
date,0,0.0
subject,0,0.0
label,0,0.0
char_len,0,0.0
exclam_count,0,0.0
word_count,0,0.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
char_len,44267.0,2503.470283,2166.841665,5.0,1292.0,2208.0,3125.0,51793.0
word_count,44267.0,411.059344,350.387794,1.0,213.0,365.0,516.0,8135.0
exclam_count,44267.0,0.413446,1.465218,0.0,0.0,0.0,0.0,133.0
question_count,44267.0,0.688278,1.805737,0.0,0.0,0.0,1.0,94.0
digit_count,44267.0,16.37999,26.457382,0.0,2.0,9.0,21.0,1396.0
year,21416.0,2016.779838,0.414366,2016.0,2017.0,2017.0,2017.0,2017.0
month,21416.0,8.245844,3.345961,1.0,6.0,9.0,11.0,12.0



subject top-15:
subject
politicsNews       11271
worldnews          10145
News                9050
politics            6433
left-news           4309
Government News     1498
US_News              783
Middle-east          778
Name: count, dtype: int64

label top-15:
label
fake    22851
true    21416
Name: count, dtype: int64

year top-15:
year
NaN       22851
2017.0    16701
2016.0     4715
Name: count, dtype: int64

month top-15:
month
NaN     22851
11.0     3733
10.0     3508
9.0      3425
12.0     2958
3.0      1174
2.0      1023
1.0       996
6.0       974
4.0       926
8.0       922
5.0       899
7.0       878
Name: count, dtype: int64


In [11]:
# Directory to save the table
FIG_DIR = Path(FIG_DIR) if "FIG_DIR" in globals() else Path("/content/figs_colab")
FIG_DIR.mkdir(parents=True, exist_ok=True)

# Counts and percentages
counts = df["label"].value_counts(dropna=False)
n_total = int(counts.sum())
perc = (counts / n_total * 100).round(2)

# Build a clean table
class_table = pd.DataFrame({
    "count": counts.astype(int),
    "percent": perc
}).rename_axis("label").reset_index()

# Ensure both labels show up even if one is missing
for lab in ["fake", "true"]:
    if lab not in class_table["label"].values:
        class_table.loc[len(class_table)] = [lab, 0, 0.00]

# Order rows: fake, true
class_table = class_table.set_index("label").loc[["fake", "true"]].reset_index()

# Display in Colab and save to CSV
display(class_table.style.format({"percent": "{:.2f}%"}))
class_table.to_csv(FIG_DIR / "class_balance_table.csv", index=False)

print(f"Saved class balance table to: {FIG_DIR/'class_balance_table.csv'}")

Unnamed: 0,label,count,percent
0,fake,22851,51.62%
1,true,21416,48.38%


Saved class balance table to: /content/figs_colab/class_balance_table.csv


In [7]:
def save_hist(series, title, outname, bins=30):
    plt.figure()
    series.dropna().hist(bins=bins)
    plt.title(title)
    plt.xlabel(series.name); plt.ylabel("count")
    plt.tight_layout(); plt.savefig(FIG_DIR/outname); plt.close()

for col in ["char_len", "word_count", "exclam_count", "question_count", "digit_count"]:
    save_hist(df[col], f"Histogram: {col}", f"hist_{col}.png")

# Class balance
plt.figure()
df["label"].value_counts().plot(kind="bar")
plt.title("Class balance"); plt.xlabel("label"); plt.ylabel("count")
plt.tight_layout(); plt.savefig(FIG_DIR/"class_balance.png"); plt.close()

# Subject distribution (if present)
if df["subject"].notna().any():
    plt.figure()
    df["subject"].value_counts().head(15).plot(kind="bar")
    plt.title("Distribution: subject (top 15)")
    plt.xlabel("subject"); plt.ylabel("count")
    plt.tight_layout(); plt.savefig(FIG_DIR/"dist_subject_top15.png"); plt.close()

"Saved plots to " + str(FIG_DIR)


'Saved plots to /content/figs_colab'

In [None]:
# Duplicates analysis
import matplotlib.pyplot as plt

assert "text" in df.columns and "label" in df.columns, "df must have 'text' and 'label'"

def normalize_text(s: pd.Series) -> pd.Series:
    s = s.astype(str).str.lower().str.strip()
    return s.str.replace(r"\s+", " ", regex=True)  # collapse whitespace

# Normalize and flag dup rows
df["text_norm"] = normalize_text(df["text"])
df["is_dup_text"] = df["text_norm"].duplicated(keep=False)  # True for every row in a dup group

# Overall
n_total = len(df)
n_dup_rows = int(df["is_dup_text"].sum())
pct_dup_rows = round(100 * n_dup_rows / n_total, 2)
print(f"\n=== Duplicates (normalized text) ===")
print(f"Total rows: {n_total}")
print(f"Duplicate rows: {n_dup_rows} ({pct_dup_rows}%)")

# Per-label breakdown
dup_by_label = (df.groupby("label")["is_dup_text"]
                  .agg(count_dup_rows="sum", total="count"))
dup_by_label["pct_dup_rows"] = (100 * dup_by_label["count_dup_rows"] /
                                dup_by_label["total"]).round(2)
print("\nPer-label duplicate rows:")
display(dup_by_label)

# Number of duplicate *groups* (unique duplicated texts)
dup_groups = df.loc[df["is_dup_text"], "text_norm"].value_counts()
n_dup_groups = int((dup_groups >= 2).sum())
print(f"\nDuplicate groups (unique duplicated texts): {n_dup_groups}")

# chart for % duplicates by label
plt.figure()
dup_by_label["pct_dup_rows"].plot(kind="bar")
plt.title("Duplicate rows (%) by label")
plt.xlabel("label"); plt.ylabel("% duplicate rows")
plt.show()

In [8]:
num = df.select_dtypes(include=np.number)
if num.shape[1] >= 2:
    corr = num.corr(numeric_only=True)
    display(corr)

    # Top-3 strongest pairs by absolute correlation
    tri = (corr.where(~np.eye(corr.shape[0], dtype=bool))
              .abs().unstack().dropna().sort_values(ascending=False))
    for (a,b) in list(dict.fromkeys(tri.index))[:3]:
        plt.figure()
        plt.scatter(num[a], num[b], s=10, alpha=0.4)
        plt.title(f"Scatter: {a} vs {b}")
        plt.xlabel(a); plt.ylabel(b)
        plt.tight_layout(); plt.savefig(FIG_DIR/f"scatter_{a}_vs_{b}.png"); plt.close()
else:
    print("Not enough numeric columns for correlations.")


Unnamed: 0,char_len,word_count,exclam_count,question_count,digit_count,year,month
char_len,1.0,0.995875,0.098378,0.318972,0.547673,-0.097931,-0.054673
word_count,0.995875,1.0,0.077041,0.318599,0.530114,-0.091544,-0.044928
exclam_count,0.098378,0.077041,1.0,0.355768,0.191449,-0.001366,-0.023088
question_count,0.318972,0.318599,0.355768,1.0,0.27238,-0.033187,-0.021193
digit_count,0.547673,0.530114,0.191449,0.27238,1.0,-0.061215,0.003561
year,-0.097931,-0.091544,-0.001366,-0.033187,-0.061215,1.0,0.241729
month,-0.054673,-0.044928,-0.023088,-0.021193,0.003561,0.241729,1.0


In [10]:
# Per-label histograms (True vs Fake on x-axis)
import matplotlib.pyplot as plt
from pathlib import Path

FIG_DIR = Path(FIG_DIR) if "FIG_DIR" in globals() else Path("/content/figs_colab")
FIG_DIR.mkdir(parents=True, exist_ok=True)

def hist_by_label(col, bins=40):
    fake_vals = df.loc[df["label"]=="fake", col].dropna()
    true_vals = df.loc[df["label"]=="true", col].dropna()

    # Overlaid histogram with legend for True vs Fake
    plt.figure()
    plt.hist(fake_vals, bins=bins, alpha=0.5, label="fake")
    plt.hist(true_vals, bins=bins, alpha=0.5, label="true")
    plt.title(f"{col}  overlaid")
    plt.xlabel(col); plt.ylabel("count")
    plt.legend(title="label")
    plt.tight_layout()
    plt.savefig(FIG_DIR / f"hist_{col}_by_label_overlaid.png", dpi=150)
    plt.close()

# Generate per-label histograms for each feature that is being analyzed
for col in ["char_len", "word_count", "exclam_count", "question_count", "digit_count"]:
    if col in df.columns:
        hist_by_label(col, bins=40)

# Keep your class balance plot (x-axis explicitly says True or Fake)
plt.figure()
df["label"].value_counts().plot(kind="bar")
plt.title("Class balance")
plt.xlabel("True or Fake"); plt.ylabel("count")
plt.tight_layout()
plt.savefig(FIG_DIR / "class_balance.png", dpi=150)
plt.close()

# Subject distribution (if present)
if "subject" in df.columns and df["subject"].notna().any():
    plt.figure()
    df["subject"].value_counts().head(15).plot(kind="bar")
    plt.title("Distribution: subject (top 15)")
    plt.xlabel("subject"); plt.ylabel("count")
    plt.tight_layout()
    plt.savefig(FIG_DIR / "dist_subject_top15.png", dpi=150)
    plt.close()

print("Saved plots to:", FIG_DIR)

Saved plots to: /content/figs_colab


In [9]:
# Save small artifacts for your report
keep = [c for c in ["title","subject","date","text","label","char_len","word_count",
                    "exclam_count","question_count","digit_count","year","month"]
        if c in df.columns]
df.head(1000)[keep].to_csv(ART_DIR/"sample_1000.csv", index=False)
with open(ART_DIR/"schema.json", "w", encoding="utf-8") as f:
    json.dump({"rows": int(df.shape[0]), "cols": int(df.shape[1]),
               "columns": {c: str(t) for c,t in df.dtypes.items()}}, f, indent=2)

# Zip figures for quick download
import shutil
zip_path = '/content/figs_colab.zip'
shutil.make_archive('/content/figs_colab', 'zip', str(FIG_DIR))
print("Artifacts:", ART_DIR)
print("Zipped figures:", zip_path)


Artifacts: /content/artifacts_colab
Zipped figures: /content/figs_colab.zip
