In [None]:
# --- Classical Baselines for Fake-News Detection (Notebook A) ---
# Safe to run in Google Colab. Includes:
# - Data loading (FakeNewsNet Kaggle CSVs)
# - Cleaning & stratified splits (70/15/15)
# - TF-IDF (word + char)
# - Models: NB, Logistic, Linear SVM (calibrated), Random Forest, XGBoost
# - Metrics: Macro-F1, PR-AUC, ROC-AUC
# - Explainability: top coefficients (LogReg) + SHAP for XGBoost
# - Saving artifacts to Google Drive

!pip -q install scikit-learn xgboost lightgbm shap tqdm

import os, re, time, joblib, json
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    f1_score, roc_auc_score, average_precision_score, classification_report
)
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
import shap
import matplotlib.pyplot as plt


In [None]:
# Mount Google Drive at /content/drive (Colab-only); the project directories used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Project layout on the mounted Drive: raw CSVs, fitted models and figures.
BASE_DIR = "/content/drive/MyDrive/FakeNewsDetector"
DATA_DIR = os.path.join(BASE_DIR, "data")
MODELS_DIR = os.path.join(BASE_DIR, "models", "classical")
FIGS_DIR = os.path.join(BASE_DIR, "figs")

# Create every output directory up front; a no-op when it already exists.
for _dir in (DATA_DIR, MODELS_DIR, FIGS_DIR):
    os.makedirs(_dir, exist_ok=True)

# Expected input files (fake / real for each of the two sources).
P_FK = os.path.join(DATA_DIR, "PolitiFact_fake_news_content.csv")
P_RL = os.path.join(DATA_DIR, "PolitiFact_real_news_content.csv")
G_FK = os.path.join(DATA_DIR, "BuzzFeed_fake_news_content.csv")
G_RL = os.path.join(DATA_DIR, "BuzzFeed_real_news_content.csv")

# Report which inputs are absent, or list the data directory when all are present.
missing = [p for p in (P_FK, P_RL, G_FK, G_RL) if not os.path.exists(p)]
if not missing:
    print("All files found in:", DATA_DIR)
    for f in os.listdir(DATA_DIR):
        print("   ", f)
else:
    print(" Missing file(s):")
    for m in missing:
        print("   ", m)
    print("\nUpload the files to:", DATA_DIR)

All files found in: /content/drive/MyDrive/FakeNewsDetector/data
    BuzzFeed_real_news_content.csv
    BuzzFeed_fake_news_content.csv
    PolitiFact_real_news_content.csv
    PolitiFact_fake_news_content.csv


In [None]:
# Simple text cleaner for classical TF-IDF features
URL_RE = re.compile(r'https?://\S+|www\.\S+')
# [^<>]* instead of .*? so a tag may wrap across line breaks (e.g. "<a\nhref=...>")
# while a match can never run past the next angle bracket.
HTML_RE = re.compile(r'<[^<>]*>')

def clean_text(s: str) -> str:
    """
    Normalise one document for TF-IDF: strip HTML tags and URLs, collapse
    whitespace, and lowercase.

    Parameters
    ----------
    s : the raw text. Any non-string value (None, NaN, numbers) yields "".

    Returns
    -------
    The cleaned, lowercased, single-spaced string (possibly empty).
    """
    if not isinstance(s, str):
        return ""
    # Tags are removed before URLs so that a URL glued to a tag
    # ("http://x.y<br>") is not extended over the tag by \S+.
    s = HTML_RE.sub(' ', s)
    s = URL_RE.sub(' ', s)
    # \s+ already covers \n and \r, so no separate newline replacement is needed.
    return re.sub(r'\s+', ' ', s).strip().lower()

def load_fakenewsnet_csvs():
    """
    Load the FakeNewsNet Kaggle mirror CSVs into one cleaned, de-duplicated DataFrame.

    Reads whichever of the four module-level paths (P_FK, P_RL, G_FK, G_RL) exist.
    For each file the text column is chosen by trying a list of common names, then
    any column whose name contains 'text' or 'content', then 'title', and finally
    the first object-dtype column.

    Returns
    -------
    DataFrame with columns [text_raw, label, text]:
        text_raw : the original cell content as a string
        label    : 1 = real, 0 = fake
        text     : text_raw passed through clean_text
    Rows with missing or empty text, and rows whose cleaned text duplicates an
    earlier row, are removed.

    Raises
    ------
    FileNotFoundError : none of the four CSVs exists.
    ValueError        : a CSV has no column usable as text.
    """
    def load_one(path, label_value):
        df = pd.read_csv(path)
        # Heuristic: find a text column
        preferred = ['text','content','news_text','article_content','body','content_text']
        text_col = next((c for c in preferred if c in df.columns), None)
        if text_col is None:
            # Fall back to any text-like column, then the title
            candidates = [
                c for c in df.columns
                if 'text' in str(c).lower() or 'content' in str(c).lower()
            ]
            if candidates:
                text_col = candidates[0]
            elif 'title' in df.columns:
                text_col = 'title'
            else:
                # last resort: first object column
                object_cols = df.select_dtypes(include=['object']).columns
                if len(object_cols) == 0:
                    raise ValueError(f"No usable text column found in {path}")
                text_col = object_cols[0]
        # Drop missing cells BEFORE casting: astype(str) would turn NaN into the
        # literal string "nan", which would then survive as a bogus document.
        raw = df[text_col].dropna().astype(str)
        return pd.DataFrame({
            'text_raw': raw,
            'label': label_value
        })

    # label mapping: real=1, fake=0
    sources = [(P_FK, 0), (P_RL, 1), (G_FK, 0), (G_RL, 1)]
    frames = [load_one(path, label) for path, label in sources if os.path.exists(path)]

    if not frames:
        raise FileNotFoundError("No FakeNewsNet CSVs found. Please place them in DATA_DIR.")

    df = pd.concat(frames, ignore_index=True)
    df['text'] = df['text_raw'].map(clean_text)
    # clean_text never returns NaN, so "no content" has to be detected as an empty string.
    df = df[df['text'] != ""]
    df = df.drop_duplicates(subset=['text']).reset_index(drop=True)
    return df

# Build the corpus once, then print a small preview and the class distribution.
df = load_fakenewsnet_csvs()

preview = df.head(3)
label_counts = df['label'].value_counts()

print(preview)
print("Class balance:\n", label_counts)
print("Total samples:", df.shape[0])


                                            text_raw  label  \
0  16.8k SHARES SHARE THIS STORY\n\nHillary Clint...      0   
1  Famous dog killed in spot she waited a year fo...      0   
2  Story highlights The House Oversight panel vot...      0   

                                                text  
0  16.8k shares share this story hillary clinton ...  
1  famous dog killed in spot she waited a year fo...  
2  story highlights the house oversight panel vot...  
Class balance:
 label
0    200
1     89
Name: count, dtype: int64
Total samples: 289


In [None]:
RANDOM_SEED = 42

# Stratified 70/15/15 split, done in two steps.
# First split train vs temp (val+test)
df_train, df_temp = train_test_split(
    df, test_size=0.30, stratify=df['label'], random_state=RANDOM_SEED
)

# Split temp into val and test equally (0.15 each of total)
df_val, df_test = train_test_split(
    df_temp, test_size=0.5, stratify=df_temp['label'], random_state=RANDOM_SEED
)

# Every row must land in exactly one split.
assert len(df_train) + len(df_val) + len(df_test) == len(df)

print("Train:", df_train.shape, "Val:", df_val.shape, "Test:", df_test.shape)

# Save splits for reproducibility (used by later notebooks).
# Counts alone cannot reproduce a split, so the row indices are stored too;
# they refer to row labels of `df` as built above.
SPLITS_PATH = os.path.join(DATA_DIR, "splits_classical.json")
with open(SPLITS_PATH, "w") as f:
    json.dump({
        "random_seed": RANDOM_SEED,
        "n_train": len(df_train),
        "n_val": len(df_val),
        "n_test": len(df_test),
        "train_idx": df_train.index.tolist(),
        "val_idx": df_val.index.tolist(),
        "test_idx": df_test.index.tolist()
    }, f, indent=2)

# X/y arrays
X_train, y_train = df_train['text'].values, df_train['label'].values
X_val,   y_val   = df_val['text'].values,   df_val['label'].values
X_test,  y_test  = df_test['text'].values,  df_test['label'].values


Train: (202, 3) Val: (43, 3) Test: (44, 3)


In [None]:
# Feature extraction: a word n-gram TF-IDF view and a character n-gram TF-IDF view,
# concatenated column-wise into one sparse matrix per split.
from scipy.sparse import hstack

word_vectorizer = TfidfVectorizer(
    ngram_range=(1,3),
    min_df=2,
    max_features=100_000,
    lowercase=True,
)
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3,5),
    min_df=2,
    lowercase=True,
)

# Vocabulary and IDF weights are learned from TRAIN only (avoid leakage);
# validation and test are only transformed.
Xtr_word = word_vectorizer.fit_transform(X_train)
Xtr_char = char_vectorizer.fit_transform(X_train)

Xval_word, Xte_word = (word_vectorizer.transform(part) for part in (X_val, X_test))
Xval_char, Xte_char = (char_vectorizer.transform(part) for part in (X_val, X_test))

# Column order is word features first, then char features.
Xtr = hstack((Xtr_word, Xtr_char)).tocsr()
Xval = hstack((Xval_word, Xval_char)).tocsr()
Xte = hstack((Xte_word, Xte_char)).tocsr()

print("TF-IDF shapes -> Train:", Xtr.shape, "| Val:", Xval.shape, "| Test:", Xte.shape)

# Persist both fitted vectorizers next to the models.
joblib.dump(word_vectorizer, os.path.join(MODELS_DIR, "tfidf_word.joblib"))
joblib.dump(char_vectorizer, os.path.join(MODELS_DIR, "tfidf_char.joblib"))


TF-IDF shapes -> Train: (202, 95458) | Val: (43, 95458) | Test: (44, 95458)


['/content/drive/MyDrive/FakeNewsDetector/models/classical/tfidf_char.joblib']

In [None]:
def _score_or_nan(metric_fn, y_true, scores):
    """Return metric_fn(y_true, scores); on ValueError emit a warning and return NaN."""
    import warnings
    try:
        return metric_fn(y_true, scores)
    except ValueError as exc:
        # Surface the reason instead of silently producing a blank leaderboard cell.
        warnings.warn(
            f"{getattr(metric_fn, '__name__', 'metric')} could not be computed: {exc}",
            RuntimeWarning,
        )
        return np.nan

def evaluate_binary(y_true, scores, preds):
    """
    Compute the three headline metrics for a binary classifier.

    Parameters
    ----------
    y_true : ground-truth labels.
    scores : continuous score for class 1 (real): a probability or a
             decision-function value. Used for the two ranking metrics.
    preds  : hard predictions {0,1}. Used for Macro-F1.

    Returns
    -------
    (macro_f1, pr_auc, roc_auc). pr_auc / roc_auc are NaN, with a RuntimeWarning,
    when the underlying scikit-learn call raises ValueError; any other exception
    propagates.
    """
    f1 = f1_score(y_true, preds, average='macro')
    prauc = _score_or_nan(average_precision_score, y_true, scores)
    roc = _score_or_nan(roc_auc_score, y_true, scores)
    return f1, prauc, roc

results = []
def log_result(model_name, f1, prauc, roc, fit_s, infer_ms_per_doc, model_obj=None):
    """
    Record one model's validation metrics in the module-level `results` list.

    Logging is idempotent per model: if an entry with the same `model_name`
    already exists (e.g. the training cell was re-run) it is replaced rather
    than duplicated.

    Parameters
    ----------
    model_name       : label stored under "model".
    f1, prauc, roc   : metric values; NaN prauc/roc are stored as None.
    fit_s            : training time in seconds (rounded to 2 decimals).
    infer_ms_per_doc : inference latency per document in ms (rounded to 2 decimals).
    model_obj        : optional fitted estimator; only its `n_estimators`
                       attribute (if any) is read.
    """
    def _round_or_none(value):
        return None if np.isnan(value) else round(float(value), 4)

    row = {
        "model": model_name,
        "macro_f1": round(float(f1), 4),
        "pr_auc": _round_or_none(prauc),
        "roc_auc": _round_or_none(roc),
        "train_time_s": round(fit_s, 2),
        "latency_ms_per_doc": round(infer_ms_per_doc, 2),
        # NOTE: despite the key name this is the estimator count (n_estimators),
        # not a parameter count; the key is kept so the saved CSV schema is unchanged.
        "n_params": getattr(model_obj, 'n_estimators', None)
    }
    # Mutate in place so every reference to `results` sees the update.
    results[:] = [r for r in results if r["model"] != model_name]
    results.append(row)


In [None]:
# Multinomial Naive Bayes baseline.
# NB expects non-negative features; TF-IDF is fine.
nb = MultinomialNB(alpha=0.5)

# perf_counter() is monotonic and high-resolution; time.time() is wall-clock
# and too coarse/unstable for the sub-millisecond latencies measured here.
t0 = time.perf_counter()
nb.fit(Xtr, y_train)
fit_s = time.perf_counter() - t0

# Inference timing on validation (probabilities + 0.5 thresholding), per document in ms
t1 = time.perf_counter()
val_proba = nb.predict_proba(Xval)[:, 1]
val_pred = (val_proba >= 0.5).astype(int)
lat_ms = (time.perf_counter() - t1) / len(y_val) * 1000

f1, prauc, roc = evaluate_binary(y_val, val_proba, val_pred)
log_result("NaiveBayes", f1, prauc, roc, fit_s, lat_ms, nb)

# Save
joblib.dump(nb, os.path.join(MODELS_DIR, "nb.joblib"))
print("NB Val -> Macro-F1:", f1, "PR-AUC:", prauc, "ROC-AUC:", roc)


NB Val -> Macro-F1: 0.410958904109589 PR-AUC: 0.4352129787357305 ROC-AUC: 0.523076923076923


In [None]:
# L2-regularised logistic regression with class re-weighting.
logreg = LogisticRegression(
    C=3.0, penalty='l2', solver='liblinear', max_iter=200, class_weight='balanced'
)

# perf_counter() is monotonic and high-resolution; time.time() is wall-clock
# and too coarse/unstable for the sub-millisecond latencies measured here.
t0 = time.perf_counter()
logreg.fit(Xtr, y_train)
fit_s = time.perf_counter() - t0

# Inference timing on validation (probabilities + 0.5 thresholding), per document in ms
t1 = time.perf_counter()
val_proba = logreg.predict_proba(Xval)[:, 1]
val_pred = (val_proba >= 0.5).astype(int)
lat_ms = (time.perf_counter() - t1) / len(y_val) * 1000

f1, prauc, roc = evaluate_binary(y_val, val_proba, val_pred)
log_result("LogisticRegression", f1, prauc, roc, fit_s, lat_ms, logreg)

joblib.dump(logreg, os.path.join(MODELS_DIR, "logreg.joblib"))
print("LogReg Val -> Macro-F1:", f1, "PR-AUC:", prauc, "ROC-AUC:", roc)


LogReg Val -> Macro-F1: 0.43790849673202614 PR-AUC: 0.3157911754805104 ROC-AUC: 0.5307692307692308


In [None]:
# LinearSVC does not have predict_proba; we wrap it with CalibratedClassifierCV to get calibrated probabilities.
base_svm = LinearSVC(C=1.0, loss='squared_hinge', class_weight='balanced', max_iter=5000)
svm = CalibratedClassifierCV(base_svm, method='sigmoid', cv=3)

# perf_counter() is monotonic and high-resolution; time.time() is wall-clock
# and too coarse/unstable for the sub-millisecond latencies measured here.
# Note the fit time covers the 3-fold calibration wrapper, not a single SVM fit.
t0 = time.perf_counter()
svm.fit(Xtr, y_train)
fit_s = time.perf_counter() - t0

# Inference timing on validation (probabilities + 0.5 thresholding), per document in ms
t1 = time.perf_counter()
val_proba = svm.predict_proba(Xval)[:, 1]
val_pred = (val_proba >= 0.5).astype(int)
lat_ms = (time.perf_counter() - t1) / len(y_val) * 1000

f1, prauc, roc = evaluate_binary(y_val, val_proba, val_pred)
log_result("LinearSVM+Calibrated", f1, prauc, roc, fit_s, lat_ms, svm)

joblib.dump(svm, os.path.join(MODELS_DIR, "svm_calibrated.joblib"))
print("SVM Calibrated Val -> Macro-F1:", f1, "PR-AUC:", prauc, "ROC-AUC:", roc)


SVM Calibrated Val -> Macro-F1: 0.4027777777777778 PR-AUC: 0.29404894185535535 ROC-AUC: 0.49230769230769234


In [None]:
# Random forest on the sparse TF-IDF matrix, with per-bootstrap class re-weighting.
rf = RandomForestClassifier(
    n_estimators=600, max_depth=None, n_jobs=-1, class_weight='balanced_subsample', random_state=RANDOM_SEED
)

# perf_counter() is monotonic and high-resolution; time.time() is wall-clock
# and can jump if the system clock is adjusted during a long fit.
t0 = time.perf_counter()
rf.fit(Xtr, y_train)
fit_s = time.perf_counter() - t0

# Inference timing on validation (probabilities + 0.5 thresholding), per document in ms
t1 = time.perf_counter()
val_proba = rf.predict_proba(Xval)[:, 1]
val_pred = (val_proba >= 0.5).astype(int)
lat_ms = (time.perf_counter() - t1) / len(y_val) * 1000

f1, prauc, roc = evaluate_binary(y_val, val_proba, val_pred)
log_result("RandomForest", f1, prauc, roc, fit_s, lat_ms, rf)

joblib.dump(rf, os.path.join(MODELS_DIR, "random_forest.joblib"))
print("RF Val -> Macro-F1:", f1, "PR-AUC:", prauc, "ROC-AUC:", roc)


RF Val -> Macro-F1: 0.410958904109589 PR-AUC: 0.4354580277657201 ROC-AUC: 0.6000000000000001


In [None]:
# Gradient-boosted trees on the sparse TF-IDF matrix.
xgb = XGBClassifier(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    objective='binary:logistic',
    n_jobs=-1,
    eval_metric='auc',
    tree_method='hist',  # fast on CPU
    random_state=RANDOM_SEED
)

# perf_counter() is monotonic and high-resolution; time.time() is wall-clock
# and can jump if the system clock is adjusted during a long fit.
t0 = time.perf_counter()
xgb.fit(Xtr, y_train)
fit_s = time.perf_counter() - t0

# Inference timing on validation (probabilities + 0.5 thresholding), per document in ms
t1 = time.perf_counter()
val_proba = xgb.predict_proba(Xval)[:, 1]
val_pred = (val_proba >= 0.5).astype(int)
lat_ms = (time.perf_counter() - t1) / len(y_val) * 1000

f1, prauc, roc = evaluate_binary(y_val, val_proba, val_pred)
log_result("XGBoost", f1, prauc, roc, fit_s, lat_ms, xgb)

joblib.dump(xgb, os.path.join(MODELS_DIR, "xgboost.joblib"))
print("XGBoost Val -> Macro-F1:", f1, "PR-AUC:", prauc, "ROC-AUC:", roc)


XGBoost Val -> Macro-F1: 0.5234441602728048 PR-AUC: 0.41052727061776834 ROC-AUC: 0.5384615384615385


In [None]:
# Validation leaderboard: best Macro-F1 first, PR-AUC as the tie-breaker.
lb = pd.DataFrame(results)
lb = lb.sort_values(by=["macro_f1","pr_auc"], ascending=False)
lb = lb.reset_index(drop=True)
display(lb)

# Persist the ranking alongside the models.
lb_path = os.path.join(MODELS_DIR, "leaderboard_val.csv")
lb.to_csv(lb_path, index=False)
print("Saved leaderboard to:", lb_path)


Unnamed: 0,model,macro_f1,pr_auc,roc_auc,train_time_s,latency_ms_per_doc,n_params
0,XGBoost,0.5234,0.4105,0.5385,131.47,0.46,1000.0
1,LogisticRegression,0.4379,0.3158,0.5308,0.6,0.11,
2,RandomForest,0.411,0.4355,0.6,10.01,20.57,600.0
3,NaiveBayes,0.411,0.4352,0.5231,0.03,0.13,
4,LinearSVM+Calibrated,0.4028,0.294,0.4923,1.33,0.72,


Saved leaderboard to: /content/drive/MyDrive/FakeNewsDetector/models/classical/leaderboard_val.csv


In [None]:
# Take the first three rows of the validation leaderboard (sorted by Macro-F1, best first).
top3 = lb["model"].iloc[:3].tolist()
print("Top-3 models:", top3)

# Helper to load model by name
def load_model_by_name(name: str):
    """
    Load a persisted classifier from MODELS_DIR given its leaderboard name.

    Parameters
    ----------
    name : one of "NaiveBayes", "LogisticRegression", "LinearSVM+Calibrated",
           "RandomForest", "XGBoost".

    Returns
    -------
    The object returned by joblib.load for the matching file.

    Raises
    ------
    ValueError : `name` is not one of the known model names.
    """
    # Leaderboard name -> joblib filename inside MODELS_DIR.
    known_models = (
        ("NaiveBayes", "nb.joblib"),
        ("LogisticRegression", "logreg.joblib"),
        ("LinearSVM+Calibrated", "svm_calibrated.joblib"),
        ("RandomForest", "random_forest.joblib"),
        ("XGBoost", "xgboost.joblib"),
    )
    for model_name, filename in known_models:
        if name == model_name:
            return joblib.load(os.path.join(MODELS_DIR, filename))
    raise ValueError(f"Unknown model: {name}")

# Evaluate each selected model once on the held-out test split.
test_rows = []
for name in top3:
    mdl = load_model_by_name(name)
    # perf_counter() is monotonic and high-resolution, which suits these short timings.
    t0 = time.perf_counter()
    if hasattr(mdl, "predict_proba"):
        proba = mdl.predict_proba(Xte)[:, 1]
        preds = (proba >= 0.5).astype(int)
    else:
        # Calibrated SVM also has predict_proba; this is just a safe fallback.
        # decision_function returns a signed margin, not a probability, so the
        # class boundary is 0 (not 0.5).
        proba = mdl.decision_function(Xte)
        preds = (proba > 0).astype(int)
    lat_ms = (time.perf_counter() - t0) / len(y_test) * 1000

    f1, prauc, roc = evaluate_binary(y_test, proba, preds)
    test_rows.append({
        "model": name,
        "macro_f1": round(float(f1), 4),
        "pr_auc": round(float(prauc), 4),
        "roc_auc": round(float(roc), 4),
        "latency_ms_per_doc": round(lat_ms, 2)
    })

test_df = pd.DataFrame(test_rows).sort_values(by=["macro_f1","pr_auc"], ascending=False)
display(test_df)

test_path = os.path.join(MODELS_DIR, "leaderboard_test.csv")
test_df.to_csv(test_path, index=False)
print("Saved test leaderboard to:", test_path)


Top-3 models: ['XGBoost', 'LogisticRegression', 'RandomForest']


Unnamed: 0,model,macro_f1,pr_auc,roc_auc,latency_ms_per_doc
1,LogisticRegression,0.4545,0.3202,0.4976,0.05
2,RandomForest,0.4054,0.3696,0.5821,18.38
0,XGBoost,0.3714,0.2599,0.3643,1.38


Saved test leaderboard to: /content/drive/MyDrive/FakeNewsDetector/models/classical/leaderboard_test.csv


In [None]:
# Explainability for the linear model: list the n-grams with the largest positive
# (REAL, class 1) and most negative (FAKE, class 0) LogisticRegression coefficients.
logreg_path = os.path.join(MODELS_DIR, "logreg.joblib")
if not os.path.exists(logreg_path):
    print("LogReg model not found; skip coef inspection.")
else:
    lr = joblib.load(logreg_path)

    # Names are built word-first then char; "w::" / "c::" mark which vectorizer a feature came from.
    word_feats = [f"w::{t}" for t in word_vectorizer.get_feature_names_out()]
    char_feats = [f"c::{t}" for t in char_vectorizer.get_feature_names_out()]
    feats = np.array(word_feats + char_feats)

    coefs = lr.coef_.ravel()
    order = np.argsort(coefs)      # ascending: most negative first
    top_pos_idx = order[-20:]
    top_neg_idx = order[:20]

    print("\nTop indicators for REAL (class=1):")
    for i in top_pos_idx[::-1]:    # largest coefficient first
        print(f"{feats[i]:30s}  coef= {coefs[i]:.4f}")

    print("\nTop indicators for FAKE (class=0):")
    for i in top_neg_idx:
        print(f"{feats[i]:30s}  coef= {coefs[i]:.4f}")



Top indicators for REAL (class=1):
w::said                         coef= 0.6755
w::2015                         coef= 0.6659
w::conway                       coef= 0.6321
w::smile                        coef= 0.6156
w::know                         coef= 0.5892
w::rahami                       coef= 0.5857
w::johnson                      coef= 0.5782
w::cruz                         coef= 0.5225
w::she said                     coef= 0.5057
w::tax                          coef= 0.4749
w::returns                      coef= 0.4612
w::face                         coef= 0.4575
w::week                         coef= 0.4518
w::his                          coef= 0.4517
w::county                       coef= 0.4515
w::simmons                      coef= 0.4489
w::weapon                       coef= 0.4391
w::test                         coef= 0.4216
w::tax returns                  coef= 0.4215
w::nelson                       coef= 0.4180

Top indicators for FAKE (class=0):
w::to                       

In [None]:
# The vectorizers were saved earlier. Here the top row of the validation
# leaderboard is re-saved under a fixed "deployed" filename, plus a small manifest.
best_model_name = lb["model"].iloc[0]
best_model = load_model_by_name(best_model_name)

DEPLOY_PATH = os.path.join(MODELS_DIR, "best_model.joblib")
joblib.dump(best_model, DEPLOY_PATH)

# Manifest of what this notebook produced (file names are relative to MODELS_DIR).
ARTIFACTS = dict(
    best_model=best_model_name,
    vectorizers=["tfidf_word.joblib", "tfidf_char.joblib"],
    val_leaderboard="leaderboard_val.csv",
    test_leaderboard="leaderboard_test.csv",
)
artifacts_path = os.path.join(MODELS_DIR, "artifacts.json")
with open(artifacts_path, "w") as f:
    json.dump(ARTIFACTS, f, indent=2)

print("Saved best model & artifacts to:", MODELS_DIR)


Saved best model & artifacts to: /content/drive/MyDrive/FakeNewsDetector/models/classical
