In [None]:
# Cell 1 — Setup & Mount
!pip install -q transformers datasets accelerate scikit-learn requests

from google.colab import drive
drive.mount('/content/drive')

import os, re, json, time, requests, numpy as np, pandas as pd, torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
)

# Project layout on the mounted Drive
BASE_DIR = "/content/drive/MyDrive/FakeNewsDetector"
DATA_DIR = os.path.join(BASE_DIR, "data")
FINETUNE_DIR = os.path.join(BASE_DIR, "models", "finetuned_fnn_distilbert")
FIGS_DIR = os.path.join(BASE_DIR, "figs")

# Make sure the two output folders exist before anything tries to write to them.
for _out_dir in (FINETUNE_DIR, FIGS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Speed / telemetry hygiene
os.environ.update({
    "WANDB_DISABLED": "true",
    "HF_HUB_DISABLE_TELEMETRY": "1",
})
torch.backends.cuda.matmul.allow_tf32 = True  # safe speedup on Ampere+

# Pick the GPU when one is visible to torch, otherwise stay on CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print("Using device:", DEVICE)


Mounted at /content/drive
Using device: cpu


In [None]:
# Cell 2 — Load + Clean (PolitiFact + BuzzFeed only)
# Tag -> CSV file name. Tags are "<source initial>_<fake|real>"; the fake/real part of the
# tag is what the loader uses to assign the label.
files = {
    f"{source[0]}_{kind}": f"{source}_{kind}_news_content.csv"
    for source in ("PolitiFact", "BuzzFeed")
    for kind in ("fake", "real")
}

def load_one(tag, fname):
    """Read one CSV from DATA_DIR and return a two-column (text, label) frame.

    Args:
        tag: Short identifier for the file; if it contains "fake" (case-insensitive)
            every row is labelled 0, otherwise 1.
        fname: File name, resolved relative to DATA_DIR.

    Returns:
        DataFrame with a string `text` column and an integer `label` column.
        Rows whose text cell is missing are dropped.

    Raises:
        ValueError: if the file has neither a "text" nor a "content" column
            (column names are compared after lower-casing).
    """
    frame = pd.read_csv(os.path.join(DATA_DIR, fname))
    frame = frame.rename(columns=str.lower)

    # Prefer "text"; fall back to "content" when that is all the file offers.
    candidates = [name for name in ("text", "content") if name in frame.columns]
    if not candidates:
        raise ValueError(f"No text/content column in {fname}")
    txt_col = candidates[0]

    kept = frame[[txt_col]].dropna().copy()
    kept["text"] = kept[txt_col].astype(str)
    kept["label"] = int("fake" not in tag.lower())
    return kept[["text", "label"]]

# Load every (tag, file) pair in declaration order and stack the frames into one table.
parts = [load_one(*entry) for entry in files.items()]
df_raw = pd.concat(parts, ignore_index=True)
print("Loaded rows:", len(df_raw))

# Cleaning to remove publisher/URL shortcuts that inflate scores
def clean_text(t: str) -> str:
    """Normalize an article body to lowercase a-z words separated by single spaces.

    Applied in order, each match replaced by a space:
      1. URLs, @mentions and #hashtags,
      2. a fixed list of publisher names (matched anywhere, including inside words),
      3. every character that is not a lowercase letter or whitespace.
    Finally whitespace runs are collapsed and the ends are trimmed.
    """
    blank_out = (
        r"http\S+|www\S+|@\S+|#\S+",
        r"(cnn|fox|breitbart|snopes|reuters|bbc|nytimes|washington post|ap news|buzzfeed|politifact)",
        r"[^a-z\s]",
    )
    cleaned = t.lower()
    for pattern in blank_out:
        cleaned = re.sub(pattern, " ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()

# Clean every article, then keep only those with more than 50 characters left.
df = df_raw.assign(text=df_raw["text"].map(clean_text))
df = df.loc[df["text"].str.len() > 50].copy()

# Exact de-dup on normalized text to prevent cross-split overlap.
# `before` is taken after the length filter, so "removed" counts duplicates only.
before = len(df)
is_repeat = df["text"].str.replace(r"\s+", " ", regex=True).duplicated()
df = df.loc[~is_repeat].reset_index(drop=True)
print(f"After clean & dedup: {len(df)} (removed {before-len(df)})")
df.head()


Loaded rows: 422
After clean & dedup: 288 (removed 132)


Unnamed: 0,text,label
0,k shares share this story hillary clinton just...,0
1,famous dog killed in spot she waited a year fo...,0
2,story highlights the house oversight panel vot...,0
3,we are absolutely heartbroken to hear about th...,0
4,nine years ago a driver lost control of his pi...,0


In [None]:
# Cell 3 — Stratified, Leak-Free Split + Integrity Check
# 70/15/15 stratified split: carve off 30% first, then halve that into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    df["text"], df["label"], test_size=0.30, stratify=df["label"],
    random_state=42, shuffle=True
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp,
    random_state=42, shuffle=True
)

print("Split sizes:")
print("Train:", len(X_train), "Val:", len(X_val), "Test:", len(X_test))
print("Label balance (Real=1 fraction):",
      round(y_train.mean(),3), round(y_val.mean(),3), round(y_test.mean(),3))

# Overlap check (should be 0,0,0)
train_texts, val_texts, test_texts = set(X_train), set(X_val), set(X_test)
ov_tv = len(train_texts & val_texts)
ov_tt = len(train_texts & test_texts)
ov_vt = len(val_texts & test_texts)
print("Overlaps (Train∩Val, Train∩Test, Val∩Test):", ov_tv, ov_tt, ov_vt)

# Enforce the integrity check instead of only printing it: a leaked article would
# silently inflate validation/test scores, so stop the notebook here if one appears.
assert len(X_train) + len(X_val) + len(X_test) == len(df), "split lost or duplicated rows"
assert ov_tv == ov_tt == ov_vt == 0, "identical text appears in more than one split"


Split sizes:
Train: 201 Val: 43 Test: 44
Label balance (Real=1 fraction): 0.308 0.302 0.318
Overlaps (Train∩Val, Train∩Test, Val∩Test): 0 0 0


In [None]:
# Cell 4 — HF Dataset + Tokenization
tok_distil = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_fn(batch):
    """Tokenize the `text` field of a batch with the DistilBERT tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens, so all
    encoded examples have the same length.
    """
    encode_kwargs = dict(truncation=True, padding="max_length", max_length=256)
    return tok_distil(batch["text"], **encode_kwargs)

def mk_ds(x, y):
    """Build a Hugging Face `Dataset` with exactly two columns: `text` and `label`.

    Args:
        x: Texts (in this notebook, a pandas Series from train_test_split).
        y: Labels aligned with `x`.

    Returns:
        A `Dataset` holding the `text` and `label` columns only.
    """
    frame = pd.DataFrame({"text": x, "label": y})
    # The split Series carry a shuffled, non-range index. By default from_pandas would
    # store that index as an extra `__index_level_0__` column, which then travels through
    # map()/set_format() as a spurious feature; preserve_index=False drops it.
    return Dataset.from_pandas(frame, preserve_index=False)

# Wrap each split as a Dataset under the split name it is later looked up by.
split_inputs = {
    "train": (X_train, y_train),
    "validation": (X_val, y_val),
    "test": (X_test, y_test),
}
ds = DatasetDict({
    name: mk_ds(texts, labels) for name, (texts, labels) in split_inputs.items()
})

# Tokenize in batches, drop the raw strings, and have the datasets return torch tensors.
ds_tok = ds.map(tokenize_fn, batched=True)
ds_tok = ds_tok.remove_columns(["text"])
ds_tok.set_format("torch")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/201 [00:00<?, ? examples/s]

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

In [None]:
# Cell 5 — Metrics
def compute_metrics(pred):
    """Compute validation metrics from raw classifier logits.

    Args:
        pred: Object exposing `predictions` (logits of shape (n_samples, n_classes),
            or a tuple whose first element is the logits) and `label_ids`.

    Returns:
        dict with:
          - "macro_f1": macro-averaged F1 of the argmax predictions,
          - "roc_auc":  ROC AUC of the class-1 probability,
          - "pr_auc":   average precision of the class-1 probability.
    """
    logits = pred.predictions
    if isinstance(logits, tuple):
        # Some models return (logits, extra outputs...); only the logits are scored.
        logits = logits[0]
    logits = np.asarray(logits, dtype=np.float64)
    labels = pred.label_ids

    # Numerically stable softmax: subtracting the row max keeps exp() from overflowing
    # to inf (which would turn the probabilities into NaN) without changing the result.
    shifted = logits - logits.max(axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    probs = exp_shifted / exp_shifted.sum(axis=1, keepdims=True)

    preds = probs.argmax(axis=1)
    return {
        "macro_f1": f1_score(labels, preds, average="macro"),
        "roc_auc":  roc_auc_score(labels, probs[:,1]),
        "pr_auc":   average_precision_score(labels, probs[:,1]),
    }


In [None]:
# Cell 6 — DistilBERT Fine-Tune (LR sweep)
EPOCHS = 5
TRAIN_BS = 8
VAL_BS = 32
GRAD_ACCUM = 2
LR_GRID = [1e-5, 3e-5, 5e-5]
SWEEP_SEED = 42  # same seed for every LR so runs differ only in learning rate

distil_results = []
best_trainer = None   # trainer of the best run so far (holds its best-epoch weights)
best_key = None       # (macro_f1, roc_auc) of that run

for lr in LR_GRID:
    print(f"\n🔹 Training DistilBERT @ lr={lr}")
    # The classification head is randomly initialised inside from_pretrained, i.e. before
    # the Trainer applies its own seed, so seed torch here to make the head reproducible.
    torch.manual_seed(SWEEP_SEED)
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=2
    )

    args = TrainingArguments(
        output_dir=os.path.join(FINETUNE_DIR, f"distilbert_lr{lr}"),
        learning_rate=lr,
        per_device_train_batch_size=TRAIN_BS,
        per_device_eval_batch_size=VAL_BS,
        gradient_accumulation_steps=GRAD_ACCUM,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        warmup_ratio=0.06,
        eval_strategy="epoch",
        save_strategy="epoch",
        # Keep only the best (plus at most the latest) checkpoint per run instead of
        # one full checkpoint per epoch on Drive.
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        greater_is_better=True,
        logging_steps=50,
        seed=SWEEP_SEED,
        report_to="none"
    )

    # NOTE(review): the recorded run emitted a warning on this call; recent transformers
    # releases deprecate `tokenizer=` in favour of `processing_class=` — confirm the
    # installed version before switching.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds_tok["train"],
        eval_dataset=ds_tok["validation"],
        tokenizer=tok_distil,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    eval_res = trainer.evaluate(ds_tok["validation"])
    row = {"lr": lr}
    row.update({k.replace("eval_",""): float(v) for k,v in eval_res.items() if k.startswith("eval_")})
    distil_results.append(row)

    # Rank runs by macro-F1 and break ties with ROC AUC (macro-F1 can be identical
    # across runs when every run predicts a single class). Remember the winning
    # trainer so the model saved below really is the best one, not the last one.
    run_key = (row["macro_f1"], row["roc_auc"])
    if best_key is None or run_key > best_key:
        best_key, best_trainer = run_key, trainer

df_distil = pd.DataFrame(distil_results).sort_values(["macro_f1", "roc_auc"], ascending=False)
display(df_distil)

# Point `trainer` / `model` at the winning run so the save below, and any later cell that
# uses them, works with the selected model rather than the last LR in the grid.
trainer = best_trainer
model = trainer.model
best_lr = float(trainer.args.learning_rate)
print(f"✅ Best LR for DistilBERT: {best_lr}")
best_ckpt = os.path.join(FINETUNE_DIR, f"distilbert_best_lr{best_lr}")
trainer.save_model(best_ckpt)
tok_distil.save_pretrained(best_ckpt)



ðŸ”¹ Training DistilBERT @ lr=1e-05


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Macro F1,Roc Auc,Pr Auc
1,No log,0.617258,0.410959,0.571795,0.434335
2,No log,0.609769,0.410959,0.597436,0.379407
3,No log,0.606739,0.410959,0.612821,0.432168
4,0.621700,0.604829,0.410959,0.615385,0.440464
5,0.621700,0.604059,0.410959,0.630769,0.457955





ðŸ”¹ Training DistilBERT @ lr=3e-05


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Macro F1,Roc Auc,Pr Auc
1,No log,0.610421,0.410959,0.569231,0.371832
2,No log,0.604862,0.410959,0.594872,0.380993
3,No log,0.653416,0.410959,0.569231,0.351702
4,0.596600,0.608138,0.410959,0.597436,0.473055
5,0.596600,0.658583,0.410959,0.54359,0.354086





ðŸ”¹ Training DistilBERT @ lr=5e-05


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Macro F1,Roc Auc,Pr Auc
1,No log,0.61015,0.410959,0.597436,0.398595
2,No log,0.599416,0.410959,0.651282,0.420999
3,No log,0.65679,0.410959,0.628205,0.389938
4,0.582300,0.608232,0.410959,0.617949,0.392745
5,0.582300,0.702287,0.410959,0.610256,0.386066




Unnamed: 0,lr,loss,macro_f1,roc_auc,pr_auc,runtime,samples_per_second,steps_per_second
0,1e-05,0.617258,0.410959,0.571795,0.434335,28.1034,1.53,0.071
1,3e-05,0.610421,0.410959,0.569231,0.371832,29.6793,1.449,0.067
2,5e-05,0.61015,0.410959,0.597436,0.398595,29.4157,1.462,0.068


âœ… Best LR for DistilBERT: 1e-05


('/content/drive/MyDrive/FakeNewsDetector/models/finetuned_fnn_distilbert/distilbert_best_lr1e-05/tokenizer_config.json',
 '/content/drive/MyDrive/FakeNewsDetector/models/finetuned_fnn_distilbert/distilbert_best_lr1e-05/special_tokens_map.json',
 '/content/drive/MyDrive/FakeNewsDetector/models/finetuned_fnn_distilbert/distilbert_best_lr1e-05/vocab.txt',
 '/content/drive/MyDrive/FakeNewsDetector/models/finetuned_fnn_distilbert/distilbert_best_lr1e-05/added_tokens.json',
 '/content/drive/MyDrive/FakeNewsDetector/models/finetuned_fnn_distilbert/distilbert_best_lr1e-05/tokenizer.json')