# Fake News Detector using BERT and PyTorch

- BERT-based fake news detector (Hugging Face Transformers, PyTorch)
- Fine-tune a pretrained transformer (DistilBERT / BERT) on fake vs. real news
- Evaluate using accuracy / precision / recall / F1
- Save the tokenizer and model for inference

## Import libraries

In [None]:
import os
import random
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from datasets import Dataset, load_metric

## Setting hyperparameters

In [None]:
# --- Reproducibility ---------------------------------------------------------
# One seed value is fed to Python's, NumPy's and PyTorch's random generators.
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# --- Run configuration -------------------------------------------------------
MODEL_NAME = "distilbert-base-uncased"   # use "bert-base-uncased" if you have GPU and more time
MAX_SAMPLES = None   # e.g., 20000 for subsampling on low-memory machines, or None to use all
MAX_LENGTH = 256     # truncation/padding length
BATCH_SIZE = 16      # reduce to 8 or 4 on low-memory CPUs
EPOCHS = 3
OUTPUT_DIR = "hf_fake_news_model"

# --- Device selection --------------------------------------------------------
# Preference order: CUDA, then Apple MPS, then CPU. MPS is only probed when
# CUDA is unavailable.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

## Download and Load Dataset

In [None]:
import kagglehub
import os

# Cache root handed to kagglehub through the KAGGLEHUB_CACHE environment variable.
# NOTE(review): an empty string presumably makes kagglehub store the download
# relative to the current working directory — confirm against kagglehub's docs.
download_path = ""
os.environ["KAGGLEHUB_CACHE"] = download_path  # set before the download call below runs

# Identifier passed to kagglehub.dataset_download.
dataset_identifier = "clmentbisaillon/fake-and-real-news-dataset"
# `path` holds whatever kagglehub.dataset_download returns for this dataset.
path = kagglehub.dataset_download(dataset_identifier)

In [None]:
# Locate the dataset's two CSV files somewhere below the working directory.
# If several copies exist, the last one visited by os.walk is used.
root_dir = os.getcwd()
real_file = "True.csv"
fake_file = "Fake.csv"
real_path = ""
fake_path = ""
for dirpath, dirnames, filenames in os.walk(root_dir):
    for filename in filenames:
        if filename == real_file:
            real_path = os.path.join(dirpath, filename)
        elif filename == fake_file:
            fake_path = os.path.join(dirpath, filename)

# Fail early with an actionable message instead of letting pd.read_csv("")
# raise an opaque error about an empty path.
missing_files = [
    name for name, found in ((fake_file, fake_path), (real_file, real_path)) if not found
]
if missing_files:
    raise FileNotFoundError(
        f"Could not find {', '.join(missing_files)} under {root_dir!r}; "
        "run the download cell first or place the CSV files in the working directory."
    )

fake = pd.read_csv(fake_path)
real = pd.read_csv(real_path)

In [None]:
# Attach the binary target column: 0 marks fake articles, 1 marks real ones.
for frame, target in ((fake, 0), (real, 1)):
    frame["label"] = target

## Preprocess

In [None]:
# Merge both classes and shuffle reproducibly so the two classes are interleaved.
df = pd.concat([fake, real], axis=0).sample(frac=1, random_state=SEED).reset_index(drop=True)
# Keep only text fields to simplify: the model input is "<title> <text>",
# with a missing title or text treated as an empty string.
df["content"] = (df["title"].fillna("") + " " + df["text"].fillna("")).str.strip()
df = df[["content", "label"]]
df = df[df["content"].str.len() > 30].reset_index(drop=True)   # remove extremely short rows

# Optional downsample for low memory.
# The request is clamped to the rows actually available: DataFrame.sample
# raises ValueError when n is larger than the frame (sampling without
# replacement), which would happen for a MAX_SAMPLES above the filtered size.
if isinstance(MAX_SAMPLES, int) and MAX_SAMPLES > 0:
    df = df.sample(n=min(MAX_SAMPLES, len(df)), random_state=SEED).reset_index(drop=True)

print("Dataset size:", df.shape)
df.head()


## Train and Validation Split

In [None]:
# Hold out 15% of the rows for validation; stratifying on the label column
# keeps the class proportions the same in both partitions.
train_df, val_df = train_test_split(
    df,
    test_size=0.15,
    stratify=df["label"],
    random_state=SEED,
)
print("Train:", train_df.shape, "Val:", val_df.shape)

## Convert to Hugging Face Datasets

In [None]:
# Convert the pandas splits to Hugging Face Datasets.
# preserve_index=False: after the stratified split the frames carry a
# shuffled, non-default index, which from_pandas would otherwise store as an
# extra "__index_level_0__" column next to "content" and "label".
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds = Dataset.from_pandas(val_df, preserve_index=False)

## Load Tokenizer and Model

In [None]:
# Human-readable class names for the label encoding used by this notebook
# (fake=0, real=1). Storing them in the model config makes saved checkpoints
# and inference pipelines report "FAKE"/"REAL" instead of the generic
# "LABEL_0"/"LABEL_1".
id2label = {0: "FAKE", 1: "REAL"}
label2id = {name: idx for idx, name in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

## Tokenization

In [None]:
def tokenize_fn(batch):
    """Tokenize the "content" entries of a batch.

    Inputs are truncated to at most MAX_LENGTH tokens. No padding is applied
    here (``padding=False``).
    """
    texts = batch["content"]
    encoded = tokenizer(texts, truncation=True, max_length=MAX_LENGTH, padding=False)
    return encoded

# Tokenize both splits in batches; the raw "content" column is dropped from
# the result.
train_ds, val_ds = (
    split.map(tokenize_fn, batched=True, remove_columns=["content"])
    for split in (train_ds, val_ds)
)

# Collator that pads each batch when it is assembled (dynamic padding).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Set Metrics

In [None]:
# NOTE(review): metric_acc and metric_f1 are not referenced by compute_metrics
# below, which computes its scores with scikit-learn instead. `load_metric`
# also appears to be deprecated in recent `datasets` releases in favour of the
# separate `evaluate` package — confirm against the installed version before
# relying on these two objects.
metric_acc = load_metric("accuracy")
metric_f1 = load_metric("f1")

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1". Predictions are
        the argmax over the last logits axis; precision, recall and F1 use
        ``average="binary"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted, average="binary"
    )
    return {
        "accuracy": accuracy_score(labels, predicted),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


## Training Arguments

In [None]:
# Fine-tuning configuration handed to the Trainer.
# NOTE(review): recent transformers releases appear to have renamed
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="epoch",
    save_strategy="epoch",           # same cadence as evaluation, so the best checkpoint can be reloaded
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",      # "f1" is one of the keys returned by compute_metrics
    fp16=torch.cuda.is_available(),  # only if GPU supports it
    # Tie the Trainer's own seeding to the notebook-wide SEED. Without this the
    # Trainer falls back to its built-in default seed, so changing SEED would
    # not affect training-time shuffling.
    seed=SEED,
)

## Initialize Trainer

In [None]:
# Bundle the model, its data and the evaluation hook into a single Trainer.
trainer = Trainer(
    # What to train and how
    model=model,
    args=training_args,
    # Data splits
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # Batch assembly
    data_collator=data_collator,
    tokenizer=tokenizer,
    # Validation scoring
    compute_metrics=compute_metrics,
)

## Train

# Run fine-tuning with the arguments and datasets the Trainer was built with.
trainer.train()

## Save Model and Tokenizer

In [None]:
# Write the model and the tokenizer to the same directory (OUTPUT_DIR); the
# inference example later loads both from that directory.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

## Evaluate Model

In [None]:
# Evaluate on the validation split and print the result as returned by
# trainer.evaluate.
eval_res = trainer.evaluate(eval_dataset=val_ds)
print("Eval results:", eval_res)

In [None]:
# Per-class precision / recall / F1 on the validation split.
val_preds = trainer.predict(val_ds)
val_logits, val_labels = val_preds.predictions, val_preds.label_ids
val_preds_arg = np.argmax(val_logits, axis=-1)   # class index with the highest logit

print("Classification Report (val):")
report_text = classification_report(
    val_labels,
    val_preds_arg,
    target_names=["FAKE", "REAL"],
)
print(report_text)

## Example inference

In [None]:
from transformers import pipeline

# Device index 0 when CUDA is available, otherwise -1.
pipeline_device = 0 if torch.cuda.is_available() else -1

# Load the model and tokenizer saved under OUTPUT_DIR into a
# text-classification pipeline.
pipe = pipeline(
    "text-classification",
    model=OUTPUT_DIR,
    tokenizer=OUTPUT_DIR,
    device=pipeline_device,
)

# Two hand-written headlines used as a quick smoke test.
samples = [
    "Local council approves new budget for schools and parks.",
    "Shocking: cure for common cold discovered by home remedy!",
]
print(pipe(samples))

In [None]:
# Closing usage notes, printed verbatim.
print("""
Notes:
- For production, set max_length carefully; longer max_length increases memory.
- You can switch MODEL_NAME to 'bert-base-uncased' for (usually) better results if you have GPU.
- To train on full dataset on CPU, use smaller BATCH_SIZE (4 or 8) and fewer EPOCHS.
- Use HuggingFace Accelerate or Deepspeed for multi-GPU / large-scale training.
""")