# Fake News Detection – Capstone3 Project

## Objective
The goal of this project is to build a machine learning/NLP model
to detect whether a news article is **fake** or **real**.

This notebook focuses on:
- Dataset exploration (EDA for text)
- Data quality checks
- Baseline model training
- Model evaluation

Production deployment, monitoring, and API orchestration
are implemented separately in scripts.

### Imports

In [None]:
import os
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.nn.functional as F

# Reproducibility: seed Python's `random`, NumPy and PyTorch with one shared value
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

### Dataset Structure Check

In [None]:
# Directory holding the pre-split CSV files (train / val / test)
DATA_DIR = "data/processed"

# Report the size and label distribution of every split
for split in ("train.csv", "val.csv", "test.csv"):
    csv_path = os.path.join(DATA_DIR, split)
    df = pd.read_csv(csv_path)
    print(
        f"\n{split.upper()}",
        f"Number of samples: {len(df)}",
        "Class distribution:",
        df['label'].value_counts(),
        sep="\n",
    )

### Visualize Sample Texts (EDA)

In [None]:
def show_samples(df, label, n=5):
    """Print up to ``n`` randomly chosen texts belonging to one class.

    Args:
        df: DataFrame with a ``label`` column and a ``text`` column.
        label: Value of ``label`` selecting the class to display.
        n: Maximum number of texts to print. Fewer are printed when the
            class has fewer non-missing texts.

    Each text is truncated to its first 500 characters. Rows whose text is
    missing are skipped, and a class without any usable text prints a short
    notice instead of raising.
    """
    print(f"\nSamples from class '{label}':\n")

    # Missing texts cannot be sliced, so they are excluded before sampling
    texts = df.loc[df['label'] == label, 'text'].dropna()
    if texts.empty:
        print("(no samples available)\n")
        return

    # Series.sample raises when asked for more rows than exist; cap the draw
    samples = texts.sample(min(n, len(texts)))
    for i, text in enumerate(samples):
        print(f"{i+1}. {str(text)[:500]}...\n")  # show first 500 chars

# Load the training split and preview a few articles from each class
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
for label in ("fake", "real"):
    show_samples(train_df, label)

### Text Transformations / Preprocessing

In [None]:
import re
import string

# Built once so the per-row call does not rebuild them for every article
_URL_RE = re.compile(r"http\S+")
_DIGITS_RE = re.compile(r"\d+")
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalize a raw article string for vectorization.

    Steps, in order: lowercase, drop ``http...`` URLs, drop digit runs,
    drop ASCII punctuation, then strip leading/trailing whitespace.
    Interior whitespace is left untouched.

    Args:
        text: The raw text. Missing values (``None`` / NaN, e.g. empty CSV
            cells read by pandas) are treated as empty text; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string (``""`` for missing input).
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as NaN floats, which have no .lower()
        if pd.isna(text):
            return ""
        text = str(text)

    text = text.lower()
    text = _URL_RE.sub("", text)      # remove URLs
    text = _DIGITS_RE.sub("", text)   # remove numbers
    text = text.translate(_PUNCT_TABLE)
    return text.strip()

# Load the validation split, then add a cleaned-text column to both frames
val_df = pd.read_csv(os.path.join(DATA_DIR, "val.csv"))
for frame in (train_df, val_df):
    frame['text_clean'] = frame['text'].apply(clean_text)

### TF-IDF Vectorization

In [None]:
# String label -> integer class id, shared by both splits
LABEL_TO_ID = {"fake": 0, "real": 1}

# Fit the vocabulary on the training texts only; validation reuses it
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_df['text_clean'])
X_val = vectorizer.transform(val_df['text_clean'])

y_train, y_val = (
    frame['label'].map(LABEL_TO_ID).values for frame in (train_df, val_df)
)

### PyTorch Dataset & DataLoader (Optional; feeds the feedforward model below)

In [None]:
class NewsDataset(Dataset):
    """Map-style dataset pairing a dense float32 feature matrix with labels.

    Args:
        X: 2-D feature matrix. Either an object exposing ``toarray()`` (for
            example a SciPy sparse matrix such as TF-IDF output) or anything
            ``np.asarray`` accepts.
        y: Integer class ids, one per row of ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` disagree on the number of samples.

    Attributes are ``X`` (float32 tensor) and ``y`` (int64 tensor); indexing
    returns the ``(features, label)`` pair at that position.
    """

    def __init__(self, X, y):
        if hasattr(X, "toarray"):
            # Cast while still sparse so only one dense float32 copy is made;
            # toarray() yields a plain ndarray rather than np.matrix.
            self.X = torch.from_numpy(X.astype(np.float32).toarray())
        else:
            self.X = torch.tensor(np.asarray(X), dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

        # Catch misaligned inputs here instead of failing mid-epoch
        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError(
                f"X has {self.X.shape[0]} rows but y has {self.y.shape[0]} labels"
            )

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Wrap both splits and batch them; only the training loader is shuffled
BATCH_SIZE = 32

train_ds, val_ds = NewsDataset(X_train, y_train), NewsDataset(X_val, y_val)

train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE)

### Baseline Model: Logistic Regression

In [None]:
# Baseline classifier trained on the TF-IDF features (iteration cap: 1000)
baseline_model = LogisticRegression(max_iter=1000)
baseline_model.fit(X=X_train, y=y_train)

### Evaluation on Validation Set

In [None]:
# Class ids in display order; they match the label encoding of y_val
# (0 = fake, 1 = real). Passing them explicitly keeps the report rows and
# the confusion-matrix axes aligned with the names below even if one class
# is absent from the validation labels and predictions.
class_ids = [0, 1]
class_names = ["fake", "real"]

y_pred = baseline_model.predict(X_val)

print(classification_report(y_val, y_pred,
                            labels=class_ids, target_names=class_names))

# Rows are actual classes, columns are predicted classes
cm = confusion_matrix(y_val, y_pred, labels=class_ids)
plt.figure(figsize=(5,5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

### Optional: Quick PyTorch Feedforward Model

In [None]:
# Demonstration feedforward net: its input width is the number of TF-IDF columns
input_dim = X_train.shape[-1]
class FFNN(nn.Module):
    """Two-layer feedforward classifier: Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the hidden layer (default 128).
        num_classes: Number of output logits (default 2).

    ``forward`` returns raw logits with ``num_classes`` values per sample;
    no softmax is applied.
    """

    def __init__(self, input_dim, hidden_dim=128, num_classes=2):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

ffnn_model = FFNN(input_dim)
ffnn_model = ffnn_model.to(device)

# Cross-entropy on the raw logits, optimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(ffnn_model.parameters(), lr=1e-3)

### Training Loop (Few Epochs)

In [None]:
# Three passes over the training loader; prints the mean batch loss per epoch
for epoch in range(3):
    ffnn_model.train()
    total_loss = 0
    for features, targets in train_dl:
        features = features.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(ffnn_model(features), targets)
        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_dl):.4f}")

### Validation Loop

In [None]:
# Class ids in display order; they match the label encoding of the targets
# (0 = fake, 1 = real). Passing them explicitly keeps the report rows and
# the confusion-matrix axes aligned with the names below even if the network
# and the targets together cover only one class.
class_ids = [0, 1]
class_names = ["fake", "real"]

# Collect predictions and targets over the whole validation loader
ffnn_model.eval()
all_preds, all_targets = [], []
with torch.no_grad():
    for X_batch, y_batch in val_dl:
        X_batch = X_batch.to(device)
        logits = ffnn_model(X_batch)
        # Predicted class = index of the largest logit
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_targets.extend(y_batch.numpy())

print(classification_report(all_targets, all_preds,
                            labels=class_ids, target_names=class_names))

# Rows are actual classes, columns are predicted classes
cm = confusion_matrix(all_targets, all_preds, labels=class_ids)
plt.figure(figsize=(5,5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Greens",
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - PyTorch Model")
plt.show()

## Observations
- Dataset is moderately balanced between fake and real classes.
- TF-IDF + Logistic Regression baseline performs reasonably well.
- Misclassifications often occur on very short or ambiguous articles.
- The PyTorch feedforward network shows similar performance; upgrading to a transformer-based model may improve metrics further.

## Conclusion
This notebook validated the feasibility of text-based fake news detection using both:
- Classical ML (TF-IDF + Logistic Regression)
- Lightweight neural networks (PyTorch feedforward)

The system is ready for:
- Full training scripts (`train.py`)
- API-based inference (`predict.py`)
- Deployment, monitoring, and production scaling