# In [5]:
# Google Colab upload step. The captured cell output below this statement shows
# True.csv and Fake.csv being saved, which are the files read later by pd.read_csv.
from google.colab import files
# The return value is kept in `uploaded` but is not referenced again in the visible code.
uploaded = files.upload()


# Output:
#   Saving True.csv to True.csv
#   Saving Fake.csv to Fake.csv


# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from tqdm import tqdm

# ----------------------------
# Step 1: Load and Merge Dataset
# ----------------------------
# Ensure you have Fake.csv and True.csv from Kaggle:
# https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset

# Read both halves of the dataset from the current working directory.
fake = pd.read_csv("Fake.csv")
true = pd.read_csv("True.csv")

# Tag every row with its class id: 0 = FAKE, 1 = REAL.
fake["label"] = 0
true["label"] = 1

# Stack the (text, label) columns of both frames under a fresh 0..n-1 index.
data = pd.concat(
    [fake[["text", "label"]], true[["text", "label"]]],
    axis=0,
    ignore_index=True,
)

# Plain Python lists are what the tokenisation and split steps consume.
texts = data["text"].tolist()
labels = data["label"].tolist()

# ----------------------------
# Step 2: Preprocessing
# ----------------------------
def clean_text(text):
    """Turn raw article text into a list of lowercase alphabetic tokens.

    The input is coerced with ``str()``. Every character that is neither an
    ASCII letter nor whitespace is deleted (not replaced by a space, so
    "don't" becomes "dont"). The remainder is lowercased and split on
    whitespace.

    Args:
        text: Any object; it is converted with ``str()`` before cleaning.

    Returns:
        A list of lowercase word strings (possibly empty).
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', str(text))
    lowered = letters_only.lower()
    return lowered.split()

# Tokenise every article up front; each entry is a list of word strings.
tokenized_texts = list(map(clean_text, texts))

# Build vocabulary manually: every distinct word gets an id starting at 2,
# in first-seen order; ids 0 and 1 are then assigned to the special tokens.
counter = Counter(token for tokens in tokenized_texts for token in tokens)
vocab = {word: idx for idx, word in enumerate(counter, start=2)}
vocab["<unk>"] = 0
vocab["<pad>"] = 1

# Helper: convert tokens → ids
def tokens_to_ids(tokens, vocab):
    """Translate a sequence of tokens into their integer ids.

    Args:
        tokens: Iterable of token strings.
        vocab: Mapping from token to id; must contain the key ``"<unk>"``,
            whose id is used for any token missing from the mapping.

    Returns:
        A list of ids, one per input token, in the same order.
    """
    ids = []
    for token in tokens:
        ids.append(vocab.get(token, vocab["<unk>"]))
    return ids

# ----------------------------
# Step 3: Dataset Class
# ----------------------------
class NewsDataset(Dataset):
    """Map-style dataset over pre-tokenised articles and their integer labels.

    Each item is a ``(token_ids, label)`` pair of ``torch.long`` tensors, where
    ``token_ids`` is the 1-D id sequence for one article.

    Args:
        texts: Sequence of token lists (one list of strings per article).
        labels: Sequence of integer class ids, aligned with ``texts``.
        vocab: Token -> id mapping passed to ``tokens_to_ids``.
        max_len: Optional cap on the number of tokens kept per article
            (the leading ``max_len`` tokens are kept). ``None`` (default)
            keeps every token, which is the original behaviour.

    Raises:
        ValueError: If ``max_len`` is given but is smaller than 1.
    """

    def __init__(self, texts, labels, vocab, max_len=None):
        if max_len is not None and max_len < 1:
            raise ValueError("max_len must be a positive integer or None")
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        """Return the number of articles."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for the article at position ``idx``."""
        tokens = self.texts[idx]
        if self.max_len is not None:
            tokens = tokens[:self.max_len]
        ids = tokens_to_ids(tokens, self.vocab)
        # The dtype must be explicit: torch.tensor([]) would otherwise infer
        # float32 for an article with no tokens, and a float sequence at the
        # head of a batch makes pad_sequence build a float batch that
        # nn.Embedding rejects.
        return (
            torch.tensor(ids, dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )

def collate_fn(batch):
    """Merge a list of ``(token_ids, label)`` samples into one padded batch.

    Sequences are right-padded with the module-level ``vocab["<pad>"]`` id up
    to the longest sequence in the batch.

    Args:
        batch: Iterable of ``(token_ids, label)`` pairs as produced by the dataset.

    Returns:
        A ``(padded_ids, labels)`` tuple; ``padded_ids`` has the batch
        dimension first.
    """
    sequences, targets = zip(*batch)
    padded = pad_sequence(
        sequences,
        batch_first=True,
        padding_value=vocab["<pad>"],
    )
    label_tensor = torch.tensor(targets)
    return padded, label_tensor

# Hold out 20% of the articles for testing; the fixed seed keeps the split reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    tokenized_texts,
    labels,
    test_size=0.2,
    random_state=42,
)

train_dataset = NewsDataset(texts=train_texts, labels=train_labels, vocab=vocab)
test_dataset = NewsDataset(texts=test_texts, labels=test_labels, vocab=vocab)

# Only the training batches are shuffled; both loaders pad per batch via collate_fn.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# ----------------------------
# Step 4: Model
# ----------------------------
class FakeNewsDetector(nn.Module):
    """Embedding -> LSTM -> dropout -> linear classifier over token-id batches.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output logits per example.
        pad_idx: Token id used for padding. When ``None`` (default) the
            module-level ``vocab["<pad>"]`` is used, matching the original
            behaviour.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, pad_idx=None):
        super().__init__()
        if pad_idx is None:
            pad_idx = vocab["<pad>"]
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return class logits for a right-padded batch of token ids.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)`` whose padding
                positions hold ``pad_idx`` and sit at the end of each row.

        Returns:
            Tensor of shape ``(batch, output_dim)`` with unnormalised logits.
        """
        # A batch made only of empty articles arrives with zero columns;
        # give it a single pad column so the LSTM has one step to run.
        if x.size(1) == 0:
            x = x.new_full((x.size(0), 1), self.pad_idx)

        embedded = self.embedding(x)

        # Real length of every row. Rows that are entirely padding are treated
        # as length 1 because packing does not accept zero-length sequences.
        lengths = (x != self.pad_idx).sum(dim=1).clamp(min=1).cpu()

        # Packing stops the recurrence at each sequence's last real token, so
        # the final hidden state is not computed over trailing pad steps and
        # does not depend on how long the other sequences in the batch are.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)

        # hidden[-1] is the last layer's final state, in original batch order.
        out = self.fc(self.dropout(hidden[-1]))
        return out

# Prefer the GPU when PyTorch reports one as available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Two output logits: one per class (FAKE / REAL).
model = FakeNewsDetector(
    vocab_size=len(vocab),
    embed_dim=128,
    hidden_dim=128,
    output_dim=2,
)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# ----------------------------
# Step 5: Training
# ----------------------------
for epoch in range(5):  # Train for 5 epochs
    model.train()  # enable dropout for the training pass
    total_loss = 0.0
    # Batch-specific names are used so the loop does not overwrite the
    # module-level `texts` / `labels` lists that hold the raw dataset.
    for batch_texts, batch_labels in tqdm(train_loader):
        batch_texts = batch_texts.to(device)
        batch_labels = batch_labels.to(device)
        optimizer.zero_grad()
        outputs = model(batch_texts)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

# ----------------------------
# Step 6: Evaluation
# ----------------------------
model.eval()  # disable dropout for evaluation
y_true, y_pred = [], []
with torch.no_grad():
    # Batch-specific names are used so the loop does not overwrite the
    # module-level `texts` / `labels` lists that hold the raw dataset.
    for batch_texts, batch_labels in test_loader:
        outputs = model(batch_texts.to(device))
        preds = torch.argmax(outputs, dim=1)
        # Labels are only compared on the host, so they never need to be
        # moved to the device; tolist() yields plain Python ints.
        y_true.extend(batch_labels.tolist())
        y_pred.extend(preds.tolist())

# Passing `labels` explicitly keeps the two target names aligned with class
# ids 0 and 1 even if one class is absent from both y_true and y_pred.
print(classification_report(y_true, y_pred, labels=[0, 1], target_names=["FAKE", "REAL"]))

# ----------------------------
# Step 7: Save Model
# ----------------------------
# Persist only the model's state_dict (its parameters and buffers), not the module object.
torch.save(obj=model.state_dict(), f="fake_news_model.pth")
print("Model saved as fake_news_model.pth")


# Output (tqdm progress bar captured when the notebook was exported):
#   2%|▏         | 23/1123 [02:41<2:13:33,  7.29s/it]