In [3]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
from collections import Counter

train_df = pd.read_csv("../data/processed/liar2_train.csv")

train_df["clean_statement"] = train_df["statement"].str.lower().str.replace(r"[^\w\s]", "", regex=True)
train_df["tokens"] = train_df["clean_statement"].apply(lambda x: x.split())

all_tokens = [token for tokens in train_df["tokens"] for token in tokens]
token_counts = Counter(all_tokens)

vocab = {"<PAD>": 0, "<UNK>": 1}
for i, (token, _) in enumerate(token_counts.items(), start=2):
    vocab[token] = i


In [4]:
vocab_size   = len(vocab)
pad_idx      = vocab["<PAD>"]


In [None]:
hidden_dim = 128

In [6]:
import torch
import torch.nn as nn

class FakeNewsClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super(FakeNewsClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        output = self.fc(hidden[-1])
        return output


In [7]:
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 1281
output_dim = 6  # 6 classes (0 to 5)
pad_idx = vocab["<PAD>"]

model = FakeNewsClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx)


In [8]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [11]:
class Liar2Dataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_tensor = torch.tensor(self.inputs[idx], dtype=torch.long)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return input_tensor, label_tensor

# Pad the encoded inputs
MAX_LEN = 30
def pad_sequence(seq, max_len):
    if len(seq) < max_len:
        return seq + [vocab["<PAD>"]] * (max_len - len(seq))
    else:
        return seq[:max_len]

train_df["input_ids"] = train_df["tokens"].apply(lambda tokens: [vocab.get(token, vocab["<UNK>"]) for token in tokens])
train_df["padded_ids"] = train_df["input_ids"].apply(lambda x: pad_sequence(x, MAX_LEN))

# Create Dataset and DataLoader
train_dataset = Liar2Dataset(train_df["padded_ids"].tolist(), train_df["label"].tolist())
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)


In [12]:
from tqdm import tqdm

EPOCHS = 5  # You can change this later
model.train()

for epoch in range(EPOCHS):
    epoch_loss = 0
    correct = 0
    total = 0
    
    for batch_inputs, batch_labels in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == batch_labels).sum().item()
        total += batch_labels.size(0)
    
    acc = correct / total
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {epoch_loss:.4f} - Accuracy: {acc:.4f}")


100%|██████████| 575/575 [02:12<00:00,  4.33it/s]


Epoch 1/5 - Loss: 1006.8689 - Accuracy: 0.2874


100%|██████████| 575/575 [02:13<00:00,  4.30it/s]


Epoch 2/5 - Loss: 1009.2454 - Accuracy: 0.2926


100%|██████████| 575/575 [02:22<00:00,  4.03it/s]


Epoch 3/5 - Loss: 938.6860 - Accuracy: 0.3159


100%|██████████| 575/575 [01:51<00:00,  5.14it/s]


Epoch 4/5 - Loss: 896.1878 - Accuracy: 0.3413


100%|██████████| 575/575 [02:04<00:00,  4.60it/s]

Epoch 5/5 - Loss: 856.0284 - Accuracy: 0.3626



