In [18]:
import numpy as np
import pandas as pd

In [19]:
dataset_dir = "datasets"
file = 'news_traindata' # 'WELFake_Dataset.csv'
nrows = 110000 * 0.9
vocab_file = f'vocabs/vokab_{file}_{nrows}.pkl'
model_file = f"models/simple_model_{file}_{nrows}.pth"

df = pd.read_csv(f'{dataset_dir}/{file}.csv', encoding='utf-8', nrows=nrows)
df = df.dropna()

df.head(20)

Unnamed: 0,type,title,content,label
0,satire,"Massive Incoming Comet, Earthquake and Tsunami...",It has been reported that there is a disaster ...,1
1,political,A Whisper at the University of Houston-Downtow...,"The newscaster said, ""UHD student is trying to...",1
2,clickbait,"TV show forces babies to cross-dress, pushes f...","NewsGender\n\nISLE OF WIGHT, England, August 2...",1
3,political,National Review Online,Cliffhanger\n\nWright and political wrongs.\n\...,1
4,clickbait,11 Reasons Why Liberals And Progressives Are M...,13415 SHARES Facebook Twitter Reddit Stumbleup...,1
5,political,Cruz Tells Evangelicals He Can Reverse Marriag...,Cruz Tells Evangelicals He Can Reverse Marriag...,1
6,unreliable,Cable: 1975BONN06291,Raw content\n\nLIMITED OFFICIAL USE PAGE 01 BO...,1
7,satire,"Champagne renowned for its urine-like flavour,...","Champagne renowned for its urine-like flavour,...",1
8,hate,Tarik Zahzah 2015-01-30 09-23-55,Click to share on Twitter (Opens in new window),1
9,reliable,NEWS SUMMARY,Nuclear Technology in Iran\n\nPresident Mohamm...,0


## Preprocessing

In [20]:
import os
import pickle
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from collections import Counter

MAX_VOCAB = 25000
special_tokens = ['<unk>', '<pad>']
tokenizer = get_tokenizer('basic_english')

def generate_vocabulary(df):
    counter = Counter()
    for text in df['content']:
        if pd.notna(text):
            counter.update(tokenizer(str(text)))
    
    most_common = [token for token, _ in counter.most_common(MAX_VOCAB - len(special_tokens))]
    
    vocab = build_vocab_from_iterator([most_common], specials=special_tokens)
    vocab.set_default_index(vocab['<unk>'])

    with open(vocab_file, 'wb') as f:
        pickle.dump(vocab, f)
    print(f"Vocabulary saved to '{vocab_file}'.")

    return vocab


if os.path.exists(vocab_file):
    with open(vocab_file, 'rb') as f:
        vocab = pickle.load(f)
    print("Vocabulary loaded from 'vocab.pkl'.")

else:
    vocab = generate_vocabulary(df)



Vocabulary saved to 'vocabs/vokab_news_traindata_99000.0.pkl'.


In [26]:
import torch
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F

MAX_LENGTH = 2048

# Label encode the 'type' column
le = LabelEncoder()
df['type_encoded'] = le.fit_transform(df['type'])

NUM_CLASSES = len(le.classes_)  # Number of unique classes
print(f"Number of classes: {NUM_CLASSES}")

encoded_texts_and_labels = []
for text, label in zip(df['content'], df['type_encoded']):
    if pd.notna(text):
        encoded = [vocab[token] for token in tokenizer(text)]
        
        if len(encoded) <= MAX_LENGTH:
            encoded_texts_and_labels.append((torch.tensor(encoded, dtype=torch.long), label))

# Separate encoded texts and labels
encoded_texts = [item[0] for item in encoded_texts_and_labels]
label_indices = torch.tensor([item[1] for item in encoded_texts_and_labels], dtype=torch.long)

# Create one-hot encoded labels
labels = F.one_hot(label_indices, num_classes=NUM_CLASSES).float()

# Pad sequences
padded_texts = pad_sequence(encoded_texts, batch_first=True, padding_value=vocab['<pad>'])

print(f"Filtered texts: {len(padded_texts)}, Labels: {labels.shape}")


Number of classes: 11
Filtered texts: 95566, Labels: torch.Size([95566, 11])


In [22]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


class NewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

X_train, X_val, y_train, y_val = train_test_split(padded_texts, labels, test_size=0.1)

train_ds = NewsDataset(X_train, y_train)
val_ds = NewsDataset(X_val, y_val)

train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, pin_memory=True, num_workers=0)
val_dl = DataLoader(val_ds, batch_size=32, num_workers=0)

In [23]:
from mulstage_model import CNN_BiLSTM
from torch import nn


# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(f"Using device: {device}")
model = CNN_BiLSTM(vocab=vocab, vocab_size=len(vocab), embed_dim=100, hidden_dim=128, output_dim=NUM_CLASSES, pad_idx=vocab['<pad>'])
model.to(device)

print("done constructing model")

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)   

print("done constructing optimizer")

from tqdm import tqdm

def train(model, loader):
    model.train()
    total_loss = 0
    for xb, yb in tqdm(loader, desc="Training", leave=False):
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    return total_loss / len(loader)

def evaluate(model, loader):
    model.eval()
    total_acc = 0
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            # print(preds.shape, yb.shape)
            preds_class = preds.argmax(dim=1)
            #v print(preds_class)
            labels_class = yb.argmax(dim=1)
            # print(labels_class)
            total_acc += (preds_class == labels_class).float().mean().item()
            # print(preds_class, labels_class, total_acc)
    return total_acc / len(loader)

print("training")

for epoch in range(5):
    loss = train(model, train_dl)
    acc = evaluate(model, val_dl)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}, Val Acc: {acc:.4f}")


Using device: mps
done constructing model
done constructing optimizer
training


                                                             

Epoch 1, Loss: 1.8776, Val Acc: 0.7315


                                                             

Epoch 2, Loss: 1.7816, Val Acc: 0.7616


                                                             

Epoch 3, Loss: 1.7560, Val Acc: 0.7680


                                                             

Epoch 4, Loss: 1.7392, Val Acc: 0.7800


                                                             

Epoch 5, Loss: 1.7274, Val Acc: 0.7855


In [24]:

for epoch in range(5):
    loss = train(model, train_dl)
    acc = evaluate(model, val_dl)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}, Val Acc: {acc:.4f}")

                                                             

Epoch 1, Loss: 1.7176, Val Acc: 0.7849


                                                             

Epoch 2, Loss: 1.7114, Val Acc: 0.7945


                                                             

Epoch 3, Loss: 1.7034, Val Acc: 0.7939


                                                             

Epoch 4, Loss: 1.6994, Val Acc: 0.7944


                                                             

Epoch 5, Loss: 1.6948, Val Acc: 0.7964


In [25]:
torch.save(model.state_dict(), model_file)
# for epoch in range(5):
#     loss = train(model, train_dl)
#     acc = evaluate(model, val_dl)
#     print(f"Epoch {epoch+1}, Loss: {loss:.4f}, Val Acc: {acc:.4f}")