In [32]:
import numpy as np
import pandas as pd

In [33]:
dataset_dir = "datasets"
file = 'news_traindata' # 'WELFake_Dataset.csv'
nrows = 10000 #110000 * 0.9
vocab_file = f'vocabs/vokab_{file}_{nrows}.pkl'
model_file = f"models/stage1_model_{file}_{nrows}.pth"


df = pd.read_csv(f'{dataset_dir}/{file}.csv', encoding='utf-8', nrows=nrows)
df = df.dropna()


df.head()

Unnamed: 0,type,title,content,label
0,satire,"Massive Incoming Comet, Earthquake and Tsunami...",It has been reported that there is a disaster ...,1
1,political,A Whisper at the University of Houston-Downtow...,"The newscaster said, ""UHD student is trying to...",1
2,clickbait,"TV show forces babies to cross-dress, pushes f...","NewsGender\n\nISLE OF WIGHT, England, August 2...",1
3,political,National Review Online,Cliffhanger\n\nWright and political wrongs.\n\...,1
4,clickbait,11 Reasons Why Liberals And Progressives Are M...,13415 SHARES Facebook Twitter Reddit Stumbleup...,1


## Preprocessing

##### Generating/loading Vocabulary

In [34]:
import os
import pickle
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from collections import Counter

MAX_VOCAB = 25000
special_tokens = ['<unk>', '<pad>']
tokenizer = get_tokenizer('basic_english')

def generate_vocabulary(df):
    counter = Counter()
    for text in df['content']:
        if pd.notna(text):
            counter.update(tokenizer(str(text)))
    
    most_common = [token for token, _ in counter.most_common(MAX_VOCAB - len(special_tokens))]
    
    vocab = build_vocab_from_iterator([most_common], specials=special_tokens)
    vocab.set_default_index(vocab['<unk>'])

    with open(vocab_file, 'wb') as f:
        pickle.dump(vocab, f)
    print(f"Vocabulary saved to '{vocab_file}'.")

    return vocab


if os.path.exists(vocab_file):
    with open(vocab_file, 'rb') as f:
        vocab = pickle.load(f)
    print(f"Vocabulary loaded from {vocab_file}.")

else:
    vocab = generate_vocabulary(df)

Vocabulary saved to 'vocabs/vokab_news_traindata_10000.pkl'.


##### Encoding the Content and label with vocabulary

In [35]:
import torch
from torch.nn.utils.rnn import pad_sequence

MAX_LENGTH = 2048

encoded_texts_and_labels = []
for text, label in zip(df['content'], df['label']):
    if pd.notna(text):
        # Tokenize and encode in one step with list comprehension
        encoded = [vocab[token] for token in tokenizer(text)]
        
        if len(encoded) <= MAX_LENGTH:  # Filter long sequences
            encoded_texts_and_labels.append((torch.tensor(encoded, dtype=torch.long), label))


# Separate encoded texts and labels
encoded_texts = [item[0] for item in encoded_texts_and_labels]
labels = torch.tensor([item[1] for item in encoded_texts_and_labels], dtype=torch.float)

# Pad sequences
padded_texts = pad_sequence(encoded_texts, batch_first=True, padding_value=vocab['<pad>'])

print(f"Filtered texts: {len(padded_texts)}, Labels: {len(labels)}")


Filtered texts: 9665, Labels: 9665


##### Creating DataLoader from test-train split

In [36]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


class NewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

X_train, X_val, y_train, y_val = train_test_split(padded_texts, labels, test_size=0.1)

train_ds = NewsDataset(X_train, y_train)
val_ds = NewsDataset(X_val, y_val)

train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, pin_memory=True, num_workers=0)
val_dl = DataLoader(val_ds, batch_size=32, num_workers=0)

## Training The Model

##### Loading the Model

In [37]:
from mulstage_model import CNN_BiLSTM
from torch import nn


# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(f"Using device: {device}")

model = CNN_BiLSTM(vocab=vocab, vocab_size=len(vocab), embed_dim=100, hidden_dim=128, output_dim=1, pad_idx=vocab['<pad>'])
model.to(device)


criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)   


Using device: mps


##### train and validate

In [38]:
from tqdm import tqdm

def train(model, loader):
    model.train()
    total_loss = 0
    for xb, yb in tqdm(loader, desc="Training", leave=False):
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    return total_loss / len(loader)

def evaluate(model, loader):
    model.eval()
    total_acc = 0
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            preds_class = (preds > 0.5).float()
            total_acc += (preds_class == yb).float().mean().item()
    return total_acc / len(loader)

print("training")

for epoch in range(10):
    loss = train(model, train_dl)
    acc = evaluate(model, val_dl)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}, Val Acc: {acc:.4f}")

                                                           

Epoch 1, Loss: 0.2005, Val Acc: 0.9698


                                                           

Epoch 2, Loss: 0.1036, Val Acc: 0.9718


                                                          

KeyboardInterrupt: 

##### saving the model

In [39]:
torch.save(model.state_dict(), model_file)


### Test the model's accuracy

In [40]:

test_acc = evaluate(model, val_dl)
print(f"Test Accuracy: {test_acc:.4f}")

Test Accuracy: 0.9738
