In [14]:
import numpy as np
import pandas as pd

In [15]:
dataset_dir = "datasets"
trainfile = 'news_traindata'
testfile = 'news_testdata'
nrows = 110000 * 0.9 
vocab_file = f'vocabs/vokab_{trainfile}_{nrows}.pkl'
model_file = f"models/stage1_model_{trainfile}_{nrows}.pth"


df = pd.read_csv(f'{dataset_dir}/{testfile}.csv', encoding='utf-8', nrows=nrows)
df = df.dropna()

df.head()

Unnamed: 0,type,title,content,label
0,conspiracy,The 9/11 Commission Didn’t Believe the Governm...,9/11 Commissioners Admit They Never Got the Fu...,1
1,clickbait,Former Watergate Prosecutors: There’s Already ...,5545 SHARES Facebook Twitter Reddit Stumbleupo...,1
2,clickbait,The Washington Times Archives,When Donald Trump takes office later this week...,1
3,hate,Hindu Group Criticizes Toronto School’s Muslim...,A Hindu group that regularly criticizes Islam ...,1
4,clickbait,She created jobs?! Flying ‘press conference’ p...,"Oh look, a press conference from the Hillary f...",1


In [16]:
import os
import pickle
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from collections import Counter

MAX_VOCAB = 25000
special_tokens = ['<unk>', '<pad>']
tokenizer = get_tokenizer('basic_english')


if os.path.exists(vocab_file):
    with open(vocab_file, 'rb') as f:
        vocab = pickle.load(f)
    print("Vocabulary loaded from 'vocab.pkl'.")

else:
    print("Vocab not found: ", vocab_file)

Vocabulary loaded from 'vocab.pkl'.


In [17]:

import torch
from torch.nn.utils.rnn import pad_sequence

MAX_LENGTH = 2048

encoded_texts_and_labels = []
for text, label in zip(df['content'], df['label']):
    if pd.notna(text):
        # Tokenize and encode in one step with list comprehension
        encoded = [vocab[token] for token in tokenizer(text)]
        
        if len(encoded) <= MAX_LENGTH:  # Filter long sequences
            encoded_texts_and_labels.append((torch.tensor(encoded, dtype=torch.long), label))


# Separate encoded texts and labels
encoded_texts = [item[0] for item in encoded_texts_and_labels]
labels = torch.tensor([item[1] for item in encoded_texts_and_labels], dtype=torch.float)

# Pad sequences
padded_texts = pad_sequence(encoded_texts, batch_first=True, padding_value=vocab['<pad>'])

print(f"Filtered texts: {len(padded_texts)}, Labels: {len(labels)}")


Filtered texts: 10610, Labels: 10610


In [18]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


class NewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    

test_ds = NewsDataset(padded_texts, labels)
test_dl = DataLoader(test_ds, batch_size=32, shuffle=False)

In [21]:
from mulstage_model import CNN_BiLSTM
from torch import nn


# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(f"Using device: {device}")

model = CNN_BiLSTM(vocab=vocab, vocab_size=len(vocab), embed_dim=100, hidden_dim=128, output_dim=1, pad_idx=vocab['<pad>'])
model.to(device)


criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)   

model.load_state_dict(torch.load(model_file, map_location=device))

Using device: mps


<All keys matched successfully>

In [22]:
def evaluate(model, loader):
    model.eval()
    total_acc = 0
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            preds_class = (preds > 0.5).float()
            total_acc += (preds_class == yb).float().mean().item()
    return total_acc / len(loader)

test_acc = evaluate(model, test_dl)
print(f"Test accuracy: {test_acc:.4f}")

Test accuracy: 0.9822
