In [7]:
import numpy as np
import pandas as pd

In [8]:
# --- Paths and run configuration ---------------------------------------
dataset_dir = "datasets"
trainfile = "news_traindata"
testfile = "news_testdata"  # alternative: 'WELFake_Dataset.csv'

# `nrows` is a float (110000 * 0.9). It is used both as the read_csv row cap
# and, rendered as a float, inside the two artifact file names below.
nrows = 110000 * 0.9
vocab_file = f"vocabs/vokab_{trainfile}_{nrows}.pkl"
model_file = f"models/simple_model_{trainfile}_{nrows}.pth"

# Read the test CSV and drop every row that contains a missing value.
df = pd.read_csv(
    f"{dataset_dir}/{testfile}.csv",
    encoding="utf-8",
    nrows=nrows,
).dropna()

df.head(20)

Unnamed: 0,type,title,content,label
0,conspiracy,The 9/11 Commission Didn’t Believe the Governm...,9/11 Commissioners Admit They Never Got the Fu...,1
1,clickbait,Former Watergate Prosecutors: There’s Already ...,5545 SHARES Facebook Twitter Reddit Stumbleupo...,1
2,clickbait,The Washington Times Archives,When Donald Trump takes office later this week...,1
3,hate,Hindu Group Criticizes Toronto School’s Muslim...,A Hindu group that regularly criticizes Islam ...,1
4,clickbait,She created jobs?! Flying ‘press conference’ p...,"Oh look, a press conference from the Hillary f...",1
5,rumor,Leading Tory MP calls on Theresa May to end bi...,PA Tim Loughton urged Theresa May to avoid try...,1
6,unreliable,Cable: 1976YAOUND03669,Tor\n\nTor is an encrypted anonymising network...,1
7,unreliable,Cable: 1975SANTIA07393,Tor\n\nTor is an encrypted anonymising network...,1
8,satire,Photogallery - Duterte asks why the USA did no...,Back to article: Duterte asks why the USA did ...,1
9,political,Electability: Hillary vs. Trump,"Amidst all the focus on Trump’s insanity, ther...",1


In [9]:
import os
import pickle
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from collections import Counter

MAX_VOCAB = 25000
special_tokens = ['<unk>', '<pad>']
tokenizer = get_tokenizer('basic_english')

# Fail fast when the vocabulary is missing: every later cell indexes `vocab`,
# so continuing would either raise a NameError further down or, worse,
# silently reuse a stale `vocab` left in the kernel by an earlier run.
if not os.path.exists(vocab_file):
    raise FileNotFoundError(f"Vocab not found: {vocab_file}")

# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# vocabulary files that were produced by a trusted source.
with open(vocab_file, 'rb') as f:
    vocab = pickle.load(f)

# Report the path that was actually read rather than a hard-coded file name.
print(f"Vocabulary loaded from '{vocab_file}'.")


Vocabulary loaded from 'vocab.pkl'.


In [10]:
import torch
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F

MAX_LENGTH = 2048  # documents with more tokens than this are filtered out below

# Map every distinct 'type' string to an integer id.
# NOTE(review): the encoder is fitted on this dataframe; presumably its class
# order has to match the order the loaded model was trained with - TODO confirm.
le = LabelEncoder()
df['type_encoded'] = le.fit_transform(df['type'])

NUM_CLASSES = len(le.classes_)
print(f"Number of classes: {NUM_CLASSES}")

# Lazily turn each non-missing document into a (token-id tensor, class id) pair ...
token_label_pairs = (
    (torch.tensor([vocab[tok] for tok in tokenizer(doc)], dtype=torch.long), cls_id)
    for doc, cls_id in zip(df['content'], df['type_encoded'])
    if pd.notna(doc)
)
# ... and keep only the pairs whose token sequence fits within MAX_LENGTH.
encoded_texts_and_labels = [
    pair for pair in token_label_pairs if len(pair[0]) <= MAX_LENGTH
]

# Split the pairs back into parallel containers.
encoded_texts = [token_ids for token_ids, _ in encoded_texts_and_labels]
label_indices = torch.tensor(
    [cls_id for _, cls_id in encoded_texts_and_labels],
    dtype=torch.long,
)

# One-hot float targets, one row per kept document.
labels = F.one_hot(label_indices, num_classes=NUM_CLASSES).float()

# Right-pad every sequence with the <pad> id up to the longest kept sequence.
padded_texts = pad_sequence(
    encoded_texts,
    batch_first=True,
    padding_value=vocab['<pad>'],
)

print(f"Filtered texts: {len(padded_texts)}, Labels: {labels.shape}")


Number of classes: 11
Filtered texts: 10610, Labels: torch.Size([10610, 11])


In [17]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


class NewsDataset(Dataset):
    """Map-style dataset pairing each input row with its label row.

    Args:
        X: Indexable, sized container of inputs (e.g. a tensor of padded
            token ids).
        y: Indexable, sized container of labels, aligned with ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of items.
    """

    def __init__(self, X, y):
        # Catch misaligned inputs here instead of failing with an IndexError
        # mid-iteration (or silently ignoring trailing labels).
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]
    

# Wrap the padded inputs and one-hot labels, then batch them in their stored
# order (no shuffling) for evaluation.
test_ds = NewsDataset(X=padded_texts, y=labels)
test_dl = DataLoader(dataset=test_ds, batch_size=32, shuffle=False)

In [18]:
from mulstage_model import CNN_BiLSTM
from torch import nn


# Pick the best available backend: CUDA first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

model = CNN_BiLSTM(
    vocab=vocab,
    vocab_size=len(vocab),
    embed_dim=100,
    hidden_dim=128,
    output_dim=NUM_CLASSES,
    pad_idx=vocab['<pad>'],
)
model.to(device)

print("done constructing model")

# map_location remaps the stored tensors onto the device chosen above, so a
# checkpoint saved on one accelerator still loads on a machine without it.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted
# source (or pass weights_only=True where the installed torch supports it).
model.load_state_dict(torch.load(model_file, map_location=device))


Using device: mps
done constructing model


<All keys matched successfully>

In [20]:
# Check the basic fake-vs-real separation accuracy.

# NOTE(review): sklearn's LabelEncoder stores classes_ in sorted order, so
# index 1 is the second class name in sorted order - confirm against
# le.classes_ that this really is the "reliable" class.
RELIABLE_IDX = 1


def evaluate_basic(model, loader, reliable_idx=None, device=None):
    """Return the binary "reliable vs. everything else" accuracy of ``model``.

    Predictions and one-hot labels are both collapsed to a boolean
    "is the reliable class" flag, and the fraction of samples on which the
    two flags agree is returned.

    Accuracy is computed as correct samples / total samples. Averaging the
    per-batch means instead would over-weight a smaller final batch.

    Args:
        model: ``torch.nn.Module`` returning per-class scores of shape
            (batch, classes).
        loader: Iterable yielding ``(inputs, one_hot_labels)`` batches.
        reliable_idx: Class index treated as "reliable". Defaults to the
            module-level ``RELIABLE_IDX``, looked up at call time.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU if the model has none).

    Returns:
        Accuracy as a float in [0, 1]; 0.0 if the loader yields no samples.
    """
    if reliable_idx is None:
        reliable_idx = RELIABLE_IDX
    if device is None:
        # Follow the model instead of depending on a notebook-global `device`.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            preds_real = model(xb).argmax(dim=1) == reliable_idx
            labels_real = yb.argmax(dim=1) == reliable_idx
            n_correct += (preds_real == labels_real).sum().item()
            n_seen += labels_real.numel()
    return n_correct / n_seen if n_seen else 0.0

# Run the binary reliable-vs-rest evaluation over the test loader and print
# the resulting accuracy with four decimals.
eval_basic = evaluate_basic(model, test_dl)
print(f"Basic evaluation accuracy: {eval_basic:.4f}")

Basic evaluation accuracy: 0.9457


In [21]:
def evaluate(model, loader, device=None):
    """Return the multi-class accuracy of ``model`` over ``loader``.

    The predicted class is the argmax of the model output and the true class
    is the argmax of the one-hot label row.

    Accuracy is computed as correct samples / total samples. Averaging the
    per-batch means instead would over-weight a smaller final batch.

    Args:
        model: ``torch.nn.Module`` returning per-class scores of shape
            (batch, classes).
        loader: Iterable yielding ``(inputs, one_hot_labels)`` batches.
        device: Device the batches are moved to. Defaults to the device of
            the model's first parameter (CPU if the model has none).

    Returns:
        Accuracy as a float in [0, 1]; 0.0 if the loader yields no samples.
    """
    if device is None:
        # Follow the model instead of depending on a notebook-global `device`.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            preds_class = model(xb).argmax(dim=1)
            labels_class = yb.argmax(dim=1)
            n_correct += (preds_class == labels_class).sum().item()
            n_seen += labels_class.numel()
    return n_correct / n_seen if n_seen else 0.0

# Exact-class (multi-class) accuracy over the test loader. The printed label
# says so explicitly, so it cannot be confused with the binary "basic" metric.
# NOTE: the result is stored under the existing name `eval_basic`, which
# overwrites the binary accuracy computed by the previous cell.
eval_basic = evaluate(model, test_dl)
print(f"Multi-class evaluation accuracy: {eval_basic:.4f}")

Basic evaluation accuracy: 0.8003
