In [70]:
import pandas as pd
import numpy as np

In [None]:
dataset_dir = 'datasets'
file = 'news_data_bin' # 'WELFake_Dataset.csv'
nrows = 10000
vocab_file = f'vocabs/vokab_{file}_{nrows}.pkl'
model_file = f"models/stage2_model_{file}_{nrows}.pth"

df = pd.read_csv(f'{dataset_dir}/{file}.csv', encoding='utf-8', nrows=nrows)
df = df.dropna()
df = df[df['label'] == 1]

df.head()

Unnamed: 0,type,title,content,label
0,junksci,First Certified Organic Fast Food Restaurant t...,by ARIANA MARISOL\n\nFast food restaurants are...,1
2,hate,UCLA Student Is Charged With Attempted Murder ...,A UCLA student allegedly stabbed a classmate f...,1
4,hate,National Vanguard,Transcript by Katana IT HAS NOW become absolut...,1
5,conspiracy,Truth Broadcast Network,448 Views0 Likes\n\nEurope has again been forc...,1
6,fake,Meet China’s Morlocks: 1 Million Beijing Resid...,Meet China’s Morlocks: 1 Million Beijing Resid...,1


In [None]:
import os
import pickle
from torchtext.data.utils import get_tokenizer
from collections import Counter

MAX_VOCAB = 25000
special_tokens = ['<unk>', '<pad>']
tokenizer = get_tokenizer('basic_english')


if os.path.exists(vocab_file):
    with open(vocab_file, 'rb') as f:
        vocab = pickle.load(f)
    print("Vocabulary loaded from 'vocab.pkl'.")

else:
    print("Unable to load vocab.")
    


Vocabulary loaded from 'vocab.pkl'.


In [73]:
import torch
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F

MAX_LENGTH = 2048

# Label encode the 'type' column
le = LabelEncoder()
df['type_encoded'] = le.fit_transform(df['type'])

NUM_CLASSES = len(le.classes_)  # Number of unique classes

encoded_texts_and_labels = []
for text, label in zip(df['content'], df['type_encoded']):
    if pd.notna(text):
        encoded = [vocab[token] for token in tokenizer(text)]
        
        if len(encoded) <= MAX_LENGTH:
            encoded_texts_and_labels.append((torch.tensor(encoded, dtype=torch.long), label))

# Separate encoded texts and labels
encoded_texts = [item[0] for item in encoded_texts_and_labels]
label_indices = torch.tensor([item[1] for item in encoded_texts_and_labels], dtype=torch.long)

# Create one-hot encoded labels
labels = F.one_hot(label_indices, num_classes=NUM_CLASSES).float()

# Pad sequences
padded_texts = pad_sequence(encoded_texts, batch_first=True, padding_value=vocab['<pad>'])

print(f"Filtered texts: {len(padded_texts)}, Labels: {labels.shape}")


Filtered texts: 8755, Labels: torch.Size([8755, 10])


In [74]:
df.head()
print(labels[0])

tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])


In [75]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


class NewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

X_train, X_val, y_train, y_val = train_test_split(padded_texts, labels, test_size=0.1)

train_ds = NewsDataset(X_train, y_train)
val_ds = NewsDataset(X_val, y_val)

train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, pin_memory=True, num_workers=0)
val_dl = DataLoader(val_ds, batch_size=32, num_workers=0)

In [76]:
from mulstage_model import CNN_BiLSTM, FakeNewsClassifier
import torch.nn as nn

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(f"Using device: {device}")
sec_model = FakeNewsClassifier(input_dim= 100 * 3 + 128 * 2, num_classes=NUM_CLASSES)
sec_model.to(device)

pri_model = CNN_BiLSTM(vocab=vocab, vocab_size=len(vocab), embed_dim=100, hidden_dim=128, output_dim=1, pad_idx=vocab['<pad>'])
pri_model.to(device)

pri_model.load_state_dict(torch.load(f"model_{file}_{nrows}.pth", map_location=torch.device('cpu')))

print("done constructing model")

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(sec_model.parameters(), lr=1e-3)   

print("done constructing optimizer")
import torch
from tqdm import tqdm

# Modify the extract_features function to collect all features
def precompute_features(model, dataloader):
    """
    Precompute features from the primary model for all samples in the dataloader
    Returns:
        all_features: tensor of shape [num_samples, feature_dim]
        all_labels: tensor of shape [num_samples]
    """
    model.eval()  # Set model to evaluation mode
    all_features = []
    all_labels = []
    
    with torch.no_grad():
        for xb, yb in tqdm(dataloader, desc="Precomputing features"):
            xb, yb = xb.to(device), yb.to(device)
            features = model.extract_features(xb)
            all_features.append(features)
            all_labels.append(yb)
    
    # Concatenate all features and labels
    all_features = torch.cat(all_features, dim=0)
    all_labels = torch.cat(all_labels, dim=0)
    
    return all_features, all_labels

# Create a dataset class for precomputed features
class PrecomputedFeatureDataset(torch.utils.data.Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

# Precompute features for training and validation sets
print("Precomputing features for training set...")
train_features, train_labels = precompute_features(pri_model, train_dl)
print(f"Training features shape: {train_features.shape}")

print("Precomputing features for validation set...")
val_features, val_labels = precompute_features(pri_model, val_dl)
print(f"Validation features shape: {val_features.shape}")

# Create datasets and dataloaders for precomputed features
train_feature_dataset = PrecomputedFeatureDataset(train_features, train_labels)
val_feature_dataset = PrecomputedFeatureDataset(val_features, val_labels)

# Define batch size for the new dataloader (can be different than original)
batch_size = 64  # Adjust according to your needs

train_feature_dl = torch.utils.data.DataLoader(
    train_feature_dataset, 
    batch_size=batch_size,
    shuffle=True
)

val_feature_dl = torch.utils.data.DataLoader(
    val_feature_dataset, 
    batch_size=batch_size,
    shuffle=False
)

# Now modify the training function to use precomputed features
def train_with_precomputed(sec_model, loader):
    sec_model.train()
    total_loss = 0
    
    for features, labels in tqdm(loader, desc="Training", leave=False):
        features, labels = features.to(device), labels.to(device)
        
        optimizer.zero_grad()
        preds = sec_model(features)
        loss = criterion(preds, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(loader)

def evaluate_with_precomputed(sec_model, loader):
    sec_model.eval()
    total_acc = 0
    
    with torch.no_grad():
        for features, labels in loader:
            features, labels = features.to(device), labels.to(device)
            
            preds = sec_model(features)
            preds_class = preds.argmax(dim=1)
            labels_class = labels.argmax(dim=1)
            
            total_acc += (preds_class == labels_class).float().mean().item()
    
    return total_acc / len(loader)



Using device: mps
done constructing model
done constructing optimizer
Precomputing features for training set...


Precomputing features: 100%|██████████| 247/247 [00:59<00:00,  4.14it/s]


Training features shape: torch.Size([7879, 556])
Precomputing features for validation set...


Precomputing features: 100%|██████████| 28/28 [00:06<00:00,  4.01it/s]

Validation features shape: torch.Size([876, 556])





In [77]:

# Training loop using precomputed features
print("Training with precomputed features")
for epoch in range(200):
    loss = train_with_precomputed(sec_model, train_feature_dl)
    acc = evaluate_with_precomputed(sec_model, val_feature_dl)
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss:.4f}, Val Acc: {acc:.4f}")


Training with precomputed features


                                                            

Epoch 10, Loss: 1.1819, Val Acc: 0.5984


                                                            

Epoch 20, Loss: 1.0900, Val Acc: 0.6185


                                                            

Epoch 30, Loss: 1.0375, Val Acc: 0.6174


                                                            

Epoch 40, Loss: 1.0011, Val Acc: 0.6280


                                                           

Epoch 50, Loss: 0.9839, Val Acc: 0.6318


                                                           

Epoch 60, Loss: 0.9653, Val Acc: 0.6352


                                                            

Epoch 70, Loss: 0.9630, Val Acc: 0.6253


                                                            

Epoch 80, Loss: 0.9528, Val Acc: 0.6213


                                                            

Epoch 90, Loss: 0.9401, Val Acc: 0.6164


                                                            

Epoch 100, Loss: 0.9342, Val Acc: 0.6253


                                                            

Epoch 110, Loss: 0.9330, Val Acc: 0.6114


                                                            

Epoch 120, Loss: 0.9213, Val Acc: 0.6175


                                                            

Epoch 130, Loss: 0.9166, Val Acc: 0.6148


                                                            

Epoch 140, Loss: 0.9139, Val Acc: 0.6125


                                                            

Epoch 150, Loss: 0.9158, Val Acc: 0.6102


                                                            

Epoch 160, Loss: 0.9191, Val Acc: 0.6197


                                                            

Epoch 170, Loss: 0.9056, Val Acc: 0.6225


                                                            

Epoch 180, Loss: 0.9119, Val Acc: 0.6108


                                                            

Epoch 190, Loss: 0.9063, Val Acc: 0.6170


                                                            

KeyboardInterrupt: 