In [1]:
import pandas as pd
import numpy as np

The datasets and trained models are available in this Google Drive folder: [Google Drive Link](https://drive.google.com/drive/folders/14Har_LxejVHaMf_sxMpqTThqUUR9CcSU?usp=drive_link)

https://drive.google.com/drive/folders/14Har_LxejVHaMf_sxMpqTThqUUR9CcSU?usp=drive_link

In [2]:
# --- Data locations and split sizes ---
dataset_dir = "datasets"
train_file = 'news_traindata'
test_file = 'news_testdata'

# 90% / 10% of 110000 rows. The products are floats (e.g. 99000.0), and that
# float formatting is what ends up in the artifact file names built below.
train_nrows = 110000 * 0.9
test_nrows = 110000 * 0.1

# --- Cached artifact paths (vocabulary, model weights, encoded tensors) ---
vocab_file = f'vocabs/vokab_{train_file}_{train_nrows}.pkl'
model_file = f"models/secondary_model_{train_file}_{train_nrows}.pth"
encoded_testfile = f"encoded/secondary_{test_file}_{test_nrows}.pt"
encoded_trainfile = f"encoded/secondary_{train_file}_{train_nrows}.pt"
label_encoding_file = f"encoded/secondary_label_encoding_{train_file}_{train_nrows}.pkl"

# --- Load both splits, discarding every row that contains a missing value ---
train_df = pd.read_csv(
    f'{dataset_dir}/{train_file}.csv', encoding='utf-8', nrows=train_nrows
).dropna()
test_df = pd.read_csv(
    f'{dataset_dir}/{test_file}.csv', encoding='utf-8', nrows=test_nrows
).dropna()

In [3]:
def _title_plus_content(frame):
    """Return ``frame``'s title and content joined by one space (NaN treated as '')."""
    return frame['title'].fillna('') + ' ' + frame['content'].fillna('')


# NOTE: 'content' is overwritten in place, so re-running this cell prepends the
# title a second time.
train_df['content'] = _title_plus_content(train_df)
test_df['content'] = _title_plus_content(test_df)

In [4]:
import os
import pickle
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from collections import Counter

# Tokenizer used both for building the vocabulary and for encoding texts.
tokenizer = get_tokenizer('basic_english')

# Reserved vocabulary entries: '<unk>' is the fallback for unknown tokens and
# '<pad>' is the value used to pad sequences.
special_tokens = ['<unk>', '<pad>']

# Upper bound on the vocabulary size, special tokens included.
MAX_VOCAB = 50000

def generate_vocabulary(df):
    """Build a size-capped vocabulary from ``df['content']`` and pickle it.

    Every non-missing entry of the ``content`` column is tokenized with the
    notebook-level ``tokenizer``; the ``MAX_VOCAB - len(special_tokens)`` most
    frequent tokens are kept and combined with ``special_tokens``. Unknown
    tokens map to the index of ``'<unk>'``.

    Args:
        df: DataFrame with a ``content`` column of texts.

    Returns:
        The torchtext vocabulary object, which is also written to
        ``vocab_file`` (the parent directory is created if needed).
    """
    token_counts = Counter()
    for text in df['content']:
        if pd.notna(text):
            token_counts.update(tokenizer(str(text)))

    kept_tokens = [token for token, _ in token_counts.most_common(MAX_VOCAB - len(special_tokens))]

    # Each kept token is passed exactly once; only membership matters here.
    vocab = build_vocab_from_iterator([kept_tokens], specials=special_tokens)
    vocab.set_default_index(vocab['<unk>'])

    # Create the output directory first so a fresh checkout without 'vocabs/'
    # does not fail after the (expensive) counting pass.
    os.makedirs(os.path.dirname(vocab_file) or '.', exist_ok=True)
    with open(vocab_file, 'wb') as f:
        pickle.dump(vocab, f)
    print(f"Vocabulary saved to '{vocab_file}'.")

    return vocab


# Build the vocabulary from the training split only when no pickled copy
# exists; otherwise reuse the cached one.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# vocabulary files from a trusted source.
if not os.path.exists(vocab_file):
    vocab = generate_vocabulary(train_df)
else:
    with open(vocab_file, 'rb') as f:
        vocab = pickle.load(f)
    print(f"Vocabulary loaded from {vocab_file}.")

Vocabulary loaded from vocabs/vokab_news_traindata_99000.0.pkl.


In [5]:
# Keep only the rows whose 'label' equals 1. .copy() makes each result an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
train_df = train_df[train_df['label'] == 1].copy()
test_df = test_df[test_df['label'] == 1].copy()

In [6]:
import torch
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F

# Maximum number of tokens a text may have to be kept when encoding.
MAX_LENGTH = 4096

# Reuse a previously pickled label encoder when present; otherwise fit one on
# the training split's 'type' column and persist it.
if os.path.exists(label_encoding_file):
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(label_encoding_file, 'rb') as f:
        le = pickle.load(f)
    print(f"Label encoder loaded from {label_encoding_file}.")
else:
    le = LabelEncoder()
    le.fit(train_df['type'])
    # Make sure the target directory exists before writing the pickle.
    os.makedirs(os.path.dirname(label_encoding_file) or '.', exist_ok=True)
    with open(label_encoding_file, 'wb') as f:
        pickle.dump(le, f)
    print(f"Label encoder saved to {label_encoding_file}.")

# assign() returns new frames, so this is safe even when train_df / test_df
# are filtered slices of another frame (no SettingWithCopyWarning).
# NOTE: le.transform raises ValueError for 'type' values not seen during fit.
train_df = train_df.assign(type_encoded=le.transform(train_df['type']))
test_df = test_df.assign(type_encoded=le.transform(test_df['type']))
NUM_CLASSES = len(le.classes_)

def encode_text_secondary_model(vocab, df, encoded_file, le):
    """Encode ``df['content']`` into padded index tensors with one-hot labels.

    If ``encoded_file`` already exists it is loaded and returned as-is; the
    cache is not checked against ``df`` or ``vocab``. Otherwise every
    non-missing text is tokenized with the notebook-level ``tokenizer`` and
    mapped through ``vocab``; texts longer than ``MAX_LENGTH`` tokens are
    dropped together with their labels. The result is saved to
    ``encoded_file`` (its parent directory is created if needed).

    Args:
        vocab: Token-to-index mapping; must contain ``'<pad>'``.
        df: DataFrame with ``content`` and ``type_encoded`` columns.
        encoded_file: Path of the cache file to read from / write to.
        le: Unused; kept for interface compatibility (labels are read from
            ``df['type_encoded']``).

    Returns:
        ``(inputs, labels)`` where ``inputs`` is a batch-first long tensor
        padded with ``vocab['<pad>']`` and ``labels`` is a float one-hot
        tensor with ``NUM_CLASSES`` columns.
    """
    if os.path.exists(encoded_file):
        data = torch.load(encoded_file)
        return data['inputs'], data['labels']

    encoded_texts = []
    labels = []

    for text, label in zip(df['content'], df['type_encoded']):
        if pd.isna(text):
            continue
        # str() mirrors generate_vocabulary so non-string cells are tokenized
        # the same way in both places instead of crashing the tokenizer.
        encoded = [vocab[token] for token in tokenizer(str(text))]

        if len(encoded) <= MAX_LENGTH:  # Filter long sequences
            encoded_texts.append(torch.tensor(encoded, dtype=torch.long))
            labels.append(label)

    encoded_texts = pad_sequence(encoded_texts, batch_first=True, padding_value=vocab['<pad>'])
    labels = torch.tensor(labels, dtype=torch.long)
    labels = F.one_hot(labels, num_classes=NUM_CLASSES).float()

    # torch.save does not create missing directories; do it here so the
    # encoding work is not lost on a fresh checkout.
    os.makedirs(os.path.dirname(encoded_file) or '.', exist_ok=True)
    torch.save({'inputs': encoded_texts, 'labels': labels}, encoded_file)

    return encoded_texts, labels

# Encode both splits (each call reuses its cache file when one exists).
train_encoded_texts, train_labels = encode_text_secondary_model(
    vocab=vocab, df=train_df, encoded_file=encoded_trainfile, le=le
)
test_encoded_texts, test_labels = encode_text_secondary_model(
    vocab=vocab, df=test_df, encoded_file=encoded_testfile, le=le
)

Label encoder loaded from encoded/secondary_label_encoding_news_traindata_99000.0.pkl.


In [7]:
# Sanity check: show the label vector of the first training example.
print(train_labels[0])

tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 0.])


In [8]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


class NewsDataset(Dataset):
    """Map-style dataset that pairs each input ``X[i]`` with its label ``y[i]``.

    ``X`` and ``y`` are stored as given and indexed in parallel; they are
    expected to have the same length.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        # The dataset size is taken from the inputs; ``y`` is not consulted.
        return len(self.X)

    def __getitem__(self, idx):
        features = self.X[idx]
        target = self.y[idx]
        return features, target


train_ds = NewsDataset(train_encoded_texts, train_labels)
# Validation uses the first 500 encoded examples of the test split.
val_ds = NewsDataset(test_encoded_texts[:500], test_labels[:500])

# Pinned host memory only helps host-to-CUDA transfers; on CPU/MPS PyTorch
# ignores the flag and emits a warning, so request it only when CUDA exists.
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True, pin_memory=torch.cuda.is_available(), num_workers=0)
val_dl = DataLoader(val_ds, batch_size=32, num_workers=0)

In [9]:
from mulstage_model import SecondaryModel
from torch import nn


# Pick the fastest available backend: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

print(f"Using device: {device}")

model = SecondaryModel(vocab=vocab, vocab_size=len(vocab), embed_dim=100, hidden_dim=128, output_dim=NUM_CLASSES, pad_idx=vocab['<pad>'])
model.to(device)

# The targets fed to this loss are float one-hot vectors, which
# CrossEntropyLoss accepts as class probabilities.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

Using device: mps


In [10]:
from tqdm import tqdm

def train(model, loader):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Uses the notebook-level ``device``, ``optimizer`` and ``criterion``. The
    returned value is the sum of per-batch losses divided by the number of
    batches.
    """
    model.train()
    running_loss = 0
    for inputs, targets in tqdm(loader, desc="Training", leave=False):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)

def evaluate(model, loader):
    """Return the accuracy of ``model`` over all examples yielded by ``loader``.

    Labels are expected as one-hot (or score) vectors; both predictions and
    labels are reduced with ``argmax(dim=1)``. Accuracy is computed as
    correct / total over every example, so a smaller final batch is not
    over-weighted the way an average of per-batch accuracies would be.
    Uses the notebook-level ``device``.

    Returns:
        Fraction of correctly classified examples, or ``0.0`` when the loader
        yields no examples.
    """
    model.eval()
    correct = 0
    seen = 0
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            preds_class = model(xb).argmax(dim=1)
            labels_class = yb.argmax(dim=1)
            correct += (preds_class == labels_class).sum().item()
            seen += labels_class.size(0)

    # Guard against an empty loader instead of dividing by zero.
    return correct / seen if seen else 0.0

print("training")

# Train for 10 epochs; after each one report the mean training loss returned
# by train() and the validation accuracy returned by evaluate().
for epoch in range(10):
    loss = train(model, train_dl)
    acc = evaluate(model, val_dl)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}, Val Acc: {acc:.4f}")

training


                                                            

KeyboardInterrupt: 

In [None]:
# Persist only the learned parameters (state_dict). torch.save does not create
# missing directories, so make sure the target directory exists first.
os.makedirs(os.path.dirname(model_file) or '.', exist_ok=True)
torch.save(model.state_dict(), model_file)