## Multilevel Classification

In [18]:
# import necessary libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset   


from tqdm import tqdm

In [5]:
train_df = pd.read_csv('data/ModApte_train.csv')
test_df= pd.read_csv('data/ModApte_test.csv')

In [7]:
train_df.head()

Unnamed: 0,text,text_type,topics,lewis_split,cgis_split,old_id,new_id,places,people,orgs,exchanges,date,title
0,Showers continued throughout the week in\nthe ...,"""NORM""",['cocoa'],"""TRAIN""","""TRAINING-SET""","""5544""","""1""",['el-salvador' 'usa' 'uruguay'],[],[],[],26-FEB-1987 15:01:01.79,BAHIA COCOA REVIEW
1,The U.S. Agriculture Department\nreported the ...,"""NORM""",['grain' 'wheat' 'corn' 'barley' 'oat' 'sorghum'],"""TRAIN""","""TRAINING-SET""","""5548""","""5""",['usa'],[],[],[],26-FEB-1987 15:10:44.60,NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE
2,Argentine grain board figures show\ncrop regis...,"""NORM""",['veg-oil' 'linseed' 'lin-oil' 'soy-oil' 'sun-...,"""TRAIN""","""TRAINING-SET""","""5549""","""6""",['argentina'],[],[],[],26-FEB-1987 15:14:36.41,ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS
3,Moody's Investors Service Inc said it\nlowered...,"""NORM""",[],"""TRAIN""","""TRAINING-SET""","""5551""","""8""",['usa'],[],[],[],26-FEB-1987 15:15:40.12,USX &lt;X> DEBT DOWGRADED BY MOODY'S
4,Champion Products Inc said its\nboard of direc...,"""NORM""",['earn'],"""TRAIN""","""TRAINING-SET""","""5552""","""9""",['usa'],[],[],[],26-FEB-1987 15:17:11.20,CHAMPION PRODUCTS &lt;CH> APPROVES STOCK SPLIT


In [None]:
train_df['topics'] = train_df['topics'].apply(lambda x: x.split('/'))
test_df['topics'] = test_df['topics'].apply(lambda x: x.split('/'))

In [None]:
# prepare multi-label binarizer
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_df['topics'])
y_test = mlb.transform(test_df['topics'])
num_classes = len(mlb.classes_)
print(f"Number of unique classes: {num_classes}")

Number of unique classes: 473




In [None]:
# Load Glove embeddings
embeddings_index = {}
with open("data/glove.6B.100d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print(f"Loaded {len(embeddings_index)} word vectors.")

Loaded 400000 word vectors.


In [20]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [21]:
# Text to Glove embedding vectors
def text_to_glove_vector(text):
    words = [w for w in text.lower().split() if w not in ENGLISH_STOP_WORDS]
    vectors = [embeddings_index[w] for w in words if w in embeddings_index]
    if len(vectors) == 0:
        return np.zeros(100)
    return np.mean(vectors, axis=0)

In [22]:
X_train = np.array([text_to_glove_vector(text) for text in tqdm(str(train_df['text']))])
X_test = np.array([text_to_glove_vector(text) for text in tqdm(str(test_df['text']))])


100%|██████████| 677/677 [00:00<00:00, 44636.39it/s]

100%|██████████| 677/677 [00:00<00:00, 115175.79it/s]


In [23]:
class ReutersDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.float32), torch.tensor(self.y[idx], dtype=torch.float32)

In [24]:
train_dataset = ReutersDataset(X_train, y_train)
test_dataset = ReutersDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

In [26]:
# Define simple MLP model
class MultiLabelMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MultiLabelMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

model = MultiLabelMLP(input_dim=100, hidden_dim=128, output_dim=num_classes)

In [30]:
creterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# make preadictions and check loss
model.eval()
with torch.no_grad():
    total_loss = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        loss = creterion(outputs, labels)
        total_loss += loss.item()
    print(f"Test Loss: {total_loss / len(test_loader)}") 

Test Loss: 0.6956346305933866
