In [1]:
from torchtext import data, datasets
import torch
from sklearn.model_selection import train_test_split
import random
import numpy as np
import string
from collections import defaultdict
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import nltk
from nltk.corpus import stopwords
import seaborn as sns

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
from collections import defaultdict, Counter
import string

In [12]:
from sklearn.model_selection import KFold

In [None]:
# For tokenization
TEXT = data.Field(tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    batch_first=True,
    fix_length=50,
    lower=True,
    unk_token='<unk>',
    pad_token='<pad>'
    )

# For multi-class classification labels
LABEL = data.LabelField()

# Load the TREC dataset
train_data, test_data = datasets.TREC.splits(TEXT, LABEL, fine_grained=False)

downloading train_5500.label


.data\trec\train_5500.label: 100%|██████████| 336k/336k [00:01<00:00, 243kB/s]  


downloading TREC_10.label


.data\trec\TREC_10.label: 100%|██████████| 23.4k/23.4k [00:00<00:00, 92.5kB/s]


In [4]:
label_set = set()
for i in train_data:
    label_set.add(i.label)

print(label_set)

{'LOC', 'ABBR', 'DESC', 'HUM', 'NUM', 'ENTY'}


In [5]:
TEXT.build_vocab(train_data, vectors="glove.6B.100d")
LABEL.build_vocab(train_data)

.vector_cache\glove.6B.zip: 862MB [06:20, 2.27MB/s]                               
100%|█████████▉| 399999/400000 [00:36<00:00, 10980.60it/s]


In [6]:
embedding_dim = 100
vocab = TEXT.vocab
vectors = vocab.vectors.clone()
glove_vocab = set(vocab.stoi.keys())

# Subword parameters
ngram_min = 3
ngram_max = 6

# Count word frequency in training data
word_counter = Counter()
for example in train_data:
    word_counter.update([w.lower().strip(string.punctuation) for w in example.text])

# Threshold to consider a word “frequent” (adjustable)
freq_threshold = 3

# <unk> vector
unk_vector = torch.zeros(embedding_dim)

def get_subwords(word, n_min=3, n_max=6):
    word = f"<{word.lower()}>"
    subwords = []
    for n in range(n_min, n_max+1):
        subwords += [word[i:i+n] for i in range(len(word)-n+1)]
    return subwords

def get_word_vector(word):
    w_clean = word.lower().strip(string.punctuation)
    
    if w_clean in glove_vocab:
        return vectors[vocab.stoi[w_clean]]
    
    # Subword averaging
    subwords = get_subwords(w_clean, ngram_min, ngram_max)
    subword_vecs = [vectors[vocab.stoi[sg]] for sg in subwords if sg in glove_vocab]
    if subword_vecs:
        return torch.stack(subword_vecs).mean(0)
    
    # Random vector for frequent OOVs
    if word_counter[w_clean] >= freq_threshold:
        return torch.randn(embedding_dim)
    
    # <unk> for rare OOVs
    return unk_vector

# Build embedding matrix
embedding_matrix = {}
for example in train_data:
    for w in example.text:
        if w not in embedding_matrix:
            embedding_matrix[w] = get_word_vector(w)

In [7]:
embedding_dim = 100
vocab = TEXT.vocab

# Create tensor for nn.Embedding
embedding_matrix_tensor = torch.zeros(len(vocab), embedding_dim)
for word, idx in vocab.stoi.items():
    if word in embedding_matrix:
        embedding_matrix_tensor[idx] = embedding_matrix[word]

# Create embedding layer (learnable)
embedding_layer = nn.Embedding.from_pretrained(embedding_matrix_tensor, freeze=False)
print("Embedding layer created with shape:", embedding_layer.weight.shape)


Embedding layer created with shape: torch.Size([8536, 100])


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [8]:
# Dataset class
class TrecDataset(Dataset):
    def __init__(self, examples, vocab, label_vocab, max_len=50):
        self.vocab = vocab
        self.label_vocab = label_vocab
        self.max_len = max_len
        self.data = []
        self.labels = []

        for ex in examples:
            indices = [vocab.stoi[w] for w in ex.text if w in vocab.stoi]
            # Pad/truncate
            if len(indices) < max_len:
                indices += [vocab.stoi['<pad>']] * (max_len - len(indices))
            else:
                indices = indices[:max_len]
            self.data.append(indices)
            self.labels.append(label_vocab.stoi[ex.label])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx]), torch.tensor(self.labels[idx])


In [9]:
class RNNClassifier(nn.Module):
    def __init__(self, embedding_layer, hidden_dim, output_dim, rnn_type='RNN', dropout=0.0):
        super().__init__()
        self.embedding = embedding_layer
        self.dropout = nn.Dropout(dropout)
        if rnn_type == 'RNN':
            self.rnn = nn.RNN(embedding_layer.embedding_dim, hidden_dim, batch_first=True)
        elif rnn_type == 'LSTM':
            self.rnn = nn.LSTM(embedding_layer.embedding_dim, hidden_dim, batch_first=True)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(embedding_layer.embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        embedded = self.embedding(x)  # (batch, seq_len, embed_dim)
        output, hidden = self.rnn(embedded)
        if isinstance(hidden, tuple):  # LSTM returns (h, c)
            hidden = hidden[0]
        sentence_repr = hidden.squeeze(0)
        sentence_repr = self.dropout(sentence_repr)
        logits = self.fc(sentence_repr)
        return logits


In [11]:
def train_epoch(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0
    for batch_x, batch_y in dataloader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        logits = model(batch_x)
        loss = criterion(logits, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            logits = model(batch_x)
            preds = torch.argmax(logits, dim=1)
            correct += (preds == batch_y).sum().item()
            total += batch_y.size(0)
    return correct / total


In [14]:
learning_rates = [1e-3, 5e-4, 1e-4, 5e-3, 1e-2]
batch_sizes = [16, 32, 64, 128, 256]
max_len = 50
hidden_dim = 128
num_classes = len(LABEL.vocab)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
best_acc = 0
best_params = {}

for lr in learning_rates:
    print(f"\n=== Testing Learning Rate: {lr} ===")
    for bs in batch_sizes:
        print(f"\n--- Batch Size: {bs} ---")
        fold_acc = []
        for fold_idx, (train_idx, val_idx) in enumerate(kf.split(train_data.examples)):
            print(f"\n>> Fold {fold_idx+1} / {kf.n_splits}")
            # Prepare datasets
            train_subset = [train_data.examples[i] for i in train_idx]
            val_subset = [train_data.examples[i] for i in val_idx]

            train_dataset = TrecDataset(train_subset, vocab, LABEL.vocab, max_len)
            val_dataset = TrecDataset(val_subset, vocab, LABEL.vocab, max_len)
            train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=bs)

            # Initialize model
            model = RNNClassifier(embedding_layer, hidden_dim, num_classes).to(device)
            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            criterion = nn.CrossEntropyLoss()

            # --- Training per fold ---
            num_epochs = 50
            patience = 5
            epochs_no_improve = 0
            best_fold_acc = 0

            for epoch in range(num_epochs):
                train_loss = train_epoch(model, train_loader, optimizer, criterion)
                val_acc = evaluate(model, val_loader)
                print(f"Epoch {epoch+1}/{num_epochs} - Val Acc: {val_acc:.4f}")

                
                # Early stopping
                if val_acc > best_fold_acc:
                    best_fold_acc = val_acc
                    epochs_no_improve = 0
                else:
                    epochs_no_improve += 1
                    if epochs_no_improve >= patience:
                        print(f"Early stopping at epoch {epoch+1} for fold")
                        break
            print(f"Best Val Acc for Fold {fold_idx+1}: {best_fold_acc:.4f}")
            fold_acc.append(best_fold_acc)  # record best val acc for this fold

        avg_acc = np.mean(fold_acc)
        print(f"LR={lr}, BS={bs}, Avg CV Acc={avg_acc:.4f}")
        if avg_acc > best_acc:
            best_acc = avg_acc
            best_params = {'lr': lr, 'batch_size': bs}

print(f"\n=== Best Hyperparameters Found ===")
print(f"Learning Rate: {best_params['lr']}, Batch Size: {best_params['batch_size']}")
print(f"CV Accuracy: {best_acc:.4f}")


=== Testing Learning Rate: 0.001 ===

--- Batch Size: 16 ---

>> Fold 1 / 5
Epoch 1/50 - Val Acc: 0.2081
Epoch 2/50 - Val Acc: 0.2255
Epoch 3/50 - Val Acc: 0.2200
Epoch 4/50 - Val Acc: 0.2200
Epoch 5/50 - Val Acc: 0.2081
Epoch 6/50 - Val Acc: 0.2255
Epoch 7/50 - Val Acc: 0.2273
Epoch 8/50 - Val Acc: 0.2081
Epoch 9/50 - Val Acc: 0.2200
Epoch 10/50 - Val Acc: 0.2255
Epoch 11/50 - Val Acc: 0.2200
Epoch 12/50 - Val Acc: 0.2255
Early stopping at epoch 12 for fold
Best Val Acc for Fold 1: 0.2273

>> Fold 2 / 5
Epoch 1/50 - Val Acc: 0.2328
Epoch 2/50 - Val Acc: 0.2420
Epoch 3/50 - Val Acc: 0.2420
Epoch 4/50 - Val Acc: 0.2026
Epoch 5/50 - Val Acc: 0.2420
Epoch 6/50 - Val Acc: 0.2026
Epoch 7/50 - Val Acc: 0.2026
Early stopping at epoch 7 for fold
Best Val Acc for Fold 2: 0.2420

>> Fold 3 / 5
Epoch 1/50 - Val Acc: 0.2202
Epoch 2/50 - Val Acc: 0.2202
Epoch 3/50 - Val Acc: 0.1954
Epoch 4/50 - Val Acc: 0.2202
Epoch 5/50 - Val Acc: 0.2202
Epoch 6/50 - Val Acc: 0.2202
Early stopping at epoch 6 for 

KeyboardInterrupt: 