In [1]:
from torchtext import data, datasets
import torch
from sklearn.model_selection import train_test_split
import random
import numpy as np
import string
from collections import defaultdict
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import nltk
from nltk.corpus import stopwords
import seaborn as sns

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
from collections import defaultdict, Counter
import string

In [3]:
from sklearn.model_selection import KFold

In [4]:
# For tokenization
TEXT = data.Field(tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    batch_first=True,
    fix_length=50,
    lower=True,
    unk_token='<unk>',
    pad_token='<pad>'
    )

# For multi-class classification labels
LABEL = data.LabelField()

# Load the TREC dataset
train_data, test_data = datasets.TREC.splits(TEXT, LABEL, fine_grained=False)

downloading train_5500.label


.data\trec\train_5500.label: 100%|██████████| 336k/336k [00:01<00:00, 276kB/s]  


downloading TREC_10.label


.data\trec\TREC_10.label: 100%|██████████| 23.4k/23.4k [00:00<00:00, 107kB/s] 


In [5]:
label_set = set()
for i in train_data:
    label_set.add(i.label)

print(label_set)

{'ABBR', 'LOC', 'ENTY', 'NUM', 'DESC', 'HUM'}


In [6]:
TEXT.build_vocab(train_data, vectors="glove.6B.100d")
LABEL.build_vocab(train_data)

.vector_cache\glove.6B.zip: 862MB [06:11, 2.32MB/s]                               
100%|█████████▉| 399999/400000 [00:10<00:00, 37352.49it/s]


In [7]:
embedding_dim = 100
vocab = TEXT.vocab
vectors = vocab.vectors.clone()
glove_vocab = set(vocab.stoi.keys())

# Subword parameters
ngram_min = 3
ngram_max = 6

# Count word frequency in training data
word_counter = Counter()
for example in train_data:
    word_counter.update([w.lower().strip(string.punctuation) for w in example.text])

# Threshold to consider a word “frequent” (adjustable)
freq_threshold = 3

# <unk> vector
unk_vector = torch.zeros(embedding_dim)

def get_subwords(word, n_min=3, n_max=6):
    word = f"<{word.lower()}>"
    subwords = []
    for n in range(n_min, n_max+1):
        subwords += [word[i:i+n] for i in range(len(word)-n+1)]
    return subwords

def get_word_vector(word):
    w_clean = word.lower().strip(string.punctuation)
    
    if w_clean in glove_vocab:
        return vectors[vocab.stoi[w_clean]]
    
    # Subword averaging
    subwords = get_subwords(w_clean, ngram_min, ngram_max)
    subword_vecs = [vectors[vocab.stoi[sg]] for sg in subwords if sg in glove_vocab]
    if subword_vecs:
        return torch.stack(subword_vecs).mean(0)
    
    # Random vector for frequent OOVs
    if word_counter[w_clean] >= freq_threshold:
        return torch.randn(embedding_dim)
    
    # <unk> for rare OOVs
    return unk_vector

# Build embedding matrix
embedding_matrix = {}
for example in train_data:
    for w in example.text:
        if w not in embedding_matrix:
            embedding_matrix[w] = get_word_vector(w)

In [8]:
embedding_dim = 100
vocab = TEXT.vocab

# Create tensor for nn.Embedding
embedding_matrix_tensor = torch.zeros(len(vocab), embedding_dim)
for word, idx in vocab.stoi.items():
    if word in embedding_matrix:
        embedding_matrix_tensor[idx] = embedding_matrix[word]

# Create embedding layer (learnable)
embedding_layer = nn.Embedding.from_pretrained(embedding_matrix_tensor, freeze=False)
print("Embedding layer created with shape:", embedding_layer.weight.shape)


Embedding layer created with shape: torch.Size([8536, 100])


In [None]:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

Using device: cpu


In [19]:
import itertools


In [13]:
# -----------------------------
# 1. Setup & Config
# -----------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assume TEXT, LABEL, train_data, test_data, embedding_layer already exist
train_data, valid_data = train_data.split(split_ratio=0.9, random_state=random.seed(SEED))
print(f"Train: {len(train_data)}, Valid: {len(valid_data)}, Test: {len(test_data)}")

PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
embedding_dim = embedding_layer.weight.size(1)

Train: 4416, Valid: 491, Test: 500


In [14]:
# -----------------------------
# 2. Model Definition
# -----------------------------
class RNNClassifier(nn.Module):
    def __init__(self, embedding_layer, hidden_dim, num_layers, bidirectional, dropout, num_classes):
        super().__init__()
        self.embedding = embedding_layer
        emb_dim = self.embedding.weight.size(1)
        self.rnn = nn.GRU(
            emb_dim, hidden_dim, num_layers=num_layers,
            batch_first=True, bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0.0
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), num_classes)

    def forward(self, text):
        x = self.embedding(text)
        outputs, hidden = self.rnn(x)
        if self.rnn.bidirectional:
            hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            hidden = hidden[-1]
        out = self.dropout(hidden)
        return self.fc(out)

In [16]:
# -----------------------------
# 2. Model Definition
# -----------------------------
class RNNClassifier(nn.Module):
    def __init__(self, embedding_layer, hidden_dim, num_layers, bidirectional, dropout, num_classes):
        super().__init__()
        self.embedding = embedding_layer
        emb_dim = self.embedding.weight.size(1)
        self.rnn = nn.GRU(
            emb_dim, hidden_dim, num_layers=num_layers,
            batch_first=True, bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0.0
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), num_classes)

    def forward(self, text):
        x = self.embedding(text)
        outputs, hidden = self.rnn(x)
        if self.rnn.bidirectional:
            hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            hidden = hidden[-1]
        out = self.dropout(hidden)
        return self.fc(out)

In [17]:
# -----------------------------
# 3. Training Function
# -----------------------------
def epoch_run(iterator, model, criterion, optimizer=None):
    model.train(optimizer is not None)
    total_loss, total_correct, total_count = 0, 0, 0

    for batch in iterator:
        text, labels = batch.text, batch.label
        logits = model(text)
        loss = criterion(logits, labels)

        if optimizer:
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()

        total_loss += loss.item() * labels.size(0)
        preds = logits.argmax(1)
        total_correct += (preds == labels).sum().item()
        total_count += labels.size(0)

    return total_loss / total_count, total_correct / total_count


In [None]:
# -----------------------------
# 4. Grid Search Loop
# -----------------------------
HIDDEN_DIM = 128
NUM_LAYERS = 1
BIDIRECTIONAL = True
DROPOUT = 0.3
NUM_CLASSES = len(LABEL.vocab)
EPOCHS = 10   # keep short for tuning
PATIENT_STOP = 2

batch_sizes = [32, 64, 128]
learning_rates = [1e-3, 2e-3, 3e-3, 5e-4]
results = []

for batch_size, lr in itertools.product(batch_sizes, learning_rates):
    print(f"\n🔍 Testing BATCH={batch_size}, LR={lr}")
    # Build iterators
    train_iter, valid_iter = data.BucketIterator.splits(
        (train_data, valid_data),
        batch_size=batch_size,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        device=device
    )

    # Rebuild embedding layer (fresh weights each trial)
    emb_layer = nn.Embedding.from_pretrained(
        embedding_layer.weight.data.clone(),
        freeze=False, padding_idx=PAD_IDX
    )

    model = RNNClassifier(
        embedding_layer=emb_layer,
        hidden_dim=HIDDEN_DIM,
        num_layers=NUM_LAYERS,
        bidirectional=BIDIRECTIONAL,
        dropout=DROPOUT,
        num_classes=NUM_CLASSES
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    best_val_acc = 0.0
    patience_counter = 0

    for epoch in range(EPOCHS):
        train_loss, train_acc = epoch_run(train_iter, model, criterion, optimizer)
        val_loss, val_acc = epoch_run(valid_iter, model, criterion)
        print(f"Epoch {epoch+1:02d}: train_acc={train_acc:.4f}, val_acc={val_acc:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= PATIENT_STOP:
                break

    results.append({
        "batch_size": batch_size,
        "lr": lr,
        "val_acc": best_val_acc
    })



🔍 Testing BATCH=32, LR=0.001
Epoch 01: train_acc=0.6689, val_acc=0.8513
Epoch 02: train_acc=0.8678, val_acc=0.8758
Epoch 03: train_acc=0.9201, val_acc=0.9002
Epoch 04: train_acc=0.9536, val_acc=0.8982
Epoch 05: train_acc=0.9764, val_acc=0.9124
Epoch 06: train_acc=0.9851, val_acc=0.9104
Epoch 07: train_acc=0.9914, val_acc=0.9145
Epoch 08: train_acc=0.9950, val_acc=0.8941
Epoch 09: train_acc=0.9964, val_acc=0.9022

🔍 Testing BATCH=32, LR=0.002
Epoch 01: train_acc=0.7335, val_acc=0.8656


In [None]:
# -----------------------------
# 5. Display Results
# -----------------------------
import pandas as pd
df_results = pd.DataFrame(results)
best_row = df_results.loc[df_results["val_acc"].idxmax()]
print("\n🏆 Best configuration:")
print(best_row)
print("\nAll results:\n", df_results.sort_values("val_acc", ascending=False))