In [1]:
from torchtext import data, datasets
import torch
from sklearn.model_selection import train_test_split
import random
import numpy as np
import string
from collections import defaultdict
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import nltk
from nltk.corpus import stopwords
import seaborn as sns

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
from collections import defaultdict, Counter
import string

In [3]:
from sklearn.model_selection import KFold

In [4]:
# For tokenization
TEXT = data.Field(tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    batch_first=True,
    fix_length=50,
    lower=True,
    unk_token='<unk>',
    pad_token='<pad>'
    )

# For multi-class classification labels
LABEL = data.LabelField()

# Load the TREC dataset
train_data, test_data = datasets.TREC.splits(TEXT, LABEL, fine_grained=False)

downloading train_5500.label


.data\trec\train_5500.label: 100%|██████████| 336k/336k [00:01<00:00, 276kB/s]  


downloading TREC_10.label


.data\trec\TREC_10.label: 100%|██████████| 23.4k/23.4k [00:00<00:00, 107kB/s] 


In [5]:
label_set = set()
for i in train_data:
    label_set.add(i.label)

print(label_set)

{'ABBR', 'LOC', 'ENTY', 'NUM', 'DESC', 'HUM'}


In [6]:
TEXT.build_vocab(train_data, vectors="glove.6B.100d")
LABEL.build_vocab(train_data)

.vector_cache\glove.6B.zip: 862MB [06:11, 2.32MB/s]                               
100%|█████████▉| 399999/400000 [00:10<00:00, 37352.49it/s]


In [7]:
embedding_dim = 100
vocab = TEXT.vocab
vectors = vocab.vectors.clone()
glove_vocab = set(vocab.stoi.keys())

# Subword parameters
ngram_min = 3
ngram_max = 6

# Count word frequency in training data
word_counter = Counter()
for example in train_data:
    word_counter.update([w.lower().strip(string.punctuation) for w in example.text])

# Threshold to consider a word “frequent” (adjustable)
freq_threshold = 3

# <unk> vector
unk_vector = torch.zeros(embedding_dim)

def get_subwords(word, n_min=3, n_max=6):
    word = f"<{word.lower()}>"
    subwords = []
    for n in range(n_min, n_max+1):
        subwords += [word[i:i+n] for i in range(len(word)-n+1)]
    return subwords

def get_word_vector(word):
    w_clean = word.lower().strip(string.punctuation)
    
    if w_clean in glove_vocab:
        return vectors[vocab.stoi[w_clean]]
    
    # Subword averaging
    subwords = get_subwords(w_clean, ngram_min, ngram_max)
    subword_vecs = [vectors[vocab.stoi[sg]] for sg in subwords if sg in glove_vocab]
    if subword_vecs:
        return torch.stack(subword_vecs).mean(0)
    
    # Random vector for frequent OOVs
    if word_counter[w_clean] >= freq_threshold:
        return torch.randn(embedding_dim)
    
    # <unk> for rare OOVs
    return unk_vector

# Build embedding matrix
embedding_matrix = {}
for example in train_data:
    for w in example.text:
        if w not in embedding_matrix:
            embedding_matrix[w] = get_word_vector(w)

In [8]:
embedding_dim = 100
vocab = TEXT.vocab

# Create tensor for nn.Embedding
embedding_matrix_tensor = torch.zeros(len(vocab), embedding_dim)
for word, idx in vocab.stoi.items():
    if word in embedding_matrix:
        embedding_matrix_tensor[idx] = embedding_matrix[word]

# Create embedding layer (learnable)
embedding_layer = nn.Embedding.from_pretrained(embedding_matrix_tensor, freeze=False)
print("Embedding layer created with shape:", embedding_layer.weight.shape)


Embedding layer created with shape: torch.Size([8536, 100])


In [None]:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

Using device: cpu


In [26]:
import itertools, copy, pandas as pd


In [13]:
# -----------------------------
# 1. Setup & Config
# -----------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assume TEXT, LABEL, train_data, test_data, embedding_layer already exist
train_data, valid_data = train_data.split(split_ratio=0.9, random_state=random.seed(SEED))
print(f"Train: {len(train_data)}, Valid: {len(valid_data)}, Test: {len(test_data)}")

PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
embedding_dim = embedding_layer.weight.size(1)

Train: 4416, Valid: 491, Test: 500


In [None]:
# -----------------------------
# Simple RNN classifier (tanh)
# -----------------------------
class RNNClassifier(nn.Module):
    def __init__(self, embedding_layer, hidden_dim, num_layers, bidirectional, dropout, num_classes):
        super().__init__()
        self.embedding = embedding_layer
        emb_dim = self.embedding.weight.size(1)
        self.rnn = nn.RNN(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            nonlinearity="tanh",      # simple RNN
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0.0
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), num_classes)

    def forward(self, text):
        x = self.embedding(text)                     # [B, T, E]
        outputs, hidden = self.rnn(x)                # hidden: [L*D, B, H]
        if self.rnn.bidirectional:
            last = torch.cat((hidden[-2], hidden[-1]), dim=1)  # [B, 2H]
        else:
            last = hidden[-1]                                   # [B, H]
        return self.fc(self.dropout(last))

In [22]:
# -----------------------------
# Train/eval 1 epoch
# -----------------------------
def epoch_run(iterator, model, criterion, optimizer=None):
    model.train(optimizer is not None)
    tot_loss, tot_correct, tot_count = 0.0, 0, 0
    for batch in iterator:
        text, labels = batch.text, batch.label
        logits = model(text)
        loss = criterion(logits, labels)
        if optimizer:
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)  # RNNs can explode
            optimizer.step()
        tot_loss += loss.item() * labels.size(0)
        tot_correct += (logits.argmax(1) == labels).sum().item()
        tot_count += labels.size(0)
    return tot_loss / max(tot_count,1), tot_correct / max(tot_count,1)


In [None]:
# -----------------------------
# Grid search with early stopping that SAVES BEST WEIGHTS
# -----------------------------
HIDDEN_DIM = 128
NUM_LAYERS = 1
BIDIRECTIONAL = True
DROPOUT = 0.3
NUM_CLASSES = len(LABEL.vocab)
MAX_EPOCHS = 20
PATIENCE = 3

PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

batch_sizes = [32, 64, 128]
learning_rates = [5e-4, 1e-3, 2e-3, 3e-3]

results = []

for bs, lr in itertools.product(batch_sizes, learning_rates):
    print(f"\n🔍 Testing BATCH={bs}, LR={lr}")
    train_iter, valid_iter = data.BucketIterator.splits(
        (train_data, valid_data),
        batch_size=bs,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        device=device
    )

    # fresh embedding weights each trial
    emb_layer = nn.Embedding.from_pretrained(
        embedding_layer.weight.data.clone(), freeze=False, padding_idx=PAD_IDX
    )
    model = RNNClassifier(
        embedding_layer=emb_layer,
        hidden_dim=HIDDEN_DIM,
        num_layers=NUM_LAYERS,
        bidirectional=BIDIRECTIONAL,
        dropout=DROPOUT,
        num_classes=NUM_CLASSES
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    best_val_acc = -1.0
    epochs_no_improve = 0
    best_model_state = None

    for epoch in range(1, MAX_EPOCHS + 1):
        train_loss, train_acc = epoch_run(train_iter, model, criterion, optimizer)
        val_loss, val_acc = epoch_run(valid_iter, model, criterion, optimizer=None)
        print(f"Epoch {epoch:02d}: train_acc={train_acc:.4f}, val_acc={val_acc:.4f}")

        # ---- your early stopping block (with saving best) ----
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            epochs_no_improve = 0
            best_model_state = copy.deepcopy(model.state_dict())  # keep BEST weights
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print("Early stopping")
                break

    # restore best weights before logging result (so this combo truly reflects its best)
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    results.append({"batch_size": bs, "lr": lr, "val_acc": best_val_acc})




🔍 Testing BATCH=32, LR=0.0005
Epoch 01: train_acc=0.6003, val_acc=0.7943
Epoch 02: train_acc=0.8252, val_acc=0.8411
Epoch 03: train_acc=0.8705, val_acc=0.8839
Epoch 04: train_acc=0.9017, val_acc=0.9063
Epoch 05: train_acc=0.9237, val_acc=0.8900
Epoch 06: train_acc=0.9386, val_acc=0.9022
Early stopping

🔍 Testing BATCH=32, LR=0.001
Epoch 01: train_acc=0.6771, val_acc=0.8534
Epoch 02: train_acc=0.8490, val_acc=0.8778
Epoch 03: train_acc=0.8961, val_acc=0.8982
Epoch 04: train_acc=0.9214, val_acc=0.9124
Epoch 05: train_acc=0.9450, val_acc=0.9022
Epoch 06: train_acc=0.9656, val_acc=0.8982
Early stopping

🔍 Testing BATCH=32, LR=0.002
Epoch 01: train_acc=0.7124, val_acc=0.8228
Epoch 02: train_acc=0.8594, val_acc=0.8676
Epoch 03: train_acc=0.9128, val_acc=0.8656
Epoch 04: train_acc=0.9402, val_acc=0.8574
Early stopping

🔍 Testing BATCH=32, LR=0.003
Epoch 01: train_acc=0.7251, val_acc=0.8513
Epoch 02: train_acc=0.8449, val_acc=0.8493
Epoch 03: train_acc=0.9096, val_acc=0.8676
Epoch 04: train_a

NameError: name 'pd' is not defined

In [27]:
df_results = pd.DataFrame(results).sort_values("val_acc", ascending=False)
best = df_results.iloc[0]
print("\n🏆 Best configuration:")
print(best)
print("\nAll results (sorted):\n", df_results.reset_index(drop=True))


🏆 Best configuration:
batch_size    32.000000
lr             0.001000
val_acc        0.912424
Name: 1, dtype: float64

All results (sorted):
     batch_size      lr   val_acc
0           32  0.0010  0.912424
1          128  0.0005  0.910387
2           64  0.0005  0.908350
3           32  0.0005  0.906314
4          128  0.0020  0.902240
5           64  0.0010  0.898167
6          128  0.0010  0.892057
7           64  0.0020  0.892057
8           64  0.0030  0.885947
9           32  0.0030  0.883910
10         128  0.0030  0.881874
11          32  0.0020  0.867617


In [None]:
# --- 1) Fix best batch size & lr from your previous grid search result `best` ---
best_batch_size = int(best["batch_size"])
best_lr = float(best["lr"])
print(f"\n✅ Using best hyperparams from LR/Batch search -> batch_size={best_batch_size}, lr={best_lr}")

# --- 2) Hidden-dim search using fixed (batch_size, lr) ---

# Search space (tweak as you like)
hidden_dims = [64, 96, 128, 192, 256, 384, 512]

results_hd = []

# Rebuild iterators ONCE with best batch size
train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=best_batch_size,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device
)

for hd in hidden_dims:
    print(f"\n🧪 Testing hidden_dim={hd}")

    # fresh embedding weights for each trial
    emb_layer = nn.Embedding.from_pretrained(
        embedding_layer.weight.data.clone(), freeze=False, padding_idx=PAD_IDX
    )

    model = RNNClassifier(
        embedding_layer=emb_layer,
        hidden_dim=hd,
        num_layers=NUM_LAYERS,
        bidirectional=BIDIRECTIONAL,
        dropout=DROPOUT,
        num_classes=NUM_CLASSES
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=best_lr)

    best_val_acc_hd = -1.0
    epochs_no_improve = 0
    best_model_state_hd = None

    for epoch in range(1, MAX_EPOCHS + 1):
        train_loss, train_acc = epoch_run(train_iter, model, criterion, optimizer)
        val_loss, val_acc = epoch_run(valid_iter, model, criterion, optimizer=None)
        print(f"Epoch {epoch:02d}: train_acc={train_acc:.4f}, val_acc={val_acc:.4f}")

        # Early stopping + save-best (your pattern)
        if val_acc > best_val_acc_hd:
            best_val_acc_hd = val_acc
            epochs_no_improve = 0
            best_model_state_hd = copy.deepcopy(model.state_dict())
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= PATIENCE:
                print("Early stopping")
                break

    # Restore best for this hidden_dim before recording
    if best_model_state_hd is not None:
        model.load_state_dict(best_model_state_hd)

    results_hd.append({"hidden_dim": hd, "val_acc": best_val_acc_hd})


In [None]:
# --- 3) Pick best hidden dim, print & save variable ---
df_hd = pd.DataFrame(results_hd).sort_values("val_acc", ascending=False).reset_index(drop=True)
best_hidden_dim = int(df_hd.loc[0, "hidden_dim"])

print("\n🏆 Best hidden_dim configuration:")
print(df_hd.loc[0])
print("\nAll hidden_dim results (sorted):")
print(df_hd)

# Variables now set for subsequent training:
print(f"\n👉 Final choice: best_batch_size={best_batch_size}, best_lr={best_lr}, best_hidden_dim={best_hidden_dim}")

------------------

In [None]:
import itertools, copy
import torch
import torch.nn as nn
import pandas as pd
from torchtext import data

# -----------------------------
# Generic helpers
# -----------------------------
def build_iters(batch_size, train_data, valid_data, device):
    return data.BucketIterator.splits(
        (train_data, valid_data),
        batch_size=batch_size,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        device=device
    )

def build_model(embedding_layer, pad_idx, hidden_dim, num_layers, bidirectional, dropout, num_classes, device):
    emb_layer = nn.Embedding.from_pretrained(
        embedding_layer.weight.data.clone(), freeze=False, padding_idx=pad_idx
    )
    model = RNNClassifier(
        embedding_layer=emb_layer,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        bidirectional=bidirectional,
        dropout=dropout,
        num_classes=num_classes
    ).to(device)
    return model

def epoch_run(iterator, model, criterion, optimizer=None):
    model.train(optimizer is not None)
    tot_loss, tot_correct, tot_count = 0.0, 0, 0
    for batch in iterator:
        text, labels = batch.text, batch.label
        logits = model(text)
        loss = criterion(logits, labels)
        if optimizer:
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()
        tot_loss   += loss.item() * labels.size(0)
        tot_correct += (logits.argmax(1) == labels).sum().item()
        tot_count  += labels.size(0)
    return tot_loss / max(tot_count,1), tot_correct / max(tot_count,1)

def train_one_config(batch_size, lr, hidden_dim, *,
                     num_layers, bidirectional, dropout, num_classes,
                     train_data, valid_data, pad_idx, device,
                     max_epochs=20, patience=3):
    """Train one (batch_size, lr, hidden_dim) with early stopping; return best_val_acc and best_state."""
    train_iter, valid_iter = build_iters(batch_size, train_data, valid_data, device)
    model = build_model(embedding_layer, pad_idx, hidden_dim, num_layers, bidirectional, dropout, num_classes, device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    best_val_acc = -1.0
    epochs_no_improve = 0
    best_state = None

    for epoch in range(1, max_epochs + 1):
        train_loss, train_acc = epoch_run(train_iter, model, criterion, optimizer)
        val_loss,   val_acc   = epoch_run(valid_iter, model, criterion, optimizer=None)
        print(f"Epoch {epoch:02d}: train_acc={train_acc:.4f}, val_acc={val_acc:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            epochs_no_improve = 0
            best_state = copy.deepcopy(model.state_dict())
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping")
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    return best_val_acc, model


In [30]:
NUM_CLASSES   = len(LABEL.vocab)
NUM_LAYERS    = 1
BIDIRECTIONAL = True
DROPOUT       = 0.3
MAX_EPOCHS    = 20
PATIENCE      = 3

batch_sizes     = [32, 64, 128]
learning_rates  = [5e-4, 1e-3, 2e-3, 3e-3]
HIDDEN_DIM_FIXED = 128   # fixed while tuning (batch, lr)

results = []
for bs, lr in itertools.product(batch_sizes, learning_rates):
    print(f"\n🔍 Testing BATCH={bs}, LR={lr}")
    val_acc, _ = train_one_config(
        bs, lr, HIDDEN_DIM_FIXED,
        num_layers=NUM_LAYERS, bidirectional=BIDIRECTIONAL, dropout=DROPOUT, num_classes=NUM_CLASSES,
        train_data=train_data, valid_data=valid_data, pad_idx=PAD_IDX, device=device,
        max_epochs=MAX_EPOCHS, patience=PATIENCE
    )
    results.append({"batch_size": bs, "lr": lr, "val_acc": val_acc})

df_results = pd.DataFrame(results).sort_values("val_acc", ascending=False).reset_index(drop=True)
best = df_results.loc[0]
best_batch_size = int(best["batch_size"])
best_lr        = float(best["lr"])
print("\n🏆 Best (batch, lr):")
print(best)
print(f"\nSaved vars -> best_batch_size={best_batch_size}, best_lr={best_lr}")



🔍 Testing BATCH=32, LR=0.0005
Epoch 01: train_acc=0.6187, val_acc=0.8147
Epoch 02: train_acc=0.8102, val_acc=0.8391
Epoch 03: train_acc=0.8718, val_acc=0.8778
Epoch 04: train_acc=0.8981, val_acc=0.9043
Epoch 05: train_acc=0.9167, val_acc=0.8880
Epoch 06: train_acc=0.9366, val_acc=0.9043
Epoch 07: train_acc=0.9545, val_acc=0.8941
Early stopping

🔍 Testing BATCH=32, LR=0.001
Epoch 01: train_acc=0.6769, val_acc=0.8737
Epoch 02: train_acc=0.8406, val_acc=0.8635
Epoch 03: train_acc=0.9051, val_acc=0.8859
Epoch 04: train_acc=0.9282, val_acc=0.9022
Epoch 05: train_acc=0.9452, val_acc=0.8758
Epoch 06: train_acc=0.9654, val_acc=0.8859
Epoch 07: train_acc=0.9758, val_acc=0.8880
Early stopping

🔍 Testing BATCH=32, LR=0.002
Epoch 01: train_acc=0.6791, val_acc=0.8391
Epoch 02: train_acc=0.8576, val_acc=0.8819
Epoch 03: train_acc=0.9092, val_acc=0.8819
Epoch 04: train_acc=0.9466, val_acc=0.8819
Epoch 05: train_acc=0.9558, val_acc=0.9043
Epoch 06: train_acc=0.9817, val_acc=0.8900
Epoch 07: train_acc

In [31]:
hidden_dims = [64, 96, 128, 192, 256, 384, 512]
results_hd = []

for hd in hidden_dims:
    print(f"\n🧪 Testing hidden_dim={hd} (batch={best_batch_size}, lr={best_lr})")
    val_acc, _ = train_one_config(
        best_batch_size, best_lr, hd,
        num_layers=NUM_LAYERS, bidirectional=BIDIRECTIONAL, dropout=DROPOUT, num_classes=NUM_CLASSES,
        train_data=train_data, valid_data=valid_data, pad_idx=PAD_IDX, device=device,
        max_epochs=MAX_EPOCHS, patience=PATIENCE
    )
    results_hd.append({"hidden_dim": hd, "val_acc": val_acc})

df_hd = pd.DataFrame(results_hd).sort_values("val_acc", ascending=False).reset_index(drop=True)
best_hidden_dim = int(df_hd.loc[0, "hidden_dim"])
print("\n🏆 Best hidden_dim configuration:")
print(df_hd.loc[0])
print(f"\nSaved var -> best_hidden_dim={best_hidden_dim}")
print("\nAll hidden_dim results:\n", df_hd)



🧪 Testing hidden_dim=64 (batch=128, lr=0.0005)
Epoch 01: train_acc=0.3718, val_acc=0.4847
Epoch 02: train_acc=0.5661, val_acc=0.7006
Epoch 03: train_acc=0.6891, val_acc=0.7780
Epoch 04: train_acc=0.7538, val_acc=0.8086
Epoch 05: train_acc=0.8000, val_acc=0.8411
Epoch 06: train_acc=0.8379, val_acc=0.8554
Epoch 07: train_acc=0.8659, val_acc=0.8778
Epoch 08: train_acc=0.8915, val_acc=0.8819
Epoch 09: train_acc=0.9062, val_acc=0.8900
Epoch 10: train_acc=0.9214, val_acc=0.9002
Epoch 11: train_acc=0.9275, val_acc=0.8941
Epoch 12: train_acc=0.9418, val_acc=0.8900
Epoch 13: train_acc=0.9472, val_acc=0.8941
Early stopping

🧪 Testing hidden_dim=96 (batch=128, lr=0.0005)
Epoch 01: train_acc=0.3872, val_acc=0.5295
Epoch 02: train_acc=0.5942, val_acc=0.6802
Epoch 03: train_acc=0.7235, val_acc=0.7637
Epoch 04: train_acc=0.8030, val_acc=0.8411
Epoch 05: train_acc=0.8458, val_acc=0.8310
Epoch 06: train_acc=0.8755, val_acc=0.8880
Epoch 07: train_acc=0.8956, val_acc=0.8941
Epoch 08: train_acc=0.9101, v

------------kfold------


In [32]:
import itertools, copy, random
import numpy as np
import torch
import torch.nn as nn
from torchtext import data
from sklearn.model_selection import KFold

# -----------------------------
# Reproducibility
# -----------------------------
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# -----------------------------
# Generic helpers
# -----------------------------
def build_iters(batch_size, train_data, valid_data, device):
    return data.BucketIterator.splits(
        (train_data, valid_data),
        batch_size=batch_size,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        device=device
    )

def build_model(embedding_layer, pad_idx, hidden_dim, num_layers, bidirectional, dropout, num_classes, device):
    emb_layer = nn.Embedding.from_pretrained(
        embedding_layer.weight.data.clone(), freeze=False, padding_idx=pad_idx
    )
    model = RNNClassifier(
        embedding_layer=emb_layer,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        bidirectional=bidirectional,
        dropout=dropout,
        num_classes=num_classes
    ).to(device)
    return model

def epoch_run(iterator, model, criterion, optimizer=None):
    model.train(optimizer is not None)
    tot_loss, tot_correct, tot_count = 0.0, 0, 0
    for batch in iterator:
        text, labels = batch.text, batch.label
        logits = model(text)
        loss = criterion(logits, labels)
        if optimizer:
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimizer.step()
        tot_loss += loss.item() * labels.size(0)
        tot_correct += (logits.argmax(1) == labels).sum().item()
        tot_count += labels.size(0)
    return tot_loss / max(tot_count,1), tot_correct / max(tot_count,1)

# -----------------------------
# K-Fold training function
# -----------------------------
def train_kfold_config(k_folds, batch_size, lr, hidden_dim, *,
                       num_layers, bidirectional, dropout, num_classes,
                       full_dataset, pad_idx, device, embedding_layer,
                       max_epochs=20, patience=3, seed=42):

    set_seed(seed)  # 🔒 make results deterministic

    # Convert torchtext dataset examples to indices for sklearn KFold
    all_indices = np.arange(len(full_dataset))
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=seed)

    fold_results = []
    fold_models = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(all_indices), 1):
        print(f"\n===== Fold {fold}/{k_folds} =====")

        # Create new Dataset splits
        train_data = [full_dataset.examples[i] for i in train_idx]
        valid_data = [full_dataset.examples[i] for i in val_idx]
        train_data = data.Dataset(train_data, fields=full_dataset.fields)
        valid_data = data.Dataset(valid_data, fields=full_dataset.fields)

        # Build iterators
        train_iter, valid_iter = build_iters(batch_size, train_data, valid_data, device)

        # Build model, loss, optimizer
        model = build_model(embedding_layer, pad_idx, hidden_dim, num_layers, bidirectional, dropout, num_classes, device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        best_val_acc = -1.0
        epochs_no_improve = 0
        best_state = None

        # Train per fold
        for epoch in range(1, max_epochs + 1):
            train_loss, train_acc = epoch_run(train_iter, model, criterion, optimizer)
            val_loss, val_acc = epoch_run(valid_iter, model, criterion, optimizer=None)
            print(f"Epoch {epoch:02d}: train_acc={train_acc:.4f}, val_acc={val_acc:.4f}")

            if val_acc > best_val_acc:
                best_val_acc = val_acc
                epochs_no_improve = 0
                best_state = copy.deepcopy(model.state_dict())
            else:
                epochs_no_improve += 1
                if epochs_no_improve >= patience:
                    print("Early stopping")
                    break

        # Load best model
        if best_state:
            model.load_state_dict(best_state)

        print(f"Fold {fold} best val_acc={best_val_acc:.4f}")
        fold_results.append(best_val_acc)
        fold_models.append(model)

    # Average accuracy
    mean_acc = np.mean(fold_results)
    print(f"\n===== K-Fold Results =====")
    for i, acc in enumerate(fold_results, 1):
        print(f"Fold {i}: {acc:.4f}")
    print(f"Average Val Accuracy: {mean_acc:.4f}")

    return mean_acc, fold_results, fold_models


In [None]:
# ===========================
# K-Fold grid search for (batch_size, lr)
# ===========================
import itertools, numpy as np, pandas as pd

# ---- Your fixed model/config bits ----
NUM_CLASSES   = len(LABEL.vocab)
NUM_LAYERS    = 1
BIDIRECTIONAL = True
DROPOUT       = 0.3
MAX_EPOCHS    = 20
PATIENCE      = 3
SEED          = 42
K_FOLDS       = 5

batch_sizes       = [32, 64, 128]
learning_rates    = [5e-4, 1e-3, 2e-3, 3e-3]
HIDDEN_DIM_FIXED  = 128   # fixed while tuning (batch, lr)

# IMPORTANT: the full dataset for K-Fold (use your combined training set)
FULL_DATASET = train_data   # <- if your full corpus variable has a different name, replace here

results = []

for bs, lr in itertools.product(batch_sizes, learning_rates):
    print(f"\n🔍 K-Fold Testing BATCH={bs}, LR={lr}")
    mean_acc, fold_accs, fold_models = train_kfold_config(
        k_folds=K_FOLDS,
        batch_size=bs,
        lr=lr,
        hidden_dim=HIDDEN_DIM_FIXED,
        num_layers=NUM_LAYERS,
        bidirectional=BIDIRECTIONAL,
        dropout=DROPOUT,
        num_classes=NUM_CLASSES,
        full_dataset=FULL_DATASET,
        pad_idx=PAD_IDX,
        device=device,
        embedding_layer=embedding_layer,
        max_epochs=MAX_EPOCHS,
        patience=PATIENCE,
        seed=SEED
    )
    results.append({
        "batch_size": bs,
        "lr": lr,
        "cv_mean_acc": float(mean_acc),
        "cv_std": float(np.std(fold_accs)),
        "per_fold": [float(x) for x in fold_accs],  # optional: inspect later
    })

# Rank by mean CV acc (desc), then by lower std (tie-breaker)
df_results = (
    pd.DataFrame(results)
      .sort_values(["cv_mean_acc", "cv_std"], ascending=[False, True])
      .reset_index(drop=True)
)

best = df_results.loc[0]
best_batch_size = int(best["batch_size"])
best_lr         = float(best["lr"])

print("\n🏆 Best (batch, lr) by K-Fold:")
print(best[["batch_size","lr","cv_mean_acc","cv_std"]])
print(f"\nSaved vars -> best_batch_size={best_batch_size}, best_lr={best_lr}")



🔍 K-Fold Testing BATCH=32, LR=0.0005

===== Fold 1/5 =====
Epoch 01: train_acc=0.5614, val_acc=0.7534
Epoch 02: train_acc=0.8086, val_acc=0.8303
Epoch 03: train_acc=0.8692, val_acc=0.8473
Epoch 04: train_acc=0.8890, val_acc=0.8710
Epoch 05: train_acc=0.9077, val_acc=0.8676
Epoch 06: train_acc=0.9340, val_acc=0.8812
Epoch 07: train_acc=0.9414, val_acc=0.8710
Epoch 08: train_acc=0.9524, val_acc=0.8846
Epoch 09: train_acc=0.9689, val_acc=0.8880
Epoch 10: train_acc=0.9745, val_acc=0.8756
Epoch 11: train_acc=0.9836, val_acc=0.8812
Epoch 12: train_acc=0.9887, val_acc=0.8744
Early stopping
Fold 1 best val_acc=0.8880

===== Fold 2/5 =====
Epoch 01: train_acc=0.5468, val_acc=0.7271
Epoch 02: train_acc=0.8033, val_acc=0.8188
Epoch 03: train_acc=0.8647, val_acc=0.8437
Epoch 04: train_acc=0.8913, val_acc=0.8505
Epoch 05: train_acc=0.9142, val_acc=0.8686
Epoch 06: train_acc=0.9389, val_acc=0.8652
Epoch 07: train_acc=0.9462, val_acc=0.8664
Epoch 08: train_acc=0.9553, val_acc=0.8766
Epoch 09: train_

In [None]:
# ===========================
# K-Fold tuning for hidden_dim
# ===========================

hidden_dims = [64, 96, 128, 192, 256, 384, 512]
results_hd = []

for hd in hidden_dims:
    print(f"\n🧪 K-Fold Testing hidden_dim={hd} (batch={best_batch_size}, lr={best_lr})")
    mean_acc, fold_accs, fold_models = train_kfold_config(
        k_folds=K_FOLDS,
        batch_size=best_batch_size,
        lr=best_lr,
        hidden_dim=hd,
        num_layers=NUM_LAYERS,
        bidirectional=BIDIRECTIONAL,
        dropout=DROPOUT,
        num_classes=NUM_CLASSES,
        full_dataset=train_data,     # full dataset (same as before)
        pad_idx=PAD_IDX,
        device=device,
        embedding_layer=embedding_layer,
        max_epochs=MAX_EPOCHS,
        patience=PATIENCE,
        seed=SEED
    )

    results_hd.append({
        "hidden_dim": hd,
        "cv_mean_acc": float(mean_acc),
        "cv_std": float(np.std(fold_accs)),
        "per_fold": [float(x) for x in fold_accs],
    })

# Rank by mean CV acc (desc), tie-break by lower std
df_hd = (
    pd.DataFrame(results_hd)
      .sort_values(["cv_mean_acc", "cv_std"], ascending=[False, True])
      .reset_index(drop=True)
)

best_hidden_dim = int(df_hd.loc[0, "hidden_dim"])

print("\n🏆 Best hidden_dim configuration (K-Fold):")
print(df_hd.loc[0, ["hidden_dim","cv_mean_acc","cv_std"]])
print(f"\nSaved var -> best_hidden_dim={best_hidden_dim}")

print("\nAll hidden_dim results:\n", df_hd)
