In [1]:
# =============================
# Movie Review Sentiment Classifier with RNNs and LSTMs
# Dataset: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
# =============================

import re
import random
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# torchtext is optional: it is only needed for the pretrained GloVe vectors.
# Any failure while importing it is recorded in TORCHTEXT_OK instead of
# aborting the script.
try:
    from torchtext.vocab import GloVe
except Exception:
    TORCHTEXT_OK = False
else:
    TORCHTEXT_OK = True

# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using device:", DEVICE)

# =============================
# Step 1: Load Dataset
# =============================
# Location of the IMDB reviews CSV inside the Kaggle input directory.
data_path = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
df = pd.read_csv(data_path)

# Sanity check: preview the first rows and show the label distribution.
print(df.head(n=5))
print(df["sentiment"].value_counts())

# =============================
# Step 2: Tokenization & Vocab
# =============================
# HTML line breaks ("<br>", "<br/>", "<br />") appear verbatim in the reviews;
# without removing them the token pattern below would emit a bogus "br" token.
_BR_TAG_RE = re.compile(r"<br\s*/?>", re.IGNORECASE)
# A token is a run of lowercase ASCII letters, digits and apostrophes.
_TOKEN_RE = re.compile(r"[a-z0-9']+")


def simple_tokenize(text: str):
    """Split raw review text into lowercase word tokens.

    HTML line-break tags are replaced by a space first so that the markup
    does not leak into the vocabulary as the pseudo-word ``br``. The text is
    then lowercased and every maximal run of ``[a-z0-9']`` becomes one token;
    all other characters act as separators.

    Args:
        text: Raw review text.

    Returns:
        A list of token strings (empty when nothing matches).
    """
    text = _BR_TAG_RE.sub(" ", text).lower()
    return _TOKEN_RE.findall(text)

def build_vocab(texts, min_freq=2, max_vocab=40000):
    """Build a token -> integer id mapping from an iterable of raw texts.

    Ids 0 and 1 are reserved for ``<pad>`` and ``<unk>``. The remaining ids
    are handed out to tokens in order of decreasing frequency (ties broken
    alphabetically), skipping tokens seen fewer than ``min_freq`` times and
    stopping once the mapping holds ``max_vocab`` entries in total.

    Args:
        texts: Iterable of raw text strings, tokenized with ``simple_tokenize``.
        min_freq: Minimum number of occurrences for a token to be kept.
        max_vocab: Upper bound on the vocabulary size, specials included.

    Returns:
        Dict mapping each kept token (and the two specials) to its id.
    """
    counts = {}
    for text in texts:
        for token in simple_tokenize(text):
            if token in counts:
                counts[token] += 1
            else:
                counts[token] = 1

    # Most frequent first; alphabetical order makes ties deterministic.
    ranked = sorted(counts, key=lambda token: (-counts[token], token))
    eligible = [token for token in ranked if counts[token] >= min_freq]

    vocab = {"<pad>": 0, "<unk>": 1}
    # Two slots are already taken by the special tokens.
    room = max(max_vocab - len(vocab), 0)
    for token in eligible[:room]:
        vocab[token] = len(vocab)
    return vocab

class IMDBDataset(Dataset):
    """Map-style dataset turning IMDB review rows into (token ids, label) pairs.

    Args:
        df: DataFrame with a ``review`` text column and a ``sentiment`` column
            holding ``"positive"`` / ``"negative"``.
        vocab: Token -> id mapping; tokens missing from it map to ``<unk>``.
        max_len: Maximum number of token ids kept per review (longer reviews
            are truncated from the right).
    """

    def __init__(self, df, vocab, max_len=300):
        self.texts = df["review"].tolist()
        # positive -> 1, negative -> 0; any other value becomes NaN and makes
        # the integer cast fail instead of producing a silent wrong label.
        self.labels = df["sentiment"].map({"positive":1, "negative":0}).astype(int).tolist()
        self.vocab = vocab
        self.unk_id = self.vocab.get("<unk>", 1)
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    def text_to_ids(self, text):
        """Tokenize ``text`` and return at most ``max_len`` vocabulary ids.

        The result is never empty: a review that yields no ids is encoded as
        a single ``<unk>`` id. Both classifiers pack their input with
        ``pack_padded_sequence``, which rejects zero-length sequences, so an
        empty id list would otherwise crash the whole batch.
        """
        tokens = simple_tokenize(text)
        ids = [self.vocab.get(tok, self.unk_id) for tok in tokens]
        ids = ids[:self.max_len]
        if not ids:
            ids = [self.unk_id]
        return ids

    def __getitem__(self, idx):
        """Return ``(ids, label)`` as a long tensor and a float32 scalar tensor."""
        ids = self.text_to_ids(self.texts[idx])
        label = self.labels[idx]
        return torch.tensor(ids, dtype=torch.long), torch.tensor(label, dtype=torch.float32)

def collate_batch(batch, pad_id=0):
    """Assemble variable-length samples into one right-padded batch.

    Args:
        batch: Sequence of ``(ids, label)`` pairs, where ``ids`` is a 1-D
            tensor of token ids and ``label`` is a scalar tensor.
        pad_id: Value used to fill positions past the end of each sequence.

    Returns:
        ``(padded, lengths, labels)``: a ``(batch, longest)`` long tensor,
        a long tensor with the original length of every sequence, and the
        labels stacked along dimension 0.
    """
    token_seqs, targets = zip(*batch)
    seq_lens = [len(seq) for seq in token_seqs]
    lengths = torch.tensor(seq_lens, dtype=torch.long)

    # Start from an all-padding matrix and overwrite the leading part of
    # each row with the real ids (rows are views into ``padded``).
    padded = torch.full((len(token_seqs), max(seq_lens)), pad_id, dtype=torch.long)
    for row, seq, n_tokens in zip(padded, token_seqs, seq_lens):
        row[:n_tokens] = seq

    return padded, lengths, torch.stack(targets, dim=0)

# =============================
# Step 3: Models
# =============================
class VanillaRNNClassifier(nn.Module):
    """Binary text classifier: embedding -> ``nn.RNN`` -> single-logit linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding width (also the RNN input size).
        hidden_dim: RNN hidden size per direction.
        num_layers: Number of stacked RNN layers.
        bidirectional: Whether the RNN reads the sequence in both directions.
        pad_idx: Id of the padding token, passed as ``padding_idx``.
        embedding_matrix: Optional tensor copied into the embedding weights.
        freeze_embeddings: If true, the embedding weights get no gradient.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, bidirectional,
                 pad_idx, embedding_matrix=None, freeze_embeddings=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        if embedding_matrix is not None:
            # Copy values in place without recording the write in autograd.
            with torch.no_grad():
                self.embedding.weight.copy_(embedding_matrix)
        self.embedding.weight.requires_grad_(not freeze_embeddings)

        self.rnn = nn.RNN(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
        )
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, 1)

    def forward(self, x, lengths):
        """Return one logit per sequence.

        Args:
            x: Padded batch of token ids, batch dimension first.
            lengths: True length of every sequence in ``x``.
        """
        embedded = self.embedding(x)
        # Packing makes the RNN stop at each sequence's real end, so the
        # final hidden state is not polluted by padding positions.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, hidden = self.rnn(packed)
        if not self.rnn.bidirectional:
            summary = hidden[-1]
        else:
            # The last two slices are the top layer's two directions.
            summary = torch.cat((hidden[-2], hidden[-1]), dim=1)
        logits = self.fc(summary)
        return logits.squeeze(1)

class LSTMClassifier(nn.Module):
    """Binary text classifier: embedding -> ``nn.LSTM`` -> single-logit linear head.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding width (also the LSTM input size).
        hidden_dim: LSTM hidden size per direction.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.
        pad_idx: Id of the padding token, passed as ``padding_idx``.
        embedding_matrix: Optional tensor copied into the embedding weights.
        freeze_embeddings: If true, the embedding weights get no gradient.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, bidirectional,
                 pad_idx, embedding_matrix=None, freeze_embeddings=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        if embedding_matrix is not None:
            # Copy values in place without recording the write in autograd.
            with torch.no_grad():
                self.embedding.weight.copy_(embedding_matrix)
        self.embedding.weight.requires_grad_(not freeze_embeddings)

        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
        )
        head_inputs = 2 * hidden_dim if bidirectional else hidden_dim
        self.fc = nn.Linear(head_inputs, 1)

    def forward(self, x, lengths):
        """Return one logit per sequence.

        Args:
            x: Padded batch of token ids, batch dimension first.
            lengths: True length of every sequence in ``x``.
        """
        embedded = self.embedding(x)
        # Packing lets the LSTM skip padding, so the final hidden state
        # corresponds to each sequence's last real token.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        # Only the final hidden state is used; outputs and cell state are dropped.
        _, (hidden, _cell) = self.lstm(packed)
        if not self.lstm.bidirectional:
            summary = hidden[-1]
        else:
            # The last two slices are the top layer's two directions.
            summary = torch.cat((hidden[-2], hidden[-1]), dim=1)
        logits = self.fc(summary)
        return logits.squeeze(1)

# =============================
# Step 4: Training Utils
# =============================
def accuracy_from_logits(logits, y):
    """Return the fraction of binary predictions that match ``y``.

    A logit is counted as a positive prediction when its sigmoid is at
    least 0.5.

    Args:
        logits: Tensor of raw model outputs.
        y: Tensor of 0/1 targets, compared element-wise with the predictions.

    Returns:
        Mean accuracy as a Python float.
    """
    probabilities = torch.sigmoid(logits)
    predicted = probabilities.ge(0.5).float()
    correct = predicted.eq(y).float()
    return correct.mean().item()

def run_epoch(model, loader, optimizer, train=True):
    """Run one full pass over ``loader`` and return ``(mean_loss, mean_accuracy)``.

    In training mode every batch performs a backward pass, clips the gradient
    norm to 1.0 and takes an optimizer step. In evaluation mode the model is
    switched to eval mode and autograd is disabled, so no computation graph
    is built for batches whose gradients would never be used.

    Args:
        model: Module called as ``model(x, lengths)`` and returning logits.
        loader: Iterable yielding ``(x, lengths, y)`` batches.
        optimizer: Optimizer to step; may be ``None`` when ``train`` is false.
        train: Whether to update the model's weights.

    Returns:
        Tuple of sample-weighted mean BCE-with-logits loss and mean accuracy.

    Raises:
        ValueError: If ``train`` is true but no optimizer was given.
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    if train and optimizer is None:
        raise ValueError("optimizer is required when train=True")

    model.train(train)
    criterion = nn.BCEWithLogitsLoss()
    total_loss, total_acc, total_n = 0.0, 0.0, 0

    # Gradients are only tracked when the weights are actually updated.
    with torch.set_grad_enabled(train):
        for x, lengths, y in loader:
            x, lengths, y = x.to(DEVICE), lengths.to(DEVICE), y.to(DEVICE)
            logits = model(x, lengths)
            loss = criterion(logits, y)
            if train:
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
            # Weight per-batch means by batch size so a short final batch
            # does not skew the epoch averages.
            batch_size = x.size(0)
            total_loss += loss.item() * batch_size
            total_acc += accuracy_from_logits(logits.detach(), y) * batch_size
            total_n += batch_size
    return total_loss/total_n, total_acc/total_n

def evaluate(model, loader):
    """Score ``model`` on ``loader`` without updating its weights.

    Delegates to ``run_epoch`` with ``train=False`` and no optimizer, and
    returns its ``(mean_loss, mean_accuracy)`` result unchanged.
    """
    metrics = run_epoch(model, loader, optimizer=None, train=False)
    return metrics

# =============================
# Step 5: GloVe Embeddings
# =============================
def load_glove_matrix(vocab, dim=100):
    """Build an embedding matrix for ``vocab``, filled from GloVe where possible.

    Every row starts as small Gaussian noise (standard deviation 0.05) and the
    ``<pad>`` row is zeroed. When torchtext is importable, rows whose token
    exists in the GloVe "6B" vocabulary are overwritten with the pretrained
    vector; otherwise a notice is printed and the random matrix is returned.

    Args:
        vocab: Token -> row index mapping; must contain ``<pad>``.
        dim: Embedding dimensionality requested from GloVe.

    Returns:
        Float tensor of shape ``(len(vocab), dim)``.
    """
    matrix = 0.05 * torch.randn(len(vocab), dim)
    matrix[vocab["<pad>"]] = 0.0

    if TORCHTEXT_OK:
        glove = GloVe(name="6B", dim=dim)
        known_tokens = glove.stoi
        for token, row in vocab.items():
            if token in known_tokens:
                matrix[row] = glove[token]
        return matrix

    print("Torchtext not available. Using random init for embeddings.")
    return matrix

# =============================
# Step 6: Build DataLoaders
# =============================
# Hold out 20% for testing, then 10% of the remainder for validation.
# Both splits are stratified on the label and use a fixed random_state.
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["sentiment"]
)
train_df, val_df = train_test_split(
    train_df, test_size=0.1, random_state=42, stratify=train_df["sentiment"]
)

# The vocabulary is built from the training split only.
vocab = build_vocab(train_df["review"].tolist(), min_freq=2, max_vocab=40000)
pad_idx = vocab["<pad>"]

train_ds, val_ds, test_ds = (
    IMDBDataset(split, vocab, max_len=300) for split in (train_df, val_df, test_df)
)


def collate(batch):
    """Pad a batch using this vocabulary's padding id."""
    return collate_batch(batch, pad_id=pad_idx)


# Only the training data is shuffled; validation and test keep their order.
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=collate)
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False, collate_fn=collate)
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False, collate_fn=collate)

# =============================
# Step 7: Run 4 Experiments
# =============================
def train_one(model_kind="rnn", embedding_kind="glove_static", epochs=3):
    """Train one model/embedding combination and return its test accuracy.

    The model is trained for ``epochs`` epochs with Adam (lr 1e-3). After each
    epoch it is scored on the validation loader, and a snapshot of the weights
    with the best validation accuracy is restored before the final test run.

    Args:
        model_kind: ``"rnn"`` selects ``VanillaRNNClassifier``; any other
            value selects ``LSTMClassifier``.
        embedding_kind: ``"glove_static"`` initializes the embedding from
            ``load_glove_matrix`` and freezes it; any other value uses a
            randomly initialized, trainable embedding.
        epochs: Number of training epochs.

    Returns:
        Accuracy on the test loader as a float.
    """
    if embedding_kind == "glove_static":
        # NOTE: when torchtext is unavailable load_glove_matrix returns random
        # vectors, and they are still frozen here.
        emb_matrix = load_glove_matrix(vocab, dim=100)
        freeze = True
    else:
        emb_matrix = None
        freeze = False

    if model_kind == "rnn":
        model = VanillaRNNClassifier(len(vocab), 100, 128, num_layers=1,
                                     bidirectional=True, pad_idx=pad_idx,
                                     embedding_matrix=emb_matrix, freeze_embeddings=freeze)
    else:
        model = LSTMClassifier(len(vocab), 100, 128, num_layers=1,
                               bidirectional=True, pad_idx=pad_idx,
                               embedding_matrix=emb_matrix, freeze_embeddings=freeze)

    model.to(DEVICE)
    # Frozen parameters are excluded from the optimizer.
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)

    best_acc, best_state = 0, None
    for ep in range(1, epochs+1):
        tr_loss, tr_acc = run_epoch(model, train_loader, optimizer, train=True)
        va_loss, va_acc = evaluate(model, val_loader)
        if va_acc > best_acc:
            best_acc = va_acc
            # state_dict() hands back references to the live tensors, which
            # later optimizer steps overwrite in place. Clone them so the
            # snapshot really preserves this epoch's weights.
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}
        print(f"{embedding_kind}+{model_kind.upper()} | Epoch {ep}: "
              f"Train Acc={tr_acc:.4f}, Val Acc={va_acc:.4f}")
    if best_state is not None:
        model.load_state_dict(best_state)
    te_loss, te_acc = evaluate(model, test_loader)
    print(f"[TEST] {embedding_kind}+{model_kind.upper()} Acc={te_acc:.4f}")
    return te_acc

# Train every embedding/model combination and record its test accuracy,
# keyed as "<embedding>+<model>". Results are stored run by run so that
# completed entries survive if a later run fails.
results = {}
embedding_kinds = ("glove_static", "trainable")
model_kinds = ("rnn", "lstm")
for emb in embedding_kinds:
    for mk in model_kinds:
        run_name = f"{emb}+{mk}"
        results[run_name] = train_one(model_kind=mk, embedding_kind=emb, epochs=3)

print("\n=== Final Test Accuracies ===")
for run_name, test_acc in results.items():
    print(f"{run_name:24s}: {test_acc:.4f}")


Using device: cpu
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
sentiment
positive    25000
negative    25000
Name: count, dtype: int64
Torchtext not available. Using random init for embeddings.
glove_static+RNN | Epoch 1: Train Acc=0.5504, Val Acc=0.5767
glove_static+RNN | Epoch 2: Train Acc=0.6117, Val Acc=0.6328
glove_static+RNN | Epoch 3: Train Acc=0.6361, Val Acc=0.6290
[TEST] glove_static+RNN Acc=0.6358
Torchtext not available. Using random init for embeddings.
glove_static+LSTM | Epoch 1: Train Acc=0.5779, Val Acc=0.6302
glove_static+LSTM | Epoch 2: Train Acc=0.6572, Val Acc=0.6482
glove_static+LSTM | Epoch 3: Train Acc=0.6787, Val Acc=0.6895
[TEST] glo