In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

PART1


In [None]:
!pip install -q tqdm
import os, re, io
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import torch.optim as optim



In [None]:
# Load the IMDB reviews, derive a binary label (1 = 'positive', 0 otherwise)
# and keep only the raw review text plus that label.
df = (
    pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
    .assign(label=lambda frame: (frame['sentiment'] == 'positive').astype(int))
    [['review', 'label']]
)

def clean_text(text):
    """Normalise a raw review string.

    Lower-cases the text, replaces HTML ``<br>`` tags and every character that
    is not a lowercase letter, digit, whitespace or apostrophe with a space,
    then collapses whitespace runs to a single space and trims both ends.
    """
    # Order matters: tags are removed before the character filter would
    # otherwise turn "<br />" into the stray token "br".
    substitutions = (
        (r"<br\s*/?>", " "),
        (r"[^a-z0-9\s']", " "),
        (r"\s+", " "),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Store the normalised form of every review alongside the raw text.
df['text'] = df['review'].map(clean_text)

def tokenize(s):
    """Split `s` on runs of whitespace and return the resulting list of tokens."""
    tokens = s.split()
    return tokens

MAX_VOCAB = 20000

# Token frequencies over every cleaned review in df['text'].
# NOTE(review): this counts all rows of df, including rows that a later cell
# assigns to the validation and test splits.
counter = Counter(tok for doc in df['text'] for tok in tokenize(doc))

# Two ids are reserved for the special tokens, hence MAX_VOCAB-2 real words.
most_common = counter.most_common(MAX_VOCAB-2)

# itos: id -> token (0 = <PAD>, 1 = <OOV>); stoi is the inverse lookup.
itos = ['<PAD>', '<OOV>']
itos.extend(word for word, _count in most_common)
stoi = dict(zip(itos, range(len(itos))))
vocab_size = len(itos)
print("Vocab size:", vocab_size)


Vocab size: 20000


In [None]:
MAX_LEN = 200

def text_to_sequence(text, stoi, max_len=MAX_LEN):
    """Encode `text` as a list of exactly `max_len` token ids.

    Tokens absent from `stoi` map to id 1; sequences longer than `max_len`
    are cut at the end and shorter ones are right-padded with id 0.
    """
    ids = [stoi.get(tok, 1) for tok in tokenize(text)][:max_len]
    # After truncation the deficit is >= 0, so this pads only short inputs.
    return ids + [0] * (max_len - len(ids))

# Encode every cleaned review as a fixed-length list of MAX_LEN token ids.
df['seq'] = df['text'].apply(text_to_sequence, args=(stoi, MAX_LEN))

# Hold out 10% as the test set, then take 0.111111 of the remainder as
# validation; both splits are stratified on the label.
train_val, test = train_test_split(
    df,
    test_size=0.1,
    stratify=df['label'],
    random_state=42,
)
train, valid = train_test_split(
    train_val,
    test_size=0.111111,
    stratify=train_val['label'],
    random_state=42,
)

print(len(train), len(valid), len(test))


40000 5000 5000


In [None]:
# Pre-trained GloVe vectors, read from the attached Kaggle input dataset.
GLOVE_PATH = "/kaggle/input/glove6b100dtxt/glove.6B.100d.txt"
EMB_DIM = 100

# Parse "<word> <v1> ... <vD>" lines into a word -> float32 vector lookup.
# Lines whose vector length is not EMB_DIM are skipped.
emb_index = {}
with open(GLOVE_PATH, 'r', encoding='utf8') as f:
    for line in f:
        parts = line.rstrip().split(' ')
        word = parts[0]
        vec = np.asarray(parts[1:], dtype='float32')
        if vec.shape[0] == EMB_DIM:
            emb_index[word] = vec

# Rows for tokens that GloVe does not cover keep this random initialisation.
# A seeded generator makes those rows identical on every run; the previous
# unseeded np.random.normal call produced a different matrix each time.
emb_rng = np.random.default_rng(42)
embedding_matrix = emb_rng.normal(scale=0.6, size=(vocab_size, EMB_DIM)).astype(np.float32)
embedding_matrix[0] = np.zeros(EMB_DIM, dtype=np.float32)  # id 0 is <PAD>

# Overwrite the random rows with the pre-trained vector wherever one exists.
found = 0
for i, token in enumerate(itos):
    if token in emb_index:
        embedding_matrix[i] = emb_index[token]
        found += 1
print(f"Found {found}/{vocab_size} tokens in GloVe")


Found 19248/20000 tokens in GloVe


In [None]:
class IMDBDataset(Dataset):
    """Torch dataset over a frame with a 'seq' column (token-id lists) and a 'label' column.

    All sequences must have the same length, because they are stacked into a
    single 2-D tensor at construction time.

    Each item is a ``(token_ids, label)`` pair: a ``torch.long`` vector and a
    0-dim ``torch.float`` tensor.
    """
    def __init__(self, df):
        # Convert once up front. Building tensors from Python lists inside
        # __getitem__ repeated the conversion for every sample of every epoch.
        self.seqs = torch.as_tensor(np.asarray(df['seq'].tolist(), dtype=np.int64))
        self.labels = torch.tensor(df['label'].astype(np.int64).tolist(), dtype=torch.float)
    def __len__(self):
        return len(self.seqs)
    def __getitem__(self, idx):
        return self.seqs[idx], self.labels[idx]

BATCH_SIZE = 64

# One loader per split; only the training split is reshuffled each epoch.
train_loader, valid_loader, test_loader = (
    DataLoader(IMDBDataset(split), batch_size=BATCH_SIZE, shuffle=reshuffle)
    for split, reshuffle in ((train, True), (valid, False), (test, False))
)


In [None]:
class VanillaRNN(nn.Module):
    """Binary sequence classifier: pre-trained embeddings -> nn.RNN -> linear logit.

    Args:
        embedding_matrix: 2-D array of shape (vocab_size, emb_dim) used to
            initialise the embedding table. Row 0 is treated as padding.
        emb_dim: embedding width; must equal ``embedding_matrix.shape[1]``.
        hidden_size: hidden units per direction.
        num_layers: number of stacked recurrent layers.
        bidirectional: run the RNN in both directions and concatenate the
            two final states.
        freeze_emb: when True the embedding table receives no gradient.

    Raises:
        ValueError: if `emb_dim` disagrees with the matrix width.
    """
    def __init__(self, embedding_matrix, emb_dim=EMB_DIM, hidden_size=128, num_layers=1, bidirectional=False, freeze_emb=True):
        super().__init__()
        if embedding_matrix.shape[1] != emb_dim:
            # Fail here rather than with a shape error inside the RNN later.
            raise ValueError(
                f"emb_dim={emb_dim} does not match embedding_matrix width {embedding_matrix.shape[1]}"
            )
        # torch.tensor copies, so training never writes back into the caller's array.
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix),
            freeze=freeze_emb,
            padding_idx=0,
        )
        self.rnn = nn.RNN(input_size=emb_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        mult = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_size*mult, 1)
    def forward(self, x):
        """Return one logit per row of `x`, a (batch, seq_len) tensor of token ids.

        Assumes padding (id 0) appears only at the end of each row.
        """
        emb = self.embedding(x)
        # Number of real tokens per row. Packing makes the RNN stop there, so
        # the final hidden state is not computed after a run of <PAD> steps.
        # clamp(min=1) keeps all-padding rows legal for packing.
        lengths = (x != self.embedding.padding_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(emb, lengths, batch_first=True, enforce_sorted=False)
        _, h_n = self.rnn(packed)
        if self.rnn.bidirectional:
            # Last layer: forward state is h_n[-2], backward state is h_n[-1].
            hidden = torch.cat([h_n[-2], h_n[-1]], dim=1)
        else:
            hidden = h_n[-1]
        logits = self.fc(hidden).squeeze(1)
        return logits

# Prefer the GPU when torch can see one; otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Bidirectional vanilla RNN on top of frozen pre-trained embeddings.
model = VanillaRNN(embedding_matrix, hidden_size=128, bidirectional=True, freeze_emb=True)
model = model.to(device)
print(model)


VanillaRNN(
  (embedding): Embedding(20000, 100, padding_idx=0)
  (rnn): RNN(100, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)


In [None]:
# Binary cross-entropy on raw logits; Adam only sees parameters that still
# require gradients (frozen embeddings are left out).
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(
    (p for p in model.parameters() if p.requires_grad),
    lr=1e-3,
)
EPOCHS = 5

def run_epoch(loader, train=True):
    """Run the global `model` once over `loader` and report loss, F1 and accuracy.

    Uses the module-level `model`, `criterion`, `optimizer` and `device`.

    Args:
        loader: iterable of ``(token_ids, labels)`` batches.
        train: when True the model is put in training mode and the optimizer
            steps after every batch; when False the model is in eval mode and
            no gradients are tracked.

    Returns:
        ``(avg_loss, f1, acc)`` where `avg_loss` is the per-sample mean loss
        (NaN if the loader yields nothing) and the other two are computed with
        a 0.5 threshold on the sigmoid of the logits.
    """
    model.train(train)
    loss_sum, n_seen = 0.0, 0
    preds_all, trues_all = [], []
    with torch.set_grad_enabled(train):
        for x, y in tqdm(loader, leave=False):
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = criterion(logits, y)
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # Weight each batch by its size: averaging per-batch means would
            # over-weight a short final batch. Assumes `criterion` returns the
            # batch mean (the default reduction of BCEWithLogitsLoss).
            batch_n = y.size(0)
            loss_sum += loss.item() * batch_n
            n_seen += batch_n
            probs = torch.sigmoid(logits).detach().cpu().numpy()
            preds_all += (probs >= 0.5).astype(int).tolist()
            trues_all += y.cpu().numpy().astype(int).tolist()
    avg_loss = loss_sum / n_seen if n_seen else float("nan")
    f1 = f1_score(trues_all, preds_all)
    acc = accuracy_score(trues_all, preds_all)
    return avg_loss, f1, acc

for epoch in range(1, EPOCHS+1):
    # Training pass first (updates weights), then a no-grad pass over validation.
    (train_loss, train_f1, train_acc), (val_loss, val_f1, val_acc) = (
        run_epoch(train_loader, train=True),
        run_epoch(valid_loader, train=False),
    )
    print(f"Epoch {epoch}: Train loss {train_loss:.4f} f1 {train_f1:.4f} | Val loss {val_loss:.4f} f1 {val_f1:.4f}")


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 1: Train loss 0.6815 f1 0.5642 | Val loss 0.7092 f1 0.6679


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 2: Train loss 0.6652 f1 0.5907 | Val loss 0.6890 f1 0.6087


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 3: Train loss 0.6837 f1 0.5605 | Val loss 0.6707 f1 0.4608


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 4: Train loss 0.6788 f1 0.5703 | Val loss 0.6561 f1 0.6154


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 5: Train loss 0.6715 f1 0.5882 | Val loss 0.6727 f1 0.6725


In [None]:
# Score the current global `model` on the test loader without updating weights.
test_metrics = run_epoch(test_loader, train=False)
test_loss, test_f1, test_acc = test_metrics
print(f"Test F1: {test_f1:.4f} | Test Acc: {test_acc:.4f}")


  0%|          | 0/79 [00:00<?, ?it/s]

Test F1: 0.6707 | Test Acc: 0.5890


In [None]:
class LSTMClassifier(VanillaRNN):
    """VanillaRNN variant whose recurrent layer is an nn.LSTM.

    Accepts the same constructor arguments as VanillaRNN. `hidden_size`,
    `num_layers` and `bidirectional` are now honoured instead of being
    replaced by hard-coded values. For backward compatibility the LSTM stays
    bidirectional when the caller does not pass `bidirectional` at all.
    """
    def __init__(self, *args, **kwargs):
        # `bidirectional` is the 5th positional argument of VanillaRNN.__init__
        # (after embedding_matrix, emb_dim, hidden_size, num_layers). Only
        # inject the historical default when it was given neither way.
        if len(args) < 5:
            kwargs.setdefault('bidirectional', True)
        super().__init__(*args, **kwargs)
        # Swap the base nn.RNN for an LSTM of the same configuration. The base
        # class already sized self.fc as hidden_size * number_of_directions.
        base = self.rnn
        self.rnn = nn.LSTM(
            input_size=self.embedding.embedding_dim,
            hidden_size=base.hidden_size,
            num_layers=base.num_layers,
            batch_first=True,
            bidirectional=base.bidirectional,
        )

    def forward(self, x):
        """Return one logit per row of `x`, a (batch, seq_len) tensor of token ids.

        Assumes padding (the embedding's padding_idx) appears only at the end
        of each row.
        """
        embedded = self.embedding(x)
        # Pack by true length so the final state is taken at the last real
        # token rather than after trailing padding steps. clamp(min=1) keeps
        # all-padding rows legal for packing.
        lengths = (x != self.embedding.padding_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)
        _, (h_n, _c_n) = self.rnn(packed)

        if self.rnn.bidirectional:
            # Last layer: forward state is h_n[-2], backward state is h_n[-1].
            hidden = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            hidden = h_n[-1]

        logits = self.fc(hidden).squeeze(1)
        return logits



# Build the LSTM before creating the optimizer. Without this, `model` is still
# the VanillaRNN instance from the earlier cell, and the loop below would
# simply keep training that network instead of the LSTMClassifier defined above.
# Embeddings are frozen here; a later cell trains the unfrozen variant.
model = LSTMClassifier(
    embedding_matrix,
    hidden_size=128,
    bidirectional=True,
    freeze_emb=True
).to(device)
print(model)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)
EPOCHS = 5

def run_epoch(loader, train=True):
    """Run the global `model` once over `loader` and report loss, F1 and accuracy.

    Uses the module-level `model`, `criterion`, `optimizer` and `device`.

    Args:
        loader: iterable of ``(token_ids, labels)`` batches.
        train: when True the model is put in training mode and the optimizer
            steps after every batch; when False the model is in eval mode and
            no gradients are tracked.

    Returns:
        ``(avg_loss, f1, acc)`` where `avg_loss` is the per-sample mean loss
        (NaN if the loader yields nothing) and the other two are computed with
        a 0.5 threshold on the sigmoid of the logits.
    """
    model.train(train)
    loss_sum, n_seen = 0.0, 0
    preds_all, trues_all = [], []
    with torch.set_grad_enabled(train):
        for x, y in tqdm(loader, leave=False):
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = criterion(logits, y)
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # Weight each batch by its size: averaging per-batch means would
            # over-weight a short final batch. Assumes `criterion` returns the
            # batch mean (the default reduction of BCEWithLogitsLoss).
            batch_n = y.size(0)
            loss_sum += loss.item() * batch_n
            n_seen += batch_n
            probs = torch.sigmoid(logits).detach().cpu().numpy()
            preds_all += (probs >= 0.5).astype(int).tolist()
            trues_all += y.cpu().numpy().astype(int).tolist()
    avg_loss = loss_sum / n_seen if n_seen else float("nan")
    f1 = f1_score(trues_all, preds_all)
    acc = accuracy_score(trues_all, preds_all)
    return avg_loss, f1, acc

for epoch in range(1, EPOCHS+1):
    # Training pass first (updates weights), then a no-grad pass over validation.
    (train_loss, train_f1, train_acc), (val_loss, val_f1, val_acc) = (
        run_epoch(train_loader, train=True),
        run_epoch(valid_loader, train=False),
    )
    print(f"Epoch {epoch}: Train loss {train_loss:.4f} f1 {train_f1:.4f} | Val loss {val_loss:.4f} f1 {val_f1:.4f}")



  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 1: Train loss 0.6802 f1 0.5516 | Val loss 0.6375 f1 0.6910


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 2: Train loss 0.4667 f1 0.7865 | Val loss 0.4049 f1 0.8370


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 3: Train loss 0.3860 f1 0.8296 | Val loss 0.3810 f1 0.8303


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 4: Train loss 0.3591 f1 0.8436 | Val loss 0.3533 f1 0.8419


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 5: Train loss 0.3429 f1 0.8511 | Val loss 0.3554 f1 0.8332


### 2. Evaluation on the test set

In [None]:
# Score the current global `model` on the test loader without updating weights.
test_metrics = run_epoch(test_loader, train=False)
test_loss, test_f1, test_acc = test_metrics
print(f"Test F1: {test_f1:.4f} | Test Acc: {test_acc:.4f}")



  0%|          | 0/79 [00:00<?, ?it/s]

Test F1: 0.8422 | Test Acc: 0.8468


In [None]:
# Same bidirectional vanilla RNN configuration, this time with trainable embeddings.
model = VanillaRNN(
    embedding_matrix,
    hidden_size=128,
    bidirectional=True,
    freeze_emb=False,
).to(device)
print(model)

# Fresh loss and optimizer bound to the new model's trainable parameters.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(
    (p for p in model.parameters() if p.requires_grad),
    lr=1e-3,
)
EPOCHS = 5

def run_epoch(loader, train=True):
    """Run the global `model` once over `loader` and report loss, F1 and accuracy.

    Uses the module-level `model`, `criterion`, `optimizer` and `device`.

    Args:
        loader: iterable of ``(token_ids, labels)`` batches.
        train: when True the model is put in training mode and the optimizer
            steps after every batch; when False the model is in eval mode and
            no gradients are tracked.

    Returns:
        ``(avg_loss, f1, acc)`` where `avg_loss` is the per-sample mean loss
        (NaN if the loader yields nothing) and the other two are computed with
        a 0.5 threshold on the sigmoid of the logits.
    """
    model.train(train)
    loss_sum, n_seen = 0.0, 0
    preds_all, trues_all = [], []
    with torch.set_grad_enabled(train):
        for x, y in tqdm(loader, leave=False):
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = criterion(logits, y)
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # Weight each batch by its size: averaging per-batch means would
            # over-weight a short final batch. Assumes `criterion` returns the
            # batch mean (the default reduction of BCEWithLogitsLoss).
            batch_n = y.size(0)
            loss_sum += loss.item() * batch_n
            n_seen += batch_n
            probs = torch.sigmoid(logits).detach().cpu().numpy()
            preds_all += (probs >= 0.5).astype(int).tolist()
            trues_all += y.cpu().numpy().astype(int).tolist()
    avg_loss = loss_sum / n_seen if n_seen else float("nan")
    f1 = f1_score(trues_all, preds_all)
    acc = accuracy_score(trues_all, preds_all)
    return avg_loss, f1, acc

for epoch in range(1, EPOCHS+1):
    # Training pass first (updates weights), then a no-grad pass over validation.
    (train_loss, train_f1, train_acc), (val_loss, val_f1, val_acc) = (
        run_epoch(train_loader, train=True),
        run_epoch(valid_loader, train=False),
    )
    print(f"Epoch {epoch}: Train loss {train_loss:.4f} f1 {train_f1:.4f} | Val loss {val_loss:.4f} f1 {val_f1:.4f}")


VanillaRNN(
  (embedding): Embedding(20000, 100, padding_idx=0)
  (rnn): RNN(100, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 1: Train loss 0.6642 f1 0.6071 | Val loss 0.6523 f1 0.7217


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 2: Train loss 0.6779 f1 0.5664 | Val loss 0.6750 f1 0.6657


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 3: Train loss 0.6744 f1 0.5740 | Val loss 0.6854 f1 0.2034


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 4: Train loss 0.6115 f1 0.6679 | Val loss 0.6278 f1 0.5888


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 5: Train loss 0.6351 f1 0.6243 | Val loss 0.6489 f1 0.6828


In [None]:
# Score the current global `model` on the test loader without updating weights.
test_metrics = run_epoch(test_loader, train=False)
test_loss, test_f1, test_acc = test_metrics
print(f"Test F1: {test_f1:.4f} | Test Acc: {test_acc:.4f}")



  0%|          | 0/79 [00:00<?, ?it/s]

Test F1: 0.6958 | Test Acc: 0.6454


In [None]:
# Bidirectional LSTM classifier with trainable (unfrozen) embeddings.
lstm_kwargs = dict(hidden_size=128, bidirectional=True, freeze_emb=False)
model = LSTMClassifier(embedding_matrix, **lstm_kwargs).to(device)
print(model)

# Fresh loss and optimizer bound to the new model's trainable parameters.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(
    (p for p in model.parameters() if p.requires_grad),
    lr=1e-3,
)
EPOCHS = 5

def run_epoch(loader, train=True):
    """Run the global `model` once over `loader` and report loss, F1 and accuracy.

    Uses the module-level `model`, `criterion`, `optimizer` and `device`.

    Args:
        loader: iterable of ``(token_ids, labels)`` batches.
        train: when True the model is put in training mode and the optimizer
            steps after every batch; when False the model is in eval mode and
            no gradients are tracked.

    Returns:
        ``(avg_loss, f1, acc)`` where `avg_loss` is the per-sample mean loss
        (NaN if the loader yields nothing) and the other two are computed with
        a 0.5 threshold on the sigmoid of the logits.
    """
    model.train(train)
    loss_sum, n_seen = 0.0, 0
    preds_all, trues_all = [], []
    with torch.set_grad_enabled(train):
        for x, y in tqdm(loader, leave=False):
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = criterion(logits, y)
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # Weight each batch by its size: averaging per-batch means would
            # over-weight a short final batch. Assumes `criterion` returns the
            # batch mean (the default reduction of BCEWithLogitsLoss).
            batch_n = y.size(0)
            loss_sum += loss.item() * batch_n
            n_seen += batch_n
            probs = torch.sigmoid(logits).detach().cpu().numpy()
            preds_all += (probs >= 0.5).astype(int).tolist()
            trues_all += y.cpu().numpy().astype(int).tolist()
    avg_loss = loss_sum / n_seen if n_seen else float("nan")
    f1 = f1_score(trues_all, preds_all)
    acc = accuracy_score(trues_all, preds_all)
    return avg_loss, f1, acc

for epoch in range(1, EPOCHS+1):
    # Training pass first (updates weights), then a no-grad pass over validation.
    (train_loss, train_f1, train_acc), (val_loss, val_f1, val_acc) = (
        run_epoch(train_loader, train=True),
        run_epoch(valid_loader, train=False),
    )
    print(f"Epoch {epoch}: Train loss {train_loss:.4f} f1 {train_f1:.4f} | Val loss {val_loss:.4f} f1 {val_f1:.4f}")



LSTMClassifier(
  (embedding): Embedding(20000, 100, padding_idx=0)
  (rnn): LSTM(100, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 1: Train loss 0.5991 f1 0.6501 | Val loss 0.3621 f1 0.8551


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 2: Train loss 0.2967 f1 0.8795 | Val loss 0.2859 f1 0.8826


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 3: Train loss 0.2091 f1 0.9204 | Val loss 0.2768 f1 0.8849


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 4: Train loss 0.1482 f1 0.9496 | Val loss 0.3097 f1 0.8812


  0%|          | 0/625 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

Epoch 5: Train loss 0.0989 f1 0.9690 | Val loss 0.3595 f1 0.8764


In [None]:
# Score the current global `model` on the test loader without updating weights.
test_metrics = run_epoch(test_loader, train=False)
test_loss, test_f1, test_acc = test_metrics
print(f"Test F1: {test_f1:.4f} | Test Acc: {test_acc:.4f}")



  0%|          | 0/79 [00:00<?, ?it/s]

Test F1: 0.8792 | Test Acc: 0.8820
