# 1. Install Dependencies

In [18]:
# Install required libraries
!pip install -U datasets torch
!apt-get install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.


# 2. Preprocess Data

In [19]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import BertTokenizerFast
import time

BATCH_SIZE = 32
MAX_LEN = 512
HIDDEN_SIZE = 128
NUM_LAYERS = 1
EPOCHS = 20
EMBED_DIM = 768
NUM_CLASSES = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
dataset = load_dataset("imdb")

class IMDBDataset(Dataset):
    def __init__(self, split):
        self.texts = dataset[split]['text']
        self.labels = dataset[split]['label']

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        encoded = tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN,
            return_tensors='pt'
        )
        input_ids = encoded['input_ids'].squeeze(0)
        return input_ids, torch.tensor(self.labels[idx], dtype=torch.long)

train_data = IMDBDataset('train')
test_data = IMDBDataset('test')
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE)

print(len(train_loader.dataset.labels))

25000


# 3. Results

In [20]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=tokenizer.pad_token_id)
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers,
                            bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        _, (hn, _) = self.lstm(x)
        hn = torch.cat((hn[-2], hn[-1]), dim=1)  # concat last fwd & bwd hidden states
        return self.fc(hn)


model = BiLSTMClassifier(
    vocab_size=tokenizer.vocab_size,
    embed_dim=EMBED_DIM,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    num_classes=NUM_CLASSES
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train():
    model.train()
    total, correct = 0, 0
    start = time.time()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        pred = outputs.argmax(1)
        correct += (pred == labels).sum().item()
        total += labels.size(0)
    print(f"Train Acc: {correct/total*100:.2f}% | Time: {time.time() - start:.1f}s")


def evaluate(best_acc, patience):
    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            pred = outputs.argmax(1)
            correct += (pred == labels).sum().item()
            total += labels.size(0)
    acc = correct/total
    print(f"Test Acc: {acc*100:.2f}%")

    if acc > best_acc:
        best_acc = acc
        patience = 2
    else:
        patience -= 1
    return best_acc, patience

best_acc = 0
patience = 2
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    train()
    best_acc, patience = evaluate(best_acc, patience)

    if patience == 0:
      print("Early stopping!")
      break



Epoch 1/20
Train Acc: 65.00% | Time: 69.1s
Test Acc: 73.10%

Epoch 2/20
Train Acc: 81.43% | Time: 67.9s
Test Acc: 77.33%

Epoch 3/20
Train Acc: 88.99% | Time: 67.7s
Test Acc: 78.52%

Epoch 4/20
Train Acc: 93.04% | Time: 68.8s
Test Acc: 78.90%

Epoch 5/20
Train Acc: 95.37% | Time: 67.9s
Test Acc: 78.04%

Epoch 6/20
Train Acc: 97.24% | Time: 67.6s
Test Acc: 79.18%

Epoch 7/20
Train Acc: 97.81% | Time: 67.9s
Test Acc: 80.79%

Epoch 8/20
Train Acc: 98.74% | Time: 68.9s
Test Acc: 78.41%

Epoch 9/20
Train Acc: 99.00% | Time: 68.0s
Test Acc: 77.67%
Early stopping!


Best accuracy obtained: 80.79%