# 1. Install Dependencies

In [None]:
!pip install -U datasets torch
!apt-get install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.


# 2. Preprocess Data

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import BertTokenizerFast
import time

BATCH_SIZE = 32
MAX_LEN = 256
HIDDEN_SIZE = 128
NUM_LAYERS = 2
EPOCHS = 20
EMBED_DIM = 128
NUM_CLASSES = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
dataset = load_dataset("imdb")

class IMDBDataset(Dataset):
    def __init__(self, split):
        self.texts = dataset[split]['text']
        self.labels = dataset[split]['label']

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        encoded = tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN,
            return_tensors='pt'
        )
        input_ids = encoded['input_ids'].squeeze(0)
        return input_ids, torch.tensor(self.labels[idx], dtype=torch.long)

train_data = IMDBDataset('train')
test_data = IMDBDataset('test')
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True)

print(len(train_loader.dataset.labels))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


25000


# 3. Results

In [None]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=tokenizer.pad_token_id)
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers,
                            bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        _, (hn, _) = self.lstm(x)
        # Concatenate last hidden states for forward and backward LSTM
        hn = torch.cat((hn[-2], hn[-1]), dim=1)  
        return self.fc(hn)


model = BiLSTMClassifier(
    vocab_size=tokenizer.vocab_size,
    embed_dim=EMBED_DIM,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    num_classes=NUM_CLASSES
).to(device)

model

BiLSTMClassifier(
  (embedding): Embedding(30522, 128, padding_idx=0)
  (lstm): LSTM(128, 128, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)

In [4]:
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print("Number of trainable parameters:", pytorch_total_params)

Number of trainable parameters: 4566786


In [5]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train():
    model.train()
    total, correct = 0, 0
    start = time.time()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        pred = outputs.argmax(1)
        correct += (pred == labels).sum().item()
        total += labels.size(0)
    print(f"Train Acc: {correct/total*100:.2f}% | Time: {time.time() - start:.1f}s")


def evaluate(best_acc, patience):
    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            pred = outputs.argmax(1)
            correct += (pred == labels).sum().item()
            total += labels.size(0)
    acc = correct/total
    print(f"Test Acc: {acc*100:.2f}%")

    if acc > best_acc:
        best_acc = acc
        patience = 2
    else:
        patience -= 1
    return best_acc, patience

best_acc = 0
patience = 2
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    train()
    best_acc, patience = evaluate(best_acc, patience)

    if patience == 0:
      print("Early stopping!")
      break



Epoch 1/20
Train Acc: 62.72% | Time: 49.4s
Test Acc: 66.91%

Epoch 2/20
Train Acc: 77.91% | Time: 49.0s
Test Acc: 76.90%

Epoch 3/20
Train Acc: 83.56% | Time: 49.2s
Test Acc: 82.49%

Epoch 4/20
Train Acc: 87.78% | Time: 48.3s
Test Acc: 83.24%

Epoch 5/20
Train Acc: 91.02% | Time: 49.1s
Test Acc: 84.56%

Epoch 6/20
Train Acc: 93.93% | Time: 49.3s
Test Acc: 85.10%

Epoch 7/20
Train Acc: 96.01% | Time: 48.9s
Test Acc: 84.83%

Epoch 8/20
Train Acc: 97.76% | Time: 48.8s
Test Acc: 84.56%
Early stopping!


Best accuracy obtained: 85.10%