# Movie Review Sentiment Analysis

## Dependency Management

In [1]:
# Check installed versions
!pip list | grep torch

torch                                 2.3.1+cu121
torchao                               0.10.0
torchdata                             0.11.0
torchsummary                          1.5.1
torchtext                             0.18.0
torchtune                             0.6.1


In [2]:
# This cell fixes the torch/torchtext compatibility issue in Colab.
# Run this once at the beginning of your session.
!pip uninstall -y torch torchtext torchaudio torchvision
!pip install torch==2.3.1 torchtext==0.18.0 --extra-index-url https://download.pytorch.org/whl/cu121

Found existing installation: torch 2.3.1+cu121
Uninstalling torch-2.3.1+cu121:
  Successfully uninstalled torch-2.3.1+cu121
Found existing installation: torchtext 0.18.0
Uninstalling torchtext-0.18.0:
  Successfully uninstalled torchtext-0.18.0
[0mLooking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu121
Collecting torch==2.3.1
  Using cached https://download.pytorch.org/whl/cu121/torch-2.3.1%2Bcu121-cp312-cp312-linux_x86_64.whl (780.9 MB)
Collecting torchtext==0.18.0
  Using cached torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl.metadata (7.9 kB)
Using cached torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl (2.0 MB)
Installing collected packages: torch, torchtext
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
timm 1.0.19 requires torchvision, which is not installed.
fastai 2.8.4 requires torchvision>=0.11, which is not ins

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import time
from tqdm import tqdm
import math



## Configuration

In [20]:
VOCAB_SIZE = 10000
EMBEDDING_DIM = 256 # Also known as d_model in Transformers
HIDDEN_DIM = 512 # Size of the feed-forward network
N_CLASSES = 2
BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 1e-4 # Transformers often benefit from a smaller learning rate
MODEL_SAVE_PATH = "sentiment_transformer.pth"

In [36]:
# Transformer-specific config
N_HEADS = 4 # Number of attention heads
N_ENCODER_LAYERS = 6 # Number of Transformer Encoder layers
DROPOUT = 0.1

In [37]:
# Set device
DEVICE = torch.device("cpu")
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
print(f"Using device: {DEVICE}")

Using device: cuda


## Data Preparation

In [38]:
print("Loading dataset and building vocabulary...")
dataset = load_dataset("imdb")
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

train_iter = (item['text'] for item in dataset['train'])
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>", "<pad>"], max_tokens=VOCAB_SIZE)
vocab.set_default_index(vocab["<unk>"])

Loading dataset and building vocabulary...


In [39]:
class ImdbFromScratchDataset(Dataset):
    def __init__(self, dataset_split, vocab, tokenizer):
        self.dataset = dataset_split
        self.vocab = vocab
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        text = item['text']
        label = item['label']
        token_ids = self.vocab(self.tokenizer(text))
        return torch.tensor(token_ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)

In [40]:
def collate_batch(batch):
    labels, texts = [], []
    for (text, label) in batch:
        labels.append(label)
        texts.append(text)

    labels_tensor = torch.tensor(labels, dtype=torch.long)
    texts_padded = nn.utils.rnn.pad_sequence(texts, batch_first=True, padding_value=vocab['<pad>'])

    return texts_padded, labels_tensor

In [41]:
train_dataset = ImdbFromScratchDataset(dataset['train'], vocab, tokenizer)
test_dataset = ImdbFromScratchDataset(dataset['test'], vocab, tokenizer)

In [42]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_batch)
print(f"Vocabulary size: {len(vocab)}")
print("Data pipeline prepared.")

Vocabulary size: 10000
Data pipeline prepared.


## Build Model

In [43]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x shape: [seq_len, batch_size, embedding_dim]
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [44]:
class TransformerSentimentClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, n_heads, n_encoder_layers, hidden_dim, output_dim, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.pos_encoder = PositionalEncoding(embed_dim, dropout)

        # Define a single Transformer Encoder Layer
        encoder_layers = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=n_heads, dim_feedforward=hidden_dim, dropout=dropout, batch_first=True)
        # Stack multiple encoder layers together
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=n_encoder_layers)

        self.fc = nn.Linear(embed_dim, output_dim)
        self.d_model = embed_dim

    def forward(self, text):
        # text shape: [batch size, seq len]

        # Generate a mask for the padding tokens
        # The mask should be True for tokens that should be ignored
        src_key_padding_mask = (text == vocab['<pad>'])

        embedded = self.embedding(text) * math.sqrt(self.d_model)
        # embedded shape: [batch size, seq len, embedding dim]

        # Note: PyTorch TransformerEncoder expects [seq_len, batch_size, embed_dim]
        # if batch_first=False (default). We are using batch_first=True.
        positioned = self.pos_encoder(embedded)

        # Pass through the Transformer Encoder
        encoder_output = self.transformer_encoder(positioned, src_key_padding_mask=src_key_padding_mask)
        # encoder_output shape: [batch size, seq len, embedding dim]

        # We will use the output of the first token ([CLS] token's equivalent) for classification
        # or we can average all the outputs
        pooled_output = encoder_output.mean(dim=1)
        # pooled_output shape: [batch size, embedding dim]

        return self.fc(pooled_output)

In [45]:
model = TransformerSentimentClassifier(
    len(vocab), EMBEDDING_DIM, N_HEADS, N_ENCODER_LAYERS, HIDDEN_DIM, N_CLASSES, DROPOUT, vocab['<pad>']
).to(DEVICE)

## Training and Evaluation Methods

In [46]:
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss().to(DEVICE)

In [47]:
def train_epoch(model, iterator, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for text, labels in tqdm(iterator, desc="Training"):
        text, labels = text.to(device), labels.to(device)
        optimizer.zero_grad()
        predictions = model(text)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(iterator)


In [48]:
def eval_model(model, iterator, criterion, device):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for text, labels in tqdm(iterator, desc="Evaluating"):
            text, labels = text.to(device), labels.to(device)
            predictions = model(text)
            loss = criterion(predictions, labels)
            total_loss += loss.item()
            preds = torch.argmax(predictions, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_preds)
    return total_loss / len(iterator), accuracy

## Main Training Loop

In [49]:
print("Starting training...")
total_start_time = time.time()

for epoch in range(EPOCHS):
    epoch_start_time = time.time()

    print(f'--- Epoch {epoch + 1}/{EPOCHS} ---')
    train_loss = train_epoch(model, train_loader, optimizer, criterion, DEVICE)
    val_loss, val_accuracy = eval_model(model, test_loader, criterion, DEVICE)

    epoch_end_time = time.time()
    epoch_duration = epoch_end_time - epoch_start_time

    print(f'Training Loss: {train_loss:.4f}')
    print(f'Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_accuracy:.4f}')
    print(f"Epoch {epoch + 1} duration: {epoch_duration:.2f} seconds")

total_end_time = time.time()
print("\n--- Training Complete ---")
print(f"Total training time: {(total_end_time - total_start_time):.2f} seconds")
torch.save(model.state_dict(), MODEL_SAVE_PATH)
print(f"Model saved to {MODEL_SAVE_PATH}")

Starting training...
--- Epoch 1/3 ---


Training: 100%|██████████| 782/782 [07:32<00:00,  1.73it/s]
Evaluating: 100%|██████████| 782/782 [00:29<00:00, 26.50it/s]


Training Loss: 0.5366
Validation Loss: 0.5697 | Validation Accuracy: 0.7592
Epoch 1 duration: 481.70 seconds
--- Epoch 2/3 ---


Training: 100%|██████████| 782/782 [07:32<00:00,  1.73it/s]
Evaluating: 100%|██████████| 782/782 [00:29<00:00, 26.74it/s]


Training Loss: 0.4117
Validation Loss: 0.5511 | Validation Accuracy: 0.7544
Epoch 2 duration: 481.70 seconds
--- Epoch 3/3 ---


Training: 100%|██████████| 782/782 [07:32<00:00,  1.73it/s]
Evaluating: 100%|██████████| 782/782 [00:29<00:00, 26.89it/s]


Training Loss: 0.3684
Validation Loss: 0.4953 | Validation Accuracy: 0.8165
Epoch 3 duration: 481.19 seconds

--- Training Complete ---
Total training time: 1444.59 seconds
Model saved to sentiment_transformer.pth
