The IMDB dataset is a popular dataset used for sentiment analysis tasks. It consists of a collection of movie reviews, along with their corresponding sentiment labels (positive or negative).

To work with the IMDB dataset in this Jupyter Notebook, the cells below import the necessary modules and load the dataset using the `torchtext.datasets.IMDB` class. The dataset comes pre-split into training and testing sets.

## Setup

In [2]:
from torch import nn
import torch
from torch.utils.data import DataLoader
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
import torch.nn.utils.rnn as rnn_utils
import torch.nn.functional as F
from src.data import DataLoaderScratch
from src.trainer import TrainerScratch
from src.optimizers import SGDScratch
from src.losses import CrossEntropyScratch
from src.metrics import AccuracyScratch
from src.functions import conv2d, maxpool2d

## Data Preprocessing

In [3]:
# Fetch the train and test splits of IMDB and materialise each one as a list,
# so the samples can be traversed more than once.
train_iter, test_iter = (list(split) for split in IMDB(split=('train', 'test')))

# torchtext's 'basic_english' tokenizer turns a raw review string into tokens
tokenizer = get_tokenizer('basic_english')

# Build a vocabulary
def yield_tokens(data_iter):
    """Lazily yield the token list of every sample in ``data_iter``.

    Each item of ``data_iter`` is unpacked as a 2-tuple; the first element is
    ignored and the second is passed to the module-level ``tokenizer``.
    """
    yield from (tokenizer(review) for _label, review in data_iter)

# Vocabulary over the training tokens, with "<unk>" and "<pad>" registered
# as special tokens.
vocab = build_vocab_from_iterator(
    yield_tokens(train_iter),
    specials=["<unk>", "<pad>"],
    min_freq=1,
)

# Tokens missing from the vocabulary resolve to the "<unk>" index
vocab.set_default_index(vocab["<unk>"])


def text_pipeline(x):
    """Tokenize the string ``x`` and return the vocabulary index of each token."""
    return [vocab[token] for token in tokenizer(x)]

# Function to collate data samples into batches
def collate_batch(batch):
    """Collate ``(label, text)`` samples into padded tensors.

    Args:
        batch: iterable of ``(label, text)`` pairs as yielded by the dataset.

    Returns:
        Tuple ``(labels, texts, lengths)``:
        ``labels``  - int64 tensor, 1 for a positive review and 0 otherwise;
        ``texts``   - int64 tensor of token indices, padded on the right with
                      the ``"<pad>"`` index up to the longest review in the batch;
        ``lengths`` - int64 tensor with the unpadded length of each review.
    """
    # pad_sequence pads with 0 by default, which is the "<unk>" index in this
    # vocabulary; pad with the dedicated "<pad>" token instead.
    pad_index = vocab["<pad>"]

    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        # Older torchtext releases yield 'pos'/'neg' strings; newer ones yield
        # integer labels where 2 means positive. Accept both encodings.
        label_list.append(int(_label in ('pos', 2)))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))

    labels = torch.tensor(label_list, dtype=torch.int64)
    texts = rnn_utils.pad_sequence(text_list, batch_first=True, padding_value=pad_index)
    lengths = torch.tensor(lengths, dtype=torch.int64)
    return labels, texts, lengths

# Wrap both in-memory splits so that DataLoader can consume them
train_dataset, test_dataset = map(to_map_style_dataset, (train_iter, test_iter))

# Batches of 8 samples; only the training loader shuffles
train_dataloader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_batch,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=8,
    shuffle=False,
    collate_fn=collate_batch,
)

# Sanity check: print the tensor shapes of the first training batch only
for labels, texts, lengths in train_dataloader:
    print(
        f"Batch labels: {labels.shape}",
        f"Batch texts: {texts.shape}",
        f"Batch lengths: {lengths.shape}",
        sep="\n",
    )
    break  # one batch is enough for the check

Batch labels: torch.Size([8])
Batch texts: torch.Size([8, 850])
Batch lengths: torch.Size([8])


## Implementation from Scratch

In [4]:
def relu(x):
    """Element-wise rectified linear unit: ``max(x, 0)``.

    Args:
        x: input tensor.

    Returns:
        Tensor with negative entries replaced by zero, keeping the dtype and
        device of ``x``.
    """
    # Build the zero on x's own dtype/device: a bare torch.zeros(1) is always
    # CPU float32, which promotes integer/half inputs and fails for tensors
    # living on another device.
    out = torch.maximum(x, x.new_zeros(1))
    return out

def softmax(X):
    """Row-wise softmax of a 2-D tensor of logits.

    Args:
        X: tensor whose dimension 1 indexes the classes.

    Returns:
        Tensor of the same shape as ``X`` whose rows sum to 1.
    """
    # Softmax is invariant to a per-row shift; subtracting the row maximum
    # keeps exp() from overflowing to inf (and the result from becoming NaN)
    # when the logits are large.
    shifted = X - X.amax(dim=1, keepdim=True)
    X_exp = torch.exp(shifted)
    X_softmax = X_exp / X_exp.sum(axis=1, keepdims=True)
    return X_softmax

def log_loss(y_pred, y):
    """Mean negative log-likelihood of the true classes.

    Args:
        y_pred: tensor of shape (N, C) holding predicted class probabilities.
        y: int64 tensor of shape (N,) holding the true class indices.

    Returns:
        Scalar tensor: the mean over the batch of ``-log(y_pred[i, y[i]])``.
    """
    # Select the probability of the true class directly. One-hot encoding y
    # without an explicit class count infers C from the batch, so a batch that
    # lacks the highest class yields a narrower mask that broadcasts wrongly;
    # masking also turns 0 * log(0) of a non-target class into NaN.
    true_class_prob = y_pred.gather(1, y.unsqueeze(1)).squeeze(1)
    loss = -torch.log(true_class_prob).mean()
    return loss

In [53]:
# Model sizes
batch_size = 8
vocab_size = len(vocab)
embedding_size = 300
hidden_size = 50
sequence_length = 615

# Embedding matrix: one row of size `embedding_size` per vocabulary entry
E = nn.Parameter(torch.randn(vocab_size, embedding_size) * 0.1)

# Recurrent weights. The training loop multiplies Wa with hstack([a, ex]),
# so the hidden-to-hidden columns (Waa) must come before the
# input-to-hidden columns (Wax).
Wax = torch.randn(hidden_size, embedding_size)
Waa = torch.randn(hidden_size, hidden_size)
Wa = nn.Parameter(torch.hstack([Waa, Wax]) * 0.1)

# Output projection from the hidden state to the two sentiment classes
Wya = nn.Parameter(torch.randn(2, hidden_size) * 0.1)

# Every trainable tensor must be handed to the optimizer; leaving Wya out
# would freeze the output layer at its random initialisation.
parameters = [E, Wa, Wya]
optimizer = SGDScratch(parameters, lr=0.1)

In [59]:

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_dataloader:

        # Unpack the batch
        targets, inputs, lengths = batch

        # Zero the gradients
        optimizer.zero_grad()

        # Get the batch size
        batch_size = inputs.shape[0]
        # Number of time steps to unroll: the padded width of the batch
        sequence_length = inputs.shape[1]

        # Embedding lookup. Indexing E with the token ids gives the same
        # result as multiplying one-hot vectors with E, without materialising
        # a (batch, seq, vocab) one-hot tensor and a per-sample copy of E.
        Ex = E[inputs]

        # Initialize the hidden state
        a = torch.zeros(batch_size, hidden_size)

        # Forward pass
        # NOTE(review): the recurrence is purely linear (no tanh/relu on the
        # hidden state) — confirm whether a non-linearity was intended.
        for sequence_index in range(sequence_length):
            # Get the current input
            ex = Ex[:, sequence_index, :]

            # Candidate hidden state: Wa applied to [a, ex] for every sample
            a_next = torch.hstack([a, ex]) @ Wa.T

            # Only advance samples that still have real tokens at this step;
            # finished samples keep their last hidden state so that padding
            # does not leak into the prediction.
            still_active = (lengths > sequence_index).unsqueeze(-1)
            a = torch.where(still_active, a_next, a)

        # Calculate the output from the final hidden state of each sample
        y_pred = softmax(a @ Wya.T)
        loss = log_loss(y_pred, targets)

        # Compute gradients and update parameters
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean training loss of the epoch
    print(total_loss / len(train_dataloader))

KeyboardInterrupt: 

In [140]:
# Scratch cell: rebinds x to the matrix product of its first element and E.
# NOTE(review): x is not defined in any visible cell — presumably a one-hot
# encoded batch; confirm before re-running.
x = x[0] @ E

torch.Size([615, 300])

In [None]:
# NOTE(review): unfinished scratch call — no arguments are passed, and batched
# matmul is used elsewhere in this notebook as torch.bmm; confirm F.bmm exists
# before running this cell.
F.bmm()

In [141]:
# Scratch cell: display the shape of x.
x.shape

torch.Size([8, 615, 100684])

In [64]:
# Scratch cell.
# NOTE(review): torch.randn is called without a size here, and the result is
# bound to E, which would replace the embedding parameter defined earlier —
# confirm this cell is dead code before re-running the notebook.
A = torch.zeros(8)
E = torch.randn() * 0.1


tensor([[279,  13,   9,  ...,   0,   0,   0],
        [ 51,  26, 741,  ...,   0,   0,   0],
        [ 13,  33, 215,  ...,   0,   0,   0],
        ...,
        [ 14,  10,   6,  ...,   0,   0,   0],
        [ 14,  21,  17,  ...,   0,   0,   0],
        [ 13,   9, 152,  ...,   0,   0,   0]])

In [66]:
# Scratch cell: take the padded width of the last `texts` batch as the
# sequence length.
# NOTE(review): the loop below has no body, so this cell is unfinished and
# will not parse as written.
sequence_length = texts.shape[1]
for sequence_index in range(sequence_length):
    

In [67]:
# Scratch cell: display the current value of sequence_length.
sequence_length

615

In [None]:
# Two conv/pool stages followed by a fully connected softmax classifier.
# NOTE(review): W1..W3, b1..b3 and calculate_same_padding are not defined in
# the visible cells, and the loader built above yields 3-tuples — this looks
# like a cell carried over from another notebook; confirm before running.
num_epochs = 3
for epoch in range(num_epochs):
    total_loss = 0
    for inputs, targets in train_dataloader:
        # Clear the gradients left over from the previous step
        optimizer.zero_grad()

        # Stage 1: conv -> ReLU -> 2x2 max-pool
        # (author's note: sees 1x28x28 and outputs 16x14x14)
        padding = calculate_same_padding(input_size=28, kernel_size=3, stride=1)
        P1 = maxpool2d(
            relu(conv2d(inputs, W1, padding=padding) + b1),
            kernel_size=2,
            stride=2,
        )

        # Stage 2: conv -> ReLU -> 2x2 max-pool
        # (author's note: sees 16x14x14 and outputs 32x7x7)
        padding = calculate_same_padding(input_size=14, kernel_size=3, stride=1)
        P2 = maxpool2d(
            relu(conv2d(P1, W2, padding=padding) + b2),
            kernel_size=2,
            stride=2,
        )

        # Classifier: flatten everything but the batch dimension, apply the
        # affine layer, then turn the scores into probabilities
        y_pred = softmax(P2.flatten(start_dim=1) @ W3 + b3)

        # Loss, backward pass and parameter update
        loss = log_loss(y_pred, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean loss over the batches of this epoch
    print(total_loss / len(train_dataloader))

2.285222887992859
2.1936349272727966
2.0781480073928833
