# Dependencies

In [1]:
import collections

import datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
import torchtext

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seeding to ensure reproducibility
seed = 0

# Seed NumPy's global RNG and PyTorch's CPU/CUDA RNGs with the same value.
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

# IMDB Dataset

Loading the dataset is very easy using the `datasets` package! The command below downloads the IMDB dataset into the *data/classification* directory (if it is not already there) and loads it into a variable named `dataset`.

In [3]:
# Load the "zapsdcn/imdb" dataset, using ../data/classification/ as the cache directory.
dataset = datasets.load_dataset("zapsdcn/imdb", cache_dir="../data/classification/")

The loaded `dataset` is already split into training, validation and testing sets. Looking at the outputs, we can see the number of examples (`num_rows`) in each split and three features named `id`, `text` and `label`.

In [4]:
# Pull the three predefined splits out of the loaded dataset in one pass.
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Display the splits (features and row counts).
train_dataset, validation_dataset, test_dataset

(Dataset({
     features: ['id', 'text', 'label'],
     num_rows: 20000
 }),
 Dataset({
     features: ['id', 'text', 'label'],
     num_rows: 5000
 }),
 Dataset({
     features: ['id', 'text', 'label'],
     num_rows: 25000
 }))

Looking at one such training example, we can see that 
- `id` contains information related to some index. Don't bother too much about this as this is not relevant for the application at hand.
- `text` is where we have the actual review!
- `label` denotes whether it's a good (if 1) or bad (if 0) review! 

In [5]:
# Peek at the third-from-last training example.
train_dataset[-3]

{'id': 'train_10348',
 'text': 'This movie is very good. The screenplay is enchanting. But Meryl Streep is most impressive. Her performance is excellent. She brings me to go into the heart of her role.',
 'label': 1}

## Tokenization and Vocabulary Building

Computers only understand numbers, so we have to find a way to represent each word as a number. But first we have to split sentences into individual words. This is called tokenization and is done using a tokenizer!

I'm using a simple tokenizer provided by `torchtext`.

In [6]:
# Fetch torchtext's "basic_english" tokenizer.
tokenizer = torchtext.data.utils.get_tokenizer("basic_english")

As expected, a tokenizer splits a text into words!

In [None]:
# Tokenize the text of the third-from-last training example.
tokenizer(train_dataset[-3]["text"])

Let's do this for all of the training, validation and testing datasets.

In [7]:
def tokenize(example, tokenizer, max_length):
    """Split an example's text into tokens, keeping at most ``max_length`` of them.

    Args:
        example: Mapping with a "text" entry holding the raw string.
        tokenizer: Callable that turns a string into a sequence of tokens.
        max_length: Maximum number of leading tokens to keep.

    Returns:
        A dict with a single "tokens" entry containing the truncated tokens.
    """
    all_tokens = tokenizer(example["text"])
    return {"tokens": all_tokens[:max_length]}

In [8]:
# Cap every review at this many tokens; some reviews ramble on for quite some time!
max_length = 512

# Tokenize all three splits with the same tokenizer and length cap.
train_dataset, validation_dataset, test_dataset = (
    split.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length})
    for split in (train_dataset, validation_dataset, test_dataset)
)

Let's build a vocabulary out of these tokens.

In [9]:
# Only keep tokens that occur at least 5 times in the training set
min_freq = 5

# Special tokens for unknown words and padding (these will come in later)
special_tokens = ["<unk>", "<pad>"]

# The vocabulary is built from the training tokens only.
vocab = torchtext.vocab.build_vocab_from_iterator(
    train_dataset["tokens"],
    min_freq=min_freq,
    specials=special_tokens,
)

In [None]:
# Show the vocabulary size and only the first few entries; the full
# index-to-string list is far too long to display usefully.
len(vocab), vocab.get_itos()[:10]

Making sure that words that are not in the vocabulary get mapped to `"<unk>"` and assigning a token for padding as well (we'll cover padding later).

In [11]:
# Indices assigned to the two special tokens.
unk_index = vocab["<unk>"]
pad_index = vocab["<pad>"]

# From here on, any token missing from the vocabulary resolves to the <unk> index.
vocab.set_default_index(unk_index)

# Sanity check: an out-of-vocabulary token should give the <unk> index.
vocab["some_token_that_is_not_in_vocab"]

0

Now with this we can easily map any review to a sequence of numbers!

In [None]:
# Map the example review's tokens to their vocabulary indices.
vocab.lookup_indices(train_dataset[-3]["tokens"])

Let's do this in a more refined way for all of the datasets and store these numbers in the `id` feature!

In [12]:
def numericalize(example, vocab):
    """Convert an example's tokens into their vocabulary indices.

    Args:
        example: Mapping with a "tokens" entry (a sequence of string tokens).
        vocab: Object exposing ``lookup_indices(tokens)``.

    Returns:
        A dict with a single "id" entry holding the looked-up indices.
    """
    token_indices = vocab.lookup_indices(example["tokens"])
    return {"id": token_indices}

In [13]:
# Replace each split's tokens with vocabulary indices, using the same vocab for all.
train_dataset, validation_dataset, test_dataset = (
    split.map(numericalize, fn_kwargs={"vocab": vocab})
    for split in (train_dataset, validation_dataset, test_dataset)
)

Converting to torch tensors to be loaded into the Transformer!

In [14]:
# Expose only the "id" and "label" columns of each split, formatted as torch tensors.
train_dataset, validation_dataset, test_dataset = (
    split.with_format(type="torch", columns=["id", "label"])
    for split in (train_dataset, validation_dataset, test_dataset)
)

## Data Loader

Let's now pad the sequences to make them of equal lengths and then load the dataset into pytorch data loaders.

In [15]:
def get_collate_fn(pad_index):
    """Build a collate function that pads "id" sequences with ``pad_index``.

    Args:
        pad_index: Value used to fill positions beyond a sequence's own length.

    Returns:
        A function that takes a list of examples (each with "id" and "label"
        tensors) and returns a dict holding the padded, batch-first "id"
        tensor and the stacked "label" tensor.
    """

    def collate_fn(batch):
        # Pad every sequence to the length of the longest one in this batch.
        padded_ids = torch.nn.utils.rnn.pad_sequence(
            [example["id"] for example in batch],
            padding_value=pad_index,
            batch_first=True,
        )
        labels = torch.stack([example["label"] for example in batch])
        return {"id": padded_ids, "label": labels}

    return collate_fn

def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap ``dataset`` in a DataLoader whose batches are padded with ``pad_index``.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of examples per batch.
        pad_index: Padding value handed to the collate function.
        shuffle: Whether the loader should shuffle the data (default False).

    Returns:
        A ``torch.utils.data.DataLoader`` over ``dataset``.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=get_collate_fn(pad_index),
    )

In [16]:
# Batch Size
batch_size = 512

# Only the training loader shuffles; validation and test keep the default shuffle=False.
train_data_loader = get_data_loader(train_dataset, batch_size, pad_index, shuffle=True)
validation_data_loader = get_data_loader(validation_dataset, batch_size, pad_index)
test_data_loader = get_data_loader(test_dataset, batch_size, pad_index)

You can think of pytorch data loaders as an iterable over which you can loop and get batches one by one!

In [17]:
# Print the shape of the padded "id" tensor for every training batch.
for batch in train_data_loader:
    print(batch["id"].shape)

torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([512, 512])
torch.Size([32, 512])


# Transformers Neural Network

## Self Attention

In [17]:
# First 5 token indices of the example review.
ex_review = train_dataset[-3]["id"][:5]
# Embedding table with one 7-dimensional vector per vocabulary entry.
token_embedding = torch.nn.Embedding(embedding_dim=7, num_embeddings=len(vocab))
# Look up the embedding vector for each of the 5 tokens.
emb_token = token_embedding(ex_review)
emb_token

tensor([[-0.2282,  0.2800,  0.0732,  1.1133,  0.2823,  0.4342,  0.4569],
        [-0.0899,  0.7298, -1.8453, -0.1021, -1.0335, -0.3126,  0.2458],
        [ 1.7067,  2.3804, -1.0670,  1.1149, -0.1407,  0.8058,  0.3276],
        [ 0.9929, -0.2065, -0.2448, -0.2793, -0.2769,  0.7489, -0.6435],
        [-0.4781,  1.3892, -0.5023,  1.6797, -1.0240, -0.5753, -1.4325]],
       grad_fn=<EmbeddingBackward0>)

In [17]:
# Load a saved array from disk (presumably the example review's embeddings,
# going by the file name).
X = np.load("../anim/ex_review_emb.npy")
X = torch.tensor(X)
# Add a leading dimension of size 1 (used as the batch dimension below).
X = X.unsqueeze(dim=0)
X, X.shape

(tensor([[[ 1.7333, -0.2087, -0.1684,  0.8662, -0.7795,  1.2341, -0.6230],
          [-1.3696, -0.8951,  0.8114, -1.0988, -1.2615,  0.4062, -0.3548],
          [-0.4408,  0.2652, -1.0598,  1.9366,  0.5431, -1.2966, -0.1174],
          [-0.8789, -0.2537, -0.0356,  0.8743,  0.3683,  1.5304,  1.6146],
          [ 0.9612,  0.1940, -0.7919, -1.4849, -0.6838,  0.1440,  1.0798]]]),
 torch.Size([1, 5, 7]))

In [15]:
# Self Attention
def self_attention(X):
    """Apply basic (parameter-free) scaled dot-product self-attention.

    Args:
        X: Tensor of shape (..., seq_len, dim), e.g. a batched
            (batch, seq_len, dim) input or a single (seq_len, dim) sequence.

    Returns:
        Tensor with the same shape as ``X`` in which each position is a
        softmax-weighted average of all positions in its sequence.
    """
    # Scale by the actual embedding size rather than a hard-coded constant.
    dim = X.shape[-1]

    # Weight Matrix: pairwise dot products between positions.
    # matmul (unlike bmm) also accepts unbatched 2-D input.
    W = torch.matmul(X, X.transpose(-2, -1))
    W = W / (dim ** 0.5)  # Scaling for stability
    W = F.softmax(W, dim=-1)

    # Output Matrix
    y = torch.matmul(W, X)

    return y

In [18]:
# Run self-attention on X and show the result together with its shape.
y = self_attention(X)
y, y.shape

(tensor([[[ 1.2696, -0.1809, -0.2182,  0.6464, -0.6488,  1.0064, -0.2963],
          [-1.0309, -0.7250,  0.5664, -0.8579, -1.0547,  0.4712, -0.1181],
          [-0.3646,  0.1967, -0.9301,  1.7236,  0.4302, -0.9713, -0.0155],
          [-0.6338, -0.2349, -0.0945,  0.6974,  0.1573,  1.1944,  1.2254],
          [ 0.6570,  0.0204, -0.5302, -0.9239, -0.6334,  0.3498,  0.7731]]]),
 torch.Size([1, 5, 7]))

In [19]:
# Convert explicitly to a NumPy array before saving rather than relying on
# np.save's implicit conversion, which fails for tensors that require grad
# or live on a GPU.
np.save("../anim/ex_review_out.npy", y.detach().cpu().numpy())

In [20]:
# Random permutation of the indices along X's second dimension.
idx = torch.randperm(X.shape[1])
idx

tensor([4, 0, 1, 3, 2])

In [21]:
# Show X next to its first element with rows reordered by idx.
X, X[0][idx]

(tensor([[[ 1.7333, -0.2087, -0.1684,  0.8662, -0.7795,  1.2341, -0.6230],
          [-1.3696, -0.8951,  0.8114, -1.0988, -1.2615,  0.4062, -0.3548],
          [-0.4408,  0.2652, -1.0598,  1.9366,  0.5431, -1.2966, -0.1174],
          [-0.8789, -0.2537, -0.0356,  0.8743,  0.3683,  1.5304,  1.6146],
          [ 0.9612,  0.1940, -0.7919, -1.4849, -0.6838,  0.1440,  1.0798]]]),
 tensor([[ 0.9612,  0.1940, -0.7919, -1.4849, -0.6838,  0.1440,  1.0798],
         [ 1.7333, -0.2087, -0.1684,  0.8662, -0.7795,  1.2341, -0.6230],
         [-1.3696, -0.8951,  0.8114, -1.0988, -1.2615,  0.4062, -0.3548],
         [-0.8789, -0.2537, -0.0356,  0.8743,  0.3683,  1.5304,  1.6146],
         [-0.4408,  0.2652, -1.0598,  1.9366,  0.5431, -1.2966, -0.1174]]))

In [None]:
# Permute the sequence positions while keeping the leading batch dimension,
# so the input has the same 3-D layout as the earlier self_attention(X) call.
y_ = self_attention(X[:, idx])
y_, y_.shape