In [1]:
# Create a simple transformer encoder with pytorch to be used for sentiment analysis on the IMDB dataset
# 25000 comments on IMDB will be fed into an encoder to see if the comments are postive or negative. 
# this is an enhanced notebook that is originally derived from the NYU Deep Learning Class 2020

import numpy as np
import torch
from torch import nn, optim
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets

In [2]:
# Set the device to use GPU if available 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Allow randomness just once running this notebook
torch.manual_seed(0)

<torch._C.Generator at 0x7f912249ce30>

In [3]:
# Download the torchtext to access the dataset
import torchtext.data as data
import torchtext.datasets as datasets

In [4]:
# Set the lemgth of each comment (word per instances) made at the IMDB dataset. 
max_len = 200 # max number of words for each instance 
text = data.Field(sequential=True, fix_length=max_len, batch_first=True, lower=True, dtype=torch.long)
label = data.LabelField(sequential=False, dtype=torch.long)
datasets.IMDB.download('./') # download the dataset to the working folder
ds_train, ds_test = datasets.IMDB.splits(text, label, path='./imdb/aclImdb/') # split the data into train and test




In [5]:
# Check the size of the training and test sets
print('train : ', len(ds_train))
print('test : ', len(ds_test))
print('train.fields :', ds_train.fields)

train :  25000
test :  25000
train.fields : {'text': <torchtext.data.field.Field object at 0x7f9123d44d10>, 'label': <torchtext.data.field.LabelField object at 0x7f9123c72e90>}


In [6]:
# Split the training data (90-10) into training data and validation data
ds_train, ds_valid = ds_train.split(0.9)
print('train : ', len(ds_train))
print('valid : ', len(ds_valid))
print('test : ', len(ds_test))

# View one of the instances of the training data
print(ds_train[0].text, len(ds_train[0].text))

train :  22500
valid :  2500
test :  25000
['if', 'you', 'have', 'not', 'heard', 'of', 'this', 'film', 'from', 'walt', 'disney', 'pictures,', 'do', 'not', 'worry', 'about', 'it.', 'it', 'would', 'be', 'classed', 'along', 'the', 'other', 'films', 'by', 'disney', 'that', 'are', 'meant', 'for', 'educational', 'purposes', 'like', '"family', 'planning".<br', '/><br', '/>it', 'was', 'co-produced', 'with', 'kotex', 'to', 'teach', 'pre-teen', 'girls', 'about', 'menstruation,', 'supposably.', 'it', 'only', 'educates', 'at', 'a', 'superficial', 'level,', 'so', 'it', 'does', 'not', 'go', 'into', 'heavy', 'detail', 'for', 'the', 'animated', '"ram\'s', 'head"/', 'reproductive', 'system', 'sequence.<br', '/><br', '/>the', 'film', 'does', 'show', '"the', 'wonderful', 'world', 'of', 'disney"', 'elements', 'like', 'the', 'turning', 'of', 'the', 'page', 'and', 'the', 'use', 'of', 'animation', 'to', 'tell', 'the', 'story.<br', '/><br', '/>this', 'film', 'is', 'impossible', 'to', 'find,', 'so', 'if', 'you

In [7]:
# Convert each instance in train set into an array (each word is a value 0-50000) and train labels into 0 or 1
# the word capacity we choose is 50K to name each word a number (max number of words)
num_words = 50_000  

# since we picked max length 200 words for each instance, the output of each instance is an array with max 200 elements
# if an instance has less than 200 words, the remaining of the elements will be "1" when it is numericalized
text.build_vocab(ds_train, max_size=num_words) # at this point, text data is numericalized, each word is now a number
label.build_vocab(ds_train) # label data is numericalized as well (neg:0 or pos:1)

In [8]:
# Regroup instances under batches, each batch having 64 instances (a value between 32-500 is a good choice)
batch_size = 64
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (ds_train, ds_valid, ds_test), batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False)



In [9]:
# Turn the train_loader batch into an iterable format 
train_iterator, valid_iterator = iter(train_loader), iter(valid_loader)
for batch in train_iterator: # everytime it is called, a new batch is called with 64 instances each having 200 values  
    x = batch.text.to(device)
    y = batch.label.to(device)




In [10]:
# Check one single instance of a typical batch
print(x[13], x.shape) # x torch.Size([64, 200])
print(y[0:5], y.shape) # y torch.Size([64])

tensor([ 1426,   209,   859,   781,     6,    26,     3,   104,  1517,    50,
            9,    14,     3,  3613,  1426,    10,    20,    14, 17271,     6,
          477,     9,   197,    12,    14,   376, 19984,    30,     5,     2,
          115,   122,     9,    62,  4998,    13,  4739,    44,     3,   104,
         2452,    18,   209,   756,    45,    25,   460,     0,     0,    25,
          128,    78,  1256,    26,   301,    17,     0,     0,    10,    14,
            2,    20,   178, 37471,  7557,    57,     0,    58,    10,    30,
            7,   157,   821,    37,     9,  1107,    44,    39,     3,    56,
          291,    12,  1544,    28,   545,  1090,     4,    45,    25,   460,
            2,    82,   647,    25,   128,    37,     0,     0,    44,    39,
            3,   233,    20,    17,    46,  1047,   811,    16,   111,  4074,
         8951,   110,   205,   341,  3462,     4,   363,    11,    44,     3,
          838,  1307,   188,  2509,    13,  8320,     1,     1, 

In [11]:
# See how nn.Embedding layer works

# first step is to encode these numericalized arrays in our dataset, adding each value a dimension 
# nn.Embedding is a linear layer (M x N matrix), with M: number of words and N: size of each word vector.
# multiplying a one-hot vector with the embedding outputs the embedded item, in other words, embedding is a look up.

# word_embeddings = nn.Embedding(vocab_size, d_model, padding_idx=1)
# position_embeddings = nn.Embedding(max_position_embeddings, d_model)
# there is also sinusoidal embeddings needed to be added to word + positional embeddings for large problems

# Example for a basic embedding:
word_to_ix = {"welcome": 0, "to": 1, "pythonic": 2, "fool": 3}
embeds = nn.Embedding(4, 8)  # 4 words in vocab, a new dimension for each word is created with len:8
print(embeds)
lookup_tensor = torch.tensor([word_to_ix["fool"]], dtype=torch.long)
embeded_value = embeds(lookup_tensor)
print(embeded_value)


Embedding(4, 8)
tensor([[ 0.7502, -0.5855, -0.1734,  0.1835,  1.3894,  1.5863,  0.9463, -0.8437]],
       grad_fn=<EmbeddingBackward>)


In [12]:
# See how positional embedding layer works

length = 6
a = torch.ones([3,6], dtype = torch.long)
position_ids = torch.arange(length, dtype=torch.long) # (max_seq_length)
position_ids = position_ids.unsqueeze(0).expand_as(a) # (bs, max_seq_length)
print(position_ids, position_ids.shape, '\n')

vocab_size = 100
d_model_test = 5
 # nn.Embedding takes a 2d and outputs 3d: Takes (len,dim) and outputs (len,dim,model) with a vocabulary embedding. 
embedding_layer = nn.Embedding(vocab_size, d_model_test, padding_idx=1)
pos_embeddings = embedding_layer(position_ids) 
print(pos_embeddings.shape)
print(pos_embeddings)

tensor([[0, 1, 2, 3, 4, 5],
        [0, 1, 2, 3, 4, 5],
        [0, 1, 2, 3, 4, 5]]) torch.Size([3, 6]) 

torch.Size([3, 6, 5])
tensor([[[-0.6136,  0.0316, -0.4927,  0.2484,  0.4397],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [-0.2897,  0.0525,  0.5229,  2.3022, -1.4689],
         [-1.5867, -0.6731,  0.8728,  1.0554,  0.1778],
         [-0.2303, -0.3918,  0.5433, -0.3952, -0.4462],
         [ 0.7440,  1.5210,  3.4105, -1.5312, -1.2341]],

        [[-0.6136,  0.0316, -0.4927,  0.2484,  0.4397],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [-0.2897,  0.0525,  0.5229,  2.3022, -1.4689],
         [-1.5867, -0.6731,  0.8728,  1.0554,  0.1778],
         [-0.2303, -0.3918,  0.5433, -0.3952, -0.4462],
         [ 0.7440,  1.5210,  3.4105, -1.5312, -1.2341]],

        [[-0.6136,  0.0316, -0.4927,  0.2484,  0.4397],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [-0.2897,  0.0525,  0.5229,  2.3022, -1.4689],
         [-1.5867, -0.6731, 

In [13]:
# Embeddings Class

# batch size: 64, max sequence length: 200, vocab_size: 50000, max_pos_embed: 10000, d_model: 32 will be chosen 
# p attr. is also needed if an additional sinusoidal embedding is used

batch_size = 64
d_model = 32
max_position_embeddings = 10000
vocab_size = 50000
max_seq_length = 200 # (also actual seq_length since all sequences are padded to 200 words per instance already)


class Embeddings(nn.Module): 
    '''takes an input with dimensions: [bs, max_seq_lenght] and outputs embeddings: [bs, max_seq_length, d_model]'''
    
    def __init__(self, d_model, vocab_size, max_position_embeddings): 
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, d_model, padding_idx=1)
        self.position_embeddings = nn.Embedding(max_position_embeddings, d_model)
        self.LayerNorm = nn.LayerNorm(d_model, eps=1e-12)

        
    def forward(self, input_ids): # each input_ids has 64 instances, each instance has 200 values size: [64,200]
        seq_length = input_ids.size(1) # size: (batch size, seq length)
        position_ids = torch.arange(seq_length, dtype=torch.long) # (seqmax_seq_length)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length)
        
        # prepare the embeddings
        word_embeddings = self.word_embeddings(input_ids) # shape: (bs, seq_length, d_model)
        position_embeddings = self.position_embeddings(position_ids) # shape: (bs, seq_length, d_model)
        embeddings = word_embeddings + position_embeddings # depending on the problem, some coefficients can be used           
        normalized_embeddings = self.LayerNorm(embeddings) # normalize the values in the embeddings layers
        
        # output dimensions are: [bs, max_seq_length, d_model]
        return normalized_embeddings
    

In [14]:
# Multi-Head Attention Class
# num_heads = 8

class MultiHeadAttention(nn.Module):  
    '''Takes an input with dimensions: [bs, max_seq_lenght, d_model]. Does the dot product with WQ,WK,WV layers. 
       Splits these values into multiple attention heads. Makes the calculation for scaled dot product attention. 
       Combines the attention heads into the original dimensions. Passes the result from a linear layer.'''
    
    def __init__(self, d_model, num_heads, d_input=None):  # a "p" positional embedding vector could be added here 
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        if d_input is None:
            d_xq = d_xk = d_xv = d_model
        else:
            d_xq, d_xk, d_xv = d_input
            
        # Make sure that the embedding dimension of model is a multiple of number of heads
        assert d_model % self.num_heads == 0

        self.d_k = d_model // self.num_heads  # dimension per head
        
        # Initialize the q,k,v matrices
        self.W_q = nn.Linear(d_xq, d_model, bias=False) # shape: [d_model, d_model]
        self.W_k = nn.Linear(d_xk, d_model, bias=False)
        self.W_v = nn.Linear(d_xv, d_model, bias=False)
        
        # Outputs of all sub-layers need to be of dimension d_model
        self.W_h = nn.Linear(d_model, d_model)
        
        
    def forward(self, x): # queries, keys, and values will be learned  
        batch_size, seq_length, d_model = x.size() # embedded inputs have 3 dimensions
        
        # self.W_q(x) will yield a matrix with size (batch_size, seq_length, d_model) same shape of the input
        W_q_x = self.W_q(x) # (batch_size, seq_length, d_model) Ex: torch.Size([64, 200, 32]) 
        W_k_x = self.W_k(x)
        W_v_x = self.W_v(x)                      
        
        # split heads [batch_size, seq_length, d_model] --> [batch_size, n_heads, seq_length, d_k]
        Q = W_q_x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2) # (batch_size, n_heads, seq_length, d_k)
        K = W_q_x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2) # Ex: torch.Size([64, 8, 200, 4])
        V = W_q_x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2) 
        
        # Calculate the attention weights for each of the heads
        H_cat, A = self.scaled_dot_product_attention(Q, K, V)
        
        # Regroup the heads
        H_cat = H_cat.transpose(1, 2).contiguous().view(batch_size, seq_length, d_model)
        
        # Final linear layer  
        H = self.W_h(H_cat) # (batch_size, seq_length, d_model)
        
        return H, A
    
    
    def scaled_dot_product_attention(self, Q, K, V):
        
        # Scale the value by d_k so that the soft(arg)max doesnt saturate
        Q = Q / np.sqrt(self.d_k)                         # (batch_size, n_heads, seq_length, d_k)
        scores = torch.matmul(Q, K.transpose(2,3))        # K.T == (batch_size, n_heads, d_k, seq_length)
        
        # scores will have a dimension of: (bs, n_heads, seq_length, seq_length) where q_length == k_length == max_seq
        A = nn.Softmax(dim=-1)(scores)   # (bs, n_heads, seq_length, seq_length) over the last column 
        
        # (bs, n_heads, seq_length, seq_length).(batch_size, n_heads, seq_length, d_k) = (bs, n_heads, seq_length, d_k)
        H = torch.matmul(A, V) # (bs, n_heads, seq_length, d_k)

        return H, A 

In [15]:
# Reminder of the values that will be used in this model

# hidden_dim = 300
# batch_size = 64
# d_model = 32
# num_heads = 8
# max_position_embeddings = 10000
# vocab_size = 50000
# max_seq_length = 200 # (also actual seq_length since all sequences are padded to 200 words per instance already)
# no p value is used

In [16]:
# Create an Encoder Class and a Two-Layer NN 

class TwoLayerNN(nn.Module):
    def __init__(self, d_model, hidden_dim):
        super().__init__()
        self.L1 = nn.Linear(d_model,hidden_dim)
        self.L2 = nn.Linear(hidden_dim, d_model)

    def forward(self, x):
        x = self.L1(x)
        # x = nn.ReLU(x) # raises error since torch can't locate the dimensions of where to apply ReLU
        x = self.L2(x)
        return x
    
    
class Encoder(nn.Module):
    def __init__(self, d_model, num_heads, hidden_dim, input_vocab_size, maximum_position_encoding):
        super().__init__()
        self.d_model = d_model
        self.embedding = Embeddings(d_model, input_vocab_size, maximum_position_encoding) # init Embeddings object
        self.mha = MultiHeadAttention(d_model, num_heads) # init MultiHeadAttention object
        self.NN = TwoLayerNN(d_model, hidden_dim) # init TwoLayeNN object
        self.layernorm1 = nn.LayerNorm(normalized_shape = d_model, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(normalized_shape = d_model, eps=1e-6)
        
        
    def forward(self, x):
        # Transform the inputs to embedded values (batch_size, input_seq_length, d_model)
        x = self.embedding(x) 
        
        # input the embedded values to Multi Head Attn.
        attn_output, _ = self.mha(x)  # (batch_size, input_seq_len, d_model)
        
        # do layer normalization after adding the residual connection 
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
        
        # feed forward the attention+residual connection to a 2-layer-NN
        output = self.NN(out1)  # (batch_size, input_seq_len, d_model)
        
        # do layer normalization after adding residual connection 
        out2 = self.layernorm2(out1 + output)  # (batch_size, input_seq_len, d_model)
        
        # return the transformer's encoder output
        return out2  # (batch_size, input_seq_len, d_model)
    
    

In [17]:
# Create a class to transform the ouputs of the encoder to match the labels (similar to binary logistic regression)

class TransformerClassifier(nn.Module):
    def __init__(self, d_model, num_heads, hidden_dim, input_vocab_size, maximum_position_encoding=10000):
        super().__init__()
        
        self.encoder = Encoder(d_model, num_heads, hidden_dim, input_vocab_size,
                         maximum_position_encoding=10000)
        self.dense = nn.Linear(d_model, 2) # answers are either 0 or 1

    def forward(self, x):
        x = self.encoder(x)  
        x, _ = torch.max(x, dim=1)  # torch max returns: (max values, argmax values)
        x = self.dense(x)
        return x 

In [18]:
# Instantiate the model object

model = TransformerClassifier(d_model=32, num_heads=8, hidden_dim=400, input_vocab_size=50002, 
                              maximum_position_encoding =10000)
model.to(device)

TransformerClassifier(
  (encoder): Encoder(
    (embedding): Embeddings(
      (word_embeddings): Embedding(50002, 32, padding_idx=1)
      (position_embeddings): Embedding(10000, 32)
      (LayerNorm): LayerNorm((32,), eps=1e-12, elementwise_affine=True)
    )
    (mha): MultiHeadAttention(
      (W_q): Linear(in_features=32, out_features=32, bias=False)
      (W_k): Linear(in_features=32, out_features=32, bias=False)
      (W_v): Linear(in_features=32, out_features=32, bias=False)
      (W_h): Linear(in_features=32, out_features=32, bias=True)
    )
    (NN): TwoLayerNN(
      (L1): Linear(in_features=32, out_features=400, bias=True)
      (L2): Linear(in_features=400, out_features=32, bias=True)
    )
    (layernorm1): LayerNorm((32,), eps=1e-06, elementwise_affine=True)
    (layernorm2): LayerNorm((32,), eps=1e-06, elementwise_affine=True)
  )
  (dense): Linear(in_features=32, out_features=2, bias=True)
)

In [19]:
# Set the optimizer and training epochs 
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
epochs = 10

In [20]:
# Train the model

def train(train_loader, valid_loader):
    
    for epoch in range(1, epochs+1):
        train_iterator, valid_iterator = iter(train_loader), iter(valid_loader)
        nb_batches_train = len(train_loader)
        train_acc = 0
        model.train()
        losses = 0.0

        for batch in train_iterator: # 🤪
            x = batch.text.to(device)
            y = batch.label.to(device)
            
            out = model(x)  

            loss = torch.nn.functional.cross_entropy(out, y)  
            
            model.zero_grad()  

            loss.backward()  
            losses += loss.item()

            optimizer.step()  
                        
            train_acc += (out.argmax(1) == y).cpu().numpy().mean()
        
        sum_valid_acc = 0   
        for batch in valid_iterator:
            model.eval()
            acc = 0 
            x = batch.text.to(device)
            y = batch.label.to(device)
                
            out = model(x)
            sum_valid_acc += (out.argmax(1) == y).cpu().numpy().mean()
            valid_acc = sum_valid_acc / len(valid_loader)
        
        print(f"Training loss at epoch {epoch} is {losses / nb_batches_train}")
        print(f"Training accuracy: {train_acc / nb_batches_train}")
        print('Evaluating on validation:' , valid_acc)

In [21]:
train(train_loader, valid_loader)

Training loss at epoch 1 is 0.6719875825061039
Training accuracy: 0.5813703440656566
Evaluating on validation: 0.640625
Training loss at epoch 2 is 0.6052326389842413
Training accuracy: 0.6760130602904041
Evaluating on validation: 0.685546875
Training loss at epoch 3 is 0.5212189660153606
Training accuracy: 0.7437756470959596
Evaluating on validation: 0.734765625
Training loss at epoch 4 is 0.4232503722675822
Training accuracy: 0.8075185448232323
Evaluating on validation: 0.771875
Training loss at epoch 5 is 0.331117995633659
Training accuracy: 0.8587042297979799
Evaluating on validation: 0.790625
Training loss at epoch 6 is 0.25776643604463473
Training accuracy: 0.897189670138889
Evaluating on validation: 0.81015625
Training loss at epoch 7 is 0.20072476316074078
Training accuracy: 0.9246221985479799
Evaluating on validation: 0.829296875
Training loss at epoch 8 is 0.1499159182421863
Training accuracy: 0.9470190183080809
Evaluating on validation: 0.834765625
Training loss at epoch 9 i

In [22]:
# Test the model's accuracy

test_iterator = iter(test_loader)  
nb_batches = len(test_loader)
model.eval()
sum_test_acc = 0 
    
for batch in iter(test_loader):
    x = batch.text.to(device)
    y = batch.label.to(device)
    out = model(x)
    sum_test_acc += (out.argmax(1) == y).cpu().numpy().mean()
print(f"test accuracy: {sum_test_acc/ nb_batches}")

test accuracy: 0.8175271739130434


In [24]:
# Not too bad. The model's accuracy can be further improved by stacking more layers and training it longer time. 
# The model is an overfit on the training set