In [110]:
import torch

from torchtext import data
from torchtext.data import BucketIterator

from torchtext import datasets

from tqdm import tqdm

In [20]:
# Single seed shared by every stochastic step in the notebook.
SEED = 42

# Seed PyTorch's default random number generator.
torch.manual_seed(seed=SEED)

# Ask cuDNN to use deterministic algorithms so repeated runs match.
torch.backends.cudnn.deterministic = True

In [29]:
# Review text: tokenised with spaCy. include_lengths=True makes batch.text a
# (tokens, lengths) pair, which the training code unpacks for sequence packing.
Text = data.Field(
    tokenize='spacy',
    include_lengths=True,
)

# Sentiment label, stored as a float tensor.
Label = data.LabelField(dtype=torch.float)

In [37]:
# Load the IMDB dataset, processed through the two fields defined above.
train_data, test_data = datasets.IMDB.splits(
    text_field=Text,
    label_field=Label,
)

In [38]:
import random

# 70/30 train/validation split (0.7 is also the default split_ratio; it is
# spelled out here so the code matches the intent).
#
# random.seed() returns None, so `random_state=random.seed(SEED)` handed the
# splitter None and only worked through the seeding side effect. Seed first,
# then pass the actual generator state.
random.seed(SEED)

# NOTE: this rebinds train_data, so re-running the cell splits the already
# reduced training set again; re-run the loading cell first.
train_data, valid_data = train_data.split(
    split_ratio=0.7,
    random_state=random.getstate(),
)

In [39]:
# Peek at the first training example.
# The loop variable is deliberately not called `data`: that name is the
# imported torchtext `data` module, and rebinding it would break any later
# re-run of the cell that calls data.Field / data.LabelField.
for example in train_data:
    print(example.text)
    print(example.label)
    break

['And', 'how', 'many', 'actors', 'can', 'he', 'get', 'to', 'stand', 'in', 'for', 'his', 'own', 'neurotic', ',', 'compulsive', 'uber', '-', 'New', 'Yorker', 'persona', '?', 'In', 'this', 'film', 'Woody', 'is', 'played', 'by', 'Will', 'Ferrell', 'in', 'what', 'is', 'mercifully', 'less', 'a', 'direct', 'impersonation', 'than', 'the', 'one', 'Kenneth', 'Branagh', 'did', 'in', '"', 'Celebrity', '.', '"', 'It', "'s", 'an', 'annoyingly', 'repetitive', 'story', 'now', ':', 'nebbishy', ',', 'neurotic', 'man', 'with', 'a', 'wife', 'or', 'girlfriend', 'falls', 'madly', 'in', 'love', 'with', 'a', 'shiksa', 'queen', 'upon', 'which', 'he', 'projects', 'all', 'manner', 'of', 'perfection', '.', 'Everyone', 'lives', 'in', 'perfect', 'gigantic', 'apartments', 'in', 'great', 'Manhattan', 'neighborhoods', ',', 'everyone', 'constantly', 'patronizes', 'expensive', ',', 'exclusive', 'restaurants', 'during', 'which', 'all', 'the', 'characters', 'relate', 'fascinating', 'anecdotes', 'and', 'discuss', 'arcane',

In [41]:
# Build the vocabularies from the training split only.
# Text vocab: capped at MAX_VOCAB_SIZE tokens, with GloVe (6B, 100-d) vectors
# attached; words without a pre-trained vector are initialised via unk_init.
MAX_VOCAB_SIZE = 25000

Text.build_vocab(train_data,
                 max_size=MAX_VOCAB_SIZE,
                 vectors='glove.6B.100d',
                 unk_init=torch.Tensor.normal_)

Label.build_vocab(train_data)

In [42]:
Batch_size = 64

# One bucketing iterator per split. sort_within_batch=True orders the examples
# inside each batch, which the model relies on when it packs the sequences.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    sort_within_batch=True,
    batch_size=Batch_size,
)



# Build the model


In [43]:
import torch.nn as nn

In [85]:
class RNN(nn.Module):
    """LSTM classifier over packed, padded token sequences.

    Tokens are embedded, fed through a (multi-layer, optionally bidirectional)
    LSTM, and the final hidden state of the top layer is projected to
    ``output_dim`` logits by a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size (per direction).
        output_dim: number of output logits.
        n_layers: number of stacked LSTM layers.
        bidirectional: if True the LSTM also reads each sequence backwards.
        dropout: dropout probability, applied to the embeddings, between
            stacked LSTM layers and to the final hidden state.
        pad_idx: vocabulary index of the padding token, passed to
            ``nn.Embedding`` as ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim,
                 hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # nn.LSTM only applies dropout between stacked layers; with a single
        # layer a non-zero value has no effect and triggers a warning.
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)

        # The classifier input is one hidden state per direction, so its width
        # has to follow the `bidirectional` flag rather than assume two.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return logits of shape [batch size, output_dim].

        Args:
            text: token indices, [sent len, batch size].
            text_lengths: unpadded length of every sequence in the batch,
                sorted in decreasing order (pack_padded_sequence is called
                with its default enforce_sorted=True).
        """
        # embedded = [sent len, batch size, emb dim]
        embedded = self.dropout(self.embedding(text))

        # pack_padded_sequence requires tensor lengths to live on the CPU,
        # even when the batch itself is on another device.
        if torch.is_tensor(text_lengths):
            text_lengths = text_lengths.cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)

        # Only the final hidden state is needed for classification, so the
        # packed per-step output is not unpacked.
        # hidden = [num layers * num directions, batch size, hid dim]
        _, (hidden, _) = self.rnn(packed_embedded)

        if self.rnn.bidirectional:
            # Top layer: hidden[-2] is the forward direction, hidden[-1] the
            # backward one; concatenate them along the feature axis.
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Unidirectional: the last entry is the top layer's final state.
            hidden = hidden[-1, :, :]

        # hidden = [batch size, hid dim * num directions]
        return self.fc(self.dropout(hidden))
        
        
        

In [86]:
# Hyperparameters. The input size and padding index come from the vocabulary
# built earlier; EMBEDDING_DIM matches the 100-d GloVe vectors.
INPUT_DIM = len(Text.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = Text.vocab.stoi[Text.pad_token]

model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)


In [88]:
# Echo the module to inspect its layers and their sizes.
model

RNN(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [90]:
#print number of trainable parameters

def count_parameters(model):
    """Return the total element count of all parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# The ' ,' format spec adds thousands separators and reserves a leading space
# for the sign, which is why the printed count is preceded by two spaces.
print(f' The model has {count_parameters(model): ,} trainable parameters')

 The model has  4,810,857 trainable parameters




The final addition is copying the pre-trained word embeddings we loaded earlier into the embedding layer of our model.

We retrieve the embeddings from the field's vocab and check that they have the correct size, `[vocab size, embedding dim]`.


In [93]:
# Pre-trained vectors attached to the text vocabulary; print their size to
# check it is [vocab size, embedding dim].
pretrained_embeddings = Text.vocab.vectors
print(pretrained_embeddings.size())

torch.Size([25002, 100])




We then replace the initial weights of the embedding layer with the pre-trained embeddings.

Note: this should always be done on the weight.data and not the weight!


In [94]:
# Overwrite the embedding layer's initial weights in place with the
# pre-trained vectors. copy_ returns the destination tensor, so the cell
# echoes the resulting weight matrix.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.4611, -0.0639, -1.3667,  ...,  1.6309, -0.0847,  1.0844],
        [ 0.1954, -1.3350,  0.3945,  ..., -0.9228, -1.2620,  1.0861],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.7836,  0.9079,  0.9177,  ...,  0.7037,  1.4912,  0.5677],
        [ 0.1329,  0.1716,  0.7947,  ..., -0.4911, -0.1513,  0.3190],
        [ 0.2416,  0.0783, -0.8522,  ..., -0.1595,  0.6774,  0.2029]])



As our `<unk>` and `<pad>` tokens aren't in the pre-trained vocabulary, they were initialized using `unk_init` (an $\mathcal{N}(0,1)$ distribution) when we built our vocab. It is preferable to initialize them both to all zeros, to explicitly tell our model that, initially, they are irrelevant for determining sentiment.

We do this by manually setting their row in the embedding weights matrix to zeros. We get their row by finding the index of the tokens, which we have already done for the padding index.

Note: like initializing the embeddings, this should be done on the weight.data and not the weight!


In [95]:
# Vocabulary index of the unknown-word token, i.e. its row in the embedding
# matrix (the padding token's index is already stored in PAD_IDX).
UNK_IDX = Text.vocab.stoi[Text.unk_token]

In [97]:
# Zero the <unk> and <pad> rows in a single indexed assignment; the zero
# vector is broadcast over both selected rows.
model.embedding.weight.data[[UNK_IDX, PAD_IDX]] = torch.zeros(EMBEDDING_DIM)

In [98]:
# Show the weight matrix to confirm the <unk> and <pad> rows are now zeros.
print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.7836,  0.9079,  0.9177,  ...,  0.7037,  1.4912,  0.5677],
        [ 0.1329,  0.1716,  0.7947,  ..., -0.4911, -0.1513,  0.3190],
        [ 0.2416,  0.0783, -0.8522,  ..., -0.1595,  0.6774,  0.2029]])


We can now see that the first two rows of the embedding weights matrix have been set to zeros. Because we passed the index of the pad token as the `padding_idx` of the embedding layer, that row will remain zeros throughout training; the `<unk>` token embedding, however, will be learned.

# Train the model

In [99]:
import torch.optim as optim

In [100]:
# Adam over all model parameters, using the optimizer's default settings
# (no hyperparameters are overridden here).
optimizer=optim.Adam(model.parameters())

In [101]:
# Echo the optimizer to show the hyperparameters it was created with.
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)

In [102]:
# Binary cross-entropy computed on raw logits: the model's forward pass
# returns the linear layer's output without applying a sigmoid.
criterion = nn.BCEWithLogitsLoss()


In [103]:
def binary_accuracy(preds, y):
    """Return the fraction of predictions that equal their label.

    `preds` are raw logits: they are squashed with a sigmoid and rounded to
    the nearest integer before being compared element-wise with `y`. The
    result is a 0-dim tensor.
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = torch.eq(predicted_labels, y).float()
    return hits.sum() / len(hits)

In [111]:
def train(model , iterator , optimizer,criterion):
    """Run one training pass over `iterator`.

    Each batch gets a forward pass, a backward pass and one optimizer step.
    Returns (loss, accuracy), each averaged over the number of batches.
    """
    model.train()

    running_loss, running_acc = 0, 0

    for batch in tqdm(iterator):
        tokens, lengths = batch.text
        targets = batch.label

        optimizer.zero_grad()

        # The model emits [batch size, 1]; drop the trailing axis so the
        # logits line up with the labels.
        logits = model(tokens, lengths).squeeze(1)

        loss = criterion(logits, targets)
        acc = binary_accuracy(logits, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [112]:
def evaluate(model , iterator , criterion):
    """Score the model on `iterator` without updating any weights.

    The model is put in eval mode and gradients are disabled. Returns
    (loss, accuracy), each averaged over the number of batches.
    """
    model.eval()

    total_loss, total_acc = 0, 0

    with torch.no_grad():
        for batch in tqdm(iterator):
            tokens, lengths = batch.text
            logits = model(tokens, lengths).squeeze(1)  # -> [batch size]

            total_loss += criterion(logits, batch.label).item()
            total_acc += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [None]:
# import time

# def epoch_time(start_time , end_time):
#     elapsed_time = end_time - start_time
#     elapsed_mins = int(elapsed_time/60)
#     elapsed_secs = int(elapsed_time - (elapsed_mins*60))
#     return elapsed_mins , elapsed_secs
    
    
    

In [None]:
N_epochs = 5

# Lowest validation loss seen so far; decides when to checkpoint.
best_valid_loss = float('inf')

for epoch in range(N_epochs):

    train_loss , train_acc = train(model , train_iterator , optimizer,criterion)
    valid_loss , valid_acc = evaluate(model , valid_iterator , criterion)

    # Keep only the weights of the best epoch (by validation loss) on disk.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict() , 'tut2-model.pt')

    # Label every block of metrics with its epoch; without this header the
    # per-epoch lines cannot be told apart in the output.
    print(f'epoch : {epoch+1:02}')
    print(f'\tTrain loss: {train_loss:.3f} | Train acc: {train_acc*100:.2f}%')
    print(f'\t Val loss: {valid_loss:.3f} | val acc: {valid_acc*100:.2f}%')

  1%|▌                                                                                 | 2/274 [00:31<51:05, 11.27s/it]