# Sentance entailment with Pytorch using vanila RNN/LSTM/GRU
## This notebook has the follwing topics 
    1. Dataset Preprocessing using torchtext
    2. Training details 
    3. Inference
## Observations
    1. Since the architecture is very simple the accuracy is very bad
    2. The model tends to overfit

In [1]:
import numpy as np
import pandas as pd
import random
from tqdm import tqdm as tqdm

from torchtext.data import Field, BucketIterator, TabularDataset
import spacy

import torch
import torch.nn as nn
from torchtext import data
import torch.optim as optim

In [2]:
## GLOBAL SETTINGS
SEED = 2021
BATCH_SIZE = 32
visualize_index = 10 # Index to be used to test/visualize items
label_dict = {0 : 'contradiction', 1 : 'entailment', 2 : 'neutral'} # output label and there index

# 1. Dataset Preprocessing using torchtext

In [3]:
# load data
train_df = pd.read_csv("../dataset/assignment_data_set/train.csv")
print(f"Number of training sentence pairs : {train_df.shape[0]}")

# Display 10 random rows from the data.
train_df.sample(visualize_index)

Number of training sentence pairs : 9349


Unnamed: 0,gold_label,sentence1,sentence2
8491,neutral,A white dog with long hair jumps to catch a re...,A white chihuahua mix with long curly hair is ...
234,contradiction,A young girl wearing a pink coat plays with a ...,A girl is wearing a blue jacket.
3104,contradiction,"two wrestlers wrestling, the one on the bottom...",The good friends eat ice cream together
1806,entailment,A man in a black shirt is playing golf outside.,The man wearing the black shirt plays a game o...
4096,neutral,"A woman with dark hair wearing a dark shirt, j...",A reporter is doing a live news report.
8132,entailment,A young child is looking at a commuter train s...,The child is on a train seat.
7152,contradiction,A football team getting ready for the coin tos...,The stadium was packed with people.
5114,contradiction,Two dogs playing in the snow.,The two dogs are inside sleeping by the firepl...
9174,contradiction,The streets are busy and people contemplate th...,People are screaming
8295,entailment,The dog is walking in the snow.,The snow is outside


In [4]:
print('training labels')
train_df.gold_label.value_counts()

training labels


entailment       3166
contradiction    3114
neutral          3069
Name: gold_label, dtype: int64

In [5]:
# Helper function to create sentence pairs
def create_sen_pair(dataframe):
    final_df = []
    for idx in tqdm(dataframe.index):
        each_sen_pair = [
            dataframe.iloc[idx]["sentence1"], dataframe.iloc[idx]["sentence2"]
        ]
        label = dataframe.iloc[idx]["gold_label"]
        final_df.append([each_sen_pair, label])
    return final_df

# Tokinization
spacy_en = spacy.load('en')

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [6]:
train_df_processed = create_sen_pair(train_df)

100%|██████████| 9349/9349 [00:01<00:00, 7162.42it/s]


In [7]:
# process data using torchtext FIELD

SEN_1 = Field(sequential=True,
              tokenize=tokenize_en,
              use_vocab=True,
              lower=True,
              batch_first=True)
SEN_2 = Field(sequential=True,
              tokenize=tokenize_en,
              use_vocab=True,
              lower=True,
              batch_first=True)
LABEL =  data.LabelField()

fields = [('sen_1', SEN_1), ('sen_2', SEN_2),('label', LABEL)]
example = [
    data.Example.fromlist([train_df_processed[i][0][0],train_df_processed[i][0][1],train_df_processed[i][1]],
                          fields) for i in tqdm(range(len(train_df_processed)))
]

print('---------------check out one example---------------')
vars(example[10])

100%|██████████| 9349/9349 [00:00<00:00, 13758.54it/s]

---------------check out one example---------------





{'sen_1': ['three',
  'women',
  ',',
  'two',
  'wearing',
  'red',
  'shirts',
  'and',
  'one',
  'wearing',
  'a',
  'purple',
  'shirt',
  ',',
  'and',
  'a',
  'man',
  ',',
  'wearing',
  'a',
  'light',
  'blue',
  'shirt',
  ',',
  'jump',
  'on',
  'a',
  'basketball',
  'court',
  'with',
  'balls',
  'in',
  'their',
  'hands',
  '.'],
 'sen_2': ['three', 'people', "'s", 'are', 'eating', 'in', 'hotel', '.'],
 'label': 'contradiction'}

In [8]:
# creating dataset
train_Dataset = data.Dataset(example, fields)
(train_data, valid_data) = train_Dataset.split(split_ratio=[0.90, 0.10],
                                               random_state=random.seed(SEED))

print('---------------check out train and validation dataset---------------')
print(vars(train_data.examples[10]))

---------------check out train and validation dataset---------------
{'sen_1': ['a', 'young', 'child', ',', 'wearing', 'a', 'pink', '-', 'polkadotted', 'outfit', ',', 'smiles', 'at', 'the', 'camera', 'as', 'she', 'lays', 'on', 'a', 'white', ',', 'shaggy', 'rug', '.'], 'sen_2': ['the', 'child', 'is', 'in', 'stripe', ','], 'label': 'contradiction'}


In [9]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')

Number of training examples: 8414
Number of validation examples: 935


In [10]:
# use pretrained glove embedding for words
SEN_1.build_vocab(train_data,vectors="glove.6B.100d")
SEN_2.build_vocab(train_data,vectors="glove.6B.100d")
LABEL.build_vocab(train_data)

print('Size of input vocab : ', len(SEN_1.vocab),len(SEN_2.vocab))
print('Size of label vocab : ', len(LABEL.vocab))
print('Top 10 words appreared repeatedly :',
      list(SEN_1.vocab.freqs.most_common(10)))
print('Labels : ', LABEL.vocab.stoi)

Size of input vocab :  4066 4621
Size of label vocab :  3
Top 10 words appreared repeatedly : [('a', 15756), ('.', 8030), ('in', 5118), ('the', 3803), ('and', 3013), ('on', 2690), ('of', 2499), ('man', 2469), ('is', 2294), ('with', 2261)]
Labels :  defaultdict(None, {'entailment': 0, 'contradiction': 1, 'neutral': 2})


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_iterator, valid_iterator = data.BucketIterator.splits((train_data, valid_data),
                                                            batch_size = BATCH_SIZE, 
                                                            sort=False,
                                                            device = device)

In [12]:
print('checking out one batch')
x = next(iter(train_iterator))
x.sen_1.shape,x.sen_2.shape

checking out one batch


(torch.Size([32, 36]), torch.Size([32, 28]))

# 2.Training details

In [13]:
class classifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,
                 n_layers,dropout):

        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)

        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers=n_layers,
                          batch_first=True)

        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, sen_1, sen_2):

        #text = [sent len, batch size]
        #embedded = [sent len, batch size, emb dim]
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]

        embedded_sen1 = self.dropout(self.embedding(sen_1))
        embedded_sen2 = self.dropout(self.embedding(sen_2))
        x = torch.cat([embedded_sen1, embedded_sen2], 1)
        _, hidden_t = self.rnn(x)
        output = self.fc(hidden_t.squeeze(0))

        return output

In [14]:
INPUT_DIM = len(SEN_1.vocab)+len(SEN_2.vocab)
EMBEDDING_DIM = 100
N_LAYERS = 1
HIDDEN_DIM = 256
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.3

model = classifier(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT)
print(model)

classifier(
  (embedding): Embedding(8687, 100)
  (rnn): GRU(100, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,144,415 trainable parameters


In [16]:
# load pretrained embeddings 
pretrained_embeddings = torch.cat((SEN_1.vocab.vectors,SEN_2.vocab.vectors),dim=0)
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.2709,  0.0440, -0.0203,  ..., -0.4923,  0.6369,  0.2364],
        ...,
        [-0.3389,  0.2919,  0.2993,  ...,  0.2409,  0.2894, -0.6609],
        [-0.0205, -0.4512,  0.6993,  ...,  0.0381,  0.2786,  0.2889],
        [-0.0473,  0.4594,  1.0032,  ..., -0.2536,  0.3862,  0.1045]])

In [17]:
# training configs
optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss()

model = model.to(device)
criterion = criterion.to(device)

In [18]:
# accuracy metric
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / torch.FloatTensor([y.shape[0]]).to(device)

In [19]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.sen_1,batch.sen_2)
        
        loss = criterion(predictions, batch.label)
        
        acc = categorical_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [20]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.sen_1,batch.sen_2)
            
            loss = criterion(predictions, batch.label)
            
            acc = categorical_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [21]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [22]:
N_EPOCHS = 20

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'final_model_rnn.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 1s
	Train Loss: 1.094 | Train Acc: 36.06%
	 Val. Loss: 1.089 |  Val. Acc: 39.14%
Epoch: 02 | Epoch Time: 0m 1s
	Train Loss: 1.052 | Train Acc: 43.36%
	 Val. Loss: 1.028 |  Val. Acc: 46.28%
Epoch: 03 | Epoch Time: 0m 1s
	Train Loss: 1.002 | Train Acc: 49.31%
	 Val. Loss: 1.006 |  Val. Acc: 47.47%
Epoch: 04 | Epoch Time: 0m 1s
	Train Loss: 0.947 | Train Acc: 54.38%
	 Val. Loss: 1.000 |  Val. Acc: 49.29%
Epoch: 05 | Epoch Time: 0m 1s
	Train Loss: 0.884 | Train Acc: 59.88%
	 Val. Loss: 0.966 |  Val. Acc: 53.72%
Epoch: 06 | Epoch Time: 0m 1s
	Train Loss: 0.817 | Train Acc: 64.12%
	 Val. Loss: 0.965 |  Val. Acc: 54.55%
Epoch: 07 | Epoch Time: 0m 1s
	Train Loss: 0.748 | Train Acc: 67.86%
	 Val. Loss: 0.990 |  Val. Acc: 55.34%
Epoch: 08 | Epoch Time: 0m 1s
	Train Loss: 0.695 | Train Acc: 70.87%
	 Val. Loss: 1.004 |  Val. Acc: 54.66%
Epoch: 09 | Epoch Time: 0m 1s
	Train Loss: 0.640 | Train Acc: 73.84%
	 Val. Loss: 1.023 |  Val. Acc: 54.51%
Epoch: 10 | Epoch Time: 0m 1

# 3. Inference

In [23]:
# Helper functiont for testing
def check_similarity(s1, s2):
    indexed_1 = [SEN_1.vocab.stoi[t] for t in s1]
    indexed_2 = [SEN_2.vocab.stoi[t] for t in s2]
    tensor_1, tensor_2 = torch.LongTensor(indexed_1).to(
        device), torch.LongTensor(indexed_2).to(device)
    tensor_1 = tensor_1.unsqueeze(0)
    tensor_2 = tensor_2.unsqueeze(0)
    preds = model(tensor_1, tensor_2)
    max_preds = preds.argmax(dim=1).to('cpu').numpy()[0]
    return act_label, label_dict[max_preds]

In [24]:
# Testing from validation data as we are not training on validation data

index = random.randint(0,len(valid_data.examples))
s1,s2 = valid_data[index].sen_1,valid_data[index].sen_2
act_label = valid_data[index].label
pred_label = check_similarity(s1,s2)

print('testing the model with the following sentence pair \n')
print('sentence 1 --> ', " ".join(s1))
print('sentence 2 --> ', " ".join(s2))
print('actual label --> ',act_label)
print('predicted label --> ',pred_label[1])

testing the model with the following sentence pair 

sentence 1 -->  an overweight man in a blue and black hooded sweatshirt works on a laptop computer outdoors .
sentence 2 -->  a heavyset man in a hoodie works in a laptop while outside .
actual label -->  entailment
predicted label -->  contradiction
