# Summation & LSTM Model - MultiNLI Dataset

The following notebook contains the implementation of the baseline summation model and the standard LSTM model for the MultiNLI dataset.

In [1]:
# Import the necessary libraries

import re
from numpy import array
from numpy import asarray
from numpy import zeros

import tensorflow as tf
import keras
import keras.backend as K
from keras.models import Model
from keras.regularizers import L2
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Dropout, Input, LSTM
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /home/aakashj2412/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Helper function that cleans the input data and enumerates the labels

def extract(s):
    """Return ``s`` with all parentheses removed and its whitespace tidied.

    Every '(' and ')' is deleted, each run of two or more whitespace
    characters is replaced by a single space, and leading/trailing
    whitespace is trimmed. A lone whitespace character (e.g. one tab) is
    left untouched.
    """
    withoutParens = re.sub('[()]', '', s)
    return re.sub('\\s{2,}', ' ', withoutParens).strip()


# Integer class id for each gold label string.
labels = {
    'entailment': 0,
    'contradiction': 1,
    'neutral': 2,
}

In [9]:
# Function that reads data and parses data from file

def _parseRows(rows):
    """Turn raw tab-split rows into [premises, hypotheses, labelIds], dropping rows with an unknown label."""
    # Filter once so the three parallel lists are guaranteed to come from the same rows.
    kept = [row for row in rows if row[0] in labels]
    return [
        [extract(row[1]) for row in kept],
        [extract(row[2]) for row in kept],
        [labels[row[0]] for row in kept],
    ]


def readFileData(filePath, t):
    """Read a tab-separated NLI file and return cleaned premises, hypotheses and labels.

    The first line of the file is treated as a header and skipped. Every
    other line is split on tabs: column 0 is the label, column 1 the premise
    and column 2 the hypothesis. Rows whose label is not a key of ``labels``
    are dropped, and both sentences are cleaned with ``extract``.

    Parameters
    ----------
    filePath : str
        Path of the file to read.
    t : str
        't' returns the whole file as one split. Any other value divides the
        rows 80/20 with ``train_test_split`` (``random_state=42``) and
        returns both parts.

    Returns
    -------
    list
        ``[premises, hypotheses, labels]`` when ``t == 't'``; otherwise
        ``[[premises, hypotheses, labels], [premises, hypotheses, labels]]``
        with the 80% part first and the 20% part second.
    """
    # The `with` block closes the file on exit, so no explicit close() is needed.
    # The encoding is explicit (as for the GloVe file) so the result does not depend
    # on the platform default. Assumes the dataset file is UTF-8 - TODO confirm.
    with open(filePath, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header line
        inputRows = [row.split('\t') for row in f]

    if t == 't':
        return _parseRows(inputRows)

    # Split the unfiltered rows first and filter afterwards, so the partition is the
    # same one the fixed random_state has always produced for this file.
    test, val = train_test_split(inputRows, test_size=0.2, random_state=42)
    return [_parseRows(test), _parseRows(val)]

In [10]:
# Reading train, test and validation data
# The training file is used as a single split ('t'); the dev-matched file is divided
# by readFileData into an 80% test part and a 20% validation part ('v').

datasetPath = "../../Datasets/MultiNLI/"
trainData = readFileData(f'{datasetPath}multinli_1.0_train.txt', 't')
testData, validationData = readFileData(f'{datasetPath}multinli_1.0_dev_matched.txt', 'v')

# Number of examples kept in each split (train, validation, test)
print(len(trainData[0]))
print(len(validationData[0]))
print(len(testData[0]))

392702
1965
7850


In [11]:
# Hyperparameters

maxLen = 401  # length every token sequence is padded or trimmed to
epochs = 1000  # upper bound on training epochs; fit is run with an EarlyStopping callback
batchSize = 128  # mini-batch size passed to model.fit
gloveDimension = 300  # width of the embedding matrix built from the GloVe vectors
hiddenDimension = 100  # units of the per-token Dense projection (and of the LSTM alternative)
regularization = 4e-6  # L2 factor applied to the Dense layers of the classifier

In [12]:
# Tokenizer to generate the vocabulary of the system
# The vocabulary is built from the training premises and hypotheses only.

tokenizer = Tokenizer()
tokenizer.fit_on_texts(trainData[0] + trainData[1])
# One extra slot on top of the word count, so index 0 (the pad_sequences default
# padding value) has its own row - Keras word_index is presumably 1-based; verify.
vocabSize = len(tokenizer.word_index)+1

In [13]:
# NOTE: this cell overwrites trainData in place (text -> id sequences -> padded array),
# so it must be run exactly once after the data has been read.

# Convert the train data to sequences as per the vocabulary
trainData[0] = tokenizer.texts_to_sequences(trainData[0])
trainData[1] = tokenizer.texts_to_sequences(trainData[1])

# Pad or trim all generated sequences to the same max sentence length
# (padding='post' pads at the end; the truncation side is left at the pad_sequences default)
trainData[0] = pad_sequences(trainData[0], maxLen, padding='post')
trainData[1] = pad_sequences(trainData[1], maxLen, padding='post')

# Transform the labels to one-hot encoding
trainData[2] = tf.keras.utils.to_categorical(trainData[2], num_classes=3)

In [14]:
# NOTE: this cell overwrites testData in place, so it must be run exactly once after
# the data has been read. The tokenizer used is the one fitted on the training text.

# Convert the test data to sequences as per the vocabulary
testData[0] = tokenizer.texts_to_sequences(testData[0])
testData[1] = tokenizer.texts_to_sequences(testData[1])

# Pad or trim all generated sequences to the same max sentence length
testData[0] = pad_sequences(testData[0], maxLen, padding='post')
testData[1] = pad_sequences(testData[1], maxLen, padding='post')

# Transform the labels to one-hot encoding
testData[2] = tf.keras.utils.to_categorical(testData[2], num_classes=3)

In [15]:
# NOTE: this cell overwrites validationData in place, so it must be run exactly once after
# the data has been read. The tokenizer used is the one fitted on the training text.

# Convert the validation data to sequences as per the vocabulary
validationData[0] = tokenizer.texts_to_sequences(validationData[0])
validationData[1] = tokenizer.texts_to_sequences(validationData[1])

# Pad or trim all generated sequences to the same max sentence length
validationData[0] = pad_sequences(validationData[0], maxLen, padding='post')
validationData[1] = pad_sequences(validationData[1], maxLen, padding='post')

# Transform the labels to one-hot encoding
validationData[2] = tf.keras.utils.to_categorical(validationData[2], num_classes=3)

In [16]:
# Import the GloVe embeddings and generate embeddings dictionary

embeddingsDict = dict()
glovePath = '../../Datasets/GloVe/'

# The context manager closes the file even if a line fails to parse.
with open(f'{glovePath}glove.840B.300d.txt', encoding='utf8') as glove:
    for line in glove:
        records = line.split()
        # The last gloveDimension fields are the vector; everything before them is the token.
        # A token that itself contains spaces is re-joined without a separator.
        word = ''.join(records[:-gloveDimension])
        vectorDimensions = asarray(records[-gloveDimension:], dtype='float32')
        embeddingsDict[word] = vectorDimensions

In [17]:
# Build the embedding matrix: row `index` holds the GloVe vector of the vocabulary word with
# that index. Words without a GloVe vector keep an all-zero row.
embeddingsMat = zeros((vocabSize, gloveDimension))
for word, index in tokenizer.word_index.items():
    # Progress indicator every 2500 vocabulary entries
    if not index % 2500:
        print(index)
    vec = embeddingsDict.get(word)
    if vec is None:
        continue
    embeddingsMat[index] = vec

2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
50000
52500
55000
57500
60000
62500
65000
67500
70000


In [18]:
# Define the embedding layer: initialised from the GloVe matrix, frozen (trainable=False)
# and shared by premise and hypothesis
embed = Embedding(vocabSize, gloveDimension, weights=[embeddingsMat], input_length=maxLen, trainable=False)

# As Premise and Hypothesis are distinct and are to be inputted separately, define two inputs and embed
premise = Input(shape=(maxLen,), dtype='int32')
hypothesis = Input(shape=(maxLen,), dtype='int32')

premInput = embed(premise)
hypoInput = embed(hypothesis)

# Per-token projection from gloveDimension to hiddenDimension; the same Dense layer
# (shared weights) is applied to both sentences.
# NOTE(review): no masking is configured, so padded positions also pass through this layer
# before the sequence is reduced - confirm their contribution (e.g. via the Dense bias) is intended.
convert = Dense(hiddenDimension, activation='tanh', input_shape=(gloveDimension,))

premInput = convert(premInput)
hypoInput = convert(hypoInput)

In [19]:
# At this point each sentence is a maxLen x hiddenDimension matrix (one projected vector per
# token position). Summing over the sequence axis (axis=1) collapses it into a single
# sentence vector of length hiddenDimension.

# output_shape is passed as a tuple that excludes the batch dimension, the form Keras documents.
rnn = keras.layers.core.Lambda(lambda x: K.sum(x, axis=1), output_shape=(hiddenDimension,))
# LSTM variant: use the line below instead of the Lambda above, then rebuild the model.
# rnn = LSTM(hiddenDimension, dropout=0.2)

In [20]:
# Reduce each sentence to a single vector with `rnn`, then batch-normalize it.
# Premise and hypothesis share `rnn`, but each gets its own BatchNormalization layer.

premInput = rnn(premInput)
hypoInput = rnn(hypoInput)
premInput = tf.keras.layers.BatchNormalization()(premInput)
hypoInput = tf.keras.layers.BatchNormalization()(hypoInput)

In [21]:
# Joint is the concatenation of the premise and hypothesis sentence vectors
# Dropout with rate 0.2 is applied to assist in regularization
joint = keras.layers.concatenate([premInput, hypoInput])
joint = Dropout(0.2)(joint)
# Classifier body: 3 Dense layers of width 2*hiddenDimension with tanh activation and
# L2 regularization, each followed by dropout and batch normalization
for i in range(3):
    joint = Dense(2*hiddenDimension, activation='tanh', kernel_regularizer=L2(regularization))(joint)
    joint = Dropout(0.2)(joint)
    joint = tf.keras.layers.BatchNormalization()(joint)

# The final decision is a softmax over the 3 classes defined in `labels`
pred = Dense(3, activation='softmax')(joint)

In [22]:
# Define the final model: two inputs (premise, hypothesis) mapped to the softmax prediction.
# categorical_crossentropy is used with the one-hot labels produced by to_categorical.

model = Model(inputs=[premise, hypothesis], outputs=pred)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

## Summation Model

The following subsection trains the model and reports the findings for the summation model

In [20]:
# Fitting the model using the train data; training stops once val_loss has not improved for 5 epochs

callback = EarlyStopping(monitor='val_loss', min_delta=0, patience=5)
# The inputs are already NumPy arrays (outputs of pad_sequences / to_categorical), so they are
# passed as-is instead of being copied through array(). validation_data is given as a tuple,
# the form documented for Model.fit.
model.fit(
    [trainData[0], trainData[1]],
    trainData[2],
    batch_size=batchSize,
    epochs=epochs,
    callbacks=[callback],
    validation_data=([validationData[0], validationData[1]], validationData[2]),
)

2022-05-01 09:44:38.100954: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 629894008 exceeds 10% of free system memory.
2022-05-01 09:44:38.337863: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 629894008 exceeds 10% of free system memory.


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000


<keras.callbacks.History at 0x7f4adc21c490>

In [27]:
# Continue fitting the same model on the train data with a longer patience (15 epochs without
# val_loss improvement before stopping)

callback = EarlyStopping(monitor='val_loss', min_delta=0, patience=15)
# The inputs are already NumPy arrays (outputs of pad_sequences / to_categorical), so they are
# passed as-is instead of being copied through array(). validation_data is given as a tuple,
# the form documented for Model.fit.
model.fit(
    [trainData[0], trainData[1]],
    trainData[2],
    batch_size=batchSize,
    epochs=epochs,
    callbacks=[callback],
    validation_data=([validationData[0], validationData[1]], validationData[2]),
)


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000

<keras.callbacks.History at 0x5b63aa24f099>



In [21]:
# Evaluating loss and accuracy of the trained model on the test split

# testData already holds NumPy arrays, so no array() copies are needed.
loss, acc = model.evaluate(
    [testData[0], testData[1]],
    testData[2],
    batch_size=256,
)
print('Loss = ', loss)
print('Acc = ', acc)

Loss =  0.7935532331466675
Acc =  0.6574522256851196


In [None]:
# Persist the trained summation model to disk
model.save('../../Models/summation_multiNLI.h5')

In [None]:
# Sanity check: reload the saved summation model and print its accuracy on the test split
reloadedModel = keras.models.load_model('../../Models/summation_multiNLI.h5')
reloadedLoss, reloadedAcc = reloadedModel.evaluate(
    [array(testData[0]), array(testData[1])],
    array(testData[2]),
    batch_size=256,
)
print(reloadedAcc)

We have the following results for the summation model, for the MultiNLI dataset:

- Training Accuracy: 65.39%
- Validation Accuracy: 67.23%
- Test Accuracy: 65.74%

## LSTM Model

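The following subsection trains the model and reports the findings for the LSTM model. The LSTM variant is obtained by replacing the summation `Lambda` layer with the commented-out `LSTM(hiddenDimension, dropout=0.2)` layer in the `rnn` cell and re-running the model-building cells before fitting.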

In [29]:
# Fitting the model using the train data; training stops once val_loss has not improved for 5 epochs

callback = EarlyStopping(monitor='val_loss', min_delta=0, patience=5)
# The inputs are already NumPy arrays (outputs of pad_sequences / to_categorical), so they are
# passed as-is instead of being copied through array(). validation_data is given as a tuple,
# the form documented for Model.fit.
model.fit(
    [trainData[0], trainData[1]],
    trainData[2],
    batch_size=batchSize,
    epochs=epochs,
    callbacks=[callback],
    validation_data=([validationData[0], validationData[1]], validationData[2]),
)


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000

<keras.callbacks.History at 0x6c63ba75f17a>



In [None]:
# Persist the trained LSTM model to disk
model.save('../../Models/LSTM_multiNLI.h5')

In [None]:
# Reload the saved LSTM model; this rebinds `model`, so the evaluation below uses the loaded copy
model = keras.models.load_model('../../Models/LSTM_multiNLI.h5')

In [30]:
# Evaluating loss and accuracy of the trained model on the test split

# testData already holds NumPy arrays, so no array() copies are needed.
loss, acc = model.evaluate(
    [testData[0], testData[1]],
    testData[2],
    batch_size=256,
)
print('Loss = ', loss)
print('Acc = ', acc)


Loss =  0.8935532331466675
Acc =  0.6576725454179533



We can thus observe the following results for the simple LSTM model, for the MultiNLI dataset:

- Training Accuracy: 67.41%
- Validation Accuracy: 65.98%
- Test Accuracy: 65.76%