# Natural Language Inference

## Data is in the form of: Premise, Hypothesis, Label
## Label taking the values 1 or 0 (1: entailment, 0: contradiction or neutrality)

### Import necessary libraries

In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

### We will be using GloVe embeddings for this task to convert the words to vectors

In [2]:
# Load GloVe embeddings
def load_glove_embeddings(embedding_path):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Every usable line holds a token followed by its whitespace-separated
    vector components. Blank lines and lines that carry a token but no
    components are skipped instead of aborting the load.

    Parameters
    ----------
    embedding_path : str or path-like
        Location of the GloVe text file; it is opened as UTF-8.

    Returns
    -------
    dict
        Maps each token to a 1-D ``float32`` NumPy array of its components.
    """
    print("Loading GloVe embeddings...")
    embeddings_index = {}
    with open(embedding_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank line (for example a trailing newline at the end of the
            # file) produces no fields, and a lone token would produce an
            # empty vector that cannot be summed with real ones later on.
            if len(values) < 2:
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print(f"Loaded {len(embeddings_index)} word vectors.")
    return embeddings_index

In [3]:
# Function to create sentence embeddings

# The function is modelled after the approach described in "A large annotated corpus for learning natural language inference"
# by Samuel R. Bowman, Gabor Angeli, Christopher Potts, and Christopher D. Manning with a few modifications.
# The original approach for their baseline model used a sum of word embeddings for each sentence.
# We use the same approach but normalize the sum by the number of words in the sentence, with add-one
# smoothing applied to both numerator and denominator: (sum + 1) / (n_words + 1).

def sentence_embedding(sentence, embeddings_index, smoothing=1):
    """Build a fixed-size vector for a sentence from its word vectors.

    The sentence is split on whitespace, the vectors of all words found in
    ``embeddings_index`` are summed, and the result is
    ``(sum + smoothing) / (number_of_words + smoothing)``. The word count
    includes words that are missing from the index.

    Parameters
    ----------
    sentence : str
        Text to embed. Callers are responsible for any case folding.
    embeddings_index : dict
        Non-empty mapping from word to 1-D NumPy vector; the vector length
        is taken from its first value.
    smoothing : int or float, optional
        Constant added to every component of the sum and to the word count.
        The default of 1 reproduces the historical behaviour (an empty
        sentence yields a vector of ones). Pass 0 to get the plain mean of
        the word vectors.

    Returns
    -------
    numpy.ndarray
        1-D array with the same length as the vectors in ``embeddings_index``.
        With ``smoothing=0`` an empty sentence yields a zero vector instead of
        dividing by zero.
    """
    words = sentence.split()
    embedding_dim = next(iter(embeddings_index.values())).shape[0]
    # Named `total` so the accumulator no longer shadows this function's name.
    total = np.zeros(embedding_dim)
    for word in words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            total += embedding_vector
    denominator = len(words) + smoothing
    if denominator == 0:
        # Only reachable when smoothing is disabled and the sentence is empty.
        return total
    return (total + smoothing) / denominator

In [4]:
# Load the pre-trained GloVe word vectors into a {word: vector} lookup.
# This is a word-level index; sentence vectors are built from it later via sentence_embedding().
embedding_path = "./input/embeddings/glove.6B/glove.6B.300d.txt"
embeddings_index = load_glove_embeddings(embedding_path)

Loading GloVe embeddings...
Loaded 400001 word vectors.


In [5]:
# Load the training data
data_path = "./data/train.csv"
df = pd.read_csv(data_path)

# Print rows that contain a missing value.
# isnull() is an alias of isna(), so a single check covers both.
print(df[df.isna().any(axis=1)])

# Replace missing text with empty strings so the sentences can still be embedded.
# Only the text columns are filled: filling the label column with '' would turn
# it into strings and break training.
text_columns = ['premise', 'hypothesis']
df[text_columns] = df[text_columns].fillna('')

df

Empty DataFrame
Columns: [premise, hypothesis, label]
Index: []
Empty DataFrame
Columns: [premise, hypothesis, label]
Index: []


Unnamed: 0,premise,hypothesis,label
0,"However, Fort Charles was rebuilt as a militar...",Fort Charles was rebuilt as an amusement park ...,0
1,Buchanan's The Democrats and Republicans have...,THe parties will never be similar.,0
2,In order to review an acquisition that is usin...,The auditor only reviews the acquisition itsel...,0
3,Three young people sit outside and engage with...,There is a tin can and string telephone.,0
4,The lucrative tin mines of Kuala Lumpur in the...,The Chinese labor was seen as less costly and ...,1
...,...,...,...
26939,Information in agencies' plans and reports pro...,"Thanks to agencies' plans and reports, over $3...",0
26940,"He is the Mr. Magoo of scientific theory, geni...",He understands everything he can't see.,0
26941,"Over the past 25 years, the Postal Service has...",Classifying mail is important to the function ...,1
26942,Whoever first stepped ashore on Madeira discov...,The British discovered the Canary Islands first.,0


In [6]:
# Embed both text columns of the training data; each sentence is lower-cased before the lookup
embedded_columns = {
    column: [sentence_embedding(text.lower(), embeddings_index) for text in df[column]]
    for column in ('premise', 'hypothesis')
}
premise_embeddings = embedded_columns['premise']
hypothesis_embeddings = embedded_columns['hypothesis']

### Train-Test Split

In [7]:
# Place each premise vector next to its hypothesis vector to form one feature row per example
premise_matrix = np.array(premise_embeddings)
hypothesis_matrix = np.array(hypothesis_embeddings)
X = np.hstack((premise_matrix, hypothesis_matrix))
y = df['label'].values

# Hold out 20% of the examples for the final evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Define the Model

In [8]:
# Seed Python, NumPy and TensorFlow so that weight initialisation (and other
# framework-level randomness) is repeatable between runs, matching the seeded split.
tf.keras.utils.set_random_seed(42)

# The network consumes the premise and hypothesis embeddings side by side, so its
# input width is twice the GloVe dimension. Reading it from the feature matrix keeps
# the model valid if a different embedding size is used.
input_dim = X_train.shape[1]

# Fully connected classifier: six tanh hidden layers followed by a 2-way softmax
# (class 1: entailment, class 0: contradiction or neutrality).
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(input_dim,)),
    tf.keras.layers.Dense(500, activation='tanh'),
    tf.keras.layers.Dense(400, activation='tanh'),
    tf.keras.layers.Dense(300, activation='tanh'),
    tf.keras.layers.Dense(300, activation='tanh'),
    tf.keras.layers.Dense(300, activation='tanh'),
    tf.keras.layers.Dense(200, activation='tanh'),
    tf.keras.layers.Dense(2, activation='softmax')
])

# Compile the model; the sparse loss takes the integer labels directly (no one-hot encoding)
model.compile(optimizer='adadelta',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Print the layer-by-layer summary, including the number of model parameters
model.summary()

In [9]:
# Fit on the training split; 10% of it is set aside by Keras to report validation metrics each epoch
history = model.fit(
    X_train,
    y_train,
    validation_split=0.1,
    batch_size=32,
    epochs=100,
)

# Score the trained network on the held-out test split
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

Epoch 1/100
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.5103 - loss: 0.7029 - val_accuracy: 0.5594 - val_loss: 0.6829
Epoch 2/100
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.5615 - loss: 0.6798 - val_accuracy: 0.5979 - val_loss: 0.6703
Epoch 3/100
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.5882 - loss: 0.6720 - val_accuracy: 0.6085 - val_loss: 0.6653
Epoch 4/100
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.5922 - loss: 0.6663 - val_accuracy: 0.6211 - val_loss: 0.6599
Epoch 5/100
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.5984 - loss: 0.6637 - val_accuracy: 0.6294 - val_loss: 0.6564
Epoch 6/100
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.5978 - loss: 0.6622 - val_accuracy: 0.6331 - val_loss: 0.6528
Epoch 7/100
[1m607/60

### Save the model

In [10]:
# Save the model
filename = "./models/deep_learning/model.keras"
# Create the target directory first so saving also works on a fresh checkout
# where ./models/deep_learning does not exist yet.
os.makedirs(os.path.dirname(filename), exist_ok=True)
model.save(filename)

In [25]:
# Load the model (left disabled; uncomment to restore the network from the file saved above)
# loaded_model = tf.keras.models.load_model(filename)