# Natural Language Inference using BiLSTM

### Data is in the form: premise, hypothesis, label
### where label is either 1 (entailment) or 0 (neutral or contradiction)

In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import xgboost as xgb
import sys

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [4]:
def load_glove_embeddings(embedding_path):
    """Load word vectors from a whitespace-separated GloVe text file.

    Each non-blank line is expected to hold a token followed by its vector
    components. Blank lines (for example a trailing empty line) are skipped
    instead of raising ``IndexError``.

    Args:
        embedding_path: Path to the embeddings text file, read as UTF-8.

    Returns:
        dict mapping each token to a 1-D ``float32`` NumPy array. If a token
        appears more than once, the last occurrence wins.
    """
    print("Loading GloVe embeddings...")
    embeddings_index = {}
    with open(embedding_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: there is no token to index.
                continue
            word, *components = values
            embeddings_index[word] = np.asarray(components, dtype='float32')
    print(f"Loaded {len(embeddings_index)} word vectors.")
    return embeddings_index

def sentence_embedding(sentence, embeddings_index):
    """Return the averaged word-vector embedding of a sentence.

    The sentence is split on whitespace and the vectors of all tokens found
    in ``embeddings_index`` are summed. The sum is divided by the total
    number of tokens, so tokens without a vector count towards the
    denominator but add nothing to the sum.

    Args:
        sentence: Text to embed.
        embeddings_index: Non-empty dict mapping tokens to 1-D NumPy vectors
            that all share the same length.

    Returns:
        1-D NumPy array with the embedding dimension. An empty or
        whitespace-only sentence yields an all-zero vector.
    """
    words = sentence.split()
    # The embedding width is read from an arbitrary entry of the lookup table.
    embedding_dim = next(iter(embeddings_index.values())).shape[0]
    total = np.zeros(embedding_dim)
    if not words:
        # Avoid 0 / 0, which would silently produce a vector of NaNs.
        return total
    for word in words:
        vector = embeddings_index.get(word)
        if vector is not None:
            total += vector
    return total / len(words)

# Location of the pre-trained GloVe vectors on disk.
embedding_path = "./input/embeddings/glove.6B/glove.6B.300d.txt"
# Parse the file once; the resulting token -> vector table is reused by the
# cells that embed sentences.
embeddings_index = load_glove_embeddings(embedding_path=embedding_path)

Loading GloVe embeddings...
Loaded 400001 word vectors.


In [7]:
data_path = "./data/train.csv"
# Drop rows with a missing premise, hypothesis or label: the embedding step
# calls ``.lower()`` on every sentence and would fail on a NaN cell. This
# mirrors the NaN handling applied to the dev set.
df = pd.read_csv(data_path).dropna(subset=['premise', 'hypothesis', 'label'])

In [8]:
def _embed_column(frame, column):
    """Return one averaged GloVe vector per lower-cased sentence in a column."""
    return [
        sentence_embedding(text.lower(), embeddings_index)
        for text in frame[column]
    ]


# Embed both sides of every training pair.
premise_embeddings = _embed_column(df, 'premise')
hypothesis_embeddings = _embed_column(df, 'hypothesis')

In [14]:
# Place the premise and hypothesis vectors side by side so that each row
# describes one sentence pair.
premise_matrix = np.array(premise_embeddings)
hypothesis_matrix = np.array(hypothesis_embeddings)
X = np.hstack([premise_matrix, hypothesis_matrix])
y = df['label'].values

# Hold out 20% of the pairs for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Feed-forward classifier over the concatenated premise/hypothesis sentence
# embeddings. The input width is read from the feature matrix instead of being
# hard-coded, so switching to a different embedding dimensionality needs no
# edit here.
input_dim = X_train.shape[1]
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(input_dim,)),
    tf.keras.layers.Dense(200, activation='tanh'),
    tf.keras.layers.Dense(200, activation='tanh'),
    # NOTE(review): 3 output units are kept as-is, but the notes at the top of
    # this notebook describe only labels 0 and 1 - confirm whether 2 units
    # would be the intended size.
    tf.keras.layers.Dense(3, activation='softmax'),
])

# Compile the model. Labels are integer class ids, hence the sparse loss.
model.compile(optimizer='adadelta',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model, holding out 10% of the training split for validation.
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Evaluate the model on the held-out test split.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

Epoch 1/50
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5114 - loss: 0.9614 - val_accuracy: 0.5144 - val_loss: 0.8313
Epoch 2/50
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5266 - loss: 0.8066 - val_accuracy: 0.5278 - val_loss: 0.7585
Epoch 3/50
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5296 - loss: 0.7510 - val_accuracy: 0.5371 - val_loss: 0.7292
Epoch 4/50
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5299 - loss: 0.7272 - val_accuracy: 0.5431 - val_loss: 0.7149
Epoch 5/50
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5410 - loss: 0.7131 - val_accuracy: 0.5436 - val_loss: 0.7069
Epoch 6/50
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5422 - loss: 0.7073 - val_accuracy: 0.5515 - val_loss: 0.7013
Epoch 7/50
[1m607/607[0m 

In [22]:
dev_data_path = "./data/dev.csv"
dev_df = pd.read_csv(dev_data_path)

# Show every dev row that has a NaN in at least one column.
has_nan = dev_df.isna().any(axis=1)
print(dev_df[has_nan])

# Discard those rows, then print again; an empty frame confirms none remain.
dev_df = dev_df.dropna()
print(dev_df[dev_df.isna().any(axis=1)])

# Embed the dev sentences the same way as the training sentences.
premise_embeddings = [
    sentence_embedding(text.lower(), embeddings_index)
    for text in dev_df['premise']
]
hypothesis_embeddings = [
    sentence_embedding(text.lower(), embeddings_index)
    for text in dev_df['hypothesis']
]
dev_features = [np.array(premise_embeddings), np.array(hypothesis_embeddings)]
X_dev = np.hstack(dev_features)
y_dev = dev_df['label'].values

# Score the trained model on the dev set.
loss, accuracy = model.evaluate(X_dev, y_dev)
print(f"Dev Loss: {loss}")
print(f"Dev Accuracy: {accuracy}")


                                                premise hypothesis  label
3126  Tony  Shoes (so Clinton will have Shoes and So...        NaN      1
3970                            Saint-Germain-des-Pr??s        NaN      1
Empty DataFrame
Columns: [premise, hypothesis, label]
Index: []
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 420us/step - accuracy: 0.6076 - loss: 0.6611
Dev Loss: 0.6605627536773682
Dev Accuracy: 0.6103934645652771
