## Project initialization and import libraries

In [1]:
from time import time

import numpy as np

import matplotlib.pyplot as plt
import datetime

from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM


Using TensorFlow backend.


## Import training and test data from csv file

In [2]:
import pandas as pd

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

## Processing by cleaning data and creating embedding matrix

In [16]:
import re
from gensim.models import KeyedVectors

def get_word_list(text):
    
    text = str(text)
    text = text.lower()

    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = text.split()

    return text


from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))


vocabulary = dict()
inverse_vocabulary = ['<unk>']
word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

ques_columns = ['question1', 'question2']

for dataset in [train_df]:
    for index, row in dataset.iterrows():
        for question in ques_columns:
            conversionToNumber = []
            for word in get_word_list(row[question]):

                if word in stop_words and word not in word2vec.vocab:
                    continue

                if word not in vocabulary:
                    vocabulary[word] = len(inverse_vocabulary)
                    conversionToNumber.append(len(inverse_vocabulary))
                    inverse_vocabulary.append(word)
                else:
                    conversionToNumber.append(vocabulary[word])

            dataset.set_value(index, question, conversionToNumber)
            

#initialize embedding matrix
embeddings = 1 * np.random.randn(len(vocabulary) + 1, 300)
embeddings[0] = 0 

for word, pos in vocabulary.items():
    if word in word2vec.vocab:
        embeddings[pos] = word2vec.word_vec(word)

del word2vec



### Split the dataset into training, test, and validation and perform padding to the length 256


In [35]:
max_seq_length = 256
training_data_size = 304290
validation_data_size = 50000
test_data_size = 50000

X = train_df[['question1', 'question2']]
Y = train_df['is_duplicate']

X_train = X.iloc[0:training_data_size]
X_train = {'q1': X_train.question1, 'q2': X_train.question2}

Y_train = Y.iloc[0:training_data_size]
Y_train = Y_train.values

X_validation = X.iloc[training_data_size:training_data_size+validation_data_size]
X_validation = {'q1': X_validation.question1, 'q2': X_validation.question2}

Y_validation = Y.iloc[training_data_size:training_data_size+validation_data_size]
Y_validation = Y_validation.values

X_test = X.iloc[training_data_size+validation_data_size:training_data_size+validation_data_size+test_data_size]
X_test = {'q1': X_test.question1, 'q2': X_test.question2}

Y_test = Y.iloc[training_data_size+validation_data_size:training_data_size+validation_data_size+test_data_size]
Y_test = Y_test.values

import itertools

# Perform Padding to make equal to max_seq_length
for dataset, pos in itertools.product([X_train, X_validation, X_test], ['q1', 'q2']):
    dataset[pos] = pad_sequences(dataset[pos], maxlen=max_seq_length)


## Build and save model 

In [36]:
from keras.layers import Lambda
import keras.backend as K
import h5py
from keras.optimizers import Adadelta, Adam

n_hidden = 50
batch_size = 64
n_epoch = 10

def manhattan_distance(left, right):
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

embedding_layer = Embedding(len(embeddings), 300, weights=[embeddings], input_length=max_seq_length, trainable=False)

input_question1 = Input(shape=(max_seq_length,), dtype='int32')
input_question2 = Input(shape=(max_seq_length,), dtype='int32')
encoded_question1 = embedding_layer(input_question1)
encoded_question2 = embedding_layer(input_question2)

lstm1 = LSTM(n_hidden)
lstm2 = LSTM(n_hidden)
output_question1 = lstm1(encoded_question1)
output_question2 = lstm2(encoded_question2)

malstm_distance = Lambda(lambda x: manhattan_distance(x[0], x[1]), lambda x: (x[0][0], 1))([output_question1, output_question2])

# Build the model
lstm_model = Model([input_question1, input_question2], [malstm_distance])
lstm_model.compile(loss='mean_squared_error', optimizer=Adam(clipnorm=1.25), metrics=['accuracy'])

training_start_time = time() 

malstm_trained = lstm_model.fit([X_train['q1'], X_train['q2']], Y_train, batch_size=batch_size, epochs=n_epoch,
                            validation_data=([X_validation['q1'], X_validation['q2']], Y_validation))


required_time = datetime.timedelta(seconds=time()-training_start_time)
print("Total training time is {}".format(required_time))

lstm_model.save('model/train.h5')

Train on 104200 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2
Total training time is 0:26:19.639511


## Load model from directory

In [37]:
lstm_model = Model([input_question1, input_question2], [malstm_distance])
lstm_model.load_weights('model/train.h5')

## Validate and predict on trained model

In [32]:
predictions_validation = lstm_model.predict([X_validation['q1'], X_validation['q2']], batch_size=batch_size)
predictions_validation = [item for sublist in predictions_validation for item in sublist]

predictions_test = lstm_model.predict([X_test['q1'], X_test['q2']], batch_size=batch_size)
predictions_test = [item for sublist in predictions_test for item in sublist]

## Get the predicted values

In [33]:
predicted_values = []

for prediction in predictions_test:
    if prediction > 0.5:
        predicted_values.append(True)
    else:
        predicted_values.append(False)

correct_predictions = sum(predicted_values == (Y_test == 1))
print(correct_predictions)

3259


## Show results using confusion matrix

In [34]:
from sklearn.metrics import confusion_matrix
import sklearn.metrics as skm

# Compute confusion matrix
y_test = Y_test
y_pred = predicted_values
y_pred_prob = predictions_test

cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix')
print(cm)

rec = skm.recall_score(y_test, y_pred)
prec = skm.precision_score(y_test, y_pred)
f1 = skm.f1_score(y_test, y_pred)
acc = skm.accuracy_score(y_test, y_pred)

print("Accuracy: ", acc)
print("Recall: ", rec)
print("Precision: ", prec)
print("F1 score: ", f1)


Confusion matrix
[[3020  142]
 [1599  239]]
Accuracy:  0.6518
Recall:  0.13003264417845484
Precision:  0.6272965879265092
F1 score:  0.21541234790446145


## Test on unlabeled data

In [24]:
for testset in [test_df[0:10]]:
    for index, row in testset.iterrows():
        
        conversionToNumber = []
        for word in get_word_list(row['question1']):

            if word in stop_words:
                continue

            if word not in vocabulary:
                conversionToNumber.append(0)
            else:
                #print(vocabulary[word])
                conversionToNumber.append(vocabulary[word])

        testset.set_value(index, question, conversionToNumber)
        
        conversionToNumber = []
        for word in get_word_list(row['question2']):

            if word in stop_words:
                continue

            if word not in vocabulary:
                conversionToNumber.append(0)
            else:
                #print(word)
                conversionToNumber.append(vocabulary[word])

        testset.set_value(index, question, conversionToNumber)
            
test_X = test_df[['question1', 'question2']]

test_X = test_X.iloc[0:10]

test_X = {'q1': test_X.question1, 'q2': test_X.question2} 

for testset, pos in itertools.product([test_X], ['q1', 'q2']):
    testset[pos] = pad_sequences(testset[pos], maxlen=max_seq_length)
    
predictions_test = lstm_model.predict([test_X['q1'], test_X['q2']], batch_size=batch_size)
predictions_test = [item for sublist in predictions_test for item in sublist]

print(predictions_test)

  app.launch_new_instance()


ValueError: invalid literal for int() with base 10: 'How does the Surface Pro himself 4 compare with iPad Pro?'