In [33]:
# import numpy as np
# import pandas as pd


# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences

# import keras.backend as K
# from keras.models import Model
# from keras.layers import Input
# from keras.layers import Embedding
# from keras.layers import LSTM
# from keras.layers import Merge
# from keras.layers import Dropout
# from keras.layers import Dense
# from keras.layers.merge import concatenate
# from keras.layers.normalization import BatchNormalization
# from keras.callbacks import EarlyStopping
# from keras.callbacks import ModelCheckpoint


# from keras.utils import plot_model

# from sklearn.model_selection import train_test_split

# from gensim.models import KeyedVectors

# import pickle as pkl

# # import pydot

## Siamese LSTM

Here is an implementation of a Manhattan Siamese LSTM for the question deduplication problem.

Siamese LSTMs are a deep neural network architecture in which multiple inputs share the same LSTM structure and weights.

A particular type of Siamese LSTM, called the Siamese Manhattan LSTM (MaLSTM), has been shown to be useful for learning sentence similarity and is good at learning semantic relationships between sentences ([Mueller, Thyagarajan; 2016](http://www.mit.edu/~jonasm/info/MuellerThyagarajan_AAAI16.pdf)).


Here is a simple implementation of the Siamese Manhattan LSTM for the Quora question deduplication problem.

### 0. Load data and resources

The data is already pre-processed in the preprocessing/string_cleaning.Rmd notebook.

I've also trained a Tokenizer object on the entire dataset, which will allow me to convert sentences into index vectors - a format usable by neural network models.

#### 0.0 Load Data

In [3]:
# Read the pre-processed question pairs into a DataFrame.
train_csv_path = "../data/processed/train.csv"
data = pd.read_csv(train_csv_path)


In [4]:
# Peek at the first five rows to confirm the columns loaded as expected.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,1,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,2,1,3,4,What is the story of Kohinoor Koh - i - Noor D...,What would happen if the Indian government sto...,0
2,3,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,4,3,7,8,Why am I mentally very lonely ? How can I solv...,Find the remainder when math 23 24 math is div...,0
4,5,4,9,10,"Which one dissolve in water quickly sugar , sa...",Which fish would survive in salt water ?,0


#### 0.1 Load Tokenizer

This converts sentences into index vectors in a consistent way. 

In [7]:
# Restore the pickled word tokenizer from disk.
# NOTE: unpickling can execute arbitrary code -- only load a pickle file
# that you produced yourself.
with open("../models/tokenizer.pickle", mode="rb") as pickle_file:
    tokenizer = pkl.load(pickle_file)

Using TensorFlow backend.


In [14]:
# Number of distinct word types in the joint dataset, plus one extra slot
# (presumably because the tokenizer's word indices start at 1 and index 0
# is left for padding -- TODO confirm against the tokenizer).
vocab_size = 1 + len(tokenizer.word_index)
vocab_size

93261

In [8]:
# Smoke test: encode a sentence that ends with a word the tokenizer has never
# seen. The unseen word should map to 18065, the index for <UNK>.
sample_text = "this is text. It has a word never seen before. Namely: cockadoodledoo"
tokenizer.texts_to_sequences([sample_text])

[[67, 3, 740, 19, 69, 6, 239, 378, 466, 184, 18065]]

### 1. A function for tokenizing, encoding and padding sequences

In [9]:

def encode_and_pad(questions, sequence_length = 25):
    """Convert raw question strings into fixed-length index vectors.

    Each question is mapped to a list of word indices by the module-level
    `tokenizer`; every list is then padded or truncated at the END
    ("post") so that all rows have exactly `sequence_length` entries.

    Parameters
    ----------
    questions : iterable of str
        The questions to encode (e.g. a numpy array of strings).
    sequence_length : int, optional
        Length of every returned index vector. Defaults to 25.

    Returns
    -------
    The padded index vectors produced by `pad_sequences`, one row per
    question, in a format usable by deep learning models.
    """
    index_vectors = tokenizer.texts_to_sequences(questions)
    return pad_sequences(index_vectors,
                         maxlen=sequence_length,
                         padding="post",
                         truncating="post")

### 2. Load Embedding Matrix

In [12]:
# Load the pre-trained fastText word vectors from their word2vec-format text
# file. This is slow, so avoid re-running the cell unnecessarily.
fasttext_vectors_path = '../data/embeddings/wiki.en.vec'
embedding_model = KeyedVectors.load_word2vec_format(fasttext_vectors_path)

In [15]:
# Build the (vocab_size x 300) lookup table: row k holds the pre-trained
# vector of the word whose integer encoding is k. Words that have no
# pre-trained vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 300))
for token, row in tokenizer.word_index.items():
    if token not in embedding_model:
        continue
    embedding_matrix[row] = embedding_model[token]

### 3. Setting Up testing environment

I split the data into training/dev/test sets using a 60:20:20 split.

The seed is set to `550` for consistency with my previous work and for future reproducibility.

In [16]:
# Hold out 20% of the rows as the test set; the remaining 80% is split into
# training and dev sets in a later step.
# The label column is removed with DataFrame.drop: subtracting a list from
# `data.columns` relied on deprecated Index set-arithmetic, which emits a
# warning on old pandas and raises a TypeError on current versions.
X_train_and_dev, X_test, y_train_and_dev, y_test = train_test_split(
    data.drop("is_duplicate", axis=1),
    data["is_duplicate"],
    test_size=0.2,
    random_state=550)

  app.launch_new_instance()


In [17]:
# Carve the dev set out of the non-test rows: 25% of the remaining 80% is 20%
# of the full data, which yields the overall 60:20:20 split.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train_and_dev,
    y_train_and_dev,
    test_size=0.25,
    random_state=550)

In [18]:
# Sanity check: print the shape of the full data, then of each feature split,
# then of each label split, with a blank line between the groups.
# `print("")` is used for the separators because a bare `print` is a no-op
# expression under Python 3, while `print("")` emits a blank line on both
# Python 2 and Python 3.
print(data.shape)
print("")
for feature_split in (X_train, X_dev, X_test):
    print(feature_split.shape)
print("")
for label_split in (y_train, y_dev, y_test):
    print(label_split.shape)

(404288, 7)

(242572, 6)
(80858, 6)
(80858, 6)

(242572,)
(80858,)
(80858,)


### 4. Build Model Architecture

Credit to Elior Cohen for [this awesome blog post!](https://medium.com/mlreview/implementing-malstm-on-kaggles-quora-question-pairs-competition-8b31b0b16a07)

Model-specific parameters that dictate the architecture.

In [26]:
# Hyper-parameters for the model and its training run.
SEQUENCE_LENGTH = 30   # number of word indices fed to the model per question
N_HIDDEN_UNITS = 60    # size of the shared LSTM layer
BATCH_SIZE = 64        # 404288 rows / 6317 batches
N_EPOCHS = 25          # number of training epochs

Now, we need the Manhattan similarity function. This will be used for the merging layer: 

The Manhattan similarity between two vectors $v_1$ and $v_2$ is: 
$$
\mathrm{ManhattanSim}(v_1, v_2) = \exp\left(-\lVert v_1 - v_2 \rVert_1\right)
$$

In [23]:
# Credit Elior Cohen
def exponent_neg_manhattan_distance(left, right):
    '''Similarity estimate for the two LSTM outputs: exp(-L1 distance).

    The absolute element-wise differences between `left` and `right` are
    summed along axis 1 (keeping that axis as size 1), and the negated
    sum is passed through exp.
    '''
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

In [27]:
# One input unit per question; both accept SEQUENCE_LENGTH word indices.
question_shape = (SEQUENCE_LENGTH,)
input1 = Input(shape=question_shape, name="Question1-Input")
input2 = Input(shape=question_shape, name="Question2-Input")

In [29]:
# Add an Embedding layer on top of the first input.
# The layer is initialised with the pre-trained `embedding_matrix` and frozen
# (trainable=False). Without `weights=[embedding_matrix]` the frozen layer
# would keep its random initial values and the pre-trained vectors would
# never be used.
embedding1 = Embedding(input_dim = vocab_size,
                     output_dim = 300,
                     weights = [embedding_matrix],
                     input_length = SEQUENCE_LENGTH,
                     trainable = False)(input1)

In [28]:
# Add an Embedding layer on top of the second input.
# The layer is initialised with the pre-trained `embedding_matrix` and frozen
# (trainable=False). Without `weights=[embedding_matrix]` the frozen layer
# would keep its random initial values and the pre-trained vectors would
# never be used.
embedding2 = Embedding(input_dim = vocab_size,
                     output_dim = 300,
                     weights = [embedding_matrix],
                     input_length = SEQUENCE_LENGTH,
                     trainable = False)(input2)

In [30]:
# A single LSTM layer object that both question branches are passed through.
# Reusing the same layer (and therefore the same weights) for both inputs is
# what makes this a Siamese LSTM.
shared_lstm = LSTM(units=N_HIDDEN_UNITS)

In [31]:
# Run both embedded questions through the same LSTM layer to get one output
# per question (left = question 1, right = question 2).
left_output, right_output = shared_lstm(embedding1), shared_lstm(embedding2)

In [35]:
# Merge the two LSTM outputs into a single similarity score using the
# Manhattan similarity.
# A Lambda layer is used instead of the legacy `Merge` layer, which is
# deprecated in Keras 2 and removed from later releases. The Lambda receives
# the list [left_output, right_output] and emits one value per example.
from keras.layers import Lambda

merged = Lambda(lambda tensors:
                exponent_neg_manhattan_distance(tensors[0], tensors[1]),
                output_shape=lambda shapes: (shapes[0][0], 1))(
    [left_output, right_output])

  from ipykernel import kernelapp as app


In [37]:
# instantiate model
# Assemble the model: two question inputs in, one similarity score out.
MaLSTM = Model(inputs=[input1, input2], outputs=[merged])

In [38]:
# Configure training: binary cross-entropy loss, the Nadam optimizer, and
# accuracy as the reported metric.
MaLSTM.compile(
    optimizer='nadam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [39]:
# Print the layer-by-layer summary (output shapes, parameter counts and
# connections) to check that the model is wired as intended.
MaLSTM.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Question1-Input (InputLayer)    (None, 30)           0                                            
__________________________________________________________________________________________________
Question2-Input (InputLayer)    (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 30, 300)      27978300    Question1-Input[0][0]            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 30, 300)      27978300    Question2-Input[0][0]            
__________________________________________________________________________________________________
lstm_1 (LS

In [41]:
# Render the model graph to an image file (displayed in the next cell).
plot_model(MaLSTM, to_file="./diagrams/MaLSTM.png")

![](./diagrams/MaLSTM.png)

In [None]:
# The model's inputs take SEQUENCE_LENGTH word indices per question, so the
# raw question strings must be tokenized and padded before training.
train_inputs = [encode_and_pad(X_train['question1'], SEQUENCE_LENGTH),
                encode_and_pad(X_train['question2'], SEQUENCE_LENGTH)]
# Validation uses the dev split (X_dev / y_dev) created above; there is no
# `X_validation` variable in this notebook.
dev_inputs = [encode_and_pad(X_dev['question1'], SEQUENCE_LENGTH),
              encode_and_pad(X_dev['question2'], SEQUENCE_LENGTH)]

# `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
MaLSTM.fit(train_inputs, y_train,
           batch_size = BATCH_SIZE, epochs = N_EPOCHS,
           validation_data = (dev_inputs, y_dev))

In [None]:
# Persist the trained model to an HDF5 file so it can be reloaded later.
MaLSTM.save(filepath="../models/siamese_lstm.h5")