In [42]:
# import numpy as np
# import pandas as pd


# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences

# from keras.models import Model
# from keras.layers import Input
# from keras.layers import Embedding
# from keras.layers import LSTM
# from keras.layers import Dropout
# from keras.layers import Dense
# from keras.layers.merge import concatenate
# from keras.layers.normalization import BatchNormalization
# from keras.callbacks import EarlyStopping
# from keras.callbacks import ModelCheckpoint


# from keras.utils import plot_model

# from sklearn.model_selection import train_test_split

# from gensim.models import KeyedVectors

# import pickle as pkl

# import pydot

## An LSTM implementation for the Quora question deduplication problem

Our goal for this project is to extend the previous work done on this problem by incorporating external knowledge and linguistic features. 

The current state-of-the-art solutions for this problem are almost unanimously deep recurrent network implementations. The best models can perform outstandingly well - better than human subjects (based on the result of over 500 responses I've collected from human subjects). 

Although our goal is to extract interesting linguistic insight rather than to achieve competitive accuracy, if we are to claim that we've extended previous work, we must first build a model that is comparable to this work - both in accuracy and in technique. 

Thus, this model is intended to be a baseline performance model. What I would consider a success would be if we could make our other, less sophisticated (tree based) models as performant as this one, using hand crafted, insightful features. 

### 0. Load data and resources

The data is already pre-processed in the `preprocessing/string_cleaning.Rmd` notebook. 

I've also trained a `Tokenizer` object on the entire dataset, which will allow me to convert sentences into index vectors - a format usable by neural network models. 

#### 0.0 Load data

In [2]:
# Read the pre-processed question pairs (see preprocessing/string_cleaning.Rmd).
data = pd.read_csv(filepath_or_buffer="../data/processed/train.csv")

In [3]:
# Peek at the first rows to confirm the columns loaded as expected.
data.head()

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,1,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,2,1,3,4,What is the story of Kohinoor Koh - i - Noor D...,What would happen if the Indian government sto...,0
2,3,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,4,3,7,8,Why am I mentally very lonely ? How can I solv...,Find the remainder when math 23 24 math is div...,0
4,5,4,9,10,"Which one dissolve in water quickly sugar , sa...",Which fish would survive in salt water ?,0


#### 0.1 Load Tokenizer

This is a pretrained `keras.preprocessing.text.Tokenizer` object that will allow me to convert sentences to index vectors in a consistent way.

In [4]:
# Restore the word tokenizer that was fitted ahead of time and pickled.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open("../models/tokenizer.pickle", mode="rb") as tokenizer_file:
    tokenizer = pkl.load(tokenizer_file)

In [5]:
# Smoke test: the never-seen word at the end should come back as 18065,
# the index used for <UNK>.
tokenizer.texts_to_sequences(
    ["this is text. It has a word never seen before. Namely: cockadoodledoo"]
)

[[67, 3, 740, 19, 69, 6, 239, 378, 466, 184, 18065]]

Store the number of words encoded by the tokenizer

In [6]:
# Number of word types in the joint dataset, plus one extra slot so that the
# largest word index is still a valid row (index 0 shows up as padding in the
# encoded output).
vocab_size = 1 + len(tokenizer.word_index)
vocab_size

93261

### 1. A function for tokenizing, indexing and padding sequences. 

Takes in sentences, outputs padded index vectors.

In [7]:
def encode_and_pad(questions, sequence_length = 25):
    """Turn raw question strings into fixed-length index vectors.

    Each question is mapped to word indices by the notebook-level `tokenizer`,
    then padded or truncated at the end so every row has exactly
    `sequence_length` entries.

    Args:
        questions: array or list of question strings.
        sequence_length: number of indices in every returned row (default 25).

    Returns:
        The padded index sequences, as returned by `pad_sequences`.
    """
    index_vectors = tokenizer.texts_to_sequences(questions)
    # "post" for both options: pad and cut at the end of the sequence.
    return pad_sequences(index_vectors,
                         maxlen=sequence_length,
                         padding="post",
                         truncating="post")

In [8]:
# Sanity check with sequence_length = 10: the short string should come back
# zero-padded and the long string truncated to 10 indices.
encode_and_pad(["this is one string. It is short,",
                "this is another string. It is much longer. in fact, it is so long, that it should not be padded., \
                but rather it will be truncated. "], 
              sequence_length = 10)

array([[  67,    3,   50, 2570,   19,    3,  667,    0,    0,    0],
       [  67,    3,  403, 2570,   19,    3,   75, 1288,    8, 1475]], dtype=int32)

### 2. Load embedding matrix

I use the pre-trained `fasttext` word embedding vectors (Mikolov, @Facebook Research). 

In [9]:
# Read the pre-trained fastText vectors from their text (word2vec) format.
# This is slow.
embedding_model = KeyedVectors.load_word2vec_format(
    '../data/embeddings/wiki.en.vec', binary=False)

In [10]:
# Build the lookup table: row i holds the 300-d vector of the word whose
# tokenizer index is i. Words missing from the embedding model keep an
# all-zero row.
embedding_matrix = np.zeros(shape=(vocab_size, 300))
for token, row in tokenizer.word_index.items():
    if token not in embedding_model:
        continue
    embedding_matrix[row] = embedding_model[token]

### 3. Setting up testing environment

To pick the right architecture in a sincere way, I will need to split the data into training/development/test sets. 

I'll split the data into training/development/test sets using a 60:20:20 split.

In [11]:
# Hold out 20% of the rows as the test set; the remaining 80% is split again
# into training and development sets in the next cell.
# The label column is removed with an explicit drop: `Index - list` as a set
# difference is deprecated in pandas and fails in later versions.
X_train_and_dev, X_test, y_train_and_dev, y_test = train_test_split(
    data.drop("is_duplicate", axis=1), data['is_duplicate'],
    test_size=0.2, random_state=550)

  app.launch_new_instance()


In [12]:
# 25% of the remaining 80% becomes the development set, i.e. 20% of all rows.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train_and_dev,
    y_train_and_dev,
    random_state=550,
    test_size=0.25)

In [13]:
# Sanity check: show the shape of the full data and of every split.
# `print("")` emits the blank separator line on both Python 2 and Python 3;
# a bare `print` is a no-op expression on Python 3.
print(data.shape)
print("")
print(X_train.shape)
print(X_dev.shape)
print(X_test.shape)
print("")
print(y_train.shape)
print(y_dev.shape)
print(y_test.shape)

# The three splits must partition the data, and features must line up with labels.
assert len(X_train) + len(X_dev) + len(X_test) == len(data)
assert (len(X_train), len(X_dev), len(X_test)) == (len(y_train), len(y_dev), len(y_test))

(404288, 7)

(242572, 6)
(80858, 6)
(80858, 6)

(242572,)
(80858,)
(80858,)


### 4. A first model

This architecture is inspired by the one posted by Quora in [this blog post](https://engineering.quora.com/Semantic-Question-Matching-with-Deep-Learning) and by [this great starter code](https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings) from the Kaggle user `lystdo`. 

In [14]:
# Hyper-parameters of the first model
sequence_length1 = 25    # length of each padded question fed to the network
num_lstm1 = 200          # units of each LSTM layer
num_dense1 = 100         # units of the fully connected layer
# the same rate is used for regular and for recurrent dropout
dropout_rate1 = recurrent_droput_rate1 = 0.2

In [15]:
# One index-vector input per question; both have the same fixed length.
input1, input2 = [
    Input(shape=(sequence_length1,), name=layer_name)
    for layer_name in ("Question1-Input", "Question2-Input")
]

In [16]:
# Embedding layer on top of the first input, initialised from the pre-trained
# vectors in `embedding_matrix` and kept frozen. Without `weights`, a
# non-trainable Embedding keeps its random initial values and the pre-trained
# vectors are never used.
embedding1 = Embedding(input_dim = vocab_size, 
                     output_dim = 300, 
                     weights = [embedding_matrix],
                     input_length = sequence_length1,
                     trainable = False)(input1)

In [17]:
# Embedding layer on top of the second input, initialised from the pre-trained
# vectors in `embedding_matrix` and kept frozen. Without `weights`, a
# non-trainable Embedding keeps its random initial values and the pre-trained
# vectors are never used.
embedding2 = Embedding(input_dim = vocab_size, 
                     output_dim = 300, 
                     weights = [embedding_matrix],
                     input_length = sequence_length1,
                     trainable = False)(input2)

In [18]:
# Encode the embedded first question with an LSTM (input and recurrent dropout enabled).
lstm_unit1 = LSTM(units=num_lstm1,
                  dropout=dropout_rate1,
                  recurrent_dropout=recurrent_droput_rate1)(embedding1)

In [19]:
# Encode the embedded second question with its own LSTM, configured identically.
lstm_unit2 = LSTM(units=num_lstm1,
                  dropout=dropout_rate1,
                  recurrent_dropout=recurrent_droput_rate1)(embedding2)

In [20]:
# Join the two question encodings into a single feature vector (last axis).
merged = concatenate([lstm_unit1, lstm_unit2], axis=-1)

In [21]:
# Dropout followed by batch normalisation on the merged representation;
# the intent is to speed up convergence.
merged_dropped = Dropout(rate=dropout_rate1)(merged)
merged = BatchNormalization()(merged_dropped)

In [22]:
# Fully connected ReLU layer, again followed by dropout and batch normalisation.
dense_out = Dense(units=num_dense1, activation='relu')(merged)
dense_out = Dropout(rate=dropout_rate1)(dense_out)
merged = BatchNormalization()(dense_out)

In [23]:
# Output layer: one sigmoid unit producing the model's prediction.
predictions = Dense(units=1, activation='sigmoid')(merged)

Now, to make sure things are OK, I'll compile the model and look at the structure and diagram of the architecture.

In [24]:
# Assemble the two-input model and compile it with a binary cross-entropy
# loss, the Nadam optimizer and accuracy as the reported metric.
model1 = Model(inputs=[input1, input2], outputs=predictions)
model1.compile(optimizer='nadam',
               loss='binary_crossentropy',
               metrics=['acc'])

In [25]:
# Print the layer-by-layer summary to check output shapes and parameter counts.
model1.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Question1-Input (InputLayer)    (None, 25)           0                                            
__________________________________________________________________________________________________
Question2-Input (InputLayer)    (None, 25)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 25, 300)      27978300    Question1-Input[0][0]            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 25, 300)      27978300    Question2-Input[0][0]            
__________________________________________________________________________________________________
lstm_1 (LS

In [41]:
# Render the architecture diagram to disk so the markdown cell can embed it.
plot_model(model1, to_file="./diagrams/model1.png")

![title](diagrams/model1.png)

I'll add early stopping, so that the model will stop training if the development loss doesn't improve for 3 straight epochs. 

I'll also save the best model.

In [43]:
# Stop training once val_loss has gone 3 epochs without improving.
early_stopping = EarlyStopping(patience=3, monitor='val_loss')

In [44]:
# Checkpoint only the weights, and only when the monitored quantity improves.
model_checkpoint = ModelCheckpoint(
    filepath="../models/lstm1.h5",
    save_weights_only=True,
    save_best_only=True)

#### 4.1 Train!

In [48]:
# Pull the raw question strings out of the training and development frames.
question1_train, question2_train = [
    X_train[column].values for column in ("question1", "question2")
]
question1_dev, question2_dev = [
    X_dev[column].values for column in ("question1", "question2")
]

In [55]:
# Encode every question to exactly the length the model's Input layers were
# built with (sequence_length1), instead of relying on encode_and_pad's
# default happening to match.
train_inputs = [encode_and_pad(question1_train, sequence_length1),
                encode_and_pad(question2_train, sequence_length1)]
dev_inputs = [encode_and_pad(question1_dev, sequence_length1),
              encode_and_pad(question2_dev, sequence_length1)]

# Train with early stopping; the development set is used for validation and
# the checkpoint callback saves the best weights.
model1.fit(train_inputs, y_train,
        validation_data=(dev_inputs, y_dev),
        epochs=200, batch_size=2425, shuffle=False,
        callbacks=[early_stopping, model_checkpoint])

Train on 242572 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200


<keras.callbacks.History at 0x235c7c290>