# MaLSTM on Kaggle's Quora Question Pairs

This notebook is about implementing the MaLSTM model (http://www.mit.edu/~jonasm/info/MuellerThyagarajan_AAAI16.pdf) on Kaggle's Quora Question Pairs data.

Blog post containing a broader explanation about the network can be found in the following link https://medium.com/@eliorcohen/implementing-malstm-on-kaggles-quora-question-pairs-competition-8b31b0b16a07


# CODE

First, lets import all the necessary packages

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [2]:
from time import time
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

import itertools
import datetime

from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Lambda
import keras.backend as K
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


Global variables

In [3]:
# File paths
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'
EMBEDDING_FILE = 'crawl-300d-2M-subword.vec'
MODEL_SAVING_DIR = './'

Create embedding matrix

In [None]:
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=False)

In [47]:
numberbatch = KeyedVectors.load_word2vec_format("numberbatch-en-17.06.txt.gz", binary=False)

In [49]:
# Load training and test set
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

stops = set(stopwords.words('english'))

def text_to_word_list(text):
    ''' Pre process and convert texts to a list of words '''
    text = str(text)
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = text.split()

    return text

# Prepare embedding
vocabulary = dict()
inverse_vocabulary = ['<unk>']  # '<unk>' will never be used, it is only a placeholder for the [0, 0, ....0] embedding

questions_cols = ['question1', 'question2']

# Iterate over the questions only of both training and test datasets
for dataset in [train_df, test_df]:
    for index, row in dataset.iterrows():

        # Iterate through the text of both questions of the row
        for question in questions_cols:

            q2n = []  # q2n -> question numbers representation
            for word in text_to_word_list(row[question]):

                # Check for unwanted words
                if word in stops and word not in numberbatch.vocab:
                    continue

                if word not in vocabulary:
                    vocabulary[word] = len(inverse_vocabulary)
                    q2n.append(len(inverse_vocabulary))
                    inverse_vocabulary.append(word)
                else:
                    q2n.append(vocabulary[word])

            # Replace questions as word to question as number representation
            dataset.set_value(index, question, q2n)
            



In [2]:
from collections import namedtuple

Phrase = namedtuple('Phrase', 'original candidate label')
Token = namedtuple('Token', 'text tags')

def split_tokens(sent):
    tokens = []
    for token in sent.split():
        tags = token.split('/')
        tokens.append(Token(tags[0].lower(), tuple(tags[1:])))
    return tokens


def readData(filename, eval_label, ignoreNone):
    data = []
    with open(filename) as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) == 7:
                (trendid, trendname, origsent, candsent, judge, origsenttag, candsenttag) = fields
            else:
                continue
            label = eval_label(judge)
            if ((label is None) and ignoreNone):
                continue
            data.append(Phrase(split_tokens(origsenttag), split_tokens(candsenttag), label))
    
    return data
                
def eval_amt_label(label):
    nYes = eval(label)[0]            
    
    if nYes >= 3:
        return True
    elif nYes <= 1:
        return False
    
    return None

def eval_expert_label(label):
    nYes = int(label[0])
    
    if nYes >= 4:
        return True
    elif nYes <= 2:
        return False
    
    return None


def readTrainData(filename):
    return readData(filename, eval_amt_label, True)

def readTestData(filename):
    return readData(filename, eval_expert_label, False)

In [3]:
train_data = readTrainData("SemEval-PIT2015-py3/data/train.data")
dev_data = readTrainData("SemEval-PIT2015-py3/data/dev.data")
test_data = [p for p in readTestData("SemEval-PIT2015-py3/data/test.data") if p.label is not None]

In [4]:
from nltk.corpus import stopwords
from tqdm import tqdm_notebook

def clean_sent(sent):
    new_sent = []
    for token in sent:
        if token.tags[0].startswith('B-'):
            new_sent.append(token.tags[0].split('-')[1])
            continue
        if token.tags[0].startswith('I-'): #or token.text in stopwords.words('english'):
            continue
        #if token.tags[1] == 'CD' or token.text in NUM or token.text.isnumeric():
        #    new_sent.append('number')
        #    continue
        new_sent.append(token.text)
                            
    return new_sent

def clean_data(data):
    return [Phrase(clean_sent(phrase.original), clean_sent(phrase.candidate), phrase.label) \
            for phrase in tqdm_notebook(data)]

In [5]:
clean_train_data = clean_data(train_data)

HBox(children=(IntProgress(value=0, max=11530), HTML(value='')))




In [6]:
clean_dev_data = clean_data(dev_data)

HBox(children=(IntProgress(value=0, max=4142), HTML(value='')))




In [7]:
clean_test_data = clean_data(test_data)

HBox(children=(IntProgress(value=0, max=838), HTML(value='')))




In [19]:
def text_to_sequence(words):
    return [vocabulary[word] for word in words if word in vocabulary]

def sequence_to_text(seq):
    return [inverse_vocabulary[i] for i in seq if i]

In [50]:
def data_to_sequences(data):
    
    encoder_seqs = []
    decoder_seqs = []
    labels = []
    
    for phrase in data:
        encoder_seqs.append(text_to_sequence([t for t in phrase.original]))
        decoder_seqs.append(text_to_sequence([t for t in phrase.candidate]))
        labels.append(phrase.label)
        
    return encoder_seqs, decoder_seqs, labels 

train_encoder_seqs, train_decoder_seqs, train_labels = data_to_sequences(clean_train_data)

print(len(train_encoder_seqs))
print(len(train_decoder_seqs))
print(len(train_labels))

11530
11530
11530


In [52]:
print(sequence_to_text(train_encoder_seqs[0]))
print(sequence_to_text(train_decoder_seqs[0]))
print(train_labels[0])

['person', 'the', '1st', 'qb', 'to', 'go', 'in', 'this', 'draft']
['but', 'my', 'bro', 'from', 'the', '757', 'person', 'is', 'the', '1st', 'qb', 'gone']
True


In [53]:
dev_encoder_seqs, dev_decoder_seqs, dev_labels = data_to_sequences(clean_dev_data)

In [54]:
test_encoder_seqs, test_decoder_seqs, test_labels = data_to_sequences(clean_test_data)

In [55]:
from keras.preprocessing.sequence import pad_sequences

def padding(sequences):
    return pad_sequences(sequences, maxlen=max_seq_length)

In [56]:
            
embedding_dim = 300
embeddings = 1 * np.random.randn(len(vocabulary) + 1, embedding_dim)  # This will be the embedding matrix
embeddings[0] = 0  # So that the padding will be ignored

# Build the embedding matrix
for word, index in vocabulary.items():
    if word in numberbatch.vocab:
        embeddings[index] = numberbatch.word_vec(word)

#del word2vec

In [57]:
len(vocabulary)

121322

Prepare training and validation data

In [58]:
max_seq_length = max(train_df.question1.map(lambda x: len(x)).max(),
                     train_df.question2.map(lambda x: len(x)).max(),
                     test_df.question1.map(lambda x: len(x)).max(),
                     test_df.question2.map(lambda x: len(x)).max())

# Split to train validation
validation_size = 40000
training_size = len(train_df) - validation_size

X = train_df[questions_cols]
Y = train_df['is_duplicate']

X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size)

# Split to dicts
X_train = {'left': X_train.question1, 'right': X_train.question2}
X_validation = {'left': X_validation.question1, 'right': X_validation.question2}
X_test = {'left': test_df.question1, 'right': test_df.question2}

# Convert labels to their numpy representations
Y_train = Y_train.values
Y_validation = Y_validation.values

# Zero padding
for dataset, side in itertools.product([X_train, X_validation], ['left', 'right']):
    dataset[side] = pad_sequences(dataset[side], maxlen=max_seq_length)

# Make sure everything is ok
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

Build the model

In [70]:
# Model variables
n_hidden = 50
gradient_clipping_norm = 1.25
batch_size = 64
n_epoch = 25

def exponent_neg_manhattan_distance(left, right):
    ''' Helper function for the similarity estimate of the LSTMs outputs'''
    return K.exp(-K.sum(K.abs(left-right), axis=1, keepdims=True))

# The visible layer
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

embedding_layer = Embedding(len(embeddings), embedding_dim, weights=[embeddings], input_length=max_seq_length, trainable=False)

# Embedded version of the inputs
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

# Since this is a siamese network, both sides share the same LSTM
shared_lstm = LSTM(n_hidden)

left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)

# Calculates the distance as defined by the MaLSTM model
malstm_distance = Lambda(function=lambda x: exponent_neg_manhattan_distance(x[0], x[1]),output_shape=lambda x: (x[0][0], 1))([left_output, right_output])

# Pack it all up into a model
malstm = Model([left_input, right_input], [malstm_distance])

# Adadelta optimizer, with gradient clipping by norm
optimizer = Adadelta(clipnorm=gradient_clipping_norm)

malstm.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])

malstm.summary()

# Start training
#training_start_time = time()

#print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.timedelta(seconds=time()-training_start_time)))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 245)          0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 245)          0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 245, 300)     36396900    input_7[0][0]                    
                                                                 input_8[0][0]                    
__________________________________________________________________________________________________
lstm_4 (LSTM)                   (None, 50)           70200       embedding_4[0][0]                
          

In [62]:
malstm_trained = malstm.fit([X_train['left'], X_train['right']], Y_train, batch_size=batch_size, nb_epoch=3,
                            validation_data=([X_validation['left'], X_validation['right']], Y_validation))

  


Train on 364290 samples, validate on 40000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [63]:
from sklearn.metrics import classification_report

print(classification_report(Y_validation, [prob.item() > 0.5 for prob in malstm.predict([X_validation['left'], X_validation['right']])]))

              precision    recall  f1-score   support

           0       0.79      0.89      0.83     25280
           1       0.75      0.59      0.66     14720

   micro avg       0.78      0.78      0.78     40000
   macro avg       0.77      0.74      0.75     40000
weighted avg       0.77      0.78      0.77     40000



In [79]:
malstm.fit([padding(train_encoder_seqs), padding(train_decoder_seqs)], np.array(train_labels),
          batch_size = 64, epochs = 10, validation_data=([padding(dev_encoder_seqs), padding(dev_decoder_seqs)], dev_labels))

Train on 11530 samples, validate on 4142 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7faf08bfd0>

In [80]:
print(classification_report(dev_labels, 
                            [prob.item() > 0.5 for prob in malstm.predict([padding(dev_encoder_seqs), padding(dev_decoder_seqs)])]))

              precision    recall  f1-score   support

       False       0.66      0.99      0.79      2672
        True       0.80      0.09      0.15      1470

   micro avg       0.67      0.67      0.67      4142
   macro avg       0.73      0.54      0.47      4142
weighted avg       0.71      0.67      0.57      4142



Plotting the results

In [None]:
# Plot accuracy
plt.plot(malstm_trained.history['acc'])
plt.plot(malstm_trained.history['val_acc'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Plot loss
plt.plot(malstm_trained.history['loss'])
plt.plot(malstm_trained.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()

In [8]:
import tensorflow as tf
import tensorflow_hub as hub

W0524 21:56:50.073685 140647387600704 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [9]:
url = "https://tfhub.dev/google/elmo/2"
embed = hub.Module(url)

Instructions for updating:
Colocations handled automatically by placer.


W0524 21:56:51.411659 140647387600704 deprecation.py:323] From /home/szubovych/.virtualenvs/nlp/lib/python3.6/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [16]:
def get_elmo_embeddings(sentence):
    return elmo.embed_sentence(sentence)

In [17]:
def to_elmo_vectors(data):
    originals = [get_elmo_embeddings(phrase.original) for phrase in tqdm_notebook(data)]
    candidates = [get_elmo_embeddings(phrase.candidate) for phrase in tqdm_notebook(data)]
    return originals, candidates

In [None]:
%time train_orig_elmo, train_cand_elmo = to_elmo_vectors(clean_train_data)

HBox(children=(IntProgress(value=0, max=11530), HTML(value='')))

In [13]:
from allennlp.commands.elmo import ElmoEmbedder

In [15]:
elmo = ElmoEmbedder()
tokens = ["I", "ate", "an", "apple", "for", "breakfast"]
elmo.embed_sentence(tokens)

array([[[ 6.9227189e-01, -3.2613137e-01,  2.2827488e-01, ...,
          1.7574824e-01,  2.6598701e-01, -1.0131964e-01],
        [-6.6311520e-01,  2.9512239e-01,  6.5207249e-01, ...,
          6.5627527e-01,  4.2394829e-01,  1.2068849e+00],
        [ 7.9657242e-02,  1.9919699e-01, -6.9490090e-02, ...,
          2.1669459e-02,  1.2296096e-01,  4.1096029e-03],
        [ 1.4436828e-01,  6.7775294e-02,  3.7736303e-01, ...,
          4.1031602e-01,  2.9029363e-01, -6.1045051e-02],
        [-2.4150275e-01,  5.4133452e-02, -3.1142172e-01, ...,
          4.0102768e-01, -2.6212466e-01, -4.2983264e-01],
        [-2.3136677e-01,  2.7052870e-01,  4.4679990e-01, ...,
          2.0299807e-01, -2.5246465e-01,  4.1547567e-02]],

       [[-1.1051465e+00, -4.0921739e-01, -4.3645072e-01, ...,
         -5.5361623e-01, -2.2313191e-01,  3.2954246e-02],
        [ 2.3847309e-01, -2.4085948e-01,  2.2867769e-02, ...,
          3.5999569e-01, -1.5432328e-02,  9.6235320e-02],
        [ 6.7190886e-02, -1.4298746e-0