This kernel presents an extensible approach to handling the Quora Question Pairs competition with a deep neural network. Using the final model's predictions should yield ~0.35 on the public leaderboard.

In [1]:
import re
import nltk
import random
import gensim
import pickle
import logging
import itertools
import numpy as np
import pandas as pd
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

from tqdm import tqdm
from collections import Counter
from sklearn.model_selection import train_test_split

from keras.models import Model
from keras.layers import Embedding
from keras.layers import Lambda
from keras.layers.merge import concatenate
from keras.layers import LSTM, Bidirectional
from keras.layers import Input, Dense, Dropout
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.callbacks import ModelCheckpoint


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Using TensorFlow backend.


### Define some useful functions we need to represent text

In [2]:
stopwords = set(nltk.corpus.stopwords.words("english"))
 
def preprocess(text, min_length=2, swords=set()):
    """
    Does preprocessing on an input string by lowering it, tokenizing, filtering out stopwords,
    tokens shorter than min_length and tokens consisting of not English letters.
    """
    text = str(text).lower()
    words = map(lambda word: word.lower(), nltk.word_tokenize(text))
    words = [word for word in words if word not in swords]
    p = re.compile('[a-zA-Z]+');
    filtered_tokens = list(filter(lambda token: p.match(token) and len(token)>=min_length, words))
    return filtered_tokens

def build_vocab(tokenlists, max_size=20000, emb_model=None):
    """
    Builds a vocabulary of at most max_size words from the supplied list of lists of tokens.
    If a word embedding model is provided, adds only the words present in the model vocabulary.
    """

    all_words = list(itertools.chain.from_iterable(tokenlists))
    counter = Counter(all_words)
    if emb_model:
        counter = Counter(x for x in counter if x in emb_model)
            
    vocab = counter.most_common(max_size-2)

    voc_words = [k[0] for k in vocab]

    voc = {}
    voc['NULL'] = 0
    voc['UNKN'] = 1
    for i, k in enumerate(voc_words):
        voc[k] = i+2

    rvoc = {v: k for k, v in voc.items()}

    return voc, rvoc

def vectorize_tokens(tokens, token_to_id, max_len):
    """
    Converts a list of tokens to a list of token ids using the supplied dictionary.
    Pads resulting list with NULL identifiers up to max_len length.
    """
    ids = []
    for token in tokens:
        ids.append(token_to_id.get(token, voc["UNKN"]))

    ids = ids[:max_len]
    if len(ids) < max_len:
        ids += (max_len-len(ids))*[token_to_id["NULL"]]

    return ids

def vectorize(tok_lists, token_to_id, max_len=150):
    """
    Converts a list of lists of tokens to a numpy array of token identifiers
    """
    
    token_matrix = []
        
    for tok_list in tok_lists:
        token_ids = vectorize_tokens(tok_list, token_to_id, max_len)
        token_matrix.append(token_ids)
    
    token_matrix = np.array(token_matrix)
        
    return token_matrix

def get_embeddings(model, rev_voc, dim=300):

    myembeddings = []
    for key in sorted(rev_voc.keys()):
        val = rev_voc[key]
        if val == 'NULL':
            myembeddings.append(np.zeros((dim,)))
        elif val == 'UNKN':
            myembeddings.append(np.random.normal(size=(dim,)))
        else:
            try:
                myembeddings.append(model[val])
            except KeyError:
                print("OOV: {}".format(val))
                myembeddings.append(np.random.normal(size=(dim,)))

    myembeddings = np.array(myembeddings)
    return myembeddings

### Load train/test datasets

In [3]:
training_data = pd.read_csv("/kaggle/input/train.csv")
testing_data = pd.read_csv("/kaggle/input/test.csv")
labels = np.array(list(training_data['is_duplicate']))

### Preprocess all texts from train/test. 
This will take a while.

In [4]:
tr_q1_preprocessed = [preprocess(t, swords=stopwords) for t in training_data['question1']]
tr_q2_preprocessed = [preprocess(t, swords=stopwords) for t in training_data['question2']]

In [5]:
ts_q1_preprocessed = [preprocess(t, swords=stopwords) for t in testing_data['question1']]
ts_q2_preprocessed = [preprocess(t, swords=stopwords) for t in testing_data['question2']]

### Load the word embedding model
You can get it at https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit (1.5 GB download)

In [6]:
emb_mod = gensim.models.Word2Vec.load_word2vec_format("./assets/GoogleNews-vectors-negative300.bin", 
                                                      binary=True)

DeprecationWarning: Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.

###(Alternative) Learn the word embedding model
Instead of using an external word-embedding model, we might just learn our own from the provided data

In [7]:
all_texts = tr_q1_preprocessed+tr_q2_preprocessed+ts_q1_preprocessed+ts_q2_preprocessed

In [8]:
emb_mod = gensim.models.Word2Vec(all_texts, min_count=7, size=128)

2019-05-04 19:56:47,587 : INFO : collecting all words and their counts
2019-05-04 19:56:47,589 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-05-04 19:56:47,607 : INFO : PROGRESS: at sentence #10000, processed 53138 words, keeping 11633 word types
2019-05-04 19:56:47,625 : INFO : PROGRESS: at sentence #20000, processed 106530 words, keeping 16926 word types
2019-05-04 19:56:47,642 : INFO : PROGRESS: at sentence #30000, processed 159667 words, keeping 20930 word types
2019-05-04 19:56:47,660 : INFO : PROGRESS: at sentence #40000, processed 212333 words, keeping 24180 word types
2019-05-04 19:56:47,678 : INFO : PROGRESS: at sentence #50000, processed 265683 words, keeping 27183 word types
2019-05-04 19:56:47,696 : INFO : PROGRESS: at sentence #60000, processed 318740 words, keeping 29815 word types
2019-05-04 19:56:47,715 : INFO : PROGRESS: at sentence #70000, processed 371726 words, keeping 32168 word types
2019-05-04 19:56:47,733 : INFO : PROGRESS: at s

In [9]:
# You might want to train the model more to get better results
n_epochs = 5
for i in range(n_epochs)
    emb_mod.train(all_texts)

SyntaxError: invalid syntax (<ipython-input-9-95826a53f5d7>, line 3)

### Build a vocabulary of token identifiers and prepare word embedding matrix
Here we only add tokens present in the embedding model to the vocabulary

In [10]:
voc, rev_voc = build_vocab(all_texts, 
                           75000, emb_mod)
embs_m = get_embeddings(emb_mod, rev_voc, emb_mod.vector_size)



### Represent train/test texts with token identifiers
max_len of 24 tokens seems to be sufficient

In [11]:
v_tr_q1 = vectorize(tr_q1_preprocessed, voc, max_len=24)
v_tr_q2 = vectorize(tr_q2_preprocessed, voc, max_len=24)

In [12]:
v_ts_q1 = vectorize(ts_q1_preprocessed, voc, max_len=24)
v_ts_q2 = vectorize(ts_q2_preprocessed, voc, max_len=24)

### (Optional) Dump assets to disk
We might continue from there in case the kernel is reloaded

In [13]:
pickle.dump([v_tr_q1, v_tr_q2], open("vectorized_train", "wb"))
pickle.dump([v_ts_q1, v_ts_q2], open("vectorized_test", "wb"))
pickle.dump([voc, rev_voc], open("voc_rvoc", "wb"))
pickle.dump(embs_m, open("embedding_matrix", "wb"))

In [14]:
v_tr_q1, v_tr_q2 = pickle.load(open("./assets/vectorized_train", "rb"))
v_ts_q1, v_ts_q2 = pickle.load(open("./assets/vectorized_test", "rb"))
voc, rev_voc = pickle.load(open("./assets/voc_rvoc", "rb"))
embs_m = pickle.load(open("./assets/embedding_matrix", "rb"))

FileNotFoundError: [Errno 2] No such file or directory: './assets/vectorized_train'

### Define a neural network

In [15]:
MAXLEN = 24
DROPOUT = 0.5
LSTM_UNITS = 600
DENSE_UNITS = 600

In [16]:
def pairwise_dis(vests):
    x, y = vests
    return x-y

def pairwise_mul(vests):
    x, y = vests
    return x*y

def cosine_similarity(vests):
    x, y = vests
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return K.sum((x * y), axis=-1, keepdims=True)

def cosine_distance_output_shape(shapes):
    shape1, shape2 = shapes
    return shape1[0], 1

def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean((1 - y_true) * K.square(y_pred) + y_true * K.square(K.maximum(margin - y_pred, 0)))

In [17]:
def build_rnn_mk1_encoder(embs_matrix):
    """
    Basic Bidirectional LSTM encoder. 
    Word embedding layer is frozen to prevent overfitting.
    """
    inp = Input(shape=(MAXLEN,))
    emb = Embedding(embs_matrix.shape[0], embs_matrix.shape[1], input_length=MAXLEN, 
                    weights=[embs_matrix], trainable = False)(inp)
    ls1 = Bidirectional(LSTM(LSTM_UNITS))(emb)
    mod = Model(inputs=inp, outputs=ls1)
    return mod

def build_sim_net(input_shape):
    """
    MLP combining the representations of two question into one vector.
    Takes into account distanse and angle between the input vectors.
    For more information check out the blog post
    https://engineering.quora.com/Semantic-Question-Matching-with-Deep-Learning
    """
    input_a = Input(shape=(input_shape[1],))
    input_b = Input(shape=(input_shape[1],))
    
    mul_layer = Lambda(pairwise_mul, name='MultiplicationLayer')([input_a, input_b])
    dis_layer = Lambda(pairwise_dis, name='SubstractionLayer')([input_a, input_b])

    mer = concatenate([mul_layer, dis_layer])
    bnr = BatchNormalization()(mer)
    
    dr1 = Dropout(DROPOUT)(bnr)
    fc1 = Dense(DENSE_UNITS, activation='relu')(dr1)
    
    mod = Model(inputs=[input_a, input_b], outputs=fc1)
    return mod

def build_model(embs_matrix):
    """
    Combines the modules above into an end-to-end model
    predicting similarity scores for pairs of questions.
    
    Keep in mind that you can plug in just about anything in place of the encoder.
    As long as it predicts a fixed-length vector for each sentence, it should just work.
    """
    
    encoder = build_rnn_mk1_encoder(embs_matrix)
    simnet = build_sim_net(encoder.layers[-1].output_shape)
    
    input_a = Input(shape=(MAXLEN,))
    input_b = Input(shape=(MAXLEN,))
    
    enc_a = encoder(input_a)
    enc_b = encoder(input_b)
    
    fc1 = simnet([enc_a, enc_b])
    
    fc2 = Dense(1, activation='sigmoid')(fc1)
    
    model = Model(inputs=[input_a, input_b], outputs=fc2)
    feature_model = Model(inputs=[input_a, input_b], outputs=fc1)

    model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    
    return model, feature_model, encoder

### Build the neural network
The first one predicts similarity score, the second returns a feature vector.

In [18]:
mmod, fmod, encmod = build_model(embs_m)

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [19]:
encmod.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 24)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 24, 128)           9600000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1200)              3499200   
Total params: 13,099,200
Trainable params: 3,499,200
Non-trainable params: 9,600,000
_________________________________________________________________


### Do the train/val split

In [20]:
idx = list(range(len(v_tr_q1)))
random.shuffle(idx)
train_idx, val_idx = train_test_split(idx, train_size=0.9)



In [21]:
train_X = [v_tr_q1[train_idx], v_tr_q2[train_idx]]
train_Y = labels[train_idx]

val_X = [v_tr_q1[val_idx], v_tr_q2[val_idx]]
val_Y = labels[val_idx]

## Train!
Training this takes ~1 hour on a GTX 1080 with recent CUDA/CUDNN

In [22]:
checkpointer = ModelCheckpoint(filepath="quora_bilstm.hdf5",
                                       verbose=0, save_best_only=True)

In [23]:
hist = mmod.fit(train_X, train_Y, validation_data=(val_X, val_Y), 
                batch_size=256, epochs=20, 
                callbacks=[checkpointer])

Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Use tf.cast instead.


Train on 363861 samples, validate on 40429 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

In [24]:
mmod.load_weights("quora_bilstm.hdf5")

### Make a submission

In [25]:
predictions = mmod.predict([v_ts_q1, v_ts_q2]).reshape(-1,)

In [26]:
sub = pd.DataFrame({'test_id': testing_data['test_id'], 'is_duplicate': predictions})
sub.to_csv('sample_submission.csv', index=False)
sub.head()

Unnamed: 0,test_id,is_duplicate
0,0,0.004301
1,1,0.251524
2,2,0.406096
3,3,0.289482
4,4,0.626289
