### Feature Engineering

In [None]:
# Import libraries

import numpy as np
import _pickle as cPickle
import pandas as pd

import gensim
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

import nltk
nltk.download('stopwords')

from nltk import word_tokenize
stop_words = stopwords.words('english')

import wget


In [None]:
# Download GloVe, an unsupervised learning algorithm for obtaining vector representations for words
wget.download('http://www-nlp.stanford.edu/data/glove.840B.300d.zip')

In [None]:
# Download word2vec pre-trained Google News corpus
wget.download('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz')

In [None]:
# Download quora questions train and test data
wget.download('http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv')

In [18]:
# Convert questions strings to lower case and eliminate stop words
# apply gensim word2vec model trained on Google News corpus
def wmd(s1, s2):
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    stop_words = stopwords.words('english')
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return model.wmdistance(s1, s2)

In [19]:
# Convert questions strings to lower case and eliminate stop words
# apply gensim word2vec model trained Google News corpus with precomputed L2-normalized vectors.
def norm_wmd(s1, s2):
    s1 = str(s1).lower().split()
    s2 = str(s2).lower().split()
    stop_words = stopwords.words('english')
    s1 = [w for w in s1 if w not in stop_words]
    s2 = [w for w in s2 if w not in stop_words]
    return norm_model.wmdistance(s1, s2)


In [48]:
# Convert words to lower case, tokenize words, remove stop words and leave only alphabetic characters in words
# vectorize words, and normalize the result

def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

In [33]:
# Import the data into pandas dataframe
data = pd.read_csv('quora-question-pairs/train.csv')

In [34]:
# Remove id columns
data = data.drop(['id', 'qid1', 'qid2'], axis=1)
data.head()

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [35]:
#Get the length of questions strings and calculate the difference
data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
data['diff_len'] = data.len_q1 - data.len_q2

In [36]:
#Get the number of unique chatacters in each string, exluding the white space
data['len_char_q1'] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))


In [37]:
#Get the number of words in each string
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))

In [38]:
#Find the number of common words in q1 and q2
data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)


In [39]:
# Using fuzzywuzzy library which compares two strings and outputs a score out of 100, 
# that denotes two string are equal by giving similarity index

data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)

In [40]:
data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)

In [41]:
data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

In [42]:
data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

In [43]:
# Build word2vec model and appy to question strings
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

In [45]:
# Build word2vec model, normalize vectors, and appy to question strings
norm_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model.init_sims(replace=True)
data['norm_wmd'] = data.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)

In [46]:
question1_vectors = np.zeros((data.shape[0], 300))
error_count = 0

In [49]:
#Apply sent2vec function to question1_vectors and question2_vectors
for i, q in tqdm(enumerate(data.question1.values)):
    question1_vectors[i, :] = sent2vec(q)

  
404290it [05:19, 1265.83it/s]


In [50]:
question2_vectors  = np.zeros((data.shape[0], 300))
for i, q in tqdm(enumerate(data.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

  
404290it [03:43, 1812.20it/s]


In [51]:
# Calculate distances between questions trings
data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [52]:
data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

In [53]:
data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

  dist = np.double(unequal_nonzero.sum()) / np.double(nonzero.sum())


In [54]:
data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

In [55]:
data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

In [56]:
data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

In [None]:
data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]


In [58]:
data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]


In [59]:
cPickle.dump(question1_vectors, open('q1_w2v.pkl', 'wb'), -1)
cPickle.dump(question2_vectors, open('q2_w2v.pkl', 'wb'), -1)

In [60]:
data.to_csv('quora_features.csv', index=False)

### Building model

In [None]:
# Import libraries

from keras import optimizers
from keras import backend as K

from keras.callbacks import ModelCheckpoint

from keras.layers import Embedding, Input
from keras.layers import TimeDistributed, Lambda
from keras.layers import Convolution1D, GlobalMaxPooling1D

from keras.layers.advanced_activations import PReLU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization
from keras.layers.recurrent import LSTM, GRU

from keras.models import Model
from keras.preprocessing import sequence, text

from keras.utils import np_utils


In [None]:
data = pd.read_csv('quora_features.csv')

In [10]:
# Define X and y, tokenize questions

y = data.is_duplicate.values

tk = text.Tokenizer(num_words=200000)

max_len = 40
tk.fit_on_texts(list(data.question1.values.astype(str)) + list(data.question2.values.astype(str)))
x1 = tk.texts_to_sequences(data.question1.values.astype(str))
x1 = sequence.pad_sequences(x1, maxlen=max_len)

x2 = tk.texts_to_sequences(data.question2.values.astype(str))
x2 = sequence.pad_sequences(x2, maxlen=max_len)

word_index = tk.word_index

ytrain_enc = np_utils.to_categorical(y)

In [None]:
from tqdm import tqdm

embeddings_index = {}
f = open('glove.840B.300d.txt', encoding='utf-8')
for line in tqdm(f):
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
import pickle
pickle.dump(embeddings_index, open("embeddings.p", "wb")) 

In [24]:
embeddings_index_large = pickle.load( open( "embeddings.p", "rb" ) )

In [28]:
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index_large.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


100%|██████████| 95596/95596 [00:00<00:00, 294692.98it/s]


In [None]:
max_features = 200000
filter_length = 5
num_filter = 64
pool_length = 4

input_1 = Input(shape=(40,))
embedding_1 = Embedding(input_dim=len(word_index) + 1,
                        output_dim=300,
                        weights=[embedding_matrix],
                        input_length=40,
                        trainable=False)(input_1)

timedistributed_1 = TimeDistributed(Dense(300, activation='relu'))(embedding_1)
lambda_1 = Lambda(lambda x: K.sum(x, axis=1), output_shape=(300,))(timedistributed_1)


input_2 = Input(shape=(40,))
embedding_2 = Embedding(input_dim=len(word_index) + 1,
                        output_dim=300,
                        weights=[embedding_matrix],
                        input_length=40,
                        trainable=False)(input_2)

timedistributed_2 = TimeDistributed(Dense(300, activation='relu'))(embedding_1)
lambda_2 = Lambda(lambda x: K.sum(x, axis=1), output_shape=(300,))(timedistributed_1)



input_3 = Input(shape=(40,))
embedding_3 = Embedding(input_dim=len(word_index) + 1,
                        output_dim=300,
                        weights=[embedding_matrix],
                        input_length=40,
                        trainable=False)(input_3)

convolution_3 = Convolution1D(nb_filter=num_filter,
                         filter_length=filter_length,
                         border_mode='valid',
                         activation='relu',
                         subsample_length=1)(embedding_3)

dropout_3 = Dropout(0.2)(convolution_3)
convolution_3_2 = Convolution1D(filters=num_filter,
                         filter_length=filter_length,
                         border_mode='valid',
                         activation='relu',
                         subsample_length=1)(dropout_3)


globalmaxpooling1d_3 = GlobalMaxPooling1D()(convolution_3_2)

dropout_3_2 = Dropout(0.2)(globalmaxpooling1d_3)

dense_3 = Dense(300)(globalmaxpooling1d_3)

dropout_3_3 = Dropout(0.2)(dense_3)
batchnormalization_3 = BatchNormalization()(dropout_3_3)


input_4 = Input(shape=(40,))
embedding_4 = Embedding(input_dim=len(word_index) + 1,
                        output_dim=300,
                        weights=[embedding_matrix],
                        input_length=40,
                        trainable=False)(input_4)

convolution_4 = Convolution1D(filters=num_filter,
                         filter_length=filter_length,
                         border_mode='valid',
                         activation='relu',
                         subsample_length=1)(embedding_4)

dropout_4 = Dropout(0.2)(convolution_4)
convolution_4_2 = Convolution1D(filters=num_filter,
                         filter_length=filter_length,
                         border_mode='valid',
                         activation='relu',
                         subsample_length=1)(dropout_4)


globalmaxpooling1d_4 = GlobalMaxPooling1D()(convolution_4_2)

dropout_4_2 = Dropout(0.2)(globalmaxpooling1d_4)

dense_4 = Dense(300)(globalmaxpooling1d_4)

dropout_4_3 = Dropout(0.2)(dense_4)
batchnormalization_4 = BatchNormalization()(dropout_4_3)

input_5 = Input(shape=(40,))
embedding_5 = Embedding(input_dim=len(word_index) + 1,
                        output_dim=300,
                       input_length=40)(input_5)

dropout_5 = Dropout(0.2)(embedding_5)
ltsm_5 = LSTM(300)(dropout_5)
dropout_5_2 = Dropout(0.2)(ltsm_5)

input_6 = Input(shape=(40,))
embedding_6 = Embedding(input_dim=len(word_index) + 1,
                        output_dim=300,
                       input_length=40)(input_6)

dropout_6 = Dropout(0.2)(embedding_6)
ltsm_6 = LSTM(300)(dropout_6)
dropout_6_2 = Dropout(0.2)(ltsm_6)


merged = concatenate([lambda_1, 
                      lambda_2, 
                      batchnormalization_3, 
                      batchnormalization_4,
                      dropout_5_2,
                      dropout_6_2])

m_dense = Dense(300)(merged)
m_relu = PReLU()(m_dense)
m_dropout = Dropout(0.2)(m_relu)
m_batch = BatchNormalization()(m_dropout)

m2_dense = Dense(300)(m_batch)
m2_relu = PReLU()(m2_dense)
m2_dropout = Dropout(0.2)(m2_relu)
m2_batch = BatchNormalization()(m2_dropout)


m3_dense = Dense(300)(m2_batch)
m3_relu = PReLU()(m3_dense)
m3_dropout = Dropout(0.2)(m3_relu)
m3_batch = BatchNormalization()(m3_dropout)

m4_dense = Dense(300)(m3_batch)
m4_relu = PReLU()(m4_dense)
m4_dropout = Dropout(0.2)(m4_relu)
m4_batch = BatchNormalization()(m4_dropout)

m5_dense = Dense(300)(m4_batch)
m5_relu = PReLU()(m_dense)
m5_dropout = Dropout(0.2)(m_relu)
m5_batch = BatchNormalization()(m5_dropout)


dense_out = Dense(1, activation='sigmoid')(m5_batch)

# build and compile model
model = Model(inputs=[input_1, 
                      input_2, 
                      input_3, 
                      input_4,
                      input_5, 
                      input_6], outputs=[dense_out])

model.compile(optimizers.Adam(), metrics=['accuracy'], loss='binary_crossentropy')

checkpoint = ModelCheckpoint('weights.h5', monitor='val_acc', save_best_only=True, verbose=2)
model.fit([x1, x2, x1, x2, x1, x2 ], y=y, batch_size=384, epochs=40,
                 verbose=1, validation_split=0.1, shuffle=True, callbacks=[checkpoint])



Train on 363861 samples, validate on 40429 samples
Epoch 1/40

Epoch 00001: val_acc improved from -inf to 0.78369, saving model to weights.h5
Epoch 2/40

Epoch 00002: val_acc improved from 0.78369 to 0.79745, saving model to weights.h5
Epoch 3/40

Epoch 00003: val_acc improved from 0.79745 to 0.79965, saving model to weights.h5
Epoch 4/40

Epoch 00004: val_acc did not improve from 0.79965
Epoch 5/40

Epoch 00005: val_acc improved from 0.79965 to 0.80272, saving model to weights.h5
Epoch 6/40

Epoch 00006: val_acc did not improve from 0.80272
Epoch 7/40

Epoch 00007: val_acc did not improve from 0.80272
Epoch 8/40

Epoch 00008: val_acc did not improve from 0.80272
Epoch 9/40

Epoch 00009: val_acc did not improve from 0.80272
Epoch 10/40

Epoch 00010: val_acc improved from 0.80272 to 0.80583, saving model to weights.h5
Epoch 11/40

Epoch 00011: val_acc did not improve from 0.80583
Epoch 12/40

Epoch 00012: val_acc did not improve from 0.80583
Epoch 13/40

Epoch 00013: val_acc did not imp

<keras.callbacks.History at 0x7ff026616ac8>

### Test Data Feature Engineering

In [None]:
# Import the test data into pandas dataframe
test = pd.read_csv('test.csv')
test = test.drop(['test_id'], axis=1)

#Get the length of questions strings and calculate the difference
test['len_q1'] = test.question1.apply(lambda x: len(str(x)))
test['len_q2'] = test.question2.apply(lambda x: len(str(x)))
test['diff_len'] = test.len_q1 - test.len_q2

#Get the number of unique chatacters in each string, exluding the white space
test['len_char_q1'] = test.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
test['len_char_q2'] = test.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))

#Get the number of words in each string
test['len_word_q1'] = test.question1.apply(lambda x: len(str(x).split()))
test['len_word_q2'] = test.question2.apply(lambda x: len(str(x).split()))

#Find the number of common words in q1 and q2
test['common_words'] = test.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)

# Using fuzzywuzzy library which compares two strings and outputs a score out of 100, 
# that denotes two string are equal by giving similarity index

test['fuzz_qratio'] = test.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
test['fuzz_WRatio'] = test.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
test['fuzz_partial_ratio'] = test.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
test['fuzz_partial_token_set_ratio'] = test.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
test['fuzz_token_set_ratio'] = test.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
test['fuzz_token_sort_ratio'] = test.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

# Build word2vec model and appy to question strings
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
test['wmd'] = test.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

# Build word2vec model, normalize vectors, and appy to question strings
norm_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model.init_sims(replace=True)
test['norm_wmd'] = test.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)


from nltk import word_tokenize
nltk.download('punkt')

question1_vectors = np.zeros((test.shape[0], 300))
error_count = 0

#Apply sent2vec function to question1_vectors and question2_vectors
for i, q in tqdm(enumerate(test.question1.values)):
    question1_vectors[i, :] = sent2vec(q)
    
question2_vectors  = np.zeros((test.shape[0], 300))
for i, q in tqdm(enumerate(test.question2.values)):
    question2_vectors[i, :] = sent2vec(q)    
    
    
# Calculate distances between questions trings
test['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
test['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
test['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
test['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
test['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
test['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]
test['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors),
                                                          np.nan_to_num(question2_vectors))]

test['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
test['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
test['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
test['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]

cPickle.dump(question1_vectors, open('test_q1_w2v.pkl', 'wb'), -1)
cPickle.dump(question2_vectors, open('test_q2_w2v.pkl', 'wb'), -1)

test.to_csv('quora_test_features.csv', index=False)    

### Get Predictions on the Test Data

In [None]:
# Define test X and y, tokenize questions

x1_t = tk.texts_to_sequences(test.question1.values.astype(str))
x1_t = sequence.pad_sequences(x1_t, maxlen=max_len)

x2_t = tk.texts_to_sequences(test.question2.values.astype(str))
x2_t = sequence.pad_sequences(x2_t, maxlen=max_len)

In [None]:
# Get test data set predictions
predict = model.predict([x1_t, x2_t, x1_t, x2_t, x1_t, x2_t], batch_size=384, verbose=2)

In [None]:
#Prepare csv files for submission 
raw_test = pd.read_csv('test.csv')
submission = pd.DataFrame({"test_id": raw_test["test_id"], "is_duplicate": predict.ravel()})

In [None]:
# Map through predictions and apply .50 threshold on prediction values
submission_50 = submission.copy()
submission_50['is_duplicate'] = [1 if val > .5 else 0 for val in submission_50['is_duplicate']]

# Map through predictions and apply .80, .95, .99, and .9999 threshold on prediction values

submission_80 = submission.copy()
submission_80['is_duplicate'] = [1 if val > .80 else 0 for val in submission_80['is_duplicate']]
submission_80.to_csv("predictions_80.csv", index=False)

submission_95 = submission.copy()
submission_95['is_duplicate'] = [1 if val > .95 else 0 for val in submission_95['is_duplicate']]
submission_95.to_csv("predictions_95.csv", index=False)

submission_99 = submission.copy()
submission_99['is_duplicate'] = [1 if val > .99 else 0 for val in submission_99['is_duplicate']]
submission_99.to_csv("predictions_99.csv", index=False)

submission_9999 = submission.copy()
submission_9999['is_duplicate'] = [1 if val > .9999 else 0 for val in submission_9999['is_duplicate']]
submission_9999.to_csv("predictions_9999.csv", index=False)