### Feature Engineering on the test data

In [None]:
# Import libraries
import numpy as np
import _pickle as cPickle
import pandas as pd

import gensim
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

import nltk
nltk.download('stopwords')

from nltk import word_tokenize
stop_words = stopwords.words('english')

import wget


In [4]:
# Download the word2vec vectors pre-trained on the Google News corpus.
# The archive is large and wget.download() does not overwrite an existing file
# (it saves another copy under a new name), so skip the transfer when the
# archive is already present in the working directory.
import os

if not os.path.exists('GoogleNews-vectors-negative300.bin.gz'):
    wget.download('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz')

'GoogleNews-vectors-negative300.bin.gz'

In [23]:
# Lower-case both question strings, split them on whitespace and drop English stop words,
# then compute the Word Mover's Distance with the gensim word2vec model trained on the Google News corpus
def wmd(s1, s2):
    """Return the Word Mover's Distance between two questions.

    Both inputs are coerced with ``str``, lower-cased, split on whitespace and
    stripped of English stop words before being handed to
    ``model.wmdistance``.

    Args:
        s1: First question (any object; converted with ``str``).
        s2: Second question (any object; converted with ``str``).

    Returns:
        The value returned by ``model.wmdistance`` for the two token lists.
    """
    # Use the module-level ``stop_words`` list (loaded once next to the
    # imports) instead of re-reading the NLTK corpus on every call; this
    # function is invoked once per row of the data set.
    tokens1 = [w for w in str(s1).lower().split() if w not in stop_words]
    tokens2 = [w for w in str(s2).lower().split() if w not in stop_words]
    return model.wmdistance(tokens1, tokens2)

In [24]:
# Lower-case both question strings, split them on whitespace and drop English stop words,
# then compute the Word Mover's Distance with the gensim word2vec model trained on the Google News corpus, using L2-normalized vectors.
def norm_wmd(s1, s2):
    """Return the Word Mover's Distance computed with ``norm_model``.

    Both inputs are coerced with ``str``, lower-cased, split on whitespace and
    stripped of English stop words before being handed to
    ``norm_model.wmdistance``.

    Args:
        s1: First question (any object; converted with ``str``).
        s2: Second question (any object; converted with ``str``).

    Returns:
        The value returned by ``norm_model.wmdistance`` for the two token
        lists.
    """
    # Use the module-level ``stop_words`` list (loaded once next to the
    # imports) instead of re-reading the NLTK corpus on every call; this
    # function is invoked once per row of the data set.
    tokens1 = [w for w in str(s1).lower().split() if w not in stop_words]
    tokens2 = [w for w in str(s2).lower().split() if w not in stop_words]
    return norm_model.wmdistance(tokens1, tokens2)


In [50]:
# Lower-case the sentence, tokenize it, and drop stop words and tokens that are not purely alphabetic;
# then sum the word vectors of the remaining tokens and normalize the result to unit length

def sent2vec(s):
    """Return a unit-length sentence vector built from word vectors.

    The input is coerced with ``str``, lower-cased and tokenized with
    ``word_tokenize``. Stop words and tokens that are not purely alphabetic
    are dropped. The vectors ``model[token]`` of the remaining tokens are
    summed and the sum is divided by its Euclidean norm.

    Args:
        s: Sentence to embed (any object; converted with ``str``).

    Returns:
        The normalized sum of the word vectors. When no token contributes a
        vector the sum is the scalar ``0.0`` and the division yields ``nan``
        (callers in this notebook clean that up with ``np.nan_to_num``).
    """
    vectors = []
    for token in word_tokenize(str(s).lower()):
        if token in stop_words or not token.isalpha():
            continue
        try:
            vectors.append(model[token])
        except KeyError:
            # Token missing from the word2vec vocabulary: skip it. Only
            # KeyError is caught so that other failures (and Ctrl-C) are not
            # silently swallowed.
            continue
    v = np.array(vectors).sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

In [26]:
# Read the test set into a DataFrame and discard the 'test_id' column.
test = pd.read_csv('test.csv').drop(columns=['test_id'])

In [28]:
# Character length of each question (after str() coercion) and the q1 - q2 difference.
test['len_q1'] = test['question1'].map(lambda text: len(str(text)))
test['len_q2'] = test['question2'].map(lambda text: len(str(text)))
test['diff_len'] = test['len_q1'] - test['len_q2']

In [30]:
# Number of distinct characters in each question, not counting the space character.
test['len_char_q1'] = test['question1'].map(lambda text: len(set(str(text).replace(' ', ''))))
test['len_char_q2'] = test['question2'].map(lambda text: len(set(str(text).replace(' ', ''))))


In [31]:
# Number of whitespace-separated words in each question.
test['len_word_q1'] = test['question1'].map(lambda text: len(str(text).split()))
test['len_word_q2'] = test['question2'].map(lambda text: len(str(text).split()))

In [32]:
# Count the distinct lower-cased words that occur in both question1 and question2.
test['common_words'] = [
    len(set(str(q1).lower().split()) & set(str(q2).lower().split()))
    for q1, q2 in zip(test['question1'], test['question2'])
]


In [33]:
# fuzzywuzzy compares two strings and returns a similarity score out of 100.
# QRatio is computed here for every question pair.
test['fuzz_qratio'] = [
    fuzz.QRatio(str(q1), str(q2))
    for q1, q2 in zip(test['question1'], test['question2'])
]


In [34]:
# fuzzywuzzy WRatio score for every question pair.
test['fuzz_WRatio'] = [
    fuzz.WRatio(str(q1), str(q2))
    for q1, q2 in zip(test['question1'], test['question2'])
]

In [35]:
# fuzzywuzzy partial_ratio score for every question pair.
test['fuzz_partial_ratio'] = [
    fuzz.partial_ratio(str(q1), str(q2))
    for q1, q2 in zip(test['question1'], test['question2'])
]


In [None]:
# fuzzywuzzy partial_token_set_ratio score for every question pair.
test['fuzz_partial_token_set_ratio'] = [
    fuzz.partial_token_set_ratio(str(q1), str(q2))
    for q1, q2 in zip(test['question1'], test['question2'])
]


In [42]:
# fuzzywuzzy token_set_ratio score for every question pair.
test['fuzz_token_set_ratio'] = [
    fuzz.token_set_ratio(str(q1), str(q2))
    for q1, q2 in zip(test['question1'], test['question2'])
]


In [43]:
# fuzzywuzzy token_sort_ratio score for every question pair.
test['fuzz_token_sort_ratio'] = [
    fuzz.token_sort_ratio(str(q1), str(q2))
    for q1, q2 in zip(test['question1'], test['question2'])
]


In [45]:
# Load the pre-trained Google News word2vec vectors, then compute the
# Word Mover's Distance (wmd) for every question pair.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)
test['wmd'] = [
    wmd(q1, q2)
    for q1, q2 in zip(test['question1'], test['question2'])
]

In [47]:
# Load a second copy of the word2vec vectors, normalize them in place with
# init_sims(replace=True), then compute norm_wmd for every question pair.
norm_model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)
norm_model.init_sims(replace=True)
test['norm_wmd'] = [
    norm_wmd(q1, q2)
    for q1, q2 in zip(test['question1'], test['question2'])
]

In [None]:
from nltk import word_tokenize
nltk.download('punkt')

error_count = 0  # kept from the original cell; it is never updated here

# Apply sent2vec to every question, filling one 300-dimensional row per question.
# tqdm wraps the array itself rather than the enumerate object so that it
# knows the number of items and can display a total and an ETA.
question1_vectors = np.zeros((test.shape[0], 300))
for i, q in enumerate(tqdm(test.question1.values)):
    question1_vectors[i, :] = sent2vec(q)

question2_vectors = np.zeros((test.shape[0], 300))
for i, q in enumerate(tqdm(test.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

In [None]:
# Cosine distance between the two sentence vectors of each pair (NaNs replaced first).
test['cosine_distance'] = list(map(cosine,
                                   np.nan_to_num(question1_vectors),
                                   np.nan_to_num(question2_vectors)))

In [55]:
# City-block (L1) distance between the two sentence vectors of each pair.
test['cityblock_distance'] = list(map(cityblock,
                                      np.nan_to_num(question1_vectors),
                                      np.nan_to_num(question2_vectors)))


In [None]:
# scipy's jaccard distance between the two sentence vectors of each pair.
test['jaccard_distance'] = list(map(jaccard,
                                    np.nan_to_num(question1_vectors),
                                    np.nan_to_num(question2_vectors)))


In [None]:
# Bray-Curtis distance between the two sentence vectors of each pair.
test['braycurtis_distance'] = list(map(braycurtis,
                                       np.nan_to_num(question1_vectors),
                                       np.nan_to_num(question2_vectors)))


In [None]:
# Calculate distances between the question vectors, add per-vector statistics,
# then persist the vectors and the feature table.

# np.nan_to_num returns a cleaned copy of the whole matrix; build each copy
# once here instead of once per feature.
q1_clean = np.nan_to_num(question1_vectors)
q2_clean = np.nan_to_num(question2_vectors)

test['cosine_distance'] = [cosine(x, y) for (x, y) in zip(q1_clean, q2_clean)]

test['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(q1_clean, q2_clean)]

test['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(q1_clean, q2_clean)]

test['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(q1_clean, q2_clean)]

test['canberra_distance'] = [canberra(x, y) for (x, y) in zip(q1_clean, q2_clean)]

test['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(q1_clean, q2_clean)]

test['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(q1_clean, q2_clean)]

test['skew_q1vec'] = [skew(x) for x in q1_clean]
test['skew_q2vec'] = [skew(x) for x in q2_clean]
test['kur_q1vec'] = [kurtosis(x) for x in q1_clean]
test['kur_q2vec'] = [kurtosis(x) for x in q2_clean]

# The cleaned copies are only needed for the features above; release them.
del q1_clean, q2_clean

# Pickle the original (un-cleaned) vectors with the highest protocol (-1).
# The with-blocks make sure each file is flushed and closed.
with open('test_q1_w2v.pkl', 'wb') as q1_file:
    cPickle.dump(question1_vectors, q1_file, -1)
with open('test_q2_w2v.pkl', 'wb') as q2_file:
    cPickle.dump(question2_vectors, q2_file, -1)


test.to_csv('quora_test_features.csv', index=False)

In [58]:
# Canberra distance between the two sentence vectors of each pair.
test['canberra_distance'] = list(map(canberra,
                                     np.nan_to_num(question1_vectors),
                                     np.nan_to_num(question2_vectors)))

In [59]:
# Euclidean distance between the two sentence vectors of each pair.
test['euclidean_distance'] = list(map(euclidean,
                                      np.nan_to_num(question1_vectors),
                                      np.nan_to_num(question2_vectors)))

In [60]:
# Minkowski distance of order 3 between the two sentence vectors of each pair.
test['minkowski_distance'] = [
    minkowski(vec1, vec2, p=3)
    for vec1, vec2 in zip(np.nan_to_num(question1_vectors),
                          np.nan_to_num(question2_vectors))
]


In [61]:
# Skewness and kurtosis of each sentence vector (NaNs replaced first).
test['skew_q1vec'] = list(map(skew, np.nan_to_num(question1_vectors)))
test['skew_q2vec'] = list(map(skew, np.nan_to_num(question2_vectors)))
test['kur_q1vec'] = list(map(kurtosis, np.nan_to_num(question1_vectors)))
test['kur_q2vec'] = list(map(kurtosis, np.nan_to_num(question2_vectors)))


In [59]:
# Pickle the question vectors with the highest protocol (-1).
# The with-blocks make sure each file is flushed and closed instead of
# leaving the handle open until garbage collection.
with open('test_q1_w2v.pkl', 'wb') as q1_file:
    cPickle.dump(question1_vectors, q1_file, -1)
with open('test_q2_w2v.pkl', 'wb') as q2_file:
    cPickle.dump(question2_vectors, q2_file, -1)

In [62]:
# Write the engineered feature table to CSV, leaving out the index column.
test.to_csv(path_or_buf='quora_test_features.csv', index=False)