In [2]:
import os 
import re
import sys
import time
import spacy

import numpy as np
import pandas as pd
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# exctract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools

In [2]:
# avoid decoding problems
df = pd.read_csv("train.csv",nrows=75000)
 
# encode questions to unicode
# https://stackoverflow.com/a/6812069
# ----------------- python 2 ---------------------
# df['question1'] = df['question1'].apply(lambda x: unicode(str(x),"utf-8"))
# df['question2'] = df['question2'].apply(lambda x: unicode(str(x),"utf-8"))
# ----------------- python 3 ---------------------
df['question1'] = df['question1'].apply(lambda x: str(x))
df['question2'] = df['question2'].apply(lambda x: str(x))

In [3]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
# merge texts
questions = list(df['question1']) + list(df['question2'])

tfidf = TfidfVectorizer(lowercase=False, )
tfidf.fit_transform(questions)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

- After we find TF-IDF scores, we convert each question to a weighted average of word2vec vectors by these scores.
- here we use a pre-trained GLOVE model which comes free with "Spacy".  https://spacy.io/usage/vectors-similarity
- It is trained on Wikipedia and therefore, it is stronger in terms of word semantics. 

In [8]:
# en_vectors_web_lg, which includes over 1 million unique vectors.
nlp = spacy.load('en')

vecs1 = []
# https://github.com/noamraph/tqdm
# tqdm is used to print the progress bar
for qu1 in tqdm(list(df['question1'])):
    doc1 = nlp(qu1) 
    # 384 is the number of dimensions of vectors 
    mean_vec1 = np.zeros([len(doc1), 384])
    for word1 in doc1:
        # word2vec
        vec1 = word1.vector
        # fetch idf score
        try:
            idf = word2tfidf[str(word1)]
        except:
            idf = 0
        # compute final vec
        mean_vec1 += vec1 * idf
    mean_vec1 = mean_vec1.mean(axis=0)
    vecs1.append(mean_vec1)
df['q1_feats_m'] = list(vecs1)


100%|██████████| 404290/404290 [4:07:18<00:00, 27.25it/s]  


In [4]:
q1_vect=pd.read_csv("q1_feats_m.csv",usecols=['q1_feats_m'],nrows=75000)
q1_vect['q1_feats_m']=q1_vect['q1_feats_m'].apply(lambda x: str(x))
q1_vect['q1_feats_m']=q1_vect['q1_feats_m'].apply(lambda x: x[1:-1])
q1_vect.head(3)

Unnamed: 0,q1_feats_m
0,1.21929923e+02 1.00083890e+02 7.24979097e+0...
1,-7.80709391e+01 5.48437400e+01 8.27384499e+0...
2,-5.35503471e+00 7.36718163e+01 1.43763914e+0...


In [5]:
z=[]
for i in tqdm(range(q1_vect.shape[0])): 
    a= [x.strip('\n') for x in q1_vect.q1_feats_m.values[i].split(' ') if x != '']
    z.append(a)
df_1=pd.DataFrame(z)
df_1.head(2)

100%|██████████| 75000/75000 [00:17<00:00, 4175.55it/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,121.929923,100.08389,72.4979097,115.641794,-48.3708777,34.6190416,-172.057801,-92.5026292,113.223287,50.562453,...,12.3976424,40.9095244,8.1502628,-15.1706951,18.0077097,6.16699657,-30.1241619,3.70090684,-1.7576942,-1.81805712
1,-78.0709391,54.84374,82.7384499,98.1918576,-51.2348266,55.0135268,-39.1407282,-82.6923568,45.1614954,-9.55630028,...,-21.9870804,-12.3892805,20.6679857,2.20271842,-17.1424538,-5.88097225,-10.1239596,-4.89065844,-13.0183909,-5.21928531


In [3]:
df_1=pd.read_csv("q1_vect.csv")

In [None]:
vecs2 = []
for qu2 in tqdm(list(df['question2'])):
    doc2 = nlp(qu2) 
    mean_vec2 = np.zeros([len(doc2), 384])
    for word2 in doc2:
        # word2vec
        vec2 = word2.vector
        # fetch idf score
        try:
            idf = word2tfidf[str(word2)]
        except:
            #print word
            idf = 0
        # compute final vec
        mean_vec2 += vec2 * idf
    mean_vec2 = mean_vec2.mean(axis=0)
    vecs2.append(mean_vec2)
df['q2_feats_m'] = list(vecs2)

In [6]:
q2_vect=pd.read_csv("q2_feats_m.csv",usecols=['q2_feats_m'],nrows=75000,verbose=True)
q2_vect['q2_feats_m']=q2_vect['q2_feats_m'].apply(lambda x: str(x))
q2_vect['q2_feats_m']=q2_vect['q2_feats_m'].apply(lambda x: x[1:-1])
q2_vect.head(3)

Tokenization took: 11934.39 ms
Type conversion took: 3343.81 ms
Parser memory cleanup took: 13.43 ms
Tokenization took: 5051.78 ms
Type conversion took: 606.50 ms
Parser memory cleanup took: 102.21 ms


Unnamed: 0,q2_feats_m
0,1.25983289e+02 9.56364930e+01 4.21147345e+0...
1,-1.06871908e+02 8.02903922e+01 7.90662889e+0...
2,7.07288865e+00 1.55133688e+01 1.84688759e+0...


In [8]:
z=[]
for i in tqdm(range(q2_vect.shape[0])): 
    a= [x.strip('\n') for x in q2_vect.q2_feats_m.values[i].split(' ') if x != '']
    z.append(a)
df_2=pd.DataFrame(z)
df_2.head(2)

100%|██████████| 75000/75000 [02:50<00:00, 438.75it/s] 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,125.983289,95.636493,42.1147345,95.4499858,-37.3863018,39.4000605,-148.116062,-87.8514811,110.371958,62.272823,...,16.1655927,33.0306723,7.01999725,-14.7939571,15.4375102,8.19965562,-25.0708348,1.57162189,1.60373546,0.305648521
1,-106.871908,80.2903922,79.0662889,59.3020661,-42.1753669,117.616711,-144.364265,-127.13153,22.962532,25.3975822,...,-4.90113214,-4.56538653,41.520755,-0.727565695,-16.4137692,-7.37377381,2.63887854,-7.40346268,2.70306858,0.408055723


In [4]:
df_2=pd.read_csv("q2_vect.csv")

In [5]:
#prepro_features_train.csv (Simple Preprocessing Feartures)
#nlp_features_train.csv (NLP Features)
if os.path.isfile('nlp_features_train.csv'):
    dfnlp = pd.read_csv("nlp_features_train.csv",nrows=75000,encoding='latin-1')
else:
    print("download nlp_features_train.csv from drive or run previous notebook")

if os.path.isfile('df_fe_without_preprocessing_train.csv'):
    dfppro = pd.read_csv("df_fe_without_preprocessing_train.csv",nrows=75000,encoding='latin-1')
else:
    print("download df_fe_without_preprocessing_train.csv from drive or run previous notebook")

In [11]:
df1 = dfnlp.drop(['qid1','qid2','question1','question2'],axis=1)
df2 = dfppro.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)
df_1 = df_1.drop(['Unnamed: 0'],axis=1)
df_2 = df_2.drop(['Unnamed: 0'],axis=1)
#df3_q1 = pd.DataFrame(df3.q1_feats_m.values.tolist(), index= df3.index)
#df3_q2 = pd.DataFrame(df3.q2_feats_m.values.tolist(), index= df3.index)

In [7]:
df1.head(2)

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,0,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154


In [8]:
df2.head(2)

Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,1,66,57,14,12,10.0,23.0,0.434783,2,0
1,1,4,1,51,88,8,13,4.0,20.0,0.2,5,3


In [12]:
df_1.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,121.929923,100.08389,72.49791,115.641794,-48.370878,34.619042,-172.057801,-92.502629,113.223287,50.562453,...,12.397642,40.909524,8.150263,-15.170695,18.00771,6.166997,-30.124162,3.700907,-1.757694,-1.818057
1,-78.070939,54.84374,82.73845,98.191858,-51.234827,55.013527,-39.140728,-82.692357,45.161495,-9.5563,...,-21.98708,-12.38928,20.667986,2.202718,-17.142454,-5.880972,-10.12396,-4.890658,-13.018391,-5.219285


In [13]:
df_2.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,125.983289,95.636493,42.114734,95.449986,-37.386302,39.40006,-148.116062,-87.851481,110.371958,62.272823,...,16.165593,33.030672,7.019997,-14.793957,15.43751,8.199656,-25.070835,1.571622,1.603735,0.305649
1,-106.871908,80.290392,79.066289,59.302066,-42.175367,117.616711,-144.364265,-127.13153,22.962532,25.397582,...,-4.901132,-4.565387,41.520755,-0.727566,-16.413769,-7.373774,2.638879,-7.403463,2.703069,0.408056


In [14]:
print("Number of features in nlp dataframe :", df1.shape[1])
print("Number of features in preprocessed dataframe :", df2.shape[1])
print("Number of features in question1 w2v  dataframe :", df_1.shape[1])
print("Number of features in question2 w2v  dataframe :", df_2.shape[1])
print("Number of features in final dataframe  :", df1.shape[1]+df2.shape[1]+df_1.shape[1]+df_2.shape[1])

Number of features in nlp dataframe : 17
Number of features in preprocessed dataframe : 12
Number of features in question1 w2v  dataframe : 384
Number of features in question2 w2v  dataframe : 384
Number of features in final dataframe  : 797


In [15]:
# storing the final features to csv file
if not os.path.isfile('final_features.csv'):
    df_1['id']=df1['id']
    df_2['id']=df1['id']
    df1  = df1.merge(df2, on='id',how='left')
    df2  = df_1.merge(df_2, on='id',how='left')
    result  = df1.merge(df2, on='id',how='left')
    result.to_csv('final_features.csv')

<h2> 4.1 Random train test split( 70:30) </h2>