<h2> 3.6 Featurizing text data with tfidf weighted word-vectors </h2>

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm

# exctract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy

In [22]:
# avoid decoding problems
df = pd.read_csv("train.csv")[:100000]
 
# encode questions to unicode
# https://stackoverflow.com/a/6812069
# ----------------- python 2 ---------------------
# df['question1'] = df['question1'].apply(lambda x: unicode(str(x),"utf-8"))
# df['question2'] = df['question2'].apply(lambda x: unicode(str(x),"utf-8"))
# ----------------- python 3 ---------------------
df['question1'] = df['question1'].apply(lambda x: str(x))
df['question2'] = df['question2'].apply(lambda x: str(x))
df["question"] = df["question1"].map(str) +\
                 df["question2"].map(str)
                    

In [23]:
# df.head(5)
# Y = df['is_duplicate']
# X = df
# del X['is_duplicate']

In [24]:
# del df['is_duplicate']

In [25]:
# X.head(5)

## Splitting the data before vectorizing

In [7]:
X_train,X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=0.3)

### TFIDF Vectorizer

In [64]:
vectorizer_tfidf_question_1 = TfidfVectorizer()

X_train_question_tfidf = vectorizer_tfidf_question_1.fit_transform(X_train['question'])
X_test_question_tfidf = vectorizer_tfidf_question_1.transform(X_test['question'])
print("Shape of X_train_question_1_tfidf matrix ",X_train_question_tfidf.shape)
print("Shape of X_test_question_1_tfidf matrix",X_test_question_tfidf.shape)

Shape of X_train_question_1_tfidf matrix  (283003, 74498)
Shape of X_test_question_1_tfidf matrix (121287, 74498)


In [65]:
print(vectorizer_tfidf_question_1.get_feature_names()[:5])
print(vectorizer_tfidf_question_1.idf_[:5])
# type(X_train_question_tfidf)

['00', '000', '0000', '00000000', '0000000000']
[ 9.41008158  6.86113257 12.86006913 12.86006913 12.86006913]


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
questions = list(df['question1']) + list(df['question2'])

tfidf = TfidfVectorizer(lowercase=False, )
tfidf.fit_transform(questions)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

In [10]:
# word2tfidf

- After we find TF-IDF scores, we convert each question to a weighted average of word2vec vectors by these scores.
- here we use a pre-trained GLOVE model which comes free with "Spacy".  https://spacy.io/usage/vectors-similarity
- It is trained on Wikipedia and therefore, it is stronger in terms of word semantics. 

In [28]:
# en_vectors_web_lg, which includes over 1 million unique vectors.
nlp = spacy.load('en_core_web_sm')

vecs1 = []
# https://github.com/noamraph/tqdm
# tqdm is used to print the progress bar
for qu1 in tqdm(list(df['question1'])):
    doc1 = nlp(qu1) 
    # 384 is the number of dimensions of vectors 
    mean_vec1 = np.zeros([len(doc1), len(doc1[0].vector)])
    for word1 in doc1:
        # word2vec
        vec1 = word1.vector
        # fetch df score
        try:
            idf = word2tfidf[str(word1)]
        except:
            idf = 0
        # compute final vec
        mean_vec1 += vec1 * idf
    mean_vec1 = mean_vec1.mean(axis=0)
    vecs1.append(mean_vec1)
df['q1_feats_m'] = list(vecs1)


100%|██████████| 100000/100000 [31:29<00:00, 52.91it/s]


In [30]:
df['q1_feats_m'].head(5)

0    [211.3274930715561, -144.7779631614685, -68.72...
1    [142.841313123703, -114.03367304801941, -110.2...
2    [82.5609240680933, -142.53362256288528, 0.7266...
3    [-125.97513282299042, -59.42235966771841, -67....
4    [299.5942276120186, -188.40130496025085, -21.2...
Name: q1_feats_m, dtype: object

In [31]:
vecs2 = []
for qu2 in tqdm(list(df['question2'])):
    doc2 = nlp(qu2) 
    mean_vec2 = np.zeros([len(doc2), len(doc2[0].vector)])
    for word2 in doc2:
        # word2vec
        vec2 = word2.vector
        # fetch df score
        try:
            idf = word2tfidf[str(word2)]
        except:
            #print word
            idf = 0
        # compute final vec
        mean_vec2 += vec2 * idf
    mean_vec2 = mean_vec2.mean(axis=0)
    vecs2.append(mean_vec2)
df['q2_feats_m'] = list(vecs2)

100%|██████████| 100000/100000 [35:42<00:00, 46.68it/s]


In [32]:
#prepro_features_train.csv (Simple Preprocessing Feartures)
#nlp_features_train.csv (NLP Features)
if os.path.isfile('nlp_features_train.csv'):
    dfnlp = pd.read_csv("nlp_features_train.csv",encoding='latin-1')[0:100000]
else:
    print("download nlp_features_train.csv from drive or run previous notebook")

if os.path.isfile('df_fe_without_preprocessing_train.csv'):
    dfppro = pd.read_csv("df_fe_without_preprocessing_train.csv",encoding='latin-1')
else:
    print("download df_fe_without_preprocessing_train.csv from drive or run previous notebook")

In [33]:
df1 = dfnlp.drop(['qid1','qid2','question1','question2'],axis=1)
df2 = dfppro.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)
df3 = df.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)
df3_q1 = pd.DataFrame(df3.q1_feats_m.values.tolist(), index= df3.index)
df3_q2 = pd.DataFrame(df3.q2_feats_m.values.tolist(), index= df3.index)

In [34]:
# dataframe of nlp features
df1.head()


Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,0,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154
2,2,0,0.399992,0.333328,0.399992,0.249997,0.399996,0.285712,0.0,1.0,4.0,12.0,66,66,54,54,0.166667
3,3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,12.0,36,36,35,40,0.039216
4,4,0,0.399992,0.199998,0.99995,0.666644,0.57142,0.30769,0.0,1.0,6.0,10.0,67,47,46,56,0.175


In [35]:
# data before preprocessing 
df2.head()

Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,1,66,57,14,12,10.0,23.0,0.434783,2,0
1,1,1,1,51,88,8,13,4.0,20.0,0.2,2,0
2,2,1,1,73,59,14,10,4.0,24.0,0.166667,2,0
3,3,1,1,50,65,11,9,0.0,19.0,0.0,2,0
4,4,1,1,76,39,13,7,2.0,20.0,0.1,2,0


In [36]:
# Questions 1 tfidf weighted word2vec
df3_q1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,211.327493,-144.777963,-68.724038,-154.424293,-90.765352,2.541932,136.953358,50.734912,-63.741751,57.34516,...,33.424773,-102.977361,144.956313,50.869276,-21.612809,35.889588,-71.494379,4.160346,-131.862783,72.127579
1,142.841313,-114.033673,-110.215671,-103.034122,-87.741411,15.190485,55.16869,100.432233,7.681387,178.259341,...,67.073067,20.522029,41.012347,-0.266897,95.69529,-36.636391,-121.102605,70.35359,-17.043896,-7.070883
2,82.560924,-142.533623,0.726614,-104.875443,-84.751818,22.521084,115.94203,50.422234,-112.165931,52.261222,...,61.786713,-50.324955,107.57519,-11.787163,-40.99372,-27.191091,-12.928715,1.654342,-100.863351,148.836797
3,-125.975133,-59.42236,-67.570983,-137.758572,-100.720235,87.775736,-23.037602,85.907325,27.63218,50.557094,...,117.428018,-13.144253,-1.37601,77.991531,23.797347,12.770571,-8.103193,122.957298,63.929072,-24.856102
4,299.594228,-188.401305,-21.267282,-271.166976,-186.462065,105.778991,170.331633,-69.645959,-96.292431,134.757989,...,61.03596,-172.486168,246.922066,29.651128,-27.454099,-27.704648,-62.081455,3.472992,-119.313633,108.889052


In [37]:
# Questions 2 tfidf weighted word2vec
df3_q2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
0,152.150878,-127.275134,-31.749491,-143.763975,-98.000371,9.522685,107.033391,37.073909,-36.444822,54.110217,...,29.179625,-108.492959,131.753666,42.551238,-15.014939,15.55976,-59.914421,-9.710103,-100.494285,58.079703
1,148.997841,-44.465574,-101.379184,-125.883334,-116.109336,44.848494,134.104035,26.338358,-77.135588,83.96167,...,118.257488,4.803217,56.62208,91.515271,138.080721,-30.453486,-21.980228,86.011098,-25.02278,11.250763
2,6.616661,-29.885251,-119.091992,-99.385166,-19.594293,-9.981105,142.606486,91.884172,51.449668,13.693213,...,129.177678,18.698502,59.104817,88.825091,-86.036293,72.855847,-11.356267,-60.35253,-100.171914,96.11861
3,-6.760001,-44.608786,-17.806797,-61.250319,-7.154842,16.542798,96.313375,-2.358248,-12.391162,-27.797972,...,70.816154,-30.719308,85.571856,50.214468,25.514955,-27.231886,22.942071,-4.807315,-46.899339,57.239975
4,95.189887,-70.992464,21.328272,-92.314399,-106.132173,10.319852,91.417332,-40.342474,-34.476539,55.976651,...,19.032317,-70.004583,87.14419,38.69322,-13.8246,-12.965596,47.150454,-24.741699,-52.968391,-2.464822


In [38]:
print("Number of features in nlp dataframe :", df1.shape[1])
print("Number of features in preprocessed dataframe :", df2.shape[1])
print("Number of features in question1 w2v  dataframe :", df3_q1.shape[1])
print("Number of features in question2 w2v  dataframe :", df3_q2.shape[1])
print("Number of features in final dataframe  :", df1.shape[1]+df2.shape[1]+df3_q1.shape[1]+df3_q2.shape[1])

Number of features in nlp dataframe : 17
Number of features in preprocessed dataframe : 12
Number of features in question1 w2v  dataframe : 96
Number of features in question2 w2v  dataframe : 96
Number of features in final dataframe  : 221


In [39]:
# storing the final features to csv file
if not os.path.isfile('final_features.csv'):
    df3_q1['id']=df1['id']
    df3_q2['id']=df1['id']
    df1  = df1.merge(df2, on='id',how='left')
    df2  = df3_q1.merge(df3_q2, on='id',how='left')
    result  = df1.merge(df2, on='id',how='left')
    result.to_csv('final_features.csv')

In [40]:
final_data = pd.read_csv('final_features.csv')

In [41]:
final_data.shape

(100000, 221)