<h2> 3.6 Featurizing text data with tfidf weighted word-vectors </h2>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm

# exctract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy

In [2]:
# avoid decoding problems
df = pd.read_csv("train.csv")
 
# encode questions to unicode
# https://stackoverflow.com/a/6812069
# ----------------- python 2 ---------------------
# df['question1'] = df['question1'].apply(lambda x: unicode(str(x),"utf-8"))
# df['question2'] = df['question2'].apply(lambda x: unicode(str(x),"utf-8"))
# ----------------- python 3 ---------------------
df['question1'] = df['question1'].apply(lambda x: str(x))
df['question2'] = df['question2'].apply(lambda x: str(x))

In [3]:
print(df.shape)
df.head()

(404290, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


## Splitting the data into Train and test

In [4]:
y_true = df['is_duplicate']
df = df.drop('is_duplicate', axis=1)
print(len(y_true))
df.head()

404290


Unnamed: 0,id,qid1,qid2,question1,question2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?


In [5]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(df, y_true, stratify=y_true, test_size=0.3, random_state = 22)

In [6]:
# help(train_test_split)

In [7]:
print(X_train.shape)
print(X_test.shape)


(283003, 5)
(121287, 5)


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# merge texts
questions_train = list(X_train['question1']) + list(X_train['question2'])
questions_test = list(X_test['question1']) + list(X_test['question2'])

tfidf = TfidfVectorizer(lowercase=False,)
tfidf.fit(list(X_train['question1']) + list(X_train['question2']))
X_train_tfidf_q1 = tfidf.transform(list(X_train['question1']))
X_train_tfidf_q2 = tfidf.transform(list(X_train['question2']))

X_test_tfidf_q1 = tfidf.transform(list(X_test['question1']))
X_test_tfidf_q2 = tfidf.transform(list(X_test['question2']))




In [9]:
print(X_train_tfidf_q1.shape)
print(X_train_tfidf_q2.shape)

print(X_test_tfidf_q1.shape)
print(X_test_tfidf_q2.shape)


(283003, 93167)
(283003, 93167)
(121287, 93167)
(121287, 93167)


### Keeping top 5k features

In [10]:
top5k_tfidf = np.argsort(tfidf.idf_)[::-1][:5000]

In [11]:
X_train_tfidf_q1_5k = (X_train_tfidf_q1.T[top5k_tfidf]).T
X_train_tfidf_q2_5k = (X_train_tfidf_q2.T[top5k_tfidf]).T

X_test_tfidf_q1_5k = (X_test_tfidf_q1.T[top5k_tfidf]).T
X_test_tfidf_q2_5k = (X_test_tfidf_q2.T[top5k_tfidf]).T

In [12]:
print(X_train_tfidf_q1_5k.shape)
print(X_train_tfidf_q2_5k.shape)

print(X_test_tfidf_q1_5k.shape)
print(X_test_tfidf_q2_5k.shape)

(283003, 5000)
(283003, 5000)
(121287, 5000)
(121287, 5000)


In [13]:
# # en_vectors_web_lg, which includes over 1 million unique vectors.
# nlp = spacy.load('en_core_web_sm')

# vecs1 = []
# # https://github.com/noamraph/tqdm
# # tqdm is used to print the progress bar
# for qu1 in tqdm(list(df['question1'])):
#     doc1 = nlp(qu1) 
#     # 384 is the number of dimensions of vectors 
#     mean_vec1 = np.zeros([len(doc1), len(doc1[0].vector)])
#     for word1 in doc1:
#         # word2vec
#         vec1 = word1.vector
#         # fetch df score
#         try:
#             idf = word2tfidf[str(word1)]
#         except:
#             idf = 0
#         # compute final vec
#         mean_vec1 += vec1 * idf
#     mean_vec1 = mean_vec1.mean(axis=0)
#     vecs1.append(mean_vec1)
# df['q1_feats_m'] = list(vecs1)


In [14]:
# vecs2 = []
# for qu2 in tqdm(list(df['question2'])):
#     doc2 = nlp(qu2) 
#     mean_vec1 = np.zeros([len(doc1), len(doc2[0].vector)])
#     for word2 in doc2:
#         # word2vec
#         vec2 = word2.vector
#         # fetch df score
#         try:
#             idf = word2tfidf[str(word2)]
#         except:
#             #print word
#             idf = 0
#         # compute final vec
#         mean_vec2 += vec2 * idf
#     mean_vec2 = mean_vec2.mean(axis=0)
#     vecs2.append(mean_vec2)
# df['q2_feats_m'] = list(vecs2)

In [15]:
#prepro_features_train.csv (Simple Preprocessing Feartures)
#nlp_features_train.csv (NLP Features)
if os.path.isfile('nlp_features_train.csv'):
    dfnlp = pd.read_csv("nlp_features_train.csv",encoding='latin-1')
else:
    print("download nlp_features_train.csv from drive or run previous notebook")

if os.path.isfile('df_fe_without_preprocessing_train.csv'):
    dfppro = pd.read_csv("df_fe_without_preprocessing_train.csv",encoding='latin-1')
else:
    print("download df_fe_without_preprocessing_train.csv from drive or run previous notebook")

In [16]:
print(dfnlp.shape)
dfnlp.head(2)

(404290, 21)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,0.99998,0.833319,0.999983,0.999983,...,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,3,4,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,0,0.799984,0.399996,0.749981,0.599988,...,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154


In [17]:
print(dfppro.shape)
dfppro.head(2)

(404290, 17)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,1,1,66,57,14,12,10.0,23.0,0.434783,2,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,4,1,51,88,8,13,4.0,20.0,0.2,5,3


In [18]:
df1 = dfnlp.drop(['qid1','qid2','question1','question2'],axis=1)
df2 = dfppro.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)
# df3 = df.drop(['qid1','qid2','question1','question2','is_duplicate'],axis=1)

In [19]:
X_train_dfnlp,X_test_dfnlp, y_train_dfnlp, y_test_dfnlp = train_test_split(df1, y_true, stratify=y_true, test_size=0.3, random_state = 22)
X_train_dfppro,X_test_dfppro, y_train_dfppro, y_test_dfppro = train_test_split(df1, y_true, stratify=y_true, test_size=0.3, random_state = 22)

In [20]:
# df3_q1 = pd.DataFrame(df3.q1_feats_m.values.tolist(), index= df3.index)
# df3_q2 = pd.DataFrame(df3.q2_feats_m.values.tolist(), index= df3.index)

In [21]:
# dataframe of nlp features
X_train_dfnlp.head()

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
349310,349310,0,0.499988,0.399992,0.428565,0.299997,0.416663,0.333331,1.0,0.0,3.0,13.5,62,55,59,57,0.169492
247517,247517,0,0.999975,0.799984,0.57142,0.499994,0.61538,0.571424,1.0,0.0,1.0,13.5,84,73,73,81,0.442623
320565,320565,0,0.666644,0.499988,0.0,0.0,0.28571,0.181817,1.0,0.0,4.0,9.0,62,56,49,57,0.184211
379836,379836,0,0.799992,0.727266,0.999983,0.749991,0.823525,0.666663,1.0,1.0,4.0,19.0,89,82,85,81,0.542056
325603,325603,0,0.666644,0.499988,0.0,0.0,0.499988,0.249997,0.0,0.0,4.0,6.0,59,57,53,60,0.47619


In [22]:
# data before preprocessing 
X_train_dfppro.head()

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
349310,349310,0,0.499988,0.399992,0.428565,0.299997,0.416663,0.333331,1.0,0.0,3.0,13.5,62,55,59,57,0.169492
247517,247517,0,0.999975,0.799984,0.57142,0.499994,0.61538,0.571424,1.0,0.0,1.0,13.5,84,73,73,81,0.442623
320565,320565,0,0.666644,0.499988,0.0,0.0,0.28571,0.181817,1.0,0.0,4.0,9.0,62,56,49,57,0.184211
379836,379836,0,0.799992,0.727266,0.999983,0.749991,0.823525,0.666663,1.0,1.0,4.0,19.0,89,82,85,81,0.542056
325603,325603,0,0.666644,0.499988,0.0,0.0,0.499988,0.249997,0.0,0.0,4.0,6.0,59,57,53,60,0.47619


In [23]:
# Questions 1 tfidf weighted word2vec
# df3_q1.head()

In [24]:
# Questions 2 tfidf weighted word2vec
# df3_q2.head()

In [25]:
print("Number of features in nlp dataframe :", X_train_dfnlp.shape)
print("Number of features in preprocessed dataframe :", X_train_dfppro.shape)
print("Number of features in question1 tfidf  dataframe :", X_train_tfidf_q1_5k.shape)
print("Number of features in question2 tfidf  dataframe :", X_train_tfidf_q2_5k.shape)
print("Number of features in final dataframe  :", X_train_dfnlp.shape[1]+X_train_dfppro.shape[1]+X_train_tfidf_q1_5k.shape[1]+X_train_tfidf_q2_5k.shape[1])

Number of features in nlp dataframe : (283003, 17)
Number of features in preprocessed dataframe : (283003, 17)
Number of features in question1 tfidf  dataframe : (283003, 5000)
Number of features in question2 tfidf  dataframe : (283003, 5000)
Number of features in final dataframe  : 10034


In [26]:
# storing the final features to csv file
from scipy.sparse import hstack
import pickle

train_filename = 'X_train_final_features_tfidf'
test_filename = 'X_test_final_features_tfidf'

if not os.path.isfile(train_filename):
    print("file not preset")
    X_train = hstack((X_train_dfnlp, X_train_dfppro, X_train_tfidf_q1_5k, X_train_tfidf_q2_5k))
    file = open(train_filename, "wb")
    pickle.dump(X_train, file)
    file.close()
    
if not os.path.isfile(test_filename):
    print("file not preset")
    X_test = hstack((X_test_dfnlp, X_test_dfppro, X_test_tfidf_q1_5k, X_test_tfidf_q2_5k))
    file = open(test_filename, "wb")
    pickle.dump(X_test, file)
    file.close()
    
#     X_test.to_csv('final_features_test.csv')
#     df3_q1['id']=df1['id']
#     df3_q2['id']=df1['id']
#     df1  = df1.merge(df2, on='id',how='left')
#     df2  = df3_q1.merge(df3_q2, on='id',how='left')
#     result  = df1.merge(df2, on='id',how='left')
#     result.to_csv('final_features.csv')

file not preset
file not preset


In [27]:
type(X_train)

scipy.sparse.coo.coo_matrix