### Featurizing text data with tfidf weighted word-vectors 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time # to handle time-related tasks
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings('ignore')
import sys
import os
import tqdm 
# for advanced Natural Language Processing
import spacy

In [2]:
df = pd.read_csv('train.csv')
df['question1'] = df['question1'].apply(lambda x: str(x))
df['question2'] = df['question2'].apply(lambda x: str(x))

In [3]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [7]:
# merge texts 
questions = list(df['question1']) + list(df['question2'])

# contains insights about the less relevant and more relevant words in a document
tfidf = TfidfVectorizer(lowercase = False, )
tfidf.fit_transform(questions)

word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

In [8]:
!python -m spacy download en_core_web_lg 
!python -m spacy download en 

Collecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
     -------------------------------------- 777.4/777.4 MB 1.1 MB/s eta 0:00:00
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.2.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')
Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
     -------------------------------------- 13.9/13.9 MB 813.7 kB/s eta 0:00:00
[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the full
pipeline package name 'en_core_web_sm' instead.
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [9]:
nlp = spacy.load('en_core_web_lg')
x = nlp('man')
len(x.vector)

300

In [10]:
vecs = []
for qu2 in tqdm(list(df['question2'])):
    doc2 = nlp(qu2)
    mean_vec1 = np.zeroes([len(doc1), len(doc2[0].vector)])
    for word2 in doc2:
        vec2 = word2.vector
        try:
            idf = word2tfidf[str(word2)]
        except:
            idf = 0
        mean_vec2 += vec2 * idf
    mean_vec2 = mean_vec2.mean(axis = 0)
    vecs.append(mean_vec2)
df['q2_feats_m'] = list(vecs2)

TypeError: 'module' object is not callable

In [None]:
if os.path.isfile('nlp_features_train.csv'):
    dfnlp = pd.read_csv('nlp_features_train.csv', encoding='latin-1')
else:
    print('download nlp_features_train.csv from drive or run previoud notebook')

if os.path.isfile('df_fe_without_preprocessing_train.csv'):
    dfppro = pd.read_csv('de_fe_without_processing_train.csv', encoding='latin-1')
else:
    print('download df_fe_without_preprocessing_train.csv from drive or run previous notebook')

In [None]:
df1 = dfnlp.drop(['qid1', 'qid2', 'question1', 'question2'], axis = 1)
df2 = dfppro.drop(['qid1', 'qid2', 'question1', 'question2', 'is_duplicate'], axis = 1)
df3 = df.drop(['qid1', 'qid2', 'question1', 'question2', 'is_duplicate'], axis = 1)
df3_q1 = pd.DataFrame(df3.q1_feats_m.values.tolist(), index = df3.index)
df3_q2 = pd.DataFrame(df3.q2_feats_m.values.tolist(), index = df3.index)

In [None]:
df1.head()

In [None]:
df2.head()

In [None]:
df3_q1.head()

In [None]:
df3_q2.head()

In [None]:
print('Number of features in nlp dataframe = ', df1.shape[1])
print('Number of features in preprocesses dataframe = ', df2.shape[1])
print('Number of features in question1 w2v dataframe = ', df3_q1.shape[1])
print('Number of features in question2 w2v dataframe = ', df3_q2.shape[1])
print('Number of features in final dataframe = ', df1.shape[1] + df2.shape[1] + df3_q1.shape[1] + df3_q2.shape[1])

In [None]:
# storing the final features in csv file 
if not os.path.isfile('final_features.csv'):
    df3_q1['id'] = df1['id']
    df3_q2['id'] = df1['id']
    df1 = df1.merge(df2, on='id', how='left')
    df2 = df3_q1.merge(df3_q2, on='id', how='left')
    result = df1.merge(df2, on='id', how='left')
    result.to_csv('final_features.csv')