In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
from scipy.sparse import hstack
import os , pickle
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm

import spacy

In [6]:
df = pd.read_csv("data/data_with_preprocess_2.csv")
df['question1'] = df['question1'].apply(lambda x: str(x))
df['question2'] = df['question2'].apply(lambda x: str(x))
print(df.shape)
df.head(2)

(404287, 32)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_common,word_total,word_share,freq_q1+q2,freq_q1-q2,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0,1,1,66,57,14,12,10.0,23.0,0.434783,2,0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,1.0,2.0,13.0,100,93,93,100,0.982759
1,1,3,4,what is the story of kohinoor koh i noor dia...,what would happen if the indian government sto...,0,4,2,51,88,8,13,4.0,20.0,0.2,6,2,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,1.0,5.0,12.5,86,63,66,75,0.596154


### For the ease of computation we will sample only 100k points

In [7]:
#sampling 100k
df = df.sample(n=100000,random_state=40)

In [8]:
#changing columns to numeric type
num_cols = df.drop(columns=['id', 'qid1', 'qid2', 'question1', 'question2']).columns
for i in num_cols:
    df[i] = df[i].apply(pd.to_numeric)

In [9]:
y = df['is_duplicate']
X = df[df.drop(columns=['id', 'qid1', 'qid2','is_duplicate']).columns.tolist()]
print(X.shape)
print(y.shape)

(100000, 28)
(100000,)


# Train test split

In [10]:
X_train,X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3,random_state=100)
print("Number of data points in train data :",X_train.shape)
print("Number of data points in test data :",X_test.shape)

Number of data points in train data : (70000, 28)
Number of data points in test data : (30000, 28)


# Handling text data

We have already cleaned the text data. Now we have to vectorize it. We mainly used 2 approached.

1. TFIDF
2. TFIDF weighted glove vectorization

## TFIDF

In [11]:
tfidf_vectorizer1 = TfidfVectorizer(lowercase=False,max_features= 20000)
trainqs1_tfidf = tfidf_vectorizer1.fit_transform(X_train['question1'])
testqs1_tfidf  = tfidf_vectorizer1.transform(X_test['question1'])
print(trainqs1_tfidf.shape)
print(testqs1_tfidf.shape)

(70000, 20000)
(30000, 20000)


In [12]:
tfidf_vectorizer2 = TfidfVectorizer(lowercase=False,max_features= 20000)
train_qs2_tfidf = tfidf_vectorizer2.fit_transform(X_train['question2'])
test_qs2_tfidf  = tfidf_vectorizer2.transform(X_test['question2'])
print(train_qs2_tfidf.shape)
print(test_qs2_tfidf.shape)

(70000, 20000)
(30000, 20000)


In [17]:
#Now we will hstack both the vectors
tfidf_train_vec = hstack((trainqs1_tfidf,train_qs2_tfidf))
tfidf_test_vec = hstack((testqs1_tfidf,test_qs2_tfidf)) 
print("train data shape",tfidf_train_vec.shape)
print("Test data shape ",tfidf_test_vec.shape)

train data shape (70000, 40000)
Test data shape  (30000, 40000)


In [18]:
# selecting other features
train_df = X_train.drop(columns=['question1', 'question2'])
test_df = X_test.drop(columns=['question1', 'question2'])

In [19]:
#we need to convert our data with features into sparse matrix so that we can combine our feature matrix and and tfidf vectors 
import scipy
train_sparse = scipy.sparse.csr_matrix(train_df)
test_sparse = scipy.sparse.csr_matrix(test_df)

In [20]:
# Now combining our tfidf and features into one 
tfidf_X_tr = hstack((train_sparse,tfidf_train_vec))
tfidf_X_test = hstack((test_sparse,tfidf_test_vec))
print("train data shape",tfidf_X_tr.shape)
print("Test data shape ",tfidf_X_test.shape)

train data shape (70000, 40026)
Test data shape  (30000, 40026)


In [21]:
#saving tfidf vectors
pickle.dump(tfidf_train_vec, open("data/tfidf_X_tr","wb"))
pickle.dump(tfidf_test_vec, open("data/tfidf_X_test","wb"))

## TFIDF Weighted Glove Vectors

In [None]:
# use spacy embedding
# run this from a normal command line
# !python -m spacy download en_core_web_md

In [None]:
# merge texts
questions = list(X_train['question1']) + list(X_train['question2'])
tfidf = TfidfVectorizer(lowercase=False)
tfidf.fit_transform(questions)

# dict key:word and value:tf-idf score
word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

In [None]:
# Load the spacy model that you have installed
import en_core_web_sm
nlp = en_core_web_sm.load()

# each vector will be of length 94..
doc = nlp("This is some text that I am processing with Spacy")
#example
doc[3].vector.shape

(96,)

In [None]:
nlp = spacy.load('en_core_web_sm')

vecs1 = []
# https://github.com/noamraph/tqdm
# tqdm is used to print the progress bar
for qu1 in tqdm(list(X_train['question1'])):
    doc1 = nlp(qu1) 
    # 384 is the number of dimensions of vectors 
    mean_vec1 = np.zeros([len(doc1), 96])
    for i,word1 in enumerate(doc1):
        # word2vec
        vec1 = word1.vector
        # fetch df score
        try:
            idf = word2tfidf[str(word1)]
        except:
            idf = 0
        # compute final vec
        mean_vec1[i] += vec1 * idf
    mean_vec1 = mean_vec1.mean(axis=0)
    vecs1.append(mean_vec1)

X_train_glove_q1 = vecs1

100%|██████████| 70000/70000 [12:29<00:00, 93.37it/s]


In [None]:
nlp = spacy.load('en_core_web_sm')

vecs1 = []
# https://github.com/noamraph/tqdm
# tqdm is used to print the progress bar
for qu1 in tqdm(list(X_train['question2'])):
    doc1 = nlp(qu1) 
    # 384 is the number of dimensions of vectors 
    mean_vec1 = np.zeros([len(doc1), 96])
    for i,word1 in enumerate(doc1):
        # word2vec
        vec1 = word1.vector
        # fetch df score
        try:
            idf = word2tfidf[str(word1)]
        except:
            idf = 0
        # compute final vec
        mean_vec1[i] += vec1 * idf
    mean_vec1 = mean_vec1.mean(axis=0)
    vecs1.append(mean_vec1)

X_train_glove_q2 = vecs1

100%|██████████| 70000/70000 [12:36<00:00, 92.53it/s]


In [None]:
nlp = spacy.load('en_core_web_sm')

vecs1 = []
# https://github.com/noamraph/tqdm
# tqdm is used to print the progress bar
for qu1 in tqdm(list(X_test['question1'])):
    doc1 = nlp(qu1) 
    # 384 is the number of dimensions of vectors 
    mean_vec1 = np.zeros([len(doc1), 96])
    for i,word1 in enumerate(doc1):
        # word2vec
        vec1 = word1.vector
        # fetch df score
        try:
            idf = word2tfidf[str(word1)]
        except:
            idf = 0
        # compute final vec
        mean_vec1[i] += vec1 * idf
    mean_vec1 = mean_vec1.mean(axis=0)
    vecs1.append(mean_vec1)

X_test_glove_q1 = vecs1

100%|██████████| 30000/30000 [05:23<00:00, 92.71it/s]


In [None]:
nlp = spacy.load('en_core_web_sm')

vecs1 = []
# https://github.com/noamraph/tqdm
# tqdm is used to print the progress bar
for qu1 in tqdm(list(X_test['question2'])):
    doc1 = nlp(qu1) 
    # 384 is the number of dimensions of vectors 
    mean_vec1 = np.zeros([len(doc1), 96])
    for i,word1 in enumerate(doc1):
        # word2vec
        vec1 = word1.vector
        # fetch df score
        try:
            idf = word2tfidf[str(word1)]
        except:
            idf = 0
        # compute final vec
        mean_vec1[i] += vec1 * idf
    mean_vec1 = mean_vec1.mean(axis=0)
    vecs1.append(mean_vec1)

X_test_glove_q2 = vecs1

100%|██████████| 30000/30000 [05:20<00:00, 93.64it/s]


In [None]:
X_train['q1_glove'] = X_train_glove_q1
X_train['q2_glove'] = X_train_glove_q2
X_test['q1_glove'] = X_test_glove_q1
X_test['q2_glove'] = X_test_glove_q2

In [None]:
train_glove = np.concatenate([np.array(X_train_glove_q1),np.array(X_train_glove_q2)],axis=1)
test_glove = np.concatenate([np.array(X_test_glove_q1),np.array(X_test_glove_q2)],axis=1)
train_glove.shape

(70000, 192)

In [None]:
glove_train_df = pd.DataFrame(train_glove,columns=[f'g_{i}' for i in range(train_glove.shape[1])])
glove_test_df = pd.DataFrame(test_glove,columns=[f'g_{i}' for i in range(test_glove.shape[1])])
glove_train_df.head()

Unnamed: 0,g_0,g_1,g_2,g_3,g_4,g_5,g_6,g_7,g_8,g_9,g_10,g_11,g_12,g_13,g_14,g_15,g_16,g_17,g_18,g_19,g_20,g_21,g_22,g_23,g_24,g_25,g_26,g_27,g_28,g_29,g_30,g_31,g_32,g_33,g_34,g_35,g_36,g_37,g_38,g_39,...,g_152,g_153,g_154,g_155,g_156,g_157,g_158,g_159,g_160,g_161,g_162,g_163,g_164,g_165,g_166,g_167,g_168,g_169,g_170,g_171,g_172,g_173,g_174,g_175,g_176,g_177,g_178,g_179,g_180,g_181,g_182,g_183,g_184,g_185,g_186,g_187,g_188,g_189,g_190,g_191
0,8.620377,-1.614727,-2.521583,-1.531998,2.092236,0.767564,5.339287,3.058473,10.082106,11.05451,-3.507912,-3.432895,-0.067182,-10.681097,1.08985,1.484099,-8.21673,4.523411,-0.183961,-3.073451,7.041418,4.111737,-5.499477,2.947024,-2.201095,-2.657204,0.895537,-2.32632,10.550176,-6.367206,4.771478,4.998177,0.459032,-3.96812,1.873935,-0.680668,4.821114,-9.623053,-10.167518,-2.572175,...,10.089836,3.936834,-5.538895,5.56327,8.331845,-7.516664,4.947571,0.551604,2.226217,-5.257148,4.66174,-4.827672,-2.469894,-2.716766,-4.011982,-1.669388,2.131926,-1.406734,-1.637095,0.84346,9.596346,-8.810573,-0.189235,0.185671,-0.870233,-8.29342,-5.647267,5.515711,3.732003,-7.587928,-2.701354,2.716441,-0.510052,6.447781,-2.949746,-0.962468,2.531881,6.010862,1.527958,1.486945
1,-0.976643,-2.725152,2.44842,-6.019244,10.633596,-0.612285,2.808925,2.506091,3.439625,2.74885,-2.935436,5.436929,-2.523156,6.865408,-9.358973,-8.060454,-4.945397,0.764525,2.253675,-4.320681,9.260277,-4.847854,-8.376614,-1.336396,6.339169,4.441661,-9.057021,-1.587038,-2.049387,2.388804,0.912349,-4.281859,-5.025249,-8.493344,-3.309496,-6.918417,6.168914,0.979653,-4.349751,4.50158,...,4.41864,3.693307,1.570967,5.141033,10.051354,2.914478,3.694912,1.172728,2.013224,-3.563407,4.454383,-6.441067,2.158532,-0.239046,-6.496768,-3.57729,5.334235,0.750009,3.977217,-3.131306,6.806299,-5.550982,-5.664017,-4.400164,-1.581056,3.991659,1.210462,0.192518,3.626633,-2.551011,3.914964,0.328768,-2.45402,0.270393,-9.784351,-0.421203,2.982829,3.221098,-0.51707,7.156083
2,1.364633,-0.612766,-0.917767,4.277427,10.759067,7.22073,2.62359,2.577329,10.591618,13.758568,-0.62,2.298428,-1.027227,-4.799027,-6.97742,-5.08317,-4.422765,3.306546,0.305656,-3.580782,3.97973,2.748984,0.035068,-2.376505,-1.09244,-1.779144,-5.030781,-1.390374,6.533338,-7.121969,3.819638,0.446124,-4.847996,-6.796444,1.181442,-3.438956,7.560942,-2.222238,-3.024427,-2.71182,...,5.248811,7.349653,-5.088327,5.053706,7.250805,0.519352,1.788629,0.233627,7.627815,-0.252892,11.181622,-7.246195,-1.489986,3.130861,-1.450135,-1.716376,7.907278,0.924074,-4.1894,-1.036867,6.969816,-9.251938,-0.795776,-0.299615,1.590454,-3.170906,-1.801627,0.208262,5.480814,-4.08117,-2.047053,-4.732654,-0.826601,6.169578,0.445108,-2.095889,4.520428,-1.048259,0.624426,6.458432
3,-5.402827,-1.453848,-3.300925,0.197813,1.61933,6.634061,1.017171,3.348628,1.350459,7.710702,-0.812362,-2.839082,-0.61261,-3.448848,-3.432873,-4.079187,-4.631437,-3.651631,-2.905281,1.757382,0.787178,0.16258,-2.535501,0.747244,-6.131663,9.82762,-1.258223,-5.423268,3.6237,-5.403328,3.619214,1.019382,0.074535,-7.099874,5.484165,-8.24423,5.117587,-1.87874,0.161722,-1.652342,...,2.966553,-1.523681,-1.829854,0.401545,1.645285,0.385318,7.061065,2.890736,1.050386,-3.336407,3.78518,-1.68105,-4.781838,2.129497,-1.0095,-3.689642,9.070033,5.875521,-2.148875,5.076125,8.54573,-5.828086,1.726607,-3.11013,-5.750603,-4.205308,-2.504673,5.086736,-2.342566,2.261608,-0.772201,1.317268,-5.662102,1.054329,-6.430517,-1.770199,-3.266285,1.832004,6.204855,4.19038
4,6.240948,-0.342004,0.272171,7.702876,3.548724,2.831506,9.050959,0.37995,8.114947,9.188508,-4.671504,1.200732,-5.432051,-10.548737,3.05998,-2.545802,-5.528193,-0.109835,-1.450191,-2.646301,6.511831,2.406859,-4.246453,-3.384232,-10.368488,1.443597,-5.060623,-1.779779,13.546983,-7.345306,9.435891,3.356754,0.303414,-5.911453,-0.803014,-8.052808,6.501953,-4.829181,-12.96877,-2.577738,...,2.730393,4.662781,-0.69009,2.061311,11.836681,1.818767,6.683187,3.40621,2.515244,-1.900384,6.867739,-7.587519,-1.230329,4.734106,-0.372573,-6.656595,3.070535,-0.184626,-2.353856,-0.784568,3.73444,-9.668799,-1.473442,-3.06506,-0.751881,3.627949,-5.079347,1.189592,5.002939,-0.516425,-2.429725,0.942915,-4.17439,8.085426,0.507519,-9.277117,4.342348,1.404644,5.940545,5.478593


In [None]:

X_train = X_train.drop(columns=['question1','question2']).reset_index(drop=True)
X_test = X_test.drop(columns=['question1','question2']).reset_index(drop=True)
print(X_train.shape)
print(X_test.shape)

(70000, 28)
(30000, 28)


In [None]:
# concatenating
X_train_d = pd.concat([X_train,glove_train_df],axis=1)
X_test_d = pd.concat([X_test,glove_test_df],axis=1)
print(X_train_d.shape)
print(X_test_d.shape)

(70000, 220)
(30000, 220)


In [None]:
X_train_d.to_csv('data/train_data.csv',index=False)
X_test_d.to_csv('data/test_data.csv',index=False)

In [None]:
y_train.to_csv('data/train_y.csv',index=False)
y_test.to_csv('data/test_y.csv',index=False)