In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from string import punctuation
import scipy #library for scientific calculations
import datetime

import re
from sklearn import pipeline
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from xgboost import XGBClassifier

In [2]:
BASE_DIR = './dataset/'
train = pd.read_csv(f'{BASE_DIR}train.csv')
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
train.is_duplicate.value_counts()

0    255027
1    149263
Name: is_duplicate, dtype: int64

In [4]:
def pad_str(s):
    return ' '+s+' '
## cleaning the questions
def normalize_text(text):
    SPECIAL_TOKENS = {'non-ascii': 'non_ascii_word'}

    if pd.isnull(text) or len(text)==0:
        return ''

    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)

    text = re.sub('[^\x00-\x7F]+', pad_str(SPECIAL_TOKENS['non-ascii']), text) 

    #Removing Punctuations
    text = [word for word in text if word not in punctuation]
    text = ''.join(text)
    text = text.lower()

    # Return a list of words
    return text

In [5]:
train['question1'] = train['question1'].apply(normalize_text)
train['question2'] = train['question2'].apply(normalize_text)
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,1,3,4,what is the story of kohinoor kohinoor diamond,what would happen if the indian government sto...,0
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when math2324math is divide...,0
4,4,9,10,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0


In [6]:
# train.to_csv('./processed_train.csv', index=False)

### Bag of Words + XgBoost

In [11]:
#using word vector of word_count and frequency withpout capturing the meaning of word
#r'\w{1,}' indiactes 1 or more word

CV = CountVectorizer(analyzer='word', stop_words='english', token_pattern=r'\w{1,}')
q1_trans = CV.fit_transform(train['question1'].values)
q2_trans = CV.fit_transform(train['question2'].values)

In [12]:
#scipy.sparse.hstack will stack sparse matrix columnwise, and stacking them side by side

X = scipy.sparse.hstack((q1_trans, q2_trans))
y = train.is_duplicate.values

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.30, random_state = 42)

In [14]:
#gradient Boosting Model used
#start time
st = datetime.datetime.now()

classifier1 = XGBClassifier(
    max_depth=50, 
    n_estimators=80, 
    learning_rate=0.1, 
    colsample_bytree=.7, 
    gamma=0, 
    reg_alpha=4, 
    objective='binary:logistic', 
    eta=0.3, 
    silent=1, 
    subsample=0.8,
    use_label_encoder=False
)

#fitting the model
print(classifier1.fit(X_train, y_train))
#predicting if pair is duplicate or not
prediction_CV = classifier1.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, prediction_CV))
print("Accuracy score: \n", accuracy_score(y_test, prediction_CV))
print("Classification report:\n", classification_report(y_test, prediction_CV))
print("F1 Score:\n ",f1_score(y_test, prediction_CV))

et = datetime.datetime.now()
print("Code run-time: ", et-st)



Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, eta=0.3, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=80, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=4, reg_lambda=1, scale_pos_weight=1,
              silent=1, subsample=0.8, tree_method='exact',
              validate_parameters=1, verbosity=None)
Confusion Matrix:
 [

### Tf-IDF (word level) + XgBoost

In [15]:
#5000 features were used for tfidf vectorizer

tfidf = TfidfVectorizer(analyzer='word', max_features=5000, token_pattern=r'\w{1,}')

q1word_trans = tfidf.fit_transform(train['question1'].values)
q2word_trans = tfidf.fit_transform(train['question2'].values)

X = scipy.sparse.hstack((q1word_trans,q2word_trans))
y = train.is_duplicate.values

In [16]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)

In [17]:
# Xg Boost classifier for word level vectorizer

st = datetime.datetime.now()

classifier2 = XGBClassifier(
    max_depth=50, 
    n_estimators=80, 
    learning_rate=0.1, 
    colsample_bytree=.7, 
    gamma=0, reg_alpha=4, 
    objective='binary:logistic', 
    eta=0.3, 
    silent=1, 
    subsample=0.8,
    use_label_encoder=False)

#fitting the model with traing data
print(classifier2.fit(X_train, y_train))

#predicting the test data
prediction_tfidf = classifier2.predict(X_test)

#Performance evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, prediction_tfidf))
print("Accuracy score: \n", accuracy_score(y_test, prediction_tfidf))
print("Classification report:\n", classification_report(y_test, prediction_tfidf))
print("F1 Score:\n ",f1_score(y_test, prediction_tfidf))

et = datetime.datetime.now()
print("Code run-time: ", et-st)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, eta=0.3, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=80, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=4, reg_lambda=1, scale_pos_weight=1,
              silent=1, subsample=0.8, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, ...)
Confus

### Tf-Idf (ngram level) + XgBoost

In [18]:
#TF-IDF ngram level vectorizer 
#5000 features were used for tfidf vectorizer
#r'\w{1,}'  indicates more than 1 word
#ngram_range = (1,3) means 2 and 3 features are used

tfidf = TfidfVectorizer(analyzer='word',ngram_range=(1,3), max_features=5000, token_pattern=r'\w{1,}')

q1ngram_trans = tfidf.fit_transform(train['question1'].values)
q2ngram_trans = tfidf.fit_transform(train['question2'].values)

X = scipy.sparse.hstack((q1ngram_trans,q2ngram_trans))
y = train.is_duplicate.values

In [19]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.30, random_state = 42)


In [20]:
# Xg Boost classifier for ngram_range=(1,3) level vectorizer

st = datetime.datetime.now()

classifier3 = XGBClassifier(
    max_depth=50, n_estimators=80, 
    learning_rate=0.1, colsample_bytree=.7, 
    gamma=0, reg_alpha=4, 
    objective='binary:logistic', eta=0.3, 
    silent=1, subsample=0.8
)

#fitting the model with traing data
print(classifier3.fit(X_train, y_train))

#predicting the test data
prediction_tfidf = classifier3.predict(X_test)

#Performance evaluation
print("ngram_range Confusion Matrix:\n", confusion_matrix(y_test, prediction_tfidf))
print("ngram_range Accuracy score: \n", accuracy_score(y_test, prediction_tfidf))
print("ngram_range Classification report:\n", classification_report(y_test, prediction_tfidf))
print("ngram_range F1 Score:\n ",f1_score(y_test, prediction_tfidf))

et = datetime.datetime.now()
print("Code run-time: ", et-st)



Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, eta=0.3, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=80, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=4, reg_lambda=1, scale_pos_weight=1,
              silent=1, subsample=0.8, tree_method='exact',
              validate_parameters=1, verbosity=None)
ngram_range Confusio

In [21]:
#TF-IDF ngram level vectorizer 
#5000 features were used for tfidf vectorizer
#r'\w{1,}'  indicates more than 1 word
#ngram_range = (2,3) means 2 and 3 features are used

tfidf = TfidfVectorizer(analyzer='word',ngram_range=(2,3), max_features=5000, token_pattern=r'\w{1,}')

q1ngram_trans = tfidf.fit_transform(train['question1'].values)
q2ngram_trans = tfidf.fit_transform(train['question2'].values)

X = scipy.sparse.hstack((q1ngram_trans,q2ngram_trans))
y = train.is_duplicate.values

In [22]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.30, random_state = 42)


In [23]:
# Xg Boost classifier for ngram_range=(1,3) level vectorizer

st = datetime.datetime.now()

classifier4 = XGBClassifier(
    max_depth=50, n_estimators=80, 
    learning_rate=0.1, colsample_bytree=.7, 
    gamma=0, reg_alpha=4, 
    objective='binary:logistic', eta=0.3, 
    silent=1, subsample=0.8
)

#fitting the model with traing data
print(classifier4.fit(X_train, y_train))

#predicting the test data
prediction_tfidf = classifier4.predict(X_test)

#Performance evaluation
print("ngram_range(2,3) Confusion Matrix:\n", confusion_matrix(y_test, prediction_tfidf))
print("ngram_range(2,3) Accuracy score: \n", accuracy_score(y_test, prediction_tfidf))
print("ngram_range(2,3) Classification report:\n", classification_report(y_test, prediction_tfidf))
print("ngram_range(2,3) F1 Score:\n ",f1_score(y_test, prediction_tfidf))

et = datetime.datetime.now()
print("Code run-time: ", et-st)



Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, eta=0.3, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=80, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=4, reg_lambda=1, scale_pos_weight=1,
              silent=1, subsample=0.8, tree_method='exact',
              validate_parameters=1, verbosity=None)
ngram_range(2,3) Con

### TF-IDF (character level)+ XgBoost

In [24]:
#TF-IDF ngram level vectorizer 
#5000 features were used for tfidf vectorizer
#r'\w{1,}'  indicates more than 1 word
#ngram_range = (1,3) means 2 and 3 features are used
#char level analyzer is used 

tfidf = TfidfVectorizer(analyzer='char',ngram_range=(1,3), max_features=5000, token_pattern=r'\w{1,}')

q1char_trans = tfidf.fit_transform(train['question1'].values)
q2char_trans = tfidf.fit_transform(train['question2'].values)

X = scipy.sparse.hstack((q1char_trans,q2char_trans))
y = train.is_duplicate.values

In [25]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.30, random_state = 42)


In [26]:
# Xg Boost classifier for char level vectorizer

st = datetime.datetime.now()

classifier5 = XGBClassifier(
    max_depth=50, n_estimators=80, 
    learning_rate=0.1, colsample_bytree=.7, 
    gamma=0, reg_alpha=4, 
    objective='binary:logistic', eta=0.3, 
    silent=1, subsample=0.8
)

#fitting the model with traing data
print(classifier5.fit(X_train, y_train))

#predicting the test data
prediction_tfidf = classifier5.predict(X_test)

#Performance evaluation
print("char level Confusion Matrix:\n", confusion_matrix(y_test, prediction_tfidf))
print("char level Accuracy score: \n", accuracy_score(y_test, prediction_tfidf))
print("char level Classification report:\n", classification_report(y_test, prediction_tfidf))
print("char level F1 Score:\n ",f1_score(y_test, prediction_tfidf))

et = datetime.datetime.now()
print("Code run-time: ", et-st)



Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, eta=0.3, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=80, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=4, reg_lambda=1, scale_pos_weight=1,
              silent=1, subsample=0.8, tree_method='exact',
              validate_parameters=1, verbosity=None)
char level Confusion

In [None]:
import pickle
pickle.dump(classifier1, open("./models/analysis1/classfier1.dat", "wb"))
pickle.dump(classifier2, open("./models/analysis1/classfier2.dat", "wb"))
pickle.dump(classifier3, open("./models/analysis1/classfier3.dat", "wb"))
pickle.dump(classifier4, open("./models/analysis1/classfier4.dat", "wb"))
pickle.dump(classifier5, open("./models/analysis1/classfier5.dat", "wb"))