In [1]:
import datetime
import pickle
import re
from string import punctuation

import numpy as np
import pandas as pd
import scipy
import scipy.sparse
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    f1_score,
    accuracy_score,
    recall_score,
    precision_score
)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [2]:
# Directory (relative to the notebook) that holds the dataset CSV files.
BASE_DIR = './dataset/'
# Alternative input kept for reference (presumably an already-preprocessed copy):
# train = pd.read_csv(f'{BASE_DIR}train_preprocessed.csv')
train = pd.read_csv(BASE_DIR + 'train.csv')
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [9]:
# Number of rows per value of the is_duplicate label.
train['is_duplicate'].value_counts()

0    255027
1    149263
Name: is_duplicate, dtype: int64

In [3]:
def pad_str(s):
    """Return ``s`` surrounded by a single space on each side."""
    return ' ' + s + ' '


# Placeholder tokens inserted during normalisation.
SPECIAL_TOKENS = {'non-ascii': 'non_ascii_word'}

# Patterns are raw strings (no invalid "\," escape) and compiled once, not per call.
_DIGIT_COMMA_RE = re.compile(r'(?<=[0-9]),(?=[0-9])')  # comma between two digits
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')           # run of non-ASCII characters
_NON_ASCII_REPLACEMENT = pad_str(SPECIAL_TOKENS['non-ascii'])
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)


def normalize_text(text):
    """Lower-case and clean a single question string.

    Steps, in order:
      1. Missing values (anything ``pd.isnull`` reports) and empty strings
         yield ``''``.
      2. The text is lower-cased.
      3. Commas that sit between two digits are dropped ("1,000" -> "1000").
      4. Each run of non-ASCII characters is replaced by the padded
         placeholder token.
      5. Every character in ``string.punctuation`` is removed. This includes
         the underscores of the placeholder, so it ends up as
         ``nonasciiword`` in the output.

    Parameters
    ----------
    text : str or missing value
        The raw question text.

    Returns
    -------
    str
        The normalised text.
    """
    if pd.isnull(text) or len(text) == 0:
        return ''

    text = text.lower()
    text = _DIGIT_COMMA_RE.sub('', text)
    text = _NON_ASCII_RE.sub(_NON_ASCII_REPLACEMENT, text)
    return text.translate(_PUNCTUATION_TABLE)

In [4]:
# Run the same cleaning function over both question columns, overwriting them.
for question_col in ('question1', 'question2'):
    train[question_col] = train[question_col].apply(normalize_text)
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0
1,1,3,4,what is the story of kohinoor kohinoor diamond,what would happen if the indian government sto...,0
2,2,5,6,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0
3,3,7,8,why am i mentally very lonely how can i solve it,find the remainder when math2324math is divide...,0
4,4,9,10,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0


In [16]:
# train.to_csv('./processed_train.csv', index=False)

### Bag of Words + XGBoost

In [42]:
# Bag-of-words term counts over word tokens. token_pattern r'\w{1,}' keeps
# tokens of one or more word characters (so single-character tokens survive),
# and stop_words='english' drops scikit-learn's built-in English stop words.
CV = CountVectorizer(analyzer='word', stop_words='english', token_pattern=r'\w{1,}')
# NOTE(review): the same vectorizer object is fitted twice. Each fit_transform
# call learns a fresh vocabulary, so q1_trans and q2_trans use independent
# column spaces and CV only holds the question2 vocabulary after this cell.
# NOTE(review): both fits see the full data set, i.e. before the train/test split.
q1_trans = CV.fit_transform(train['question1'].values)
q2_trans = CV.fit_transform(train['question2'].values)

In [43]:
# Feature matrix: question1 columns followed by question2 columns.
X = scipy.sparse.hstack([q1_trans, q2_trans])
# Labels taken from the is_duplicate column.
y = train['is_duplicate'].values

In [44]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [47]:
# (rows, columns) of the training and test feature matrices.
(X_train.shape, X_test.shape)

((283003, 155581), (121287, 155581))

In [50]:
# Load the stored model for this section (presumably the bag-of-words XGBoost
# classifier trained by the commented-out cell below).
# SECURITY: unpickling can execute arbitrary code; only load model files that
# were produced by this project.
# The `with` block closes the file handle as soon as the model is loaded.
with open("models/analysis1/classfier1.dat", "rb") as model_file:
    classifier1 = pickle.load(model_file)

In [51]:
# Score the loaded model on the held-out rows and report four metrics.
y_pred = classifier1.predict(X_test)
for metric_label, metric_fn in (
    ("Accuracy score:", accuracy_score),
    ("F1 score:", f1_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

Accuracy score: 0.7490992439420548
F1 score: 0.5610512498737865
Precision: 0.7973759737597376
Recall: 0.4327836749226695


In [20]:
# st = datetime.datetime.now()

# classifier1 = XGBClassifier(
#     max_depth=50, 
#     n_estimators=80, ## number of boosting rounds
#     learning_rate=0.1, 
#     colsample_bytree=.7, ## Subsample ratio of columns when constructing each tree.
#     gamma=0, ##  Minimum loss reduction required to make a further partition on a leaf node of the tree.
#     reg_alpha=4, ##  L1 regularization term on weights
#     objective='binary:logistic', 
#     subsample=0.8, ## Subsample ratio of the training instance.
# )

# print(classifier1.fit(X_train, y_train))
# y_pred = classifier1.predict(X_test)

# print("Classification report:\n", classification_report(y_test, y_pred))
# print("Accuracy score: \n", accuracy_score(y_test, y_pred))
# print("F1 Score:\n ",f1_score(y_test, y_pred))

# et = datetime.datetime.now()
# print("Code run-time: ", et-st)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, eta=0.3, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=80, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=4, reg_lambda=1, scale_pos_weight=1,
              silent=1, subsample=0.8, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, ...)
Confus

### TF-IDF (unigram) + XGBoost

In [52]:
# Word-level TF-IDF features. max_features=5000 caps each fitted vocabulary at
# 5000 terms; token_pattern r'\w{1,}' keeps tokens of one or more word
# characters. ngram_range is left at its default (unigrams).

tfidf = TfidfVectorizer(analyzer='word', max_features=5000, token_pattern=r'\w{1,}')

# NOTE(review): the same vectorizer is fitted twice. Each fit_transform call
# learns a fresh vocabulary and IDF weights, so the two matrices use
# independent column spaces and `tfidf` only reflects question2 afterwards.
q1word_trans = tfidf.fit_transform(train['question1'].values)
q2word_trans = tfidf.fit_transform(train['question2'].values)

# Question1 feature columns followed by question2 feature columns.
X = scipy.sparse.hstack((q1word_trans,q2word_trans))
y = train.is_duplicate.values

In [53]:
# Hold out 33% of the rows for evaluation with a fixed seed.
# NOTE(review): the other sections of this notebook hold out 0.30, not 0.33.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [54]:
# Load the stored model for the unigram TF-IDF section.
# SECURITY: unpickling can execute arbitrary code; only load model files that
# were produced by this project.
# The `with` block closes the file handle as soon as the model is loaded.
with open("./models/analysis1/classfier2.dat", "rb") as model_file:
    classifier2 = pickle.load(model_file)

In [56]:
# Score the loaded model on the held-out rows and report four metrics.
y_pred = classifier2.predict(X_test)
for metric_label, metric_fn in (
    ("Accuracy score:", accuracy_score),
    ("F1 score:", f1_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

Accuracy score: 0.787679138933861
F1 score: 0.6758887401457683
Precision: 0.7781231887876073
Recall: 0.5973989199247588


In [23]:
# st = datetime.datetime.now()

# classifier2 = XGBClassifier(
#    max_depth=50, 
#     n_estimators=80, ## number of boosting rounds
#     learning_rate=0.1, 
#     colsample_bytree=.7, ## Subsample ratio of columns when constructing each tree.
#     gamma=0, ##  Minimum loss reduction required to make a further partition on a leaf node of the tree.
#     reg_alpha=4, ##  L1 regularization term on weights
#     objective='binary:logistic', 
#     subsample=0.8, ## Subsample ratio of the training instance.
# )
# print(classifier2.fit(X_train, y_train))
# y_pred = classifier2.predict(X_test)

# print("Classification report:\n", classification_report(y_test, y_pred))
# print("Accuracy score: \n", accuracy_score(y_test, y_pred))
# print("F1 Score:\n ", f1_score(y_test, y_pred))

# et = datetime.datetime.now()
# print("Code run-time: ", et-st)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, eta=0.3, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=80, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=4, reg_lambda=1, scale_pos_weight=1,
              silent=1, subsample=0.8, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, ...)
Confus

### TF-IDF (bigram) + XGBoost

In [5]:
# Word-level TF-IDF restricted to bigrams (ngram_range=(2,2)), with each fitted
# vocabulary capped at 5000 entries. token_pattern r'\w{1,}' keeps tokens of
# one or more word characters.
tfidf = TfidfVectorizer(analyzer='word',ngram_range=(2,2), max_features=5000, token_pattern=r'\w{1,}')

# NOTE(review): the same vectorizer is fitted twice. Each fit_transform call
# learns a fresh vocabulary and IDF weights, so the two matrices use
# independent column spaces and `tfidf` only reflects question2 afterwards.
q1ngram_trans = tfidf.fit_transform(train['question1'].values)
q2ngram_trans = tfidf.fit_transform(train['question2'].values)

# Question1 feature columns followed by question2 feature columns.
X = scipy.sparse.hstack((q1ngram_trans, q2ngram_trans))
y = train.is_duplicate.values

In [6]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [11]:
# Load the stored model for the bigram TF-IDF section.
# SECURITY: unpickling can execute arbitrary code; only load model files that
# were produced by this project.
# The `with` block closes the file handle as soon as the model is loaded.
with open("./models/analysis1/classfier3.dat", "rb") as model_file:
    classifier3 = pickle.load(model_file)

In [12]:
# Score the loaded model on the held-out rows and report four metrics.
y_pred = classifier3.predict(X_test)
for metric_label, metric_fn in (
    ("Accuracy score:", accuracy_score),
    ("F1 score:", f1_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

Accuracy score: 0.7403761326440591
F1 score: 0.5502535171034779
Precision: 0.7681234548209586
Recall: 0.4286668001869284


In [9]:
# st = datetime.datetime.now()

# classifier3 = XGBClassifier(
#     max_depth=50, 
#     n_estimators=80, ## number of boosting rounds
#     learning_rate=0.1, 
#     colsample_bytree=.7, ## Subsample ratio of columns when constructing each tree.
#     gamma=0, ##  Minimum loss reduction required to make a further partition on a leaf node of the tree.
#     reg_alpha=4, ##  L1 regularization term on weights
#     objective='binary:logistic', 
#     subsample=0.8, ## Subsample ratio of the training instance.
# )
# classifier3.fit(X_train, y_train)
# y_pred = classifier3.predict(X_test)

# print("Bigram Classification report:\n", classification_report(y_test, y_pred))
# print("Bigram Accuracy score: \n", accuracy_score(y_test, y_pred))
# print("Bigram F1 Score:\n ",f1_score(y_test, y_pred))

# et = datetime.datetime.now()
# print("Code run-time: ", et-st)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=80, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=4, reg_lambda=1, scale_pos_weight=1,
              subsample=0.8, tree_method='exact', validate_parameters=1,
              verbosity=None)
ngram_range (2,2) Confusion Matrix:
 [[70535  5815]
 [25674 19263]]
ngram_range (2,2) Accuracy score: 
 0.7403761326440591
ngram_range (2,2) Classification report:
               precision    recall  f1-score   support

           0       0.73      0.92      0.82     76350
           1       0.77      0.43      0.55     44937

    accuracy           

### TF-IDF (trigram) + XGBoost

In [13]:
# Word-level TF-IDF restricted to trigrams (ngram_range=(3,3)), with each fitted
# vocabulary capped at 5000 entries. token_pattern r'\w{1,}' keeps tokens of
# one or more word characters.
tfidf = TfidfVectorizer(analyzer='word',ngram_range=(3,3), max_features=5000, token_pattern=r'\w{1,}')

# NOTE(review): the same vectorizer is fitted twice. Each fit_transform call
# learns a fresh vocabulary and IDF weights, so the two matrices use
# independent column spaces and `tfidf` only reflects question2 afterwards.
q1ngram_trans = tfidf.fit_transform(train['question1'].values)
q2ngram_trans = tfidf.fit_transform(train['question2'].values)

# Question1 feature columns followed by question2 feature columns.
X = scipy.sparse.hstack((q1ngram_trans,q2ngram_trans))
y = train.is_duplicate.values

In [14]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [19]:
# Load the stored model for the trigram TF-IDF section.
# SECURITY: unpickling can execute arbitrary code; only load model files that
# were produced by this project.
# The `with` block closes the file handle as soon as the model is loaded.
with open("./models/analysis1/classfier4.dat", "rb") as model_file:
    classifier4 = pickle.load(model_file)

In [20]:
# Score the loaded model on the held-out rows and report four metrics.
y_pred = classifier4.predict(X_test)
for metric_label, metric_fn in (
    ("Accuracy score:", accuracy_score),
    ("F1 score:", f1_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

Accuracy score: 0.7148746361934915
F1 score: 0.46482404283636136
Precision: 0.7630709821655404
Recall: 0.3342012150343815


In [17]:
# st = datetime.datetime.now()

# classifier4 = XGBClassifier(
#     max_depth=50, 
#     n_estimators=80, ## number of boosting rounds
#     learning_rate=0.1, 
#     colsample_bytree=.7, ## Subsample ratio of columns when constructing each tree.
#     gamma=0, ##  Minimum loss reduction required to make a further partition on a leaf node of the tree.
#     reg_alpha=4, ##  L1 regularization term on weights
#     objective='binary:logistic', 
#     subsample=0.8, ## Subsample ratio of the training instance.
# )

# print(classifier4.fit(X_train, y_train))
# prediction_tfidf = classifier4.predict(X_test)

# print("Trigram Classification report:\n", classification_report(y_test, prediction_tfidf))
# print("Trigram Accuracy score: \n", accuracy_score(y_test, prediction_tfidf))
# print("Trigram F1 Score:\n ",f1_score(y_test, prediction_tfidf))

# et = datetime.datetime.now()
# print("Code run-time: ", et-st)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=80, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=4, reg_lambda=1, scale_pos_weight=1,
              subsample=0.8, tree_method='exact', validate_parameters=1,
              verbosity=None)
Trigram Classification report:
               precision    recall  f1-score   support

           0       0.71      0.94      0.81     76350
           1       0.76      0.33      0.46     44937

    accuracy                           0.71    121287
   macro avg       0.73      0.64      0.64    121287
weighted avg       0.73      0.71      0.68    1

### TF-IDF (character level) + XGBoost

In [30]:
# Character-level TF-IDF features.
# - analyzer='char' builds n-grams from characters rather than words.
# - ngram_range=(1,3) extracts character n-grams of length 1, 2 and 3.
# - max_features=5000 caps each fitted vocabulary at the 5000 most frequent n-grams.
# - token_pattern is deliberately not passed: scikit-learn only uses it when
#   analyzer='word' and emits a warning that it is ignored otherwise.

tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1,3), max_features=5000)

# NOTE(review): the same vectorizer is fitted twice. Each fit_transform call
# learns a fresh vocabulary and IDF weights, so the two matrices use
# independent column spaces and `tfidf` only reflects question2 afterwards.
q1char_trans = tfidf.fit_transform(train['question1'].values)
q2char_trans = tfidf.fit_transform(train['question2'].values)

# Question1 feature columns followed by question2 feature columns.
X = scipy.sparse.hstack((q1char_trans,q2char_trans))
y = train.is_duplicate.values

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)


NameError: name 'X' is not defined

In [None]:
# XGBoost classifier on the character-level TF-IDF features.

st = datetime.datetime.now()

# Same hyper-parameters as the classifier definitions in the earlier sections.
# Two arguments from the previous version of this cell were dropped:
#   - eta=0.3: `eta` is an alias of `learning_rate`, so passing both with
#     different values left the effective learning rate ambiguous.
#   - silent=1: XGBoost reports it as a parameter that "might not be used".
classifier5 = XGBClassifier(
    max_depth=50,
    n_estimators=80,              # number of boosting rounds
    learning_rate=0.1,
    colsample_bytree=.7,          # subsample ratio of columns per tree
    gamma=0,                      # minimum loss reduction required to split a leaf
    reg_alpha=4,                  # L1 regularization term on weights
    objective='binary:logistic',
    subsample=0.8,                # subsample ratio of the training instances
)

# Fit on the training split; printing the return value shows the configuration.
print(classifier5.fit(X_train, y_train))

# Predict labels for the held-out split.
prediction_tfidf = classifier5.predict(X_test)

# Performance evaluation.
print("char level Confusion Matrix:\n", confusion_matrix(y_test, prediction_tfidf))
print("char level Accuracy score: \n", accuracy_score(y_test, prediction_tfidf))
print("char level Classification report:\n", classification_report(y_test, prediction_tfidf))
print("char level F1 Score:\n ",f1_score(y_test, prediction_tfidf))

et = datetime.datetime.now()
print("Code run-time: ", et-st)

NameError: name 'X_train' is not defined

## Saving the models

In [None]:
# Persist each classifier under a descriptive file name. Every file is opened
# in a `with` block so it is flushed and closed before the next one is written.
# NOTE(review): these file names differ from the "classfierN.dat" files that
# the loading cells above read.
MODEL_OUTPUTS = (
    (classifier1, "./models/analysis1/bow_xg.pkl"),      # bow
    (classifier2, "./models/analysis1/unigram_xg.pkl"),  # unigram
    (classifier3, "./models/analysis1/bigram_xg.pkl"),   # bigram
    (classifier4, "./models/analysis1/trigram_xg.pkl"),  # trigram
    # (classifier5, "./models/analysis1/classfier5.pkl"),
)
for model, model_path in MODEL_OUTPUTS:
    with open(model_path, "wb") as model_file:
        pickle.dump(model, model_file)

In [18]:
# Save classifier4 (the trigram model) under the file name that the trigram
# section's loading cell reads. The `with` block guarantees the file is flushed
# and closed.
with open("./models/analysis1/classfier4.dat", "wb") as model_file:
    pickle.dump(classifier4, model_file)  ## trigram