In [None]:
# All imports here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold 
from collections import Counter, defaultdict
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import normalized_mutual_info_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, log_loss
import xgboost as xgb

In [None]:
# Notebook-wide constants: input location, hyperparameter grid, report labels
processed_train_data_path = '/content/drive/MyDrive/Quora_Question_Pair_Similarity/data/processed_train_data.csv'
# Candidate alpha values tried for SGDClassifier: powers of ten from 10^-5 up to 10^1.
alphas = [10 ** exponent for exponent in range(-5, 2)]
# Prefixes passed to model_evaluator, which prints them in front of the log loss.
train_result_message = 'The log loss for the given train data is :: '
test_result_message = 'The log loss for the given test data is :: '

In [None]:
# Read the processed train data from the CSV at processed_train_data_path
# into a single DataFrame; all later cells derive features and labels from it.
df = pd.read_csv(processed_train_data_path)

In [None]:
# Show the dimensions of the loaded frame as (rows, columns)
df.shape

(404287, 221)

In [None]:
# List the column names to see which columns are identifiers/text and which are features
df.columns

Index(['Unnamed: 0', 'id', 'qid1', 'qid2', 'question1', 'question2',
       'is_duplicate', 'q1_length', 'q2_length', 'n_words_q1',
       ...
       '86_y', '87_y', '88_y', '89_y', '90_y', '91_y', '92_y', '93_y', '94_y',
       '95_y'],
      dtype='object', length=221)

In [None]:
# Separate the target column from the feature columns of the loaded frame
class_labels = df['is_duplicate']
# Everything except the target, the id columns, the raw question text and the
# 'Unnamed: 0' column is kept as model input.
data = df.drop(columns=['Unnamed: 0', 'id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'])

In [None]:
# Split the data into train and test (validation) sets, 70% / 30%.
# stratify keeps the class ratio of class_labels the same in both parts.
# random_state pins the shuffle so the split, and every metric computed from
# it further down, is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
  data, class_labels, stratify=class_labels, test_size=0.3, random_state=42
)

In [None]:
def hyper_parameter_tuning(X_train, X_test, y_train, y_test, loss_type, regularization = 'l2',  method_name='sigmoid'):
  """
  Helper method to perform hyper parameter tuning for a model of given type

  For every value in the notebook-level ``alphas`` list, an SGDClassifier is
  wrapped in a CalibratedClassifierCV, fitted on the train split and scored
  with log loss on the test split.

  Parameters:
    X_train, y_train: data and labels the calibrated classifier is fitted on.
    X_test, y_test: data and labels the log loss is computed on.
    loss_type: passed to SGDClassifier as ``loss``.
    regularization: passed to SGDClassifier as ``penalty``.
    method_name: passed to CalibratedClassifierCV as ``method``.

  Returns:
    list of log loss values, one per entry of ``alphas`` and in the same order.
  """
  log_errors = []
  for alpha in alphas:
    sgd = SGDClassifier(alpha=alpha, penalty=regularization, loss=loss_type, random_state=42)
    # No separate sgd.fit() here: with its default cv, CalibratedClassifierCV
    # clones the estimator and fits the clones itself, so a prior fit of `sgd`
    # is thrown away and only costs time.
    classifier = CalibratedClassifierCV(sgd, method=method_name)
    classifier.fit(X_train,y_train)
    y_predicted = classifier.predict_proba(X_test)
    # predict_proba columns follow classifier.classes_, so use the same order for labels.
    log_errors.append(log_loss(y_test, y_predicted, labels=classifier.classes_, eps=1e-15))
    print('For values of alpha = ', alpha, "The log loss is:", log_errors[-1])
  return log_errors

In [None]:
def build_model(X_train, X_test, y_train, y_test, loss_type, regularization = 'l2',  method_name='sigmoid'):
  """
  Helper method that builds best model after hyper parameter tuning

  Runs hyper_parameter_tuning with the same loss, penalty and calibration
  method that the final model uses, picks the alpha with the lowest log loss
  and fits a calibrated SGD classifier with it on the train split.

  Parameters:
    X_train, y_train: data and labels used for fitting.
    X_test, y_test: data and labels used to score each alpha during tuning.
    loss_type: passed to SGDClassifier as ``loss``.
    regularization: passed to SGDClassifier as ``penalty``.
    method_name: passed to CalibratedClassifierCV as ``method``.

  Returns:
    The fitted CalibratedClassifierCV.
  """
  # Forward regularization and method_name as well; otherwise alpha would be
  # tuned for the default l2/sigmoid configuration while the final model below
  # is built with the caller's settings.
  log_errors = hyper_parameter_tuning(
    X_train, X_test, y_train, y_test,
    loss_type=loss_type, regularization=regularization, method_name=method_name
  )
  best_alpha_index = np.argmin(log_errors)
  sgd = SGDClassifier(alpha=alphas[best_alpha_index], penalty=regularization, loss=loss_type, random_state=42)
  # CalibratedClassifierCV (default cv) clones and fits the estimator itself,
  # so `sgd` does not need to be fitted beforehand.
  classifier = CalibratedClassifierCV(sgd, method=method_name)
  classifier.fit(X_train, y_train)
  return classifier

In [None]:
def model_evaluator(X,y,model, print_message):
  """
  Evaluate the loss on the trained model for given data and true labels

  Prints the log loss (prefixed with ``print_message``) and the accuracy of
  ``model`` on ``X`` against the true labels ``y``.

  Parameters:
    X: data passed to ``model.predict_proba``.
    y: true labels for ``X``.
    model: fitted classifier exposing ``predict_proba`` and ``classes_``.
    print_message: text printed in front of the log loss value.

  Returns:
    None; the metrics are only printed.
  """
  probabilities = model.predict_proba(X)
  print(print_message,log_loss(y, probabilities, labels=model.classes_, eps=1e-15))
  # argmax yields a column index into the probability matrix, not a class
  # label. Map it through classes_ so accuracy is also correct when the labels
  # are not exactly 0..k-1.
  predicted_labels = np.asarray(model.classes_)[np.argmax(probabilities, axis=1)]
  print("The accuracy is :: ", accuracy_score(y,predicted_labels))

In [None]:
# Sanity-check the sizes of the train and test (validation) splits
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(283000, 214)
(283000,)
(121287, 214)
(121287,)


In [None]:
# Logistic regression: tune alpha, then fit the calibrated model with the best value
logistic_regression = build_model(X_train, X_test, y_train, y_test, loss_type='log')
# Report log loss and accuracy on the train split
model_evaluator(X_train, y_train, logistic_regression, print_message=train_result_message)

For values of alpha =  1e-05 The log loss is: 0.49842252315293656
For values of alpha =  0.0001 The log loss is: 0.49057970277743757
For values of alpha =  0.001 The log loss is: 0.5168597183688719
For values of alpha =  0.01 The log loss is: 0.5075494029634705
For values of alpha =  0.1 The log loss is: 0.5200618184796122
For values of alpha =  1 The log loss is: 0.5344019524891788
For values of alpha =  10 The log loss is: 0.5522338613871465
The log loss for the given train data is ::  0.493348976317517
The accuracy is ::  0.738095406360424


In [None]:
# Report log loss and accuracy of the logistic regression model on the validation split
model_evaluator(X_test, y_test, model=logistic_regression, print_message=test_result_message)

The log loss for the given test data is ::  0.49057970277743757
The accuracy is ::  0.740392622457477


In [None]:
# Linear SVM (hinge loss, l1 penalty): tune alpha, then fit the calibrated model with the best value
svm_linear = build_model(X_train, X_test, y_train, y_test, loss_type='hinge', regularization='l1')
# Report log loss and accuracy on the train split
model_evaluator(X_train, y_train, svm_linear, print_message=train_result_message)

For values of alpha =  1e-05 The log loss is: 0.49213601331291373
For values of alpha =  0.0001 The log loss is: 0.4953479140085616
For values of alpha =  0.001 The log loss is: 0.5061265299815904
For values of alpha =  0.01 The log loss is: 0.5053030392198005
For values of alpha =  0.1 The log loss is: 0.5174989250815675
For values of alpha =  1 The log loss is: 0.5322175307759471
For values of alpha =  10 The log loss is: 0.546939805781072
The log loss for the given train data is ::  0.5198646148303241
The accuracy is ::  0.7191095406360424


In [None]:
# Report log loss and accuracy of the linear SVM model on the validation split
model_evaluator(X_test, y_test, model=svm_linear, print_message=test_result_message)

The log loss for the given test data is ::  0.5180447806342268
The accuracy is ::  0.7192856612827426


In [None]:
def hyperparameter_tuning_xgb():
  """
  Helper method to do hyperparameter tuning for XgBoost Classifier

  Runs a 3-fold grid search, scored by F1, over learning rate, tree depth and
  gamma on the notebook-level X_train / y_train, prints the best combination
  and returns it.

  Returns:
    dict with the best parameter combination (``grid_search.best_params_``).
  """
  # Grid params for Tuning.
  # "eta" is XGBoost's alias for "learning_rate". Listing both searched the
  # same knob twice (ambiguous which value applies) and multiplied the grid by
  # five, so the distinct values of both lists are merged into one entry.
  params = {
    "learning_rate"    : [0.01, 0.02, 0.05, 0.09, 0.10, 0.15, 0.2],
    "max_depth"        : [6, 8, 10, 12, 15],
    "gamma"            : [ 0.1, 0.2 , 0.3, 0.4 ]
  }

  # Do grid search and print the best params
  classifier=xgb.XGBClassifier()
  # NOTE(review): candidates are ranked by F1 while the rest of the notebook
  # reports log loss; confirm this is the intended selection metric.
  grid_search= GridSearchCV(classifier, param_grid=params, scoring='f1', n_jobs=-1, cv=3, verbose=3) 
  grid_search.fit(X_train, y_train)
  best_params = grid_search.best_params_
  print("Best Params: ", best_params)
  # Hand the result back so callers can reuse it instead of copying it from the printout.
  return best_params

In [None]:
# Search grid for the XGBoost classifier.
# "eta" is XGBoost's alias for "learning_rate". Listing both searched the same
# knob twice (ambiguous which value applies) and multiplied the grid by five,
# so the distinct values of both lists are merged into one entry.
params={
 "learning_rate"    : [0.01, 0.02, 0.05, 0.09, 0.10, 0.15, 0.2] ,
 "max_depth"        : [6, 8, 10, 12, 15],
 "gamma"            : [ 0.1, 0.2 , 0.3, 0.4 ]
}

In [None]:
# XGBoost classifier with default settings; the grid search sets the tuned parameters on it
classifier=xgb.XGBClassifier()

In [None]:
# Exhaustive search over `params` with 3-fold cross-validation, using all CPU cores (n_jobs=-1).
# NOTE(review): candidates are ranked by F1 while the rest of the notebook reports log loss;
# confirm this is the intended selection metric.
grid_search= GridSearchCV(classifier, param_grid=params, scoring='f1', n_jobs=-1, cv=3, verbose=3) 

In [None]:
# Run the grid search on the train split (every candidate is fitted once per fold)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


In [None]:
# Show the parameter combination that scored best in the grid search
grid_search.best_params_

In [None]:
# Train a boosted model through XGBoost's native API and report its log loss
# on the test split.
params = {
  'objective': 'binary:logistic',
  'eval_metric': 'logloss',
  'eta': 0.02,
  'max_depth': 4,
}

d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

# Both sets are evaluated every round; 'valid' is the one watched for early stopping.
watchlist = [(d_train, 'train'), (d_test, 'valid')]

# Up to 400 boosting rounds, stopping once the watched metric has not improved
# for 20 rounds; progress is logged every 10 rounds.
bst = xgb.train(
  params,
  d_train,
  num_boost_round=400,
  evals=watchlist,
  early_stopping_rounds=20,
  verbose_eval=10,
)

# NOTE(review): xgdmat is not referenced again in the visible part of the notebook.
xgdmat = xgb.DMatrix(X_train,y_train)
predict_y = bst.predict(d_test)
print("The test log loss is:",log_loss(y_test, predict_y, eps=1e-15))

[0]	train-logloss:0.685289	valid-logloss:0.685939
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 20 rounds.
[10]	train-logloss:0.626307	valid-logloss:0.626625
[20]	train-logloss:0.583398	valid-logloss:0.583949
[30]	train-logloss:0.551853	valid-logloss:0.552592
[40]	train-logloss:0.528634	valid-logloss:0.529123
[50]	train-logloss:0.510506	valid-logloss:0.511018
[60]	train-logloss:0.496122	valid-logloss:0.496771
[70]	train-logloss:0.484781	valid-logloss:0.485489
[80]	train-logloss:0.475765	valid-logloss:0.476519
[90]	train-logloss:0.468471	valid-logloss:0.469274
[100]	train-logloss:0.462296	valid-logloss:0.463114
[110]	train-logloss:0.45701	valid-logloss:0.458212
[120]	train-logloss:0.453026	valid-logloss:0.454141
[130]	train-logloss:0.449452	valid-logloss:0.450591
[140]	train-logloss:0.446384	valid-logloss:0.447756
[150]	train-logloss:0.443653	valid-logloss:0.445281
[160]	train-logloss:0.441401	