# kaggle score: 83.19 % AUC - Gradient Boost Model


In [None]:
# Colab only: uncomment the two lines below to mount Google Drive.
# The CSV and feature-list files used later in this notebook are read from
# '/content/drive/My Drive/Colab Notebooks/'.
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt       
%matplotlib inline 
import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.model_selection import RandomizedSearchCV
import json
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import randint as sp_randint
from sklearn.metrics import classification_report,confusion_matrix

  import pandas.util.testing as tm


In [None]:
# Read the labelled training data; the trailing expression shows (rows, columns).
train = pd.read_csv(filepath_or_buffer='/content/drive/My Drive/Colab Notebooks/train.csv')
train.shape

(76020, 371)

In [None]:
# Read the Kaggle test data; the trailing expression shows (rows, columns).
test = pd.read_csv(filepath_or_buffer='/content/drive/My Drive/Colab Notebooks/test.csv')
test.shape

(75818, 370)

In [None]:
# Keep the label column aside, then remove it (and the row identifier) from the
# predictors. `test` is left untouched, so it still carries its "ID" column.
# NOTE: this cell rebinds `train`; re-running it without reloading the CSV fails
# because "TARGET" is no longer present.
y = train["TARGET"]
train = train.drop(columns=["ID","TARGET"])

In [None]:
# Load the JSON-encoded feature selections, one file per model family.
# They are used in the next cell to pick columns from `train` and `test`.
with open("/content/drive/My Drive/Colab Notebooks/final_lr_features.txt") as lr_file:
  lr_feats = json.loads(lr_file.read())

with open("/content/drive/My Drive/Colab Notebooks/final_rf_features.txt") as rf_file:
  rf_feats = json.loads(rf_file.read())

with open("/content/drive/My Drive/Colab Notebooks/final_gb_features.txt") as gb_file:
  gb_feats = json.loads(gb_file.read())

In [None]:
# Restrict train and test to the feature subset chosen for each model.
xtr_lr, xte_lr = train[lr_feats], test[lr_feats]
xtr_rf, xte_rf = train[rf_feats], test[rf_feats]
xtr_gb, xte_gb = train[gb_feats], test[gb_feats]

# Report the resulting shapes, train sets first, in LR / RF / GB order.
print("train sets for logistic regression, randomforest, gradientboosting:")
for subset in (xtr_lr, xtr_rf, xtr_gb):
  print(subset.shape)

print("test sets for logistic regression, randomforest, gradientboosting:")
for subset in (xte_lr, xte_rf, xte_gb):
  print(subset.shape)

train sets for logistic regression, randomforest, gradientboosting:
(76020, 13)
(76020, 12)
(76020, 10)
test sets for logistic regression, randomforest, gradientboosting:
(75818, 13)
(75818, 12)
(75818, 10)


**We will scale the features using MinMaxScaler. We will follow a train/validate/test approach for modelling, where training and validation are done with 10-fold cross-validation. We'll split the main train set in a 70:30 ratio: the model will be trained and tuned on the 70% part using 10-fold CV and then tested on the 30% hold-out set. The final model will then be tested on the main test set provided by Kaggle and its predictions saved for submission.**<br>We have already seen how the simple models with default parameters perform in the previous part. Let's tune these models and test them against the hold-out set:

In [None]:
def training_and_tuning(model,xtr,ytr,parameters,iterations,cv=10,random_state=7,scoring='roc_auc',n_jobs=None):
  """Run a randomized hyper-parameter search and return the best parameter set.

  Parameters
  ----------
  model : estimator
      Unfitted estimator handed to ``RandomizedSearchCV``.
  xtr, ytr
      Training features and labels the search is fitted on.
  parameters : dict
      Passed as ``param_distributions`` (lists or distributions to sample from).
  iterations : int
      Number of parameter settings sampled (``n_iter``).
  cv : int, default 10
      Cross-validation setting forwarded to the search.
  random_state : int, default 7
      Seed for the parameter sampling, forwarded to the search.
  scoring : str, default 'roc_auc'
      Metric used to rank the sampled settings.
  n_jobs : int or None, default None
      Forwarded to the search; set to -1 to use all cores for long searches.

  Returns
  -------
  dict
      ``best_params_`` of the fitted search. The best score and parameters are
      also printed.
  """
  print("started random search cv:\n")
  randomCV = RandomizedSearchCV(model, param_distributions=parameters, n_iter=iterations, cv=cv,
                                random_state=random_state, scoring=scoring, n_jobs=n_jobs)
  randomCV.fit(xtr, ytr)
  best = randomCV.best_params_
  best_sc = randomCV.best_score_
  print("best score:")
  print(best_sc)
  print("best parameters obtained:")
  print(best)
  return best

In [None]:
def tuned_model(cl,xtr,ytr,xte,yte):
  """Fit ``cl`` on the training split and report its quality.

  Prints the ROC AUC on the training split and on the evaluation split
  (both computed from column 1 of ``predict_proba``), then displays the
  confusion matrix of the hard predictions on the evaluation split.
  Nothing is returned; ``cl`` is left fitted on ``(xtr, ytr)``.
  """
  cl.fit(xtr,ytr)
  proba_tr = cl.predict_proba(xtr)[:,1]
  proba_te = cl.predict_proba(xte)[:,1]

  print("train set auc:")
  print(roc_auc_score(ytr,proba_tr))
  print("test set auc:")
  print(roc_auc_score(yte,proba_te))
  print()

  print("test set confusion matrix:")
  hard_preds = cl.predict(xte)
  # labels=[1,0] puts class 1 in the first row/column of the matrix.
  # NOTE(review): the first row/column is named "satisfied" below, i.e. the
  # table assumes TARGET == 1 means "satisfied" -- confirm against the data
  # description.
  matrix = confusion_matrix(yte,hard_preds,labels=[1,0])
  row_names = ["Truth(satisfied)","Truth(unsatisfied)"]
  col_names = ["Predict(satisfied)","Predict(unsatisfied)"]
  display(pd.DataFrame(matrix, index=row_names, columns=col_names))

In [None]:
def kaggle_testset(final_model,trainset,trainlabels,testset,filename):
  """Refit ``final_model`` on the given training data and write a submission CSV.

  Parameters
  ----------
  final_model : estimator
      Classifier exposing ``fit`` and ``predict_proba``.
  trainset, trainlabels
      Features and labels the model is refitted on.
  testset
      Features to score; column 1 of ``predict_proba`` becomes "TARGET".
  filename : str
      File name appended to the hard-coded Drive folder
      "/content/drive/My Drive/Colab Notebooks/".

  Notes
  -----
  The "ID" column is taken from the module-level ``test`` frame, so ``testset``
  must hold the same rows in the same order as ``test``.
  """
  final_model.fit(trainset,trainlabels)
  final_predictions = final_model.predict_proba(testset)[:, 1]

  # Pair IDs and predictions by position. Assigning the ``test["ID"]`` Series to
  # an existing frame aligns on index labels and would silently produce NaN IDs
  # whenever ``test`` does not carry a default 0..n-1 index; building the frame
  # from plain arrays avoids that and raises if the lengths differ.
  submit = pd.DataFrame({"ID": test["ID"].values, "TARGET": final_predictions})
  submit.to_csv("/content/drive/My Drive/Colab Notebooks/"+filename,index=False)

In [None]:
def minmax_scaler(xtr,xte):
  """Min-max scale a train/test pair of DataFrames.

  The scaler is fitted on ``xtr`` only and then applied to ``xte``.

  Parameters
  ----------
  xtr, xte : pandas.DataFrame
      Train and test features with the same columns in the same order.

  Returns
  -------
  list
      ``[xtr_scaled, xte_scaled]`` as DataFrames. Both carry the column names
      of ``xtr``; each keeps the row index of its own input so the result stays
      aligned with labels or IDs that share that index.
  """
  scaler = MinMaxScaler()
  colnames = xtr.columns

  # fit on the train set only, then reuse the fitted scaler for the test set
  xtr_scaled = pd.DataFrame(scaler.fit_transform(xtr), index=xtr.index, columns=colnames)
  xte_scaled = pd.DataFrame(scaler.transform(xte), index=xte.index, columns=colnames)

  return [xtr_scaled,xte_scaled]

**Logistic regression:**

In [None]:
# Scale the logistic-regression features, then hold out a stratified 30% of the
# training rows for evaluation.
xtr_lr, xte_lr = minmax_scaler(xtr_lr, xte_lr)

x_train, x_test, y_train, y_test = train_test_split(
    xtr_lr, y, test_size=0.3, random_state=1111, stratify=y
)
print(x_train.shape, x_test.shape, sep="\n")

(53214, 13)
(22806, 13)


In [None]:
%%time
parameters = {"C": sp_randint(1, 1000),"class_weight": [None,"balanced"]}
best = training_and_tuning(LogisticRegression(penalty="l1", random_state=7, solver="liblinear"),x_train,y_train,parameters,40)

started random search cv:

best score:
0.7781128842286693
best parameters obtained:
{'C': 945, 'class_weight': 'balanced'}
CPU times: user 28min 36s, sys: 44.1 s, total: 29min 20s
Wall time: 28min 29s


In [None]:
# Logistic regression with C=945 and balanced class weights, fitted on the
# 70% split and scored on the 30% hold-out.
cl = LogisticRegression(
    C=945,
    class_weight="balanced",
    penalty="l1",
    solver="liblinear",
    random_state=7,
)
tuned_model(cl, x_train, y_train, x_test, y_test)

train set auc:
0.7788379082680692
test set auc:
0.7747770861440935

test set confusion matrix:


Unnamed: 0,Predict(satisfied),Predict(unsatisfied)
Truth(satisfied),659,243
Truth(unsatisfied),7448,14456


In [None]:
# Uncomment to refit `cl` on the full (scaled) logistic-regression training set
# and write the Kaggle submission file.
# kaggle_testset(cl, xtr_lr, y, xte_lr, "lr_submission.csv")

**Kaggle AUC score of 77.72 %**

**Randomforest:**

In [None]:
# Scale the random-forest features, then hold out a stratified 30% of the
# training rows for evaluation.
xtr_rf, xte_rf = minmax_scaler(xtr_rf, xte_rf)

x_train, x_test, y_train, y_test = train_test_split(
    xtr_rf, y, test_size=0.3, random_state=1111, stratify=y
)
print(x_train.shape, x_test.shape, sep="\n")

(53214, 12)
(22806, 12)


In [None]:
%%time
model = RandomForestClassifier(random_state=7)

parameters = {"max_features": list(np.arange(1,12))+[None,"sqrt"],
              "max_depth":list(np.arange(3,50))+[None],
              "class_weight":[None,"balanced"],
              "n_estimators": np.arange(10,150,5)}

best = training_and_tuning(model,x_train,y_train,parameters,100)

started random search cv:

best score:
0.8349638002764465
best parameters obtained:
{'n_estimators': 85, 'max_features': 4, 'max_depth': 10, 'class_weight': None}
CPU times: user 2h 9min 54s, sys: 9.15 s, total: 2h 10min 3s
Wall time: 2h 10min 13s


In [None]:
# Random forest with n_estimators=85, max_features=4, max_depth=10 and no class
# weighting, fitted on the 70% split and scored on the 30% hold-out.
cl = RandomForestClassifier(
    n_estimators=85,
    max_depth=10,
    max_features=4,
    class_weight=None,
    random_state=7,
)
tuned_model(cl, x_train, y_train, x_test, y_test)

train set auc:
0.9104277138831416
test set auc:
0.8215788730991433

test set confusion matrix:


Unnamed: 0,Predict(satisfied),Predict(unsatisfied)
Truth(satisfied),0,902
Truth(unsatisfied),0,21904


In [None]:
# Same forest configuration, this time with class_weight="balanced", for
# comparison on the same hold-out split.
cl = RandomForestClassifier(
    n_estimators=85,
    max_depth=10,
    max_features=4,
    class_weight="balanced",
    random_state=7,
)
tuned_model(cl, x_train, y_train, x_test, y_test)

train set auc:
0.8983934482894202
test set auc:
0.8195831406629857

test set confusion matrix:


Unnamed: 0,Predict(satisfied),Predict(unsatisfied)
Truth(satisfied),561,341
Truth(unsatisfied),3482,18422


In [None]:
# Uncomment to refit the current `cl` on the full (scaled) random-forest training
# set and write the Kaggle submission file.
# kaggle_testset(cl, xtr_rf, y, xte_rf, "rf_submission2.csv")

**The Kaggle score for class_weight=None is slightly higher (82.95 %) than for class_weight="balanced" (82.6 %), but in practice we will still choose class_weight="balanced" since it gives a more general model. Our dataset is highly imbalanced, and the model with balanced class weights is also able to predict our minority class, which is "satisfied".**

**Gradient boosting:**

In [None]:
# Scale the gradient-boosting features, then hold out a stratified 30% of the
# training rows for evaluation.
xtr_gb, xte_gb = minmax_scaler(xtr_gb, xte_gb)

x_train, x_test, y_train, y_test = train_test_split(
    xtr_gb, y, test_size=0.3, random_state=1111, stratify=y
)
print(x_train.shape, x_test.shape, sep="\n")

(53214, 10)
(22806, 10)


In [None]:
%%time
model = GradientBoostingClassifier(random_state=7)

parameters = {"max_features": list(np.arange(1,10))+[None,"sqrt"],
              "max_depth":list(np.arange(3,50))+[None],
              "n_estimators": np.arange(10,150,5)}

best = training_and_tuning(model,x_train,y_train,parameters,100)

started random search cv:

best score:
0.8385704602970382
best parameters obtained:
{'n_estimators': 50, 'max_features': 6, 'max_depth': 5}
CPU times: user 5h 37min 20s, sys: 12.5 s, total: 5h 37min 32s
Wall time: 5h 37min 55s


In [None]:
# Gradient boosting with n_estimators=50, max_features=6 and max_depth=5,
# fitted on the 70% split and scored on the 30% hold-out.
cl = GradientBoostingClassifier(
    n_estimators=50,
    max_depth=5,
    max_features=6,
    random_state=7,
)
tuned_model(cl, x_train, y_train, x_test, y_test)

train set auc:
0.870985430105333
test set auc:
0.8247520626187403

test set confusion matrix:


Unnamed: 0,Predict(satisfied),Predict(unsatisfied)
Truth(satisfied),3,899
Truth(unsatisfied),12,21892


In [None]:
# Uncomment to refit `cl` on the full (scaled) gradient-boosting training set and
# write the Kaggle submission file.
# NOTE(review): this line previously passed xtr_rf / xte_rf (the random-forest
# feature sets) although the gradient-boosting model was tuned on xtr_gb;
# confirm which feature set produced the reported score.
# kaggle_testset(cl, xtr_gb, y, xte_gb, "gb_submission.csv")

**The Kaggle score improved to 83.19 %, but random forest with class_weight="balanced" is still our model of choice since it provides a more general model.**

#### Kaggle winning score: 84.5 % AUC - leaderboard

### In the next part we will try to predict the minority class and make the classifier even more general. We will downsample the majority class, which will also help us train and tune the models faster.