# Data Inputs and Display Libraries

In [None]:

import pandas as pd
import numpy as np
import pickle
# Format floats with five decimal places when pandas renders them.
pd.set_option('display.float_format', lambda x: '%.5f' % x)
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Modeling Libraries

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import model_selection
from xgboost import XGBClassifier
import pickle
from sklearn.model_selection import GridSearchCV

# Metrics Libraries


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import f1_score
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

In [None]:
# Download the two RAR archives and the two column-list CSVs from GitHub
!wget "https://github.com/univai-ghf/ghfmedia/raw/main/data/Trees_and_Ensembles/datasets.rar" 
!wget "https://github.com/univai-ghf/ghfmedia/raw/main/data/Trees_and_Ensembles/prep_file.rar" 
!wget "https://github.com/univai-ghf/ghfmedia/raw/main/data/Trees_and_Ensembles/num_cols.csv" 
!wget "https://github.com/univai-ghf/ghfmedia/raw/main/data/Trees_and_Ensembles/str_cols.csv" 

# Extract both RAR archives
!unrar x './datasets.rar'
!unrar x './prep_file.rar'

--2022-03-10 07:17:59--  https://github.com/univai-ghf/ghfmedia/raw/main/data/Trees_and_Ensembles/datasets.rar
Resolving github.com (github.com)... 13.114.40.48
Connecting to github.com (github.com)|13.114.40.48|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/univai-ghf/ghfmedia/main/data/Trees_and_Ensembles/datasets.rar [following]
--2022-03-10 07:17:59--  https://raw.githubusercontent.com/univai-ghf/ghfmedia/main/data/Trees_and_Ensembles/datasets.rar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3508143 (3.3M) [application/octet-stream]
Saving to: ‘datasets.rar’


2022-03-10 07:18:00 (42.7 MB/s) - ‘datasets.rar’ saved [3508143/3508143]

--2022-03-10 07:18:00--  https://github.com/univai-ghf

In [None]:

def pick_in(obj_name):
    """Load and return the object stored in a pickle file.

    Parameters
    ----------
    obj_name : str or path-like
        Path of the pickle file to read.

    Returns
    -------
    object
        Whatever ``pickle.load`` reconstructs from the file.
    """
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle even if unpickling raises.
    with open(obj_name, "rb") as pickle_in:
        return pickle.load(pickle_in)

In [None]:
# File names of the pickled objects; each is loaded into the variable of the same name.
list_objs = ["df_all_train2","y_train1","df_all_test2","y_test1"]

# Bind the loaded objects to explicit names instead of writing into globals(),
# so the variables that later cells rely on are visible in the source.
df_all_train2, y_train1, df_all_test2, y_test1 = [pick_in(name) for name in list_objs]

In [None]:
def auc1_scr(mod1,test_set,actual1):
    """Return the area under the ROC curve for a fitted classifier.

    Parameters
    ----------
    mod1 : str or estimator
        Either an object exposing ``predict_proba``, or a string that
        evaluates to such an object (e.g. ``"ab"``), as in the original
        call style.
    test_set : array-like
        Features, passed unchanged to ``predict_proba``.
    actual1 : array-like
        True labels, passed to ``roc_curve``.

    Returns
    -------
    The value of ``auc(fpr, tpr)`` computed from column 1 of the
    ``predict_proba`` output.
    """
    # SECURITY: eval executes arbitrary code. The string form is kept only for
    # backward compatibility; prefer passing the estimator object itself.
    mod = eval(mod1) if isinstance(mod1, str) else mod1
    pred1 = mod.predict_proba(test_set)[:, 1]
    fpr, tpr, _ = roc_curve(actual1, pred1)
    return auc(fpr, tpr)

In [None]:
# AdaBoost Classifier

# Fit AdaBoost with 100 estimators and a fixed random_state on the training data.
ab = AdaBoostClassifier(n_estimators=100, random_state=0)
ab.fit(df_all_train2,y_train1)

AdaBoostClassifier(n_estimators=100, random_state=0)

In [None]:
# ROC AUC of the AdaBoost model on the test set and on the training set
# (auc1_scr resolves the model from its variable name).
auc1_te = auc1_scr("ab",df_all_test2,y_test1)
auc1_tr = auc1_scr("ab",df_all_train2,y_train1)

In [None]:
# Display (test AUC, train AUC) for AdaBoost.
auc1_te,auc1_tr

(0.6239087454925901, 0.6283670987340262)

# Grid Search

In [None]:
# This will take around 1hr+ to execute on standard colab runtime
# AB_grid= AdaBoostClassifier(random_state=42)

# params = {
#     'n_estimators': [100,500],
#     'learning_rate': [0.2,0.5,1],
#     'algorithm': ['SAMME','SAMME.R'],
#     'base_estimator' : [DecisionTreeClassifier(max_depth=1),DecisionTreeClassifier(max_depth=2),DecisionTreeClassifier(max_depth=5)]
# }

# grid_search = GridSearchCV(estimator=AB_grid, 
#                            param_grid=params, 
#                            cv=2, n_jobs=5, verbose=1, scoring = "roc_auc")


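# NOTE: fit the search on the training split; tuning on the test split leaks it into model selection.
# grid_search.fit(df_all_train2,y_train1)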

In [None]:
# score_df = pd.DataFrame(grid_search.cv_results_)
# score_df.head()
# score_df.sort_values(["rank_test_score"]).head(5)

# Gradient Boosting

In [None]:
# GradientBoosting Classifier

# It will take around 9 mins for execution
# NOTE(review): no random_state is passed here, unlike the AdaBoost cell —
# confirm whether run-to-run reproducibility is required.
gb = GradientBoostingClassifier(max_depth=5,n_estimators=300, learning_rate=0.5)
gb.fit(df_all_train2,y_train1)


GradientBoostingClassifier(learning_rate=0.5, max_depth=5, n_estimators=300)

In [None]:
# ROC AUC of the gradient boosting model on the test set and on the training set.
auc1_te = auc1_scr("gb",df_all_test2,y_test1)
auc1_tr = auc1_scr("gb",df_all_train2,y_train1)

In [None]:
# Display (test AUC, train AUC) for gradient boosting.
auc1_te,auc1_tr

(0.9026801872014071, 0.9488784003834644)

In [None]:
# XGB Classifier

# It will take around 4 mins for execution
# Baseline XGBoost model: constructed with no arguments, so every hyperparameter is the library default.
xgb = XGBClassifier()
xgb.fit(df_all_train2,y_train1)

XGBClassifier()

In [None]:
# ROC AUC of the default XGBoost model on the test set and on the training set.
auc1_te = auc1_scr("xgb",df_all_test2,y_test1)
auc1_tr = auc1_scr("xgb",df_all_train2,y_train1)

In [None]:
# Display (test AUC, train AUC) for the default XGBoost model.
auc1_te,auc1_tr

(0.663948910348972, 0.6733339608819728)

In [None]:
# Relative class weights; assumes a binary target where index 0 is the
# negative class and index 1 the positive class — TODO confirm.
class_weights = [0.1,0.9]

# XGBClassifier has no `class_weights` argument, so passing it does not weight
# the classes. For binary targets the supported setting is `scale_pos_weight`,
# the weight of positive samples relative to negative ones.
xgb_param = XGBClassifier(n_estimators=300,max_depth= 5,
                          scale_pos_weight=class_weights[1] / class_weights[0],
                          subsample= 0.2,colsample_bytree= 0.3,random_state=0)


xgb_param.fit(df_all_train2,y_train1)

XGBClassifier(class_weights=[0.1, 0.9], colsample_bytree=0.3, max_depth=5,
              n_estimators=300, subsample=0.2)

In [None]:
# ROC AUC of the tuned XGBoost model on the test set and on the training set.
auc1_te = auc1_scr("xgb_param",df_all_test2,y_test1)
auc1_tr = auc1_scr("xgb_param",df_all_train2,y_train1)

In [None]:
# Display (test AUC, train AUC) for the tuned XGBoost model.
auc1_te,auc1_tr

(0.7972238724769605, 0.8106333886227974)