# Data inputs and Display Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle

pd.set_option('display.float_format', lambda x: '%.5f' % x)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Modelling Libraries

In [2]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection

from xgboost import XGBClassifier
import pickle
from sklearn.model_selection import GridSearchCV

# Metrics Libraries


In [3]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import f1_score
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

In [4]:
# Acessing the data
!wget "https://github.com/univai-ghf/ghfmedia/raw/main/data/Trees_and_Ensembles/datasets.rar" 

--2022-03-17 04:28:37--  https://github.com/univai-ghf/ghfmedia/raw/main/data/Trees_and_Ensembles/datasets.rar
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/univai-ghf/ghfmedia/main/data/Trees_and_Ensembles/datasets.rar [following]
--2022-03-17 04:28:37--  https://raw.githubusercontent.com/univai-ghf/ghfmedia/main/data/Trees_and_Ensembles/datasets.rar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3508143 (3.3M) [application/octet-stream]
Saving to: ‘datasets.rar’


2022-03-17 04:28:38 (26.4 MB/s) - ‘datasets.rar’ saved [3508143/3508143]



In [5]:
# Extracting the dataset
!mkdir './workshop_trees'
!unrar e './datasets.rar' 'workshop_trees'


UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal


Extracting from ./datasets.rar

Extracting  workshop_trees/y_train1                                        0%  OK 
Extracting  workshop_trees/df_all_test2                                    1%  2%  3%  4%  5%  6%  7%  8%  9% 10% 11% 12% 13% 14% 15% 16% 17% 18% 19% 20% 21% 22% 23% 24% 25% 26% 27% 28% 29% 30% 31% 32% 33%  OK 
Extracting  workshop_trees/df_all_train2                                  34% 35% 36% 37% 38% 39% 40% 41% 42% 43% 44% 45% 46% 47% 48% 49% 50% 51% 52% 53% 54% 55% 56% 57% 58% 59% 60% 61% 62% 63% 64% 65% 66% 67% 68% 69% 70% 71% 72% 73% 74% 75% 76% 77% 78% 79% 80% 81%

In [6]:
fl_out = "./workshop_trees"
def pick_in(obj_name):
    fl_out1 = fl_out + "/" + obj_name
    pickle_in = open(fl_out1,"rb")
    mod1= pickle.load(pickle_in)
    
    return mod1

In [7]:
list_objs = ["df_all_train2","y_train1","df_all_test2","y_test1"]

for i in list_objs:
     globals()[i]= pick_in(i)

In [8]:
def auc1_scr(mod1,test_set,actual1):
    mod = eval(mod1)
    pred1=mod.predict_proba(test_set)[:,1]
    fpr, tpr, thresholds = roc_curve(actual1, pred1)
    auc1 = auc(fpr, tpr)
    
    return auc1

In [9]:
###bagging classifier

bc = BaggingClassifier(random_state=0)
bc.fit(df_all_train2,y_train1)

BaggingClassifier(random_state=0)

In [10]:
auc1_te = auc1_scr("bc",df_all_test2,y_test1)
auc1_tr = auc1_scr("bc",df_all_train2,y_train1)

In [11]:
auc1_te,auc1_tr

(0.9421740207850983, 0.9819321235084637)

In [12]:
kfold = model_selection.KFold(n_splits = 3)
  
results = model_selection.cross_val_score(bc, df_all_test2,y_test1, 
                                          cv = kfold,scoring='roc_auc',n_jobs=-1)
print (results.mean())

0.9379505474051072


In [13]:
###random forest
rf = RandomForestClassifier( random_state=0)
rf.fit(df_all_train2,y_train1)

RandomForestClassifier(random_state=0)

In [14]:
auc1_te = auc1_scr("rf",df_all_test2,y_test1)
auc1_tr = auc1_scr("rf",df_all_train2,y_train1)

In [15]:
auc1_te,auc1_tr

(0.9606192393892811, 0.9823078290676196)

In [16]:
kfold = model_selection.KFold(n_splits = 3)
  
results = model_selection.cross_val_score(rf, df_all_test2,y_test1, 
                                          cv = kfold,scoring='roc_auc',n_jobs=-1)
results.mean()

0.9581447877360906

# Grid Search

In [17]:
rf_grid= RandomForestClassifier(random_state=42)

params = {
    'n_estimators': [50,100],
    'criterion': ["gini", "entropy"],
    'max_samples': [0.2,0.5],
    'max_features':[0.2,0.3]
    
}

grid_search = GridSearchCV(estimator=rf_grid, 
                           param_grid=params, 
                           cv=2, n_jobs=5, verbose=1, scoring = "roc_auc")


grid_search.fit(df_all_test2,y_test1)

Fitting 2 folds for each of 16 candidates, totalling 32 fits


GridSearchCV(cv=2, estimator=RandomForestClassifier(random_state=42), n_jobs=5,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': [0.2, 0.3], 'max_samples': [0.2, 0.5],
                         'n_estimators': [50, 100]},
             scoring='roc_auc', verbose=1)

In [18]:
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_max_samples,param_n_estimators,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,33.74848,3.99911,3.45765,0.01657,gini,0.2,0.2,50,"{'criterion': 'gini', 'max_features': 0.2, 'ma...",0.93319,0.93512,0.93416,0.00097,15
1,56.74678,1.14408,6.87427,0.10086,gini,0.2,0.2,100,"{'criterion': 'gini', 'max_features': 0.2, 'ma...",0.93772,0.93973,0.93873,0.00101,11
2,86.52844,11.37203,3.78584,0.06435,gini,0.2,0.5,50,"{'criterion': 'gini', 'max_features': 0.2, 'ma...",0.95146,0.95314,0.9523,0.00084,7
3,155.72692,0.06561,7.48239,0.15711,gini,0.2,0.5,100,"{'criterion': 'gini', 'max_features': 0.2, 'ma...",0.95335,0.95462,0.95399,0.00064,3
4,41.56309,1.35093,3.46061,0.05105,gini,0.3,0.2,50,"{'criterion': 'gini', 'max_features': 0.3, 'ma...",0.93175,0.93379,0.93277,0.00102,16


In [19]:
score_df.sort_values(["rank_test_score"]).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_max_samples,param_n_estimators,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
11,161.4178,8.42783,7.31812,0.28003,entropy,0.2,0.5,100,"{'criterion': 'entropy', 'max_features': 0.2, ...",0.95478,0.95549,0.95513,0.00035,1
15,110.47519,6.30921,2.40425,0.34359,entropy,0.3,0.5,100,"{'criterion': 'entropy', 'max_features': 0.3, ...",0.95378,0.95472,0.95425,0.00047,2
3,155.72692,0.06561,7.48239,0.15711,gini,0.2,0.5,100,"{'criterion': 'gini', 'max_features': 0.2, 'ma...",0.95335,0.95462,0.95399,0.00064,3
7,222.35422,9.66014,7.71886,0.14716,gini,0.3,0.5,100,"{'criterion': 'gini', 'max_features': 0.3, 'ma...",0.953,0.95413,0.95357,0.00056,4
10,91.69108,14.20568,3.8651,0.00828,entropy,0.2,0.5,50,"{'criterion': 'entropy', 'max_features': 0.2, ...",0.95247,0.95352,0.953,0.00053,5


In [20]:
grid_search.best_estimator_

RandomForestClassifier(criterion='entropy', max_features=0.2, max_samples=0.5,
                       random_state=42)