# Import all libraries

In [1]:
RANDOM_STATE = 12345

#Manipulate data
import pandas as pd
import numpy as np

#Scale and split data
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix

#Classifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

#Ensembles
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

#Cross-Validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

#Easier to read
from pprint import pprint

#Draw ROC curve
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Read data

In [2]:
df = pd.read_csv("Heart_disease_large.csv")

In [3]:
df.head(5)

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


In [4]:
df = df.rename(columns={'target':'outcome'})

In [5]:
df.head(5)

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,outcome
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


In [6]:
one = 0
zero = 0

for index, row in df.iterrows():
    if row['outcome'] == 0:
        one+=1
    else:
        zero+=1
        
print("Class balance:")
print("1",one)
print("0",zero)

Class balance:
1 561
0 629


In [7]:
df.describe()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,outcome
count,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0,1190.0
mean,53.720168,0.763866,3.232773,132.153782,210.363866,0.213445,0.698319,139.732773,0.387395,0.922773,1.62437,0.528571
std,9.358203,0.424884,0.93548,18.368823,101.420489,0.409912,0.870359,25.517636,0.48736,1.086337,0.610459,0.499393
min,28.0,0.0,1.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,1.0,3.0,120.0,188.0,0.0,0.0,121.0,0.0,0.0,1.0,0.0
50%,54.0,1.0,4.0,130.0,229.0,0.0,0.0,140.5,0.0,0.6,2.0,1.0
75%,60.0,1.0,4.0,140.0,269.75,0.0,2.0,160.0,1.0,1.6,2.0,1.0
max,77.0,1.0,4.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,3.0,1.0


In [8]:
(df==0).sum(axis=0)

age                      0
sex                    281
chest pain type          0
resting bp s             1
cholesterol            172
fasting blood sugar    936
resting ecg            684
max heart rate           0
exercise angina        729
oldpeak                455
ST slope                 1
outcome                561
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  1190 non-null   int64  
 1   sex                  1190 non-null   int64  
 2   chest pain type      1190 non-null   int64  
 3   resting bp s         1190 non-null   int64  
 4   cholesterol          1190 non-null   int64  
 5   fasting blood sugar  1190 non-null   int64  
 6   resting ecg          1190 non-null   int64  
 7   max heart rate       1190 non-null   int64  
 8   exercise angina      1190 non-null   int64  
 9   oldpeak              1190 non-null   float64
 10  ST slope             1190 non-null   int64  
 11  outcome              1190 non-null   int64  
dtypes: float64(1), int64(11)
memory usage: 111.7 KB


# Seperate x and y and scale

In [10]:
x = df.drop('outcome', axis=1)
y = df['outcome']
scaler = StandardScaler()
X = scaler.fit_transform(x) 

# Classifiers

SVM

In [11]:
svmparam = SVC()

#Random values initial
X_svc_train, X_svc_test, y_svc_train, y_svc_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)
param_grid = {'C': [0.01,0.1,1,10,100], 'gamma': [10,1,0.1,0.01,0.001,0.0001],'kernel': ['sigmoid', 'poly', 'linear']}
grid_search = GridSearchCV(estimator = svmparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_svc_train, y_svc_train)
print( "Initial:", grid_search.best_params_)

#1st Round optimizing
param_grid = {'C': [0.008, 0.009, 0.01, 0.02, 0.03], 'gamma': [10,1,0.1,0.01,0.001,0.0001],'kernel': ['sigmoid', 'poly', 'linear']}
grid_search = GridSearchCV(estimator = svmparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_svc_train, y_svc_train)
print( "First round:", grid_search.best_params_)

#2nd Round optimizing
param_grid = {'C': [0.006, 0.007, 0.008, 0.009, 0.01], 'gamma': [10,1,0.1,0.01,0.001,0.0001],'kernel': ['sigmoid', 'poly', 'linear']}
grid_search = GridSearchCV(estimator = svmparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_svc_train, y_svc_train)
print( "Second round:", grid_search.best_params_)

#3rd Round optimizing
param_grid = {'C': [0.004, 0.005, 0.006, 0.007, 0.008], 'gamma': [10,1,0.1,0.01,0.001,0.0001],'kernel': ['sigmoid', 'poly', 'linear']}
grid_search = GridSearchCV(estimator = svmparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_svc_train, y_svc_train)
print( "Third round:", grid_search.best_params_)

#4th Round optimizing
param_grid = {'C': [0.004, 0.005, 0.006, 0.007, 0.008], 'gamma': [0.8, 0.9, 1, 2, 3],'kernel': ['sigmoid', 'poly', 'linear']}
grid_search = GridSearchCV(estimator = svmparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_svc_train, y_svc_train)
print( "Fourth round:", grid_search.best_params_)

#5th Round optimizing
param_grid = {'C': [0.002, 0.003, 0.004, 0.005, 0.006], 'gamma': [0.8, 0.9, 1, 2, 3],'kernel': ['sigmoid', 'poly', 'linear']}
grid_search = GridSearchCV(estimator = svmparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_svc_train, y_svc_train)
print( "Fifth round:", grid_search.best_params_)

Initial: {'C': 0.01, 'gamma': 1, 'kernel': 'poly'}
First round: {'C': 0.008, 'gamma': 1, 'kernel': 'poly'}
Second round: {'C': 0.006, 'gamma': 1, 'kernel': 'poly'}
Third round: {'C': 0.006, 'gamma': 1, 'kernel': 'poly'}
Fourth round: {'C': 0.004, 'gamma': 0.9, 'kernel': 'poly'}
Fifth round: {'C': 0.004, 'gamma': 0.9, 'kernel': 'poly'}


In [12]:
#Without hyperparametering
#svc = SVC(probability=True)

#Accuracy 0.8627450980392157
#Recall: 0.9027027027027027
#Precision 0.8434343434343434
#F1_Score: 0.8720626631853785
#AUC: 0.9242614707730987

#=== Confusion Matrix ===
#[[141  31]
# [ 18 167]]

#With hyperparametering
svc = SVC(probability=True, C=0.004, gamma=0.9, kernel='poly')

svc.fit(X_svc_train, y_svc_train)
y_svc_pred = svc.predict(X_svc_test)

svc_score = svc.predict_proba(X_svc_test)[: ,1]

recall_score_svc = recall_score(y_svc_test, y_svc_pred)
precision_score_svc = precision_score(y_svc_test, y_svc_pred)
f1_score_svc = f1_score(y_svc_test, y_svc_pred)

accuracy_score_svc = accuracy_score(y_svc_test, y_svc_pred)
recall_score_svc = recall_score(y_svc_test, y_svc_pred)
precision_score_svc = precision_score(y_svc_test, y_svc_pred)
f1_score_svc = f1_score(y_svc_test, y_svc_pred)
auc_svc = roc_auc_score(y_svc_test, svc_score)

print("Accuracy", accuracy_score_svc)
print("Recall:", recall_score_svc)
print("Precision", precision_score_svc)
print("F1_Score:", f1_score_svc)
print("AUC:", auc_svc)
print()

print("=== Confusion Matrix ===")
print(confusion_matrix(y_svc_test, y_svc_pred))

Accuracy 0.8543417366946778
Recall: 0.8702702702702703
Precision 0.8518518518518519
F1_Score: 0.8609625668449198
AUC: 0.9020427404148337

=== Confusion Matrix ===
[[144  28]
 [ 24 161]]


Decision Tree

In [13]:
dctparam = DecisionTreeClassifier(random_state = RANDOM_STATE)
X_dtc_train, X_dtc_test, y_dtc_train, y_dtc_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

#Random values initial
param_grid = {'max_depth': [None, 1,2,3,4], 'max_features': [None, 'auto', 'sqrt', 'log2'],
              'min_samples_leaf': [1,2,3,4,5], 'criterion' : ['gini', 'entropy'], 'min_samples_split' : [1,2,3,4,5]}
grid_search = GridSearchCV(estimator = dctparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_dtc_train, y_dtc_train)
print( "Initial:", grid_search.best_params_)

#First round
param_grid = {'max_depth': [None, 1,2,3,4], 'max_features': [None, 'auto', 'sqrt', 'log2'],
              'min_samples_leaf': [1,2,3,4,5], 'criterion' : ['gini', 'entropy'], 'min_samples_split' : [1,2,3,4]}
grid_search = GridSearchCV(estimator = dctparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_dtc_train, y_dtc_train)
print( "First round:", grid_search.best_params_)

Initial: {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
First round: {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [14]:
#Without Hyperparametering

#dtc = DecisionTreeClassifier(random_state = RANDOM_STATE)

#Accuracy 0.8739495798319328
#Recall: 0.9135135135135135
#Precision 0.8535353535353535
#F1_Score: 0.8825065274151437
#AUC: 0.8724544311753615

#=== Confusion Matrix ===
#[[143  29]
# [ 16 169]]


#With Hyperparametering
dtc = DecisionTreeClassifier(random_state = RANDOM_STATE, max_depth=None, max_features=None, criterion = 'gini',
                             min_samples_leaf=1, min_samples_split=2)

dtc.fit(X_dtc_train,y_dtc_train)
y_dtc_pred = dtc.predict(X_dtc_test)

dtc_score = dtc.predict_proba(X_dtc_test)[: , 1]

accuracy_score_dtc = accuracy_score(y_dtc_test, y_dtc_pred)
recall_score_dtc = recall_score(y_dtc_test, y_dtc_pred)
precision_score_dtc = precision_score(y_dtc_test, y_dtc_pred)
f1_score_dtc = f1_score(y_dtc_test, y_dtc_pred)
auc_dtc = roc_auc_score(y_dtc_test, dtc_score)

print("Accuracy", accuracy_score_dtc)
print("Recall:", recall_score_dtc)
print("Precision", precision_score_dtc)
print("F1_Score:", f1_score_dtc)
print("AUC:", auc_dtc)
print()

print("=== Confusion Matrix ===")
print(confusion_matrix(y_dtc_test, y_dtc_pred))

Accuracy 0.8739495798319328
Recall: 0.9135135135135135
Precision 0.8535353535353535
F1_Score: 0.8825065274151437
AUC: 0.8724544311753615

=== Confusion Matrix ===
[[143  29]
 [ 16 169]]


Logistic Regression

In [15]:
lrparam = LogisticRegression()
X_lr_train, X_lr_test, y_lr_train, y_lr_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

#Random values initial
param_grid = {'penalty' : [None, 'l2','l1','elasticnet'], 'C' : [10, 1, 0.1, 0.001, 0.0001], 
              'max_iter' : [250, 500, 1000, 1250, 1500], 'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
grid_search = GridSearchCV(estimator = lrparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_lr_train, y_lr_train)
print( "Initial:", grid_search.best_params_)

#1st Round
param_grid = {'penalty' : [None, 'l2','l1','elasticnet'], 'C' : [10, 1, 0.1, 0.001, 0.0001], 
             'max_iter' : [50,100,150,200,250], 'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
grid_search = GridSearchCV(estimator = lrparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_lr_train, y_lr_train)
print( "First round:", grid_search.best_params_)

#2nd round
param_grid = {'penalty' : [None, 'l2','l1','elasticnet'], 'C' : [10, 1, 0.1, 0.001, 0.0001], 
              'max_iter' : [20,30,40,50], 'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
grid_search = GridSearchCV(estimator = lrparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_lr_train, y_lr_train)
print( "Second round:", grid_search.best_params_)

#3rd round
param_grid = {'penalty' : [None, 'l2','l1','elasticnet'], 'C' : [0.008, 0.009, 0.1, 0.2, 0.3], 
              'max_iter' : [20,30,40,50], 'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
grid_search = GridSearchCV(estimator = lrparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_lr_train, y_lr_train)
print( "Third round:", grid_search.best_params_)

Initial: {'C': 0.1, 'max_iter': 250, 'penalty': 'l1', 'solver': 'saga'}
First round: {'C': 0.1, 'max_iter': 50, 'penalty': 'l1', 'solver': 'saga'}
Second round: {'C': 0.1, 'max_iter': 20, 'penalty': 'l1', 'solver': 'saga'}
Third round: {'C': 0.2, 'max_iter': 20, 'penalty': 'l1', 'solver': 'saga'}


In [16]:
#Without Hyperparametering
#lr = LogisticRegression()

#Accuracy 0.8319327731092437
#Recall: 0.8594594594594595
#Precision 0.8238341968911918
#F1_Score: 0.8412698412698413

#=== Confusion Matrix ===
#[[138  34]
# [ 26 159]]

#With Hyperparametering
#Trying a new level for C (0.4, which gave a better result than 0.2)
lr = LogisticRegression(C = 0.4, max_iter = 20, penalty = 'l1', solver = 'saga')

lr.fit(X_lr_train,y_lr_train)
y_lr_pred = lr.predict(X_lr_test)

accuracy_score_lr = accuracy_score(y_lr_test, y_lr_pred)
recall_score_lr = recall_score(y_lr_test, y_lr_pred)
precision_score_lr = precision_score(y_lr_test, y_lr_pred)
f1_score_lr = f1_score(y_lr_test, y_lr_pred)

print("Accuracy", accuracy_score_lr)
print("Recall:", recall_score_lr)
print("Precision", precision_score_lr)
print("F1_Score:", f1_score_lr)
print()

print("=== Confusion Matrix ===")
print(confusion_matrix(y_lr_test, y_lr_pred))

Accuracy 0.834733893557423
Recall: 0.8648648648648649
Precision 0.8247422680412371
F1_Score: 0.8443271767810027

=== Confusion Matrix ===
[[138  34]
 [ 25 160]]


K-nearest neighbor

In [17]:
knnparam = KNeighborsClassifier()
X_knn_train, X_knn_test, y_knn_train, y_knn_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

#Random values initial
param_grid = {'n_neighbors': [2,3,4,5,6,7,8,9,10], 'weights': ['distance', 'uniform'],
              'algorithm': ['auto','ball_tree', 'kd_tree', 'brute']}
grid_search = GridSearchCV(estimator = knnparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_knn_train, y_knn_train)
print( "Initial:", grid_search.best_params_)

#1st round
param_grid = {'n_neighbors': [10,11,12,13,14,15,16,17,18,19,20], 'weights': ['distance', 'uniform'],
              'algorithm': ['auto','ball_tree', 'kd_tree', 'brute']}
grid_search = GridSearchCV(estimator = knnparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_knn_train, y_knn_train)
print( "Initial:", grid_search.best_params_)

Initial: {'algorithm': 'auto', 'n_neighbors': 10, 'weights': 'distance'}
Initial: {'algorithm': 'auto', 'n_neighbors': 13, 'weights': 'distance'}


In [18]:
#Without Hyperparametering
#knn = KNeighborsClassifier()

#Accuracy 0.8571428571428571
#Recall: 0.8702702702702703
#Precision 0.8563829787234043
#F1_Score: 0.8632707774798928
#AUC: 0.9120993086109366

#=== Confusion Matrix ===
#[[145  27]
# [ 24 161]]

#With Hyperparametering

knn = KNeighborsClassifier(n_neighbors = 13 , weights = 'distance', algorithm = 'auto')

knn.fit(X_knn_train,y_knn_train)
y_knn_pred = knn.predict(X_knn_test)

knn_score = knn.predict_proba(X_knn_test)[: , 1]

accuracy_score_knn = accuracy_score(y_knn_test, y_knn_pred)
recall_score_knn = recall_score(y_knn_test, y_knn_pred)
precision_score_knn = precision_score(y_knn_test, y_knn_pred)
f1_score_knn = f1_score(y_knn_test, y_knn_pred)
auc_knn = roc_auc_score(y_knn_test, knn_score)

print("Accuracy", accuracy_score_knn)
print("Recall:", recall_score_knn)
print("Precision", precision_score_knn)
print("F1_Score:", f1_score_knn)
print("AUC:", auc_knn)
print()

print("=== Confusion Matrix ===")
print(confusion_matrix(y_knn_test, y_knn_pred))

Accuracy 0.9047619047619048
Recall: 0.9243243243243243
Precision 0.8952879581151832
F1_Score: 0.9095744680851063
AUC: 0.9530641106222502

=== Confusion Matrix ===
[[152  20]
 [ 14 171]]


# Ensembles

Random Forest

In [19]:
#Optimizing parameters

rfparam = RandomForestClassifier(random_state=RANDOM_STATE)
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

#Random values initial
param_grid = {'max_depth': [1, 2, 3, 4], 'min_samples_leaf': [1,2,3,4], 
              'criterion' : ['gini', 'entropy'], 
              'min_samples_split' : [1,2,3,4], 'n_estimators' : [25, 50, 100, 150]}
grid_search = GridSearchCV(estimator = rfparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_rf_train, y_rf_train)
print( "Initial:", grid_search.best_params_)

#1st Round
param_grid = {'max_depth': [3,4,5], 'min_samples_leaf': [1,2,3,4], 
              'criterion' : ['gini', 'entropy'], 
              'min_samples_split' : [3,4,5], 'n_estimators' : [25,50,100,150]}
grid_search = GridSearchCV(estimator = rfparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_rf_train, y_rf_train)
print( "First round:", grid_search.best_params_)

#2nd Round
param_grid = {'max_depth': [4,5,6], 'min_samples_leaf': [1,2,3,4], 
              'criterion' : ['gini', 'entropy'], 
              'min_samples_split' : [3,4,5], 'n_estimators' : [25,50,100,150]}
grid_search = GridSearchCV(estimator = rfparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_rf_train, y_rf_train)
print( "Second round:", grid_search.best_params_)

#3rd Round
param_grid = {'max_depth': [5,6,7], 'min_samples_leaf': [1,2,3,4], 
              'criterion' : ['gini', 'entropy'], 
              'min_samples_split' : [3,4,5], 'n_estimators' : [25,50,100,150]}
grid_search = GridSearchCV(estimator = rfparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_rf_train, y_rf_train)
print( "Third round:", grid_search.best_params_)

#4th Round
param_grid = {'max_depth': [6,7,8], 'min_samples_leaf': [1,2,3,4], 
              'criterion' : ['gini', 'entropy'], 
              'min_samples_split' : [3,4,5], 'n_estimators' : [25,50,100,150]}
grid_search = GridSearchCV(estimator = rfparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_rf_train, y_rf_train)
print( "Fourth round:", grid_search.best_params_)

#5th Round
param_grid = {'max_depth': [7,8,9], 'min_samples_leaf': [1,2,3,4], 
              'criterion' : ['gini', 'entropy'], 
              'min_samples_split' : [3,4,5], 'n_estimators' : [25,50,100,150]}
grid_search = GridSearchCV(estimator = rfparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_rf_train, y_rf_train)
print( "Fifth round:", grid_search.best_params_)

#6th Round
param_grid = {'max_depth': [8,9,10], 'min_samples_leaf': [1,2,3,4], 
              'criterion' : ['gini', 'entropy'], 
              'min_samples_split' : [3,4,5], 'n_estimators' : [25,50,100,150]}
grid_search = GridSearchCV(estimator = rfparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_rf_train, y_rf_train)
print( "Sixth round:", grid_search.best_params_)

#7th Round
param_grid = {'max_depth': [9,10,11], 'min_samples_leaf': [1,2,3,4], 
              'criterion' : ['gini', 'entropy'], 
              'min_samples_split' : [3,4,5], 'n_estimators' : [25,50,100,150]}
grid_search = GridSearchCV(estimator = rfparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_rf_train, y_rf_train)
print( "Seventh round:", grid_search.best_params_)

#8th Round
param_grid = {'max_depth': [9,10,11], 'min_samples_leaf': [1,2,3,4], 
              'criterion' : ['gini', 'entropy'], 
              'min_samples_split' : [3,4,5], 'n_estimators' : [25,50,100,150]}
grid_search = GridSearchCV(estimator = rfparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_rf_train, y_rf_train)
print( "Eighth round:", grid_search.best_params_)

#9th Round
param_grid = {'max_depth': [11,12,13], 'min_samples_leaf': [1,2,3,4], 
              'criterion' : ['gini', 'entropy'], 
              'min_samples_split' : [3,4,5], 'n_estimators' : [25,50,100,150]}
grid_search = GridSearchCV(estimator = rfparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_rf_train, y_rf_train)
print( "Ninth round:", grid_search.best_params_)

#10th Round
param_grid = {'max_depth': [11,12,13], 'min_samples_leaf': [1,2,3,4], 
              'criterion' : ['gini', 'entropy'], 
              'min_samples_split' : [3,4,5], 'n_estimators' : [150,200,250,300]}
grid_search = GridSearchCV(estimator = rfparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_rf_train, y_rf_train)
print( "Tenth round:", grid_search.best_params_)

#11th Round
param_grid = {'max_depth': [11,12,13], 'min_samples_leaf': [1,2,3], 
              'criterion' : ['gini', 'entropy'], 
              'min_samples_split' : [3,4,5], 'n_estimators' : [300,350,400,450]}
grid_search = GridSearchCV(estimator = rfparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_rf_train, y_rf_train)
print( "Eleventh round:", grid_search.best_params_)

#12th Round
param_grid = {'max_depth': [11,12,13], 'min_samples_leaf': [1,2,3], 
              'criterion' : ['gini', 'entropy'], 
              'min_samples_split' : [1,2,3,4], 'n_estimators' : [300,350,400,450]}
grid_search = GridSearchCV(estimator = rfparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_rf_train, y_rf_train)
print( "Twelfth round:", grid_search.best_params_)

Initial: {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 100}
First round: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 150}
Second round: {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 100}
Third round: {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 50}
Fourth round: {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 100}
Fifth round: {'criterion': 'gini', 'max_depth': 9, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Sixth round: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 150}
Seventh round: {'criterion': 'entropy', 'max_depth': 11, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 150}
Eighth round: {'criterion': 'entrop

In [20]:
#Without Hyperparametering

#rf = RandomForestClassifier(random_state=RANDOM_STATE)

#Accuracy 0.9159663865546218
#Recall: 0.9567567567567568
#Precision 0.8894472361809045
#F1_Score: 0.9218749999999999
#AUC: 0.9569767441860465

#=== Confusion Matrix ===
#[[150  22]
# [  8 177]]

#With Hyperparametering
rf = RandomForestClassifier(n_estimators = 350, random_state=RANDOM_STATE, 
                           min_samples_leaf=1, min_samples_split=3, max_depth=12, criterion = 'gini')

rf.fit(X_rf_train, y_rf_train)
y_rf_pred = rf.predict(X_rf_test)

rf_score = rf.predict_proba(X_rf_test)[: , 1]

accuracy_score_rf = accuracy_score(y_rf_test, y_rf_pred)
recall_score_rf = recall_score(y_rf_test, y_rf_pred)
precision_score_rf = precision_score(y_rf_test, y_rf_pred)
f1_score_rf = f1_score(y_rf_test, y_rf_pred)
auc_rf = roc_auc_score(y_rf_test, rf_score)

print("Accuracy", accuracy_score_rf)
print("Recall:", recall_score_rf)
print("Precision", precision_score_rf)
print("F1_Score:", f1_score_rf)
print("AUC:", auc_rf)
print()

print("=== Confusion Matrix ===")
print(confusion_matrix(y_rf_test, y_rf_pred))

Accuracy 0.9187675070028011
Recall: 0.9621621621621622
Precision 0.89
F1_Score: 0.9246753246753247
AUC: 0.9538969201759899

=== Confusion Matrix ===
[[150  22]
 [  7 178]]


AdaBoost

In [21]:
abparam = AdaBoostClassifier(random_state=RANDOM_STATE)
X_ab_train, X_ab_test, y_ab_train, y_ab_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

#Random values initial
param_grid = {'n_estimators': [50, 100, 150, 200, 250], 'learning_rate': [1,0.1,0.01,0.001,0.0001]}
grid_search = GridSearchCV(estimator = abparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_ab_train, y_ab_train)
print( "Initial:", grid_search.best_params_)

#1st Round optimizing
param_grid = {'n_estimators': [25, 50, 100, 150, 200], 'learning_rate': [1,0.1,0.01,0.001,0.0001]}
grid_search = GridSearchCV(estimator = abparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_ab_train, y_ab_train)
print( "First round:", grid_search.best_params_)

#2nd Round optimizing
param_grid = {'n_estimators': [25, 50, 100, 150, 200], 'learning_rate': [0.08,0.09, 0.1, 0.2, 0.3]}
grid_search = GridSearchCV(estimator = abparam, param_grid = param_grid, cv=3, n_jobs = -1)
grid_search.fit(X_ab_train, y_ab_train)
print( "Second round:", grid_search.best_params_)

Initial: {'learning_rate': 0.1, 'n_estimators': 100}
First round: {'learning_rate': 0.1, 'n_estimators': 100}
Second round: {'learning_rate': 0.1, 'n_estimators': 100}


In [22]:
#Without Hyperparametering
ab = AdaBoostClassifier(random_state=RANDOM_STATE)

#Accuracy 0.865546218487395
#Recall: 0.8810810810810811
#Precision 0.8624338624338624
#F1_Score: 0.8716577540106951
#AUC: 0.9209302325581395

#=== Confusion Matrix ===
#[[146  26]
# [ 22 163]]

#With Hyperparametering
ab = AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=RANDOM_STATE)

ab.fit(X_ab_train, y_ab_train)
y_ab_pred = ab.predict(X_ab_test)

ab_score = ab.predict_proba(X_ab_test)[: , 1]

accuracy_score_ab = accuracy_score(y_ab_test, y_ab_pred)
recall_score_ab = recall_score(y_ab_test, y_ab_pred)
precision_score_ab = precision_score(y_ab_test, y_ab_pred)
f1_score_ab = f1_score(y_ab_test, y_ab_pred)
auc_ab = roc_auc_score(y_ab_test, ab_score)

print("Accuracy", accuracy_score_ab)
print("Recall:", recall_score_ab)
print("Precision", precision_score_ab)
print("F1_Score:", f1_score_ab)
print("AUC:", auc_ab)
print()
print("=== Confusion Matrix ===")
print(confusion_matrix(y_ab_test, y_ab_pred))

Accuracy 0.8599439775910365
Recall: 0.8810810810810811
Precision 0.8534031413612565
F1_Score: 0.8670212765957447
AUC: 0.9143620364550598

=== Confusion Matrix ===
[[144  28]
 [ 22 163]]
