In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

data = pd.read_csv('./diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
import numpy as np
data['Glucose'] = np.where(data['Glucose']==0,data['Glucose'].median(),data['Glucose'])

In [4]:
data['Insulin'] = np.where(data['Insulin']==0,data['Insulin'].median(),data['Insulin'])

In [5]:
X = data.drop('Outcome',axis=1)
y = data['Outcome']

In [6]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35,30.5,33.6,0.627,50
1,1,85.0,66,29,30.5,26.6,0.351,31
2,8,183.0,64,0,30.5,23.3,0.672,32
3,1,89.0,66,23,94.0,28.1,0.167,21
4,0,137.0,40,35,168.0,43.1,2.288,33


In [7]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.8,test_size=0.2,random_state=6)

In [8]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=500,criterion='entropy',max_features='sqrt',min_samples_leaf=10).fit(X_train,y_train)
pred = clf.predict(X_test)

In [9]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

print(accuracy_score(y_true=y_test,y_pred=pred))
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

0.7922077922077922
[[91 12]
 [20 31]]
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       103
           1       0.72      0.61      0.66        51

    accuracy                           0.79       154
   macro avg       0.77      0.75      0.76       154
weighted avg       0.79      0.79      0.79       154



<b>Random Search</b>

In [10]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(200,2000,10)]

max_features = ['auto','sqrt','log2']

max_depth = [int(x) for x in np.linspace(10,1000,10)]

min_samples_split = [2,5,10,14]

min_samples_leaf = [1,2,4,6,8]

random_grid = {
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth' : max_depth,
    'min_samples_split' : min_samples_split,
    'min_samples_leaf' : min_samples_leaf,
    'criterion' : ['entropy','gini']
}

random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt', 'log2'],
 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000],
 'min_samples_split': [2, 5, 10, 14],
 'min_samples_leaf': [1, 2, 4, 6, 8],
 'criterion': ['entropy', 'gini']}

In [11]:
rf = RandomForestClassifier()
rf_RandomCV = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,random_state=24,n_jobs=-1)

rf_RandomCV.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [12]:
rf_RandomCV.best_estimator_

In [13]:
rf_RandomCV.best_params_

{'n_estimators': 1200,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 120,
 'criterion': 'gini'}

In [14]:
print(accuracy_score(rf_RandomCV.predict(X_test),y_test))

0.7727272727272727


In [16]:
print(classification_report(rf_RandomCV.predict(X_test),y_test))

              precision    recall  f1-score   support

           0       0.84      0.82      0.83       106
           1       0.63      0.67      0.65        48

    accuracy                           0.77       154
   macro avg       0.74      0.74      0.74       154
weighted avg       0.78      0.77      0.77       154



<b>Grid Search</b>

In [17]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion':[rf_RandomCV.best_params_['criterion']],
    'max_features':[rf_RandomCV.best_params_['max_features']],
    'max_depth':[rf_RandomCV.best_params_['max_depth']],
    'min_samples_leaf':[rf_RandomCV.best_params_['min_samples_leaf'],
                        rf_RandomCV.best_params_['min_samples_leaf']+2,rf_RandomCV.best_params_['min_samples_leaf']+4],
    'min_samples_split':[rf_RandomCV.best_params_['min_samples_split']-2,
                         rf_RandomCV.best_params_['min_samples_split']-1,
                         rf_RandomCV.best_params_['min_samples_split'],
                         rf_RandomCV.best_params_['min_samples_split']+1,
                         rf_RandomCV.best_params_['min_samples_split']+2],
    'n_estimators':[rf_RandomCV.best_params_['n_estimators']-200,
                    rf_RandomCV.best_params_['n_estimators']-100,
                    rf_RandomCV.best_params_['n_estimators'],
                    rf_RandomCV.best_params_['n_estimators']+100,
                    rf_RandomCV.best_params_['n_estimators']-200],
}

In [18]:
param_grid

{'criterion': ['gini'],
 'max_features': ['sqrt'],
 'max_depth': [120],
 'min_samples_leaf': [1, 3, 5],
 'min_samples_split': [0, 1, 2, 3, 4],
 'n_estimators': [1000, 1100, 1200, 1300, 1000]}

In [19]:
rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
grid_search.fit(X_train,y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


In [20]:
accuracy_score(y_pred=grid_search.predict(X_test),y_true=y_test)

0.7792207792207793

<b>Bayesian Optimizer</b>

In [21]:
from skopt import BayesSearchCV
from skopt.space import Integer,Real,Categorical

params = {
    'n_estimators':Integer(50,300),
    'max_depth':Integer(3,20),
    'min_samples_leaf':Integer(1,5),
    'min_samples_split':Integer(2,10),
    'criterion':Categorical(['gini','entropy','log_loss'])
}

In [22]:
rf = RandomForestClassifier()

bayes_rf = BayesSearchCV(estimator=rf,search_spaces=params,n_iter=50,cv=5,n_jobs=-1,random_state=6,scoring='accuracy')

bayes_rf.fit(X_train,y_train)

In [23]:
bayes_rf.best_params_

OrderedDict([('criterion', 'entropy'),
             ('max_depth', 20),
             ('min_samples_leaf', 2),
             ('min_samples_split', 2),
             ('n_estimators', 58)])

In [24]:
bayes_rf.best_score_

0.7687325069972012

In [28]:
accuracy_score(y_pred=bayes_rf.predict(X_test),y_true=y_test)

0.7792207792207793