In [1]:
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
import numpy as np
df['Glucose']=np.where(df['Glucose']==0,df['Glucose'].median(),df['Glucose'])
df['Insulin']=np.where(df['Insulin']==0,df['Insulin'].median(),df['Insulin'])
df['SkinThickness']=np.where(df['SkinThickness']==0,df['SkinThickness'].median(),df['SkinThickness'])
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


Independent And Dependent feature

In [5]:
X = df.drop('Outcome',axis=1)
y = df['Outcome']
print(X.head())
print(y.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6    148.0             72           35.0     30.5  33.6   
1            1     85.0             66           29.0     30.5  26.6   
2            8    183.0             64           23.0     30.5  23.3   
3            1     89.0             66           23.0     94.0  28.1   
4            0    137.0             40           35.0    168.0  43.1   

   DiabetesPedigreeFunction  Age  
0                     0.627   50  
1                     0.351   31  
2                     0.672   32  
3                     0.167   21  
4                     2.288   33  
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64


## Train Test Split

In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=3)

In [7]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier=RandomForestClassifier(n_estimators=10).fit(X_train,y_train)
prediction=rf_classifier.predict(X_test)

In [8]:
y.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [9]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[74 18]
 [28 34]]
0.7012987012987013
              precision    recall  f1-score   support

           0       0.73      0.80      0.76        92
           1       0.65      0.55      0.60        62

    accuracy                           0.70       154
   macro avg       0.69      0.68      0.68       154
weighted avg       0.70      0.70      0.70       154



# Manual Hyperparameter Tuning

In [10]:
model=RandomForestClassifier(n_estimators=500,criterion='entropy',
                            max_features='sqrt',min_samples_leaf=10,random_state=100).fit(X_train,y_train)
predictions=model.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(accuracy_score(y_test,predictions))
print(classification_report(y_test,prediction))

[[81 11]
 [28 34]]
0.7467532467532467
              precision    recall  f1-score   support

           0       0.73      0.80      0.76        92
           1       0.65      0.55      0.60        62

    accuracy                           0.70       154
   macro avg       0.69      0.68      0.68       154
weighted avg       0.70      0.70      0.70       154



# Randomized Search Cv

In [11]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

#Number of trees in random forest
n_estimators=[int(x)for x in np.linspace(start = 200, stop = 2000, num = 10)]

#Number of feature to consider at every split
max_features = ['auto','sqrt','log2']

#Maximum number of leaves in tree
max_depth = [int(x)for x in np.linspace(10,1000,10)]

#Minimun number of samples required to split a node
min_samples_split = [1,3,3,5,7,9]

#Minimum number of sample required at each of leaf node
min_samples_leaf = [1,2,4,6,8]

#Create the random grid
random_grid = {'n_estimators': n_estimators,
              'max_features':max_features,
              'max_depth':max_depth,
              'min_samples_split':min_samples_split,
              'min_samples_leaf':min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [1, 3, 3, 5, 7, 9], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [12]:
rf=RandomForestClassifier()
rf_randomcv=RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,
                              random_state=100,n_jobs=-1)
rf_randomcv.fit(X_train,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [13]:
rf_randomcv.best_params_

{'n_estimators': 600,
 'min_samples_split': 3,
 'min_samples_leaf': 6,
 'max_features': 'auto',
 'max_depth': 230,
 'criterion': 'gini'}

In [14]:
rf_randomcv.best_estimator_

In [15]:
best_random_grid=rf_randomcv.best_estimator_

In [16]:
from sklearn.metrics import accuracy_score
y_pred=best_random_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Clalssification report: {}".format(classification_report(y_test,y_pred)))

[[79 13]
 [27 35]]
Accuracy Score 0.7402597402597403
Clalssification report:               precision    recall  f1-score   support

           0       0.75      0.86      0.80        92
           1       0.73      0.56      0.64        62

    accuracy                           0.74       154
   macro avg       0.74      0.71      0.72       154
weighted avg       0.74      0.74      0.73       154



## Grid Search SV

In [17]:
rf_randomcv.best_params_

{'n_estimators': 600,
 'min_samples_split': 3,
 'min_samples_leaf': 6,
 'max_features': 'auto',
 'max_depth': 230,
 'criterion': 'gini'}

In [18]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion':[rf_randomcv.best_params_['criterion']],
    'max_depth':[rf_randomcv.best_params_['max_depth']],
    'max_features':[rf_randomcv.best_params_['max_features']],
    'min_samples_leaf':[rf_randomcv.best_params_['min_samples_leaf'],
                        rf_randomcv.best_params_['min_samples_leaf']+2,
                         rf_randomcv.best_params_['min_samples_leaf']+4],
    'min_samples_split':[rf_randomcv.best_params_['min_samples_split'] -2,
                         rf_randomcv.best_params_['min_samples_split'] -1,
                          rf_randomcv.best_params_['min_samples_split'],
                           rf_randomcv.best_params_['min_samples_split'] +1,
                            rf_randomcv.best_params_['min_samples_split'] +2],
     'n_estimators':[rf_randomcv.best_params_['n_estimators'] - 200, rf_randomcv.best_params_['n_estimators']-100,
                     rf_randomcv.best_params_['n_estimators'],
                      rf_randomcv.best_params_['n_estimators'] + 100,rf_randomcv.best_params_['n_estimators'] + 200] 
}

print(param_grid)

{'criterion': ['gini'], 'max_depth': [230], 'max_features': ['auto'], 'min_samples_leaf': [6, 8, 10], 'min_samples_split': [1, 2, 3, 4, 5], 'n_estimators': [400, 500, 600, 700, 800]}


In [19]:
1*1*1*3*5*5

75

### fit the grid search to the data

In [None]:
rf=RandomForestClassifier()
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
grid_search.fit(X_train,y_train)

Fitting 10 folds for each of 75 candidates, totalling 750 fits


In [None]:
grid_search.best_estimator_

In [49]:
best_grid=grid_search.best_estimator_
best_grid

In [48]:
y_pred=best_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score: {}".format(classification_report(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

[[79 13]
 [25 37]]
Accuracy Score:               precision    recall  f1-score   support

           0       0.76      0.86      0.81        92
           1       0.74      0.60      0.66        62

    accuracy                           0.75       154
   macro avg       0.75      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154

Classification report:               precision    recall  f1-score   support

           0       0.76      0.86      0.81        92
           1       0.74      0.60      0.66        62

    accuracy                           0.75       154
   macro avg       0.75      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154



In [26]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [40]:
space={'criterion':hp.choice('criterion',['entropy','gini']),
      'max_depth':hp.quniform('max_depth',10,1200,10),
      'max_features':hp.choice('max_features',['auto','sqrt','log2',None]),
      'min_samples_leaf':hp.uniform('min_samples_leaf',0,0.5),
      'min_samples_split':hp.uniform('min_samples_split',0,1),
      'n_estimators':hp.choice('n_estimators',[10,50,300,750,1200,1300,1500])
      }
space

{'criterion': <hyperopt.pyll.base.Apply at 0x21b40a68a60>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x21b40a81ab0>,
 'max_features': <hyperopt.pyll.base.Apply at 0x21b40a819f0>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x21b40a824a0>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x21b40a822f0>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x21b40a82260>}

In [42]:
def objective(space):
    model=RandomForestClassifier(criterion=space['criterion'],max_depth=space['max_depth'],
                                max_features=space['max_features'],
                                min_samples_leaf=space['min_samples_leaf'],
                                min_samples_split=space['min_samples_split'],
                                n_estimators=space['n_estimators'],
                                )
    accuracy = cross_val_score(model,X_train,y_train,cv=5).mean()
    
    #We aim to maximize accuracy, therefore we return it as a negative value
    return{'loss': -accuracy,'status':STATUS_OK}

In [53]:
from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn = objective,
           space = space,
           algo = tpe.suggest,
           max_evals = 80,
           trials = trials)
print(best)

  0%|                                                                           | 0/80 [00:00<?, ?trial/s, best loss=?]

job exception: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\python37\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\python37\lib\site-packages\sklearn\ensemble\_forest.py", line 476, in fit
    trees = Parallel(
  File "C:\python37\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\python37\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\python37\lib\site-packages\joblib\parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  F

  0%|                                                                           | 0/80 [00:01<?, ?trial/s, best loss=?]


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\python37\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\python37\lib\site-packages\sklearn\ensemble\_forest.py", line 476, in fit
    trees = Parallel(
  File "C:\python37\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\python37\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\python37\lib\site-packages\joblib\parallel.py", line 779, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\python37\lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
    result = ImmediateResult(func)
  File "C:\python37\lib\site-packages\joblib\_parallel_backends.py", line 572, in __init__
    self.results = batch()
  File "C:\python37\lib\site-packages\joblib\parallel.py", line 262, in __call__
    return [func(*args, **kwargs)
  File "C:\python37\lib\site-packages\joblib\parallel.py", line 262, in <listcomp>
    return [func(*args, **kwargs)
  File "C:\python37\lib\site-packages\sklearn\utils\fixes.py", line 117, in __call__
    return self.function(*args, **kwargs)
  File "C:\python37\lib\site-packages\sklearn\ensemble\_forest.py", line 189, in _parallel_build_trees
    tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
  File "C:\python37\lib\site-packages\sklearn\tree\_classes.py", line 969, in fit
    super().fit(
  File "C:\python37\lib\site-packages\sklearn\tree\_classes.py", line 238, in fit
    check_scalar(
  File "C:\python37\lib\site-packages\sklearn\utils\validation.py", line 1452, in check_scalar
    raise TypeError(
TypeError: max_depth must be an instance of int, not float.


## Genetic Algorithms

In [54]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

#Number of trees in random forest
n_estimators=[int(x)for x in np.linspace(start = 200, stop = 2000, num = 10)]

#Number of feature to consider at every split
max_features = ['auto','sqrt','log2']

#Maximum number of leaves in tree
max_depth = [int(x)for x in np.linspace(10,1000,10)]

#Minimun number of samples required to split a node
min_samples_split = [2,5,10,14]

#Minimum number of sample required at each of leaf node
min_samples_leaf = [1,2,4,6,8]

#Create the random grid
random_grid = {'n_estimators': n_estimators,
              'max_features':max_features,
              'max_depth':max_depth,
              'min_samples_split':min_samples_split,
              'min_samples_leaf':min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8], 'criterion': ['entropy', 'gini']}


In [56]:
from tpot import TPOTClassifier

tpot_classifier = TPOTClassifier(generations=5,population_size=24,offspring_size=12,
                                verbosity=2,early_stop=12,
                                config_dict={'sklearn.ensemble.RandomForestClassifier':param},
                                cv=4,scoring='accuracy')
tpot_classifier.fit(X_train,y_train)

NameError: name 'param' is not defined

In [58]:
accuracy=tpot_classifier.score(X_test,y_test)
print(accuracy)

NameError: name 'tpot_classifier' is not defined

## Optimize Hyperparameter using Optuna