### RandomForestClassifier hyperparameters

n_estimators = The number of trees in the forest.

max_features = The number of features to consider when looking for the best split

bootstrap = Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.

In [167]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [168]:
df = pd.read_csv('heart.csv')

In [169]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [170]:
df.shape

(303, 14)

In [171]:
x = df.iloc[:,0:-1]
y = df.iloc[:,-1]

In [172]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.20,random_state=42)

In [173]:
print(x_train.shape)
print(y_train.shape)

(242, 13)
(242,)


In [174]:
rf = RandomForestClassifier(max_samples=0.75,random_state=42)

In [175]:
rf.fit(x_train,y_train)
pred = rf.predict(x_test) 
accuracy_score(y_test,pred)

0.9016393442622951

In [176]:
# GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# svc
from sklearn.svm import SVC
# LogisticRegression
from sklearn.linear_model import LogisticRegression

In [177]:
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
svc = SVC()
lr = LogisticRegression()

In [178]:
rf.fit(x_train,y_train)
pred = rf.predict(x_test) 
accuracy_score(y_test,pred)

0.8524590163934426

In [179]:
gb.fit(x_train,y_train)
pred = gb.predict(x_test) 
accuracy_score(y_test,pred)

0.7704918032786885

In [180]:
svc.fit(x_train,y_train)
pred = svc.predict(x_test) 
accuracy_score(y_test,pred)

0.7049180327868853

In [181]:
lr.fit(x_train,y_train)
pred = lr.predict(x_test) 
accuracy_score(y_test,pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8852459016393442

In [182]:
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(LogisticRegression(),x,y,cv=10,scoring='accuracy'))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8283870967741935

# gridsearchcv

In [184]:
# number of tree in random forest
n_estimators = [20,60,100,120]

# number of features to consider at every split
max_features = [0.2,0.6,1.0]

# maximum number of levels in tree
max_depth = [2,8,None]

# number of sample
max_samples = [0.5,0.75,1.0]


In [185]:
parameter = { 
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'max_samples':max_samples
}
    

In [186]:
rf = RandomForestClassifier()

In [187]:
from sklearn.model_selection import GridSearchCV

rf_grid = GridSearchCV(rf,param_grid = parameter,cv=5,verbose =2,n_jobs =-1)

In [188]:
rf_grid.fit(x_train,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [189]:
rf_grid.best_params_

{'max_depth': 2, 'max_features': 0.2, 'max_samples': 0.75, 'n_estimators': 120}

In [190]:
rf_grid.best_score_

0.8512755102040817

# RandomSearchCV

In [192]:
# Number of trees in random forest
n_estimators = [20,60,100,120]

# Number of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum number of levels in tree
max_depth = [2,8,None]

# Number of samples
max_samples = [0.5,0.75,1.0]

# Bootstrap samples
bootstrap = [True,False]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

In [193]:
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
              'max_samples':max_samples,
              'bootstrap':bootstrap,
              'min_samples_split':min_samples_split,
              'min_samples_leaf':min_samples_leaf
             }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [194]:
from sklearn.model_selection import RandomizedSearchCV

rf_grid = RandomizedSearchCV(estimator = rf, 
                       param_distributions = param_grid, 
                       cv = 5, 
                       verbose=2, 
                       n_jobs = -1)

In [195]:
rf_grid.fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "D:\anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\anaconda\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "D:\anaconda\Lib\site-packages\sklearn\ensemble\_forest.py", line 433, in fit
    raise ValueError(
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample=None`.

        nan        nan

In [196]:
rf_grid.best_params_

{'n_estimators': 60,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_samples': 1.0,
 'max_features': 0.2,
 'max_depth': 2,
 'bootstrap': True}

In [197]:
rf_grid.best_score_

0.8386054421768707

# OOB Score

In [199]:
rf = RandomForestClassifier(oob_score = True)

In [200]:
rf.fit(x_train,y_train)

In [201]:
rf.oob_score_

0.8140495867768595

In [202]:
pred = rf.predict(x_test)
accuracy_score (y_test , pred)

0.8688524590163934

In [247]:
rf.feature_importances_

array([0.08651977, 0.0425573 , 0.11348579, 0.07240748, 0.08789336,
       0.01008275, 0.02123944, 0.11335917, 0.06060485, 0.11864939,
       0.05619539, 0.1214942 , 0.0955111 ])