In [2]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split,KFold,cross_val_score,GridSearchCV
from sklearn.datasets import load_iris
import sklearn.metrics as mx
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = load_iris()
X = df.data
y = df.target
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7,random_state=42)

In [4]:
rf = RandomForestClassifier()
"""
Params: 
# n_estimators : Number of trees in Forest, Default --> 100
# n_jobs : No. of jobs to run in parallel
# bootstrap : Whether bootstrap samples are used when building trees. If False, 
              the whole dataset is used to build each tree.
# max_samples(default - True) : If bootstrap is True, the number of samples to draw from 
                                X to train each base estimator
# oob_score(default - False) : Whether to use out-of-bag samples to estimate the generalization accuracy
#Criterian : gini, entropy
#max_depth : Max depth of tree
# min_samples_split : The minimum number of samples required to split an internal node
# min_samples_leaf : The minimum number of samples required to be at a leaf node.
# max_features : The number of features to consider when looking for the best split
# min_weight_fraction_leaf : The minimum weighted fraction of the sum total of weights 
                             (of all the input samples) required to be at a leaf node.
# max_leaf_nodes : Best nodes are defined as relative reduction in impurity
# min_impurity_decrease : A node will be split if this split induces a decrease of the impurity greater than or equal to this value
"""

In [6]:
hyper_params = [{
    'criterion': ['gini','entropy'],
    'n_estimators':[10,50,100,500],
    'bootstrap' : [True,False],
    'oob_score' : [True,False]
}]
folds = KFold(n_splits=5,shuffle=True,random_state=42)
model_cv = GridSearchCV(estimator=rf,
                       param_grid = hyper_params,
                       scoring ='accuracy',
                       cv = folds,
                       verbose=1,
                       return_train_score=True,
                       n_jobs=32)
model_cv.fit(X_train,y_train)
'''
By default joblib.Parallel uses the 'loky' backend module to start separate Python worker processes to execute tasks 
concurrently on separate CPUs. This is a reasonable default for generic Python programs but can induce a significant 
overhead as the input and output data need to be serialized in a queue for communication with the worker processes 
'''

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=32)]: Done 160 out of 160 | elapsed:   38.0s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_scor

In [7]:
print(model_cv.best_score_)
print(model_cv.best_params_)
print(model_cv.best_estimator_)

0.9238095238095237
{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 500, 'oob_score': True}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)


In [8]:
rf = RandomForestClassifier(n_estimators = 500)
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [13]:
print(rf.feature_importances_)
print(rf.classes_)
print(rf.n_classes_)
print(rf.n_features_)  # featuresfrom a dataset
print(rf.n_outputs_) # output will be only 1 class
print(rf.base_estimator_) # print(rf.base_estimator_) 
print(rf.estimators_)# Willgive all estimator output of 500 trees

[0.10645222 0.04015331 0.44199636 0.41139812]
[0 1 2]
3
4
1
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')


In [19]:
print(rf.get_params())
print(rf.score(X_train,y_train))

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 500, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
1.0


In [21]:
print(rf.predict(X_test))
print(rf.predict_proba(X_test))

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]
[[0.    0.996 0.004]
 [0.954 0.044 0.002]
 [0.    0.01  0.99 ]
 [0.    0.978 0.022]
 [0.    0.86  0.14 ]
 [0.978 0.02  0.002]
 [0.    1.    0.   ]
 [0.    0.068 0.932]
 [0.    0.858 0.142]
 [0.002 0.998 0.   ]
 [0.    0.076 0.924]
 [1.    0.    0.   ]
 [0.91  0.088 0.002]
 [1.    0.    0.   ]
 [1.    0.    0.   ]
 [0.016 0.884 0.1  ]
 [0.    0.    1.   ]
 [0.    1.    0.   ]
 [0.    0.992 0.008]
 [0.    0.    1.   ]
 [1.    0.    0.   ]
 [0.    0.098 0.902]
 [1.    0.    0.   ]
 [0.    0.    1.   ]
 [0.004 0.002 0.994]
 [0.    0.022 0.978]
 [0.    0.036 0.964]
 [0.    0.002 0.998]
 [1.    0.    0.   ]
 [1.    0.    0.   ]
 [1.    0.    0.   ]
 [0.956 0.044 0.   ]
 [0.004 0.992 0.004]
 [1.    0.    0.   ]
 [1.    0.    0.   ]
 [0.    0.072 0.928]
 [0.008 0.966 0.026]
 [0.994 0.004 0.002]
 [1.    0.    0.   ]
 [0.996 0.004 0.   ]
 [0.    0.038 0.962]
 [0.018 0.902 0.08 ]
 [0.004 0.988 0.008]
 [0.

In [22]:
rf.decision_path(X_train[1].reshape(1,4))

(<1x8164 sparse matrix of type '<class 'numpy.int64'>'
 	with 2673 stored elements in Compressed Sparse Row format>,
 array([   0,   17,   38,   55,   68,   81,  104,  125,  142,  161,  178,
         197,  212,  225,  246,  259,  272,  287,  308,  325,  342,  357,
         378,  391,  412,  427,  448,  463,  486,  501,  518,  537,  556,
         577,  596,  615,  636,  649,  664,  677,  690,  705,  718,  733,
         752,  767,  776,  801,  826,  843,  862,  881,  898,  903,  920,
         933,  954,  977,  994, 1009, 1022, 1041, 1058, 1069, 1086, 1093,
        1110, 1123, 1144, 1157, 1178, 1197, 1220, 1237, 1252, 1271, 1290,
        1307, 1324, 1345, 1364, 1387, 1398, 1417, 1434, 1447, 1466, 1479,
        1492, 1499, 1520, 1539, 1564, 1583, 1600, 1613, 1620, 1633, 1642,
        1661, 1682, 1697, 1718, 1723, 1744, 1757, 1772, 1785, 1806, 1821,
        1842, 1857, 1878, 1897, 1912, 1933, 1950, 1967, 1984, 1993, 2008,
        2023, 2042, 2057, 2074, 2091, 2114, 2131, 2146, 2157, 2170, 2