In [1]:
import numpy as np

from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the digits dataset and split it into the feature matrix (X)
# and the target labels (y) used by both searches below.
digits = load_digits()
X = digits.data
y = digits.target

In [3]:
# build a classifier
# A fixed random_state makes the forest's internal randomness (bootstrap
# draws, per-split feature subsets) repeatable, so the cross-validation
# scores reported by the searches are the same on every fresh run.
clf = RandomForestClassifier(n_estimators=20, random_state=0)

In [4]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Must provide the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params', each a sequence indexable by
        candidate position (the layout of a search's ``cv_results_``).
        Plain lists are accepted as well as NumPy arrays.
    n_top : int, default=3
        Ranks 1..n_top are reported. Every candidate sharing a rank is
        printed; a rank that no candidate holds prints nothing.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    # Coerce once: comparing a plain list with an int yields a single
    # False rather than an element-wise mask, which would make the
    # function silently print nothing.
    ranks = np.asarray(results['rank_test_score'])

    for i in range(1, n_top + 1):
        for candidate in np.flatnonzero(ranks == i):
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [5]:
# Search space for the randomized search. List entries are discrete
# choices; sp_randint(low, high) is an integer distribution over
# low .. high - 1 from which a value is drawn per candidate.
param_dist = {
    "max_depth": [3, None],
    "max_features": sp_randint(1, 11),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}



In [6]:
# run randomized search
n_iter_search = 20
# random_state pins which n_iter_search candidates are drawn from
# param_dist, so a fresh run evaluates the same parameter settings
# instead of a new random sample each time.
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    random_state=0,
)

# Time only the fit (sampling + cross-validation + final refit).
start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates parameters string" % ((time() - start), n_iter_search))

# Show the three best-ranked candidates.
report(random_search.cv_results_)



RandomizedSearchCV took 6.55 seconds for 20 candidates parameters string
Model with rank: 1
Mean validation score: 0.924 (std: 0.008)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 6, 'min_samples_leaf': 1, 'min_samples_split': 10}

Model with rank: 2
Mean validation score: 0.923 (std: 0.009)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 4, 'min_samples_leaf': 3, 'min_samples_split': 6}

Model with rank: 3
Mean validation score: 0.921 (std: 0.005)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 4, 'min_samples_leaf': 5, 'min_samples_split': 6}



In [7]:
# Exhaustive grid: the grid search evaluates every combination of the
# values listed here (2 * 3 * 3 * 3 * 2 * 2 = 216 candidates).
param_grid = {
    "max_depth": [3, None],
    "max_features": [1, 3, 10],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

In [8]:
# Fit a grid search over param_grid, timing the fit, then summarise
# how long it took, how many candidates were tried, and the best ones.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid)

start = time()
grid_search.fit(X, y)
elapsed = time() - start

# One entry in cv_results_["params"] per evaluated candidate.
n_candidates = len(grid_search.cv_results_["params"])
print("GridSearchCV took %.2f seconds for %d candidates parameters setting"
      % (elapsed, n_candidates))

report(grid_search.cv_results_)


GridSearchCV took 68.15 seconds for 216 candidates parameters setting
Model with rank: 1
Mean validation score: 0.938 (std: 0.002)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}

Model with rank: 2
Mean validation score: 0.937 (std: 0.006)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 3}

Model with rank: 3
Mean validation score: 0.930 (std: 0.011)
Parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}

