In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Import GridSearchCV

In [2]:
from sklearn.model_selection import GridSearchCV
#GridSearchCV?

# Initialization

## Dataset

In [3]:
from sklearn import datasets
iris = datasets.load_iris()

## estimator

In [4]:
from sklearn.tree import DecisionTreeClassifier
Model= DecisionTreeClassifier(random_state=42)

## param_grid

In [5]:
#DecisionTreeClassifier?

In [6]:
from random import uniform
param_grid = {
                'min_weight_fraction_leaf' : [uniform(0,0.5) for i in range(2)],
                'criterion': ['gini', 'entropy'],
                'max_depth': range(5,8),
                'min_samples_split': [uniform(0,1) for i in range(2)]     }

## cv

In [7]:
from sklearn.model_selection import StratifiedKFold
Kf = StratifiedKFold(n_splits=5,random_state=2020,shuffle=True)

## fit

In [8]:
%%time
from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(estimator=Model, 
                  param_grid= param_grid,
                  verbose= 1,
                  cv= Kf)
cv.fit(iris.data, iris.target)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
CPU times: user 202 ms, sys: 1.44 ms, total: 204 ms
Wall time: 204 ms


## predict

In [9]:
cv.predict(iris.data)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## scoring 

- Check this link: 
    - https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [10]:
%%time
cv = GridSearchCV(estimator=Model, 
                  param_grid= param_grid,
                  verbose= 1,
                  cv= Kf,
                  scoring = 'accuracy')
cv.fit(iris.data, iris.target)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
CPU times: user 211 ms, sys: 2.93 ms, total: 214 ms
Wall time: 213 ms


## Display

In [11]:
table = pd.DataFrame(cv.cv_results_)
table.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_split,param_min_weight_fraction_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000578,1.417209e-05,0.0,0.0,gini,5,0,0.377965,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",,,,,,,,36
1,0.000574,8.710602e-06,0.0,0.0,gini,5,0,0.265681,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",,,,,,,,17
2,0.000568,4.812513e-06,0.0,0.0,gini,5,1,0.377965,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",,,,,,,,22
3,0.000561,5.761645e-07,0.0,0.0,gini,5,1,0.265681,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",,,,,,,,31
4,0.000664,1.511175e-05,0.000252,5e-06,gini,5,2,0.377965,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.0,7


### Sort best parameters & score

In [12]:
table = pd.DataFrame(cv.cv_results_)[['params', 'mean_test_score', 'rank_test_score','mean_fit_time']]
table.sort_values(by='mean_test_score', ascending=False).head(2)

Unnamed: 0,params,mean_test_score,rank_test_score,mean_fit_time
5,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.94,1,0.000674
11,"{'criterion': 'gini', 'max_depth': 6, 'min_sam...",0.94,1,0.000703


### The BEST parameters & score 

In [13]:
print("Best Parameters: \n{}".format(cv.best_params_)) 
print("Best score is {}".format(cv.best_score_))

Best Parameters: 
{'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.2656807048523776}
Best score is 0.9399999999999998


# Inconvenient

## Huge number of candidates

In [14]:
%%time
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
DecisionTreeClassifier(random_state=42)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from random import uniform
iris = datasets.load_iris()

param_grid = {
                'min_weight_fraction_leaf' : [uniform(0,0.5) for i in range(10)],
                'criterion': ['gini', 'entropy'],
                'max_depth': range(2,18),
                'min_samples_split': [uniform(0,1) for i in range(2)]       }

cv = GridSearchCV(estimator=Model, 
                  param_grid= param_grid,
                  verbose= 1,
                  cv= Kf,
                  scoring = 'accuracy')
cv.fit(iris.data, iris.target)

Fitting 5 folds for each of 4800 candidates, totalling 24000 fits
CPU times: user 30.2 s, sys: 187 ms, total: 30.4 s
Wall time: 30.4 s


# Solution : n_jobs = -1

In [16]:
%%time
cv = GridSearchCV(estimator=Model, 
                  param_grid= param_grid,
                  verbose= 1,
                  cv= Kf,
                  scoring = 'accuracy',
                  n_jobs = -1)
cv.fit(iris.data, iris.target)

Fitting 5 folds for each of 4800 candidates, totalling 24000 fits
CPU times: user 16 s, sys: 359 ms, total: 16.4 s
Wall time: 16 s
