<a href="https://colab.research.google.com/github/the9kim/Machine-Deep-Learning-Practice/blob/main/CrossValidationAndGridSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. Utilizing the Validation Set

In [173]:
import pandas as pd

# Source CSV with the wine measurements (alcohol, sugar, pH) and the class label.
WINE_CSV_URL = 'https://bit.ly/wine_csv_data'

wine = pd.read_csv(WINE_CSV_URL)

# Summarize column names, dtypes, and non-null counts to confirm nothing is missing.
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   alcohol  6497 non-null   float64
 1   sugar    6497 non-null   float64
 2   pH       6497 non-null   float64
 3   class    6497 non-null   float64
dtypes: float64(4)
memory usage: 203.2 KB


In [174]:
from sklearn.model_selection import train_test_split

# Feature matrix (three measurements) and label vector as NumPy arrays.
feature_columns = ['alcohol', 'sugar', 'pH']
data = wine[feature_columns].to_numpy()
target = wine['class'].to_numpy()

# Hold out 20% of the samples as the final test set.
train_input, test_input, train_target, test_target = train_test_split(
    data, target, test_size=0.2, random_state=42
)

# Split the remaining training data once more into a sub-training set and a validation set (80/20).
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42
)

print(sub_input.shape, val_input.shape)


(4157, 3) (1040, 3)


In [175]:
from sklearn.tree import DecisionTreeClassifier

# Fit an unconstrained decision tree on the sub-training set only.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

# Print the score on the sub-training set first, then on the validation set;
# a large gap between the two indicates overfitting.
for features, labels in ((sub_input, sub_target), (val_input, val_target)):
    print(dt.score(features, labels))

0.9971133028626413
0.864423076923077


# 1. Cross Validation

In [176]:
from sklearn.model_selection import cross_validate

# Cross-validate the tree on the whole training set instead of a single validation split.
# The result is a dict holding per-fold 'fit_time', 'score_time', and 'test_score' arrays.
scores = cross_validate(dt, train_input, train_target)

print(scores)


{'fit_time': array([0.00794053, 0.00791121, 0.00840759, 0.00812364, 0.0089159 ]), 'score_time': array([0.00127864, 0.00155735, 0.00140643, 0.00185704, 0.00128388]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [177]:
import numpy as np

# The overall cross-validation score is the mean of the per-fold test scores.
print(np.mean(scores['test_score']))

0.855300214703487


In [178]:
# 5-fold cross-validation with an explicit StratifiedKFold splitter.
# With its default arguments StratifiedKFold() does NOT shuffle the training set
# (pass shuffle=True for that), which is why the mean score is identical to the
# one obtained from cross_validate with its default splitter.
from sklearn.model_selection import StratifiedKFold

scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
print(np.mean(scores['test_score']))


0.855300214703487


In [179]:
# 10-fold cross-validation: shuffle=True mixes the samples before splitting,
# and random_state=42 keeps that shuffling reproducible.

splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))


0.8574181117533719


# 2. Hyperparameter Tuning - Grid Search


## 2-1. Single Hyperparameter


In [180]:
from sklearn.model_selection import GridSearchCV

# Candidate values to try for the tree's 'min_impurity_decrease' hyperparameter.
params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}

# n_jobs=-1 lets the search use all available CPU cores.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)

In [181]:
# The default value of the 'cv' parameter is 5, so the grid search uses 5-fold cross-validation.
# Each of the 5 candidate parameter values is evaluated on every one of the 5 folds,
# so the grid search trains 25 (= 5 x 5) models in total during cross-validation.

gs.fit(train_input, train_target)

In [182]:
# After the 25 cross-validation fits, the grid search trains the model once more
# with the best parameter combination; that model is exposed as 'best_estimator_'.

dt = gs.best_estimator_
print(dt.score(train_input, train_target))

0.9615162593804117


In [183]:
# Parameter combination selected by the grid search.
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [184]:
# Mean cross-validation score of each candidate, one entry per parameter value tried.
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [185]:
# NumPy's 'argmax()' returns the index of the greatest value within the array;
# that index selects the matching entry of cv_results_['params'].

best_index = np.argmax(gs.cv_results_['mean_test_score'])
print(gs.cv_results_['params'][best_index])

{'min_impurity_decrease': 0.0001}


## 2-2. Multiple Hyperparameters


In [186]:
# NumPy's 'arange()' creates an array of evenly spaced numbers and accepts real (float) steps.
# Python's 'range()' produces a sequence of integers (a range object, not an array).
# Both describe half-open intervals: the start value is included, the stop value is excluded.

params= {
    'min_impurity_decrease' : np.arange(0.0001, 0.001, 0.0001),
    'max_depth' : range(5, 20, 1),
    'min_samples_split' : range(2, 100, 10)
}


In [187]:
# Cross-validate every combination in 'params' (n_jobs=-1 uses all CPU cores).
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)

In [188]:
# Best combination found across all three hyperparameters.
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}


In [189]:
# Highest mean cross-validation score among all candidates.
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


# 3. Hyperparameter Tuning - Random Search


In [190]:
from scipy.stats import uniform, randint

# Frozen discrete distribution over the integers 0 through 9 (the upper bound 10 is excluded).
rgen = randint(0, 10)

# Draw ten random integers from it.
rgen.rvs(size=10)

array([5, 9, 6, 5, 4, 7, 4, 2, 2, 2])

In [191]:
# Count how often each integer appears in 1000 draws to check they are spread evenly.
np.unique(rgen.rvs(1000), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([109,  97, 101,  79,  98,  79, 101, 109, 112, 115]))

In [192]:
# Frozen continuous uniform distribution over the real numbers between 0 and 1.
ugen = uniform(0, 1)

# Draw ten random real numbers from it.
ugen.rvs(size=10)

array([0.49991527, 0.95048456, 0.42566363, 0.76024614, 0.99981963,
       0.09699076, 0.42881374, 0.75600578, 0.48912049, 0.19805687])

In [193]:
# Sampling distributions for the random search.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so 'min_impurity_decrease'
# is drawn from [0.0001, 0.0011].
# NOTE(review): if the intended range was [0.0001, 0.001], the scale should be 0.0009 — confirm.
# randint(low, high) samples integers from low up to high - 1.
params = {
    'min_impurity_decrease' : uniform(0.0001, 0.001),
    'max_depth' : randint(20, 50),
    'min_samples_split' : randint(2, 25),
    'min_samples_leaf' : randint(1, 25)
}

In [194]:
from sklearn.model_selection import RandomizedSearchCV

# n_iter=100 limits the search to 100 sampled parameter combinations;
# random_state=42 makes the sampling reproducible.
gs = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(train_input, train_target)

In [195]:
# Best parameter combination among the sampled candidates.
print(gs.best_params_)

{'max_depth': 39, 'min_impurity_decrease': 0.00034102546602601173, 'min_samples_leaf': 7, 'min_samples_split': 13}


In [196]:
# Highest mean cross-validation score among the sampled candidates.
print(np.max(gs.cv_results_['mean_test_score']))

0.8695428296438884


In [197]:
# Take the model refit with the best parameters and score it on the held-out test set,
# which has not been used for training or for any of the searches above.
dt = gs.best_estimator_
print(dt.score(test_input, test_target))

0.86
