# Machine learning validation
* Reference
  * https://blog.amedama.jp/entry/2018/07/23/084500

In [None]:
import pandas as pd
from sklearn.datasets import load_iris

# Load the bundled iris dataset and wrap it in pandas containers:
# the four measurements become a DataFrame, the class labels a Series.
iris = load_iris()
iris_x = pd.DataFrame(
    data=iris.data,
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)
iris_y = pd.Series(data=iris.target)

# Rich-display both objects (features first, then labels).
display(iris_x, iris_y)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Length: 150, dtype: int64

In [None]:
# Hold-out split: keep 30% of the samples aside as a test set.
from sklearn.model_selection import train_test_split

# random_state pins the shuffle so the split (and every score computed from it)
# is the same after Restart & Run All; stratify keeps the class proportions of
# iris_y in both partitions, matching the StratifiedKFold used further down.
train_x, test_x, train_y, test_y = train_test_split(
    iris_x,
    iris_y,
    test_size=0.3,
    shuffle=True,
    random_state=0,
    stratify=iris_y,
)

In [None]:
import xgboost as xgb

# XGBoost classifier created with no arguments, i.e. with the library's
# default hyperparameters; later cells fit and tune this estimator.
clf = xgb.XGBClassifier()

In [None]:
from sklearn import model_selection

# Fit the classifier on the hold-out training split. fit() trains the
# estimator in place, so `clf` already refers to the fitted model; the
# trailing semicolon only keeps the notebook from echoing the estimator repr.
clf.fit(train_x, train_y.to_numpy());

In [None]:
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.model_selection import GridSearchCV

## Method 1: simple k-fold cross-validation

In [None]:
# Macro-averaged metrics to collect; each key becomes a "test_<key>" entry
# in the dictionary returned by cross_validate.
scoring = dict(p="precision_macro", r="recall_macro", f="f1_macro")

# Stratified 3-fold splitter with a fixed seed so the folds are repeatable.
skfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# cross_validate is more convenient than cross_val_score here: it evaluates
# several metrics at once and also reports timings.
scores = cross_validate(estimator=clf, X=iris_x, y=iris_y, scoring=scoring, cv=skfold)

# Show the raw per-fold arrays, then a "name: mean, std" line for each entry.
display(scores)
summary_line = "{}: {:.2f}, {:.2f}"
for name, fold_values in scores.items():
    print(summary_line.format(name, fold_values.mean(), fold_values.std()))

{'fit_time': array([0.0242281 , 0.02018285, 0.01956892]),
 'score_time': array([0.00362301, 0.00345182, 0.0034802 ]),
 'test_p': array([0.95955882, 0.91911765, 0.94212963]),
 'test_r': array([0.95955882, 0.91911765, 0.94117647]),
 'test_f': array([0.95955882, 0.91911765, 0.94112554])}

fit_time: 0.02, 0.00
score_time: 0.00, 0.00
test_p: 0.94, 0.02
test_r: 0.94, 0.02
test_f: 0.94, 0.02


## Method 2: k-fold cross-validation with grid search


In [None]:
# Tune max_depth / n_estimators with a grid search that is scored by
# stratified 3-fold cross validation on the hold-out training split.
skfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
clf_cv = GridSearchCV(
    estimator=clf,
    param_grid={'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 200]},
    cv=skfold,
    verbose=1,
)
clf_cv.fit(train_x, train_y.values)

print(f'clf_cv.best_params_: {clf_cv.best_params_}')
print(f'clf_cv.best_score_: {clf_cv.best_score_}')

# GridSearchCV refits the best parameter combination on all of the data passed
# to fit() (refit=True is the default), so the tuned model is already available
# as best_estimator_ and does not need to be trained a second time.
clf = clf_cv.best_estimator_
clf

Fitting 3 folds for each of 9 candidates, totalling 27 fits
clf_cv.best_params_: {'max_depth': 2, 'n_estimators': 100}
clf_cv.best_score_: 0.9333333333333332


XGBClassifier(max_depth=2, objective='multi:softprob')

## Method 3: nested k-fold cross-validation with grid search


In [None]:
# Nested cross validation: the grid search (inner loop) is itself the
# estimator that cross_validate evaluates (outer loop).
scoring = dict(p="precision_macro", r="recall_macro", f="f1_macro")

# NOTE(review): the same splitter object drives both loops — confirm that
# sharing it between the inner and outer loop is intended.
skfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
clf_cv = GridSearchCV(
    estimator=clf,
    param_grid={'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 200]},
    cv=skfold,
    verbose=1,
)

# Every outer fold triggers a full grid search on that fold's training part.
scores = cross_validate(estimator=clf_cv, X=iris_x, y=iris_y, scoring=scoring, cv=skfold)

# Show the raw per-fold arrays, then a "name: mean, std" line for each entry.
display(scores)
report_format = "{}: {:.2f}, {:.2f}"
for metric, per_fold in scores.items():
    print(report_format.format(metric, per_fold.mean(), per_fold.std()))

Fitting 3 folds for each of 9 candidates, totalling 27 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Fitting 3 folds for each of 9 candidates, totalling 27 fits


{'fit_time': array([0.58187461, 0.574301  , 0.59132195]),
 'score_time': array([0.00520134, 0.00421238, 0.00345945]),
 'test_p': array([0.95955882, 0.91911765, 0.96078431]),
 'test_r': array([0.95955882, 0.91911765, 0.96078431]),
 'test_f': array([0.95955882, 0.91911765, 0.96078431])}

fit_time: 0.58, 0.01
score_time: 0.00, 0.00
test_p: 0.95, 0.02
test_r: 0.95, 0.02
test_f: 0.95, 0.02
