# 교차 검증

In [4]:
import numpy as np
from sklearn.model_selection import KFold

# Toy data: four samples with two features each, plus one label per sample.
X = np.array([[1, 2], [3, 4]] * 2)
y = np.arange(1, 5)
kf = KFold(n_splits=2)

# Show the number of folds and the splitter's repr.
print(kf.get_n_splits(X))
print(kf)

# For every fold, print the index arrays and then the rows they select.
for train_idx, test_idx in kf.split(X):
    fold_report = (
        ('--- idx', f'{train_idx} {test_idx}'),
        ('--- train data', X[train_idx]),
        ('--- val data', X[test_idx]),
    )
    for heading, payload in fold_report:
        print(heading)
        print(payload)

2
KFold(n_splits=2, random_state=None, shuffle=False)
--- idx
[2 3] [0 1]
--- train data
[[1 2]
 [3 4]]
--- val data
[[1 2]
 [3 4]]
--- idx
[0 1] [2 3]
--- train data
[[1 2]
 [3 4]]
--- val data
[[1 2]
 [3 4]]


## 이전에 했던 방법

In [5]:
import pandas as pd

# Source CSVs for the red and white wine-quality tables (semicolon-delimited).
red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'

red_wine = pd.read_csv(red_url, sep=";")
white_wine = pd.read_csv(white_url, sep=";")

# Tag every row with its origin before stacking: 1.0 = red, 0.0 = white.
red_wine['color'] = 1.0
white_wine['color'] = 0.0

# Stack the red rows on top of the white rows. ignore_index=True gives the
# combined frame a fresh 0..n-1 index; without it each file keeps its own row
# labels, so the same label would point at both a red and a white row.
wine = pd.concat([red_wine, white_wine], ignore_index=True)

In [7]:
# Binary target: 1.0 when the quality score is above 5, otherwise 0.0.
wine['taste'] = (wine['quality'] > 5).astype(float)

# Features are every column except the target and the score it is derived from.
X = wine.drop(columns=['taste', 'quality'])
y = wine['taste']

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=17
)

# Depth-2 tree fitted on the training portion only.
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=17)
wine_tree.fit(X_train, y_train)

y_pred_tr, y_pred_test = (wine_tree.predict(part) for part in (X_train, X_test))

# Accuracy on the rows the tree was fitted on versus the held-out rows.
for label, truth, guess in (
    ('Train Acc : ', y_train, y_pred_tr),
    ('Test Acc : ', y_test, y_pred_test),
):
    print(label, accuracy_score(truth, guess))

Train Acc :  0.7381181450837021
Test Acc :  0.72


## KFold

In [10]:
from sklearn.model_selection import KFold

# 5-fold splitter with shuffling left at its default (off), so each fold is a
# consecutive range of rows in the order X was assembled.
kfold = KFold(n_splits=5)

# Depth-2 tree that the cross-validation loop below re-fits on every fold.
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=17)

In [12]:
# Report how many rows land in the training and validation part of each fold.
for train_idx, test_idx in kfold.split(X):
    print(train_idx.size, test_idx.size)

5197 1300
5197 1300
5198 1299
5198 1299
5198 1299


In [14]:
# One validation accuracy per fold, in fold order.
cv_accuracy = []

for train_idx, test_idx in kfold.split(X):
    # split() yields positional indices, hence .iloc on both frames.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Re-fit the same estimator object on this fold, then score the held-out part.
    pred = wine_tree_cv.fit(X_train, y_train).predict(X_test)
    cv_accuracy.append(accuracy_score(y_test, pred))

cv_accuracy

[0.6007692307692307,
 0.6884615384615385,
 0.7090069284064665,
 0.7628945342571208,
 0.7867590454195535]

In [15]:
# Average of the per-fold accuracies collected in cv_accuracy.
np.mean(cv_accuracy)

0.709578255462782

## StratifiedKFold

In [16]:
from sklearn.model_selection import StratifiedKFold

# Stratified 5-fold splitter and a fresh depth-2 tree.
skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=17)

# One validation accuracy per fold, in fold order.
cv_accuracy = []

# The stratified splitter is given y as well as X when generating the folds.
for train_idx, test_idx in skfold.split(X, y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Re-fit the same estimator object on this fold, then score the held-out part.
    pred = wine_tree_cv.fit(X_train, y_train).predict(X_test)
    cv_accuracy.append(accuracy_score(y_test, pred))

cv_accuracy

[0.5523076923076923,
 0.6884615384615385,
 0.7143956889915319,
 0.7321016166281755,
 0.7567359507313318]

In [17]:
# Average of the per-fold accuracies from the stratified loop.
np.mean(cv_accuracy)

0.6888004974240539

## Cross Validation (cross_val_score)

In [18]:
from sklearn.model_selection import cross_val_score

# Stratified 5-fold splitter and a fresh depth-2 tree.
skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(random_state=17, max_depth=2)

# One call replaces the manual fold loop; the result is one score per fold.
cross_val_score(
    wine_tree_cv,
    X,
    y,
    cv=skfold,
    scoring=None,
)

array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595])

In [20]:
# Same evaluation with a deeper (depth-5) tree; the splitter in skfold is reused.
wine_tree_cv = DecisionTreeClassifier(random_state=17, max_depth=5)

cross_val_score(
    wine_tree_cv,
    X,
    y,
    cv=skfold,
    scoring=None,
)

array([0.50692308, 0.62615385, 0.69745958, 0.7582756 , 0.74903772])

### 함수로 만들어서 확인하기

In [23]:
def skfold_dt(depth, n_splits=5, random_state=17):
    """Print stratified k-fold cross-validation scores for a decision tree.

    Reads the notebook-level ``X`` and ``y`` and relies on ``StratifiedKFold``,
    ``DecisionTreeClassifier`` and ``cross_val_score`` having been imported by
    earlier cells.

    Parameters
    ----------
    depth : int or None
        Passed to the tree as ``max_depth``.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 17
        Seed handed to the tree.

    Returns
    -------
    None
        The array of per-fold scores is printed, not returned.
    """
    splitter = StratifiedKFold(n_splits=n_splits)
    tree = DecisionTreeClassifier(max_depth=depth, random_state=random_state)

    print(cross_val_score(tree, X, y, scoring=None, cv=splitter))

In [24]:
# Print the stratified cross-validation scores for a depth-3 tree.
skfold_dt(3)

[0.56846154 0.68846154 0.71439569 0.73210162 0.75673595]


In [25]:
# Print the stratified cross-validation scores for a depth-5 tree.
skfold_dt(5)

[0.50692308 0.62615385 0.69745958 0.7582756  0.74903772]


## cross_validate
- train score도 함께 확인할 수 있다.

In [26]:
from sklearn.model_selection import cross_validate

# Uses wine_tree_cv and skfold as left by earlier cells (the most recent
# assignment above builds the tree with max_depth=5). return_train_score=True
# adds a 'train_score' entry next to 'test_score' in the returned dict.
cross_validate(wine_tree_cv, X, y, scoring=None, cv=skfold, return_train_score=True)

{'fit_time': array([0.02044964, 0.02476764, 0.02898741, 0.02587032, 0.02157378]),
 'score_time': array([0.00432491, 0.00441599, 0.00498223, 0.00299001, 0.00202966]),
 'test_score': array([0.50692308, 0.62615385, 0.69745958, 0.7582756 , 0.74903772]),
 'train_score': array([0.78795459, 0.78045026, 0.77568295, 0.76356291, 0.76279338])}

# 하이퍼파라미터 튜닝

In [27]:
import pandas as pd

# Source CSVs for the red and white wine-quality tables (semicolon-delimited).
red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'

red_wine = pd.read_csv(red_url, sep=";")
white_wine = pd.read_csv(white_url, sep=";")

# Tag every row with its origin before stacking: 1.0 = red, 0.0 = white.
red_wine['color'] = 1.0
white_wine['color'] = 0.0

# Stack the red rows on top of the white rows. ignore_index=True gives the
# combined frame a fresh 0..n-1 index; without it each file keeps its own row
# labels, so the same label would point at both a red and a white row.
wine = pd.concat([red_wine, white_wine], ignore_index=True)

# Binary target: 1.0 when the quality score is above 5, otherwise 0.0.
wine['taste'] = (wine['quality'] > 5).astype(float)

# Features are every column except the target and the score it is derived from.
X = wine.drop(columns=['taste', 'quality'])
y = wine['taste']

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidate tree depths to compare.
params = dict(max_depth=[2, 4, 7, 10])
wine_tree = DecisionTreeClassifier(random_state=17, max_depth=2)

# 5-fold cross-validated search over the depths listed in params.
gridsearch = GridSearchCV(wine_tree, params, cv=5)
gridsearch.fit(X, y)

In [30]:
import pprint

# Pretty-print the grid search's cv_results_ dictionary with 4-space indentation.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(gridsearch.cv_results_)

{   'mean_fit_time': array([0.01248422, 0.01751733, 0.02907572, 0.04612923]),
    'mean_score_time': array([0.00382843, 0.00287838, 0.00245838, 0.0026505 ]),
    'mean_test_score': array([0.6888005 , 0.66356523, 0.65448463, 0.6426296 ]),
    'param_max_depth': masked_array(data=[2, 4, 7, 10],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object),
    'params': [   {'max_depth': 2},
                  {'max_depth': 4},
                  {'max_depth': 7},
                  {'max_depth': 10}],
    'rank_test_score': array([1, 2, 3, 4]),
    'split0_test_score': array([0.55230769, 0.51230769, 0.51692308, 0.50846154]),
    'split1_test_score': array([0.68846154, 0.63153846, 0.60538462, 0.61230769]),
    'split2_test_score': array([0.71439569, 0.72363356, 0.68052348, 0.66820631]),
    'split3_test_score': array([0.73210162, 0.73210162, 0.73518091, 0.70207852]),
    'split4_test_score': array([0.75673595, 0.7182448 , 0.73441109, 0.72209392]),
    'std

## 최적의 성능을 가진 모델은?!

In [31]:
# Display the best estimator found by the grid search.
gridsearch.best_estimator_

In [32]:
# Mean cross-validated score of the best parameter setting.
gridsearch.best_score_

0.6888004974240539

In [33]:
# Parameter setting that achieved the best mean cross-validated score.
gridsearch.best_params_

{'max_depth': 2}

## Pipeline을 적용한 모델에 GridSearch 적용하기

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

# Two named steps: scale the features, then fit a decision tree.
estimators = [
    ('scaler', StandardScaler()),
    ('clf', DecisionTreeClassifier(random_state=17)),
]

pipe = Pipeline(steps=estimators)

In [35]:
# Pipeline parameters are addressed as <step name>__<parameter>; 'clf' is the
# decision-tree step of pipe.
param_grid = [{'clf__max_depth': [2, 4, 7, 10]}]

# 5-fold cross-validated search over the tree depth inside the pipeline.
GridSearch = GridSearchCV(pipe, param_grid, cv=5)
GridSearch.fit(X, y)

In [38]:
# Display the best pipeline found by the grid search.
GridSearch.best_estimator_

In [39]:
# Mean cross-validated score of the best pipeline setting.
GridSearch.best_score_

0.6888004974240539

## 예쁘게 표로 정리하기

In [41]:
import pandas as pd

# Turn the cv_results_ dictionary into a table with one row per candidate setting.
score_df = pd.DataFrame(GridSearch.cv_results_)
score_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.014794,0.001733,0.00358,0.000769,2,{'clf__max_depth': 2},0.552308,0.688462,0.714396,0.732102,0.756736,0.6888,0.071799,1
1,0.018173,0.001688,0.003151,0.001314,4,{'clf__max_depth': 4},0.512308,0.631538,0.723634,0.732102,0.718245,0.663565,0.083905,2
2,0.032387,0.000664,0.002256,0.000495,7,{'clf__max_depth': 7},0.516923,0.606154,0.678984,0.735181,0.734411,0.654331,0.083378,3
3,0.04933,0.006725,0.002291,0.000367,10,{'clf__max_depth': 10},0.511538,0.613077,0.666667,0.702079,0.721324,0.642937,0.075316,4


In [42]:
# Keep only the columns needed to compare the candidate settings.
summary_columns = ['params', 'rank_test_score', 'mean_test_score', 'std_test_score']
score_df[summary_columns]

Unnamed: 0,params,rank_test_score,mean_test_score,std_test_score
0,{'clf__max_depth': 2},1,0.6888,0.071799
1,{'clf__max_depth': 4},2,0.663565,0.083905
2,{'clf__max_depth': 7},3,0.654331,0.083378
3,{'clf__max_depth': 10},4,0.642937,0.075316
