# 교차검증

### 과적합 : 모델이 학습 데이터에만 과도하게 최적화된 현상.
- 그로 인해 일반화된 데이터에서는 예측 성능이 과하게 떨어지는 현상
- 지난번 와인 맛 평가에서 훈련용 데이터의 Acc는 72.94%,

### 테스트용 데이터는 Acc가 71.61%였는데, 누가 이 결과가 정말 괜찮은 것인지 묻는다면?
- 교차검증을 이용
- 나에게 주어진 데이터에 적용한 모델의 성능을 정확히 표현하기 위해서도 유용하다

![nn](img/k-fold.png)

In [3]:
import numpy as np
import pandas as pd

# Source CSVs for the red and white wine-quality datasets (semicolon-separated).
red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'

red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')

# Tag every row with its origin so the two tables can be told apart after merging.
red_wine['color'] = 1
white_wine['color'] = 0

# Stack red on top of white. ignore_index=True gives the merged frame a fresh
# 0..n-1 index; without it both halves keep their own 0-based labels, so the
# same label would refer to one red row and one white row. Row order is
# unchanged, so positional (iloc / k-fold) results are unaffected.
wine = pd.concat([red_wine, white_wine], ignore_index=True)

### 데이터셋 split

In [4]:
# Binary target: 1.0 when the quality score is above 5, otherwise 0.0.
wine['taste'] = np.where(wine['quality'] > 5, 1., 0.)

# Target column, and the feature matrix without the target or the score it
# was derived from.
y = wine['taste']
X = wine.drop(columns=['taste', 'quality'])

In [8]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


# Hold out 20% of the rows for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=13
)

# Depth-2 decision tree fitted on the training part only.
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13).fit(X_train, y_train)

# Predictions on both parts so train and test accuracy can be compared.
y_pred_tr, y_pred_test = wine_tree.predict(X_train), wine_tree.predict(X_test)

print('Train acc: ', accuracy_score(y_train, y_pred_tr))
print('Test acc: ', accuracy_score(y_test, y_pred_test))

Train acc:  0.7294593034442948
Test acc:  0.7161538461538461


### 여기서 잠깐, 그러니까 누가, “데이터를 저렇게 분리하는 것이 최선인건가?”
- “저 acc를 어떻게 신뢰할 수 있는가?” 라고 묻는다면~

### k-fold 이용한 cross validation이 등장함.

In [20]:
from sklearn.model_selection import KFold

# Declare the k-fold splitter
kfold = KFold(n_splits=5) # 5 folds are used more often than 3

# Declare the decision tree model
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

### KFold는 index를 반환
- k-fold 이용해서 데이터셋 나눔

In [22]:
# Split X into train/test index arrays with k-fold and show, for every fold,
# the two sizes followed by the first ten training indices.
for train_idx, test_idx in kfold.split(X):
    print(f'{len(train_idx)} {len(test_idx)}', train_idx[:10], sep='\n')

5197 1300
[1300 1301 1302 1303 1304 1305 1306 1307 1308 1309]
5197 1300
[0 1 2 3 4 5 6 7 8 9]
5198 1299
[0 1 2 3 4 5 6 7 8 9]
5198 1299
[0 1 2 3 4 5 6 7 8 9]
5198 1299
[0 1 2 3 4 5 6 7 8 9]


### 5개의 fold를 각각 학습
- train_idx, test_idx 안에는 각각 데이터셋(row)의 인덱스 값이 들어있음.
- 이걸로 X에서 iloc 하면 rows 가져올 수 있음
- 총 fold가 5이므로 acc도 5개 존재함.

In [11]:
# One accuracy value per fold.
cv_accuracy = []

for train_idx, test_idx in kfold.split(X):
    # Positional indices from the splitter -> rows of X / y for this fold.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Refit the same estimator on this fold's training rows, then score it
    # on the rows that were held out.
    pred = wine_tree_cv.fit(X_train, y_train).predict(X_test)
    cv_accuracy.append(accuracy_score(y_test, pred))
cv_accuracy

[0.6007692307692307,
 0.6884615384615385,
 0.7090069284064665,
 0.7628945342571208,
 0.7867590454195535]

### 모델의 정확도가 0.60 ~ 0.79까지 범위가 넓음
- 5개의 평균을 대표값으로 사용

In [12]:
# Average the per-fold accuracies into a single representative score.
np.mean(cv_accuracy)

0.709578255462782

### StratifiedKFold를 쓰고자 한다면?

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# 5-fold StratifiedKFold splitter and a fresh depth-2 tree for this experiment.
skfold = StratifiedKFold(n_splits=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

In [14]:
# One accuracy value per stratified fold.
cv_accuracy = []

# Unlike KFold, the stratified splitter also needs y to build the folds.
for train_idx, test_idx in skfold.split(X, y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Refit on this fold's training rows and score on its held-out rows.
    pred = wine_tree_cv.fit(X_train, y_train).predict(X_test)
    cv_accuracy.append(accuracy_score(y_test, pred))
cv_accuracy

[0.5523076923076923,
 0.6884615384615385,
 0.7143956889915319,
 0.7321016166281755,
 0.7567359507313318]

### stratified k-fold 이용하니 정확도 더 나빠짐
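#### 왜?
- (추정) 데이터를 red → white 순서로 이어 붙인 뒤 섞지 않았고, KFold/StratifiedKFold도 기본값이 shuffle=False라서 첫 번째 fold의 검증 데이터가 앞쪽 행(주로 red wine)으로 채워짐 → 해당 fold 점수가 특히 낮게 나오는 것으로 보임. shuffle=True로 바꿔 확인해 볼 것.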

In [15]:
# Average of the stratified per-fold accuracies.
np.mean(cv_accuracy)

0.6888004974240539

# * cross validation 관련 라이브러리 존재함. 이게 더 편함

In [17]:
from sklearn.model_selection import cross_val_score

# Same experiment as the manual fold loop, delegated to cross_val_score.
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)
skfold = StratifiedKFold(n_splits=5)

# scoring is left at its default (None).
acc = cross_val_score(wine_tree_cv, X, y, cv=skfold)
acc.mean()

0.6888004974240539

### depth를 2에서 5로 높이면 정확도는 오히려 떨어짐 (0.6888 → 0.6663).

In [18]:
# Repeat the cross-validation with a deeper tree (max_depth=5); the skfold
# splitter from the earlier cell is reused.
wine_tree_cv = DecisionTreeClassifier(max_depth=5, random_state=13)
acc = cross_val_score(wine_tree_cv, X, y, scoring=None, cv=skfold)
np.mean(acc)

0.6663391958311127

### cross_validate
- 과적합 현상이 발생 중
- train에서는 정확도가 73~75%인 데 반해
- test에서는 정확도가 55~75% 이다.

In [24]:
from sklearn.model_selection import cross_validate

# Build the depth-2 tree explicitly instead of relying on whatever
# wine_tree_cv was last bound to: read top to bottom, the previous cell leaves
# it at max_depth=5, while the scores recorded for this cell are the depth-2
# ones (they match the earlier stratified 5-fold run).
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=13)

# return_train_score=True adds the per-fold training scores next to the test
# scores, which makes the train/test gap visible.
cross_validate(wine_tree_cv, X, y, scoring=None, cv=skfold, return_train_score=True)

{'fit_time': array([0.00699854, 0.00698209, 0.00698447, 0.00598764, 0.0069828 ]),
 'score_time': array([0.00198317, 0.00199389, 0.00099373, 0.00199008, 0.00099564]),
 'test_score': array([0.55230769, 0.68846154, 0.71439569, 0.73210162, 0.75673595]),
 'train_score': array([0.74773908, 0.74696941, 0.74317045, 0.73509042, 0.73258946])}

---

---

# 하이퍼파라미터 튜닝

- 하이퍼파라미터 튜닝 : 모델이 학습으로 얻는 값이 아니라 사용자가 직접 지정하는 설정값(예: 결정나무의 max_depth)을 바꿔 가며 성능이 가장 좋은 값을 찾는 과정
- feature engineering : 특성을 관찰하여 더 나은 성능을 위해 새로운 특성을 찾아내는 과정

![nn](img/hyper_param.png)

In [25]:
import numpy as np
import pandas as pd

# Source CSVs for the red and white wine-quality datasets (semicolon-separated).
red_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/ML_tutorial/master/dataset/winequality-white.csv'

red_wine = pd.read_csv(red_url, sep=';')
white_wine = pd.read_csv(white_url, sep=';')

# Tag every row with its origin so the two tables can be told apart after merging.
red_wine['color'] = 1
white_wine['color'] = 0

# Stack red on top of white. ignore_index=True gives the merged frame a fresh
# 0..n-1 index; without it both halves keep their own 0-based labels, so the
# same label would refer to one red row and one white row. Row order is
# unchanged, so positional (iloc / k-fold) results are unaffected.
wine = pd.concat([red_wine, white_wine], ignore_index=True)

# Binary target: 1.0 when the quality score is above 5, otherwise 0.0.
wine['taste'] = [1. if grade > 5 else 0. for grade in wine['quality']]
# Features are every column except the target and the score it was derived from.
X = wine.drop(['taste', 'quality'], axis=1)
y = wine['taste']

### Grid Search CV
- 최적의 하이퍼파라미터 찾아주는 과정
- CV: Cross Validation

In [41]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidate tree depths for the grid search.
params = {'max_depth' : [2, 4, 7, 10]}
# Base estimator handed to the search.
# NOTE(review): presumably its max_depth=2 is overridden by each candidate in params — confirm.
wine_tree = DecisionTreeClassifier(max_depth=2, random_state=13)

In [42]:
# Fit the tree classifier defined above for each max_depth candidate using 5-fold CV (very easy)
grid_search = GridSearchCV(estimator=wine_tree, param_grid=params, cv=5)
grid_search.fit(X, y)

### pprint 이용해 grid search cv의 결과 확인

In [31]:
import pprint

# Pretty-print the full cv_results_ dict with a 4-space indent so the
# per-split and summary arrays are readable.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(grid_search.cv_results_)

{   'mean_fit_time': array([0.00719695, 0.01177454, 0.02273993, 0.03491073]),
    'mean_score_time': array([0.00178885, 0.00139017, 0.00159492, 0.00159917]),
    'mean_test_score': array([0.6888005 , 0.66356523, 0.65340854, 0.64401587]),
    'param_max_depth': masked_array(data=[2, 4, 7, 10],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object),
    'params': [   {'max_depth': 2},
                  {'max_depth': 4},
                  {'max_depth': 7},
                  {'max_depth': 10}],
    'rank_test_score': array([1, 2, 3, 4]),
    'split0_test_score': array([0.55230769, 0.51230769, 0.50846154, 0.51615385]),
    'split1_test_score': array([0.68846154, 0.63153846, 0.60307692, 0.60076923]),
    'split2_test_score': array([0.71439569, 0.72363356, 0.68360277, 0.66743649]),
    'split3_test_score': array([0.73210162, 0.73210162, 0.73672055, 0.71054657]),
    'split4_test_score': array([0.75673595, 0.7182448 , 0.73518091, 0.72517321]),
    'std

### 최적의 파라미터 찾기

In [32]:
# Best estimator, its mean CV score, and the winning parameters, one per line.
print(
    grid_search.best_estimator_,
    grid_search.best_score_,
    grid_search.best_params_,
    sep='\n',
)

DecisionTreeClassifier(max_depth=2, random_state=13)
0.6888004974240539
{'max_depth': 2}


### 파이프라인(StandardScaler + 결정나무)에 Grid Search를 적용해 최적의 파라미터 찾기

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

# Two-step pipeline: a StandardScaler step named 'scaler', then a decision tree step named 'clf'.
estimators = [('scaler', StandardScaler()), 
              ('clf', DecisionTreeClassifier(random_state=13))]
pipe = Pipeline(estimators)

# The 'clf__' prefix addresses the max_depth parameter of the step named 'clf'.
param_grid = {'clf__max_depth' : [2, 4, 7, 10]}
# NOTE(review): GridSearch is an instance, not a class; the PascalCase name is
# kept because later cells refer to it.
GridSearch = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
GridSearch.fit(X, y)

### 이때가 제일 좋은 모델.

In [38]:
# Show the pipeline that the search selected as best.
print(GridSearch.best_estimator_)

Pipeline(steps=[('scaler', StandardScaler()),
                ('clf', DecisionTreeClassifier(max_depth=2, random_state=13))])


In [39]:
# Best score found by the pipeline grid search.
print(GridSearch.best_score_)

0.6888004974240539


In [40]:
# Dump the raw cv_results_ dict of the pipeline grid search.
print(GridSearch.cv_results_)

{'mean_fit_time': array([0.00884061, 0.01355677, 0.02486634, 0.03679085]), 'std_fit_time': array([0.0007786 , 0.00048318, 0.00077313, 0.00196682]), 'mean_score_time': array([0.00139413, 0.00180302, 0.00179482, 0.00199971]), 'std_score_time': array([4.90997200e-04, 4.02780839e-04, 3.98637119e-04, 1.29165672e-05]), 'param_clf__max_depth': masked_array(data=[2, 4, 7, 10],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'clf__max_depth': 2}, {'clf__max_depth': 4}, {'clf__max_depth': 7}, {'clf__max_depth': 10}], 'split0_test_score': array([0.55230769, 0.51230769, 0.50846154, 0.51615385]), 'split1_test_score': array([0.68846154, 0.63153846, 0.60461538, 0.60230769]), 'split2_test_score': array([0.71439569, 0.72363356, 0.68206313, 0.66589684]), 'split3_test_score': array([0.73210162, 0.73210162, 0.73672055, 0.71054657]), 'split4_test_score': array([0.75673595, 0.7182448 , 0.73518091, 0.72517321]), 'mean_test_score': array([0.6888005

# 표로 성능을 정리하기

In [44]:
import pandas as pd

# Turn the grid-search log into a DataFrame and display only the summary
# columns: parameter set, rank, mean test score and its standard deviation.
score_df = pd.DataFrame(GridSearch.cv_results_)
score_df.loc[:, ['params', 'rank_test_score', 'mean_test_score', 'std_test_score']]

Unnamed: 0,params,rank_test_score,mean_test_score,std_test_score
0,{'clf__max_depth': 2},1,0.6888,0.071799
1,{'clf__max_depth': 4},2,0.663565,0.083905
2,{'clf__max_depth': 7},3,0.653408,0.086993
3,{'clf__max_depth': 10},4,0.644016,0.076915
