In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Configure matplotlib so that Korean (Hangul) text renders in figures.

import os

import matplotlib as mpl
import matplotlib.font_manager as fm

# Use the ASCII hyphen for negative numbers instead of the Unicode minus sign.
mpl.rcParams['axes.unicode_minus'] = False

# Preferred font file: Malgun Gothic at its standard Windows location.
path = 'C:/Windows/Fonts/malgun.ttf'

if os.path.exists(path):
    # Resolve the family name from the font file itself.
    font_name = fm.FontProperties(fname=path).get_name()
else:
    # The file is absent (e.g. not running on Windows): fall back to the first
    # Korean-capable family that matplotlib has already discovered, if any.
    installed_families = {font.name for font in fm.fontManager.ttflist}
    korean_candidates = ('Malgun Gothic', 'AppleGothic', 'NanumGothic')
    font_name = next(
        (name for name in korean_candidates if name in installed_families),
        None,
    )

if font_name is not None:
    plt.rc('font', family=font_name)
else:
    # Keep matplotlib's default font rather than failing the whole notebook.
    print('No Korean font found; keeping the default matplotlib font.')

In [4]:
import warnings

# Silence every warning category for the remainder of the session.
warnings.simplefilter('ignore')

# Validation
---

- 2중 검증(train/test)보다는 3중 검증(train/validation/test)이 오차를 조금 줄일 수 있다.
- 하이퍼 파라미터를 test 데이터에 맞춰 최적화하면, 모델이 test 데이터에 과적합될 수 있다.
- 따라서 하이퍼 파라미터를 최적화할 때는 Validation 데이터로 검증하고, 최종적으로 test 데이터로 한 번 더 검증한다.

![Validation](https://t1.daumcdn.net/cfile/tistory/9951E5445AAE1BE025?download)

# Cross Validation
---
- Train 데이터와 Validation 데이터를 서로 cross하면서 검증
- 일반화 성능 측정 방법
- 일반화 수준을 파악할 수 있지만, 비용(시간, 메모리)이 많이 든다.

![Cross Validation](https://blog.kakaocdn.net/dn/3gQO8/btqF0ZOHja8/SUTbGTYwVndcUJ5qWusqa0/img.png)

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset bundled with scikit-learn, then unpack its
# `data` and `target` attributes into x and y.
cancer = load_breast_cancer()
x, y = cancer.data, cancer.target

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Split into train and test parts; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=111,
)

# k-nearest-neighbours classifier that votes among the 5 closest samples.
knn = KNeighborsClassifier(n_neighbors=5)

In [14]:
from sklearn.metrics import classification_report, accuracy_score

In [17]:
# Train on the training split, then predict labels for the test split.
preds = knn.fit(x_train, y_train).predict(x_test)

# Test-set accuracy, shown as the cell output.
# Shortcut: knn.score(x_test, y_test) returns the same mean accuracy in one call.
accuracy_score(y_true=y_test, y_pred=preds)

0.9300699300699301

In [39]:
# `cv` controls how many folds the data is divided into (5 here).
scores = cross_val_score(estimator=knn, X=x_train, y=y_train, cv=5)
scores

array([0.90697674, 0.94117647, 0.91764706, 0.94117647, 0.92941176])

In [27]:
# Average score across the folds.
scores.mean()

0.9279459711224964

In [40]:
# Report the per-fold scores together with their mean and standard deviation.
for label, value in (
    ('교차검증점수 :', scores),
    ('교차검증평균 :', np.mean(scores)),
    ('교차검증편차 :', np.std(scores)),
):
    print(label, value)

교차검증점수 : [0.90697674 0.94117647 0.91764706 0.94117647 0.92941176]
교차검증평균 : 0.9272777017783858
교차검증편차 : 0.01338494629296428


In [30]:
# `cv` controls how many folds the data is divided into (3 here).
scores = cross_val_score(estimator=knn, X=x_train, y=y_train, cv=3)

# Summarise the fold scores before printing them.
score_mean, score_std = np.mean(scores), np.std(scores)
print('교차검증점수 :', scores)
print('교차검증평균 :', score_mean)
print('교차검증편차 :', score_std)

교차검증점수 : [0.93661972 0.91549296 0.93661972]
교차검증평균 : 0.9295774647887324
교차검증편차 : 0.009959250439247135


In [32]:
# Use a splitter class object: allows more fine-grained cross validation

In [41]:
from sklearn.model_selection import KFold

# Pass an explicit splitter object as `cv` instead of a fold count.
# No shuffling is requested here.
kfold = KFold(n_splits=5)
scores = cross_val_score(estimator=knn, X=x_train, y=y_train, cv=kfold)

# Per-fold scores, their mean, and their standard deviation.
summary = (
    ('교차검증점수 :', scores),
    ('교차검증평균 :', np.mean(scores)),
    ('교차검증편차 :', np.std(scores)),
)
for label, value in summary:
    print(label, value)

교차검증점수 : [0.90697674 0.94117647 0.92941176 0.94117647 0.92941176]
교차검증평균 : 0.9296306429548565
교차검증편차 : 0.012489253110435606


In [36]:
# Shuffled cross validation: rows are shuffled before being cut into folds.
# A fixed random_state makes the shuffle, and therefore the scores,
# reproducible on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=111)
scores = cross_val_score(knn, x_train, y_train, cv=kfold)

# Per-fold scores, their mean, and their standard deviation.
print('교차검증점수 :', scores)
print('교차검증평균 :', np.mean(scores))
print('교차검증편차 :', np.std(scores))

교차검증점수 : [0.94186047 0.95294118 0.91764706 0.89411765 0.95294118]
교차검증평균 : 0.9319015047879617
교차검증편차 : 0.02287061693306545


In [37]:
# `scoring` selects the evaluation metric used for every fold.
# 'f1_macro' is the unweighted mean of the per-class F1 scores.
# A fixed random_state makes the shuffled folds, and therefore the scores,
# reproducible on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=111)
scores = cross_val_score(knn, x_train, y_train, cv=kfold, scoring='f1_macro')

# Per-fold scores, their mean, and their standard deviation.
print('교차검증점수 :', scores)
print('교차검증평균 :', np.mean(scores))
print('교차검증편차 :', np.std(scores))

교차검증점수 : [0.90165809 0.91378061 0.8466811  0.93280632 0.93772894]
교차검증평균 : 0.9065310120727241
교차검증편차 : 0.032625195841567554


In [38]:
# 'precision_macro' is the unweighted mean of the per-class precision.
# A fixed random_state makes the shuffled folds, and therefore the scores,
# reproducible on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=111)
scores = cross_val_score(knn, x_train, y_train, cv=kfold, scoring='precision_macro')

# Per-fold scores, their mean, and their standard deviation.
print('교차검증점수 :', scores)
print('교차검증평균 :', np.mean(scores))
print('교차검증편차 :', np.std(scores))

교차검증점수 : [0.96774194 0.89317043 0.93837803 0.91833726 0.8872549 ]
교차검증평균 : 0.9209765121732761
교차검증편차 : 0.029688748042841565
