## 다양한 교차검증
- model_selection 모듈
  - cross_val_score()
  - cross_validate()
  - cross_val_predict()

In [84]:
# 모듈로드
from sklearn.model_selection import cross_val_predict,cross_val_score,cross_validate,train_test_split,KFold,StratifiedKFold
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd

In [85]:
# Load the iris dataset; the returned object is indexed by key further down.
iris = load_iris()

In [86]:
# Separate the inputs from the target labels.
data, target = iris['data'], iris['target']
# Keep the feature names and class names alongside for inspection.
featureName, className = iris['feature_names'], iris['target_names']

In [87]:
# Inspect the feature names and class names. Only the value of the final
# expression is echoed below the cell, so the recorded output shows
# className alone.
featureName
className

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [88]:
# Create the model.
# max_iter bounds how many iterations the solver may run while fitting.
lrmodel = LogisticRegression(max_iter=500)

In [89]:
# Train and validate the model with cross-validation.
# cv is left at its default; the recorded result holds five scores,
# one per fold.
result = cross_val_score(estimator=lrmodel, X=data, y=target)

In [90]:
# Accuracy of each of the five models fitted during cross-validation.
result

array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])

In [91]:
# Compute accuracy again with ten folds (one fitted model per fold).
lrmodel = LogisticRegression(max_iter=500)
result = cross_val_score(estimator=lrmodel, X=data, y=target, cv=10)
result

array([1.        , 0.93333333, 1.        , 1.        , 0.93333333,
       0.93333333, 0.93333333, 1.        , 1.        , 1.        ])

In [92]:
# Seven-fold cross-validation that also reports the training-fold scores.
allresult = cross_validate(
    estimator=lrmodel,
    X=data,
    y=target,
    cv=7,
    return_train_score=True,
)

In [93]:
# Tabulate the cross-validation results, one row per fold, and display them.
result_df = pd.DataFrame(allresult)
result_df

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.019875,0.000532,0.954545,0.96875
1,0.015098,0.0,1.0,0.96875
2,0.015088,0.0,0.909091,0.976562
3,0.0151,0.000998,0.952381,0.976744
4,0.0147,0.0,0.952381,0.992248
5,0.014004,0.000988,1.0,0.968992
6,0.015957,0.001,1.0,0.976744


In [94]:
# The folds can also be supplied as a splitter object instead of an integer.
# KFold splitter: 7 folds with shuffling. random_state fixes the shuffle so
# the fold assignment is the same on every run; without it each execution
# of this cell evaluates a different split.
ksplitter = KFold(n_splits=7, shuffle=True, random_state=42)
allresult = cross_validate(
    lrmodel,
    data,
    target,
    cv=ksplitter,
    return_train_score=True,
)
# Tabulate the per-fold results and display them.
result_df = pd.DataFrame(allresult)
result_df

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.023586,0.000997,1.0,0.96875
1,0.0231,0.000575,1.0,0.976562
2,0.028151,0.0,1.0,0.96875
3,0.025389,0.00034,0.952381,0.96124
4,0.015978,0.0,0.952381,0.976744
5,0.017979,0.001002,0.857143,0.992248
6,0.022506,0.0,1.0,0.976744


In [95]:
# The folds can also be supplied as a splitter object instead of an integer.
# StratifiedKFold splitter: 7 folds with shuffling. random_state fixes the
# shuffle so the fold assignment is the same on every run; without it each
# execution of this cell evaluates a different split.
ssplitter = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
# return_train_score: also return the scores on the training folds.
# return_estimator: whether to return the estimator fitted on each split.
allresult = cross_validate(
    lrmodel,
    data,
    target,
    cv=ssplitter,
    return_train_score=True,
    return_estimator=True,
)
# Tabulate the per-fold results and display them.
result_df = pd.DataFrame(allresult)
result_df

Unnamed: 0,fit_time,score_time,estimator,test_score,train_score
0,0.026354,0.0,LogisticRegression(max_iter=500),1.0,0.976562
1,0.028018,0.0,LogisticRegression(max_iter=500),1.0,0.96875
2,0.025377,0.000708,LogisticRegression(max_iter=500),0.954545,0.960938
3,0.02673,0.000413,LogisticRegression(max_iter=500),1.0,0.968992
4,0.017944,0.00099,LogisticRegression(max_iter=500),1.0,0.976744
5,0.015069,0.001189,LogisticRegression(max_iter=500),0.904762,0.992248
6,0.012748,0.0,LogisticRegression(max_iter=500),0.904762,0.984496
