<a href="https://colab.research.google.com/github/zzhining/ml_basic/blob/main/cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  cross validation


## baseline model

In [7]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# 데이터 불러오기
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['label'] = iris.target

# 독립변수 종속변수 분할
X = iris_df.iloc[:, :4]
y = iris_df['label']

# 학습용 데이터와 테스트용 데이터로 나눔
X_train, X_test, y_train, y_test = train_test_split(X, y)

# DecisionTreeClassifier 객체 생성 
model = DecisionTreeClassifier(random_state=11)

# 학습 수행 
model.fit(X_train, y_train)

#테스트용 데이터로 모델 평가
score = model.score(X_test, y_test)
print('정확도:{:.4f}'.format(score))

정확도:0.9474


## KFold

In [18]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

kfold = KFold(n_splits=5)
i = 0
accuracy_list = []

for train_index, test_index in kfold.split(X):    
    print('[{}번째 학습]'.format(i+1))
    #print('학습용 데이터의 인덱스:{}'.format(train_index))
    print('테스트용 데이터의 인덱스:{}~{}'.format(test_index[0], test_index[-1]))
    
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]

    print('\n학습용 데이터의 각 범주 별 개수')
    print(y_train.value_counts())
        
    print('\n테스트용 데이터의 각 범주 별 개수')
    print(y_test.value_counts())

    #학습
    model.fit(X_train , y_train)
    
    #적용
    pred = model.predict(X_test)
   
    #평가
    accuracy = accuracy_score(y_test,pred)    
    print('\n정확도:{:.4f}'.format(accuracy))
    
    accuracy_list.append(accuracy)
    i = i + 1
    print('-------------------------------------------------------------')

print('{0}번 학습 평균 정확도:{1:.4f}'.format(i, np.mean(accuracy_list))) 

[1번째 학습]
테스트용 데이터의 인덱스:0~29

학습 데이터의 각 범주 별 개수
1    50
2    50
0    20
Name: label, dtype: int64

테스트 데이터의 각 범주 별 개수
0    30
Name: label, dtype: int64

정확도:1.0000
-------------------------------------------------------------
[2번째 학습]
테스트용 데이터의 인덱스:30~59

학습 데이터의 각 범주 별 개수
2    50
1    40
0    30
Name: label, dtype: int64

테스트 데이터의 각 범주 별 개수
0    20
1    10
Name: label, dtype: int64

정확도:0.9667
-------------------------------------------------------------
[3번째 학습]
테스트용 데이터의 인덱스:60~89

학습 데이터의 각 범주 별 개수
0    50
2    50
1    20
Name: label, dtype: int64

테스트 데이터의 각 범주 별 개수
1    30
Name: label, dtype: int64

정확도:0.8667
-------------------------------------------------------------
[4번째 학습]
테스트용 데이터의 인덱스:90~119

학습 데이터의 각 범주 별 개수
0    50
1    40
2    30
Name: label, dtype: int64

테스트 데이터의 각 범주 별 개수
2    20
1    10
Name: label, dtype: int64

정확도:0.9333
-------------------------------------------------------------
[5번째 학습]
테스트용 데이터의 인덱스:120~149

학습 데이터의 각 범주 별 개수
0    50
1    50
2    20
Name: 

## StratifiedKFold

In [19]:
from sklearn.model_selection import StratifiedKFold

skfold = StratifiedKFold(n_splits=3)
i = 0
accuracy_list = []

for train_index, test_index in skfold.split(X, y):
    print('[{}번째 학습]'.format(i+1))
    # print('학습용 데이터:{}'.format(train_index))
    print('테스트용 데이터의 인덱스:{}~{}'.format(test_index[0], test_index[-1]))
    
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    
    print('\n학습용 데이터의 각 범주 별 개수')
    print(y_train.value_counts())
        
    print('\n테스트용 데이터의 각 범주 별 개수')
    print(y_test.value_counts())

    #학습
    model.fit(X_train , y_train)
    
    #적용
    pred = model.predict(X_test)
   
    #평가
    accuracy = accuracy_score(y_test,pred)    
    print('정확도:{:.4f}'.format(accuracy))
    
    accuracy_list.append(accuracy)
    i = i + 1
    print('-------------------------------------------------------------')

print('{0}번 학습 평균 정확도:{1:.4f}'.format(i, np.mean(accuracy_list))) 

[1번째 학습]
테스트용 데이터의 인덱스:0~115

학습용 데이터의 각 범주 별 개수
2    34
0    33
1    33
Name: label, dtype: int64

테스트용 데이터의 각 범주 별 개수
0    17
1    17
2    16
Name: label, dtype: int64
정확도:0.9800
-------------------------------------------------------------
[2번째 학습]
테스트용 데이터의 인덱스:17~132

학습용 데이터의 각 범주 별 개수
1    34
0    33
2    33
Name: label, dtype: int64

테스트용 데이터의 각 범주 별 개수
0    17
2    17
1    16
Name: label, dtype: int64
정확도:0.9200
-------------------------------------------------------------
[3번째 학습]
테스트용 데이터의 인덱스:34~149

학습용 데이터의 각 범주 별 개수
0    34
1    33
2    33
Name: label, dtype: int64

테스트용 데이터의 각 범주 별 개수
1    17
2    17
0    16
Name: label, dtype: int64
정확도:0.9800
-------------------------------------------------------------
3번 학습 평균 정확도:0.9600


## cross_validate

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

# cv: 3개의 train, test set fold 로 나누어 학습 
scores = cross_validate(model, X, y, cv=3, return_train_score=True, return_estimator=True)
sorted(scores.keys())

['estimator', 'fit_time', 'score_time', 'test_score', 'train_score']

In [None]:
scores
# scores['test_score']
# scores['estimator'][0]

{'fit_time': array([0.00199389, 0.00202084, 0.00196314]),
 'score_time': array([0.00099778, 0.        , 0.00099802]),
 'estimator': (DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                         max_depth=None, max_features=None, max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, presort='deprecated',
                         random_state=11, splitter='best'),
  DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                         max_depth=None, max_features=None, max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, presort='deprecated',
                         random_state=11, splitter='b

## GridSearchCV

In [20]:
from sklearn.model_selection import GridSearchCV

dtree = DecisionTreeClassifier()

# parameter를 dictionary 형태로 설정
parameters = {'max_depth':[1,5,10], 'min_samples_split':[3,5]}

# param_grid: 하이퍼파라미터
# cv: 3개의 train, test set fold 로 나누어 학습    
grid_dtree = GridSearchCV(dtree, param_grid=parameters, cv=3)

# param_grid의 하이퍼파라미터들을 순차적으로 학습
grid_dtree.fit(X_train, y_train)

# GridSearchCV 결과 추출하여 DataFrame으로 변환
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004973,0.00211,0.002617,0.00024,1,3,"{'max_depth': 1, 'min_samples_split': 3}",0.676471,0.666667,0.666667,0.669935,0.004622,5
1,0.003417,0.000321,0.002232,0.000133,1,5,"{'max_depth': 1, 'min_samples_split': 5}",0.676471,0.666667,0.666667,0.669935,0.004622,5
2,0.00313,0.000382,0.004595,0.003408,5,3,"{'max_depth': 5, 'min_samples_split': 3}",0.970588,0.939394,0.878788,0.92959,0.038113,4
3,0.007234,0.002951,0.002441,0.000271,5,5,"{'max_depth': 5, 'min_samples_split': 5}",1.0,0.939394,0.969697,0.969697,0.024742,1
4,0.003061,0.000255,0.002159,0.000164,10,3,"{'max_depth': 10, 'min_samples_split': 3}",1.0,0.939394,0.878788,0.939394,0.049485,3
5,0.002856,0.000101,0.002181,0.00014,10,5,"{'max_depth': 10, 'min_samples_split': 5}",0.970588,0.939394,0.969697,0.959893,0.0145,2


In [28]:
scores_df.iloc[:, 6:].sort_values(by ='rank_test_score')

Unnamed: 0,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
3,"{'max_depth': 5, 'min_samples_split': 5}",1.0,0.939394,0.969697,0.969697,0.024742,1
5,"{'max_depth': 10, 'min_samples_split': 5}",0.970588,0.939394,0.969697,0.959893,0.0145,2
4,"{'max_depth': 10, 'min_samples_split': 3}",1.0,0.939394,0.878788,0.939394,0.049485,3
2,"{'max_depth': 5, 'min_samples_split': 3}",0.970588,0.939394,0.878788,0.92959,0.038113,4
0,"{'max_depth': 1, 'min_samples_split': 3}",0.676471,0.666667,0.666667,0.669935,0.004622,5
1,"{'max_depth': 1, 'min_samples_split': 5}",0.676471,0.666667,0.666667,0.669935,0.004622,5


In [22]:
print('GridSearchCV 최적 파라미터:', grid_dtree.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_dtree.best_score_))

# 최고 성능을 낸 분류기
estimator = grid_dtree.best_estimator_

pred = estimator.predict(X_test)
print('테스트 데이터 세트 정확도: {0:.4f}'.format(accuracy_score(y_test,pred)))

GridSearchCV 최적 파라미터: {'max_depth': 5, 'min_samples_split': 5}
GridSearchCV 최고 정확도: 0.9697
테스트 데이터 세트 정확도: 0.9800
