# Model Selection 

### 학습/테스트 데이터세트 분리 - train_test_split()

In [2]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# 데이터 호출
iris = load_iris()

## 넘파이 ndarray 활용
### 학습/테스트 분리 (8:2 비율)
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, 
                                                    test_size = 0.2,
                                                    random_state = 1214)


## Pandas DataFrame/Series 활용
### DataFrame화 진행
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['target'] = iris.target
### 학습/테스트 분리 (8:2 비율)
X_train, X_test, y_train, y_test = train_test_split(iris_df.iloc[:,:-1], iris_df.iloc[:,-1], 
                                                    test_size = 0.2, random_state = 1214)


# 교차검증

## K-Fold

In [6]:
from sklearn.model_selection import KFold
import numpy as np

# 5개의 fold를 생성하기
kfold = KFold(n_splits = 5)

# 모델 호출
Decisiontree_model = DecisionTreeClassifier()


# 교차검증 진행
## feature vs target 나누기
iris = load_iris()
features = iris.data
target = iris.target

accuracy_list = []
for train_index, test_index in kfold.split(features):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # 학습 
    Decisiontree_model.fit(X_train, y_train)
    prediction_value = Decisiontree_model.predict(X_test)
    

    # 정확도 측정
    accuracy = np.round(accuracy_score(y_test, prediction_value), 4)
    accuracy_list.append(accuracy)

print('평균 정확도 : ', np.mean(accuracy_list))

평균 정확도 :  0.9066599999999999


## StratifiedKFold

Imbalance 문제를 해결하여 진행할 수 있는 방법


In [8]:
from sklearn.model_selection import StratifiedKFold

stratified = StratifiedKFold(n_splits = 5)

for train_index, test_index in stratified.split(features, target):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # 학습 
    Decisiontree_model.fit(X_train, y_train)
    prediction_value = Decisiontree_model.predict(X_test)
    

    # 정확도 측정
    accuracy = np.round(accuracy_score(y_test, prediction_value), 4)
    accuracy_list.append(accuracy)
print('평균 정확도 : ', np.mean(accuracy_list))

평균 정확도 :  0.93334


# 교차검증 정확도 

## cross_val_score()

In [10]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(Decisiontree_model, features, target, scoring = 'accuracy', cv = 5)
print(scores)
print('평균 교차검증 정확도 : ', np.mean(scores))

[0.96666667 0.96666667 0.9        1.         1.        ]
평균 교차검증 정확도 :  0.9666666666666668


## GridSearchCV

In [16]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, 
                                                    test_size = 0.2, random_state = 1214)
 
# 모델 호출
Decisiontree_model = DecisionTreeClassifier()

# 탐색할 파라미터값 설정
parameters = {'max_depth' : [1,2,3], 'min_samples_split' : [2,3]}

# GridSearchCV를 활용하여 최적의 파라미터값 도출
## refit = True는 default --> 가장 좋은 파라미터 재학습 수행
cv_num = 5
grid_decisiontree = GridSearchCV(Decisiontree_model, param_grid=parameters, cv = cv_num, refit = True, return_train_score = True)

# 순차적으로 학습 및 평가
grid_decisiontree.fit(X_train, y_train)

# GridSearchCV 결과물 저장
scores_df = pd.DataFrame(grid_decisiontree.cv_results_)
split_test_scores_list = []
for i in range(cv_num):
    split_test_scores_list.append(f'split{i}_test_score')
columns = ['params', 'mean_test_score','rank_test_score'] + split_test_scores_list
scores_df[columns]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.675,5,0.708333,0.666667,0.666667,0.666667,0.666667
1,"{'max_depth': 1, 'min_samples_split': 3}",0.675,5,0.708333,0.666667,0.666667,0.666667,0.666667
2,"{'max_depth': 2, 'min_samples_split': 2}",0.925,3,0.958333,0.958333,0.791667,0.958333,0.958333
3,"{'max_depth': 2, 'min_samples_split': 3}",0.925,3,0.958333,0.958333,0.791667,0.958333,0.958333
4,"{'max_depth': 3, 'min_samples_split': 2}",0.941667,1,1.0,1.0,0.833333,0.916667,0.958333
5,"{'max_depth': 3, 'min_samples_split': 3}",0.941667,1,1.0,1.0,0.833333,0.916667,0.958333


In [18]:
print('GridSearchCV 최적 파라미터 : ', grid_decisiontree.best_params_)
print('GridSearchCV 최고 정확도 : {0:.4f}'.format(grid_decisiontree.best_score_))

# 테스트 데이터 세트 정확도
pred = grid_decisiontree.predict(X_test)
print('테스트 데이터 세트 정확도: {0:.4f}'.format(accuracy_score(y_test, pred)))

GridSearchCV 최적 파라미터 :  {'max_depth': 3, 'min_samples_split': 2}
GridSearchCV 최고 정확도 : 0.9417
테스트 데이터 세트 정확도: 0.9333
