# Model Selection

## - Train/Test 데이터를 분리하지 않고 머신러닝 수행

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
iris = load_iris()
dt_clf = DecisionTreeClassifier()
dt_clf.fit(iris.data, iris.target)

DecisionTreeClassifier()

In [3]:
# 일반성 떨어지고 과적합
pred = dt_clf.predict(iris.data)
accuracy_score(iris.target, pred)

1.0

## - cross_validate method

In [4]:
from sklearn.model_selection import cross_validate

In [5]:
dtc = DecisionTreeClassifier()
cross_validate(dtc,iris.data,iris.target)
# fit_time 5번:5번 교차검증

{'fit_time': array([0.00104189, 0.00098681, 0.00100946, 0.00094914, 0.00099802]),
 'score_time': array([0., 0., 0., 0., 0.]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 1.        , 1.        ])}

In [6]:
dtc = DecisionTreeClassifier()
cross_validate(dtc, iris.data, iris.target, return_train_score = True)

{'fit_time': array([0.00099659, 0.00102568, 0.        , 0.        , 0.        ]),
 'score_time': array([0.0009551 , 0.        , 0.        , 0.        , 0.00095916]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ]),
 'train_score': array([1., 1., 1., 1., 1.])}

## - Train/Test 데이터 셋을 분리

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, train_size=0.2, random_state=2021
)

In [10]:
import pandas as pd
pd.Series(y_test).value_counts()

0    41
1    41
2    38
dtype: int64

## - stratified 분리 (균등하게 배분)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, train_size=0.2, random_state=2021,
    stratify=iris.target
)
pd.Series(y_test).value_counts()

0    40
1    40
2    40
dtype: int64

## - cross_val_score() method

In [12]:
from sklearn.model_selection import cross_val_score

In [13]:
# 성능 지표는 정확도(accuracy), 교차검증 세트는 5개 
cross_val_score(dtc, iris.data, iris.target, cv=5)

array([0.96666667, 0.96666667, 0.9       , 1.        , 1.        ])

In [14]:
import numpy as np
scores = cross_val_score(dtc, iris.data, iris.target, cv=5)
np.mean(scores)

0.9666666666666668

# GridSearchCV
## - cv: 교차 검증을 위한 학습/테스트 세트의 갯수
## - 교차검증과 최적 하이퍼 파라메터 튜닝을 한꺼번에 수행

In [16]:
DTC = DecisionTreeClassifier(random_state=2021)

In [17]:
#parameter를 dictionary 형태로 설정
params = {
    'max_depth':[2,3,4,5],
    'min_samples_split':[2,3]
}

In [18]:
from sklearn.model_selection import GridSearchCV
grid_dtc = GridSearchCV(dtc,param_grid=params, cv=3)

In [19]:
grid_dtc.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [2, 3, 4, 5],
                         'min_samples_split': [2, 3]})

In [22]:
# 최적 파라미터
grid_dtc.best_params_

{'max_depth': 4, 'min_samples_split': 2}

In [23]:
# 최고 정확도
grid_dtc.best_score_

0.9666666666666667

In [24]:
# 최고 정확도를 가지는 최적 파라미터로 학습한 estimator
best_estimator = grid_dtc.best_estimator_
best_estimator.score(X_test,y_test)

0.9666666666666667