# medel selection 모듈

- 훈련/테스트 데이터로 분리하지 않고 머신러닝 수행

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier


In [3]:
iris = load_iris()
dtc = DecisionTreeClassifier()
dtc.fit(iris.data, iris.target)
dtc.score(iris.data, iris.target)

1.0

- cross_validate()

In [4]:
from sklearn.model_selection import cross_validate

In [5]:
dtc = DecisionTreeClassifier()
cross_validate(dtc, iris.data, iris.target)


{'fit_time': array([0.00099707, 0.00099707, 0.        , 0.        , 0.        ]),
 'score_time': array([0.        , 0.        , 0.0009954 , 0.        , 0.00099707]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 1.        , 1.        ])}

- cross_val_score()

In [6]:
from sklearn.model_selection import cross_val_score

In [9]:
cross_val_score(dtc, iris.data, iris.target, scoring='accuracy', cv=5)

array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ])

In [10]:
cross_val_score(dtc, iris.data, iris.target, scoring='accuracy', cv=3)

array([0.98, 0.94, 0.98])

- Grid searchCV 클래스 : 하이퍼 파라메타 튜닝 + 교차검증

In [16]:
# Train/Test Dataset 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, stratify=iris.target, test_size=0.2, random_state=2021
)

In [17]:
#하이퍼 파라메타의 종류
dtc = DecisionTreeClassifier(random_state=2021)
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2021,
 'splitter': 'best'}

In [18]:
params = {
    'max_depth' : [2,4,3],

    'min_samples_split' : [2,3,4]
}

In [19]:
from sklearn.model_selection import GridSearchCV

grid_dt = GridSearchCV(dtc, param_grid=params,scoring='accuracy', cv=5)

In [20]:
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 4, 3],
                         'min_samples_split': [2, 4, 3]},
             scoring='accuracy')

In [21]:
# 최적 파라메터
grid_dt.best_params_

{'max_depth': 4, 'min_samples_split': 2}

In [22]:
# 최적 스코어
grid_dt.best_score_

0.9666666666666668

In [23]:
# 학습된 최적 분류기(Classifier)
best_dt = grid_dt.best_estimator_
best_dt.score(X_test, y_test)

0.9

- 하이퍼 파라메터 튜닝은 한번으로 끝내는 것이 아니라
범위를 좀혀가며 여러번 수행하는 작업

In [24]:
params2 = {
    'max_depth' : [5,6,4,3],

    'min_samples_split' : [2,3]
}

In [26]:
grid_dt2 = GridSearchCV(dtc, param_grid=params2, scoring='accuracy', cv=5)
grid_dt2.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [5, 6, 4, 3],
                         'min_samples_split': [2, 3]},
             scoring='accuracy')

In [27]:
grid_dt2.best_params_

{'max_depth': 5, 'min_samples_split': 2}

In [28]:
grid_dt2.best_score_

0.9666666666666668