In [35]:
# K 폴드

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd

iris = load_iris()
features = iris.data
label = iris.target
df_clf = DecisionTreeClassifier(random_state=156)

kfold = KFold(n_splits=5)
print(features.shape[0]) # 전체 feature row

150


In [11]:

n_iter = 0
cv_accuracy = [] # accuracy의 값을 append 하는 list. 이후 kfold accuracy의 평균을 구하는데 사용 됨
for train_index, test_index in kfold.split(features):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = label[train_index], label[test_index]

    df_clf.fit(X_train, y_train)
    pred = df_clf.predict(X_test)
    n_iter += 1

    accuracy = np.round(accuracy_score(y_test, pred), 4)
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]

    cv_accuracy.append(accuracy)

print(cv_accuracy)
print(np.mean(cv_accuracy))
print((sum(cv_accuracy))/n_iter)

[1.0, 0.9667, 0.8667, 0.9333, 0.7333]
0.9
0.9


In [21]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=3)

n_iter = 0
cv_accuracy = []

for train_idx, test_idx in skf.split(features, label):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = label[train_index], label[test_index]

    
    print(y_test)

    df_clf.fit(X_train, y_train)
    pred = df_clf.predict(X_test)
    n_iter += 1

    accuracy = np.round(accuracy_score(y_test, pred), 4)
    train_size = X_train.shape[0]
    test_size = X_test.shape[0]

    cv_accuracy.append(accuracy)


print(np.mean(cv_accuracy))

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
[0.7333, 0.7333, 0.7333]
0.7333
0.7333


In [32]:
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
import numpy as np
iris_data = load_iris()
dt_clf = dtc(random_state=156)

data = iris_data.data
label = iris_data.target

# cross_val_score: train과 valid 분할 후 scoring // kfold는 stratified로 자동 진행된다.
scores = cross_val_score(dt_clf, X=data, y=label, scoring='accuracy', cv=3)
"""
적용할 분류기: dt_clf
feature: X = data
target: y = label
나타내고싶은 점수: scoring = 'accuracy' (recall 등도 가능)
kfold 횟수: cv = 3

"""
print("Stratified K-fold score:", scores)
print("Average of Score:", np.round(np.mean(scores),4))


Stratified K-fold score: [0.98 0.94 0.98]
Average of Score: 0.9667


In [33]:
from sklearn.model_selection import GridSearchCV as gscv, train_test_split

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris_data.data, iris_data.target, test_size=0.2, random_state=121)

dtree = dtc()

parameters = {'max_depth':[1,2,3], 'min_samples_split':[2,3]}

In [43]:
grid_dtree = gscv(dtree, param_grid=parameters, cv=3, refit=True, return_train_score=True)
"""
GridSearchCV: 해당 모델에 적합할 최적의 파라미터를 찾아준다.
 - grid_dtree는 dtree(의사결정트리분류기)에 최적 파라미터를 찾아낸 모델을 정의하는 것.

최적의 파라미터 찾을 모델(분류기): dtree -> DecisionTreeClassifier
시도해보는 parameter 조합: param_grid = parameters
kfold 분할 테스트 횟수: cv = 3
최적의 파라미터를 찾았을 때 모델에 학습 시키기: refit = True
"""

grid_dtree.fit(X_train, y_train)
"""
gird_dtree는 최적 파라미터를 찾는 인스턴스로 정의된거지만, refit=True를 줌으로써 학습된 estimator로 작동할 수 있게 된다.
grid_dtree에 fit으로 train data를 학습시키면, 최적의 파라미터를 찾아 모델링된다.
만약, gscv의 refit을 False로 했다면, gird_dtree.fit은 최적의 하이퍼파라미터만 찾아주고
이걸 다시 DecisionTreeClassifier에 파라미터로 지정해 모델을 학습시켜 사용해야한다.
"""

scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df
scores_df[['params', 'mean_test_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score']]



Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5,0.7,0.7,0.7
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5,0.7,0.7,0.7
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3,0.925,1.0,0.95
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3,0.925,1.0,0.95
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1,0.975,1.0,0.95
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1,0.975,1.0,0.95


In [44]:
pred = grid_dtree.predict(X_test)
"""
grid_dtree는 하이퍼파라미터를 찾는 인스턴스였지만, refit=True를 통해 estimator를 상속받았기때문에 fit을 통해 학습된 모델로 작동한다.
따라서 모델에서 사용하는 predict를 할 수 있다.
"""
print(accuracy_score(y_test, pred))

0.9666666666666667
