In [10]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, train_test_split, cross_validate
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Load the wine dataset: features as a DataFrame, class labels as a 1-D Series.
wine = load_wine()
X = pd.DataFrame(wine.data, columns=wine.feature_names)
# Use a 1-D target instead of a single-column DataFrame: single-output
# classifiers and scorers expect labels of shape (n_samples,).
y = pd.Series(wine.target, name='label')

# NOTE: the former bare `X.to_numpy()` / `y.to_numpy()` statements were removed.
# `to_numpy()` returns a new array and never converts in place, so the
# discarded calls had no effect; the pandas objects are passed on directly.

# Hold out 20% as the test set, then carve a validation set (20% of the
# remaining training data) out of the training portion.
train_input, test_input, train_target, test_target = train_test_split(
    X, y, test_size=0.2, random_state=42)
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42)

# Baseline decision tree fitted on the sub-training split only.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

# Accuracy on the data the tree was fitted on vs. the validation split.
print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))

# Cross-validation on the full training set with the default `cv` setting.
scores = cross_validate(dt, train_input, train_target)
print(scores)

# Mean cross-validation score across the folds.
print(np.mean(scores['test_score']))

# K-fold: pass the splitter explicitly. A default StratifiedKFold() is what
# cross_validate already uses for a classifier, so this mean matches the one above.
scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
print(np.mean(scores['test_score']))

# 10-fold stratified split, shuffled with a fixed seed for reproducibility.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))

1.0
0.896551724137931
{'fit_time': array([0.0059607 , 0.0019958 , 0.00399327, 0.00299096, 0.00199747]), 'score_time': array([0.00103784, 0.00099826, 0.00195384, 0.00103474, 0.00099277]), 'test_score': array([0.93103448, 0.93103448, 0.89285714, 0.92857143, 0.89285714])}
0.9152709359605913
0.9152709359605913
0.9223809523809525


In [11]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the tree's `min_impurity_decrease` hyperparameter.
params = {
    'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
}

# Grid search over the candidates; n_jobs=-1 asks for all available processors.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)

# Last expression of the cell, so the fitted search object is displayed.
gs.fit(train_input, train_target)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'min_impurity_decrease': [0.0001, 0.0002, 0.0003,
                                                   0.0004, 0.0005]})

In [12]:
# Take the best estimator from the previous search and score it on the
# training set.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

# Winning parameter combination as reported by the search itself.
print(gs.best_params_)

# Mean cross-validation score of every candidate in the grid.
print(gs.cv_results_['mean_test_score'])

# Recover the same winner by hand: position of the highest mean score
# (the first one on ties), then look up its parameter dict.
best_index = gs.cv_results_['mean_test_score'].argmax()
print(gs.cv_results_['params'][best_index])

# Wider search over three hyperparameters at once.
params = dict(
    min_impurity_decrease=np.arange(0.0001, 0.001, 0.0001),
    max_depth=range(5, 20),
    min_samples_split=range(2, 100, 10),
)

gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)
gs.fit(train_input, train_target)

# Best combination from the wider grid.
print(gs.best_params_)

# Highest mean cross-validation score reached in the wider grid.
print(gs.cv_results_['mean_test_score'].max())

1.0
{'min_impurity_decrease': 0.0001}
[0.91527094 0.91527094 0.91527094 0.91527094 0.91527094]
{'min_impurity_decrease': 0.0001}
{'max_depth': 5, 'min_impurity_decrease': 0.0001, 'min_samples_split': 12}
0.9224137931034484
