In [30]:
# Load the wine dataset into a DataFrame.
# Fetch over HTTPS rather than plain HTTP so the download is encrypted and
# integrity-protected in transit.

import pandas as pd

wine = pd.read_csv('https://bit.ly/wine_csv_data')

In [31]:
# Build the feature matrix (data) and the label vector (target) as NumPy arrays.
feature_columns = ['alcohol', 'sugar', 'pH']
data = wine[feature_columns].to_numpy()
target = wine['class'].to_numpy()

In [32]:
# Hold out 20% of the samples as the test set; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [33]:
# Split the training set once more, holding out 20% of it as a validation set.
split_parts = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42
)
sub_input, val_input, sub_target, val_target = split_parts

In [34]:
# Check the sizes of the reduced training set and the validation set.
print(*(split.shape for split in (sub_input, val_input)))

(4157, 3) (1040, 3)


In [35]:
# Train a decision tree on the reduced training set, then report its score
# on that same set and on the validation set.
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

sub_accuracy = dt.score(sub_input, sub_target)
val_accuracy = dt.score(val_input, val_target)
print(sub_accuracy)
print(val_accuracy)

0.9971133028626413
0.864423076923077


In [36]:
# Cross-validation: pass the whole training set; no validation set is split off by hand.
from sklearn.model_selection import cross_validate

scores = cross_validate(estimator=dt, X=train_input, y=train_target)
print(scores)

{'fit_time': array([0.03138685, 0.02619791, 0.03295541, 0.0248332 , 0.02482867]), 'score_time': array([0.00196791, 0.00175571, 0.00177622, 0.00176811, 0.00935054]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [37]:
# Average the scores obtained on the validation folds.
import numpy as np

fold_scores = scores['test_score']
print(np.mean(fold_scores))

0.855300214703487


In [38]:
# Pass the splitter explicitly instead of leaving cross_validate's cv argument unset.
from sklearn.model_selection import StratifiedKFold

stratified_cv = StratifiedKFold()
scores = cross_validate(dt, train_input, train_target, cv=stratified_cv)
print(np.mean(scores['test_score']))

0.855300214703487


In [39]:
# 10-fold cross-validation with shuffling; the seed fixes the shuffle order.
splitter = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
mean_score = np.mean(scores['test_score'])
print(mean_score)

0.8574181117533719


In [40]:
# Grid search: list the candidate min_impurity_decrease values to evaluate.
from sklearn.model_selection import GridSearchCV

params = {
    'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
}
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)

In [41]:
# Run the search, then score the best estimator it found on the training set.
gs.fit(train_input, train_target)

dt = gs.best_estimator_
train_accuracy = dt.score(train_input, train_target)
print(train_accuracy)

0.9615162593804117


In [42]:
# Show the best parameters found by the search.
best_params = gs.best_params_
print(best_params)

{'min_impurity_decrease': 0.0001}


In [43]:
# Mean cross-validation score for each candidate parameter value.
mean_test_scores = gs.cv_results_['mean_test_score']
print(mean_test_scores)

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [44]:
# Look up the parameter combination with the highest mean validation score.
cv_results = gs.cv_results_
best_index = np.argmax(cv_results['mean_test_score'])
print(cv_results['params'][best_index])

{'min_impurity_decrease': 0.0001}


In [45]:
# Grid search over three hyperparameters at once.
params = {
    'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
    'max_depth': range(5, 20),
    'min_samples_split': range(2, 100, 10),
}

gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)
gs.fit(train_input, train_target)

In [46]:
# Print the best parameter combination found by the search.
best_params = gs.best_params_
print(best_params)

{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}


In [47]:
# Highest mean cross-validation score across all candidates.
mean_test_scores = gs.cv_results_['mean_test_score']
print(np.max(mean_test_scores))

0.8683865773302731


In [48]:
# Draw 10 integers from randint(0, 10). This draw is not the cell's last
# expression, so the notebook does not display it.
from scipy.stats import uniform, randint

rgen = randint(0, 10)
ten_draws = rgen.rvs(size=10)

# Draw 1000 integers and count how often each value appears.
thousand_draws = rgen.rvs(size=1000)
np.unique(thousand_draws, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([103, 109, 104,  96, 106, 101,  80,  99, 101, 101]))

In [49]:
# Draw 10 real numbers from a uniform distribution with loc 0 and scale 1.
ugen = uniform(loc=0, scale=1)
ugen.rvs(size=10)

array([0.06500299, 0.07737643, 0.77442633, 0.65737154, 0.47377799,
       0.28089639, 0.85188001, 0.91125608, 0.15245837, 0.2043518 ])

In [50]:
# Randomized search: sample hyperparameters from distributions instead of a fixed grid.
from sklearn.model_selection import RandomizedSearchCV

params = {
    # NOTE(review): scipy's uniform(loc, scale) samples from [loc, loc + scale],
    # so this draws from [0.0001, 0.0011]; confirm that upper bound is intended.
    'min_impurity_decrease': uniform(0.0001, 0.001),
    'max_depth': randint(20, 50),
    'min_samples_split': randint(2, 25),
    'min_samples_leaf': randint(1, 25),
}

gs = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)

gs.fit(train_input, train_target)

In [51]:
# Show the best parameter combination found by the randomized search.
best_params = gs.best_params_
print(best_params)

{'max_depth': 43, 'min_impurity_decrease': 0.00011407982271508446, 'min_samples_leaf': 19, 'min_samples_split': 18}


In [52]:
# Check the highest mean cross-validation score from the randomized search.
mean_test_scores = gs.cv_results_['mean_test_score']
print(np.max(mean_test_scores))

0.8458726956392981


In [53]:
# Evaluate the best estimator from the search on the held-out test set.
dt = gs.best_estimator_
test_accuracy = dt.score(test_input, test_target)
print(test_accuracy)

0.786923076923077
