<a href="https://colab.research.google.com/github/yeneua/machine-learning/blob/main/05_2_%EC%8B%A4%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h2>05-2. 교차 검증과 그리드 서치</h2>

In [6]:
import pandas as pd

# Download the wine dataset as a DataFrame from the shortened remote CSV URL.
wine = pd.read_csv(filepath_or_buffer='https://bit.ly/wine_csv_data')

In [7]:
# Feature matrix: the three measurement columns converted to a NumPy array.
data = wine.loc[:, ['alcohol', 'sugar', 'pH']].to_numpy()
# Label vector: the 'class' column converted to a NumPy array.
target = wine.loc[:, 'class'].to_numpy()

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set; the fixed seed keeps the split repeatable.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Build a validation set: split the training portion again, holding out 20% of it.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input,
    train_target,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Show the shapes of the sub-training and validation feature arrays side by side.
print(*(part.shape for part in (sub_input, val_input)))

(4157, 3) (1040, 3)


In [17]:
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree with default hyperparameters.
# random_state is pinned (42, the same seed every other tree in this notebook
# uses) because tree fitting is randomized internally; without a seed the
# fitted tree and the two scores below can differ from run to run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)
print(dt.score(sub_input, sub_target))  # accuracy on the data the tree was fitted on
print(dt.score(val_input, val_target))  # accuracy on the held-out validation split

0.9971133028626413
0.8605769230769231


교차검증

In [66]:
from sklearn.model_selection import cross_validate # cross_validate() : 교차검증함수
# Cross-validate dt on the whole training set (no manual validation split is
# needed here). The returned dict holds per-fold fit_time, score_time and
# test_score arrays.
# NOTE(review): the mean of the saved output for this cell equals the
# randomized-search best score printed later in the notebook, so the cell was
# presumably last executed after dt had been rebound -- confirm with
# Restart & Run All.
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
print(scores)

{'fit_time': array([0.00690103, 0.00762558, 0.00760841, 0.00680709, 0.00777626]), 'score_time': array([0.00117087, 0.00105906, 0.00112677, 0.0010922 , 0.00172114]), 'test_score': array([0.85769231, 0.86730769, 0.88642926, 0.85659288, 0.87969201])}


In [67]:
import numpy as np
# Average of the per-fold validation scores.
print(scores['test_score'].mean())

0.8695428296438884


In [23]:
from sklearn.model_selection import StratifiedKFold  # 분류일때 - StratifiedKFold
# Same cross-validation, but with an explicitly constructed StratifiedKFold
# (default settings) passed as the splitting strategy.
scores = cross_validate(
    estimator=dt,
    X=train_input,
    y=train_target,
    cv=StratifiedKFold(),
)
print(scores['test_score'].mean())

0.8558788035833272


In [25]:
# Control how folds are made in detail through a splitter object:
# 10 stratified folds, shuffled before splitting, with a fixed seed.
splitter = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
scores = cross_validate(estimator=dt, X=train_input, y=train_target, cv=splitter)
print(scores['test_score'].mean())

0.8605005928560843


하이퍼파라미터 튜닝

In [27]:
from sklearn.model_selection import GridSearchCV
# Parameter grid: the candidate values to try for min_impurity_decrease.
params = dict(
    min_impurity_decrease=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
)

In [33]:
# Set up the grid search. A freshly constructed, seeded decision tree is handed
# straight to the search object; n_jobs=-1 asks for all available CPU cores.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)

In [34]:
# Run the search on the training data; the fitted search object is the cell output.
gs.fit(X=train_input, y=train_target)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'min_impurity_decrease': [0.0001, 0.0002, 0.0003,
                                                   0.0004, 0.0005]})

In [35]:
# best_estimator_ is the model built with the best parameter combination found
# by the search; keep it as dt and report its accuracy on the training set.
dt = gs.best_estimator_
print(dt.score(X=train_input, y=train_target))

0.9615162593804117


In [36]:
print(gs.best_params_) # best_params_: the best parameter setting found by the grid search

{'min_impurity_decrease': 0.0001}


In [38]:
print(gs.cv_results_['mean_test_score']) # mean cross-validation score for each candidate parameter value

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [42]:
# Position of the highest mean cross-validation score among the candidates.
best_index = gs.cv_results_['mean_test_score'].argmax()
# Parameter setting of the candidate that achieved that top score.
print(gs.cv_results_['params'][best_index])

{'min_impurity_decrease': 0.0001}


In [78]:
# Grid over combinations of several parameters at once.
params = dict(
    min_impurity_decrease=np.arange(0.0001, 0.001, 0.0001),  # 9 values
    max_depth=range(5, 20),                                   # 15 values
    min_samples_split=range(2, 100, 10),                      # 10 values
)

In [79]:
# Grid search over the multi-parameter grid, again with a seeded tree and all CPU cores.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)
gs.fit(X=train_input, y=train_target)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(5, 20),
                         'min_impurity_decrease': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008,
       0.0009]),
                         'min_samples_split': range(2, 100, 10)})

In [81]:
# Best parameter combination found by the multi-parameter grid search.
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}


In [83]:
# Highest mean cross-validation score across all candidates.
print(gs.cv_results_['mean_test_score'].max())

0.8683865773302731


In [85]:
# mean_test_score holds one mean cross-validation score per parameter combination.
# 9 * 15 * 10 => 1350 combinations, so 1350 mean scores are stored.
# Each combination is evaluated with the default 5 folds,
# which means 6750 models are fitted in total.
print(np.shape(gs.cv_results_['mean_test_score']))

(1350,)


랜덤 서치

In [51]:
from scipy.stats import uniform, randint

In [52]:
# Frozen discrete-uniform distribution over the integers 0..9 (the upper bound 10 is exclusive).
rgen = randint(low=0, high=10)
# rvs draws random samples; show 10 of them as the cell output.
rgen.rvs(size=10)

array([6, 7, 1, 9, 1, 6, 3, 5, 8, 5])

In [53]:
# Draw 1000 samples and tally how often each distinct value occurs.
np.unique(rgen.rvs(size=1000), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([106,  97, 108,  99, 105,  95,  94,  94, 100, 102]))

In [54]:
# Frozen continuous uniform distribution on [0, 1] (loc=0, scale=1).
ugen = uniform(loc=0, scale=1)
# Show 10 random draws as the cell output.
ugen.rvs(size=10)

array([0.95206747, 0.21845835, 0.8942351 , 0.01023284, 0.6990539 ,
       0.21321337, 0.31663746, 0.71931259, 0.29426469, 0.86101034])

In [86]:
# Parameter values are drawn from uniform distributions instead of a fixed grid.
# scipy's uniform takes (loc, scale), so min_impurity_decrease is sampled from
# [loc, loc + scale] = [0.0001, 0.0011]; randint's upper bound is exclusive.
params = dict(
    min_impurity_decrease=uniform(loc=0.0001, scale=0.001),
    max_depth=randint(low=20, high=50),
    min_samples_split=randint(low=2, high=25),
    min_samples_leaf=randint(low=1, high=25),
)

In [87]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized search: sample 100 parameter settings (n_iter) from the
# distributions in params, using all CPU cores and a fixed seed for the sampling.
gs = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(X=train_input, y=train_target)

RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f61824f80d0>,
                                        'min_impurity_decrease': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f61824f85d0>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f6182855b50>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f61824f8610>},
                   random_state=42)

In [88]:
print(gs.best_params_) # best parameter values found by the randomized search

{'max_depth': 39, 'min_impurity_decrease': 0.00034102546602601173, 'min_samples_leaf': 7, 'min_samples_split': 13}


In [89]:
# Cross-validation score achieved by the best parameter setting.
print(gs.cv_results_['mean_test_score'].max())

0.8695428296438884


In [90]:
# Keep the search's best model as dt. best_estimator_ is the model trained with
# the best parameters on the training data passed to fit (train + validation).
dt = gs.best_estimator_
# Final evaluation of that model on the held-out test set.
print(dt.score(X=test_input, y=test_target))

0.86


In [91]:
# Review exercise: repeat the randomized search with splitter='random'

In [95]:
# Same randomized search as before, but the tree uses splitter='random'.
rg = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(splitter='random', random_state=42),
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
rg.fit(X=train_input, y=train_target)
print(rg.best_params_)                          # best sampled parameter setting
print(rg.cv_results_['mean_test_score'].max())  # its mean cross-validation score
# Keep the best model as dt and score it on the held-out test set.
dt = rg.best_estimator_
print(dt.score(X=test_input, y=test_target))

{'max_depth': 43, 'min_impurity_decrease': 0.00011407982271508446, 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077


In [97]:
# => Test-set accuracy dropped with splitter='random' (0.86 -> about 0.787) !!