In [97]:
import pandas as pd

# Download the wine dataset as a DataFrame; later cells read its
# 'alcohol', 'sugar', 'pH' and 'class' columns.
WINE_CSV_URL = "https://bit.ly/wine_csv_data"
wine = pd.read_csv(WINE_CSV_URL)

In [98]:
# Split the frame into a feature matrix (three columns) and a label vector,
# both as NumPy arrays.
feature_columns = ['alcohol', 'sugar', 'pH']
data = wine[feature_columns].to_numpy()
target = wine['class'].to_numpy()

In [99]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [100]:
# Carve a validation set out of the training set.
#   sub_x, sub_y = reduced training set
#   val_x, val_y = validation set

sub_x, val_x, sub_y, val_y = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [101]:
# Sanity check: shapes of the reduced training set and the validation set.
(sub_x.shape, val_x.shape)

((4157, 3), (1040, 3))

In [102]:
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree with default hyperparameters on the reduced training
# set, then print its score on that set followed by its score on the
# validation set (a large gap between the two indicates overfitting).
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_x, sub_y)
for inputs, labels in ((sub_x, sub_y), (val_x, val_y)):
    print(dt.score(inputs, labels))

0.9971133028626413
0.864423076923077


In [103]:
from sklearn.model_selection import cross_validate

# Cross-validate the tree on the whole training set; with no cv argument
# cross_validate uses 5 folds. The result is a dict holding per-fold
# fit_time, score_time and test_score arrays.
scores = cross_validate(estimator=dt, X=X_train, y=y_train)
print(scores)

{'fit_time': array([0.00504589, 0.01006556, 0.01019931, 0.        , 0.01276469]), 'score_time': array([0., 0., 0., 0., 0.]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [104]:
# 'test_score' holds the scores on the validation folds;
# it is NOT the score on the held-out test set.

import numpy as np

validation_scores = scores['test_score']
np.mean(validation_scores)

0.855300214703487

In [105]:
# To shuffle the training set during cross-validation, a splitter has to be
# passed explicitly (outside cross-validation, train_test_split shuffles the
# data by default).
# Splitter types: KFold for regression, StratifiedKFold for classification.

from sklearn.model_selection import StratifiedKFold, cross_validate

# NOTE: StratifiedKFold() is created with its defaults here, so no shuffling
# is requested yet; shuffle=True has to be passed to the splitter for that.
default_splitter = StratifiedKFold()
scores = cross_validate(dt, X_train, y_train, cv=default_splitter)
print(np.mean(scores['test_score']))

0.855300214703487


In [106]:
# 10-fold cross-validation with shuffling.
# Method 1: build the splitter first, then hand it to cross_validate via cv.
spliter = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
score = cross_validate(dt, X_train, y_train, cv=spliter)

print(np.mean(score['test_score']))

0.8574181117533719


In [107]:
# Method 2: construct the same shuffled 10-fold splitter inline in the call.
score = cross_validate(
    dt,
    X_train,
    y_train,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
)

print(np.mean(score['test_score']))

0.8574181117533719


In [108]:
# Grid search = hyperparameter search + cross-validation.
# min_impurity_decrease is tried from 0.0001 to 0.0005.
# cv defaults to 5 folds, so 5 candidates x 5 folds = 25 models are trained
# during the search.
from sklearn.model_selection import GridSearchCV

params = {
    'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
}

gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,  # use all available CPU cores
)
gs.fit(X_train, y_train)


GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'min_impurity_decrease': [0.0001, 0.0002, 0.0003,
                                                   0.0004, 0.0005]})

In [109]:
# Once the search finishes, the parameter combination with the highest
# validation score is automatically used to retrain a model on the whole
# training set; that model is exposed as best_estimator_.
# Below: the score of that best model on the training set.

dt = gs.best_estimator_
train_score = dt.score(X_train, y_train)
print(train_score)

0.9615162593804117


In [110]:
# Parameter value that achieved the best mean validation score in the search.
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [111]:
# Mean validation score of each candidate, in the order the candidates
# were evaluated.
mean_scores = gs.cv_results_['mean_test_score']
print(mean_scores)

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [112]:
# Exercise: a grid over three hyperparameters.
# np.arange(start, stop, step): values from start up to (but not including)
#   stop, advancing by step; works with floats.
# range(start, stop, step): same idea, but integers only.

params = {
    'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
    'max_depth': range(5, 20),
    'min_samples_split': range(2, 100, 10),
}

In [113]:
# Exhaustive search over every combination in params, each evaluated with the
# default 5-fold cross-validation; n_jobs=-1 spreads the fits over all cores.
gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)
gs.fit(X_train, y_train)


GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(5, 20),
                         'min_impurity_decrease': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008,
       0.0009]),
                         'min_samples_split': range(2, 100, 10)})

In [114]:
# Best parameter combination found by the grid search.
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}


In [115]:
# Best cross-validation score: the highest mean validation score among
# all candidates.

best_cv_score = np.max(gs.cv_results_['mean_test_score'])
print(best_cv_score)

0.8683865773302731


In [116]:
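#best_estimator_ = the model retrained on the whole training set with the best parameter combination (call .score() on it to get a score)
#cv_results_ = the scores measured for every candidate during the K-fold cross-validation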



In [117]:
#랜덤서치

In [118]:
# uniform draws real numbers, randint draws integers.
from scipy.stats import randint, uniform

# Frozen integer distribution over 0..9 (the upper bound 10 is excluded).
rgen = randint(0, 10)
rgen.rvs(size=10)  # draw 10 random values

array([8, 6, 0, 5, 8, 0, 9, 2, 4, 8])

In [119]:
# Draw 1000 integers and count how often each distinct value occurs;
# the counts should come out roughly equal.
samples = rgen.rvs(1000)
np.unique(samples, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 98, 110, 115,  98,  92,  98,  86, 114, 100,  89], dtype=int64))

In [120]:
# Frozen continuous uniform distribution on [0, 1]; draw 10 floats from it.
ugen = uniform(0, 1)
ugen.rvs(size=10)

array([0.41980398, 0.10499936, 0.72569078, 0.98781683, 0.64748369,
       0.53822977, 0.34904012, 0.87237185, 0.62070993, 0.44258596])

In [121]:
# Sampling distributions for the randomized search.
# NOTE(review): scipy's uniform(loc, scale) covers [loc, loc + scale], so
# min_impurity_decrease is drawn from [0.0001, 0.0011]. If the range
# [0.0001, 0.001] was intended, the scale should be 0.0009 -- confirm.
# randint(low, high) excludes high, e.g. max_depth is drawn from 20..49.
params = {
    'min_impurity_decrease': uniform(0.0001, 0.001),
    'max_depth': randint(20, 50),
    'min_samples_split': randint(2, 25),
    'min_samples_leaf': randint(1, 25),
}

In [122]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: evaluate 100 parameter combinations sampled from the
# distributions in params; random_state fixes which combinations are drawn.
gs = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(X_train, y_train)

RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001996A26DA30>,
                                        'min_impurity_decrease': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001996A29ABB0>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001996B4C1850>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001996B5F1730>},
                   random_state=42)

In [123]:
# Model refit with the best sampled parameter combination.
gs.best_estimator_

DecisionTreeClassifier(max_depth=39,
                       min_impurity_decrease=0.00034102546602601173,
                       min_samples_leaf=7, min_samples_split=13,
                       random_state=42)

In [124]:
# Best parameter combination found by the randomized search.
gs.best_params_

{'max_depth': 39,
 'min_impurity_decrease': 0.00034102546602601173,
 'min_samples_leaf': 7,
 'min_samples_split': 13}

In [128]:
# Highest mean cross-validation score among the sampled candidates.
best_cv_score = np.max(gs.cv_results_['mean_test_score'])
print(best_cv_score)

0.8695428296438884


In [131]:
# Final evaluation: score the best model from the search on the held-out
# test set, which played no part in fitting or model selection.
dt = gs.best_estimator_
test_score = dt.score(X_test, y_test)
print(test_score)

0.86
