In [1]:
'''

    wine 분류
    
    1.0 4898 - white wine
    0.0 1599 - red wine
    
    하이퍼 파라미터 튜닝
    
    1) GridSearchCV
    
        - 지정한 params 값을 하나도 빠뜨리지 않고 촘촘하게 조합해서 최고의 파라미터를 찾는다.
        - 장점: 촘촘하게 찾음
        - 단점: 시간이 많이 걸림
        
    
    2) RandomizedSearchCV
    
        - GridSearchCV 처럼 모든 파라미터를 사용하지 않고 랜덤하게 추출된 샘플링 사용해서 최고의 파라미터를 찾는다.
        - n_iter 수에 좌우된다.=> params 에서 500개만 랜덤하게 뽑음
        

    
'''

'\n\n    wine 분류\n    \n    1.0 4898 - white wine\n    0.0 1599 - red wine\n    \n    하이퍼 파라미터 튜닝\n    \n    1) GridSearchCV\n    \n        - 지정한 params 값을 하나도 빠뜨리지 않고 촘촘하게 조합해서 최고의 파라미터를 찾는다.\n        - 장점: 촘촘하게 찾음\n        - 단점: 시간이 많이 걸림\n        \n    \n    2) RandomizedSearchCV\n    \n        - GridSearchCV 처럼 모든 파라미터를 사용하지 않고 랜덤하게 추출된 샘플링 사용해서 최고의 파라미터를 찾는다.\n        - n_iter 수에 좌우된다.=> params 에서 500개만 랜덤하게 뽑음\n        \n\n    \n'

In [2]:
import numpy as np
import pandas as pd

### 1. 데이터 준비

In [3]:
wine = pd.read_csv(r"C:\Users\YB\Desktop\tf24_study\01_지도학습\kNN_wine분류_사이킷런API활용_수강생제공용\wine.csv")
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [4]:
wine.shape

(6497, 4)

In [5]:
wine['class'].value_counts()

1.0    4898
0.0    1599
Name: class, dtype: int64

### 2. 입력데이터와 label 생성

In [6]:
wine_data = wine.iloc[:,:3]
wine_target = wine.iloc[:,3]
wine_data.shape, wine_target.shape

((6497, 3), (6497,))

### 3. 훈련 데이터 및 테스트 데이터 분리

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train,X_test,y_train,y_test = train_test_split(wine_data,wine_target,test_size=0.2,random_state=1)

In [9]:
X_train.shape,y_train.shape

((5197, 3), (5197,))

In [10]:
X_test.shape,y_test.shape

((1300, 3), (1300,))

### 결정트리는 표준화 작업이 필요없다. 매우 큰 장점이다. 

### 결정트리는 대신 과적합에 매우 민감하다 매우 큰 단점이다. (가지치기 -> 하이퍼파라미터로 과적합 방지)

### 4. 모델 생성

In [11]:
from sklearn.tree import DecisionTreeClassifier

In [12]:
dt = DecisionTreeClassifier(random_state = 1)
dt.fit(X_train,y_train)

### 5. 하이퍼 파리미터 튜닝

In [13]:
#from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [14]:
params = {'max_depth':range(5,20,1),#15
          "min_samples_split":range(2,100,10),#10
          'min_impurity_decrease':np.arange(0.0001, 0.001, 0.0001)#9
 
         }

In [15]:
gs = RandomizedSearchCV(dt,param_distributions=params,scoring='accuracy',
                  n_jobs=-1,
                  refit=True,
                  cv=5,
                  verbose=1,
                  n_iter=500) #verbose => log


In [16]:
gs.fit(X_train,y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


### GridSearchCV 실행결과

In [17]:
gs.cv_results_

{'mean_fit_time': array([0.0090816 , 0.00762744, 0.00853148, 0.00842505, 0.00919204,
        0.00956216, 0.00934753, 0.01064267, 0.00788965, 0.00816259,
        0.00999818, 0.00586367, 0.00801988, 0.00877028, 0.00662093,
        0.00927782, 0.00998058, 0.00851173, 0.00811653, 0.00685077,
        0.00696898, 0.00776873, 0.00791421, 0.0080801 , 0.00716   ,
        0.00608702, 0.0082068 , 0.0092514 , 0.00650473, 0.00778866,
        0.00717874, 0.00769186, 0.00829492, 0.00839758, 0.00660996,
        0.00861096, 0.00680842, 0.00757985, 0.00671673, 0.00631242,
        0.0075007 , 0.00691738, 0.00755682, 0.00662827, 0.0067699 ,
        0.00737524, 0.00681047, 0.0067522 , 0.00589566, 0.00666828,
        0.00687089, 0.00654202, 0.00759754, 0.00696955, 0.00713992,
        0.00702734, 0.00802488, 0.00768847, 0.00916195, 0.00618997,
        0.00698624, 0.00653887, 0.00664859, 0.00752344, 0.00578117,
        0.00680051, 0.00684538, 0.00593643, 0.00916572, 0.00759578,
        0.00863466, 0.006112  ,

In [18]:
### 실행결과를 보기편하게 df로 변경
df=pd.DataFrame(gs.cv_results_)
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,param_min_impurity_decrease,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.009082,0.00133,0.002687,0.00027,82,0.0003,12,"{'min_samples_split': 82, 'min_impurity_decrea...",0.852885,0.872115,0.849856,0.860443,0.870067,0.861073,0.008901,264
1,0.007627,0.001084,0.002674,0.000939,92,0.0006,5,"{'min_samples_split': 92, 'min_impurity_decrea...",0.840385,0.868269,0.851781,0.869105,0.864293,0.858766,0.011085,370
2,0.008531,0.000877,0.002987,0.001082,92,0.0004,6,"{'min_samples_split': 92, 'min_impurity_decrea...",0.840385,0.866346,0.851781,0.869105,0.85563,0.856649,0.010377,469
3,0.008425,0.001755,0.002776,0.001566,82,0.0002,8,"{'min_samples_split': 82, 'min_impurity_decrea...",0.846154,0.869231,0.850818,0.86333,0.86718,0.859343,0.009184,332
4,0.009192,0.001904,0.003891,0.002532,42,0.0006,8,"{'min_samples_split': 42, 'min_impurity_decrea...",0.85,0.867308,0.849856,0.869105,0.868142,0.860882,0.008962,266


In [19]:
### 실행결과에서 Test_score 평균 점수
gs.cv_results_['mean_test_score']
#len(gs.cv_results_['mean_test_score']) #500 ==> n_iter값

array([0.86107315, 0.85876638, 0.85664933, 0.85934256, 0.86088214,
       0.86126508, 0.86126601, 0.85664766, 0.86184367, 0.85761253,
       0.86126601, 0.86145887, 0.85510865, 0.85876657, 0.86165063,
       0.85818946, 0.86107407, 0.85876508, 0.86165155, 0.85818798,
       0.85915174, 0.85818835, 0.86318909, 0.86396109, 0.85915044,
       0.85876471, 0.85645702, 0.85761198, 0.85876508, 0.85915044,
       0.86318964, 0.862999  , 0.85761198, 0.86242208, 0.86203617,
       0.86242115, 0.85645702, 0.86030373, 0.86088214, 0.85761198,
       0.85703395, 0.85838195, 0.86184349, 0.86126657, 0.86319057,
       0.85895924, 0.8618433 , 0.86088195, 0.86184349, 0.86261383,
       0.86203358, 0.85664933, 0.86126416, 0.86222699, 0.86318964,
       0.86299808, 0.86068835, 0.86145869, 0.86184349, 0.86068909,
       0.86261383, 0.86030373, 0.8618433 , 0.85991838, 0.85857407,
       0.85780429, 0.8595345 , 0.86145906, 0.86126564, 0.86165137,
       0.86203395, 0.86184349, 0.86280595, 0.85857407, 0.85818

In [20]:
### test_score의 최대 점수 index
np.argmax(gs.cv_results_['mean_test_score'])#752

86

In [28]:
### test_score의 최대 점수
gs.cv_results_['mean_test_score'][86]

0.8654993706966758

### 최고 정확도

In [29]:
# 이전 실습의 gs.cv_results_['mean_test_score'][752] 와 동일
gs.best_score_

0.8654993706966758

### 최고 파리미터

In [30]:
#  파라미터 중 최고의 파라미터
gs.best_params_

{'min_samples_split': 22,
 'min_impurity_decrease': 0.00030000000000000003,
 'max_depth': 12}

In [31]:
### refit=True로 지정했기 때문에 최고의 파라미터로 미리 훈련했고 그 최적의 모델을 반환
best_model = gs.best_estimator_
best_model

### 6. 평가

In [32]:
#dt.score(X_train,y_train) # 튜닝 전 모델 
best_model.score(X_train,y_train)

0.9028285549355397

In [33]:
best_model.score(X_test,y_test)

0.8676923076923077

In [27]:
### 결론은 GridSearchCV의 총 6750, RandomizedSearchCV 는 총 2500 이다.
### 1/3 수준의 훈련량으로 GridSearchCV의 score 값에 근접할 수 있다.