#### Ensemble ExtraTree 
- 페이스트 방식의 Ensemble : 랜덤 샘플 + 동일 모델(DT)
    * 대표 알고리즘 : ExtraTreeC/R

- 목표 : 와인 분류 -> 0과 1, 2개 종류 분류


In [2]:
# 모듈 로딩 및 데이터준비
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 데이터 준비
FILE = '../data/wine.csv'
winedf=pd.read_csv(FILE)
winedf

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.20,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0
...,...,...,...,...
6492,11.2,1.6,3.27,1.0
6493,9.6,8.0,3.15,1.0
6494,9.4,1.2,2.99,1.0
6495,12.8,1.1,3.34,1.0


In [3]:
# 컬럼별 고윳값 확인 코드 만들기 unique
for i in winedf.columns:
    print(f"{i}의 종류 : {winedf[i].unique()}\n")

alcohol의 종류 : [ 9.4         9.8        10.          9.5        10.5         9.2
  9.9         9.1         9.3         9.          9.7        10.1
 10.6         9.6        10.8        10.3        13.1        10.2
 10.9        10.7        12.9        10.4        13.         14.
 11.5        11.4        12.4        11.         12.2        12.8
 12.6        12.5        11.7        11.3        12.3        12.
 11.9        11.8         8.7        13.3        11.2        11.6
 11.1        13.4        12.1         8.4        12.7        14.9
 13.2        13.6        13.5        10.03333333  9.55        8.5
 11.06666667  9.56666667 10.55        8.8        13.56666667 11.95
  9.95        9.23333333  9.25        9.05       10.75        8.6
  8.9        13.9        13.7         8.         14.2        11.94
 12.89333333 11.46666667 10.98       11.43333333 10.53333333  9.53333333
 10.93333333 11.36666667 11.33333333  9.73333333 11.05        9.75
 11.35       11.45       14.05       12.33333333 12.75

In [4]:
print(winedf['class'].value_counts())
print(winedf['sugar'].value_counts())
print(winedf['pH'].value_counts())
print(winedf['alcohol'].value_counts())

class
1.0    4898
0.0    1599
Name: count, dtype: int64
sugar
2.00     235
1.80     228
1.60     223
1.40     219
1.20     195
        ... 
5.95       1
7.45       1
65.80      1
14.05      1
18.40      1
Name: count, Length: 316, dtype: int64
pH
3.16    200
3.14    193
3.22    185
3.20    176
3.15    170
       ... 
2.77      1
2.72      1
3.81      1
3.82      1
2.82      1
Name: count, Length: 108, dtype: int64
alcohol
9.500000     367
9.400000     332
9.200000     271
10.000000    229
10.500000    227
            ... 
11.366667      1
9.750000       1
11.350000      1
14.050000      1
12.050000      1
Name: count, Length: 111, dtype: int64


In [11]:
winedf.describe()
featuredf = winedf.drop(['class'], axis=1)
target = winedf['class']

print(featuredf.shape, target.shape)

# 학습준비
# 학습용 테스트용 데이터셋 분할
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(featuredf, target , test_size=0.2, stratify=target, random_state=1)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
print('-'*100)
# 학습 진행 (모델 만들기)
# 학습 방법 : 지도학습 -> 분류
# 알고리즘 : 앙상블 -> 배깅 -> randomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# 인스턴스 생성 
RFmodel = ExtraTreesClassifier(random_state=7) # oob_score : 매개변수(샘플 데이터셋 추출 후 남은 데이터셋 검증용으로 사용)
RFmodel.fit(featuredf, target)

# 모델 파라미터
print(RFmodel.classes_)
print(RFmodel.n_classes_)
print('-'*100)
print(RFmodel.feature_importances_)
print(RFmodel.feature_names_in_)
print('-'*100)
print(RFmodel.estimators_)

print('-'*100)

# 성능평가
train_score=RFmodel.score(X_train, y_train)
test_score =RFmodel.score(X_test, y_test)


print(train_score)
print(test_score)

# 튜닝
# RandomizedSearchCV 하이퍼파라미터 최적화 클래스
# 범위가 넓은 하이퍼 파라미터 설정에 좋음
# 지정된 범위에서 지정된 횟무 만큼 하이퍼파라미터를 추출하여 조합 진행
from sklearn.model_selection import RandomizedSearchCV

# RandomForestClassifier 하이퍼파라미터 설정
params = {'max_depth': range(2,16), 'min_samples_leaf':range(5,16), 'criterion': ['gini', 'entropy', 'log_loss']}
rf_model = ExtraTreesClassifier(n_estimators=300, random_state=7)
searchCV = RandomizedSearchCV(rf_model, param_distributions=params,
                              n_iter=1000,
                              verbose=4) # 작업하는 상황을 실시간 모니터링가능

print('-'*100)
searchCV.fit(X_train, y_train)





(6497, 3) (6497,)
(5197, 3) (5197,)
(1300, 3) (1300,)
----------------------------------------------------------------------------------------------------
[0. 1.]
2
----------------------------------------------------------------------------------------------------
[0.18631321 0.52925242 0.28443437]
['alcohol' 'sugar' 'pH']
----------------------------------------------------------------------------------------------------
[ExtraTreeClassifier(random_state=327741615), ExtraTreeClassifier(random_state=976413892), ExtraTreeClassifier(random_state=1202242073), ExtraTreeClassifier(random_state=1369975286), ExtraTreeClassifier(random_state=1882953283), ExtraTreeClassifier(random_state=2053951699), ExtraTreeClassifier(random_state=959775639), ExtraTreeClassifier(random_state=1956722279), ExtraTreeClassifier(random_state=2052949340), ExtraTreeClassifier(random_state=1322904761), ExtraTreeClassifier(random_state=165338510), ExtraTreeClassifier(random_state=1133316631), ExtraTreeClassifier(rand



[CV 1/5] END criterion=gini, max_depth=2, min_samples_leaf=5;, score=0.754 total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=2, min_samples_leaf=5;, score=0.754 total time=   0.1s
[CV 3/5] END criterion=gini, max_depth=2, min_samples_leaf=5;, score=0.755 total time=   0.1s
[CV 4/5] END criterion=gini, max_depth=2, min_samples_leaf=5;, score=0.754 total time=   0.1s
[CV 5/5] END criterion=gini, max_depth=2, min_samples_leaf=5;, score=0.754 total time=   0.1s
[CV 1/5] END criterion=gini, max_depth=2, min_samples_leaf=6;, score=0.754 total time=   0.2s
[CV 2/5] END criterion=gini, max_depth=2, min_samples_leaf=6;, score=0.754 total time=   0.1s
[CV 3/5] END criterion=gini, max_depth=2, min_samples_leaf=6;, score=0.755 total time=   0.1s
[CV 4/5] END criterion=gini, max_depth=2, min_samples_leaf=6;, score=0.754 total time=   0.1s
[CV 5/5] END criterion=gini, max_depth=2, min_samples_leaf=6;, score=0.754 total time=   0.1s
[CV 1/5] END criterion=gini, max_depth=2, min_samples_leaf=7

In [12]:
# 모델 파라미터
print(searchCV.best_score_)
print(searchCV.best_params_)
print(searchCV.best_estimator_)

cv_resdf = pd.DataFrame(searchCV.cv_results_)
cv_resdf

0.75389649811209
{'min_samples_leaf': 5, 'max_depth': 2, 'criterion': 'gini'}
ExtraTreesClassifier(max_depth=2, min_samples_leaf=5, n_estimators=300,
                     random_state=7)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.210643,0.006571,0.025949,0.008065,5,2,gini,"{'min_samples_leaf': 5, 'max_depth': 2, 'crite...",0.753846,0.753846,0.754572,0.753609,0.753609,0.753896,0.000354,1
1,0.221127,0.006980,0.019172,0.007022,6,2,gini,"{'min_samples_leaf': 6, 'max_depth': 2, 'crite...",0.753846,0.753846,0.754572,0.753609,0.753609,0.753896,0.000354,1
2,0.221403,0.007303,0.021205,0.003517,7,2,gini,"{'min_samples_leaf': 7, 'max_depth': 2, 'crite...",0.753846,0.753846,0.754572,0.753609,0.753609,0.753896,0.000354,1
3,0.223313,0.009239,0.019116,0.001819,8,2,gini,"{'min_samples_leaf': 8, 'max_depth': 2, 'crite...",0.753846,0.753846,0.754572,0.753609,0.753609,0.753896,0.000354,1
4,0.221915,0.008518,0.021327,0.005684,9,2,gini,"{'min_samples_leaf': 9, 'max_depth': 2, 'crite...",0.753846,0.753846,0.754572,0.753609,0.753609,0.753896,0.000354,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457,0.403918,0.052309,0.040566,0.007410,11,15,log_loss,"{'min_samples_leaf': 11, 'max_depth': 15, 'cri...",0.753846,0.753846,0.754572,0.753609,0.753609,0.753896,0.000354,1
458,0.482452,0.049079,0.049613,0.011886,12,15,log_loss,"{'min_samples_leaf': 12, 'max_depth': 15, 'cri...",0.753846,0.753846,0.754572,0.753609,0.753609,0.753896,0.000354,1
459,0.435150,0.045395,0.037047,0.008476,13,15,log_loss,"{'min_samples_leaf': 13, 'max_depth': 15, 'cri...",0.753846,0.753846,0.754572,0.753609,0.753609,0.753896,0.000354,1
460,0.460072,0.034936,0.047209,0.014091,14,15,log_loss,"{'min_samples_leaf': 14, 'max_depth': 15, 'cri...",0.753846,0.753846,0.754572,0.753609,0.753609,0.753896,0.000354,1
