# 앙상블 (Ensemble)
- 다양한 모델을 결합하여 예측 성능을 향상시키는 방법
- 투표(Voting), 배깅(Bagging), 부스팅(Boosting), 스태킹(Stacking) 네 가지로 구분

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
import warnings
warnings.filterwarnings('ignore')

### Voting
- hard voting: 여러 개의 예측치에 대해 다수결로 결정
- soft voting: 여러 개의 예측 확률을 평균내어 결정

In [5]:
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()

df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [6]:
df['target'].value_counts()

target
1    357
0    212
Name: count, dtype: int64

In [8]:
# 데이터 준비 (분리)
from sklearn.model_selection import train_test_split

x = data.data
y = data.target

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

### hard voting

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

knn_clf = KNeighborsClassifier()
lr_clf = LogisticRegression()
dt_clf = DecisionTreeClassifier()

voting_clf = VotingClassifier(
    estimators=[
        ('knn_clf', knn_clf),
        ('lr_clf', lr_clf),
        ('dt_clf', dt_clf)    
    ],
    voting='hard'    # 기본값
)

# 앙상블 모델 학습
voting_clf.fit(x_train, y_train)

# 예측 및 평가
y_pred_train = voting_clf.predict(x_train)
acc_score_train = accuracy_score(y_train, y_pred_train)
print('학습 정수:', acc_score_train)

y_pred_test = voting_clf.predict(x_test)
acc_score_test = accuracy_score(y_test, y_pred_test)
print('테스트 평가 점수:', acc_score_test)

학습 정수: 0.9714285714285714
테스트 평가 점수: 0.956140350877193


In [19]:
# hard voting 작동 원리 == 다수결
start, end = 40, 50

voting_clf_pred = voting_clf.predict(x_test[start:end])
print('앙상블 예측값:', voting_clf_pred)

for classfier in [knn_clf, lr_clf, dt_clf]:
    # 개별 학습 및 예측
    classfier.fit(x_train, y_train)
    pred = classfier.predict(x_test)
    acc_score = accuracy_score(y_test, pred)

    class_name = classfier.__class__.__name__       # 클래스의 이름 속성
    print(f'{class_name} 개별 정확도: {acc_score:.4f}')
    print(f'{class_name} 예측값: {pred[start:end]}')

앙상블 예측값: [0 1 0 1 0 0 1 1 1 0]
KNeighborsClassifier 개별 정확도: 0.9386
KNeighborsClassifier 예측값: [0 1 0 1 0 0 1 1 1 0]
LogisticRegression 개별 정확도: 0.9649
LogisticRegression 예측값: [0 1 0 1 0 0 1 1 1 0]
DecisionTreeClassifier 개별 정확도: 0.8947
DecisionTreeClassifier 예측값: [0 1 0 1 0 0 1 1 1 0]


### soft voting

In [40]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

knn_clf = KNeighborsClassifier()
lr_clf = LogisticRegression()
dt_clf = DecisionTreeClassifier(random_state=0)

voting_clf = VotingClassifier(
    estimators=[
        ('knn_clf', knn_clf),
        ('lr_clf', lr_clf),
        ('dt_clf', dt_clf)    
    ],
    voting='soft'    # 기본값
)

# 앙상블 모델 학습
voting_clf.fit(x_train, y_train)

# 예측 및 평가
y_pred_train = voting_clf.predict(x_train)
acc_score_train = accuracy_score(y_train, y_pred_train)
print('학습 정수:', acc_score_train)

y_pred_test = voting_clf.predict(x_test)
acc_score_test = accuracy_score(y_test, y_pred_test)
print('테스트 평가 점수:', acc_score_test)

학습 정수: 0.989010989010989
테스트 평가 점수: 0.9649122807017544


In [None]:
# soft voting 작동 원리 == 각 예측가의 확률값 평균

start, end = 40, 50

voting_clf_pred_proba = voting_clf.predict_proba(x_test[start:end])
print('앙상블 예측값:', voting_clf_pred_proba)

for classfier in [knn_clf, lr_clf, dt_clf]:
    # 개별 학습 및 예측
    classfier.fit(x_train, y_train)                                                             
    
    pred = classfier.predict(x_test)
    acc_score = accuracy_score(y_test, pred)
    pred_proba = classfier.predict_proba(x_test[start:end])

    class_name = classfier.__class__.__name__       # 클래스의 이름 속성
    print(f'{class_name} 개별 정확도: {acc_score:.4f}')
    print(f'{class_name} 예측값: {pred[start:end]}')

앙상블 예측값: [[5.77219228e-01 4.22780772e-01]
 [6.81827764e-04 9.99318172e-01]
 [9.99526759e-01 4.73241064e-04]
 [2.77579326e-04 9.99722421e-01]
 [9.00069906e-01 9.99300938e-02]
 [1.00000000e+00 1.59599463e-13]
 [6.74633161e-05 9.99932537e-01]
 [1.53322319e-02 9.84667768e-01]
 [9.19791079e-04 9.99080209e-01]
 [9.99839494e-01 1.60506219e-04]]
KNeighborsClassifier 개별 정확도: 0.9386
KNeighborsClassifier 예측값: [0 1 0 1 0 0 1 1 1 0]
LogisticRegression 개별 정확도: 0.9649
LogisticRegression 예측값: [0 1 0 1 0 0 1 1 1 0]
DecisionTreeClassifier 개별 정확도: 0.9123
DecisionTreeClassifier 예측값: [1 1 0 1 0 0 1 1 1 0]


### Bagging
- Bootstrap Aggregation
- Bootstrap 방식의 샘플링: 각 estimator 마다 훈련 데이터를 뽑을 때, 중복 값이 허용하는 방식
- 분류 모델의 경우, 각 tree(estimator)의 예측값을 다수결(hard voting) 결정
- 회구 모델의 경우, 각 tree(estimator)의 예측값을 평균내어 결정
- 기본적으로 100개의 tree 사용

##### 하이퍼 파라미터

In [44]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)

# 학습
rf_clf.fit(x_train, y_train)

y_pred_train = rf_clf.predict(x_train)
acc_score_train = accuracy_score(y_train, y_pred_train)
print('학습 점수:', acc_score_train)

y_pred_test = rf_clf.predict(x_test)
acc_score_test = accuracy_score(y_test, y_pred_test)
print('테스트 평가 점수:', acc_score_test)


학습 점수: 1.0
테스트 평가 점수: 0.9649122807017544
