In [1]:
# Jupyter notebook environment setup
import warnings

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any deprecation or convergence warnings
# raised by the cells below.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# `IPython.display.set_matplotlib_formats` is deprecated (IPython 7.23+) in
# favour of matplotlib_inline; fall back to the old location on older installs.
try:
    from matplotlib_inline.backend_inline import set_matplotlib_formats
except ImportError:
    from IPython.display import set_matplotlib_formats
set_matplotlib_formats("retina")

from IPython.display import Image

# `display` is imported from IPython.display; importing it from
# IPython.core.display is deprecated since IPython 7.14.
from IPython.display import display, HTML

# Optional variant that also forces a Korean font for the notebook container:
# display(HTML("<style>.container { font-weight: bold !important; font-family:'Malgun Gothic' !important;}</style>"))
display(HTML("<style>.container { font-weight: bold !important;}</style>"))
display(HTML("<style>.container { width: 98% !important; }</style>"))

In [2]:
import numpy as np
import pandas as pd
import os

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# 관련 라이브러리 임포트 
import matplotlib.font_manager as fm

#  한글글꼴로 변경
# plt.rcParams['font.family'] = '한글글꼴명'
plt.rcParams['font.size'] = 11.0
# plt.rcParams['font.family'] = 'batang'
# plt.rcParams['font.family'] = 'Malgun Gothic'

# 그래프에서 마이너스 폰트 깨지는 문제에 대한 대처
matplotlib.rcParams['axes.unicode_minus'] = False

# 그래프 기본 크기 설정 
plt.rcParams['figure.figsize'] = [10, 6]

In [3]:
from sklearn.ensemble import VotingClassifier, VotingRegressor

from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import f1_score, recall_score, precision_score, classification_report, r2_score

from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.pipeline import Pipeline

# 앙상블(Ensemble) 

- Ensemble : 조화라는 사전적인 의미를 지님
- Ensemble Learning
    - 여러 개의 기본 모델을 활용하여 하나의 새로운 모델을 만들어내는 개념 
    - 기본모델(Base Model)을 weak learner, classifier, base learner 등으로 부름

### 보팅(Voting)

- 여러 개의 분류기가 투표를 통해 최종 예측 결과를 결정하는 방식
- 하드 보팅 : 다수의 classifier 간 다수결로 최종 class 결정
- 소프트 보팅 : 다수의 classifier 들의 class 확률을 평균하여 결정
- VotingClassifier 활용 : `VotingClassifier(estimators=[(키1, 예측기1), (키2, 예측기2)], voting='soft' 또는 'hard')`


# Voting Classification

## Voting

<img src='https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fb5UNZc%2FbtrhVZEJkJA%2F1bqKVdovvqY2NY050MWMSK%2Fimg.jpg'>

**위스콘신 유방암 데이터 로드**

- cancer.data
- cancer.feature_names
- cancer.target 

In [4]:
# Load the Wisconsin breast cancer dataset bundled with scikit-learn
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
# Show which fields the returned object exposes
cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [5]:
# Wrap the feature matrix in a DataFrame labelled with the feature names
data_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
# Preview the first three rows
data_df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [7]:
# print(cancer.DESCR)

In [8]:
# Check the overall size of the data as (rows, columns)
data_df.shape

(569, 30)

In [11]:
# cancer.target

In [None]:
# cancer.DESCR

### VotingClassifier로 개별모델은 로지스틱 회귀와 KNN을 보팅방식으로 결합하고 성능 비교

In [12]:
# from sklearn.ensemble import VotingClassifier

# 1) Create the two base classifiers
model_lr = LogisticRegression()
model_knn = KNeighborsClassifier(n_neighbors=8)

# 2) Combine them into a new voting model
#    VotingClassifier(estimators=[('name', estimator), ...], voting='soft' | 'hard')
model_vo = VotingClassifier(estimators=[('LR', model_lr), ('KNN', model_knn)], voting='soft')

# 3) Train / test split
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target,
                                                    test_size=0.2, random_state=156)

# 4) Fit the ensemble: LogisticRegression() + KNeighborsClassifier(n_neighbors=8)
# NOTE(review): the features are passed in unscaled; StandardScaler and Pipeline
# are imported by this notebook but unused. Because warnings are suppressed,
# confirm that LogisticRegression actually converges on this data.
model_vo.fit(X_train, y_train)

# 5) Evaluate. A notebook cell only displays its last expression, so the train
#    accuracy is printed explicitly; the test accuracy stays as the cell output.
print('train accuracy:', model_vo.score(X_train, y_train))
model_vo.score(X_test, y_test)


0.9473684210526315

In [15]:
# Collect the two base models and the soft-voting ensemble, then fit each one
# and report its test accuracy followed by the per-class metrics.
classifiers = [model_lr, model_knn, model_vo]
for clf in classifiers:
    clf.fit(X_train, y_train)

    test_accuracy = clf.score(X_test, y_test)
    print(clf, test_accuracy)

    report = classification_report(y_test, clf.predict(X_test))
    print(report)

    # Blank line between models
    print()

LogisticRegression() 0.9385964912280702
              precision    recall  f1-score   support

           0       0.94      0.86      0.90        37
           1       0.94      0.97      0.96        77

    accuracy                           0.94       114
   macro avg       0.94      0.92      0.93       114
weighted avg       0.94      0.94      0.94       114


KNeighborsClassifier(n_neighbors=8) 0.9385964912280702
              precision    recall  f1-score   support

           0       0.89      0.92      0.91        37
           1       0.96      0.95      0.95        77

    accuracy                           0.94       114
   macro avg       0.93      0.93      0.93       114
weighted avg       0.94      0.94      0.94       114


VotingClassifier(estimators=[('LR', LogisticRegression()),
                             ('KNN', KNeighborsClassifier(n_neighbors=8))],
                 voting='soft') 0.9473684210526315
              precision    recall  f1-score   support

        

### voting='hard' 방식

In [57]:
# from sklearn.ensemble import VotingClassifier

# Fresh base classifiers for the hard-voting ensemble
model_lr = LogisticRegression()
model_knn = KNeighborsClassifier(n_neighbors=8)

# 'hard' (majority vote) is the default voting mode.
# NOTE(review): with only two voters a tie is possible; scikit-learn breaks ties
# by picking the class that comes first in ascending label order.
model_vo2 = VotingClassifier(estimators=[('LR', model_lr), ('KNN', model_knn)], voting='hard')
# model_vo2 = VotingClassifier( estimators=[('LR',model_lr),('KNN',model_knn)] , voting='soft' )

# Same split parameters as the soft-voting cell, so the results are comparable
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target,
                                                    test_size=0.2, random_state=156)

model_vo2.fit(X_train, y_train)

# A notebook cell only displays its last expression, so the train accuracy is
# printed explicitly; the test accuracy stays as the cell output.
print('train accuracy:', model_vo2.score(X_train, y_train))
model_vo2.score(X_test, y_test)

0.9385964912280702

In [58]:
# Collect the two base models and the hard-voting ensemble, then fit each one
# and print its test accuracy.
classifiers = [model_lr, model_knn, model_vo2]
for clf in classifiers:
    clf.fit(X_train, y_train)

    test_accuracy = clf.score(X_test, y_test)
    print(clf, test_accuracy)

    # Blank line between models
    print()

LogisticRegression() 0.9385964912280702

KNeighborsClassifier(n_neighbors=8) 0.9385964912280702

VotingClassifier(estimators=[('LR', LogisticRegression()),
                             ('KNN', KNeighborsClassifier(n_neighbors=8))]) 0.9385964912280702



In [59]:
# Label of the first test sample
y_test[0]

1

In [61]:
# Feature vector of the first test sample
X_test[0]

array([1.311e+01, 2.254e+01, 8.702e+01, 5.294e+02, 1.002e-01, 1.483e-01,
       8.705e-02, 5.102e-02, 1.850e-01, 7.310e-02, 1.931e-01, 9.223e-01,
       1.491e+00, 1.509e+01, 5.251e-03, 3.041e-02, 2.526e-02, 8.304e-03,
       2.514e-02, 4.198e-03, 1.455e+01, 2.916e+01, 9.948e+01, 6.393e+02,
       1.349e-01, 4.402e-01, 3.162e-01, 1.126e-01, 4.128e-01, 1.076e-01])

In [63]:
# Class probabilities for the first test sample.
# predict_proba() is only available on the soft-voting model; with
# voting='hard' the call raises an error.
model_vo.predict_proba(X_test[:1])

array([[0.0118036, 0.9881964]])

In [65]:
# model_vo2.predict_proba([X_test[0]])

# Voting Regressor

In [18]:
from sklearn.ensemble import VotingRegressor

In [19]:
# Regression dataset: Boston housing.
# `load_boston` was removed in scikit-learn 1.2, where the import raises
# ImportError. In that case rebuild the same arrays from the original StatLib
# file, following the recipe given in scikit-learn's deprecation notice.
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    raw_df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                         sep=r"\s+", skiprows=22, header=None)
    # Each record spans two physical lines in the raw file: the even rows hold
    # the first 11 features, the odd rows hold the last 2 features and the target.
    boston = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']),
    )

boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)

# Append the regression target as the PRICE column
boston_df['PRICE'] = boston.target

In [20]:
# Separate the feature matrix from the PRICE target
X_data = boston_df.drop(columns=['PRICE'])
y_target = boston_df['PRICE']

# 70/30 train-test split.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which the
# classification cells above used for the breast cancer data.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_target, test_size=0.3, random_state=156
)

In [24]:
# from sklearn.ensemble import VotingRegressor

# Base regressors combined by the ensemble
model_linear = LinearRegression()
model_lasso = Lasso(alpha=100)
model_ridge = Ridge(alpha=100)

# VotingRegressor takes no `voting` argument (voting='soft' is not applicable here)
base_regressors = [
    ('LINEAR', model_linear),
    ('LASSO', model_lasso),
    ('RIDGE', model_ridge),
]
model_vo_r = VotingRegressor(estimators=base_regressors)
model_vo_r.fit(X_train, y_train)

# Report the score on the training split and on the held-out split
train_score = model_vo_r.score(X_train, y_train)
test_score = model_vo_r.score(X_test, y_test)
print(train_score, test_score)

0.6536324706721894 0.709581503855359


In [25]:
# Fitted base estimators used by the ensemble
model_vo_r.estimators_

[LinearRegression(), Lasso(alpha=100), Ridge(alpha=100)]

In [26]:
# Fitted base estimators keyed by the names given in `estimators`, with their parameters
model_vo_r.named_estimators_

{'LINEAR': LinearRegression(),
 'LASSO': Lasso(alpha=100),
 'RIDGE': Ridge(alpha=100)}

In [27]:
# Compare the regressor's built-in score with r2_score computed on the same test predictions
model_vo_r.score(X_test , y_test), r2_score(y_test, model_vo_r.predict(X_test))

(0.709581503855359, 0.709581503855359)

In [29]:
# Look up one test sample by index label 15 (label-based lookup, not position)
X_test.loc[15]

CRIM         0.62739
ZN           0.00000
INDUS        8.14000
CHAS         0.00000
NOX          0.53800
RM           5.83400
AGE         56.50000
DIS          4.49860
RAD          4.00000
TAX        307.00000
PTRATIO     21.00000
B          395.62000
LSTAT        8.47000
Name: 15, dtype: float64

In [31]:
# Predicted price for the sample row.
# Select with a list of labels so the input stays a one-row DataFrame: the model
# was fitted on a DataFrame, and wrapping a Series in a list would drop the
# column names it was fitted with.
model_vo_r.predict(X_test.loc[[15]])

array([22.22239808])

In [None]:
# 퀴즈 1 : 회귀문제 
# - 자전거 대여 수요 예측 
# 앙상블 voting 방식으로 여러 모델을 그룹화 시켜  모델을 새로 정의하고 테스트하여라 


In [None]:
# 퀴즈 2 : 분류문제 
# 심장 사고 의료 
# 앙상블 voting 방식으로 여러 모델을 그룹화 시켜  모델을 새로 정의하고 테스트하여라 