## 1. Voting Ensemble:

In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn import metrics
from sklearn.datasets import load_breast_cancer
# Ignore every warning raised from this point on so that cell outputs stay uncluttered.
# NOTE(review): this is a blanket filter, so library diagnostics (for example solver
# convergence warnings) are hidden as well — confirm that is intended.
warnings.filterwarnings(action='ignore')

### 1.1. Read in data:

In [2]:
# Load the breast cancer dataset bundled with scikit-learn.
data = load_breast_cancer()

In [3]:
# Explanatory variables: list the feature names first, then keep the feature matrix as X.
print(data['feature_names'])
X = data['data']

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [4]:
# Dimensions of the feature matrix, shown as (rows, columns).
X.shape

(569, 30)

In [5]:
# Response variable.
# Flip the stored targets so that 0 stands for benign and 1 for malignant.
Y = 1 - data['target']

# Class names taken in reverse of their stored order so they line up with the flipped encoding.
label = list(data['target_names'])[::-1]
print(label)

['benign', 'malignant']


In [6]:
# Split into train and test sets: 30% of the samples are held out for testing,
# and the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1234,
)

### 1.2. Predicting with individual estimators: 3개 알고리즘으로 각각 예측해보기

In [7]:
# Decision tree baseline.
# random_state is pinned (same seed as the train/test split) because scikit-learn
# permutes the candidate features at every split even with the default "best"
# splitter; without a seed the fitted tree, and therefore the accuracy printed
# below, can change from one run to the next.
# NOTE: re-run this cell to refresh the printed score after seeding.
DTC = DecisionTreeClassifier(max_depth=10, random_state=1234)
DTC.fit(X_train, Y_train)

# Score the tree on the held-out test set, rounded to three decimals.
Y_pred = DTC.predict(X_test)
print("Tree accuracy : " + str(np.round(metrics.accuracy_score(Y_test, Y_pred), 3)))

Tree accuracy : 0.936


In [8]:
# k-nearest-neighbours baseline using the 5 closest training samples.
KNN = KNeighborsClassifier(n_neighbors=5).fit(X_train, Y_train)

# Score on the held-out test set, rounded to three decimals.
Y_pred = KNN.predict(X_test)
print(
    "KNN accuracy : "
    + str(np.round(metrics.accuracy_score(y_true=Y_test, y_pred=Y_pred), decimals=3))
)

KNN accuracy : 0.936


In [9]:
# Logistic regression baseline with the library's default settings.
# NOTE(review): the model is fitted on the raw feature matrix while all warnings are
# silenced at the top of the notebook — presumably any solver convergence warning
# would be hidden here; confirm.
LR = LogisticRegression().fit(X_train, Y_train)

# Score on the held-out test set, rounded to three decimals.
Y_pred = LR.predict(X_test)
print(
    "Logistic regression accuracy : "
    + str(np.round(metrics.accuracy_score(y_true=Y_test, y_pred=Y_pred), decimals=3))
)

Logistic regression accuracy : 0.924


### 1.3. Predicting with a voting ensemble: 보팅 앙상블 예측 - hard 타입, soft 타입

hard 타입

In [10]:
# Hard-voting ensemble: DTC is registered as 'Tree', KNN as 'knn' and LR as 'Logistic';
# voting='hard' selects majority-rule voting over the predicted class labels.
VC = VotingClassifier(
    estimators=[('Tree', DTC), ('knn', KNN), ('Logistic', LR)],
    voting='hard',
).fit(X_train, Y_train)

# Score the ensemble on the held-out test set, rounded to three decimals.
Y_pred = VC.predict(X_test)
print(
    "Voting Classifier Accuracy : "
    + str(np.round(metrics.accuracy_score(y_true=Y_test, y_pred=Y_pred), decimals=3))
)

# In the recorded run, combining the three models scored higher than each of them alone.

Voting Classifier Accuracy : 0.953


soft 타입

In [11]:
# Soft-voting ensemble over the same three estimators; voting='soft' bases the
# prediction on the estimators' predicted class probabilities instead of their labels.
VC = VotingClassifier(
    estimators=[('Tree', DTC), ('knn', KNN), ('Logistic', LR)],
    voting='soft',
).fit(X_train, Y_train)

# Score the ensemble on the held-out test set, rounded to three decimals.
Y_pred = VC.predict(X_test)
print(
    "Voting Classifier Accuracy : "
    + str(np.round(metrics.accuracy_score(y_true=Y_test, y_pred=Y_pred), decimals=3))
)

# In the recorded run, hard and soft voting produced the same accuracy, so the voting
# type made no difference here.
# Author's takeaway: which algorithms are combined is what influences the result.

Voting Classifier Accuracy : 0.953
