# Voting 방식 모델 구현
- 데이터 : load_breast_cancer
- 유형 : 지도학습 + 분류
- 방법 : Voting방식으로 진행 => LogisticRegression, DecisionTree(전처리 영향X), SVC, RandomForest(랜덤 포레스트 자체가 앙상블이니 빼자)
- 학습 데이터셋 : 동일한 데이터셋으로 3개의 모델로 학습 진행 

In [9]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split # 분리
import pandas as pd
import numpy as np

In [10]:
X, y = load_breast_cancer(as_frame = True, return_X_y=True)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=5)

In [12]:
X_train.shape, y_train.shape

((398, 30), (398,))

# <hr> 2. 학습진행

## 2-1 앙상블 보팅 학습에 사용할 모델 인스턴스 생성

In [13]:
# LogisticRegression

from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(solver="liblinear",random_state=5)
lr_model.fit(X_train, y_train)

In [14]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(random_state=5)
dt_model.fit(X_train, y_train)

In [34]:
from sklearn.svm import SVC

svc_model = SVC(probability=True)
svc_model.fit(X_train, y_train)

## 2-2 Ensemble 알고리즘 기반 분류

In [35]:
from sklearn.ensemble import VotingClassifier

In [36]:
# 동일 데이터셋을 병렬학습 진행할 모델 리스트 선정 및 결과 결정 방법 설정 
vt_models = VotingClassifier(estimators=[("lr_model", lr_model), 
                                        ("dt_model", dt_model), 
                                        ("svc_model", svc_model)],
                           verbose = True)

In [37]:
# 동일 데이터셋을 전달해서 3개의 모델 동시에 학습 진행
vt_models.fit(X_train, y_train)

[Voting] ................. (1 of 3) Processing lr_model, total=   0.0s
[Voting] ................. (2 of 3) Processing dt_model, total=   0.0s
[Voting] ................ (3 of 3) Processing svc_model, total=   0.0s


In [38]:
# 예측하기
vt_models.predict(X_test)

array([1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0])

In [39]:
# 보팅 인스턴스 내의 학습기들
vt_models.estimators_[0]

In [40]:
# 예측하기
new_data = pd.DataFrame([X_test.iloc[0]], columns=X_test.columns)
new_data

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
440,10.97,17.2,71.73,371.5,0.08915,0.1113,0.09457,0.03613,0.1489,0.0664,...,12.36,26.87,90.14,476.4,0.1391,0.4082,0.4779,0.1555,0.254,0.09532


In [41]:
# 예측하기
vt_models.predict(new_data)
vt_models.predict_proba(new_data) # soft일 대만 됨, 그리고 모든 모델에 predict proba가 있어야함

AttributeError: predict_proba is not available when voting='hard'

In [42]:
X_test.iloc[0].to_frame().T # 둘이 똑같음 

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
440,10.97,17.2,71.73,371.5,0.08915,0.1113,0.09457,0.03613,0.1489,0.0664,...,12.36,26.87,90.14,476.4,0.1391,0.4082,0.4779,0.1555,0.254,0.09532


In [43]:
# 보팅 인스턴스 내의 학습기들 접근방법
vt_models.estimators_[0], vt_models.named_estimators_

(LogisticRegression(random_state=5, solver='liblinear'),
 {'lr_model': LogisticRegression(random_state=5, solver='liblinear'),
  'dt_model': DecisionTreeClassifier(random_state=5),
  'svc_model': SVC(probability=True)})

In [44]:
vt_models.named_estimators_.get("lr_model").predict(new_data)

array([1], dtype=int64)

In [45]:
for key, val in vt_models.named_estimators_.items():
    print(f"{key} {val}")
    print(val.predict(new_data))
    print()

lr_model LogisticRegression(random_state=5, solver='liblinear')
[1]

dt_model DecisionTreeClassifier(random_state=5)
[1]

svc_model SVC(probability=True)
[1]



In [26]:
print(f"[LogisticRegression] Train : {lr_model.score(X_train, y_train)} Test : {lr_model.score(X_test, y_test)}")

[LogisticRegression] Train : 0.9522613065326633 Test : 0.9590643274853801


In [27]:
print(f"[DecisionTreeClassifier] Train : {dt_model.score(X_train, y_train)} Test : {dt_model.score(X_test, y_test)}")

[DecisionTreeClassifier] Train : 1.0 Test : 0.9415204678362573


In [28]:
print(f"[RandomForestClassifier] Train : {rf_model.score(X_train, y_train)} Test : {rf_model.score(X_test, y_test)}")

NotFittedError: This VotingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.