In [21]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [22]:
# breast cancer 데이터 로딩 함수
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
def load_dataset(data_name, stratify=False):
    pass
    if data_name == "breast_cancer":
        dataset = load_breast_cancer()
    if data_name == "iris":
        dataset = load_iris()
        
    if stratify == False:
        stratify = None
    else:
        stratify = dataset.target
    x_tr, x_te, y_tr, y_te = train_test_split(dataset.data,
                           dataset.target,
                           random_state=0,
                           stratify=stratify)
    return x_tr, x_te, y_tr, y_te

x_tr, x_te, y_tr, y_te = load_dataset("iris", stratify=True)

model = DecisionTreeClassifier(max_depth=3).fit(x_tr, y_tr)
model.score(x_tr, y_tr),model.score(x_te, y_te)

(0.9642857142857143, 0.9473684210526315)

In [25]:
cancer = load_breast_cancer()

In [26]:
X = cancer["data"]
Y = cancer["target"]

x_tr, x_te, y_tr, y_te = train_test_split(X,Y, random_state=0, stratify=Y)

In [8]:
# 학습 분류
knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(max_iter=10000)
dt3 = DecisionTreeClassifier(max_depth=3)
dt5 = DecisionTreeClassifier(max_depth=5)

### Voting 보팅

In [12]:
from sklearn.ensemble import VotingClassifier

hard = VotingClassifier([('knn1', knn1),
                        ('knn2', knn2),
                        ('lr', lr),
                        ('dt3', dt3),
                        ('dt5', dt5)])

soft = VotingClassifier([('knn1', knn1),
                        ('knn2', knn2),
                        ('lr', lr),
                        ('dt3', dt3),
                        ('dt5', dt5)], voting='soft')
hard, soft

(VotingClassifier(estimators=[('knn1', KNeighborsClassifier()),
                              ('knn2', KNeighborsClassifier(n_neighbors=3)),
                              ('lr', LogisticRegression(max_iter=10000)),
                              ('dt3', DecisionTreeClassifier(max_depth=3)),
                              ('dt5', DecisionTreeClassifier(max_depth=5))]),
 VotingClassifier(estimators=[('knn1', KNeighborsClassifier()),
                              ('knn2', KNeighborsClassifier(n_neighbors=3)),
                              ('lr', LogisticRegression(max_iter=10000)),
                              ('dt3', DecisionTreeClassifier(max_depth=3)),
                              ('dt5', DecisionTreeClassifier(max_depth=5))],
                  voting='soft'))

In [16]:
# 분류기 성능 평가
names = ['hard', 'soft', 'knn1', 'knn2', 'lr', 'dt3', 'dt5']

for idx, model in enumerate([hard, soft, knn1, knn2, lr, dt3, dt5]):
    model.fit(x_tr, y_tr)
    name = names[idx]
    train_score = model.score(x_tr, y_tr) *100
    test_score = model.score(x_te, y_te) * 100
    print(f'{name} Train : {train_score:.2f}%')
    print(f'{name} Test : {test_score:.2f}%')
    print()

hard Train : 98.12%
hard Test : 94.41%

soft Train : 99.53%
soft Test : 95.10%

knn1 Train : 94.60%
knn1 Test : 91.61%

knn2 Train : 95.77%
knn2 Test : 91.61%

lr Train : 96.71%
lr Test : 93.71%

dt3 Train : 97.65%
dt3 Test : 93.01%

dt5 Train : 100.00%
dt5 Test : 91.61%



### Bagging

In [45]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=5).fit(x_tr, y_tr)
model.score(x_tr, y_tr),model.score(x_te, y_te)

(1.0, 0.9440559440559441)

In [34]:
# 비교
model = DecisionTreeClassifier(max_depth=5).fit(x_tr, y_tr)
model.score(x_tr, y_tr),model.score(x_te, y_te)

(1.0, 0.9230769230769231)

### Boosting 
- 일반적으로 가장 성능이 좋은 기법

In [55]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(max_depth=3).fit(x_tr, y_tr)
model.score(x_tr, y_tr),model.score(x_te, y_te)

(1.0, 0.958041958041958)

### Stacking
- 여러 모델이 예측한 결과 값을 다른 모델의 학습 데이터로입력하여 재 학습

In [56]:
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier()),
             ('gd', GradientBoostingClassifier())]

model = StackingClassifier(estimators=estimators,
                          final_estimator=LogisticRegression())

model.fit(x_tr, y_tr).score(x_te, y_te)

0.951048951048951