In [None]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# the `IPython.core.display` path is deprecated for `display`.
from IPython.display import display, HTML

# Inject a CSS rule that sets elements with class `container` to 90% width
# (presumably the notebook page container -- depends on the front-end's markup).
display(HTML("<style>.container { width:90% !important; }</style>"))

In [None]:
# Silence every Python warning for the rest of the session. Note that this
# also hides deprecation warnings emitted by the libraries imported below.
import warnings
warnings.filterwarnings('ignore')
# Numerical, data-frame and plotting libraries.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
# General-purpose helpers (not referenced by the cells visible in this notebook).
from collections import Counter, defaultdict
from functools import partial
import math, random
# Use the 'ggplot' matplotlib style sheet for all figures.
style.use('ggplot')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Breast cancer patient dataset
from sklearn import datasets
# `train_test_split` lives in `sklearn.model_selection`; the old
# `sklearn.cross_validation` module no longer exists in scikit-learn.
from sklearn.model_selection import train_test_split

cancer = datasets.load_breast_cancer()

# A fixed random_state keeps the train/test split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

# 1. 다수결 (Voting Classifier)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import VotingClassifier

# The three base classifiers that take part in the vote.
model1 = LogisticRegression(random_state=1)
model2 = QuadraticDiscriminantAnalysis()
model3 = GaussianNB()

# Soft-voting ensemble; the weights list gives the GaussianNB entry twice
# the weight of the other two members.
voters = [('lr', model1), ('qda', model2), ('gnb', model3)]
vc = VotingClassifier(
    estimators=voters,
    voting='soft',
    weights=[1, 1, 2],
)

### (1) 각 classifier 결과

In [None]:
# Fresh instances of the three base classifiers, rebound to the same names.
model1 = LogisticRegression(random_state=1)
model2 = QuadraticDiscriminantAnalysis()
model3 = GaussianNB()

# Each classifier paired with the message template used to report its score.
reports = [
    (model1, "LogisticRegression test set 정확도: {:.3f}"),
    (model2, "QuadraticDiscriminantAnalysis  test set 정확도: {:.3f}"),
    (model3, "GaussianNB test set 정확도: {:.3f}"),
]

# Fit all classifiers first, then print the test-set accuracy of each one.
for clf, _ in reports:
    clf.fit(X_train, y_train)

for clf, template in reports:
    print(template.format(clf.score(X_test, y_test)))

### (2) Voting 결과

In [None]:
# Fit the voting ensemble, then report its accuracy on both splits.
vc.fit(X_train, y_train)

for template, features, labels in (
    ("training set 정확도: {:.3f}", X_train, y_train),
    ("test set 정확도: {:.3f}", X_test, y_test),
):
    print(template.format(vc.score(features, labels)))

------

# 2. 배깅 (Bagging)

## (1) 의사결정 나무

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with default settings, used as the baseline for bagging.
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Accuracy on the data the tree was fitted on versus the held-out split.
dt_train_acc = dt.score(X_train, y_train)
dt_test_acc = dt.score(X_test, y_test)
print("training set 정확도: {:.3f}".format(dt_train_acc))
print("test set 정확도: {:.3f}".format(dt_test_acc))

## (2) Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble built on the decision tree `dt`.
# The estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, so the
# positional form works on both sides of the rename.
# random_state makes the bootstrap sampling reproducible between runs.
bg = BaggingClassifier(dt, random_state=0)
bg.fit(X_train, y_train)

print("training set 정확도: {:.3f}".format(bg.score(X_train, y_train)))
print("test set 정확도: {:.3f}".format(bg.score(X_test, y_test)))

------

# 3. 랜덤포레스트 (Random Forest)

## (1) Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. A fixed random_state keeps the
# scores printed here (and the feature-importance plot that reads `rf`)
# reproducible across Restart & Run All.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

print("training set 정확도: {:.3f}".format(rf.score(X_train, y_train)))
print("test set 정확도: {:.3f}".format(rf.score(X_test, y_test)))

## (2) feature importance

In [None]:
# Horizontal bar chart of the random forest's feature importances.
plt.figure(figsize =(20, 10))
# x/y are passed by keyword: recent seaborn releases reject positional
# data vectors in barplot.
sns.barplot(x=rf.feature_importances_, y=cancer.feature_names)
plt.yticks(fontsize = 15);

------

# 4. Boosting

## (1)-1 AdaBoost
- 가장 기본적인 부스팅 알고리즘
- 약분류기를 가중 선형 결합한다.

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default settings.
ab = AdaBoostClassifier()
ab.fit(X_train, y_train)

# Report accuracy on the training split first, then on the test split.
for template, features, labels in (
    ("training set 정확도: {:.3f}", X_train, y_train),
    ("test set 정확도: {:.3f}", X_test, y_test),
):
    print(template.format(ab.score(features, labels)))

## (1)-2 feature importance

In [None]:
# Horizontal bar chart of the AdaBoost model's feature importances.
plt.figure(figsize =(20, 10))
# x/y are passed by keyword: recent seaborn releases reject positional
# data vectors in barplot.
sns.barplot(x=ab.feature_importances_, y=cancer.feature_names)
plt.yticks(fontsize = 15);

## (2)-1 Gradient Boost
- 오차 함수를 최소화하는 데 gradient descent 방식을 사용한다.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default settings.
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

# Accuracy on the fitted data versus the held-out split.
gb_train_acc = gb.score(X_train, y_train)
gb_test_acc = gb.score(X_test, y_test)
print("training set 정확도: {:.3f}".format(gb_train_acc))
print("test set 정확도: {:.3f}".format(gb_test_acc))

## (2)-2 feature importance

In [None]:
# Horizontal bar chart of the gradient boosting model's feature importances.
plt.figure(figsize =(20, 10))
# x/y are passed by keyword: recent seaborn releases reject positional
# data vectors in barplot.
sns.barplot(x=gb.feature_importances_, y=cancer.feature_names)
plt.yticks(fontsize = 15);

# 5. xgboost
- 현재 Kaggle 같은 많은 머신러닝 대회에서 압도적으로 많은 비율로 사용되는 모델.
- 기존 Gradient Boosting 모델에 정규화 방식을 차용하여 과적합을 방지하고, 병렬 처리를 가능하게 해 정확도와 속도를 모두 향상시켰다.
- https://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf

## (1) xgboost

In [None]:
!pip install xgboost

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier with default settings.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Report accuracy on the training split first, then on the test split.
for template, features, labels in (
    ("training set 정확도: {:.3f}", X_train, y_train),
    ("test set 정확도: {:.3f}", X_test, y_test),
):
    print(template.format(xgb.score(features, labels)))

## (2) feature importance

In [None]:
# Horizontal bar chart of the XGBoost model's feature importances.
plt.figure(figsize =(20, 10))
# x/y are passed by keyword: recent seaborn releases reject positional
# data vectors in barplot.
sns.barplot(x=xgb.feature_importances_, y=cancer.feature_names)
plt.yticks(fontsize = 15);

> ### 빠르게 실행이 가능하고, 성능이 우수하므로 본격적인 머신러닝 모델링을 하기 전에 가장 먼저 실행해서 결과를 살펴보는 것이 좋다.