# Hands on Lab - Bagging, Boosting and Stacking

**2022 [FinanceData.KR](http://financedata.kr) | [facebook.com/financedata](http://facebook.com/financedata)**


In [None]:
# mlxtend must be installed / upgraded to the latest release before running
# the stacking cells of this notebook (they import from mlxtend).
!pip install -U mlxtend

In [None]:
# Silence every warning for the rest of the session so the notebook output
# stays focused on the results.
import warnings

warnings.filterwarnings(action="ignore")

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (14,4)
plt.rcParams['lines.linewidth'] = 2
plt.rcParams["axes.grid"] = True

plt.rcParams['axes.unicode_minus'] = False
plt.rcParams["axes.formatter.useoffset"] = False
plt.rcParams["axes.formatter.limits"] = -10000, 10000

In [None]:
import pandas as pd
import numpy as np

## 데이터 준비 make_moons

In [None]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Generate the noisy two-class "moons" data set with a fixed seed, then
# hold out a test split (also seeded) for the evaluations below.
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# Tabular view of the samples: both coordinates plus the class label.
df = pd.DataFrame({"x": X[:, 0], "y": X[:, 1], "label": y})
df.head(10)

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_moons.html
plt.rcParams["figure.figsize"] = (10, 8)

# One scatter layer per class label, all drawn on the same axes.
colors = {0: 'red', 1: 'blue'}
fig, ax = plt.subplots()
for key, group in df.groupby('label'):
    group.plot.scatter(x='x', y='y', ax=ax, label=key, color=colors[key])
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# Three different base learners, all seeded identically.
log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
# probability=True makes the SVC produce class probabilities, which the
# soft-voting ensemble below averages.
svm_clf = SVC(probability=True, random_state=42)

# (name, estimator) pairs in the form VotingClassifier expects.
estimators = list(zip(('lr', 'rf', 'svc'), (log_clf, rnd_clf, svm_clf)))
voting_clf = VotingClassifier(estimators=estimators, voting='soft')
voting_clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Fit each individual model and the ensemble on the training split, then
# report test-set accuracy next to the estimator's class name.
for model in (log_clf, rnd_clf, svm_clf, voting_clf):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    model_name = type(model).__name__
    print(model_name, accuracy_score(y_test, y_pred))

## Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Ensemble of 500 decision trees, each trained on a bootstrap sample of
# 100 training instances.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
bagging_accuracy = accuracy_score(y_test, y_pred)
print('BaggingClassifier : ', bagging_accuracy)

# Single decision tree trained on the full training split, for comparison.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
tree_accuracy = accuracy_score(y_test, y_pred_tree)
print('DecisionTreeClassifier : ', tree_accuracy)

In [None]:
from matplotlib.colors import ListedColormap

def plot_decision_boundary(clf, X, y, axes=(-1.5, 2.5, -1, 1.5), alpha=0.5, contour=True):
    """Draw a fitted classifier's decision regions together with the samples.

    The classifier is evaluated on a 100 x 100 grid spanning ``axes`` and the
    predictions are rendered as filled contours on the current matplotlib
    axes. Samples labelled 0 and 1 are overlaid as yellow circles and blue
    squares respectively; samples with any other label are not drawn.

    Parameters
    ----------
    clf : fitted estimator
        Object exposing ``predict`` for two-column input.
    X : array-like of shape (n_samples, 2)
        Sample coordinates; columns 0 and 1 are plotted on the x and y axes.
    y : array-like of shape (n_samples,)
        Class labels used to select the marker style.
    axes : sequence of four numbers
        Plot limits as ``(x_min, x_max, y_min, y_max)``.
    alpha : float
        Opacity of the sample markers.
    contour : bool
        When true, also draw contour lines on top of the filled regions.
    """
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    # Flatten the grid into (n_points, 2) rows for predict(), then restore
    # the grid shape so the result can be contoured.
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)
    custom_cmap = ListedColormap(['#fafab0', '#9898ff', '#a0faa0'])
    # Filled contours take no line-width argument, so none is passed here.
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
    if contour:
        custom_cmap2 = ListedColormap(['#7d7d58', '#4c4c7f', '#507d50'])
        plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)
    plt.plot(X[:, 0][y == 0], X[:, 1][y == 0], "yo", alpha=alpha)
    plt.plot(X[:, 0][y == 1], X[:, 1][y == 1], "bs", alpha=alpha)
    plt.axis(axes)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)

# Side-by-side comparison: single tree on the left, bagged trees on the right.
plt.figure(figsize=(11, 4))
panels = (
    (121, tree_clf, "Decision Tree"),
    (122, bag_clf, "Decision Trees with Bagging"),
)
for position, model, heading in panels:
    plt.subplot(position)
    plot_decision_boundary(model, X, y)
    plt.title(heading, fontsize=14)
plt.show()

### Out-of-bag evaluation
각 예측기의 훈련(부트스트랩 샘플링)에 사용되지 않은 샘플(out-of-bag)로 앙상블을 평가하도록 지정 (`oob_score=True`)

In [None]:
# Out-of-bag evaluation
# With oob_score=True the fitted ensemble exposes its out-of-bag accuracy
# estimate as `oob_score_`.

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
    random_state=40,
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

## RandomForestClassifier()
* n_estimators: 트리 개수
* max_leaf_nodes: 최대 리프 노드 (과적합 방지)
* n_jobs: 병렬처리 개수 (-1=모든 프로세서 사용)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# 500 trees, each limited to 16 leaf nodes, trained on all available cores.
# The seed is fixed like every other estimator in this notebook so that the
# reported accuracy is reproducible between runs.
random_clf = RandomForestClassifier(
    n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42
)
random_clf.fit(X_train, y_train)

y_pred = random_clf.predict(X_test)
# accuracy_score expects (y_true, y_pred), matching the other cells.
accuracy_score(y_test, y_pred)

## Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

plt.figure(figsize=(10, 8))

# Boost 200 shallow (depth-2) decision trees.
# The `algorithm` argument is deliberately left at the library default:
# the explicit "SAMME.R" value is deprecated in recent scikit-learn
# releases and rejected by newer ones, while older releases already use it
# as their default.
clf = DecisionTreeClassifier(max_depth=2)
ada_clf = AdaBoostClassifier(clf, n_estimators=200, learning_rate=0.5, random_state=42)
ada_clf.fit(X_train, y_train)
plot_decision_boundary(ada_clf, X, y)
plt.show()

## StackingClassifier
http://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/
       
* classifiers: 분류기 리스트
* meta_classifier: 다른 분류기의 결과를 학습하는 분류기

In [None]:
# Simple Stacked Classification
from sklearn import datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import numpy as np

clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

estimator_names = ['KNN', 'Random Forest', 'Naive Bayes', 'StackingClassifier']

print('3-fold cross validation:\n')
for clf, label in zip([clf1, clf2, clf3, sclf], estimator_names):
    scores = model_selection.cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [None]:
import itertools

import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions

# 2 x 2 grid: one panel per base classifier plus the stacked model.
gs = gridspec.GridSpec(2, 2)

fig = plt.figure(figsize=(10, 8))

estimator_names = ['KNN', 'Random Forest', 'Naive Bayes', 'StackingClassifier']
grid_cells = itertools.product([0, 1], repeat=2)
for clf, lab, grd in zip([clf1, clf2, clf3, sclf], estimator_names, grid_cells):
    # Each model is refit on the full data set before its regions are drawn.
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    # Draw on the panel explicitly and keep `fig` bound to the Figure;
    # plot_decision_regions returns an Axes, not a Figure.
    plot_decision_regions(X=X, y=y, clf=clf, ax=ax)
    ax.set_title(lab)

plt.show()

**2018-2022 [FinanceData.KR](http://financedata.kr) | [facebook.com/financedata](http://facebook.com/financedata)**
