# ML Session 6차시 실습

## 앙상블(ensemble)
- 여러 개의 단순한 모델을 결합하여 보다 정확한 모델을 만드는 방법

## Bagging(Bootstrap Aggregating)
- 각 모델별로 기존 데이터 셋에서 중복을 허용하여(복원 추출) 무작위로 N개의 샘플을 뽑은 후(bootstrap), 뽑힌 샘플로 학습한 각 모델의 결과를 취합(aggregating)하는 앙상블 기법

In [1]:
import pandas as pd
import time

from sklearn.ensemble import BaggingClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings(action='ignore')

# Load the breast-cancer diagnosis dataset bundled with scikit-learn
data = load_breast_cancer()

# Wrap the raw feature matrix in a DataFrame so each column carries its
# feature name, then preview the first three rows
df = pd.DataFrame(data=data.data, columns=data.feature_names)
df.head(n=3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [2]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=1004,
)

# <span style="color:gray">분류</span>

## BaggingClassifier
sklearn에서 지원하는 Bagging 분류모델
- base_estimator : bagging에 사용될 예측기(default = DecisionTreeClassifier). scikit-learn 1.2부터 `estimator`로 이름이 변경됨
- n_estimators : base_estimator의 개수(default = 10)
- bootstrap : bootstrap 여부(default = True)
- oob_score : 모델 평가에 oob 샘플의 사용 여부(default = False)

In [None]:
# 위 목록은 BaggingClassifier의 대표적인 parameter만 정리한 것

In [3]:
# Bag LinearSVC base learners. The base estimator is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2 (the old name was removed in 1.4); the first positional
# slot works on every version.
ba_clf = BaggingClassifier(LinearSVC())
ba_clf.fit(x_train, y_train)
ba_pred = ba_clf.predict(x_test)
# The label names the actual base learner (LinearSVC, not logistic regression)
print("BaggingClassifier_LinearSVC :", accuracy_score(y_test, ba_pred))

BaggingClassifier_LR : 0.8859649122807017


In [4]:
# Bag DecisionTree base learners. The base estimator is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2 (the old name was removed in 1.4); the first positional
# slot works on every version.
ba_clf = BaggingClassifier(DecisionTreeClassifier())
ba_clf.fit(x_train, y_train)
ba_pred = ba_clf.predict(x_test)
print("BaggingClassifier_DT :", accuracy_score(y_test, ba_pred))

BaggingClassifier_DT : 0.9122807017543859


## RandomForestClassifier
수많은 Decision Tree가 합쳐져 만들어진 <span style="color:red">Bagging</span> 분류 모델
- Bootstrap 방식(복원 추출)으로 각 트리의 학습 샘플 선택
- <span style="color:gray">(version 1.1 이전)</span> DecisionTree와 같이 모든 feature의 information gain을 고려하여 노드를 분할
- <span style="color:gray">(version 1.1 이후)</span> 선택된 N개의 feature 중 랜덤으로 $\sqrt{N}$개를 뽑은 후 그 중 information gain을 고려하여 노드를 분할
- 각 Decision Tree의 결과를 voting(투표)하여 RandomForest의 결과를 도출한다.

## ExtraTreesClassifier
수많은 Decision Tree가 합쳐져 만들어진 <span style="color:red">앙상블</span> 분류 모델
- 비복원 추출로 feature 선택
- 선택된 N개의 feature 중 랜덤으로 $\sqrt{N}$개를 뽑은 후 그 중 information gain을 고려하여 노드를 분할
- 각 Decision Tree의 결과를 voting(투표)하여 ExtraTrees의 결과를 도출한다.

In [5]:
# Candidate models: a bagging ensemble, a single tree as the baseline,
# ExtraTrees, and RandomForest at three ensemble sizes.
ba = BaggingClassifier()
dt = DecisionTreeClassifier()
et = ExtraTreesClassifier()
rf10 = RandomForestClassifier(n_estimators=10)
rf50 = RandomForestClassifier(n_estimators=50)  # same model, different ensemble size
rf = RandomForestClassifier()

models = [ba, dt, et, rf10, rf50, rf]

# Collect one [accuracy, elapsed seconds, n_estimators] column per model and
# build the DataFrame once at the end instead of growing it column by column.
results = {}
for model in models:
    # perf_counter is the high-resolution clock intended for measuring intervals
    start = time.perf_counter()
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    tm = time.perf_counter() - start
    acc = accuracy_score(y_test, pred)

    # Ask the model itself whether it has n_estimators (ensembles do, a single
    # DecisionTree does not) rather than keeping a second hard-coded model list.
    n_est = model.get_params().get("n_estimators")
    name = type(model).__name__
    if n_est is not None:
        name += str(n_est)
    results[name] = [acc, tm, n_est]

benchmark = pd.DataFrame(results, index=["Accuracy", "time", "n_estimators"])
benchmark.T

Unnamed: 0,Accuracy,time,n_estimators
BaggingClassifier10,0.938596,0.028996,10.0
DecisionTreeClassifier,0.903509,0.002997,
ExtraTreesClassifier100,0.938596,0.065001,100.0
RandomForestClassifier10,0.903509,0.012001,10.0
RandomForestClassifier50,0.929825,0.052999,50.0
RandomForestClassifier100,0.921053,0.104999,100.0


### RandomForest 하이퍼파라미터 튜닝

In [6]:
from sklearn.model_selection import GridSearchCV

# RandomForest exposes the same tree-level hyperparameters as DecisionTree
params = {
    "n_estimators": [100, 200],
    "max_depth": [6, 8, 10, 12],
    "min_samples_leaf": [8, 12, 18],
    "min_samples_split": [8, 16, 20],
}

# Build the base forest (n_jobs=-1 uses every CPU core), then run a
# 2-fold grid search over `params`
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(x_train, y_train)

# Report the winning parameter combination and its mean cross-validated score
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:
 {'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 200}
최고 예측 정확도: 0.9539


In [7]:
# Evaluate the model the grid search actually selected. With the default
# refit=True, GridSearchCV has already retrained the best configuration on
# all of x_train, and that estimator keeps the random_state=0 / n_jobs=-1
# set on the searched model. Rebuilding from best_params_ alone would drop
# both, so the test score would vary from run to run.
rf_grid = grid_cv.best_estimator_
pred = rf_grid.predict(x_test)
print('예측 정확도: {}'.format(accuracy_score(y_test , pred)))

예측 정확도: 0.9298245614035088


### ExtraTrees 하이퍼파라미터 튜닝

In [8]:
# Create the base ExtraTreesClassifier, then run a 2-fold grid search over
# the same `params` grid used for RandomForest
et_clf = ExtraTreesClassifier(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(estimator=et_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(x_train, y_train)

# Report the winning parameter combination and its mean cross-validated score
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:
 {'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 100}
최고 예측 정확도: 0.9561


In [9]:
# Evaluate the model the grid search actually selected. With the default
# refit=True, GridSearchCV has already retrained the best configuration on
# all of x_train, and that estimator keeps the random_state=0 / n_jobs=-1
# set on the searched model. Rebuilding from best_params_ alone would drop
# both, so the test score would vary from run to run.
et_grid = grid_cv.best_estimator_
pred = et_grid.predict(x_test)
print('예측 정확도: {}'.format(accuracy_score(y_test , pred)))

예측 정확도: 0.8947368421052632


In [10]:
# The dataset is very small (see the shape below), so tuning yields little improvement
df.shape

(569, 30)

# <span style="color:gray">회귀</span>

In [11]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error

from sklearn.datasets import load_diabetes

In [12]:
# Load the diabetes-patient dataset bundled with scikit-learn
data = load_diabetes()

# Wrap the raw feature matrix in a DataFrame so each column carries its
# feature name, then preview the first three rows
df = pd.DataFrame(data=data.data, columns=data.feature_names)
df.head(n=3)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593


In [13]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=1004,
)

## BaggingRegressor
sklearn에서 지원하는 Bagging 회귀모델
- base_estimator : bagging에 사용될 예측기(default = DecisionTreeRegressor). scikit-learn 1.2부터 `estimator`로 이름이 변경됨
- n_estimators : base_estimator의 개수(default = 10)
- bootstrap : bootstrap 여부(default = True)
- oob_score : 모델 평가에 oob 샘플의 사용 여부(default = False)

In [14]:
# Bag Ridge base learners. The base estimator is passed positionally because
# its keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2 (the old name was removed in 1.4); the first positional
# slot works on every version.
ba_clf = BaggingRegressor(Ridge())
ba_clf.fit(x_train, y_train)
ba_pred = ba_clf.predict(x_test)
# Regression quality is reported as mean squared error (lower is better)
print("BaggingRegressor_Ridge :", mean_squared_error(y_test, ba_pred))

BaggingRegressor_Ridge : 3558.4153186928957


In [15]:
# Bag DecisionTree base learners. The base estimator is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2 (the old name was removed in 1.4); the first positional
# slot works on every version.
ba_clf = BaggingRegressor(DecisionTreeRegressor())
ba_clf.fit(x_train, y_train)
ba_pred = ba_clf.predict(x_test)
# Regression quality is reported as mean squared error (lower is better)
print("BaggingRegressor_DT :", mean_squared_error(y_test, ba_pred))

BaggingRegressor_DT : 3316.190674157303


## RandomForestRegressor
수많은 Decision Tree가 합쳐져 만들어진 <span style="color:red">Bagging</span> 회귀 모델
- Bootstrap 방식(복원 추출)으로 각 트리의 학습 샘플 선택
- <span style="color:gray">(version 1.1 이전)</span> DecisionTree와 같이 모든 feature의 information gain을 고려하여 노드를 분할
- <span style="color:gray">(version 1.1 이후)</span> 선택된 N개의 feature 중 랜덤으로 $\sqrt{N}$개를 뽑은 후 그 중 information gain을 고려하여 노드를 분할
- 각 Decision Tree의 결과를 평균하여 RandomForest의 결과를 도출한다.

## ExtraTreesRegressor
수많은 Decision Tree가 합쳐져 만들어진 <span style="color:red">앙상블</span> 회귀 모델
- 비복원 추출로 feature 선택
- 선택된 N개의 feature 중 랜덤으로 $\sqrt{N}$개를 뽑은 후 그 중 information gain을 고려하여 노드를 분할
- 각 Decision Tree의 결과를 평균하여 ExtraTrees의 결과를 도출한다.

In [16]:
# Candidate models: a bagging ensemble, a single tree as the baseline,
# ExtraTrees, and RandomForest at three ensemble sizes.
ba = BaggingRegressor()
dt = DecisionTreeRegressor()
et = ExtraTreesRegressor()
rf10 = RandomForestRegressor(n_estimators=10)
rf50 = RandomForestRegressor(n_estimators=50)
rf = RandomForestRegressor()

models = [ba, dt, et, rf10, rf50, rf]

# Collect one [MSE, elapsed seconds, n_estimators] column per model and
# build the DataFrame once at the end instead of growing it column by column.
results = {}
for model in models:
    # perf_counter is the high-resolution clock intended for measuring intervals
    start = time.perf_counter()
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    tm = time.perf_counter() - start
    mse = mean_squared_error(y_test, pred)

    # Ask the model itself whether it has n_estimators (ensembles do, a single
    # DecisionTree does not) rather than keeping a second hard-coded model list.
    n_est = model.get_params().get("n_estimators")
    name = type(model).__name__
    if n_est is not None:
        name += str(n_est)
    results[name] = [mse, tm, n_est]

benchmark = pd.DataFrame(results, index=["MSE", "time", "n_estimators"])
benchmark.T

Unnamed: 0,MSE,time,n_estimators
BaggingRegressor10,3645.815955,0.015995,10.0
DecisionTreeRegressor,6154.853933,0.001,
ExtraTreesRegressor100,3128.137724,0.092,100.0
RandomForestRegressor10,3660.700674,0.014,10.0
RandomForestRegressor50,3332.581092,0.096003,50.0
RandomForestRegressor100,3306.896356,0.138996,100.0


### RandomForest 하이퍼파라미터 튜닝

In [17]:
# Hyperparameter grid: ensemble size plus the tree-level growth limits
params = {
    'n_estimators': [100, 200],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20],
}

# Create the base RandomForestRegressor (n_jobs=-1 uses every CPU core),
# then run a 2-fold grid search over `params`
rf_clf = RandomForestRegressor(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(x_train, y_train)

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
# No `scoring` was given, so GridSearchCV uses the regressor's own score(),
# which is R^2. The label says so instead of calling it an accuracy.
print('최고 교차검증 R2 점수: {0:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:
 {'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 100}
최고 예측 정확도: 0.4166


In [18]:
# Evaluate the model the grid search actually selected. With the default
# refit=True, GridSearchCV has already retrained the best configuration on
# all of x_train, and that estimator keeps the random_state=0 / n_jobs=-1
# set on the searched model. Rebuilding from best_params_ alone would drop
# both, so the test error would vary from run to run.
rf_grid = grid_cv.best_estimator_
pred = rf_grid.predict(x_test)
# The reported value is a mean squared error (lower is better), not an accuracy
print('테스트 MSE: {}'.format(mean_squared_error(y_test , pred)))

예측 정확도: 3001.138948855262


### ExtraTrees 하이퍼파라미터 튜닝

In [19]:
# Create the base ExtraTreesRegressor, then run a 2-fold grid search over
# the `params` grid defined for the RandomForest search
et_clf = ExtraTreesRegressor(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(et_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(x_train, y_train)

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
# No `scoring` was given, so GridSearchCV uses the regressor's own score(),
# which is R^2. The label says so instead of calling it an accuracy.
print('최고 교차검증 R2 점수: {0:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:
 {'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100}
최고 예측 정확도: 0.4403


In [20]:
# Evaluate the model the grid search actually selected. With the default
# refit=True, GridSearchCV has already retrained the best configuration on
# all of x_train, and that estimator keeps the random_state=0 / n_jobs=-1
# set on the searched model. Rebuilding from best_params_ alone would drop
# both, so the test error would vary from run to run.
et_grid = grid_cv.best_estimator_
pred = et_grid.predict(x_test)
# The reported value is a mean squared error (lower is better), not an accuracy
print('테스트 MSE: {}'.format(mean_squared_error(y_test , pred)))

예측 정확도: 3129.5528695300513
