# Bagging (1)
## 01. 패키지 참조

In [114]:
import pandas as pd
from sklearn.ensemble import BaggingClassifier, BaggingRegressor

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score

In [115]:
origin  = pd.read_excel("https://data.hossam.kr/G02/breast_cancer.xlsx")
origin

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


## 2. 데이터 전처리
### 독립/종속 변수 분리

In [116]:
x = origin.drop("target", axis=1)
y = origin['target']
x.shape, y.shape

((569, 30), (569,))

In [117]:
scaler = StandardScaler()
std_x = scaler.fit_transform(x)
std_x[:1]

array([[ 1.09706398, -2.07333501,  1.26993369,  0.9843749 ,  1.56846633,
         3.28351467,  2.65287398,  2.53247522,  2.21751501,  2.25574689,
         2.48973393, -0.56526506,  2.83303087,  2.48757756, -0.21400165,
         1.31686157,  0.72402616,  0.66081994,  1.14875667,  0.90708308,
         1.88668963, -1.35929347,  2.30360062,  2.00123749,  1.30768627,
         2.61666502,  2.10952635,  2.29607613,  2.75062224,  1.93701461]])

### 훈련/데이터 분할

In [118]:
x_train, x_test, y_train, y_test = train_test_split(
    std_x, y, test_size = 0.3 , random_state = 111)
x_train.shape, x_test.shape, y_train.shape, y_test.shape


((398, 30), (171, 30), (398,), (171,))

## 3. 분류 모델 구현

In [119]:
lr = LogisticRegression()

In [120]:
clf = BaggingClassifier(
    base_estimator= lr,
    n_estimators= 50,     #부트스트랩 샘플 개수
    max_samples=1,      # 부트스트랩 샘플 비율 => 1이면 핵심데이터를 모두 샘플링한다.
    bootstrap=True,       # 복원 추출, False이면 비복원 추출
    random_state=111,
    # 하나의 예측기에 들어가는 샘플에 대하여 컬럼의 중복 사용여부를 겾렁
    bootstrap_features=False,
    n_jobs=1
)
clf.fit(x_train, y_train)

print("BaggingClassifier 훈련 정확도: {:.3f}".format(clf.score(x_train, y_train)))

y_pred = clf.predict(x_test)    
print('BaggingClassifer 테스트 정확도: {:.3f}'.format(accuracy_score(y_test,y_pred)))



BaggingClassifier 훈련 정확도: 0.638
BaggingClassifer 테스트 정확도: 0.602


## 1. 데이터 가져오기

In [121]:
origin = pd.read_excel("https://data.hossam.kr/E04/boston.xlsx")
origin

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,CAT. MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0,0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6,0
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,1
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,1
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4,0
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6,0
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9,0
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0,0


## 2. 데이터 전처리
### 독립/종속 변수 분리

In [122]:
x = origin.drop('MEDV', axis = 1)
y = origin['MEDV']
x.shape, y.shape

((506, 14), (506,))

### 훈련/검증 데이터 분리

In [123]:
scaler = StandardScaler()
std_x = scaler.fit_transform(x)
std_x[:1]

array([[-0.41978194,  0.28482986, -1.2879095 , -0.27259857, -0.14421743,
         0.41367189, -0.12001342,  0.1402136 , -0.98284286, -0.66660821,
        -1.45900038,  0.44105193, -1.0755623 , -0.44615259]])

In [124]:
x_train, x_test, y_train, y_test = train_test_split(
    std_x, y, test_size = 0.3 , random_state = 111)
x_train.shape, x_test.shape, y_train.shape, y_test.shape


((354, 14), (152, 14), (354,), (152,))

## 3. 회귀 모델 구현
### 회귀 알고리즘 객체 정의

In [125]:
Li = LinearRegression()

### 배깅 모델 구현

In [126]:
reg = BaggingRegressor(
    base_estimator=Li,
    n_estimators= 50,   # 부트스트랩 샘플 개수
    max_samples=1,      # 부트스트랩 샘플 비율 => 1이면 학습데이터를 모두 샘플링한다.
    bootstrap= True,      # 복원 추출,  False이면 비복원 추출
    random_state= 111,

    # 하나의 예측기에 들어가는 샘플에 대하여 컬럼의 중복 사용여부를 결정
    bootstrap_features= False,
    n_jobs= -1)

reg.fit(x_train,y_train)
print("BaggingClassifier 훈련 R2: {:f}".format(reg.score(x_train, y_train)))

y_pred = reg.predict(x_test)    
print('BaggingClassifer 테스트 R2: {:f}'.format(r2_score(y_test,y_pred)))



BaggingClassifier 훈련 R2: -0.000752
BaggingClassifer 테스트 R2: -0.013808
