# Stacking - Ensemble의 ensemble (LightGBM 모델의 Bagging)

In [1]:
import os
import random
from math import sqrt

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Predict house price - regression

In [2]:
# Load the house-price CSV and preview its first rows
csv_path = "./data/kc_house_data.csv"
data = pd.read_csv(csv_path)
data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,7129300520,20141013T000000,221900.0,3,1.0,1.0,0,3,7,1955,0,98178,47.5112,-122.257
1,6414100192,20141209T000000,538000.0,3,2.25,2.0,0,3,7,1951,1991,98125,47.721,-122.319
2,5631500400,20150225T000000,180000.0,2,1.0,1.0,0,3,6,1933,0,98028,47.7379,-122.233
3,2487200875,20141209T000000,604000.0,4,3.0,1.0,0,5,7,1965,0,98136,47.5208,-122.393
4,1954400510,20150218T000000,510000.0,3,2.0,1.0,0,3,8,1987,0,98074,47.6168,-122.045


In [3]:
# Drop the identifier, date and location columns so they are not used as features.
# errors='ignore' keeps this cell re-runnable: `data` is reassigned from itself,
# so on a second execution the columns are already gone and a plain drop would
# raise KeyError.
dropped_columns = ['id', 'date', 'zipcode', 'lat', 'long']
data = data.drop(columns = dropped_columns, errors = 'ignore')

In [4]:
# Every column except the target 'price' is used as a feature
target_column = 'price'
feature_columns = data.columns.difference([target_column]).tolist()

X, y = data[feature_columns], data[target_column]

# Hold out 30% of the rows for testing, with a fixed seed for the split
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size = 0.3, random_state = 42
)
print(*(part.shape for part in (train_x, test_x, train_y, test_y)))

(15129, 8) (6484, 8) (15129,) (6484,)


### Training

In [5]:
# Wrap the training split in LightGBM's Dataset container
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y)

# No num_boost_round is passed to lgb.train below, so the number of boosting
# rounds is whatever LightGBM uses by default.
lgb_param = {'max_depth': 10,
            'learning_rate': 0.01,
            'objective': 'regression'}

lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain)

# Hold-out RMSE of the single model; sklearn metrics take (y_true, y_pred) in that order
test_pred = lgb_model.predict(test_x)
rmse = sqrt(mean_squared_error(test_y, test_pred))
print(rmse)

249111.2021710895


### Ensemble of ensemble

In [6]:
# Number of rows in the training split
len(train_x)

15129

In [7]:
# Bagging: fit N_BAGS LightGBM models, each on a bootstrap resample of the
# training rows, and collect every model's predictions for the test set.
N_BAGS = 10
rng = np.random.default_rng(42)  # seeded so a fresh Restart & Run All draws the same resamples

n_train = train_x.shape[0]
data_index = np.arange(n_train)  # positional row indices of the training data (loop-invariant)
lgb_param = {'max_depth': 14,
             'learning_rate': 0.01,
             'objective': 'regression'}

bagging_predict_result = []  # one array of test-set predictions per bagged model

for k in range(N_BAGS):
    # Draw n_train row positions with replacement
    random_data_index = rng.choice(data_index, size = n_train, replace = True)

    # Report how many distinct rows ended up in this resample
    print(k, '-- random_data :', len(set(random_data_index)))

    lgb_dtrain = lgb.Dataset(data = train_x.iloc[random_data_index],
                             label = train_y.iloc[random_data_index])

    lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain)
    predict1 = lgb_model.predict(test_x)

    bagging_predict_result.append(predict1)

0 -- random_data : 9602
1 -- random_data : 9603
2 -- random_data : 9569
3 -- random_data : 9532
4 -- random_data : 9587
5 -- random_data : 9554
6 -- random_data : 9538
7 -- random_data : 9603
8 -- random_data : 9549
9 -- random_data : 9558


In [8]:
# Average the bagged predictions: for each test row, take the mean over all
# models collected in bagging_predict_result.
stacked_predictions = np.asarray(bagging_predict_result)  # rows: models, columns: test rows
assert stacked_predictions.ndim == 2, "expected one prediction array per bagged model"
assert stacked_predictions.shape[1] == test_x.shape[0], "predictions do not match the test set size"

# Column-wise mean over the models; stored as a list of per-row averages
bagging_predict = list(stacked_predictions.mean(axis = 0))

In [9]:
# Evaluate the averaged bagging prediction against the true test targets.
# sklearn metrics take (y_true, y_pred) in that order.
bagging_rmse = sqrt(mean_squared_error(test_y, bagging_predict))
print("RMSE: {}".format(bagging_rmse)) # RMSE

RMSE: 247477.14219843628
