In [None]:
import ast
import warnings

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics.scorer import make_scorer

from lightgbm import LGBMRegressor
from bayes_opt import BayesianOptimization

warnings.filterwarnings(action='ignore')

# Load Data & Change Column Name

In [None]:
# Read the per-fold prediction tables written by each base model
# (two Excel files, three CSV files), in the same order as before.
df_rf, df_et = (
    pd.read_excel(path)
    for path in ('df_rf_prediction.xlsx', 'df_et_prediction.xlsx')
)
df_lgb, df_xgb, df_cb = (
    pd.read_csv(path)
    for path in ('df_lgbm_prediction.csv', 'df_xgb_prediction.csv', 'df_cb_prediction.csv')
)

# Give every table the same column names so they can be handled uniformly.
colnames = ['모델명', 'valid_pred', 'valid_true', 'vaild_SMAPE', 'test_pred']
loaded_frames = [
    ('df_rf', df_rf),
    ('df_et', df_et),
    ('df_lgb', df_lgb),
    ('df_xgb', df_xgb),
    ('df_cb', df_cb),
]
for _, frame in loaded_frames:
    frame.columns = colnames

# Shape check for each loaded table.
for label, frame in loaded_frames:
    print(f'{label}: {frame.shape}')

# fold별 데이터프레임을 건물별로 변경

In [None]:
def make_building_df(df):
    """Collapse a per-fold prediction table into one row per building.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (building, fold) with the columns '모델명',
        'valid_pred', 'valid_true', 'vaild_SMAPE' and 'test_pred'.
        '모델명' must start with the integer building number followed by
        '_'. The three prediction columns hold lists, either as real lists
        or as their string representation.

    Returns
    -------
    pandas.DataFrame
        One row per building, ordered by building number, with the columns
        '건물명', 'valid_pred', 'valid_true', 'valid_SMAPE', 'test_pred':
        * valid_pred / valid_true: the fold lists concatenated in row order,
        * valid_SMAPE: mean of 'vaild_SMAPE' over the building's folds,
        * test_pred: element-wise average of the folds' test predictions.

    Notes
    -----
    The input frame is not modified, so the calling cell can be re-run.
    """
    def _as_list(cell):
        # Cells arrive as the string repr of a list when read from CSV/Excel;
        # already-parsed lists are passed through untouched.
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    # Building number is the prefix of the model name ("<building>_<fold>...").
    building_ids = df['모델명'].apply(lambda name: name.split('_')[0]).astype('int')

    records = []
    # groupby sorts by building number and keeps the original fold order
    # inside each group.
    for building_num, folds in df.groupby(building_ids.to_numpy()):
        n_folds = len(folds)
        merged_valid_pred = []
        merged_valid_true = []
        averaged_test_pred = None

        for fold_pred, fold_true, fold_test in zip(folds['valid_pred'],
                                                   folds['valid_true'],
                                                   folds['test_pred']):
            merged_valid_pred.extend(_as_list(fold_pred))
            merged_valid_true.extend(_as_list(fold_true))

            # Each fold contributes 1/n_folds of the final test prediction.
            contribution = np.array(_as_list(fold_test), dtype=float) / n_folds
            if averaged_test_pred is None:
                averaged_test_pred = contribution
            else:
                averaged_test_pred = averaged_test_pred + contribution

        records.append({'건물명': int(building_num),
                        'valid_pred': merged_valid_pred,
                        'valid_true': merged_valid_true,
                        # Average only the SMAPE column; the other columns
                        # are not numeric.
                        'valid_SMAPE': float(folds['vaild_SMAPE'].mean()),
                        'test_pred': averaged_test_pred.tolist()})

    # Build the frame once instead of appending row by row.
    return pd.DataFrame(records,
                        columns=['건물명', 'valid_pred', 'valid_true',
                                 'valid_SMAPE', 'test_pred'])

In [None]:
# Convert the fold-level tables of the three boosted models to one row per
# building. df_rf and df_et are not converted here (their calls were
# commented out in the original cell).
df_lgb_per_building, df_xgb_per_building, df_cb_per_building = (
    make_building_df(fold_frame) for fold_frame in (df_lgb, df_xgb, df_cb)
)

# Shape check for each converted table.
for label, frame in (('df_lgb_per_building', df_lgb_per_building),
                     ('df_xgb_per_building', df_xgb_per_building),
                     ('df_cb_per_building', df_cb_per_building)):
    print(f'{label}: {frame.shape}')

# 모델링

In [None]:
# Example for a single building: assemble the meta-learner's training frame
# (base-model validation predictions + target) and its test frame
# (base-model test predictions).
B_num = 1

# `.iloc[0]` takes the single matching row by position. The previous `[0]`
# was a label lookup and only worked because building 1 sits at index label 0.
stacking_train = pd.DataFrame({
    'lgb': df_lgb_per_building.loc[df_lgb_per_building['건물명'] == B_num, 'valid_pred'].iloc[0],
    'xgb': df_xgb_per_building.loc[df_xgb_per_building['건물명'] == B_num, 'valid_pred'].iloc[0],
    'cb': df_cb_per_building.loc[df_cb_per_building['건물명'] == B_num, 'valid_pred'].iloc[0],
    # Per the original note, valid_true can be taken from any base model's frame.
    'true': df_cb_per_building.loc[df_cb_per_building['건물명'] == B_num, 'valid_true'].iloc[0],
})

stacking_test = pd.DataFrame({
    'lgb': df_lgb_per_building.loc[df_lgb_per_building['건물명'] == B_num, 'test_pred'].iloc[0],
    'xgb': df_xgb_per_building.loc[df_xgb_per_building['건물명'] == B_num, 'test_pred'].iloc[0],
    'cb': df_cb_per_building.loc[df_cb_per_building['건물명'] == B_num, 'test_pred'].iloc[0],
})

### 1. 파라미터 찾기

In [None]:
def SMAPE_LGBM(y_pred, y_true):
    """Return the negated SMAPE (in percent) between two arrays.

    The value is negated so that a larger score means a better fit.
    The formula is symmetric in its two arguments, so the order in which
    a caller supplies predictions and targets does not change the result.
    """
    abs_error = abs(y_pred - y_true)
    magnitude_sum = abs(y_pred) + abs(y_true)
    per_sample = 2 * abs_error / magnitude_sum
    return -(np.mean(per_sample) * 100)

In [None]:
def lgb_evaluate(num_leaves, colsample_bytree, subsample, min_split_gain, min_child_samples, subsample_freq ):
    """Objective for the Bayesian search: mean cross-validated negated SMAPE.

    Builds an LGBMRegressor from the candidate hyper-parameters (the
    integer-valued ones are truncated with int()), scores it with 5-fold
    shuffled cross-validation on the module-level `X` and `y`, and returns
    the mean fold score. Scores come from SMAPE_LGBM, which is already
    negated, so a larger return value is better.
    """
    regressor = LGBMRegressor(
        learning_rate=0.01,
        n_estimators=500,
        random_state=42,
        max_depth=-1,
        num_leaves=int(num_leaves),
        colsample_bytree=colsample_bytree,
        subsample=subsample,
        min_split_gain=min_split_gain,
        min_child_samples=int(min_child_samples),
        subsample_freq=int(subsample_freq),
    )

    folds = KFold(n_splits=5, random_state=42, shuffle=True)
    smape_scorer = make_scorer(SMAPE_LGBM, greater_is_better=True)
    fold_scores = cross_val_score(regressor, X, y, cv=folds, scoring=smape_scorer)
    return np.mean(fold_scores)

In [None]:
# Best hyper-parameters found per building, keyed by building number.
lgb_params = {}

# Search space handed to the optimizer (same bounds for every building).
lgb_search_space = {'num_leaves': (63, 255),
                    'colsample_bytree': (0.6, 1),
                    'subsample': (0.8, 1),
                    'min_split_gain': (0.001, 0.1),
                    'min_child_samples': (20, 100),
                    'subsample_freq': (0, 5)}

for B_num in range(1,61):

    print(f'\n\n================== 건물명: {B_num} 시작!! ==================')

    # `.iloc[0]` takes the single matching row by position. A plain `[0]`
    # is a label lookup and raises KeyError for every building except the
    # one stored at index label 0.
    stacking_train = pd.DataFrame({
        'lgb': df_lgb_per_building.loc[df_lgb_per_building['건물명'] == B_num, 'valid_pred'].iloc[0],
        'xgb': df_xgb_per_building.loc[df_xgb_per_building['건물명'] == B_num, 'valid_pred'].iloc[0],
        'cb': df_cb_per_building.loc[df_cb_per_building['건물명'] == B_num, 'valid_pred'].iloc[0],
        # Per the original note, valid_true can be taken from any base model's frame.
        'true': df_cb_per_building.loc[df_cb_per_building['건물명'] == B_num, 'valid_true'].iloc[0],
    })

    # lgb_evaluate reads these two module-level names.
    X = stacking_train.drop(['true'], axis = 1)
    y = stacking_train['true']

    # Seed the optimizer so the search is reproducible across runs.
    lgbBO = BayesianOptimization(f=lgb_evaluate,
                                 pbounds=lgb_search_space,
                                 random_state=42)

    lgbBO.maximize(init_points=5, n_iter=50)

    # Keep the best point and record the fixed learning rate used during search.
    lgb_param = lgbBO.max['params']
    lgb_param['learning_rate'] = 0.01
    lgb_params[B_num] = lgb_param

### 2. 모델에 대입

In [None]:
def SMAPE(y_pred, dataset):
    """Custom evaluation function passed to lgb.train as `feval`.

    Reads the true labels from `dataset.get_label()` and returns the tuple
    ('SMAPE', value, False), where value is the SMAPE in percent. The
    trailing False marks the metric as lower-is-better.
    """
    labels = dataset.get_label()
    ratio = 2 * abs(y_pred - labels) / (abs(y_pred) + abs(labels))
    score = np.mean(ratio) * 100
    return 'SMAPE', score, False

In [None]:
import lightgbm as lgb

n_splits = 5

# Trained boosters keyed by '<building>_<fold>' (fold numbering starts at 1).
lgb_models={}


for BUILDING_NUM in range(1,61):

    print(f'\n\n================== 건물명: {BUILDING_NUM} 시작!! ==================')

    cv = KFold(n_splits = n_splits, shuffle = True, random_state=42)

    # Select THIS building's rows. The previous code filtered on `B_num`,
    # a leftover from the search loop, so every building was trained on the
    # same data. `.iloc[0]` takes the single matching row by position; a
    # plain `[0]` is a label lookup that fails for most buildings.
    stacking_train = pd.DataFrame({
        'lgb': df_lgb_per_building.loc[df_lgb_per_building['건물명'] == BUILDING_NUM, 'valid_pred'].iloc[0],
        'xgb': df_xgb_per_building.loc[df_xgb_per_building['건물명'] == BUILDING_NUM, 'valid_pred'].iloc[0],
        'cb': df_cb_per_building.loc[df_cb_per_building['건물명'] == BUILDING_NUM, 'valid_pred'].iloc[0],
        # Per the original note, valid_true can be taken from any base model's frame.
        'true': df_cb_per_building.loc[df_cb_per_building['건물명'] == BUILDING_NUM, 'valid_true'].iloc[0],
    })

    X = stacking_train.drop(['true'], axis = 1)
    y = stacking_train['true']

    # Fixed settings plus the per-building values found by the search,
    # translated from the sklearn-style names to the native LightGBM names.
    tuned = lgb_params[BUILDING_NUM]
    lgb_final_param = {
                      "objective" : "regression",
                      "max_depth": -1,
                      "learning_rate" : 0.01,
                      "bagging_seed" : 2018,
                      "verbosity" : -1,
                      "max_bin" : 1000,
                      "num_leaves" : int(tuned['num_leaves']),
                      "feature_fraction" : tuned['colsample_bytree'],      # colsample_bytree
                      "bagging_fraction" : tuned['subsample'],             # subsample
                      "min_gain_to_split" : tuned['min_split_gain'],       # min_split_gain
                      "min_data_in_leaf" : int(tuned['min_child_samples']),  # min_child_samples
                      "bagging_freq" : int(tuned['subsample_freq'])}       # subsample_freq

    # Train one booster per fold.
    for step, (tr_idx, val_idx) in enumerate(cv.split(X), start=1):

        print(f'\n\n ============================ {step} ============================')

        # cv.split yields positional indices, so index positionally.
        X_train = X.iloc[tr_idx, :].values
        y_train = y.iloc[tr_idx].values

        X_valid = X.iloc[val_idx, :].values
        y_valid = y.iloc[val_idx].values

        lgb_dtrain = lgb.Dataset(data = X_train, label = y_train)
        lgb_dvalid = lgb.Dataset(data = X_valid, label = y_valid)

        # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
        # arguments of older LightGBM releases — confirm the installed
        # version still accepts them.
        lgb_model = lgb.train(lgb_final_param, lgb_dtrain, 20000, valid_sets=[lgb_dvalid], early_stopping_rounds=100, verbose_eval=1000, feval=SMAPE)

        # Store the fold model.
        lgb_models[f'{BUILDING_NUM}_{step}'] = lgb_model

# Submission

In [None]:
# Final predictions for all buildings, concatenated in building order.
answer_list = []


for BUILDING_NUM in range(1,61):

    if BUILDING_NUM % 10 == 0:
        print(f'== 건물명: {BUILDING_NUM} 시작!! ==')

    # Select THIS building's test features. The previous code filtered on
    # the stale `B_num`, so every building was predicted from the same
    # inputs. `.iloc[0]` takes the single matching row by position; a plain
    # `[0]` is a label lookup that fails for most buildings.
    stacking_test = pd.DataFrame({
        'lgb': df_lgb_per_building.loc[df_lgb_per_building['건물명'] == BUILDING_NUM, 'test_pred'].iloc[0],
        'xgb': df_xgb_per_building.loc[df_xgb_per_building['건물명'] == BUILDING_NUM, 'test_pred'].iloc[0],
        'cb': df_cb_per_building.loc[df_cb_per_building['건물명'] == BUILDING_NUM, 'test_pred'].iloc[0],
    })

    # Average the predictions of the building's fold models.
    values = np.zeros(len(stacking_test))
    for step in range(1,1+n_splits):
        dict_key = f'{BUILDING_NUM}_{step}'
        values += lgb_models[dict_key].predict(stacking_test) / n_splits

    answer_list.extend(values.tolist())

In [None]:
# NOTE(review): `sample_submission` is not defined anywhere in the visible
# notebook; presumably it has to be loaded in an earlier cell — confirm.
# Attach the stacked predictions as the 'answer' column of a copy of the
# template, write it to disk, and display it.
submission = sample_submission.assign(answer=answer_list)
submission.to_csv('submission_0623(5개모델전체스태킹앙상블).csv', index= False)
submission