#### download package

In [None]:
# download sktime package 
!pip install sktime

Collecting sktime
  Downloading sktime-0.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
[K     |████████████████████████████████| 5.9 MB 5.9 MB/s 
Collecting numba>=0.53
  Downloading numba-0.54.1-cp37-cp37m-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 40.0 MB/s 
[?25hCollecting scikit-learn>=0.24.0
  Downloading scikit_learn-1.0.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.2 MB)
[K     |████████████████████████████████| 23.2 MB 1.8 MB/s 
Collecting statsmodels>=0.12.1
  Downloading statsmodels-0.13.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 47.5 MB/s 
Collecting llvmlite<0.38,>=0.37.0rc1
  Downloading llvmlite-0.37.0-cp37-cp37m-manylinux2014_x86_64.whl (26.3 MB)
[K     |████████████████████████████████| 26.3 MB 90 kB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.0.0-py3-none-any.whl (1

In [None]:
# 필요한 패키지 import
import os
import sys
import warnings
import plotly
import numpy as np
import pandas as pd
import datetime
import tensorflow as tf
from tqdm import tqdm
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, train_test_split, TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler, StandardScaler
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.utils.plotting import plot_series
from scipy.stats import reciprocal 

from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
import lightgbm as lgb
from lightgbm.sklearn import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor

# Plot defaults: 8x6 inch figures without a background grid.
mpl.rcParams.update({'figure.figsize': (8, 6), 'axes.grid': False})

# Silence all warnings and print floats with five decimals in pandas output.
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', '{:.5f}'.format)

## Model training

### Making datasets

In [None]:
# Load the pickled dataset for hub GP001 / region 대구; the file name follows the
# same "corp_id_<hub>_cnee_addr_<region>.pkl" pattern that get_train_test_set builds.
df = pd.read_pickle('corp_id_GP001_cnee_addr_대구.pkl')

Unnamed: 0,BKG_DATE,ITEM_QTY,month,10대 비율,"2,30대 비율","4,5,60대 비율",남성 비율,여성 비율,총인구수,1인,2인,3인,4인,5인,전체가구수,요일,휴일여부,DAY_1,DAY_2,DAY_3,week,WEEK_AMT,MEAN_PRICE,0.0,1.0,10,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0
0,2021-03-01,0.00000,03,0.09260,0.25300,0.47140,0.49320,0.50680,2408875.00000,0.37160,0.24300,0.18500,0.15740,0.04300,1060866.00000,0,1,0.00000,0.00000,0.00000,1,0.00000,0.00000,0,0.00000,0.00000,0.00000,0.00000,0,0,0,0.00000,0,0
1,2021-03-02,0.00000,03,0.09260,0.25300,0.47140,0.49320,0.50680,2408875.00000,0.37160,0.24300,0.18500,0.15740,0.04300,1060866.00000,1,0,0.00000,0.00000,0.00000,1,0.00000,0.00000,0,0.00000,0.00000,0.00000,0.00000,0,0,0,0.00000,0,0
2,2021-03-03,0.00000,03,0.09260,0.25300,0.47140,0.49320,0.50680,2408875.00000,0.37160,0.24300,0.18500,0.15740,0.04300,1060866.00000,2,0,0.00000,0.00000,0.00000,1,0.00000,0.00000,0,0.00000,0.00000,0.00000,0.00000,0,0,0,0.00000,0,0
3,2021-03-04,0.00000,03,0.09260,0.25300,0.47140,0.49320,0.50680,2408875.00000,0.37160,0.24300,0.18500,0.15740,0.04300,1060866.00000,3,0,0.00000,0.00000,0.00000,1,0.00000,0.00000,0,0.00000,0.00000,0.00000,0.00000,0,0,0,0.00000,0,0
4,2021-03-05,0.00000,03,0.09260,0.25300,0.47140,0.49320,0.50680,2408875.00000,0.37160,0.24300,0.18500,0.15740,0.04300,1060866.00000,4,0,0.00000,0.00000,0.00000,1,0.00000,0.00000,0,0.00000,0.00000,0.00000,0.00000,0,0,0,0.00000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,2021-06-26,177.00000,06,0.09220,0.25200,0.47280,0.49310,0.50690,2397646.00000,0.37310,0.24470,0.18430,0.15570,0.04220,1059826.00000,5,1,227.00000,229.00000,274.00000,17,1815.00000,184969.48052,0,133.00000,1.00000,10.00000,2.00000,0,0,0,31.00000,0,0
118,2021-06-27,305.00000,06,0.09220,0.25200,0.47280,0.49310,0.50690,2397646.00000,0.37310,0.24470,0.18430,0.15570,0.04220,1059826.00000,6,1,177.00000,227.00000,229.00000,17,1815.00000,150041.81102,0,229.00000,2.00000,20.00000,6.00000,0,0,0,48.00000,0,0
119,2021-06-28,490.00000,06,0.09220,0.25200,0.47280,0.49310,0.50690,2397646.00000,0.37310,0.24470,0.18430,0.15570,0.04220,1059826.00000,0,0,305.00000,177.00000,227.00000,18,2473.00000,152389.13753,0,251.00000,2.00000,159.00000,49.00000,0,0,0,29.00000,0,0
120,2021-06-29,412.00000,06,0.09220,0.25200,0.47280,0.49310,0.50690,2397646.00000,0.37310,0.24470,0.18430,0.15570,0.04220,1059826.00000,1,0,490.00000,305.00000,177.00000,18,2473.00000,141681.67614,0,264.00000,5.00000,49.00000,52.00000,0,0,0,42.00000,0,0


In [None]:
# Hub (corp) identifiers to model.
corp_id_df = ['GP001', 'KX007']

# Region name -> short ASCII code; the code is what gets embedded in the
# dynamically generated variable names.
cnee_addr_dict = {
    '대구': 'DAEGU',
    '제주': 'JEJU',
    '서울': 'SEOUL',
    '경기': 'GG',
    '부산': 'BUSAN',
    '대전': 'DJ',
    '세종': 'SJ',
    '광주': 'KG',
    '경남': 'KN',
    '전북': 'JB',
    '충남': 'CN',
    '경북': 'KB',
    '강원': 'KW',
    '인천': 'IC',
    '전남': 'JN',
    '충북': 'CB',
    '울산': 'ULSAN',
}

# Regions to model, in the mapping's insertion order.
cnee_addr_df = list(cnee_addr_dict)

In [None]:
# Build the train / test datasets needed for model fitting.
def get_train_test_set(corp_id, cnee_addr):
  """Load one hub/region dataset and return scaled train/test features and targets.

  Reads ``corp_id_<corp_id>_cnee_addr_<cnee_addr>.pkl`` from the working
  directory, splits the rows on ``BKG_DATE`` (train: up to and including
  2021-06-20, test: everything after), standardizes the selected feature
  columns with a ``StandardScaler`` fitted on the training rows only, and
  separates the ``ITEM_QTY`` target from the features.

  Args:
    corp_id: Hub identifier used in the pickle file name.
    cnee_addr: Region name used in the pickle file name.

  Returns:
    Tuple ``(train_x, train_y, test_x, test_y)``. The ``*_x`` frames are the
    split rows without ``ITEM_QTY`` and ``BKG_DATE``; the ``*_y`` series hold
    ``ITEM_QTY``.
  """
  # NOTE: unpickling executes code embedded in the file; only load trusted files.
  df = pd.read_pickle("corp_id_"+corp_id+"_cnee_addr_"+cnee_addr+".pkl")

  # Copy the slices so the scaled values are written into independent frames
  # rather than into views of `df` (the original chained assignment only
  # worked silently because warnings are globally suppressed).
  train = df[df['BKG_DATE'] <= '2021-06-20'].copy()
  test = df[df['BKG_DATE'] > '2021-06-20'].copy()

  # Columns to standardize. '10' used to be listed twice; each column is now
  # named exactly once (the scaled values are the same either way).
  scaling_features = ['DAY_1', 'DAY_2', 'DAY_3',
       'WEEK_AMT','MEAN_PRICE', '1인', '2인', '3인', '4인', '5인', '전체가구수', '요일', '휴일여부',
       '0.0', '1.0', '10', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '8.0', '9.0']

  # Fit on train only, then apply the same transform to test (no leakage).
  # Whole-column assignment replaces the columns, so integer-typed inputs do
  # not have to hold the float results in place.
  scaler = StandardScaler()
  train[scaling_features] = scaler.fit_transform(train[scaling_features])
  test[scaling_features] = scaler.transform(test[scaling_features])

  train_x = train.drop(['ITEM_QTY','BKG_DATE'], axis=1)
  train_y = train['ITEM_QTY']

  test_x = test.drop(['ITEM_QTY','BKG_DATE'], axis=1)
  test_y = test['ITEM_QTY']
  return train_x, train_y, test_x, test_y

### 모델별 정의 및 파라미터 최적화

In [None]:
# Hyper-parameter search grids, one per model (consumed by GridSearchCV).
XGBRegressor_param = {
    'n_estimators': [100],
    'eta': [0.01],
    'min_child_weight': np.arange(1, 8, 1),
    'max_depth': np.arange(3, 9, 1),
    # Written out explicitly: np.arange with a float step has a length that
    # depends on rounding. These are the values the old arange produced.
    'colsample_bytree': [0.8, 0.9],
    'subsample': [0.8, 0.9],
}

# 'max_depth' used to appear twice in this literal; Python keeps only the last
# value, so the earlier range(3, 15, 3) was never searched. Only the effective
# entry is kept.
LGBMRegressor_param = {
    'max_depth': [6, 8, 10],
    'min_child_weight': range(1, 6, 2),
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
    'learning_rate': [0.1, 0.01],
}

SVR_param = {'kernel': ['linear'], 'C': [1.0], 'epsilon': [0.1]}

GradientBoostingRegressor_param = {'n_estimators': [100], 'max_depth': np.arange(3, 20, 3)}

AdaBoostRegressor_param = {
    'n_estimators': np.arange(25, 100, 25),
    'loss': ['linear', 'square', 'exponential'],
    # np.arange(0.1, 1) has an implicit step of 1 and therefore yields only 0.1.
    # NOTE(review): if a 0.1..0.9 sweep was intended, list those values here.
    'learning_rate': [0.1],
}

In [None]:
# Base (untuned) estimator instances, one per model; GridSearchCV tunes copies of these.
XGBRegressor_model = XGBRegressor(n_estimators = 100, objective = 'reg:squarederror')
LGBMRegressor_model = LGBMRegressor(n_estimators = 80)
SVR_model = SVR(kernel='linear', C=1.0, epsilon=0.1)
GradientBoostingRegressor_model = GradientBoostingRegressor(n_estimators=100, max_depth=3)
# `base_estimator=None` only restated the default; it is dropped because newer
# scikit-learn releases renamed that keyword to `estimator`.
# NOTE(review): no random_state is set on the boosting models, so repeated runs
# may not reproduce the same scores.
AdaBoostRegressor_model = AdaBoostRegressor()

In [None]:
# Tune one model by grid search, then score the winning estimator on the test set.
def print_best_params(model, params, x_train, x_test, y_train, y_test, log=False):
  """Grid-search `model` over `params` and evaluate the best estimator.

  The search uses a 5-split TimeSeriesSplit and negative MAE as the score.

  Args:
    model: Estimator handed to GridSearchCV.
    params: Parameter grid for the search.
    x_train, y_train: Data the search is fitted on.
    x_test, y_test: Data the best estimator is evaluated on.
    log: When True, `y_test` and the predictions are passed through
      ``np.expm1`` before the error is computed.

  Returns:
    Tuple ``(best_estimator, test_mae, predictions)`` where `test_mae` is the
    test-set mean absolute error rounded to 4 decimals.
  """
  searcher = GridSearchCV(
      model,
      param_grid=params,
      cv=TimeSeriesSplit(n_splits=5),
      scoring='neg_mean_absolute_error',
  )
  searcher.fit(x_train, y_train)

  # Cross-validated MAE of the best grid point (sign flipped back); not returned.
  cv_mae = -1 * searcher.best_score_

  tuned_model = searcher.best_estimator_
  predictions = tuned_model.predict(x_test)

  if log:
    y_test, predictions = np.expm1(y_test), np.expm1(predictions)

  test_mae = np.round(mean_absolute_error(y_test, predictions), 4)
  return tuned_model, test_mae, predictions

In [None]:
# Build the stacking (meta-model) inputs produced by one base model.
def get_stacking_base_datasets(model, x_train_n, y_train_n, x_test_n, n_splits=5):
  """Generate out-of-fold train predictions and fold-averaged test predictions.

  For every TimeSeriesSplit fold, `model` is refitted in place on the fold's
  training rows, predicts the fold's validation rows (stored at those row
  positions) and predicts the full test set (stored in that fold's column).

  Args:
    model: Estimator with ``fit`` / ``predict``; it is refitted on every fold.
    x_train_n: Training features, indexed positionally by row.
    y_train_n: Training targets, indexed positionally by row.
    x_test_n: Test features.
    n_splits: Number of TimeSeriesSplit folds.

  Returns:
    Tuple ``(train_fold_pred, test_pred_mean)``: an ``(n_train, 1)`` array of
    validation-fold predictions and an ``(n_test, 1)`` array holding the mean
    of the per-fold test predictions.

  Note:
    Rows that never fall into a validation fold keep their initial value 0.
    With TimeSeriesSplit the earliest rows are only ever used for fitting, so
    they are affected.
  """
  n_train, n_test = x_train_n.shape[0], x_test_n.shape[0]

  # Pre-allocated outputs: one column of fold predictions for train,
  # one column per fold for test.
  train_fold_pred = np.zeros((n_train, 1))
  test_pred = np.zeros((n_test, n_splits))

  splitter = TimeSeriesSplit(n_splits)
  for fold_no, (fit_idx, val_idx) in enumerate(splitter.split(x_train_n)):
    # Fit on this fold's training rows only.
    model.fit(x_train_n[fit_idx], y_train_n[fit_idx])
    # Predictions for the held-out rows become the meta-model's training feature.
    train_fold_pred[val_idx, :] = model.predict(x_train_n[val_idx]).reshape(-1, 1)
    # The same fold model also predicts the untouched test set.
    test_pred[:, fold_no] = model.predict(x_test_n)

  # Average the per-fold test predictions into a single feature column.
  test_pred_mean = test_pred.mean(axis=1).reshape(-1, 1)

  return train_fold_pred, test_pred_mean

### 모델 학습 과정
- 단일 모델 학습 및 stacking 모델 학습
- 모델별 성능 비교 후 최적의 모델 반환

In [None]:
import sys
# Handle to the current module object. Later cells read dynamically named
# notebook-level variables back through getattr(mod, "<name>").
mod = sys.modules[__name__]

In [None]:

def get_optimal_model(corp_id, cnee_addr):
  """Select the best forecaster for one hub/region pair and publish the result.

  Steps:
    1. Build the scaled train/test split with ``get_train_test_set``.
    2. Tune the five single models with ``print_best_params`` and rank them by
       test MAE.
    3. Take the three best single models and evaluate the three stacking
       set-ups in which two of them supply stacking features (via
       ``get_stacking_base_datasets``) and the third is the meta model.
    4. Compare the best single model with the best stacking set-up; the single
       model wins only when its MAE is strictly lower.

  Notebook-level variables are still published, with ``<hub>`` = ``corp_id``
  and ``<code>`` = ``cnee_addr_dict[cnee_addr]``:
    ``<Model>_model_tuned_<hub>_<code>``, ``single_pred_<Model>_<hub>_<code>``
    (all five models); ``<Model>_train_…``, ``<Model>_test_…``,
    ``meta_model_<Model>_…``, ``stack_pred_<Model>_…`` (three best models);
    ``best_model_…``, ``test_mae_…``, ``best_pred_…`` (final choice).

  Differences from the previous copy-pasted version:
    * Stacking works on clones, so the published ``*_model_tuned_*`` estimators
      stay fitted on the full training set instead of being overwritten.
    * Each base model's stacking features are computed once, not twice.
    * The result is also returned, so callers need not read it from globals.

  Args:
    corp_id: Hub identifier.
    cnee_addr: Region name; must be a key of ``cnee_addr_dict``.

  Returns:
    Tuple ``(best_model_name, test_mae, best_pred)``.
  """
  # Local import: scikit-learn is already a dependency of this notebook.
  from sklearn.base import clone

  x_train, y_train, x_test, y_test = get_train_test_set(corp_id, cnee_addr)
  cnee_addr_ = cnee_addr_dict[cnee_addr]
  suffix = "{}_{}".format(corp_id, cnee_addr_)
  x_train_n = x_train.values
  x_test_n = x_test.values
  y_train_n = y_train.values

  def publish(name, value):
    # Expose `value` as a notebook-level variable under `name`.
    globals()[name] = value

  # (name, base estimator, parameter grid), in the original evaluation order;
  # the order decides which model wins an exact MAE tie.
  candidates = [
      ("XGBRegressor", XGBRegressor_model, XGBRegressor_param),
      ("LGBMRegressor", LGBMRegressor_model, LGBMRegressor_param),
      ("SVR", SVR_model, SVR_param),
      ("GradientBoostingRegressor", GradientBoostingRegressor_model, GradientBoostingRegressor_param),
      ("AdaBoostRegressor", AdaBoostRegressor_model, AdaBoostRegressor_param),
  ]

  # --- Single models: tune, score, publish ---------------------------------
  tuned_models = dict()
  single_min_list = dict()   # model name -> test MAE
  single_preds = dict()
  for name, estimator, param_grid in candidates:
    tuned_models[name], single_min_list[name], single_preds[name] = print_best_params(
        estimator, param_grid, x_train, x_test, y_train, y_test)
    publish("{}_model_tuned_{}".format(name, suffix), tuned_models[name])
    publish("single_pred_{}_{}".format(name, suffix), single_preds[name])

  single_model_mae = sorted(single_min_list.items(), key = lambda item: item[1])
  top3 = [name for name, _ in single_model_mae[:3]]

  # --- Stacking features for the three best single models ------------------
  # A clone is used because get_stacking_base_datasets refits its model per fold.
  base_train = dict()
  base_test = dict()
  for name in top3:
    base_train[name], base_test[name] = get_stacking_base_datasets(
        clone(tuned_models[name]), x_train_n, y_train_n, x_test_n, 5)
    publish("{}_train_{}".format(name, suffix), base_train[name])
    publish("{}_test_{}".format(name, suffix), base_test[name])

  # --- Stacking: each top-3 model acts as meta model over the other two ----
  # Iterating in reverse keeps the original order (3rd best first, best last).
  stacking_list = dict()     # meta model name -> test MAE
  stack_preds = dict()
  for meta_name in reversed(top3):
    base_names = [name for name in top3 if name != meta_name]
    stack_final_x_train = np.concatenate([base_train[name] for name in base_names], axis=1)
    stack_final_x_test = np.concatenate([base_test[name] for name in base_names], axis=1)

    # Fresh unfitted copy with the tuned hyper-parameters.
    meta_model = clone(tuned_models[meta_name])
    meta_model.fit(stack_final_x_train, y_train)
    stack_preds[meta_name] = meta_model.predict(stack_final_x_test)
    stacking_list[meta_name] = mean_absolute_error(y_test, stack_preds[meta_name])

    publish("meta_model_{}_{}".format(meta_name, suffix), meta_model)
    publish("stack_pred_{}_{}".format(meta_name, suffix), stack_preds[meta_name])

  stacking_model_mae = sorted(stacking_list.items(), key = lambda item: item[1])

  # --- Final choice: lowest test MAE (ties go to stacking) ------------------
  best_single_name, best_single_mae = single_model_mae[0]
  best_stack_name, best_stack_mae = stacking_model_mae[0]

  if best_single_mae < best_stack_mae:
    print("{} 허브에서의 주소 코드 {}의 최적 모델은 단일 모델 {} : (test MAE값) {}".format(corp_id, cnee_addr, best_single_name, best_single_mae))
    best_name, best_mae, best_pred = best_single_name, best_single_mae, single_preds[best_single_name]
  else:
    print("{} 허브에서의 주소 코드 {}의 최적 모델은 stacking meta 모델 {} : (test MAE값) {}".format(corp_id, cnee_addr, best_stack_name, best_stack_mae))
    best_name, best_mae, best_pred = best_stack_name, best_stack_mae, stack_preds[best_stack_name]

  publish("best_model_{}".format(suffix), best_name)
  publish("test_mae_{}".format(suffix), best_mae)
  publish("best_pred_{}".format(suffix), best_pred)

  return best_name, best_mae, best_pred


In [None]:
# Run the single-vs-stacking model selection for every hub/region combination.
for corp_id, cnee_addr in [(hub, region) for hub in corp_id_df for region in cnee_addr_df]:
  get_optimal_model(corp_id, cnee_addr)

GP001 허브에서의 주소 코드 대구의 최적 모델은 단일 모델 GradientBoostingRegressor : (test MAE값) 175.2857
GP001 허브에서의 주소 코드 제주의 최적 모델은 단일 모델 XGBRegressor : (test MAE값) 30.336
GP001 허브에서의 주소 코드 서울의 최적 모델은 단일 모델 AdaBoostRegressor : (test MAE값) 667.2886
GP001 허브에서의 주소 코드 경기의 최적 모델은 단일 모델 AdaBoostRegressor : (test MAE값) 1226.2622
GP001 허브에서의 주소 코드 부산의 최적 모델은 단일 모델 XGBRegressor : (test MAE값) 262.3907
GP001 허브에서의 주소 코드 대전의 최적 모델은 단일 모델 XGBRegressor : (test MAE값) 114.3335
GP001 허브에서의 주소 코드 세종의 최적 모델은 단일 모델 GradientBoostingRegressor : (test MAE값) 58.9662
GP001 허브에서의 주소 코드 광주의 최적 모델은 단일 모델 XGBRegressor : (test MAE값) 116.6767
GP001 허브에서의 주소 코드 경남의 최적 모델은 단일 모델 XGBRegressor : (test MAE값) 272.2543
GP001 허브에서의 주소 코드 전북의 최적 모델은 단일 모델 XGBRegressor : (test MAE값) 111.3432
GP001 허브에서의 주소 코드 충남의 최적 모델은 단일 모델 XGBRegressor : (test MAE값) 186.6558
GP001 허브에서의 주소 코드 경북의 최적 모델은 단일 모델 GradientBoostingRegressor : (test MAE값) 146.6006
GP001 허브에서의 주소 코드 강원의 최적 모델은 단일 모델 GradientBoostingRegressor : (test MAE값) 93.42
GP001 허브에서의 주소 코드 인천

### 모델 학습 결과에 대한 예측

In [None]:
# Test-set predictions of the selected model for every hub/region pair,
# one column per pair named "<hub>_<region>".
cnee_pred_df = pd.DataFrame({
    str(hub) + '_' + str(region): getattr(mod, 'best_pred_{}_{}'.format(hub, cnee_addr_dict[region]))
    for hub in corp_id_df
    for region in cnee_addr_df
})

In [None]:
# Display the assembled prediction table (one column per hub/region pair).
cnee_pred_df

Unnamed: 0,GP001_대구,GP001_제주,GP001_서울,GP001_경기,GP001_부산,GP001_대전,GP001_세종,GP001_광주,GP001_경남,GP001_전북,GP001_충남,GP001_경북,GP001_강원,GP001_인천,GP001_전남,GP001_충북,GP001_울산,KX007_대구,KX007_제주,KX007_서울,KX007_경기,KX007_부산,KX007_대전,KX007_세종,KX007_광주,KX007_경남,KX007_전북,KX007_충남,KX007_경북,KX007_강원,KX007_인천,KX007_전남,KX007_충북,KX007_울산
0,266.84524,63.13691,1243.33333,1809.64706,411.30084,171.11591,60.62929,210.89046,506.66302,220.27405,265.07031,326.96191,200.48432,318.16248,210.59877,177.70598,141.91632,798.11111,97.5374,4913.18423,6223.86929,1110.26343,578.4295,231.22917,518.45365,1222.43231,720.29375,946.1319,880.45331,656.40509,1227.14783,613.46921,688.81732,318.90616
1,238.52731,47.39287,1136.88889,1618.0,374.08722,149.41684,53.95527,162.29953,482.91245,183.59094,316.10361,325.88298,202.7339,346.76052,173.88922,201.07335,134.81477,798.11111,124.74054,4014.81037,5146.37897,1229.41968,496.46005,159.52363,506.64507,1181.50694,519.4682,966.34052,832.41547,560.74347,986.04199,638.25826,570.31543,333.79306
2,242.82222,60.94118,1125.09091,1468.3125,426.14999,152.75861,48.84479,184.33655,531.80518,188.12408,286.31558,326.5915,213.17329,318.32338,173.8437,195.37018,160.96159,821.39394,143.47143,3730.75708,4945.21864,1048.07471,738.37402,113.08312,451.6576,1023.45706,493.25881,744.98315,766.48395,496.91449,991.83813,577.2908,600.48621,257.72757
3,158.61286,55.69253,1079.11111,1408.28571,358.00397,125.30203,39.66014,159.94031,463.98962,161.57452,281.18225,243.79152,208.42462,338.15925,187.27812,170.37631,153.14108,769.18182,106.08237,3902.98497,4885.7937,1022.09338,532.62897,142.49081,537.50377,1023.45888,478.85685,704.17877,899.15912,519.97888,1037.66882,480.99343,622.03625,265.88208
4,230.54101,59.86367,951.5,1375.14286,279.23511,123.46432,57.08689,158.8349,298.96127,175.11566,187.4505,252.14428,125.92429,200.35992,144.70972,117.15843,99.42002,798.11111,161.88575,3897.60986,4834.65433,1033.91907,459.93152,136.14138,458.22028,929.65899,495.27602,734.49194,839.03589,540.38458,973.15698,488.9686,557.09662,264.69809
5,164.10035,44.20194,755.72222,1118.33333,225.59822,102.73479,30.1831,163.16225,268.47235,110.51201,135.12032,172.86014,125.96607,180.95328,136.00162,121.11353,94.94286,741.75,63.36943,3430.82171,4751.56402,1032.91724,415.68906,130.21414,412.49229,937.85424,440.75892,756.14966,710.65247,459.62506,852.1275,462.32302,532.82062,258.79028
6,257.68436,66.91006,1057.57143,1408.28571,346.98257,180.23431,44.59936,138.37679,376.2681,159.12234,166.72865,248.97501,156.50138,299.7506,168.21397,132.67664,84.05082,769.18182,96.35046,3933.71969,6137.9277,1082.98596,472.50977,150.89559,520.89457,1052.99974,487.15749,752.28925,951.35455,631.80304,993.48077,480.27877,493.08841,338.76947
7,338.81481,79.8227,2186.0,2868.0,414.28186,197.76462,99.34183,174.38635,520.28577,211.83179,294.18161,450.98929,253.7775,451.84362,253.17146,189.86322,170.86667,769.18182,66.96092,4324.67375,6310.95412,1258.87939,595.52472,208.88022,624.06357,1305.30244,683.78279,872.79279,957.83234,675.51074,1269.74536,568.63828,594.79352,330.7034
8,353.24079,67.3382,1662.58333,1936.66667,443.93451,164.67754,68.82184,176.7516,600.09314,196.92358,354.26355,350.57596,293.1647,364.84626,188.50542,244.33122,189.24286,774.63333,144.82248,4002.75333,5949.4769,1014.62256,548.27954,188.38855,483.31208,1009.89776,756.27632,743.56348,779.30396,600.88623,945.48383,632.82388,576.60144,387.32303
9,291.03624,65.5093,1173.3125,1581.7037,467.81897,159.19557,82.55874,165.9135,569.55194,177.73082,315.59708,381.98709,279.97885,351.89274,223.0592,260.84668,130.01637,768.0,114.75393,3698.2213,5944.00451,1082.81812,554.55908,138.20019,532.95042,1088.51224,461.499,670.50818,816.6012,615.91809,943.56396,495.25159,588.43201,262.37433


In [None]:
# Export the prediction table as UTF-8 CSV without the index column.
# The target folder is created first so the export does not fail when the
# directory does not exist yet.
output_path = './모델링/예측값/cnee_예측값.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
cnee_pred_df.to_csv(output_path, encoding = 'utf-8', index = False)