#### download package

In [None]:
# download sktime package 
!pip install sktime



In [None]:
# Import the required packages
import os
import sys
import warnings
import plotly
import numpy as np
import pandas as pd
import datetime
import tensorflow as tf
from tqdm import tqdm
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, train_test_split, TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler, StandardScaler
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.utils.plotting import plot_series
from scipy.stats import reciprocal 

from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
import lightgbm as lgb
from lightgbm.sklearn import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor

# Matplotlib defaults for the whole notebook: 8x6 figures, no grid lines.
mpl.rcParams.update({
    'figure.figsize': (8, 6),
    'axes.grid': False,
})

# Suppress every warning and print floats with five decimals in pandas output.
warnings.simplefilter('ignore')
pd.set_option('display.float_format', '{:.5f}'.format)

## Model training

### making datasets

In [None]:
# Shop codes to model. Despite the "_df" suffix this is a plain list of
# strings; the order defines the column order of the final prediction table.
shpr_df = """
90001302 90001441 90001542 90001341 90001541
90001443 90001381 90001521 90001582 90001602
90001662 90001622 90001682 90001683 90001702
90001703 90001705 90001704 90001664 90001768
90001765 90001776 90001774 90001842
""".split()

In [None]:
# Build the scaled train/test split used to fit and evaluate one shop's models.
def get_train_test_set(shpr_cd, split_date='2021-06-20'):
  """Load one shop's feature table and return a scaled train/test split.

  Reads ``shpr_cd_<shpr_cd>.pkl`` from the working directory, splits the rows
  on ``BKG_DATE`` and standardises the numeric feature columns with a scaler
  fitted on the training rows only.

  Args:
    shpr_cd: Shop code (string) used to build the pickle file name.
    split_date: Rows with ``BKG_DATE <= split_date`` form the training set,
      rows with ``BKG_DATE > split_date`` form the test set.

  Returns:
    Tuple ``(train_x, train_y, test_x, test_y)``. The x frames have
    ``ITEM_QTY`` and ``BKG_DATE`` removed; the y series are ``ITEM_QTY``.
  """
  # NOTE: pickle files execute arbitrary code when loaded; only read trusted files.
  df = pd.read_pickle(f"shpr_cd_{shpr_cd}.pkl")

  # Explicit copies: the scaled values below must be written into independent
  # frames, not into filtered views of ``df``.
  train = df[df['BKG_DATE'] <= split_date].copy()
  test = df[df['BKG_DATE'] > split_date].copy()

  # Columns to standardise.
  scaling_features = ['DAY_1', 'DAY_2', 'DAY_3', 'WEEK_AMT',
       '100이상 27307.5미만','27307.5이상 63200미만','63200이상 133375미만', '133375이상 290100미만',
       '290100이상 1170901미만','MEAN_PRICE', '강원', '경기', '경남', '경북', '광주',
       '대구', '대전', '부산', '서울', '세종', '울산', '인천', '전남', '전북', '제주', '충남', '충북',
       '0.0', '1.0', '10', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '8.0', '9.0']

  # Fit on the training rows only so no test-set statistics leak into the scaler.
  scaler = StandardScaler()
  # Whole-column assignment replaces the columns (and their dtype) instead of
  # writing floats in place into columns that may be integer typed.
  train[scaling_features] = scaler.fit_transform(train[scaling_features])
  test[scaling_features] = scaler.transform(test[scaling_features])

  non_feature_columns = ['ITEM_QTY', 'BKG_DATE']
  train_x = train.drop(non_feature_columns, axis=1)
  train_y = train['ITEM_QTY']

  test_x = test.drop(non_feature_columns, axis=1)
  test_y = test['ITEM_QTY']
  return train_x, train_y, test_x, test_y

### 모델 별 정의 및 파라미터 최적화

In [None]:
# Hyper-parameter grids searched for each model.
XGBRegressor_param = {
    'n_estimators': [100],
    'eta': [0.01],
    'min_child_weight': np.arange(1, 8, 1),
    'max_depth': np.arange(3, 9, 1),
    # np.arange excludes the stop value, so 1.0 itself is not part of these grids.
    'colsample_bytree': np.arange(0.8, 1.0, 0.1),
    'subsample': np.arange(0.8, 1.0, 0.1),
}
LGBMRegressor_param = {
    # The original literal listed 'max_depth' twice; Python keeps only the last
    # value of a duplicated key, so [6, 8, 10] is the grid that was actually
    # searched. The dead range(3, 15, 3) entry has been removed.
    'max_depth': [6, 8, 10],
    'min_child_weight': range(1, 6, 2),
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
    'learning_rate': [0.1, 0.01],
}
SVR_param = {'kernel': ['linear'], 'C': [1.0], 'epsilon': [0.1]}
GradientBoostingRegressor_param = {
    'n_estimators': [100],
    'max_depth': np.arange(3, 20, 3),
}
AdaBoostRegressor_param = {
    'n_estimators': np.arange(25, 100, 25),
    'loss': ['linear', 'square', 'exponential'],
    # NOTE(review): with the default step of 1 this yields the single value 0.1;
    # pass an explicit step if a real sweep over learning rates was intended.
    'learning_rate': np.arange(0.1, 1),
}

In [None]:
# Base (untuned) estimator for each model family; the grid search clones these
# and overrides whichever parameters appear in the matching *_param grid.
XGBRegressor_model = XGBRegressor(n_estimators=100, objective='reg:squarederror')
LGBMRegressor_model = LGBMRegressor(n_estimators=80)
SVR_model = SVR(kernel='linear', C=1.0, epsilon=0.1)
GradientBoostingRegressor_model = GradientBoostingRegressor(n_estimators=100, max_depth=3)
# The default weak learner is used. The former ``base_estimator=None`` keyword
# only restated that default and is rejected by scikit-learn releases that
# removed the deprecated ``base_estimator`` parameter.
AdaBoostRegressor_model = AdaBoostRegressor()

In [None]:
# Grid-search one model and score the tuned estimator on the test split.
def print_best_params(model, params, x_train, x_test, y_train, y_test, log=False):
  """Tune ``model`` with a time-series grid search and evaluate it on the test data.

  Args:
    model: Estimator to tune.
    params: Parameter grid handed to ``GridSearchCV``.
    x_train, y_train: Data used for the 5-split ``TimeSeriesSplit`` search.
    x_test, y_test: Hold-out data the tuned estimator is scored on.
    log: When True, ``np.expm1`` is applied to both ``y_test`` and the
      predictions before the error is computed.

  Returns:
    Tuple ``(best_estimator, test_mae, predictions)`` where ``test_mae`` is the
    mean absolute error on the test data rounded to four decimals.
  """
  search = GridSearchCV(
      model,
      param_grid=params,
      cv=TimeSeriesSplit(n_splits=5),
      scoring='neg_mean_absolute_error',
  )
  search.fit(x_train, y_train)

  tuned_estimator = search.best_estimator_
  predictions = tuned_estimator.predict(x_test)

  if log:
    # Map targets and predictions back from log1p space.
    y_test, predictions = np.expm1(y_test), np.expm1(predictions)

  test_mae = np.round(mean_absolute_error(y_test, predictions), 4)
  return tuned_estimator, test_mae, predictions

In [None]:
# Build the stacking features (meta-model inputs) produced by one base model.
def get_stacking_base_datasets(model, x_train_n, y_train_n, x_test_n, n_splits=5):
  """Generate meta-model features from ``model`` using time-series folds.

  For every ``TimeSeriesSplit`` fold the model is fitted (in place) on the
  fold's training rows, then predicts the fold's validation rows and the full
  test set.

  Args:
    model: Estimator exposing ``fit``/``predict``; it is refitted on every fold.
    x_train_n: Training features as an array indexable by integer index arrays.
    y_train_n: Training targets, indexable the same way.
    x_test_n: Test features.
    n_splits: Number of time-series folds.

  Returns:
    Tuple ``(train_fold_pred, test_pred_mean)``: a ``(n_train, 1)`` array with
    the validation-fold predictions and a ``(n_test, 1)`` array with the test
    predictions averaged over the folds.
  """
  splitter = TimeSeriesSplit(n_splits)

  # NOTE(review): rows that never land in a validation fold keep this initial
  # 0 -- with TimeSeriesSplit that should be the earliest block of the training
  # data; confirm this is acceptable for the meta model.
  oof_predictions = np.zeros((x_train_n.shape[0], 1))
  test_predictions = np.zeros((x_test_n.shape[0], n_splits))

  for fold_no, (fit_rows, valid_rows) in enumerate(splitter.split(x_train_n)):
    # Train on this fold's training rows only.
    model.fit(x_train_n[fit_rows], y_train_n[fit_rows])
    # Predictions on the fold's validation rows become meta-model training features.
    oof_predictions[valid_rows, :] = model.predict(x_train_n[valid_rows]).reshape(-1, 1)
    # Each fold's model also predicts the complete test set.
    test_predictions[:, fold_no] = model.predict(x_test_n)

  # Average the per-fold test predictions into a single feature column.
  return oof_predictions, test_predictions.mean(axis=1).reshape(-1, 1)

### 모델 학습 과정
    - 단일 모델 학습 및 stacking 모델 학습
    - 모델별 성능 비교 후 최적의 모델 반환

In [None]:
import sys
# Handle to the module this code runs in; the notebook uses getattr(mod, name)
# to read module-level variables whose names are built at run time.
mod = sys.modules[__name__]

In [None]:
# Tune every candidate model for one shop, try stacking the three best ones,
# and publish whichever approach has the lowest test MAE.
def get_optimal_model(shpr_cd):
  """Select the best single or stacked model for one shop code.

  Steps:
    1. Grid-search all five model families and rank them by test MAE.
    2. Build stacking features from each of the three best models.
    3. For each of those three models, train a fresh copy as meta model on the
       stacking features of the other two and score it on the test data.
    4. Keep the single model if its MAE is strictly lower than the best
       stacking MAE, otherwise keep the best stacked model.

  The following module-level variables are published (``<M>`` is a model class
  name, ``<cd>`` the shop code):
    ``<M>_model_tuned_<cd>``, ``single_pred_<M>_<cd>`` for all five models;
    ``<M>_train_<cd>``, ``<M>_test_<cd>``, ``meta_model_<M>_<cd>``,
    ``stack_pred_<M>_<cd>`` for the three best models;
    ``best_model_<cd>``, ``test_mae_<cd>``, ``best_pred_<cd>`` for the winner.

  NOTE(review): ranking and final selection both use the test MAE, so the
  reported MAE is optimistic as an estimate of out-of-sample error.

  Args:
    shpr_cd: Shop code passed to ``get_train_test_set``.

  Returns:
    Tuple ``(best_model_name, test_mae, best_pred)`` describing the winner.
  """
  # Local import: only this function needs clone.
  from sklearn.base import clone

  x_train, y_train, x_test, y_test = get_train_test_set(shpr_cd)
  x_train_n = x_train.values
  x_test_n = x_test.values
  y_train_n = y_train.values

  namespace = globals()

  candidates = {
      "XGBRegressor": (XGBRegressor_model, XGBRegressor_param),
      "LGBMRegressor": (LGBMRegressor_model, LGBMRegressor_param),
      "SVR": (SVR_model, SVR_param),
      "GradientBoostingRegressor": (GradientBoostingRegressor_model, GradientBoostingRegressor_param),
      "AdaBoostRegressor": (AdaBoostRegressor_model, AdaBoostRegressor_param),
  }

  # --- single models -------------------------------------------------------
  tuned_models = dict()
  single_min_list = dict()
  for name, (estimator, param_grid) in candidates.items():
    tuned, test_mae, pred = print_best_params(estimator, param_grid, x_train, x_test, y_train, y_test)
    tuned_models[name] = tuned
    single_min_list[name] = test_mae
    namespace["{}_model_tuned_{}".format(name, shpr_cd)] = tuned
    namespace["single_pred_{}_{}".format(name, shpr_cd)] = pred

  single_model_mae = sorted(single_min_list.items(), key=lambda item: item[1])
  top_names = [name for name, _ in single_model_mae[:3]]

  # --- stacking features of the three best models --------------------------
  # Each model is cloned before being refitted on the folds so the tuned
  # estimator published above stays fitted on the original features, and each
  # base dataset is computed once instead of once per combination.
  base_sets = dict()
  for name in top_names:
    base_sets[name] = get_stacking_base_datasets(clone(tuned_models[name]), x_train_n, y_train_n, x_test_n, 5)
    namespace["{}_train_{}".format(name, shpr_cd)] = base_sets[name][0]
    namespace["{}_test_{}".format(name, shpr_cd)] = base_sets[name][1]

  # --- every "two base models + one meta model" combination ----------------
  stacking_list = dict()
  # Start with the third-best model as meta model (first case), then the
  # second-best, then the best, so ties in MAE resolve in that order.
  for meta_name in reversed(top_names):
    base_names = [name for name in top_names if name != meta_name]
    stack_final_x_train = np.concatenate([base_sets[name][0] for name in base_names], axis=1)
    stack_final_x_test = np.concatenate([base_sets[name][1] for name in base_names], axis=1)

    # A fresh, unfitted copy with the tuned hyper-parameters acts as meta model.
    meta_model = clone(tuned_models[meta_name])
    meta_model.fit(stack_final_x_train, y_train)
    stack_pred = meta_model.predict(stack_final_x_test)

    namespace["meta_model_{}_{}".format(meta_name, shpr_cd)] = meta_model
    namespace["stack_pred_{}_{}".format(meta_name, shpr_cd)] = stack_pred
    stacking_list[meta_name] = mean_absolute_error(y_test, stack_pred)

  stacking_model_mae = sorted(stacking_list.items(), key=lambda item: item[1])

  # --- pick the winner: the single model only wins with a strictly lower MAE
  best_single_name, best_single_mae = single_model_mae[0]
  best_stack_name, best_stack_mae = stacking_model_mae[0]

  if best_single_mae < best_stack_mae:
    print("쇼핑몰 코드 {}의 최적 모델은 단일 모델 {} : (test MAE값) {}".format(shpr_cd, best_single_name, best_single_mae))
    best_name, best_mae = best_single_name, best_single_mae
    best_pred = namespace["single_pred_{}_{}".format(best_single_name, shpr_cd)]
  else:
    print("쇼핑몰 코드 {}의 최적 모델은 stacking meta 모델 {} : (test MAE값) {}".format(shpr_cd, best_stack_name, best_stack_mae))
    best_name, best_mae = best_stack_name, best_stack_mae
    best_pred = namespace["stack_pred_{}_{}".format(best_stack_name, shpr_cd)]

  namespace["best_model_{}".format(shpr_cd)] = best_name
  namespace["test_mae_{}".format(shpr_cd)] = best_mae
  namespace["best_pred_{}".format(shpr_cd)] = best_pred

  return best_name, best_mae, best_pred


In [None]:
# Run the model selection for every shop code; each call prints the winning
# model together with its test MAE.
for shpr_cd in shpr_df:
  get_optimal_model(shpr_cd)

쇼핑몰 코드 90001302의 최적 모델은 stacking meta 모델 SVR : (test MAE값) 60.218552047014235
쇼핑몰 코드 90001441의 최적 모델은 stacking meta 모델 GradientBoostingRegressor : (test MAE값) 30.406663026077627
쇼핑몰 코드 90001542의 최적 모델은 단일 모델 GradientBoostingRegressor : (test MAE값) 30.0344
쇼핑몰 코드 90001341의 최적 모델은 단일 모델 GradientBoostingRegressor : (test MAE값) 30.2651
쇼핑몰 코드 90001541의 최적 모델은 단일 모델 GradientBoostingRegressor : (test MAE값) 34.5527
쇼핑몰 코드 90001443의 최적 모델은 단일 모델 GradientBoostingRegressor : (test MAE값) 139.5502
쇼핑몰 코드 90001381의 최적 모델은 stacking meta 모델 AdaBoostRegressor : (test MAE값) 0.0
쇼핑몰 코드 90001521의 최적 모델은 단일 모델 SVR : (test MAE값) 29.3362
쇼핑몰 코드 90001582의 최적 모델은 단일 모델 GradientBoostingRegressor : (test MAE값) 142.9214
쇼핑몰 코드 90001602의 최적 모델은 단일 모델 GradientBoostingRegressor : (test MAE값) 88.0169
쇼핑몰 코드 90001662의 최적 모델은 단일 모델 XGBRegressor : (test MAE값) 14.6649
쇼핑몰 코드 90001622의 최적 모델은 단일 모델 XGBRegressor : (test MAE값) 136.0799
쇼핑몰 코드 90001682의 최적 모델은 단일 모델 GradientBoostingRegressor : (test MAE값) 35.9206
쇼핑몰 코드 900

### 모델 학습 결과에 대한 예측

In [None]:
# Expose each shop's winning test-set prediction as pred_<shop code>.
for shpr_cd in shpr_df:
  # Kept from the original cell so x_train / y_train / x_test / y_test remain
  # defined at module level (they hold the split of the last shop afterwards).
  x_train, y_train, x_test, y_test = get_train_test_set(shpr_cd)
  # best_pred_<code> is the prediction of whichever approach won for the shop.
  # Reading stack_pred_<best model>_<code> instead returned the stacked
  # prediction even for shops where a single model was selected.
  globals()['pred_{}'.format(shpr_cd)] = getattr(mod, "best_pred_{}".format(shpr_cd))

In [None]:
# Test-set predictions of the selected model, one column per shop code
# (columns follow the order of shpr_df).
shpr_pred_df = pd.DataFrame({
    str(shpr_cd): getattr(mod, 'best_pred_{}'.format(shpr_cd))
    for shpr_cd in shpr_df
})

In [None]:
# Display the prediction table (one column per shop code).
shpr_pred_df

Unnamed: 0,90001302,90001441,90001542,90001341,90001541,90001443,90001381,90001521,90001582,90001602,90001662,90001622,90001682,90001683,90001702,90001703,90001705,90001704,90001664,90001768,90001765,90001776,90001774,90001842
0,3226.09386,1184.94575,1416.28051,1477.83446,2389.53713,1776.70269,0.0,658.68083,1751.88053,2932.72793,174.40498,2634.38721,1450.75144,109.5,1353.64319,2611.08004,1406.9,891.76732,69.85862,210.30665,1e-05,28.17773,0.0,0.0
1,3011.82735,2050.50788,1238.59269,1424.99268,2135.31718,1871.27577,0.0,847.62429,1631.23147,2215.02971,123.79654,1464.72363,1330.24467,130.41667,1361.34961,2072.81506,1082.36364,875.11641,61.64254,110.79168,1e-05,36.07948,0.0,0.0
2,2794.30549,890.04308,1559.96406,1334.68888,1793.24514,1871.52703,0.0,558.44538,1539.70471,2446.65606,184.56906,1546.18298,1263.98054,107.22727,1316.06238,2026.09197,960.0,999.82339,41.10904,103.41867,1e-05,28.5696,0.0,0.0
3,3186.88129,809.55347,1261.85854,1129.26486,1584.23225,1515.24092,0.0,922.16851,1450.13645,1999.05078,199.51991,1285.97253,1296.2301,108.0,1167.8175,1932.93337,1092.75,793.45183,54.07507,110.12193,1e-05,26.11894,0.0,0.0
4,3860.56491,701.91556,1238.17586,1241.06078,1156.71312,1241.10308,0.0,577.27586,2558.92395,1595.77271,138.92957,2669.20337,869.64703,83.84,1111.61829,1434.33167,856.75,1329.68656,39.63033,103.1428,1e-05,33.97936,0.0,0.0
5,2176.744,690.30646,1140.97781,867.23572,984.39306,4220.84825,0.0,345.6618,1065.20288,1486.88818,104.92847,1041.50696,926.40427,107.22727,925.45831,1303.00698,861.66667,601.43248,35.05866,96.64119,1e-05,13.44701,0.0,0.0
6,2773.45692,901.28155,2095.44362,1333.9502,1359.3921,2860.46988,0.0,419.00848,2191.98755,2431.02847,160.56714,1664.49744,1442.41318,246.2,1416.61011,1744.96246,1084.85714,890.08443,56.74568,103.18358,1e-05,36.02044,0.0,0.0
7,2793.50248,961.56303,1248.82314,1333.77664,2230.6199,4058.13191,0.0,1817.44508,3941.76067,2117.9182,187.61429,1363.29578,1197.382,94.5,1593.49475,1627.57344,963.0,830.33288,46.26468,4708.8766,1e-05,127.33578,0.0,0.0
8,3297.37037,900.47106,1505.8822,2084.6048,1684.43836,1952.29428,0.0,749.63406,1766.64692,2287.93031,193.76361,2609.33325,1477.11179,76.21739,1378.05933,1968.1275,1059.11111,1199.68109,34.59394,890.52009,1e-05,390.6758,0.0,0.0
9,2912.04489,995.3511,1378.53901,1472.73507,1944.4952,1568.07465,0.0,457.1989,1667.25212,2453.16488,230.95,2587.87524,1272.85608,160.33333,1568.12134,2249.80849,1084.85714,907.15326,38.35415,783.60746,1e-05,10109.11512,0.0,0.0


In [None]:
# Write the prediction table to CSV. to_csv does not create missing folders,
# so make sure the nested target directory exists first.
output_path = './모델링/예측값/shpr_예측값.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
shpr_pred_df.to_csv(output_path, encoding = 'utf-8', index = False)