In [None]:
# Authenticate the Colab session, then mount Google Drive so the notebook can
# reach its files under /content/gdrive.
from google.colab import auth, drive

auth.authenticate_user()
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# download sktime package 
!pip install sktime

Collecting sktime
  Downloading sktime-0.8.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.1 MB)
[K     |████████████████████████████████| 6.1 MB 4.8 MB/s 
Collecting statsmodels>=0.12.1
  Downloading statsmodels-0.13.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 47.8 MB/s 
[?25hCollecting numba>=0.53
  Downloading numba-0.54.1-cp37-cp37m-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 28.7 MB/s 
[?25hCollecting scikit-learn>=0.24.0
  Downloading scikit_learn-1.0.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.2 MB)
[K     |████████████████████████████████| 23.2 MB 1.3 MB/s 
[?25hCollecting llvmlite<0.38,>=0.37.0rc1
  Downloading llvmlite-0.37.0-cp37-cp37m-manylinux2014_x86_64.whl (26.3 MB)
[K     |████████████████████████████████| 26.3 MB 92 kB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.0.0-py3-non

In [None]:
# Import the required packages
import os
import sys
import warnings
import plotly
import numpy as np
import pandas as pd
import datetime
import tensorflow as tf
from tqdm import tqdm
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, train_test_split, TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler, StandardScaler
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.utils.plotting import plot_series
from scipy.stats import reciprocal 

from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
import lightgbm as lgb
from lightgbm.sklearn import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor

# Plot defaults: 8 x 6 figures with the axes grid switched off.
mpl.rcParams.update({'figure.figsize': (8, 6), 'axes.grid': False})

# Silence every warning for the rest of the session and render floats with
# five decimals in pandas output.
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', '{:.5f}'.format)

In [None]:
# Work from the shared-drive modelling folder: get_train_test_set opens the
# per-shipper pickle files with paths relative to the current directory.
%cd /content/gdrive/Shareddrives/cj공모전/최종코드/모델링/shpr_max

/content/gdrive/Shareddrives/cj공모전/최종코드/모델링/shpr_max


In [None]:
# Shipper codes to model, kept as strings. Despite the name this is a plain
# list, not a DataFrame.
shpr_df = ['90001702', '90001705']

In [None]:
# Build the train and test datasets needed for model fitting
def get_train_test_set(shpr_cd, split_date='2021-06-20'):
  """Load one shipper's data and return scaled train/test features and targets.

  Reads ``item_cd_<shpr_cd>.pkl`` from the current working directory, splits
  the rows on ``BKG_DATE`` (rows up to and including ``split_date`` form the
  training set, later rows the test set), standardises the feature columns
  with a scaler fitted on the training rows only, and separates the
  ``ITEM_QTY`` target from the features.

  Args:
    shpr_cd: Shipper code used to build the pickle file name.
    split_date: Last ``BKG_DATE`` value that still belongs to the training set.

  Returns:
    Tuple ``(train_x, train_y, test_x, test_y)``. The feature frames exclude
    the ``ITEM_QTY``, ``BKG_DATE`` and ``week`` columns.
  """
  # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
  df = pd.read_pickle("item_cd_" + str(shpr_cd) + ".pkl")

  # Copy the slices so the scaling below writes into independent frames.
  # Assigning into an un-copied slice of df raises SettingWithCopyWarning
  # (silenced globally in this notebook) and is not guaranteed to take effect.
  train = df[df['BKG_DATE'] <= split_date].copy()
  test = df[df['BKG_DATE'] > split_date].copy()

  # Columns to standardise
  scaling_features = ['요일', '휴일여부', 'DAY_1', 'DAY_2', 'DAY_3', 'WEEK_AMT', 'MEAN_PRICE',
       '강원', '경기', '경남', '경북', '광주', '대구', '대전', '부산', '서울', '세종', '울산', '인천',
       '전남', '전북', '제주', '충남', '충북']

  # Fit on the training rows only, then apply the same transform to the test
  # rows, so no test-set statistics leak into the scaling.
  scaler = StandardScaler()
  train[scaling_features] = scaler.fit_transform(train[scaling_features])
  test[scaling_features] = scaler.transform(test[scaling_features])

  non_feature_columns = ['ITEM_QTY', 'BKG_DATE', 'week']

  train_y = train['ITEM_QTY']
  train_x = train.drop(non_feature_columns, axis=1)

  test_y = test['ITEM_QTY']
  test_x = test.drop(non_feature_columns, axis=1)

  return train_x, train_y, test_x, test_y

In [None]:
# Hyper-parameter search grids, one per model (passed as param_grid to
# GridSearchCV by print_best_params)
XGBRegressor_param = {
    'n_estimators': [100],
    'eta': [0.01],
    'min_child_weight': np.arange(1, 8, 1),
    'max_depth': np.arange(3, 9, 1),
    'colsample_bytree': np.arange(0.8, 1.0, 0.1),
    'subsample': np.arange(0.8, 1.0, 0.1),
}

# 'max_depth' used to appear twice in this literal. Python keeps only the last
# value of a duplicated key, so the grid that was actually searched was
# [6, 8, 10]; the dead range(3, 15, 3) entry is removed to make that explicit.
LGBMRegressor_param = {
    'max_depth': [6, 8, 10],
    'min_child_weight': range(1, 6, 2),
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
    'learning_rate': [0.1, 0.01],
}

SVR_param = {'kernel': ['linear'], 'C': [1.0], 'epsilon': [0.1]}

GradientBoostingRegressor_param = {
    'n_estimators': [100],
    'max_depth': np.arange(3, 20, 3),
}

# NOTE(review): np.arange(0.1, 1) uses the default step of 1, so the only
# learning rate searched is 0.1. Pass an explicit step (e.g. 0.1) if a range
# of values was intended; left unchanged here to keep the search identical.
AdaBoostRegressor_param = {
    'n_estimators': np.arange(25, 100, 25),
    'loss': ['linear', 'square', 'exponential'],
    'learning_rate': np.arange(0.1, 1),
}

In [None]:
# Estimator instances, one per model; these are the objects handed to the grid
# search together with the matching *_param grid.
# NOTE(review): none of the estimators sets a random seed, so the tree-based
# models may give slightly different results from run to run.
XGBRegressor_model = XGBRegressor(n_estimators=100, objective='reg:squarederror')
LGBMRegressor_model = LGBMRegressor(n_estimators=80)
SVR_model = SVR(kernel='linear', C=1.0, epsilon=0.1)
GradientBoostingRegressor_model = GradientBoostingRegressor(n_estimators=100, max_depth=3)
# The former base_estimator=None argument only restated the default; that
# keyword was deprecated in scikit-learn 1.2 and removed in 1.4, so it is
# omitted to keep the cell working on current releases.
AdaBoostRegressor_model = AdaBoostRegressor()

In [None]:
# Tune a single model by grid search and evaluate the best configuration
def print_best_params(model, params, x_train, x_test, y_train, y_test, log=False):
  """Grid-search ``model`` over ``params`` and score the winner on the test set.

  The search runs on the training data with a 5-split TimeSeriesSplit and
  negative mean absolute error as the selection score.

  Args:
    model: Estimator to tune.
    params: Parameter grid passed to GridSearchCV.
    x_train, y_train: Training features and target used for the search.
    x_test, y_test: Test features and target used for the final score.
    log: When true, both ``y_test`` and the predictions are passed through
      ``np.expm1`` before scoring (presumably undoing a log1p transform of the
      target -- confirm against the data preparation).

  Returns:
    Tuple ``(best_model, test_mae, pred)``: the best estimator found by the
    search, its test-set MAE rounded to 4 decimals, and its test predictions.
  """
  search = GridSearchCV(
      model,
      param_grid=params,
      cv=TimeSeriesSplit(n_splits=5),
      scoring='neg_mean_absolute_error',
  )
  search.fit(x_train, y_train)

  best_model = search.best_estimator_
  pred = best_model.predict(x_test)

  if log:
    y_test, pred = np.expm1(y_test), np.expm1(pred)

  test_mae = np.round(mean_absolute_error(y_test, pred), 4)
  return best_model, test_mae, pred

In [None]:
# Build the stacking (meta-model) input columns from one base model's fold predictions
def get_stacking_base_datasets(model, x_train_n, y_train_n, x_test_n, n_splits=5):
  """Produce one meta-feature column for the train set and one for the test set.

  ``model`` is refitted on each TimeSeriesSplit training fold. Its predictions
  for the matching validation fold become the training meta-feature, and its
  predictions for the full test set are averaged over the folds to form the
  test meta-feature.

  Args:
    model: Estimator with ``fit``/``predict``; it is fitted in place and is
      left fitted on the last fold when the function returns.
    x_train_n: Training features as a NumPy array.
    y_train_n: Training target as a NumPy array.
    x_test_n: Test features as a NumPy array.
    n_splits: Number of TimeSeriesSplit splits.

  Returns:
    Tuple ``(train_fold_pred, test_pred_mean)`` with shapes
    ``(n_train, 1)`` and ``(n_test, 1)``.
  """
  # Time-ordered splits: every validation fold lies after its training fold
  tss = TimeSeriesSplit(n_splits=n_splits)

  # Buffers for the meta model's training column and the per-fold test predictions
  train_fold_pred = np.zeros((x_train_n.shape[0], 1))
  test_pred = np.zeros((x_test_n.shape[0], n_splits))

  for folder_counter, (train_index, valid_index) in enumerate(tss.split(x_train_n)):
    # Slice this fold's training rows and validation rows
    x_tr = x_train_n[train_index]
    y_tr = y_train_n[train_index]
    x_te = x_train_n[valid_index]

    # Fit the base model on the fold's training rows
    model.fit(x_tr, y_tr)

    if folder_counter == 0:
      # TimeSeriesSplit never puts the earliest rows into a validation fold, so
      # they would otherwise keep the zero placeholder and feed the meta model
      # a fake feature value. Fill them with this first model's predictions on
      # its own training rows (in-sample, hence optimistic, but real values).
      train_fold_pred[train_index, :] = model.predict(x_tr).reshape(-1, 1)

    # Out-of-fold predictions become the meta model's training feature
    train_fold_pred[valid_index, :] = model.predict(x_te).reshape(-1, 1)
    # Predictions for the untouched test set, one column per fold
    test_pred[:, folder_counter] = model.predict(x_test_n)

  # Average the per-fold test predictions into a single test feature column
  test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

  return train_fold_pred, test_pred_mean

In [None]:
# Handle to this notebook's own module object, so that globals created under
# dynamically built names can be read back with getattr(mod, name).
import sys
mod = sys.modules[__name__]

In [None]:

def get_optimal_model(shpr_cd):
  """Select the best model for one shipper: tuned single models vs. stacks.

  Steps:
    1. Grid-search each of the five single models and score it on the test set.
    2. Keep the three single models with the lowest test MAE.
    3. For each of those three, use it as the meta model on top of the fold
       predictions of the other two, and score that stack on the test set.
    4. Compare the best single model with the best stack, print the winner and
       record it.

  Nothing is returned. Results are published as notebook globals, each name
  suffixed with ``_<shpr_cd>``:
    ``<Model>_model_tuned``  tuned single estimator (all five models)
    ``single_pred_<Model>``  its test predictions (all five models)
    ``<Model>_train`` / ``<Model>_test``  stacking feature columns (top three)
    ``meta_model_<Model>``   fitted meta model (top three)
    ``stack_pred_<Model>``   test predictions of that stack (top three)
    ``best_model`` / ``test_mae`` / ``best_pred``  name, test MAE and test
      predictions of the overall winner

  NOTE(review): both the top-three choice and the final winner are selected on
  the test set, so the reported test MAE is an optimistic estimate.

  Args:
    shpr_cd: Shipper code; selects the data file and suffixes the global names.
  """
  # Local import keeps this cell self-contained; scikit-learn is already a
  # dependency of the notebook.
  from sklearn.base import clone

  x_train, y_train, x_test, y_test = get_train_test_set(shpr_cd)
  x_train_n = x_train.values
  x_test_n = x_test.values
  y_train_n = y_train.values

  namespace = globals()

  def tuned_key(model_name):
    return "{}_model_tuned_{}".format(model_name, shpr_cd)

  candidates = [
      ("XGBRegressor", XGBRegressor_model, XGBRegressor_param),
      ("LGBMRegressor", LGBMRegressor_model, LGBMRegressor_param),
      ("SVR", SVR_model, SVR_param),
      ("GradientBoostingRegressor", GradientBoostingRegressor_model, GradientBoostingRegressor_param),
      ("AdaBoostRegressor", AdaBoostRegressor_model, AdaBoostRegressor_param),
  ]

  # Tune and score every single model
  single_min_list = dict()
  for model_name, estimator, grid in candidates:
    tuned, test_mae, pred = print_best_params(estimator, grid, x_train, x_test, y_train, y_test)
    namespace[tuned_key(model_name)] = tuned
    single_min_list[model_name] = test_mae
    namespace["single_pred_{}_{}".format(model_name, shpr_cd)] = pred

  single_model_mae = sorted(single_min_list.items(), key=lambda item: item[1])
  top_names = [model_name for model_name, _ in single_model_mae[:3]]

  # Build each top model's stacking columns once. A clone is fitted so the
  # tuned single estimator stored above is not refitted on fold subsets.
  base_sets = dict()
  for model_name in top_names:
    fold_train, fold_test = get_stacking_base_datasets(
        clone(namespace[tuned_key(model_name)]), x_train_n, y_train_n, x_test_n, 5)
    namespace["{}_train_{}".format(model_name, shpr_cd)] = fold_train
    namespace["{}_test_{}".format(model_name, shpr_cd)] = fold_test
    base_sets[model_name] = (fold_train, fold_test)

  # Each top model takes one turn as meta model over the other two. The
  # third-ranked model goes first so ties between stacks are broken the same
  # way as before (sorted() is stable).
  stacking_list = dict()
  for meta_name in reversed(top_names):
    base_names = [model_name for model_name in top_names if model_name != meta_name]
    stack_final_x_train = np.concatenate([base_sets[b][0] for b in base_names], axis=1)
    stack_final_x_test = np.concatenate([base_sets[b][1] for b in base_names], axis=1)

    # Fit a clone as meta model: fitting the tuned estimator itself would
    # overwrite the single model that produced single_pred_<Model>.
    meta_model = clone(namespace[tuned_key(meta_name)])
    meta_model.fit(stack_final_x_train, y_train)
    stack_pred = meta_model.predict(stack_final_x_test)

    namespace["meta_model_{}_{}".format(meta_name, shpr_cd)] = meta_model
    namespace["stack_pred_{}_{}".format(meta_name, shpr_cd)] = stack_pred
    stacking_list[meta_name] = mean_absolute_error(y_test, stack_pred)

  stacking_model_mae = sorted(stacking_list.items(), key=lambda item: item[1])

  # Compare the best single model with the best stack and keep the lower MAE
  best_single_name, best_single_mae = single_model_mae[0]
  best_stack_name, best_stack_mae = stacking_model_mae[0]

  if best_single_mae < best_stack_mae:
    print("쇼핑몰 코드 {}의 최적 모델은 단일 모델 {} : (test MAE값) {}".format(shpr_cd, best_single_name, best_single_mae))
    namespace["best_model_{}".format(shpr_cd)] = best_single_name
    namespace["test_mae_{}".format(shpr_cd)] = best_single_mae
    namespace["best_pred_{}".format(shpr_cd)] = namespace["single_pred_{}_{}".format(best_single_name, shpr_cd)]
  else:
    print("쇼핑몰 코드 {}의 최적 모델은 stacking meta 모델 {} : (test MAE값) {}".format(shpr_cd, best_stack_name, best_stack_mae))
    namespace["best_model_{}".format(shpr_cd)] = best_stack_name
    namespace["test_mae_{}".format(shpr_cd)] = best_stack_mae
    namespace["best_pred_{}".format(shpr_cd)] = namespace["stack_pred_{}_{}".format(best_stack_name, shpr_cd)]


In [None]:
# Run the model selection for every shipper code: each call compares the MAE of
# the single models and the stacking models, prints the winner and stores its
# results in notebook globals.
for shpr_cd in shpr_df:
  get_optimal_model(str(shpr_cd))

쇼핑몰 코드 90001702의 최적 모델은 단일 모델 SVR : (test MAE값) 1.1246
쇼핑몰 코드 90001705의 최적 모델은 단일 모델 SVR : (test MAE값) 0.5524


In [None]:
# Expose each shipper's winning test-set prediction as pred_<code>.
for shpr_cd in shpr_df:
  # Kept so x_train / y_train / x_test / y_test remain defined as globals
  # (holding the last shipper's split) for any later cell that reads them.
  x_train, y_train, x_test, y_test = get_train_test_set(shpr_cd)
  # best_pred_<code> holds the prediction of whichever model won, single or
  # stacking. Looking up stack_pred_<best model name> instead would return a
  # stacking prediction even when a single model was selected.
  globals()['pred_{}'.format(shpr_cd)] = getattr(mod, 'best_pred_{}'.format(shpr_cd))

In [None]:
# Predictions on the test dataset, one column per shipper code
shpr_pred_df = pd.DataFrame(
    {str(code): getattr(mod, 'best_pred_{}'.format(code)) for code in shpr_df}
)

In [None]:
# Show the prediction table (one row per test sample, one column per shipper code)
shpr_pred_df

Unnamed: 0,90001702,90001705
0,1.16734,2.86213
1,2.09692,0.35961
2,5.5859,0.00567
3,2.51389,6.79639
4,2.4751,1.34246
5,2.70108,-0.15893
6,2.31776,1.80235
7,11.56999,3.87486
8,1.16899,1.46049
9,1.52973,2.74834


In [None]:
# Save the predictions to the shared drive as a UTF-8 CSV without the index column
shpr_pred_df.to_csv('/content/gdrive/Shareddrives/cj공모전/최종코드/모델링/예측값/shpr_max_예측값.csv', encoding = 'utf-8', index = False)