In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import statsmodels.formula.api as smf
import statsmodels.api as sm

import xgboost as xgb
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
train_ = pd.read_csv('dataset/train.csv')
test_ = pd.read_csv('dataset/test.csv')

train_.head()

Unnamed: 0,id,Store,Date,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Weekly_Sales
0,1,1,05/02/2010,42.31,2.572,,,,,,8.106,False,1643690.9
1,2,1,12/02/2010,38.51,2.548,,,,,,8.106,True,1641957.44
2,3,1,19/02/2010,39.93,2.514,,,,,,8.106,False,1611968.17
3,4,1,26/02/2010,46.63,2.561,,,,,,8.106,False,1409727.59
4,5,1,05/03/2010,46.5,2.625,,,,,,8.106,False,1554806.68


In [3]:
# 편한 가독성을 위해 연, 월, 일을 분리

def split_date(df):
    y = []
    m = []
    d = []
    
    for i in df['Date']:
        d_, m_, y_ = i.split('/')
        y.append(int(y_))
        m.append(int(m_))
        d.append(int(d_))
        
    df.insert(3, 'Y', y)
    df.insert(4, 'M', m)
    df.insert(5, 'D', d)
    
    return df

# holiday이면 1, 아니면 0

def holiday(df):
    tmp = []
    
    for i in df['IsHoliday']:
        if i == False:
            tmp.append(0)
        else:
            tmp.append(1)
            
    df.insert(15, 'Holiday', tmp)
    
    return df

In [4]:
train_ = split_date(train_)
train_ = holiday(train_)

train_ = train_.fillna(0)

test_ = split_date(test_)
test_ = holiday(test_)

test_ = test_.fillna(0)

train_.head()

Unnamed: 0,id,Store,Date,Y,M,D,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Holiday,Weekly_Sales
0,1,1,05/02/2010,2010,2,5,42.31,2.572,0.0,0.0,0.0,0.0,0.0,8.106,False,0,1643690.9
1,2,1,12/02/2010,2010,2,12,38.51,2.548,0.0,0.0,0.0,0.0,0.0,8.106,True,1,1641957.44
2,3,1,19/02/2010,2010,2,19,39.93,2.514,0.0,0.0,0.0,0.0,0.0,8.106,False,0,1611968.17
3,4,1,26/02/2010,2010,2,26,46.63,2.561,0.0,0.0,0.0,0.0,0.0,8.106,False,0,1409727.59
4,5,1,05/03/2010,2010,3,5,46.5,2.625,0.0,0.0,0.0,0.0,0.0,8.106,False,0,1554806.68


In [5]:
test_.head()

Unnamed: 0,id,Store,Date,Y,M,D,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Holiday
0,1,1,05/10/2012,2012,10,5,68.55,3.617,8077.89,0.0,18.22,3617.43,3626.14,6.573,False,0
1,2,1,12/10/2012,2012,10,12,62.99,3.601,2086.18,0.0,8.11,602.36,5926.45,6.573,False,0
2,3,1,19/10/2012,2012,10,19,67.97,3.594,950.33,0.0,4.93,80.25,2312.85,6.573,False,0
3,4,1,26/10/2012,2012,10,26,69.16,3.506,2585.85,31.75,6.0,1057.16,1305.01,6.573,False,0
4,5,2,05/10/2012,2012,10,5,70.27,3.617,6037.76,0.0,10.04,3027.37,3853.4,6.17,False,0


## XGBoost + Robust

In [6]:
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

In [7]:
# Robust부터 실행

scaler = RobustScaler()

features = train_.iloc[:, 8:13]
colnames = features.columns + ['_Scaled']

features_scaled = pd.DataFrame(scaler.fit_transform(features), columns = colnames)

train_robust = pd.concat([train_, features_scaled], axis = 1)
train_robust.drop(['Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5', 'id', 'Date', 'IsHoliday'], axis = 1, inplace = True)

features = test_.iloc[:, 8:13]
colnames = features.columns + ['_Scaled']

features_scaled = pd.DataFrame(scaler.fit_transform(features), columns = colnames)

test_robust = pd.concat([test_, features_scaled], axis = 1)
test_robust.drop(['Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5', 'id', 'Date', 'IsHoliday'], axis = 1, inplace = True)

In [8]:
X = train_robust.drop('Weekly_Sales', axis = 1)
Y = train_robust['Weekly_Sales']

train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size = 0.2, random_state = 908)

In [9]:
# 해당 값을 계속 변경하면서 최적의 파라미터를 찾는다

# floating error로 인해 수동으로 learning rate 설정
lr = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2]
depth = [5, 6, 7, 8]
min_mse = 0

for i in lr:
    for j in depth:
        model = XGBRegressor(n_estimators = 100, learning_rate = i, gamma = 0, subsample = 0.8, 
                             colsample_bytree = 1, max_depth = j)

        model.fit(train_x, train_y)

        pred = model.predict(test_x)
        mse = mean_squared_error(pred, test_y)
        print('Learning Rate : {}, Depth : {}, MSE : {}'.format(i, j, mse), end = ' ')
        
        if (min_mse > mse) | (min_mse == 0):
            min_mse = mse
            print('Minimum MSE Updated')
        else:
            print('')

Learning Rate : 0.01, Depth : 5, MSE : 301268404797.57306 Minimum MSE Updated
Learning Rate : 0.01, Depth : 6, MSE : 277058303756.9519 Minimum MSE Updated
Learning Rate : 0.01, Depth : 7, MSE : 258180499936.73572 Minimum MSE Updated
Learning Rate : 0.01, Depth : 8, MSE : 238354493116.95795 Minimum MSE Updated
Learning Rate : 0.02, Depth : 5, MSE : 107899248584.13124 Minimum MSE Updated
Learning Rate : 0.02, Depth : 6, MSE : 86179544100.73404 Minimum MSE Updated
Learning Rate : 0.02, Depth : 7, MSE : 76263929141.721 Minimum MSE Updated
Learning Rate : 0.02, Depth : 8, MSE : 62085737245.114334 Minimum MSE Updated
Learning Rate : 0.03, Depth : 5, MSE : 65365767181.20758 
Learning Rate : 0.03, Depth : 6, MSE : 42250014748.42494 Minimum MSE Updated
Learning Rate : 0.03, Depth : 7, MSE : 33150633608.588615 Minimum MSE Updated
Learning Rate : 0.03, Depth : 8, MSE : 26890453115.40651 Minimum MSE Updated
Learning Rate : 0.04, Depth : 5, MSE : 42887421033.50426 
Learning Rate : 0.04, Depth : 6, 

In [10]:
# lr = 0.14, depth = 8일때로 시도해보기

model = XGBRegressor(n_estimators = 100, learning_rate = 0.14, gamma = 0, 
                     subsample = 0.8, colsample_bytree = 1, max_depth = 8)

model.fit(train_x, train_y)

y = model.predict(test_robust)
y

array([1473718.5 , 1425116.6 , 1433519.2 , 1365167.6 , 1656738.5 ,
       1757799.6 , 1612071.4 , 1544623.1 ,  727432.9 ,  673706.3 ,
        665236.25,  659205.25, 2183056.8 , 2138862.2 , 2175711.8 ,
       2177997.5 ,  608568.8 ,  557174.94,  555559.56,  556345.  ,
       1435120.5 , 1372534.  , 1382885.8 , 1363810.6 ,  809967.5 ,
        775870.9 ,  791111.56,  879424.44, 1412258.1 , 1335155.2 ,
       1361801.  , 1335413.  , 1155598.  , 1114955.6 , 1107401.5 ,
       1086934.2 , 1675533.1 , 1679979.1 , 1646272.4 , 1677231.9 ,
       1369648.4 , 1289149.8 , 1322169.1 , 1302233.4 , 1455792.5 ,
       1470380.9 , 1434483.9 , 1491019.5 , 1938895.2 , 1748558.4 ,
       1880876.6 , 1962102.1 , 1903849.9 , 1835492.8 , 1840262.5 ,
       1883645.2 ,  613373.2 ,  607098.3 ,  577088.56,  675565.6 ,
        539957.9 ,  527559.8 ,  486708.62,  549661.94,  951764.3 ,
        911130.44,  912085.2 ,  938025.6 , 1021800.3 , 1003580.56,
       1002515.06, 1111268.9 , 1377063.8 , 1361613.5 , 1330710

In [11]:
df_robust = pd.DataFrame({'id' : list(range(1, len(y) + 1)), 'Weekly_Sales' : y})
df_robust.set_index('id', inplace = True)
df_robust

Unnamed: 0_level_0,Weekly_Sales
id,Unnamed: 1_level_1
1,1.473718e+06
2,1.425117e+06
3,1.433519e+06
4,1.365168e+06
5,1.656738e+06
...,...
176,3.751719e+05
177,7.350537e+05
178,7.145128e+05
179,7.402311e+05


In [12]:
df_robust.to_csv('./result/XGBoost_Robust.csv', encoding = 'utf-8-sig')

## XGBoost + Standard

In [13]:
# Standard도 실행

scaler = StandardScaler()

features = train_.iloc[:, 8:13]
colnames = features.columns + ['_Scaled']

features_scaled = pd.DataFrame(scaler.fit_transform(features), columns = colnames)

train_standard = pd.concat([train_, features_scaled], axis = 1)
train_standard.drop(['Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5', 'id', 'Date', 'IsHoliday'], axis = 1, inplace = True)

features = test_.iloc[:, 8:13]
colnames = features.columns + ['_Scaled']

features_scaled = pd.DataFrame(scaler.fit_transform(features), columns = colnames)

test_standard = pd.concat([test_, features_scaled], axis = 1)
test_standard.drop(['Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5', 'id', 'Date', 'IsHoliday'], axis = 1, inplace = True)

In [14]:
X = train_standard.drop('Weekly_Sales', axis = 1)
Y = train_standard['Weekly_Sales']

train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size = 0.2, random_state = 908)

In [15]:
# 해당 값을 계속 변경하면서 최적의 파라미터를 찾는다

# floating error로 인해 수동으로 learning rate 설정
lr = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2]
depth = [5, 6, 7, 8]
min_mse = 0

for i in lr:
    for j in depth:
        model = XGBRegressor(n_estimators = 100, learning_rate = i, gamma = 0, subsample = 0.8, 
                             colsample_bytree = 1, max_depth = j)

        model.fit(train_x, train_y)

        pred = model.predict(test_x)
        mse = mean_squared_error(pred, test_y)
        print('Learning Rate : {}, Depth : {}, MSE : {}'.format(i, j, mse), end = ' ')
        
        if (min_mse > mse) | (min_mse == 0):
            min_mse = mse
            print('Minimum MSE Updated')
        else:
            print('')

Learning Rate : 0.01, Depth : 5, MSE : 301268404797.57306 Minimum MSE Updated
Learning Rate : 0.01, Depth : 6, MSE : 277058303756.9519 Minimum MSE Updated
Learning Rate : 0.01, Depth : 7, MSE : 258180499936.73572 Minimum MSE Updated
Learning Rate : 0.01, Depth : 8, MSE : 238354493116.95795 Minimum MSE Updated
Learning Rate : 0.02, Depth : 5, MSE : 107899248584.13124 Minimum MSE Updated
Learning Rate : 0.02, Depth : 6, MSE : 86179544100.73404 Minimum MSE Updated
Learning Rate : 0.02, Depth : 7, MSE : 76263929141.721 Minimum MSE Updated
Learning Rate : 0.02, Depth : 8, MSE : 62085737245.114334 Minimum MSE Updated
Learning Rate : 0.03, Depth : 5, MSE : 65365767181.20758 
Learning Rate : 0.03, Depth : 6, MSE : 42250014748.42494 Minimum MSE Updated
Learning Rate : 0.03, Depth : 7, MSE : 33150633608.588615 Minimum MSE Updated
Learning Rate : 0.03, Depth : 8, MSE : 26890453115.40651 Minimum MSE Updated
Learning Rate : 0.04, Depth : 5, MSE : 42886341147.31372 
Learning Rate : 0.04, Depth : 6, 

In [16]:
# lr = 0.14, depth = 8일때로 시도해보기

model = XGBRegressor(n_estimators = 100, learning_rate = 0.14, gamma = 0, 
                     subsample = 0.8, colsample_bytree = 1, max_depth = 8)

model.fit(train_x, train_y)

y = model.predict(test_robust)
y

array([1650051.4 , 1638193.9 , 1432106.6 , 1355794.8 , 1736729.  ,
       1733513.8 , 1688696.6 , 1572382.6 ,  748042.06,  681171.  ,
        657026.75,  667266.1 , 2118762.8 , 2270829.8 , 2274044.8 ,
       2115593.8 ,  624795.44,  582978.44,  557286.2 ,  563031.94,
       1601403.6 , 1357845.1 , 1423166.  , 1387543.  ,  855627.56,
        780169.2 ,  846051.8 ,  908176.4 , 1312064.2 , 1194961.6 ,
       1348886.1 , 1337046.5 , 1179043.1 , 1125832.  , 1127298.4 ,
       1130865.1 , 1825865.5 , 1647197.6 , 1630383.5 , 1671581.1 ,
       1569413.6 , 1430478.1 , 1441143.4 , 1465914.1 , 1385199.4 ,
       1409291.9 , 1450689.8 , 1419448.1 , 2078457.4 , 1821550.2 ,
       1884532.9 , 2248381.5 , 1708862.8 , 1772087.2 , 1707658.1 ,
       1823896.4 ,  586199.3 ,  595962.06,  568302.9 ,  787969.4 ,
        670752.75,  548785.8 ,  494343.2 ,  556181.6 ,  929706.1 ,
        930826.1 ,  910979.75,  952756.  , 1192837.5 , 1100165.1 ,
        991723.3 , 1318646.2 , 1520063.8 , 1389766.5 , 1422869

In [17]:
df_standard = pd.DataFrame({'id' : list(range(1, len(y) + 1)), 'Weekly_Sales' : y})
df_standard.set_index('id', inplace = True)
df_standard

Unnamed: 0_level_0,Weekly_Sales
id,Unnamed: 1_level_1
1,1.650051e+06
2,1.638194e+06
3,1.432107e+06
4,1.355795e+06
5,1.736729e+06
...,...
176,3.585702e+05
177,9.133026e+05
178,8.030437e+05
179,7.203064e+05


In [18]:
df_standard.to_csv('./result/XGBoost_Standard.csv', encoding = 'utf-8-sig')