In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import statsmodels.formula.api as smf
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split

In [2]:
train_ = pd.read_csv('dataset/train.csv')
test_ = pd.read_csv('dataset/test.csv')

train_.head()

Unnamed: 0,id,Store,Date,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Weekly_Sales
0,1,1,05/02/2010,42.31,2.572,,,,,,8.106,False,1643690.9
1,2,1,12/02/2010,38.51,2.548,,,,,,8.106,True,1641957.44
2,3,1,19/02/2010,39.93,2.514,,,,,,8.106,False,1611968.17
3,4,1,26/02/2010,46.63,2.561,,,,,,8.106,False,1409727.59
4,5,1,05/03/2010,46.5,2.625,,,,,,8.106,False,1554806.68


In [3]:
test_.head()

Unnamed: 0,id,Store,Date,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday
0,1,1,05/10/2012,68.55,3.617,8077.89,,18.22,3617.43,3626.14,6.573,False
1,2,1,12/10/2012,62.99,3.601,2086.18,,8.11,602.36,5926.45,6.573,False
2,3,1,19/10/2012,67.97,3.594,950.33,,4.93,80.25,2312.85,6.573,False
3,4,1,26/10/2012,69.16,3.506,2585.85,31.75,6.0,1057.16,1305.01,6.573,False
4,5,2,05/10/2012,70.27,3.617,6037.76,,10.04,3027.37,3853.4,6.17,False


In [4]:
# 편한 가독성을 위해 연, 월, 일을 분리

def split_date(df):
    y = []
    m = []
    d = []
    
    for i in df['Date']:
        d_, m_, y_ = i.split('/')
        y.append(int(y_))
        m.append(int(m_))
        d.append(int(d_))
        
    df.insert(3, 'Y', y)
    df.insert(4, 'M', m)
    df.insert(5, 'D', d)
    
    return df

# holiday이면 1, 아니면 0

def holiday(df):
    tmp = []
    
    for i in df['IsHoliday']:
        if i == False:
            tmp.append(0)
        else:
            tmp.append(1)
            
    df.insert(15, 'Holiday', tmp)
    
    return df

In [5]:
train_ = split_date(train_)
train_ = holiday(train_)

train_ = train_.fillna(0)

train_.head()

Unnamed: 0,id,Store,Date,Y,M,D,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Holiday,Weekly_Sales
0,1,1,05/02/2010,2010,2,5,42.31,2.572,0.0,0.0,0.0,0.0,0.0,8.106,False,0,1643690.9
1,2,1,12/02/2010,2010,2,12,38.51,2.548,0.0,0.0,0.0,0.0,0.0,8.106,True,1,1641957.44
2,3,1,19/02/2010,2010,2,19,39.93,2.514,0.0,0.0,0.0,0.0,0.0,8.106,False,0,1611968.17
3,4,1,26/02/2010,2010,2,26,46.63,2.561,0.0,0.0,0.0,0.0,0.0,8.106,False,0,1409727.59
4,5,1,05/03/2010,2010,3,5,46.5,2.625,0.0,0.0,0.0,0.0,0.0,8.106,False,0,1554806.68


In [6]:
test_ = split_date(test_)
test_ = holiday(test_)

test_ = test_.fillna(0)

test_.head()

Unnamed: 0,id,Store,Date,Y,M,D,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Holiday
0,1,1,05/10/2012,2012,10,5,68.55,3.617,8077.89,0.0,18.22,3617.43,3626.14,6.573,False,0
1,2,1,12/10/2012,2012,10,12,62.99,3.601,2086.18,0.0,8.11,602.36,5926.45,6.573,False,0
2,3,1,19/10/2012,2012,10,19,67.97,3.594,950.33,0.0,4.93,80.25,2312.85,6.573,False,0
3,4,1,26/10/2012,2012,10,26,69.16,3.506,2585.85,31.75,6.0,1057.16,1305.01,6.573,False,0
4,5,2,05/10/2012,2012,10,5,70.27,3.617,6037.76,0.0,10.04,3027.37,3853.4,6.17,False,0


In [7]:
# Promotion들에서 이상치가 너무 많다고 판단, 적절한 scaling 방식을 취할 필요가 있음
# Robust, Standard를 실행해 볼 계획

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

In [8]:
# Robust부터 실행

scaler = RobustScaler()

features = train_.iloc[:, 8:13]
colnames = features.columns + ['_Scaled']

features_scaled = pd.DataFrame(scaler.fit_transform(features), columns = colnames)

train_robust = pd.concat([train_, features_scaled], axis = 1)
train_robust.drop(['Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5', 'id', 'Date', 'IsHoliday'], axis = 1, inplace = True)
train_robust

Unnamed: 0,Store,Y,M,D,Temperature,Fuel_Price,Unemployment,Holiday,Weekly_Sales,Promotion1_Scaled,Promotion2_Scaled,Promotion3_Scaled,Promotion4_Scaled,Promotion5_Scaled
0,1,2010,2,5,42.31,2.572,8.106,0,1643690.90,0.000000,0.0,0.000000,0.000000,0.000000
1,1,2010,2,12,38.51,2.548,8.106,1,1641957.44,0.000000,0.0,0.000000,0.000000,0.000000
2,1,2010,2,19,39.93,2.514,8.106,0,1611968.17,0.000000,0.0,0.000000,0.000000,0.000000
3,1,2010,2,26,46.63,2.561,8.106,0,1409727.59,0.000000,0.0,0.000000,0.000000,0.000000
4,1,2010,3,5,46.50,2.625,8.106,0,1554806.68,0.000000,0.0,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,45,2012,8,31,75.09,3.867,8.684,0,734297.87,12.335020,60.0,33.248658,36.566951,2.226726
6251,45,2012,9,7,75.70,3.911,8.684,1,766512.66,5.752087,128.0,18.830054,9.705248,1.146626
6252,45,2012,9,14,67.87,3.948,8.684,0,702238.27,5.952181,0.0,1.538462,17.904453,2.938893
6253,45,2012,9,21,65.32,4.038,8.684,0,723086.20,4.409997,922.8,22.626118,12.434619,4.836166


In [9]:
features = test_.iloc[:, 8:13]
colnames = features.columns + ['_Scaled']

features_scaled = pd.DataFrame(scaler.fit_transform(features), columns = colnames)

test_robust = pd.concat([test_, features_scaled], axis = 1)
test_robust.drop(['Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5', 'id', 'Date', 'IsHoliday'], axis = 1, inplace = True)
test_robust

Unnamed: 0,Store,Y,M,D,Temperature,Fuel_Price,Unemployment,Holiday,Promotion1_Scaled,Promotion2_Scaled,Promotion3_Scaled,Promotion4_Scaled,Promotion5_Scaled
0,1,2012,10,5,68.55,3.617,6.573,0,1.392288,0.00,0.456136,2.640406,0.259693
1,1,2012,10,12,62.99,3.601,6.573,0,-0.062816,0.00,0.002468,0.065965,0.987452
2,1,2012,10,19,67.97,3.594,6.573,0,-0.338661,0.00,-0.140229,-0.379843,-0.155799
3,1,2012,10,26,69.16,3.506,6.573,0,0.058530,31.75,-0.092214,0.454299,-0.474653
4,2,2012,10,5,70.27,3.617,6.170,0,0.896837,0.00,0.089073,2.136579,0.331592
...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,44,2012,10,26,46.97,3.755,5.217,0,-0.560065,2.61,-0.317478,-0.448365,-0.742708
176,45,2012,10,5,64.89,3.985,8.667,0,0.656164,0.00,0.483060,1.475744,-0.147206
177,45,2012,10,12,54.47,4.000,8.667,0,-0.094363,0.00,-0.007404,0.063369,0.374980
178,45,2012,10,19,56.47,3.969,8.667,0,-0.082769,0.00,-0.218757,-0.074606,-0.401103


In [10]:
# Regression에서 의미 없는 변수 제거
# 변수 제거 순서 : D, Holiday, M

res = smf.ols(formula = 'Weekly_Sales ~ Y + Temperature + Fuel_Price + \
              Promotion1_Scaled + Promotion2_Scaled + Promotion3_Scaled + Promotion4_Scaled + Promotion5_Scaled + \
                Unemployment', data = train_robust).fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:           Weekly_Sales   R-squared:                       0.109
Model:                            OLS   Adj. R-squared:                  0.108
Method:                 Least Squares   F-statistic:                     85.18
Date:                Mon, 25 Jul 2022   Prob (F-statistic):          6.05e-150
Time:                        16:16:06   Log-Likelihood:                -91363.
No. Observations:                6255   AIC:                         1.827e+05
Df Residuals:                    6245   BIC:                         1.828e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept          5.744e+08   3.32e+0

In [11]:
model = LinearRegression()

X = train_robust[['Y', 'Temperature', 'Fuel_Price', 'Promotion1_Scaled', 'Promotion2_Scaled', \
                 'Promotion3_Scaled', 'Promotion4_Scaled', 'Promotion5_Scaled', 'Unemployment']]
Y = train_robust['Weekly_Sales']

x = test_robust[['Y', 'Temperature', 'Fuel_Price', 'Promotion1_Scaled', 'Promotion2_Scaled', \
                 'Promotion3_Scaled', 'Promotion4_Scaled', 'Promotion5_Scaled', 'Unemployment']]

X.head(5)

Unnamed: 0,Y,Temperature,Fuel_Price,Promotion1_Scaled,Promotion2_Scaled,Promotion3_Scaled,Promotion4_Scaled,Promotion5_Scaled,Unemployment
0,2010,42.31,2.572,0.0,0.0,0.0,0.0,0.0,8.106
1,2010,38.51,2.548,0.0,0.0,0.0,0.0,0.0,8.106
2,2010,39.93,2.514,0.0,0.0,0.0,0.0,0.0,8.106
3,2010,46.63,2.561,0.0,0.0,0.0,0.0,0.0,8.106
4,2010,46.5,2.625,0.0,0.0,0.0,0.0,0.0,8.106


In [12]:
Y.head(5)

0    1643690.90
1    1641957.44
2    1611968.17
3    1409727.59
4    1554806.68
Name: Weekly_Sales, dtype: float64

In [13]:
x.head(5)

Unnamed: 0,Y,Temperature,Fuel_Price,Promotion1_Scaled,Promotion2_Scaled,Promotion3_Scaled,Promotion4_Scaled,Promotion5_Scaled,Unemployment
0,2012,68.55,3.617,1.392288,0.0,0.456136,2.640406,0.259693,6.573
1,2012,62.99,3.601,-0.062816,0.0,0.002468,0.065965,0.987452,6.573
2,2012,67.97,3.594,-0.338661,0.0,-0.140229,-0.379843,-0.155799,6.573
3,2012,69.16,3.506,0.05853,31.75,-0.092214,0.454299,-0.474653,6.573
4,2012,70.27,3.617,0.896837,0.0,0.089073,2.136579,0.331592,6.17


In [15]:
# 어짜피 성능 체크 상관없이 전부 제출할 계획이라
# 굳이 train_test_split 할 필요 없이
# train 데이터를 전부 학습에 사용해도 무관하다고 판단

# def rmse(predictions, targets):
#     return np.sqrt(((predictions - targets) ** 2).mean())

# train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size = 0.2, random_state = 908)

# print(len(train_x), len(test_x), len(train_y), len(test_y))

In [16]:
model.fit(X, Y)

LinearRegression()

In [17]:
y = model.predict(x)
y

array([ 850076.94006288,  812099.70440507,  749586.98295891,
        730754.35520124,  845519.07168782,  886272.57668531,
        810051.81764638,  810341.2678653 ,  763538.89573789,
        746066.34787226,  772705.1206826 ,  710970.91930652,
        984939.15917075,  966384.33623385,  976029.37508965,
        921431.69766283,  811144.57648969,  821928.99172735,
        778390.49745893,  758611.28203046,  948139.79650247,
        895103.66446948,  829285.16614044,  839267.62940764,
        782507.13155854,  745067.70071161,  805499.93657303,
        760520.16238534,  852769.92561615,  853215.25327015,
        834481.35423148,  803691.92848003,  836283.85954201,
        829541.22798121,  824140.51554   ,  781622.61756074,
       1006952.81964004, 1049039.16598105,  986734.5238446 ,
       1002618.61413813,  865217.74739599,  918616.68473339,
        789046.92603767,  799537.31788111,  776394.75208056,
        858051.41449785,  830088.07225382,  961666.21768653,
        981567.28907084,

In [22]:
df_robust = pd.DataFrame({'id' : list(range(1, len(y) + 1)), 'Weekly_Sales' : y})
df_robust.set_index('id', inplace = True)
df_robust

Unnamed: 0_level_0,Weekly_Sales
id,Unnamed: 1_level_1
1,850076.940063
2,812099.704405
3,749586.982959
4,730754.355201
5,845519.071688
...,...
176,836440.872487
177,812193.377369
178,811149.716889
179,772244.278307


In [23]:
df_robust.to_csv('./result/LR_Robust.csv', encoding = 'utf-8-sig')

In [8]:
# Standard도 실행

scaler = StandardScaler()

features = train_.iloc[:, 8:13]
colnames = features.columns + ['_Scaled']

features_scaled = pd.DataFrame(scaler.fit_transform(features), columns = colnames)

train_standard = pd.concat([train_, features_scaled], axis = 1)
train_standard.drop(['Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5', 'id', 'Date', 'IsHoliday'], axis = 1, inplace = True)
train_standard

Unnamed: 0,Store,Y,M,D,Temperature,Fuel_Price,Unemployment,Holiday,Weekly_Sales,Promotion1_Scaled,Promotion2_Scaled,Promotion3_Scaled,Promotion4_Scaled,Promotion5_Scaled
0,1,2010,2,5,42.31,2.572,8.106,0,1643690.90,-0.405482,-0.170211,-0.082414,-0.265323,-0.371303
1,1,2010,2,12,38.51,2.548,8.106,1,1641957.44,-0.405482,-0.170211,-0.082414,-0.265323,-0.371303
2,1,2010,2,19,39.93,2.514,8.106,0,1611968.17,-0.405482,-0.170211,-0.082414,-0.265323,-0.371303
3,1,2010,2,26,46.63,2.561,8.106,0,1409727.59,-0.405482,-0.170211,-0.082414,-0.265323,-0.371303
4,1,2010,3,5,46.50,2.625,8.106,0,1554806.68,-0.405482,-0.170211,-0.082414,-0.265323,-0.371303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,45,2012,8,31,75.09,3.867,8.684,0,734297.87,3.580831,-0.168998,-0.065037,1.574261,0.598779
6251,45,2012,9,7,75.70,3.911,8.684,1,766512.66,1.453422,-0.167623,-0.072573,0.222921,0.128229
6252,45,2012,9,14,67.87,3.948,8.684,0,702238.27,1.518086,-0.170211,-0.081610,0.635401,0.909037
6253,45,2012,9,21,65.32,4.038,8.684,0,723086.20,1.019698,-0.151556,-0.070589,0.360229,1.735591


In [9]:
features = test_.iloc[:, 8:13]
colnames = features.columns + ['_Scaled']

features_scaled = pd.DataFrame(scaler.fit_transform(features), columns = colnames)

test_standard = pd.concat([test_, features_scaled], axis = 1)
test_standard.drop(['Promotion1', 'Promotion2', 'Promotion3', 'Promotion4', 'Promotion5', 'id', 'Date', 'IsHoliday'], axis = 1, inplace = True)
test_standard

Unnamed: 0,Store,Y,M,D,Temperature,Fuel_Price,Unemployment,Holiday,Promotion1_Scaled,Promotion2_Scaled,Promotion3_Scaled,Promotion4_Scaled,Promotion5_Scaled
0,1,2012,10,5,68.55,3.617,6.573,0,1.576197,-0.368901,-0.198291,1.726332,0.060872
1,1,2012,10,12,62.99,3.601,6.573,0,-0.388945,-0.368901,-0.418291,-0.294492,0.862308
2,1,2012,10,19,67.97,3.594,6.573,0,-0.761478,-0.368901,-0.487490,-0.644432,-0.396683
3,1,2012,10,26,69.16,3.506,6.573,0,-0.225065,0.355530,-0.464206,0.010333,-0.747818
4,2,2012,10,5,70.27,3.617,6.170,0,0.907081,-0.368901,-0.376293,1.330850,0.140050
...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,44,2012,10,26,46.97,3.755,5.217,0,-1.060488,-0.309349,-0.573445,-0.698219,-1.043010
176,45,2012,10,5,64.89,3.985,8.667,0,0.582050,-0.368901,-0.185234,0.812123,-0.387220
177,45,2012,10,12,54.47,4.000,8.667,0,-0.431550,-0.368901,-0.423079,-0.296530,0.187830
178,45,2012,10,19,56.47,3.969,8.667,0,-0.415892,-0.368901,-0.525572,-0.404834,-0.666821


In [10]:
# Regression에서 의미 없는 변수 제거
# 변수 제거 순서 : D, Holiday, M

res = smf.ols(formula = 'Weekly_Sales ~ Y + Temperature + Fuel_Price + \
              Promotion1_Scaled + Promotion2_Scaled + Promotion3_Scaled + Promotion4_Scaled + Promotion5_Scaled + \
                Unemployment', data = train_standard).fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:           Weekly_Sales   R-squared:                       0.109
Model:                            OLS   Adj. R-squared:                  0.108
Method:                 Least Squares   F-statistic:                     85.18
Date:                Mon, 25 Jul 2022   Prob (F-statistic):          6.05e-150
Time:                        16:28:21   Log-Likelihood:                -91363.
No. Observations:                6255   AIC:                         1.827e+05
Df Residuals:                    6245   BIC:                         1.828e+05
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept          5.744e+08   3.32e+0

In [11]:
model = LinearRegression()

X = train_standard[['Y', 'Temperature', 'Fuel_Price', 'Promotion1_Scaled', 'Promotion2_Scaled', \
                 'Promotion3_Scaled', 'Promotion4_Scaled', 'Promotion5_Scaled', 'Unemployment']]
Y = train_standard['Weekly_Sales']

x = test_standard[['Y', 'Temperature', 'Fuel_Price', 'Promotion1_Scaled', 'Promotion2_Scaled', \
                 'Promotion3_Scaled', 'Promotion4_Scaled', 'Promotion5_Scaled', 'Unemployment']]

X.head(5)

Unnamed: 0,Y,Temperature,Fuel_Price,Promotion1_Scaled,Promotion2_Scaled,Promotion3_Scaled,Promotion4_Scaled,Promotion5_Scaled,Unemployment
0,2010,42.31,2.572,-0.405482,-0.170211,-0.082414,-0.265323,-0.371303,8.106
1,2010,38.51,2.548,-0.405482,-0.170211,-0.082414,-0.265323,-0.371303,8.106
2,2010,39.93,2.514,-0.405482,-0.170211,-0.082414,-0.265323,-0.371303,8.106
3,2010,46.63,2.561,-0.405482,-0.170211,-0.082414,-0.265323,-0.371303,8.106
4,2010,46.5,2.625,-0.405482,-0.170211,-0.082414,-0.265323,-0.371303,8.106


In [12]:
Y.head(5)

0    1643690.90
1    1641957.44
2    1611968.17
3    1409727.59
4    1554806.68
Name: Weekly_Sales, dtype: float64

In [13]:
x.head(5)

Unnamed: 0,Y,Temperature,Fuel_Price,Promotion1_Scaled,Promotion2_Scaled,Promotion3_Scaled,Promotion4_Scaled,Promotion5_Scaled,Unemployment
0,2012,68.55,3.617,1.576197,-0.368901,-0.198291,1.726332,0.060872,6.573
1,2012,62.99,3.601,-0.388945,-0.368901,-0.418291,-0.294492,0.862308,6.573
2,2012,67.97,3.594,-0.761478,-0.368901,-0.48749,-0.644432,-0.396683,6.573
3,2012,69.16,3.506,-0.225065,0.35553,-0.464206,0.010333,-0.747818,6.573
4,2012,70.27,3.617,0.907081,-0.368901,-0.376293,1.33085,0.14005,6.17


In [14]:
model.fit(X, Y)

LinearRegression()

In [15]:
y = model.predict(x)
y

array([1036643.41473305,  861461.13327944,  695514.84674859,
        724376.1486429 ,  960202.85334218, 1055913.06720936,
        859044.58195651, 1319560.64713669,  698257.15010607,
        637864.32814384,  723182.57726514,  600584.88537359,
       1298346.58311772, 1178983.89370859, 1157031.14850295,
       1144801.81988525,  775139.53509009,  797594.95526123,
        687917.40808749,  688583.00362027, 1265957.42516315,
        997757.12335074,  825857.70918787, 1022645.53337717,
        750474.59659946,  626324.85100162,  815703.51981497,
        762376.72458005,  895607.27364731,  839924.54420829,
        812798.79355073,  812730.19302666,  809658.51856351,
        772193.18761301,  774712.56025791,  707194.91003263,
       1217920.207214  , 1148716.21283555, 1069145.78512168,
       1386927.34079945, 1043900.16078615, 1135656.35106325,
        994527.33550453, 1183846.41736877,  947058.12209451,
        878360.20250928,  801135.56875968, 1531278.47436261,
       1226992.71705627,

In [16]:
df_standard = pd.DataFrame({'id' : list(range(1, len(y) + 1)), 'Weekly_Sales' : y})
df_standard.set_index('id', inplace = True)
df_standard

Unnamed: 0_level_0,Weekly_Sales
id,Unnamed: 1_level_1
1,1.036643e+06
2,8.614611e+05
3,6.955148e+05
4,7.243761e+05
5,9.602029e+05
...,...
176,7.120725e+05
177,8.889514e+05
178,8.196976e+05
179,7.338887e+05


In [17]:
df_standard.to_csv('./result/LR_Standard.csv', encoding = 'utf-8-sig')