----
# I. Data preprocessing

## 1. Read dataset

In [1]:
import pandas as pd

# Read the three competition files, in order: training set, test set and the
# sample submission file.
train, test, submission = map(
    pd.read_csv,
    ("dataset/train.csv", "dataset/test.csv", "dataset/sample_submission.csv"),
)

## 2. Missing values handling

### a) `train.csv`

In [2]:
# Keep only the rows in which every promotion column is present.
# A single dropna(subset=...) replaces the five chained boolean filters, and the
# explicit .copy() makes `train` an independent frame, so the column assignments
# performed in later cells do not operate on a filtered slice (which can trigger
# SettingWithCopyWarning in pandas versions without copy-on-write).
promotion_columns = [f"Promotion{i}" for i in range(1, 6)]
train = train.dropna(subset=promotion_columns).copy()
train

Unnamed: 0,id,Store,Date,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Weekly_Sales
92,93,1,11/11/2011,59.11,3.297,10382.90,6115.67,215.07,2406.62,6551.42,7.866,False,1594938.89
93,94,1,18/11/2011,62.25,3.308,6074.12,254.39,51.98,427.39,5988.57,7.866,False,1539483.70
94,95,1,25/11/2011,60.14,3.236,410.31,98.00,55805.51,8.00,554.92,7.866,True,2033320.66
95,96,1,02/12/2011,48.91,3.172,5629.51,68.00,1398.11,2084.64,20475.32,7.866,False,1584083.95
96,97,1,09/12/2011,43.93,3.158,4640.65,19.00,105.02,3639.42,14461.82,7.866,False,1799682.38
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6249,6250,45,24/08/2012,72.62,3.834,7936.20,58.38,22.00,5518.07,2291.97,8.684,False,718232.26
6250,6251,45,31/08/2012,75.09,3.867,23641.30,6.00,92.93,6988.31,3992.13,8.684,False,734297.87
6251,6252,45,07/09/2012,75.70,3.911,11024.45,12.80,52.63,1854.77,2055.70,8.684,True,766512.66
6253,6254,45,21/09/2012,65.32,4.038,8452.20,92.28,63.24,2376.38,8670.40,8.684,False,723086.20


### b) `test.csv`

In [3]:
# Impute missing promotion values in the test set with the mean of the same
# column, computed on the test set itself.
means = {f"Promotion{i}": test[f"Promotion{i}"].mean() for i in range(1, 6)}
test = test.fillna(value=means)

## 3. Data manipulation

### a) `Date`

In [4]:
import datetime as dt

def date_to_week(date):
    """Return the number of whole weeks elapsed between 2010-02-05 and `date`.

    `date` is a "day/month/year" string whose three fields are integers.
    The result is floored, so dates before the reference date give negative
    week numbers.
    """
    day, month, year = (int(field) for field in date.split('/'))
    elapsed = dt.datetime(year, month, day) - dt.datetime(2010, 2, 5)
    weeks, _ = divmod(elapsed.days, 7)
    return weeks


def date_split(date):
    """Split a '/'-separated date string into a tuple of ints, keeping field order.

    For the "day/month/year" strings used in this notebook the result is
    (day, month, year).
    """
    return tuple(int(field) for field in date.split('/'))


# Derive integer Day / Month / Year columns from the "day/month/year" Date strings.
# Each date is parsed once per frame (previously every date was re-parsed three
# times), and one loop handles both frames so train and test cannot drift apart.
# A week-index feature is available through date_to_week but is currently unused:
#   frame["Week"] = frame["Date"].apply(date_to_week)
for frame in (train, test):
    date_parts = frame["Date"].apply(date_split)
    for position, column in enumerate(("Day", "Month", "Year")):
        # Bind `position` as a default argument so each lambda keeps its own index.
        frame[column] = date_parts.apply(lambda parts, position=position: parts[position])

### b) `IsHoliday`

In [5]:
# Convert the IsHoliday flag to an integer in both the train and the test frame.
for frame in (train, test):
    frame["IsHoliday"] = frame["IsHoliday"].apply(int)

### c) `Store`

In [6]:
# One-hot encode the store id.
# Both frames are encoded against the same explicit list of store ids (the sorted
# union of the ids present in either frame). Encoding each frame independently
# only yields matching columns when both contain exactly the same stores;
# otherwise the model would be fitted and asked to predict on different features.
store_ids = sorted(pd.concat([train["Store"], test["Store"]]).dropna().unique())

# train
train = pd.get_dummies(
    data=train.assign(Store=pd.Categorical(train["Store"], categories=store_ids)),
    columns=["Store"],
)

# test
test = pd.get_dummies(
    data=test.assign(Store=pd.Categorical(test["Store"], categories=store_ids)),
    columns=["Store"],
)

### d) `Promotion1`, ... , `Promotion5`

In [7]:
# Scaling
from sklearn.preprocessing import RobustScaler


# Scale the five promotion columns with a RobustScaler.
# The scaler is fitted on the training frame only and then applied, unchanged,
# to the test frame.
promos = [f"Promotion{n}" for n in range(1, 6)]

scaler = RobustScaler()
train[promos] = scaler.fit_transform(train[promos])
test[promos] = scaler.transform(test[promos])

### e) `Weekly_Sales` 

In [8]:
import numpy as np

# Put the target on a log scale: Weekly_Sales becomes log(1 + Weekly_Sales).
# NOTE: not idempotent - running this cell again transforms the column a second time.
train["Weekly_Sales"] = train["Weekly_Sales"].pipe(np.log1p)

## 4. Add new features

In [9]:
# Names of the non-promotion context columns, plus an independent copy of those
# columns taken from the current training frame.
social_features = ["Temperature", "Fuel_Price", "Unemployment"]
dummy = train.loc[:, social_features].copy()

## 5. Remove unused features

In [10]:
# Drop the row identifier and the raw date string from both frames.
# The columns in `social_features` are kept; to drop them as well, additionally
# call .drop(columns=social_features) on each frame.
unused_columns = ['id', 'Date']
train = train.drop(unused_columns, axis=1)
test = test.drop(unused_columns, axis=1)

In [11]:
# Display the training frame as it stands after the preprocessing cells above.
train

Unnamed: 0,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Weekly_Sales,...,Store_36,Store_37,Store_38,Store_39,Store_40,Store_41,Store_42,Store_43,Store_44,Store_45
92,59.11,3.297,0.609080,2.343314,1.636777,0.188007,0.706899,7.866,0,14.282347,...,0,0,0,0,0,0,0,0,0,0
93,62.25,3.308,-0.024530,0.021188,0.212099,-0.377646,0.558861,7.866,0,14.246958,...,0,0,0,0,0,0,0,0,0,0
94,60.14,3.236,-0.857399,-0.040771,487.248832,-0.497505,-0.870274,7.866,1,14.525181,...,0,0,0,0,0,0,0,0,0,0
95,48.91,3.172,-0.089911,-0.052656,11.971260,0.095987,4.369103,7.866,0,14.275517,...,0,0,0,0,0,0,0,0,0,0
96,43.93,3.158,-0.235323,-0.072069,0.675431,0.540335,2.787458,7.866,0,14.403121,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6249,72.62,3.834,0.249290,-0.056468,-0.049793,1.077242,-0.413403,8.684,0,13.484550,...,0,0,0,0,0,0,0,0,0,1
6250,75.09,3.867,2.558741,-0.077220,0.569819,1.497429,0.033766,8.684,0,13.506671,...,0,0,0,0,0,0,0,0,0,1
6251,75.70,3.911,0.703421,-0.074526,0.217777,0.030291,-0.475545,8.684,1,13.549608,...,0,0,0,0,0,0,0,0,0,1
6253,65.32,4.038,0.325169,-0.043037,0.310461,0.179365,1.264224,8.684,0,13.491285,...,0,0,0,0,0,0,0,0,0,1


In [12]:
# Display the test frame as it stands after the preprocessing cells above.
test

Unnamed: 0,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Day,...,Store_36,Store_37,Store_38,Store_39,Store_40,Store_41,Store_42,Store_43,Store_44,Store_45
0,68.55,3.617,0.270126,-0.053975,-0.082813,0.534050,-0.062495,6.573,0,5,...,0,0,0,0,0,0,0,0,0,0
1,62.99,3.601,-0.610961,-0.053975,-0.171129,-0.327641,0.542522,6.573,0,12,...,0,0,0,0,0,0,0,0,0,0
2,67.97,3.594,-0.777989,-0.053975,-0.198908,-0.476857,-0.407911,6.573,0,19,...,0,0,0,0,0,0,0,0,0,0
3,69.16,3.506,-0.537484,-0.067018,-0.189561,-0.197661,-0.672989,6.573,0,26,...,0,0,0,0,0,0,0,0,0,0
4,70.27,3.617,-0.029877,-0.053975,-0.154269,0.365414,-0.002722,6.170,0,5,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,46.97,3.755,-0.912052,-0.078563,-0.233413,-0.132734,-0.895834,5.217,0,26,...,0,0,0,0,0,0,0,0,1,0
176,64.89,3.985,-0.175608,-0.053975,-0.077572,0.144226,-0.400767,8.667,0,5,...,0,0,0,0,0,0,0,0,0,1
177,54.47,4.000,-0.630063,-0.053975,-0.173051,-0.328509,0.033348,8.667,0,12,...,0,0,0,0,0,0,0,0,0,1
178,56.47,3.969,-0.623043,-0.053975,-0.214195,-0.374691,-0.611843,8.667,0,19,...,0,0,0,0,0,0,0,0,0,1


----
# II. Modeling

## 1. Split `train.csv` into features (`x_train`) and target (`y_train`)

In [13]:
# Separate the target column (Weekly_Sales) from the remaining feature columns.
y_train = train["Weekly_Sales"]
x_train = train.drop("Weekly_Sales", axis=1)

## 2. Choose a suitable model

In [14]:
# Registry of test-set predictions, keyed by model name; filled by the model cells below.
predictions = dict()

### a) XGBoost

In [15]:
import time
from xgboost import XGBRegressor

# Fit an XGBoost regressor on the training data, predict the test set, and time
# the whole fit + predict round trip.
start_t = time.time()
model = XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.1,
    max_depth=4,
    n_estimators=1000,
)
model.fit(x_train, y_train)
# The target was transformed with log1p, so expm1 maps predictions back to sales.
prediction = np.expm1(model.predict(test))
predictions["XGBoost"] = prediction

print(f"[XGBoost] => time: {round(time.time() - start_t, 2)}(sec)")
print(f"{prediction[0]}, {prediction[1]}, ... , {prediction[-1]}")

[XGBoost] => time: 2.09(sec)
1604472.5, 1387647.0, ... , 717033.125


----
# III. Submission

In [16]:
import os

def name(integer):
    """Return `integer` as text, left-padded with zeros to at least two characters."""
    text = str(integer)
    return text.zfill(2)
    
# Write one submission CSV per entry in `predictions` under
# dataset/submissions/<year-month-day>/, named "<hour：minute：second> (<model name>).csv"
# after the moment this cell was run.
savetime = dt.datetime.now()
folder = "-".join(map(name, [savetime.year, savetime.month, savetime.day]))
# The separator is a full-width colon (U+FF1A), not the ASCII ':'.
sub_folder = name(savetime.hour) + '：' + name(savetime.minute) + '：' + name(savetime.second)

# The target directory is the same for every model, so create it once up front.
output_dir = f"dataset/submissions/{folder}"
os.makedirs(output_dir, exist_ok=True)

# The loop variable is deliberately not called `model`: that name is bound to the
# fitted estimator in the modeling section and must not be overwritten by a string.
for model_name in predictions:
    submission["Weekly_Sales"] = predictions[model_name]
    submission.to_csv(f"{output_dir}/{sub_folder} ({model_name}).csv", index=False)