### Contents  
**_I. Data preprocessing_**
> 1. Read dataset  
> 2. Missing values handling
>> a) `train.csv`  
>> b) `test.csv`
> 3. Data manipulation
>> a) `Date`  
>> b) `Promotion1`, ... , `Promotion5`
> 4. Add new features
> 5. Remove not using features  

**_II. Modeling_**  
> 1. Divide `train.csv` into training data and for predicting data
> 2. Choose a suitable model
>> a) XGBooster  
>> b) Random Forest  
>> c) LightGBM

**_III. Submission_**

----
# I. Data preprocessing

## 1. Read dataset

In [55]:
import pandas as pd

train = pd.read_csv("dataset/train.csv")
test = pd.read_csv("dataset/test.csv")
submission = pd.read_csv("dataset/sample_submission.csv")

## 2. Missing values handling

### a) `train.csv`

In [56]:
train = train.fillna(0)

### b) `test.csv`

In [57]:
test = test.fillna(0)

## 3. Data manipulation

### a) `Date`

In [58]:
def date_manipulate(date):
    return tuple(map(int, date.split('/')))

# train
train["Day"] = train["Date"].apply(lambda x: date_manipulate(x)[0])
train["Month"] = train["Date"].apply(lambda x: date_manipulate(x)[1])
train["Year"] = train["Date"].apply(lambda x: date_manipulate(x)[2])

# test
test["Day"] = test["Date"].apply(lambda x: date_manipulate(x)[0])
test["Month"] = test["Date"].apply(lambda x: date_manipulate(x)[1])
test["Year"] = test["Date"].apply(lambda x: date_manipulate(x)[2])

### b) `Promotion1`, ... , `Promotion5`

In [59]:
# Scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# train
scaler.fit(train[['Promotion1','Promotion2','Promotion3','Promotion4','Promotion5']])
scaled = scaler.transform(train[['Promotion1','Promotion2','Promotion3','Promotion4','Promotion5']])
train[['Scaled_Promotion1','Scaled_Promotion2','Scaled_Promotion3','Scaled_Promotion4','Scaled_Promotion5']] = scaled

# test
scaler.fit(test[['Promotion1','Promotion2','Promotion3','Promotion4','Promotion5']])
scaled = scaler.transform(test[['Promotion1','Promotion2','Promotion3','Promotion4','Promotion5']])
test[['Scaled_Promotion1','Scaled_Promotion2','Scaled_Promotion3','Scaled_Promotion4','Scaled_Promotion5']] = scaled

## 4. Add new features

## 5. Remove not using features

In [60]:
train = train.drop(columns=['id','Date','Promotion1','Promotion2','Promotion3','Promotion4','Promotion5','Day'])
test = test.drop(columns=['id','Date','Promotion1','Promotion2','Promotion3','Promotion4','Promotion5','Day'])

----
# II. Modeling

## 1. Divide `train.csv` into training data and for predicting data

In [61]:
x_train = train.drop(columns=["Weekly_Sales"])
y_train = train["Weekly_Sales"]

## 2. Choose a suitable model

In [62]:
predictions = {}

### a) XGBooster

import time
import xgboost
start_t = time.time()

# todo

predictions["XGBooster"] = prediction
print(f"[XGBooster] => time: {round(time.time() - start_t, 2)}(sec)")
print(f"{prediction[0]}, {prediction[1]}, ..., {prediction[-1]}")

### b) Random forest

In [63]:
import time
from sklearn.ensemble import RandomForestRegressor

start_t = time.time()
model = RandomForestRegressor()
model.fit(x_train, y_train)
prediction = model.predict(test)
predictions["RandomForest"] = prediction

print(f"[RandomForest] => time: {round(time.time() - start_t, 2)}(sec)")
print(f"{prediction[0]}, {prediction[1]}, ..., {prediction[-1]}")

[RandomForest] => time: 2.18(sec)
1533351.2660999997, 1537892.0608000008, ..., 917239.3620999998


### c) LightGBM

----
# III. Submission

In [64]:
import os
import datetime


def name(integer):
    return str(integer).zfill(2)
    
savetime = datetime.datetime.now()
folder = "-".join(map(name, [savetime.year, savetime.month, savetime.day]))
sub_folder = name(savetime.hour) + '：' + name(savetime.minute) + '：' + name(savetime.second)

for model in predictions:
    submission["Weekly_Sales"] = predictions[model]
    os.makedirs(f"dataset/submissions/{folder}/{sub_folder}", exist_ok=True)
    submission.to_csv(f"dataset/submissions/{folder}/{sub_folder}/{model}.csv", index=False)