### Contents  
**_I. Data preprocessing_**
> 1. Read dataset  
> 2. Missing values handling
>> a) `train.csv`  
>> b) `test.csv`
> 3. Data manipulation
>> a) `Date`  
>> b) `IsHoliday`  
>> c) `Store`  
>> d) `Promotion1`, ... , `Promotion5`
> 4. Add new features
> 5. Remove not using features  

**_II. Modeling_**  
> 1. Divide `train.csv` into training data and predicting data
> 2. Choose a suitable model
>> a) XGBooster  
>> b) Random Forest  

**_III. Submission_**

----
# I. Data preprocessing

## 1. Read dataset

In [12]:
import pandas as pd

train = pd.read_csv("dataset/train.csv")
test = pd.read_csv("dataset/test.csv")
submission = pd.read_csv("dataset/sample_submission.csv")

## 2. Missing values handling

### a) `train.csv`

In [13]:
train = train.fillna(0)

### b) `test.csv`

In [14]:
means = {
    "Promotion1":test["Promotion1"].mean(), "Promotion2":test["Promotion2"].mean(), 
    "Promotion3":test["Promotion3"].mean(), "Promotion4":test["Promotion4"].mean(), 
    "Promotion5":test["Promotion5"].mean()
}
test = test.fillna(value=means)

## 3. Data manipulation

### a) `Date`

In [15]:
def date_manipulate(date):
    return tuple(map(int, date.split('/')))

# train
train["Day"] = train["Date"].apply(lambda x: date_manipulate(x)[0])
train["Month"] = train["Date"].apply(lambda x: date_manipulate(x)[1])
train["Year"] = train["Date"].apply(lambda x: date_manipulate(x)[2])

# test
test["Day"] = test["Date"].apply(lambda x: date_manipulate(x)[0])
test["Month"] = test["Date"].apply(lambda x: date_manipulate(x)[1])
test["Year"] = test["Date"].apply(lambda x: date_manipulate(x)[2])

### b) `IsHoliday`

In [16]:
train["IsHoliday"] = train["IsHoliday"].apply(int)
test["IsHoliday"] = test["IsHoliday"].apply(int)

### c) `Store`

In [17]:
train = pd.get_dummies(data=train, columns=["Store"])
test = pd.get_dummies(data=test, columns=["Store"])

### d) `Promotion1`, ... , `Promotion5`

In [18]:
# Scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer

# train
scaler = QuantileTransformer(n_quantiles = 6255)

scaler.fit(train[['Promotion1','Promotion2','Promotion3','Promotion4','Promotion5']])
scaled = scaler.transform(train[['Promotion1','Promotion2','Promotion3','Promotion4','Promotion5']])
train[['Scaled_Promotion1','Scaled_Promotion2','Scaled_Promotion3','Scaled_Promotion4','Scaled_Promotion5']] = scaled

# test
scaler = QuantileTransformer(n_quantiles = 180)

scaler.fit(test[['Promotion1','Promotion2','Promotion3','Promotion4','Promotion5']])
scaled = scaler.transform(test[['Promotion1','Promotion2','Promotion3','Promotion4','Promotion5']])
test[['Scaled_Promotion1','Scaled_Promotion2','Scaled_Promotion3','Scaled_Promotion4','Scaled_Promotion5']] = scaled

## 4. Add new features

In [19]:
social_features = ["Temperature","Fuel_Price","Unemployment"]
dummy = train[social_features]

## 5. Remove not using features

In [20]:
train = train.drop(columns=['id','Date','Promotion1','Promotion2','Promotion3','Promotion4','Promotion5'])
train = train.drop(columns=social_features)

test = test.drop(columns=['id','Date','Promotion1','Promotion2','Promotion3','Promotion4','Promotion5'])
test = test.drop(columns=social_features)

----
# II. Modeling

## 1. Divide `train.csv` into training data and for predicting data

In [21]:
x_train = train.drop(columns=["Weekly_Sales"])
y_train = train["Weekly_Sales"]

## 2. Choose a suitable model

In [22]:
predictions = {}

### a) XGBoost

In [23]:
import time
from xgboost import XGBRegressor

start_t = time.time()
model = XGBRegressor(objective='reg:squarederror', learning_rate=0.1, max_depth = 4, n_estimators = 1000)
model.fit(x_train, y_train)
prediction = model.predict(test)
predictions["XGBoost"] = prediction

print(f"[XGBoost] => time: {round(time.time() - start_t, 2)}(sec)")
print(f"{prediction[0]}, {prediction[1]}, ... , {prediction[-1]}")

[XGBoost] => time: 2.26(sec)
1667082.75, 1544718.5, ... , 809487.3125


### b) Random forest

In [24]:
import time
from sklearn.ensemble import RandomForestRegressor

start_t = time.time()
model = RandomForestRegressor()
model.fit(x_train, y_train)
prediction = model.predict(test)
predictions["RandomForest"] = prediction

print(f"[RandomForest] => time: {round(time.time() - start_t, 2)}(sec)")
print(f"{prediction[0]}, {prediction[1]}, ... , {prediction[-1]}")

[RandomForest] => time: 3.64(sec)
1633769.4414000004, 1509749.520200001, ... , 783806.6269999999


----
# III. Submission

In [25]:
import os
import datetime


def name(integer):
    return str(integer).zfill(2)
    
savetime = datetime.datetime.now()
folder = "-".join(map(name, [savetime.year, savetime.month, savetime.day]))
sub_folder = name(savetime.hour) + '：' + name(savetime.minute) + '：' + name(savetime.second)

for model in predictions:
    submission["Weekly_Sales"] = predictions[model]
    os.makedirs(f"dataset/submissions/{folder}/{sub_folder}", exist_ok=True)
    submission.to_csv(f"dataset/submissions/{folder}/{sub_folder}/{model}.csv", index=False)