### Contents  
**_I. Data preprocessing_**
> 1. Read dataset  
> 2. Missing values handling
>> a) `train.csv`  
>> b) `test.csv`
> 3. Data manipulation
>> a) `Date`  
>> b) `IsHoliday`  
>> c) `Store`  
>> d) `Promotion1`, ... , `Promotion5`
> 4. Add new features
> 5. Remove not using features  

**_II. Modeling_**  
> 1. Divide `train.csv` into training data and predicting data
> 2. Choose a suitable model
>> a) XGBooster  
>> b) Random Forest  

**_III. Submission_**

----
# I. Data preprocessing

## 1. Read dataset

In [36]:
import pandas as pd

train = pd.read_csv("dataset/train.csv")
test = pd.read_csv("dataset/test.csv")
submission = pd.read_csv("dataset/sample_submission.csv")

## 2. Missing values handling

### a) `train.csv`

In [37]:
train = train.fillna(0)

### b) `test.csv`

In [38]:
means = {
    "Promotion1":test["Promotion1"].mean(), "Promotion2":test["Promotion2"].mean(), 
    "Promotion3":test["Promotion3"].mean(), "Promotion4":test["Promotion4"].mean(), 
    "Promotion5":test["Promotion5"].mean()
}
test = test.fillna(value=means)

## 3. Data manipulation

### a) `Date`

In [39]:
import datetime as dt

def date_to_week(date):
    day, month, year = map(int, date.split('/'))
    t = dt.datetime(year, month, day) - dt.datetime(2010, 2, 5)
    return t.days // 7


def date_split(date):
    return tuple(map(int, date.split('/')))


# train
#train["Week"] = train["Date"].apply(date_to_week)
train["Day"] = train["Date"].apply(lambda x: date_split(x)[0])
train["Month"] = train["Date"].apply(lambda x: date_split(x)[1])
train["Year"] = train["Date"].apply(lambda x: date_split(x)[2])


# test
#test["Week"] = test["Date"].apply(date_to_week)
test["Day"] = test["Date"].apply(lambda x: date_split(x)[0])
test["Month"] = test["Date"].apply(lambda x: date_split(x)[1])
test["Year"] = test["Date"].apply(lambda x: date_split(x)[2])

### b) `IsHoliday`

In [40]:
# trian
train["IsHoliday"] = train["IsHoliday"].apply(int)

# test
test["IsHoliday"] = test["IsHoliday"].apply(int)

### c) `Store`

In [41]:
# train
train = pd.get_dummies(data=train, columns=["Store"])

# test
test = pd.get_dummies(data=test, columns=["Store"])

### d) `Promotion1`, ... , `Promotion5`

In [42]:
# Scaling
from sklearn.preprocessing import QuantileTransformer


promos = ['Promotion1','Promotion2','Promotion3','Promotion4','Promotion5']

# train
scaler = QuantileTransformer()
scaler.fit(train[promos])
train[promos] = scaler.transform(train[promos])


# test
scaler = QuantileTransformer(n_quantiles = 180)
scaler.fit(test[promos])
test[promos] = scaler.transform(test[promos])

### e) `Weekly_Sales` 

In [43]:
# train
output_scaler = QuantileTransformer()
output_scaler.fit(train[["Weekly_Sales"]])
train[["Weekly_Sales"]] = output_scaler.transform(train[["Weekly_Sales"]])

In [44]:
# train
scaler = QuantileTransformer()
scaler.fit(train[["Temperature"]])
train[["Temperature"]] = scaler.transform(train[["Temperature"]])

# test
scaler = QuantileTransformer(n_quantiles = 180)
scaler.fit(test[["Temperature"]])
test[["Temperature"]] = scaler.transform(test[["Temperature"]])

In [45]:
# train
scaler = QuantileTransformer()
scaler.fit(train[["Fuel_Price"]])
train[["Fuel_Price"]] = scaler.transform(train[["Fuel_Price"]])

# test
scaler = QuantileTransformer(n_quantiles = 180)
scaler.fit(test[["Fuel_Price"]])
test[["Fuel_Price"]] = scaler.transform(test[["Fuel_Price"]])

In [46]:
# train
scaler = QuantileTransformer()
scaler.fit(train[["Unemployment"]])
train[["Unemployment"]] = scaler.transform(train[["Unemployment"]])

# test
scaler = QuantileTransformer(n_quantiles = 180)
scaler.fit(test[["Unemployment"]])
test[["Unemployment"]] = scaler.transform(test[["Unemployment"]])

## 5. Remove not using features

In [47]:
train = train.drop(columns=['id', 'Date'])
#train = train.drop(columns=social_features)

test = test.drop(columns=['id', 'Date'])
#test = test.drop(columns=social_features)

In [48]:
train

Unnamed: 0,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Weekly_Sales,...,Store_36,Store_37,Store_38,Store_39,Store_40,Store_41,Store_42,Store_43,Store_44,Store_45
0,0.183308,0.013013,0.000000,0.375876,0.349850,0.000000,0.000000,0.566567,0,0.832897,...,0,0,0,0,0,0,0,0,0,0
1,0.143750,0.004505,0.000000,0.375876,0.349850,0.000000,0.000000,0.566567,1,0.832337,...,0,0,0,0,0,0,0,0,0,0
2,0.156601,0.001502,0.000000,0.375876,0.349850,0.000000,0.000000,0.566567,0,0.822478,...,0,0,0,0,0,0,0,0,0,0
3,0.242994,0.007508,0.000000,0.375876,0.349850,0.000000,0.000000,0.566567,0,0.743452,...,0,0,0,0,0,0,0,0,0,0
4,0.240196,0.038539,0.000000,0.375876,0.349850,0.000000,0.000000,0.566567,0,0.806253,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6250,0.747462,0.866867,0.984969,0.768839,0.917918,0.967133,0.869701,0.764264,0,0.366644,...,0,0,0,0,0,0,0,0,0,1
6251,0.757758,0.907077,0.938933,0.782810,0.890863,0.873041,0.765066,0.764264,1,0.381225,...,0,0,0,0,0,0,0,0,0,1
6252,0.598098,0.922943,0.942905,0.375876,0.770093,0.922494,0.910518,0.764264,0,0.349835,...,0,0,0,0,0,0,0,0,0,1
6253,0.550785,0.956957,0.902804,0.839858,0.899113,0.890731,0.969742,0.764264,0,0.361497,...,0,0,0,0,0,0,0,0,0,1


In [49]:
test

Unnamed: 0,Temperature,Fuel_Price,Promotion1,Promotion2,Promotion3,Promotion4,Promotion5,Unemployment,IsHoliday,Day,...,Store_36,Store_37,Store_38,Store_39,Store_40,Store_41,Store_42,Store_43,Store_44,Store_45
0,0.659218,0.326816,0.921788,0.516760,0.592179,0.921788,0.648045,0.477654,0,5,...,0,0,0,0,0,0,0,0,0,0
1,0.533520,0.220670,0.424581,0.516760,0.396648,0.368715,0.815642,0.477654,0,12,...,0,0,0,0,0,0,0,0,0,0
2,0.625698,0.142458,0.245810,0.516760,0.240223,0.061453,0.446927,0.477654,0,19,...,0,0,0,0,0,0,0,0,0,0
3,0.681564,0.041899,0.519553,0.100559,0.284916,0.508380,0.245810,0.477654,0,26,...,0,0,0,0,0,0,0,0,0,0
4,0.726257,0.326816,0.821229,0.516760,0.458101,0.905028,0.703911,0.332402,0,5,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,0.092179,0.430168,0.050279,0.027933,0.027933,0.656425,0.039106,0.120112,0,26,...,0,0,0,0,0,0,0,0,1,0
176,0.575419,0.653631,0.754190,0.516760,0.597765,0.865922,0.452514,0.824022,0,5,...,0,0,0,0,0,0,0,0,0,1
177,0.259777,0.698324,0.396648,0.516760,0.379888,0.363128,0.715084,0.824022,0,12,...,0,0,0,0,0,0,0,0,0,1
178,0.337989,0.625698,0.407821,0.516760,0.156425,0.240223,0.312849,0.824022,0,19,...,0,0,0,0,0,0,0,0,0,1


----
# II. Modeling

## 1. Divide `train.csv` into training data and for predicting data

In [50]:
x_train = train.drop(columns=["Weekly_Sales"])
y_train = train["Weekly_Sales"]

## 2. Choose a suitable model

In [51]:
predictions = {}

### a) XGBoost

In [55]:
import time
from xgboost import XGBRegressor

start_t = time.time()
model = XGBRegressor(objective='reg:squarederror', learning_rate=0.1, max_depth = 4, n_estimators = 1000)
model.fit(x_train, y_train)
prediction = pd.DataFrame()
prediction["Weekly_Sales"] = model.predict(test)

prediction = output_scaler.inverse_transform(prediction)
prediction = [x[0] for x in prediction]
predictions["XGBoost"] = prediction

print(f"[XGBoost] => time: {round(time.time() - start_t, 2)}(sec)")
print(f"{prediction[0]}, {prediction[1]}, ... , {prediction[-1]}")

<class 'numpy.ndarray'>
[XGBoost] => time: 2.28(sec)
[1608307.2], [1473336.], ... , [764034.5]


----
# III. Submission

In [53]:
import os

def name(integer):
    return str(integer).zfill(2)
    
savetime = dt.datetime.now()
folder = "-".join(map(name, [savetime.year, savetime.month, savetime.day]))
sub_folder = name(savetime.hour) + '：' + name(savetime.minute) + '：' + name(savetime.second)

for model in predictions:
    submission["Weekly_Sales"] = predictions[model]
    os.makedirs(f"dataset/submissions/{folder}/{sub_folder}", exist_ok=True)
    submission.to_csv(f"dataset/submissions/{folder}/{sub_folder+' ('+model+')'}.csv", index=False)