<a href="https://colab.research.google.com/github/vence-andersen/M5-Forecasting-Accuracy/blob/main/Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing the required modules and reading the files**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from tqdm import tqdm
from lightgbm import LGBMRegressor, Dataset
from sklearn.metrics import mean_squared_error as mse
from downcast import reduce
import warnings
warnings.filterwarnings("ignore")
from xgboost import XGBRegressor
# from catboost import CatBoostRegressor

In [None]:
# Read the three CSV inputs (sales, sell prices, calendar) into pandas DataFrames.

sales, sell_price, cal = (
    pd.read_csv(csv_name)
    for csv_name in ("sales_train_evaluation.csv", "sell_prices.csv", "calendar.csv")
)

In [None]:
# Pass each frame through `reduce` (imported from `downcast`) to reduce memory
# usage; the same helper is applied to all three inputs.

sales, sell_price, cal = (reduce(frame) for frame in (sales, sell_price, cal))

# **Feature Engineering**

In [None]:
# Append one placeholder column per day to be predicted: d_1942 .. d_1969
# (28 days). Every placeholder is filled with an int32 zero.

first_new_day = 1942
n_new_days = 28
for day_number in range(first_new_day, first_new_day + n_new_days):
    sales[f"d_{day_number}"] = np.int32(0)

In [None]:
# Reshape `sales` from wide (one column per day) to long (one row per id and
# day) so the time series can be handled as a supervised learning problem.
# The former column names land in `day`, the cell values in `demand`.

id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
data = sales.melt(id_vars=id_columns, var_name='day', value_name='demand').dropna()
print("The new shape is ",data.shape)
data.head(2)

The new shape is  (60034810, 8)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0


In [None]:
# Combine the three inputs into a single frame.

# Calendar: match the melted `day` label against the calendar's `d` column
# (pandas' default inner join).
data = pd.merge(data, cal, left_on='day', right_on='d')

# Prices: matched per store / item / week. The left join keeps rows that have
# no matching price row.
price_keys = ['store_id', 'item_id', 'wm_yr_wk']
data = pd.merge(data, sell_price, how='left', on=price_keys)

In [None]:
# Fill missing prices with the mean `sell_price` of the same `id`.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `data['sell_price']`: that form is a chained
# in-place assignment, which is deprecated in pandas 2.x and does not update
# `data` when copy-on-write is enabled.
mean_price_per_id = data.groupby('id')['sell_price'].transform('mean')
data['sell_price'] = data['sell_price'].fillna(mean_price_per_id)

In [None]:
# Convert the 'd_<n>' labels in `day` into the integer <n>.
def _day_number(label):
    """Return the second '_'-separated piece of a day label such as 'd_12'."""
    return label.split('_')[1]

data['day'] = data['day'].map(_day_number).astype(np.int16)

# `d` duplicates `day`, and `wday` already represents the weekday as a number,
# so `d` and `weekday` are dropped together with `date`.
data = data.drop(columns=['d', 'weekday', 'date'])

In [None]:
# Integer-encode every categorical column by replacing each value with its
# category code (`Series.cat.codes`); pandas uses -1 as the code for missing
# values.
# Only columns whose dtype is `category` are selected up front, so there is no
# need to catch the AttributeError that `.cat` raises on other dtypes, and
# unrelated AttributeErrors are no longer silently swallowed.

categorical_columns = data.select_dtypes(include='category').columns
for column in categorical_columns:
    data[column] = data[column].cat.codes

In [None]:
# Lag features: for each `id`, the `demand` value found `lag` rows earlier,
# stored as float16 in a column named `lag_<lag>`.
# NOTE(review): "rows earlier" equals "days earlier" only if each id's rows are
# in chronological order at this point — confirm.
#
# The grouped selection does not depend on `lag`, so it is built once outside
# the loop instead of being rebuilt on every iteration.

lags = [28,30,35,42,49,56,63,70]
demand_by_id = data.groupby("id")["demand"]
for lag in tqdm(lags):
    data["lag_" + str(lag)] = demand_by_id.shift(lag).astype(np.float16)

100%|██████████| 9/9 [00:46<00:00,  5.14s/it]


In [None]:
# Rolling-window features: the median (not the mean) of `demand` per `id`,
# stored in columns named `rolling_median_<window>`.
#
# The window is computed on demand shifted back by the 28-day forecast horizon,
# so a row's feature only uses values at least 28 rows older than the row
# itself. Without the shift the window contains the row's own `demand` (the
# value being predicted), and for the appended placeholder days it runs over
# the zero placeholders instead of real sales.
#
# `groupby(...).rolling(...)` replaces `transform(lambda ...)`, which calls a
# Python function once per id.
# NOTE(review): the leading rows of every id are NaN for these features; they
# are expected to be removed by the later `day > 1000` filter — confirm.

horizon = 28  # same length as the block of placeholder day columns
means = [5,7,28,56]  # window lengths, in rows
shifted_demand = data.groupby('id')['demand'].shift(horizon)
for mns in tqdm(means):
    rolled = shifted_demand.groupby(data['id']).rolling(mns).median()
    # `rolled` is indexed by (id, original row label); drop the id level so the
    # assignment aligns on the original row labels.
    data['rolling_median_'+str(mns)] = rolled.reset_index(level=0, drop=True)

100%|██████████| 4/4 [2:18:51<00:00, 2082.85s/it]


In [None]:
# Keep only the rows whose `day` is greater than 1000, then preview two rows.
data = data.loc[data['day'].gt(1000)]
data.head(2)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,day,demand,wm_yr_wk,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_28,lag_30,lag_35,lag_42,lag_49,lag_56,lag_63,lag_70,rolling_median_5,rolling_median_7,rolling_median_28,rolling_median_56
30490000,14370,1437,3,1,0,0,1001,2,11339,7,10,2013,-1,-1,-1,-1,0,0,0,8.257812,1.0,0.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
30490001,14380,1438,3,1,0,0,1001,0,11339,7,10,2013,-1,-1,-1,-1,0,0,0,3.970703,2.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Total number of missing values in the frame (per-column counts, summed).
data.isnull().sum().sum()

0

In [None]:
# Write the engineered frame to disk as a pickle file.
pd.to_pickle(data, 'new.pkl')