Notebook is based on m5-baseline (harupy), which is itself based on Very fst Model (ragnar123).

https://www.kaggle.com/ragnar123/very-fst-model<br>
https://www.kaggle.com/harupy/m5-baseline

# Imports

In [1]:
%reload_ext autoreload
%autoreload 2

import sys
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# ROOT is the parent of the current working directory
# (assumed to be the repository root - TODO confirm when running from elsewhere)
ROOT = pathlib.Path.cwd().parent
RAW_DATA_PATH = ROOT.joinpath('data', 'raw')
PROCESSED_DATA_PATH = ROOT.joinpath('data', 'processed')

# constant named for a 28-day prediction window
DAYS_PRED = 28

# ensure this project is on the import path (ROOT is already absolute)
sys.path.insert(0, ROOT.as_posix())

from src.data.process_data import reduce_memory_usage

In [4]:
# Load the combined dataset and cast the calendar columns to categoricals.
df = (
    pd.read_parquet(PROCESSED_DATA_PATH / 'combined_dataset.parquet')
    .astype({c: 'category' for c in ['wday', 'month', 'year']})
)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47735397 entries, 0 to 47735396
Data columns (total 52 columns):
 #   Column                          Dtype         
---  ------                          -----         
 0   part                            category      
 1   id                              category      
 2   item_id                         category      
 3   dept_id                         category      
 4   cat_id                          category      
 5   store_id                        category      
 6   state_id                        category      
 7   d                               category      
 8   demand                          uint16        
 9   date                            datetime64[ns]
 10  weekday                         category      
 11  wday                            category      
 12  month                           category      
 13  year                            category      
 14  event_type_cultural             bool          
 

In [5]:
def make_aggregated_dataset():
    """Aggregate the combined dataset to department level, persist and return it.

    Reads ``combined_dataset.parquet`` from ``PROCESSED_DATA_PATH``, sums
    ``demand`` within every (state, store, date, year, month, weekday,
    category, department) group and keeps the first ``snap`` / ``event_*``
    value of each group. The summed demand is unstacked so that each
    category/department pair becomes its own column, and is joined onto a
    per-store, per-day calendar holding the ``snap`` and event columns.

    The result is written to ``aggregated_dataset.pickle`` in
    ``PROCESSED_DATA_PATH``.

    Returns:
        The aggregated frame that was pickled (i.e. the output of
        ``reduce_memory_usage``), so callers can use it directly.
    """
    # read in the combined dataset
    combined = (
        pd.read_parquet(PROCESSED_DATA_PATH / 'combined_dataset.parquet')
        .astype({c: 'category' for c in ['wday', 'month', 'year']})
    )

    # how the columns will be aggregated: demand is summed, while the
    # snap/event columns take the first value seen in each group
    # (presumably they are constant within a store/day - verify upstream)
    to_agg = {'demand': 'sum'}
    to_agg.update({x: 'first' for x in combined.columns if x.startswith('event_') or x == 'snap'})

    # group at department level; item_id is left out of the keys so that
    # individual items are summed together
    aggregated_sales = (
        combined
        .groupby(['state_id', 'store_id', 'date', 'year', 'month', 'weekday', 'cat_id', 'dept_id'], observed=True)
        .agg(to_agg)
        .sort_index(axis=1)
    )

    # create a calendar (snap + event columns) which can join the aggregated
    # data: drop demand and collapse the category/department levels
    aggregated_calendar = (
        aggregated_sales
        .drop(columns=['demand'])
        .groupby(['state_id', 'store_id', 'date', 'year', 'month', 'weekday'], observed=True)
        .first()
    )

    # add more levels to the column multiindex for intuitive indexing
    aggregated_calendar = pd.concat(
        [aggregated_calendar[['snap']],
         aggregated_calendar[[x for x in aggregated_calendar.columns if x.startswith('event_type_')]],
         aggregated_calendar[[x for x in aggregated_calendar.columns if x.startswith('event_name_')]]
         ], keys=['snap', 'type', 'name'], axis=1)

    # nest the 'type' and 'name' groups under a common 'events' key
    aggregated_calendar = pd.concat(
        [aggregated_calendar[('snap', 'snap')],
         aggregated_calendar[['type', 'name']]],
        keys=['snap', 'events'], axis=1)

    # rename the innermost column level to the text after the last underscore,
    # cutting off 'event_name_' and 'event_type_'; reassign rather than
    # mutate in place
    aggregated_calendar = aggregated_calendar.rename(
        lambda col: col.split('_')[-1], axis='columns', level=2)

    # unstack the category and department indices for more intuitive indexing;
    # combinations with no rows get a demand of 0
    aggregated_sales = (
        aggregated_sales
        [['demand']]
        .unstack(['cat_id', 'dept_id'], fill_value=0)
    )

    # join in the calendar
    aggregated_sales = (
        aggregated_calendar
        .join(aggregated_sales)
        .sort_index(axis=1, ascending=False)
        .sort_index(axis=0)
        .pipe(reduce_memory_usage)
    )

    aggregated_sales.to_pickle(PROCESSED_DATA_PATH / 'aggregated_dataset.pickle')

    # hand the frame back: the calling cell binds the result to a variable and
    # displays it, which would otherwise be None
    return aggregated_sales

# Run the aggregation (this also writes aggregated_dataset.pickle) and bind
# whatever the function returns to `df`.
# NOTE(review): this rebinds `df`, which an earlier cell used for the combined dataset.
df = make_aggregated_dataset()
# bare expression so the notebook renders `df` as the cell output
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,snap,events,events,events,events,events,events,events,events,events,events,events,events,events,events,events,events,events,events,events,demand
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,snap,type,type,type,type,name,name,name,name,name,name,name,name,name,name,name,name,name,name,name,HOBBIES
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,snap,sporting,religious,national,cultural,veteransday,valentinesday,thanksgiving,superbowl,stpatricksday,...,halloween,father's day,eidaladha,eid al-fitr,easter,columbusday,cinco de mayo,christmas,chanukah end,HOBBIES_1
state_id,store_id,date,year,month,weekday,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3,Unnamed: 25_level_3,Unnamed: 26_level_3
CA,CA_1,2011-01-29,2011,1,Saturday,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,528
CA,CA_1,2011-01-30,2011,1,Sunday,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,489
CA,CA_1,2011-01-31,2011,1,Monday,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,409
CA,CA_1,2011-02-01,2011,2,Tuesday,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,383
CA,CA_1,2011-02-02,2011,2,Wednesday,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,263
CA,CA_1,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CA,CA_1,2016-06-15,2016,6,Wednesday,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
CA,CA_1,2016-06-16,2016,6,Thursday,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
CA,CA_1,2016-06-17,2016,6,Friday,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
CA,CA_1,2016-06-18,2016,6,Saturday,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
