In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv
/kaggle/input/m5-forecasting-accuracy/sample_submission.csv
/kaggle/input/m5-forecasting-accuracy/calendar.csv
/kaggle/input/m5-forecasting-accuracy/sell_prices.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv


In [5]:
# Build the calendar feature table from "calendar.csv".
# Compact dtypes are requested at read time; columns declared as "category"
# are converted to integer codes further down.
calendarDTypes = {"weekday": "category", 
                  'wm_yr_wk': 'int16', 
                  "wday": "int16",
                  "month": "int16", 
                  "year": "int16", 
                  "snap_CA": "float32", 
                  'snap_TX': 'float32', 
                  'snap_WI': 'float32' }

# Reading the csv file
calendar = pd.read_csv("../input/m5-forecasting-accuracy/calendar.csv", 
                       dtype = calendarDTypes)

# Parse the "date" strings into datetime values
calendar["date"] = pd.to_datetime(calendar["date"])

# Drop the event names and one-hot encode the two event-type columns.
# The prefixes already end in "_" and get_dummies inserts its own "_" separator,
# which is why the resulting names contain a double underscore.
# dtype is pinned to "uint8" (the default before pandas 2.0): without it,
# pandas >= 2.0 returns bool indicator columns instead of 0/1 integers.
calendar = calendar.drop(["event_name_1","event_name_2"],axis=1)
calendar = pd.get_dummies(calendar, columns = ["event_type_1","event_type_2"],prefix = ["event_type_1_","event_type_2_"], dtype = "uint8")

# Replace every categorical column by its integer category code.
# `.cat.codes` uses -1 for missing values, so subtracting the minimum
# guarantees the codes start at 0.
for col, colDType in calendarDTypes.items():
    if colDType == "category":
        calendar[col] = calendar[col].cat.codes.astype("int16")
        calendar[col] -= calendar[col].min()

calendar.head()


Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,snap_CA,snap_TX,snap_WI,event_type_1__Cultural,event_type_1__National,event_type_1__Religious,event_type_1__Sporting,event_type_2__Cultural,event_type_2__Religious
0,2011-01-29,11101,2,1,1,2011,d_1,0.0,0.0,0.0,0,0,0,0,0,0
1,2011-01-30,11101,3,2,1,2011,d_2,0.0,0.0,0.0,0,0,0,0,0,0
2,2011-01-31,11101,1,3,1,2011,d_3,0.0,0.0,0.0,0,0,0,0,0,0
3,2011-02-01,11101,5,4,2,2011,d_4,1.0,1.0,0.0,0,0,0,0,0,0
4,2011-02-02,11101,6,5,2,2011,d_5,1.0,0.0,1.0,0,0,0,0,0,0


In [7]:
# Column dtypes used when loading "sell_prices.csv"; the "category" columns
# are turned into integer codes right after the read.
priceDTypes = dict(store_id = "category",
                   item_id = "category",
                   wm_yr_wk = "int16",
                   sell_price = "float32")

# Load the price table
prices = pd.read_csv("../input/m5-forecasting-accuracy/sell_prices.csv", dtype = priceDTypes)

# Integer-encode each categorical column, shifting the codes so the smallest is 0
for name in [key for key, kind in priceDTypes.items() if kind == "category"]:
    codes = prices[name].cat.codes.astype("int16")
    prices[name] = codes - codes.min()

prices.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,0,0,11325,9.58
1,0,0,11326,9.58
2,0,0,11327,8.26
3,0,0,11328,8.26
4,0,0,11329,8.26


In [8]:
# `priceDTypes` and `prices` are already built by the previous cell.
# Re-reading "sell_prices.csv" and re-encoding the same columns here would only
# repeat that work and produce an identical frame, so this cell just previews it.
prices.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,0,0,11325,9.58
1,0,0,11326,9.58
2,0,0,11327,8.26
3,0,0,11328,8.26
4,0,0,11329,8.26


In [9]:
# Training window: the day columns d_<firstDay> .. d_<lastDay> (both inclusive)
firstDay = 250
lastDay = 1913
numCols = ["d_" + str(day) for day in range(firstDay, lastDay + 1)]

# Identifier / categorical columns of the sales file
catCols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']

# dtypes for "sales_train_validation.csv": float32 for the day columns and
# "category" for every categorical column except "id" (left at its default dtype)
dtype = dict.fromkeys(numCols, "float32")
dtype.update(dict.fromkeys([name for name in catCols if name != "id"], "category"))

# Load only the columns that are needed
ds = pd.read_csv("../input/m5-forecasting-accuracy/sales_train_validation.csv",
                 usecols = [*catCols, *numCols],
                 dtype = dtype)

# Integer-encode the categorical columns ("id" is kept as-is),
# shifting the codes so the smallest one is 0
for col in catCols:
    if col == "id":
        continue
    codes = ds[col].cat.codes.astype("int16")
    ds[col] = codes - codes.min()

# Wide -> long: one row per (series, day), with the day label in "d"
# and the value in "sales"
ds = ds.melt(id_vars = catCols,
             value_vars = ds.columns[ds.columns.str.startswith("d_")].tolist(),
             var_name = "d",
             value_name = "sales")

# Attach the calendar features, then the prices. Both merges use the default
# inner join, so rows without a match on the right side are dropped.
# The two merges are kept as separate statements so the previous `ds` can be
# released before the next merge starts.
# NOTE(review): joining on the integer store_id/item_id codes assumes they match
# the codes assigned independently to `prices` -- confirm both files contain the
# same set of store and item ids.
ds = pd.merge(ds, calendar, on = "d", copy = False)
ds = pd.merge(ds, prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

ds.head()



Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,...,snap_CA,snap_TX,snap_WI,event_type_1__Cultural,event_type_1__National,event_type_1__Religious,event_type_1__Sporting,event_type_2__Cultural,event_type_2__Religious,sell_price
0,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_250,0.0,2011-10-05,11136,...,1.0,1.0,1.0,0,0,0,0,0,0,3.97
1,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_251,0.0,2011-10-06,11136,...,1.0,1.0,1.0,0,0,0,0,0,0,3.97
2,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_252,0.0,2011-10-07,11136,...,1.0,1.0,0.0,0,0,0,0,0,0,3.97
3,HOBBIES_1_004_CA_1_validation,3,0,0,0,0,d_250,0.0,2011-10-05,11136,...,1.0,1.0,1.0,0,0,0,0,0,0,4.34
4,HOBBIES_1_004_CA_1_validation,3,0,0,0,0,d_251,4.0,2011-10-06,11136,...,1.0,1.0,1.0,0,0,0,0,0,0,4.34


In [10]:
# Shape (rows, columns) of the merged long-format frame
ds.shape

(42372682, 24)

In [11]:
# Lag features: for each series ("id"), the sales value `dayLag` rows earlier.
# NOTE(review): a row shift equals a day shift only if the rows of every id are
# consecutive days in chronological order -- confirm the row order after the merges.
dayLags = [7, 28]
lagSalesCols = ["lag_%d" % dayLag for dayLag in dayLags]
for dayLag, lagSalesCol in zip(dayLags, lagSalesCols):
    ds[lagSalesCol] = ds.groupby("id")["sales"].shift(dayLag)

# Rolling means of each lag column, computed per series.
# Columns are created window by window (outer) and lag by lag (inner),
# named rmean_<lag>_<window>.
windows = [7, 28]
rollingSpecs = [(window, dayLag, lagSalesCol)
                for window in windows
                for dayLag, lagSalesCol in zip(dayLags, lagSalesCols)]
for window, dayLag, lagSalesCol in rollingSpecs:
    ds["rmean_%d_%d" % (dayLag, window)] = (
        ds.groupby("id")[lagSalesCol]
          .transform(lambda lagged: lagged.rolling(window).mean())
    )

In [12]:
# Shape (rows, columns) after the lag and rolling-mean columns were added
ds.shape

(42372682, 30)

In [13]:
# Discard every row that has a missing value in any column
ds = ds.dropna()

# Columns excluded from the feature matrix (includes the target "sales")
unusedCols = ["id", "date", "sales","d", "wm_yr_wk", "weekday"]

# Keep the remaining columns, in their original order, as features
trainCols = ds.columns[[name not in unusedCols for name in ds.columns]]
y_train = ds["sales"]
X_train = ds[trainCols]

In [15]:
# Columns kept in the training feature matrix
X_train.columns

Index(['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'wday', 'month',
       'year', 'snap_CA', 'snap_TX', 'snap_WI', 'event_type_1__Cultural',
       'event_type_1__National', 'event_type_1__Religious',
       'event_type_1__Sporting', 'event_type_2__Cultural',
       'event_type_2__Religious', 'sell_price', 'lag_7', 'lag_28', 'rmean_7_7',
       'rmean_28_7', 'rmean_7_28', 'rmean_28_28'],
      dtype='object')