<div class ="alert alert-block alert-warning">
    
- This notebook aims to apply ideas (Recursive vs Direct from this Handbook https://phdinds-aim.github.io/time_series_handbook/08_WinningestMethods/lightgbm_m5_forecasting.html#recursive-forecasting
    
- Main changes include:
    - Apply on individual item than aggregated store time series
    - take into account exogenous variables provided

# Import Libraries

In [1]:
import gc
import os
import psutil

import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
import lightgbm as lgb
import warnings  # To suppress warnings
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

import random  # For generating random numbers

# Function to set a fixed random seed for reproducibility
def seed_everything(seed):
    np.random.seed(seed)  # Set numpy random seed
    random.seed(seed)  # Set built-in random seed

seed_everything(seed=2024)  # Set the seed to 2024


# Load Dataset

In [2]:
calendar = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/calendar.csv")  # Load calendar dataset
print(f"len(calendar):{len(calendar)}")  # Print the number of rows in calendar
calendar

len(calendar):1969


Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,2016-06-15,11620,Wednesday,5,6,2016,d_1965,,,,,0,1,1
1965,2016-06-16,11620,Thursday,6,6,2016,d_1966,,,,,0,0,0
1966,2016-06-17,11620,Friday,7,6,2016,d_1967,,,,,0,0,0
1967,2016-06-18,11621,Saturday,1,6,2016,d_1968,,,,,0,0,0


In [3]:
sales_train_evaluation = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv")
print(f"len(sales_train_evaluation): {len(sales_train_evaluation)}")
sales_train_evaluation.head()

len(sales_train_evaluation): 30490


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0


In [4]:
sell_prices = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sell_prices.csv")
print(f"len(sell_prices):{len(sell_prices)}")
sell_prices

len(sell_prices):6841121


Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26
...,...,...,...,...
6841116,WI_3,FOODS_3_827,11617,1.00
6841117,WI_3,FOODS_3_827,11618,1.00
6841118,WI_3,FOODS_3_827,11619,1.00
6841119,WI_3,FOODS_3_827,11620,1.00


In [5]:
# Enhanced memory optimization function with object datatype handling
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2  # Initial memory usage in MB
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:  # Downcast numerics
            c_min, c_max = df[col].min(), df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
        elif col_type == 'object':  # Handle object types
            if col == 'date':  # Convert date column to datetime
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')  # Convert other object types to category
    end_mem = df.memory_usage().sum() / 1024**2  # Final memory usage in MB
    if verbose:
        print(f'Memory usage reduced to {end_mem:5.2f} Mb ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    return df

In [6]:
# Apply the optimized memory reduction function to each dataframe
calendar = reduce_mem_usage(calendar)
sell_prices = reduce_mem_usage(sell_prices)
sales_train_evaluation = reduce_mem_usage(sales_train_evaluation)

Memory usage reduced to  0.13 Mb (40.4% reduction)
Memory usage reduced to 45.76 Mb (78.1% reduction)
Memory usage reduced to 96.30 Mb (78.7% reduction)


# Convert Sales Data to Long format

In [7]:
# Specify day columns up to d_1941 for the extended dataset
d_cols_eval = [f"d_{i}" for i in range(1, 1942)]
sales_train_evaluation_long = sales_train_evaluation.melt(
    id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
    value_vars=d_cols_eval,
    var_name="d",
    value_name="sales"
)
print(f"len(sales_train_evaluation_long): {len(sales_train_evaluation_long)}")
sales_train_evaluation_long.head()

len(sales_train_evaluation_long): 59181090


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0


# Label Endcode Calendar event

In [8]:
# Encode event-related features in the calendar dataframe
event_columns = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
le = LabelEncoder()
for col in event_columns:
    calendar[col] = le.fit_transform(calendar[col].astype(str))

# Merge with Calender data

In [9]:
sales_train_evaluation_long = sales_train_evaluation_long.merge(calendar, on="d", how="left")
sales_train_evaluation_long.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,30,4,4,2,0,0,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,30,4,4,2,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,30,4,4,2,0,0,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,30,4,4,2,0,0,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,30,4,4,2,0,0,0


# Merge with Price data

In [10]:
sales_train_evaluation_long = sales_train_evaluation_long.merge(
    sell_prices, 
    on=["store_id", "item_id", "wm_yr_wk"], 
    how="left"
)
sales_train_evaluation_long.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,


# Define Train, Validation and Evaluation sets

In [11]:
# Define the specific sample id to plot
sample_id = "FOODS_3_586_TX_1_evaluation"

In [12]:
train_set = sales_train_evaluation_long[sales_train_evaluation_long['d'].isin([f'd_{i}' for i in range(1, 1914)])]
filtered_train_set = train_set[train_set['id'] == sample_id]

print("filtered_train_set sample:")
print(f"len(Train set): {len(filtered_train_set)}")
filtered_train_set.head()

filtered_train_set sample:
len(Train set): 1913


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
15006,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1,53,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,1.480469
45496,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_2,46,2011-01-30,11101,...,1,2011,30,4,4,2,0,0,0,1.480469
75986,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_3,42,2011-01-31,11101,...,1,2011,30,4,4,2,0,0,0,1.480469
106476,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_4,45,2011-02-01,11101,...,2,2011,30,4,4,2,1,1,0,1.480469
136966,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_5,28,2011-02-02,11101,...,2,2011,30,4,4,2,1,0,1,1.480469


In [13]:
# Filter the validation set for d_1914 to d_1941
validation_set = sales_train_evaluation_long[sales_train_evaluation_long['d'].isin([f'd_{i}' for i in range(1914, 1942)])]
filtered_validation_set = validation_set[validation_set['id'] == sample_id]

print("filtered_validation_set sample:")
print(f"len(Validation set): {len(filtered_validation_set)}")
filtered_validation_set.head()


filtered_validation_set sample:
len(Validation set): 28


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
58342376,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1914,61,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,1.679688
58372866,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1915,47,2016-04-26,11613,...,4,2016,30,4,4,2,0,0,0,1.679688
58403356,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1916,66,2016-04-27,11613,...,4,2016,30,4,4,2,0,0,0,1.679688
58433846,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1917,35,2016-04-28,11613,...,4,2016,30,4,4,2,0,0,0,1.679688
58464336,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1918,70,2016-04-29,11613,...,4,2016,30,4,4,2,0,0,0,1.679688


In [14]:
# Create a new DataFrame for the prediction period (d_1942 to d_1969)
forecast_days = [f'd_{i}' for i in range(1942, 1970)]
forecast_df = pd.DataFrame({'d': forecast_days})

# Generate one entry per product-store combination for each forecast day
prediction_set = sales_train_evaluation[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']].drop_duplicates()
prediction_set = prediction_set.merge(forecast_df, how='cross')

# Merge with calendar data to add date information for the forecast period
prediction_set = prediction_set.merge(calendar, on='d', how='left')

# Merge with sell_prices to add price data, aligning with the correct store, item, and week
prediction_set = prediction_set.merge(sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')
filtered_prediction_set = prediction_set[prediction_set['id'] == sample_id]

print("filtered_prediction_set sample:")
print(f"len(Prediction set): {len(filtered_prediction_set)}")
filtered_prediction_set.head()


filtered_prediction_set sample:
len(Prediction set): 28


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,date,wm_yr_wk,weekday,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
420168,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1942,2016-05-23,11617,Monday,...,5,2016,30,4,4,2,0,0,0,1.679688
420169,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1943,2016-05-24,11617,Tuesday,...,5,2016,30,4,4,2,0,0,0,1.679688
420170,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1944,2016-05-25,11617,Wednesday,...,5,2016,30,4,4,2,0,0,0,1.679688
420171,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1945,2016-05-26,11617,Thursday,...,5,2016,30,4,4,2,0,0,0,1.679688
420172,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1946,2016-05-27,11617,Friday,...,5,2016,30,4,4,2,0,0,0,1.679688


In [15]:
# Apply the optimized memory reduction function to each dataframe
filtered_train_set = reduce_mem_usage(filtered_train_set)
filtered_validation_set = reduce_mem_usage(filtered_validation_set)
filtered_prediction_set = reduce_mem_usage(filtered_prediction_set)

Memory usage reduced to  1.49 Mb (-1.1% reduction)
Memory usage reduced to  1.33 Mb (-0.0% reduction)
Memory usage reduced to  1.33 Mb (-0.0% reduction)


In [16]:
filtered_train_set

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
15006,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1,53,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,1.480469
45496,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_2,46,2011-01-30,11101,...,1,2011,30,4,4,2,0,0,0,1.480469
75986,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_3,42,2011-01-31,11101,...,1,2011,30,4,4,2,0,0,0,1.480469
106476,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_4,45,2011-02-01,11101,...,2,2011,30,4,4,2,1,1,0,1.480469
136966,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_5,28,2011-02-02,11101,...,2,2011,30,4,4,2,1,0,1,1.480469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58189926,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1909,45,2016-04-20,11612,...,4,2016,30,4,4,2,0,0,0,1.679688
58220416,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1910,59,2016-04-21,11612,...,4,2016,30,4,4,2,0,0,0,1.679688
58250906,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1911,50,2016-04-22,11612,...,4,2016,30,4,4,2,0,0,0,1.679688
58281396,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1912,56,2016-04-23,11613,...,4,2016,30,4,4,2,0,0,0,1.679688


In [17]:
filtered_validation_set

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
58342376,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1914,61,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,1.679688
58372866,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1915,47,2016-04-26,11613,...,4,2016,30,4,4,2,0,0,0,1.679688
58403356,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1916,66,2016-04-27,11613,...,4,2016,30,4,4,2,0,0,0,1.679688
58433846,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1917,35,2016-04-28,11613,...,4,2016,30,4,4,2,0,0,0,1.679688
58464336,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1918,70,2016-04-29,11613,...,4,2016,30,4,4,2,0,0,0,1.679688
58494826,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1919,51,2016-04-30,11614,...,4,2016,21,2,4,2,0,0,0,1.679688
58525316,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1920,44,2016-05-01,11614,...,5,2016,20,2,4,2,1,1,0,1.679688
58555806,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1921,42,2016-05-02,11614,...,5,2016,30,4,4,2,1,0,1,1.679688
58586296,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1922,37,2016-05-03,11614,...,5,2016,30,4,4,2,1,1,1,1.679688
58616786,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1923,60,2016-05-04,11614,...,5,2016,30,4,4,2,1,0,0,1.679688


In [18]:
filtered_prediction_set

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,date,wm_yr_wk,weekday,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
420168,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1942,2016-05-23,11617,Monday,...,5,2016,30,4,4,2,0,0,0,1.679688
420169,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1943,2016-05-24,11617,Tuesday,...,5,2016,30,4,4,2,0,0,0,1.679688
420170,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1944,2016-05-25,11617,Wednesday,...,5,2016,30,4,4,2,0,0,0,1.679688
420171,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1945,2016-05-26,11617,Thursday,...,5,2016,30,4,4,2,0,0,0,1.679688
420172,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1946,2016-05-27,11617,Friday,...,5,2016,30,4,4,2,0,0,0,1.679688
420173,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1947,2016-05-28,11618,Saturday,...,5,2016,30,4,4,2,0,0,0,1.679688
420174,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1948,2016-05-29,11618,Sunday,...,5,2016,30,4,4,2,0,0,0,1.679688
420175,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1949,2016-05-30,11618,Monday,...,5,2016,14,1,4,2,0,0,0,1.679688
420176,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1950,2016-05-31,11618,Tuesday,...,5,2016,30,4,4,2,0,0,0,1.679688
420177,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1951,2016-06-01,11618,Wednesday,...,6,2016,30,4,4,2,1,1,0,1.679688


# Multi-step Prediction Recursive

In [203]:
window_size = 60
forecast_horizon = 28  # Number of days to predict

In [204]:
# Placeholder to store forecasts for each 'id'
forecasts = {}

In [205]:
# Loop through each unique 'id'
for idx, unique_id in enumerate(filtered_train_set['id'].unique(), 1):
    # Print progress every 1,000 unique 'id's
    if idx % 1000 == 0:
        print(f"Processing ID {idx}/{filtered_train_set['id'].nunique()}")

    # Filter data for the current 'id' in both train and validation sets
    df_id_train = filtered_train_set[filtered_train_set['id'] == unique_id]
    df_id_val = filtered_validation_set[filtered_validation_set['id'] == unique_id]
    sales_series = df_id_train['sales'].values
    exogenous_vars_train = df_id_train[['wm_yr_wk', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'event_name_1', 
                                        'event_type_1', 'event_name_2', 'event_type_2']]
    exogenous_vars_val = df_id_val[['wm_yr_wk', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'event_name_1', 
                                    'event_type_1', 'event_name_2', 'event_type_2']]

    # Function to create the lagged data with exogenous variables
    def create_xy(series, exog, window_size, prediction_horizon):
        x, y = [], []
        for i in range(len(series) - window_size - prediction_horizon + 1):
            x.append(np.concatenate([series[i:i+window_size], exog.iloc[i + window_size - 1].values]))
            y.append(series[i+window_size:i+window_size+prediction_horizon])
        return np.array(x), np.array(y)

    # Prepare training data
    train_x, train_y = create_xy(sales_series, exogenous_vars_train, window_size, 1)
    train_y = train_y.flatten()
    
    params = {
        'n_estimators': 3000,
        'max_depth': 8,
        'num_leaves': 2**5,
        'learning_rate': 0.5,
        'boosting_type': 'dart',
        'verbose': -1
    }

    model = lgb.LGBMRegressor()
    model.fit(train_x, train_y, eval_metric='l2') # RMSE (l2 or rmse): Preferred when large errors are more critical and need to be penalized.

    # Recursive forecasting
    recursive_x = np.concatenate([train_x[-1, :window_size], exogenous_vars_val.iloc[0].values])
    forecast_id = []
    for i in range(forecast_horizon):
        pred = model.predict(recursive_x.reshape((1, -1)))
        forecast_id.append(pred[0])

        # Update recursive_x by removing  the oldest sales value and adding the new prediction,
        # along with shifting the exogenous variable for the next time step.
        recursive_x = np.concatenate([recursive_x[1:window_size], pred, exogenous_vars_val.iloc[i].values])

    # Store forecast for the current 'id'
    forecasts[unique_id] = forecast_id

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002830 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6087
[LightGBM] [Info] Number of data points in the train set: 1853, number of used features: 67
[LightGBM] [Info] Start training from score 58.889908


<div class ="alert alert-block alert-warning">
    
- Training time 2.5 hours for 30,490 series with 414 days
- Training time 20 secs for 1 series with 1913 days

In [206]:
# Convert the forecasts dictionary to a DataFrame with each row as a unique 'id' and each column as a day
forecasts_df = pd.DataFrame.from_dict(forecasts, orient='index')

# Set the columns as forecast dates
start_dates = train_set.groupby('id')['date'].max()  # Get the last date for each 'id' in the training set
forecast_dates = {unique_id: pd.date_range(start=start_dates[unique_id] + pd.Timedelta(days=1), periods=forecast_horizon)
                  for unique_id in forecasts.keys()}

# Apply the dates as columns based on the forecast horizon
forecast_dates_series = pd.date_range(start=start_dates[unique_id] + pd.Timedelta(days=1), periods=forecast_horizon)
forecasts_df.columns = forecast_dates_series

# Display results
forecasts_df

Unnamed: 0,2016-04-25,2016-04-26,2016-04-27,2016-04-28,2016-04-29,2016-04-30,2016-05-01,2016-05-02,2016-05-03,2016-05-04,...,2016-05-13,2016-05-14,2016-05-15,2016-05-16,2016-05-17,2016-05-18,2016-05-19,2016-05-20,2016-05-21,2016-05-22
FOODS_3_586_TX_1_evaluation,58.488994,56.226283,51.655952,51.345677,50.753118,44.182058,52.906437,54.876271,49.612112,48.102825,...,44.802321,45.268855,56.739808,65.723078,57.305754,55.595725,58.995163,64.392675,58.647613,59.161675


# Plotly plot against actual validation data for one sample

In [207]:
import plotly.graph_objects as go

In [208]:
# Define the specific sample id to plot
sample_id = "FOODS_3_586_TX_1_evaluation"

In [209]:
# Extract predicted values from forecasts_df
predicted_values = forecasts_df.loc[sample_id]

# Extract actual values from validation_set for the forecast horizon dates
actual_values = validation_set[validation_set['id'] == sample_id].set_index('date')['sales'].reindex(forecasts_df.columns)

# Create a Plotly figure
fig = go.Figure()

# Add the predicted sales line
fig.add_trace(go.Scatter(
    x=forecasts_df.columns,
    y=predicted_values,
    mode='lines',
    name="Predicted Sales",
    line=dict(dash='dash', color='blue')
))

# Add the actual sales line
fig.add_trace(go.Scatter(
    x=forecasts_df.columns,
    y=actual_values,
    mode='lines',
    name="Actual Sales",
    line=dict(color='red')
))

# Update layout
fig.update_layout(
    title=f"Forecasted vs Actual Sales for {sample_id}",
    xaxis_title="Date",
    yaxis_title="Sales",
    legend_title="Sales Type",
    template="plotly_dark"
)

# Show the figure
fig.show()


In [210]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Calculate evaluation metrics
mse = mean_squared_error(actual_values, predicted_values)
mae = mean_absolute_error(actual_values, predicted_values)
mape = np.mean(np.abs((actual_values - predicted_values) / actual_values)) * 100
rmse = np.sqrt(mse)

# Calculate RMSSE (Root Mean Squared Scaled Error)
# This requires the mean squared error of the in-sample data (e.g., training data)
# Assuming 'train_sales' is a series of historical actual sales data before the forecast period
train_sales = validation_set[validation_set['id'] == sample_id].set_index('date')['sales'][:len(actual_values)]  # Replace with appropriate in-sample data

# Compute mean squared error for historical sales to calculate RMSSE
historical_mse = mean_squared_error(train_sales[1:], train_sales[:-1])  # lagged values for naive forecast
rmsse = rmse / np.sqrt(historical_mse)

# Print the metrics
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"MAPE: {mape}%")
print(f"RMSE: {rmse}")
print(f"RMSSE: {rmsse}")


MSE: 143.35807101818932
MAE: 10.199355809985976
MAPE: 18.9601773847509%
RMSE: 11.973223083956523
RMSSE: 0.7935908986361697


<div class ="alert alert-block alert-warning">
    
Following codes are not used since we just predict over one series in the validation set

# Predict for Prediction set

In [None]:
window_size = 365
forecast_horizon = 28  # Number of days to predict

In [None]:
# Merge train_set and validation_set for a full history of sales data
full_history_set = pd.concat([train_set, validation_set]).reset_index(drop=True)

In [None]:
# Placeholder to store forecasts for each 'id'
forecasts = {}

In [None]:
# Loop through each unique 'id' in the prediction_set
for idx, unique_id in enumerate(prediction_set['id'].unique(), 1):
    # Print progress every 1,000 unique 'id's
    if idx % 1000 == 0:
        print(f"Processing ID {idx}/{prediction_set['id'].nunique()}")

    # Get the last window of data from full_history_set for the current 'id'
    history_window = full_history_set[full_history_set['id'] == unique_id].tail(window_size)['sales'].values

    # Ensure history_window has the expected length (window_size)
    if len(history_window) < window_size:
        print(f"Warning: Not enough data for {unique_id} to create a full window size.")
        continue

    # Get the exogenous variables for the current 'id' in prediction_set
    exogenous_vars_pred = prediction_set[prediction_set['id'] == unique_id][['wm_yr_wk', 'snap_CA', 'snap_TX', 
                                                                             'snap_WI', 'sell_price', 'event_name_1', 
                                                                             'event_type_1', 'event_name_2', 
                                                                             'event_type_2']].values

    # Initialize recursive_x with the last window of sales from the full history set
    recursive_x = np.concatenate([history_window, exogenous_vars_pred[0]])

    # Placeholder for predictions for this 'id'
    forecast_id = []

    # Recursive forecasting for each day in prediction_set
    for i in range(forecast_horizon):
        # Predict using the model
        pred = model.predict(recursive_x.reshape((1, -1)))
        forecast_id.append(pred[0])

        # Update recursive_x by removing the oldest sales value and adding the new prediction,
        # along with updating exogenous variables for the next day in prediction_set
        recursive_x = np.concatenate([recursive_x[1:window_size], pred, exogenous_vars_pred[i]])

    # Store the forecast for the current 'id'
    forecasts[unique_id] = forecast_id

<div class ="alert alert-block alert-warning">
    
- Prediction time 0.5 hours for 30,490 series on 28 days

In [None]:
# Convert the forecasts dictionary to a DataFrame with each row as a unique 'id' and each column as a day
predictions_df = pd.DataFrame.from_dict(forecasts, orient='index')

# Set the columns as forecast dates
start_dates = validation_set.groupby('id')['date'].max()  # Get the last date for each 'id' in the validation set
forecast_dates_series = pd.date_range(start=start_dates.iloc[0] + pd.Timedelta(days=1), periods=forecast_horizon)
predictions_df.columns = forecast_dates_series

# Display the results
predictions_df

# Submission

In [None]:
# Reset index to make 'id' a column instead of an index
forecasts_df = forecasts_df.reset_index()

# Rename the 'index' column to 'id'
forecasts_df = forecasts_df.rename(columns={'index': 'id'})

# Rename forecast columns
forecasts_df.columns = ['id'] + [f'F{i}' for i in range(1, 29)]

# Modify 'id' values for submission
forecasts_df['id'] = forecasts_df['id'].str.replace('evaluation', 'validation')

In [None]:
# Reset index to make 'id' a column instead of an index
predictions_df = predictions_df.reset_index()

# Rename the 'index' column to 'id'
predictions_df = predictions_df.rename(columns={'index': 'id'})

# Rename forecast columns
predictions_df.columns = ['id'] + [f'F{i}' for i in range(1, 29)]

In [None]:
# Combine validation and evaluation for submission
submit = pd.concat([forecasts_df, predictions_df]).reset_index(drop=True)
submit.to_csv('submission.csv', index=False)

In [None]:
submit.head(10)

In [None]:
submit.tail(10)