<div class ="alert alert-block alert-warning">
    
- This notebook aims to apply ideas (Recursive vs Direct from this Handbook https://phdinds-aim.github.io/time_series_handbook/08_WinningestMethods/lightgbm_m5_forecasting.html#recursive-forecasting
    
- Main changes include:
    - Apply on individual item than aggregated store time series
    - take into account exogenous variables provided

# Import Libraries

In [1]:
import gc
import os
import psutil

import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
import lightgbm as lgb
import warnings  # To suppress warnings
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

import random  # For generating random numbers

# Function to set a fixed random seed for reproducibility
def seed_everything(seed):
    np.random.seed(seed)  # Set numpy random seed
    random.seed(seed)  # Set built-in random seed

seed_everything(seed=2024)  # Set the seed to 2024


# Load Dataset

In [2]:
calendar = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/calendar.csv")  # Load calendar dataset
print(f"len(calendar):{len(calendar)}")  # Print the number of rows in calendar
calendar

len(calendar):1969


Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,2016-06-15,11620,Wednesday,5,6,2016,d_1965,,,,,0,1,1
1965,2016-06-16,11620,Thursday,6,6,2016,d_1966,,,,,0,0,0
1966,2016-06-17,11620,Friday,7,6,2016,d_1967,,,,,0,0,0
1967,2016-06-18,11621,Saturday,1,6,2016,d_1968,,,,,0,0,0


In [3]:
sales_train_evaluation = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv")
print(f"len(sales_train_evaluation): {len(sales_train_evaluation)}")
sales_train_evaluation.head()

len(sales_train_evaluation): 30490


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0


In [4]:
sell_prices = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sell_prices.csv")
print(f"len(sell_prices):{len(sell_prices)}")
sell_prices

len(sell_prices):6841121


Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26
...,...,...,...,...
6841116,WI_3,FOODS_3_827,11617,1.00
6841117,WI_3,FOODS_3_827,11618,1.00
6841118,WI_3,FOODS_3_827,11619,1.00
6841119,WI_3,FOODS_3_827,11620,1.00


In [5]:
# Enhanced memory optimization function with object datatype handling
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2  # Initial memory usage in MB
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:  # Downcast numerics
            c_min, c_max = df[col].min(), df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
        elif col_type == 'object':  # Handle object types
            if col == 'date':  # Convert date column to datetime
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')  # Convert other object types to category
    end_mem = df.memory_usage().sum() / 1024**2  # Final memory usage in MB
    if verbose:
        print(f'Memory usage reduced to {end_mem:5.2f} Mb ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    return df

In [6]:
# Apply the optimized memory reduction function to each dataframe
calendar = reduce_mem_usage(calendar)
sell_prices = reduce_mem_usage(sell_prices)
sales_train_evaluation = reduce_mem_usage(sales_train_evaluation)

Memory usage reduced to  0.13 Mb (40.4% reduction)
Memory usage reduced to 45.76 Mb (78.1% reduction)
Memory usage reduced to 96.30 Mb (78.7% reduction)


# Convert Sales Data to Long format

In [7]:
# Specify day columns up to d_1941 for the extended dataset
d_cols_eval = [f"d_{i}" for i in range(1, 1942)]
sales_train_evaluation_long = sales_train_evaluation.melt(
    id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
    value_vars=d_cols_eval,
    var_name="d",
    value_name="sales"
)
print(f"len(sales_train_evaluation_long): {len(sales_train_evaluation_long)}")
sales_train_evaluation_long.head()

len(sales_train_evaluation_long): 59181090


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0


# Label Endcode Calendar event

In [8]:
# Encode event-related features in the calendar dataframe
event_columns = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
le = LabelEncoder()
for col in event_columns:
    calendar[col] = le.fit_transform(calendar[col].astype(str))

# Merge with Calender data

In [9]:
sales_train_evaluation_long = sales_train_evaluation_long.merge(calendar, on="d", how="left")
sales_train_evaluation_long.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,30,4,4,2,0,0,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,30,4,4,2,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,30,4,4,2,0,0,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,30,4,4,2,0,0,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,30,4,4,2,0,0,0


# Merge with Price data

In [10]:
sales_train_evaluation_long = sales_train_evaluation_long.merge(
    sell_prices, 
    on=["store_id", "item_id", "wm_yr_wk"], 
    how="left"
)
sales_train_evaluation_long.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,


# Define Train, Validation and Evaluation sets

In [11]:
train_set = sales_train_evaluation_long[sales_train_evaluation_long['d'].isin([f'd_{i}' for i in range(1500, 1914)])]
print("Train set sample:")
print(f"len(Train set): {len(train_set)}")
train_set.head()

Train set sample:
len(Train set): 12622860


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
45704510,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,0,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,8.257812
45704511,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,0,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,3.970703
45704512,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,0,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,2.970703
45704513,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,10,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,4.640625
45704514,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,2,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,2.880859


In [12]:
# Filter the validation set for d_1914 to d_1941
validation_set = sales_train_evaluation_long[sales_train_evaluation_long['d'].isin([f'd_{i}' for i in range(1914, 1942)])]
print("Validation set sample:")
print(f"len(Validation set): {len(validation_set)}")
validation_set.head()


Validation set sample:
len(Validation set): 853720


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
58327370,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,8.382812
58327371,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,3.970703
58327372,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,2.970703
58327373,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,4.640625
58327374,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,1,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,2.880859


In [13]:
# Create a new DataFrame for the prediction period (d_1942 to d_1969)
forecast_days = [f'd_{i}' for i in range(1942, 1970)]
forecast_df = pd.DataFrame({'d': forecast_days})

# Generate one entry per product-store combination for each forecast day
prediction_set = sales_train_evaluation[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']].drop_duplicates()
prediction_set = prediction_set.merge(forecast_df, how='cross')

# Merge with calendar data to add date information for the forecast period
prediction_set = prediction_set.merge(calendar, on='d', how='left')

# Merge with sell_prices to add price data, aligning with the correct store, item, and week
prediction_set = prediction_set.merge(sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

print("Prediction set sample:")
print(f"len(Prediction set): {len(prediction_set)}")
prediction_set.head()


Prediction set sample:
len(Prediction set): 853720


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,date,wm_yr_wk,weekday,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,2016-05-23,11617,Monday,...,5,2016,30,4,4,2,0,0,0,8.382812
1,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1943,2016-05-24,11617,Tuesday,...,5,2016,30,4,4,2,0,0,0,8.382812
2,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1944,2016-05-25,11617,Wednesday,...,5,2016,30,4,4,2,0,0,0,8.382812
3,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1945,2016-05-26,11617,Thursday,...,5,2016,30,4,4,2,0,0,0,8.382812
4,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1946,2016-05-27,11617,Friday,...,5,2016,30,4,4,2,0,0,0,8.382812


In [14]:
# Apply the optimized memory reduction function to each dataframe
train_set = reduce_mem_usage(train_set)
validation_set = reduce_mem_usage(validation_set)
prediction_set = reduce_mem_usage(prediction_set)

Memory usage reduced to 531.02 Mb (43.5% reduction)
Memory usage reduced to 36.34 Mb (44.0% reduction)
Memory usage reduced to 28.20 Mb (50.3% reduction)


In [15]:
train_set

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
45704510,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,0,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,8.257812
45704511,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,0,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,3.970703
45704512,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,0,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,2.970703
45704513,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,10,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,4.640625
45704514,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,2,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,2.880859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58327365,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1913,1,2016-04-24,11613,...,4,2016,30,4,4,2,0,0,0,2.980469
58327366,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1913,0,2016-04-24,11613,...,4,2016,30,4,4,2,0,0,0,2.480469
58327367,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1913,0,2016-04-24,11613,...,4,2016,30,4,4,2,0,0,0,3.980469
58327368,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1913,3,2016-04-24,11613,...,4,2016,30,4,4,2,0,0,0,1.280273


In [16]:
validation_set

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
58327370,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,8.382812
58327371,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,3.970703
58327372,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,2.970703
58327373,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,4.640625
58327374,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,1,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,2.880859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59181085,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1941,1,2016-05-22,11617,...,5,2016,30,4,4,2,0,0,0,2.980469
59181086,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,...,5,2016,30,4,4,2,0,0,0,2.480469
59181087,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1941,2,2016-05-22,11617,...,5,2016,30,4,4,2,0,0,0,3.980469
59181088,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,...,5,2016,30,4,4,2,0,0,0,1.280273


In [17]:
prediction_set

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,date,wm_yr_wk,weekday,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,2016-05-23,11617,Monday,...,5,2016,30,4,4,2,0,0,0,8.382812
1,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1943,2016-05-24,11617,Tuesday,...,5,2016,30,4,4,2,0,0,0,8.382812
2,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1944,2016-05-25,11617,Wednesday,...,5,2016,30,4,4,2,0,0,0,8.382812
3,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1945,2016-05-26,11617,Thursday,...,5,2016,30,4,4,2,0,0,0,8.382812
4,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1946,2016-05-27,11617,Friday,...,5,2016,30,4,4,2,0,0,0,8.382812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853715,FOODS_3_827_WI_3_evaluation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,d_1965,2016-06-15,11620,Wednesday,...,6,2016,30,4,4,2,0,1,1,1.000000
853716,FOODS_3_827_WI_3_evaluation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,d_1966,2016-06-16,11620,Thursday,...,6,2016,30,4,4,2,0,0,0,1.000000
853717,FOODS_3_827_WI_3_evaluation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,d_1967,2016-06-17,11620,Friday,...,6,2016,30,4,4,2,0,0,0,1.000000
853718,FOODS_3_827_WI_3_evaluation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,d_1968,2016-06-18,11621,Saturday,...,6,2016,30,4,4,2,0,0,0,1.000000


# Multi-step Prediction Recursive

In [34]:
window_size = 365
forecast_horizon = 28  # Number of days to predict

In [35]:
# Placeholder to store forecasts for each 'id'
forecasts = {}

In [36]:
# Loop through each unique 'id'
for idx, unique_id in enumerate(train_set['id'].unique(), 1):
    # Print progress every 1,000 unique 'id's
    if idx % 1000 == 0:
        print(f"Processing ID {idx}/{train_set['id'].nunique()}")

    # Filter data for the current 'id' in both train and validation sets
    df_id_train = train_set[train_set['id'] == unique_id]
    df_id_val = validation_set[validation_set['id'] == unique_id]
    sales_series = df_id_train['sales'].values
    exogenous_vars_train = df_id_train[['wm_yr_wk', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'event_name_1', 
                                        'event_type_1', 'event_name_2', 'event_type_2']]
    exogenous_vars_val = df_id_val[['wm_yr_wk', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'event_name_1', 
                                    'event_type_1', 'event_name_2', 'event_type_2']]

    # Function to create the lagged data with exogenous variables
    def create_xy(series, exog, window_size, prediction_horizon):
        x, y = [], []
        for i in range(len(series) - window_size - prediction_horizon + 1):
            x.append(np.concatenate([series[i:i+window_size], exog.iloc[i + window_size - 1].values]))
            y.append(series[i+window_size:i+window_size+prediction_horizon])
        return np.array(x), np.array(y)

    # Prepare training data
    train_x, train_y = create_xy(sales_series, exogenous_vars_train, window_size, 1)
    train_y = train_y.flatten()

    # Define and train the model
    params = {
        'n_estimators': 2000,
        'max_depth': 4,
        'num_leaves': 2**4,
        'learning_rate': 0.05,
        'boosting_type': 'dart',
        'verbose': -1
    }
    model = lgb.LGBMRegressor(first_metric_only=True, **params)
    model.fit(train_x, train_y, eval_metric='l2') # RMSE (l2 or rmse): Preferred when large errors are more critical and need to be penalized.

    # Recursive forecasting
    recursive_x = np.concatenate([train_x[-1, :window_size], exogenous_vars_val.iloc[0].values])
    forecast_id = []
    for i in range(forecast_horizon):
        pred = model.predict(recursive_x.reshape((1, -1)))
        forecast_id.append(pred[0])

        # Update recursive_x by removing  the oldest sales value and adding the new prediction,
        # along with shifting the exogenous variable for the next time step.
        recursive_x = np.concatenate([recursive_x[1:window_size], pred, exogenous_vars_val.iloc[i].values])

    # Store forecast for the current 'id'
    forecasts[unique_id] = forecast_id

Processing ID 1000/30490
Processing ID 2000/30490
Processing ID 3000/30490
Processing ID 4000/30490
Processing ID 5000/30490
Processing ID 6000/30490
Processing ID 7000/30490
Processing ID 8000/30490
Processing ID 9000/30490
Processing ID 10000/30490
Processing ID 11000/30490
Processing ID 12000/30490
Processing ID 13000/30490
Processing ID 14000/30490
Processing ID 15000/30490
Processing ID 16000/30490
Processing ID 17000/30490
Processing ID 18000/30490
Processing ID 19000/30490
Processing ID 20000/30490
Processing ID 21000/30490
Processing ID 22000/30490
Processing ID 23000/30490
Processing ID 24000/30490
Processing ID 25000/30490
Processing ID 26000/30490
Processing ID 27000/30490
Processing ID 28000/30490
Processing ID 29000/30490
Processing ID 30000/30490


<div class ="alert alert-block alert-warning">
    
- Training time 2.5 hours for 30,490 series with 414 days

In [37]:
# Convert the forecasts dictionary to a DataFrame with each row as a unique 'id' and each column as a day
forecasts_df = pd.DataFrame.from_dict(forecasts, orient='index')

# Set the columns as forecast dates
start_dates = train_set.groupby('id')['date'].max()  # Get the last date for each 'id' in the training set
forecast_dates = {unique_id: pd.date_range(start=start_dates[unique_id] + pd.Timedelta(days=1), periods=forecast_horizon)
                  for unique_id in forecasts.keys()}

# Apply the dates as columns based on the forecast horizon
forecast_dates_series = pd.date_range(start=start_dates[unique_id] + pd.Timedelta(days=1), periods=forecast_horizon)
forecasts_df.columns = forecast_dates_series

# Display results
forecasts_df

Unnamed: 0,2016-04-25,2016-04-26,2016-04-27,2016-04-28,2016-04-29,2016-04-30,2016-05-01,2016-05-02,2016-05-03,2016-05-04,...,2016-05-13,2016-05-14,2016-05-15,2016-05-16,2016-05-17,2016-05-18,2016-05-19,2016-05-20,2016-05-21,2016-05-22
HOBBIES_1_001_CA_1_evaluation,0.782248,0.455901,1.669133,0.676679,0.941481,2.101522,1.013550,1.454617,0.517799,0.757577,...,1.343592,0.612894,0.779960,0.825427,1.901905,1.292891,1.203780,1.568171,1.470731,0.658084
HOBBIES_1_002_CA_1_evaluation,-0.027618,0.316051,0.154858,0.070327,0.405225,0.458061,0.215444,0.334559,0.304985,0.223936,...,0.274171,0.039894,0.148372,0.285043,0.103533,0.193095,0.382080,0.050914,0.236079,0.267731
HOBBIES_1_003_CA_1_evaluation,1.130868,1.116530,1.017858,1.026237,0.909253,1.125465,1.218235,1.068064,1.309365,0.861327,...,0.849537,0.814821,0.978350,0.661341,0.631093,1.079775,0.561674,0.662148,0.578904,0.500731
HOBBIES_1_004_CA_1_evaluation,2.212790,1.358741,1.518224,1.969709,2.794819,1.762337,3.052076,2.927147,1.621849,2.701683,...,-0.742399,1.131564,3.170281,3.507608,3.070291,0.544828,1.556279,2.302355,2.615113,1.047775
HOBBIES_1_005_CA_1_evaluation,3.438773,1.702650,0.144963,0.939545,2.894252,1.675452,0.682900,2.166750,1.399038,0.755092,...,2.486615,0.812637,1.909140,1.167258,0.726854,0.391678,1.562825,1.310004,0.619713,0.642890
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FOODS_3_823_WI_3_evaluation,0.830884,0.142162,-0.006358,0.623275,0.115823,-0.110091,0.343435,0.422073,-0.313811,-0.124163,...,0.539609,0.205840,0.717599,0.085916,0.571603,0.348262,-0.212544,0.014556,0.082289,-0.159099
FOODS_3_824_WI_3_evaluation,0.358796,0.358796,0.358796,0.358796,0.358796,0.358796,0.358796,0.358796,0.358796,0.358796,...,0.358796,0.358796,0.358796,0.358796,0.358796,0.358796,0.358796,0.358796,0.358796,0.358796
FOODS_3_825_WI_3_evaluation,0.135948,0.113975,0.632737,0.982392,0.798397,0.196859,0.406010,0.112614,-0.001302,1.600718,...,0.628576,2.115586,0.343650,1.274035,1.223728,-0.097607,-0.222730,0.304300,0.887260,0.893135
FOODS_3_826_WI_3_evaluation,2.654174,0.556416,1.478542,0.212940,1.571653,1.865221,0.761504,0.717136,-0.293051,0.872803,...,1.538639,1.372074,1.519919,1.433483,1.476114,0.174976,1.386906,0.928091,1.021090,0.041456


# Plotly plot against actual validation data for one sample

In [38]:
import plotly.graph_objects as go

In [39]:
# Define the specific sample id to plot
sample_id = "FOODS_3_586_TX_1_evaluation"

In [40]:
# Extract predicted values from forecasts_df
predicted_values = forecasts_df.loc[sample_id]

# Extract actual values from validation_set for the forecast horizon dates
actual_values = validation_set[validation_set['id'] == sample_id].set_index('date')['sales'].reindex(forecasts_df.columns)

# Create a Plotly figure
fig = go.Figure()

# Add the predicted sales line
fig.add_trace(go.Scatter(
    x=forecasts_df.columns,
    y=predicted_values,
    mode='lines',
    name="Predicted Sales",
    line=dict(dash='dash', color='blue')
))

# Add the actual sales line
fig.add_trace(go.Scatter(
    x=forecasts_df.columns,
    y=actual_values,
    mode='lines',
    name="Actual Sales",
    line=dict(color='red')
))

# Update layout
fig.update_layout(
    title=f"Forecasted vs Actual Sales for {sample_id}",
    xaxis_title="Date",
    yaxis_title="Sales",
    legend_title="Sales Type",
    template="plotly_dark"
)

# Show the figure
fig.show()


# Predict for Prediction set

In [41]:
window_size = 365
forecast_horizon = 28  # Number of days to predict

In [42]:
# Merge train_set and validation_set for a full history of sales data
full_history_set = pd.concat([train_set, validation_set]).reset_index(drop=True)

In [43]:
# Placeholder to store forecasts for each 'id'
forecasts = {}

In [44]:
# Loop through each unique 'id' in the prediction_set
for idx, unique_id in enumerate(prediction_set['id'].unique(), 1):
    # Print progress every 1,000 unique 'id's
    if idx % 1000 == 0:
        print(f"Processing ID {idx}/{prediction_set['id'].nunique()}")

    # Get the last window of data from full_history_set for the current 'id'
    history_window = full_history_set[full_history_set['id'] == unique_id].tail(window_size)['sales'].values

    # Ensure history_window has the expected length (window_size)
    if len(history_window) < window_size:
        print(f"Warning: Not enough data for {unique_id} to create a full window size.")
        continue

    # Get the exogenous variables for the current 'id' in prediction_set
    exogenous_vars_pred = prediction_set[prediction_set['id'] == unique_id][['wm_yr_wk', 'snap_CA', 'snap_TX', 
                                                                             'snap_WI', 'sell_price', 'event_name_1', 
                                                                             'event_type_1', 'event_name_2', 
                                                                             'event_type_2']].values

    # Initialize recursive_x with the last window of sales from the full history set
    recursive_x = np.concatenate([history_window, exogenous_vars_pred[0]])

    # Placeholder for predictions for this 'id'
    forecast_id = []

    # Recursive forecasting for each day in prediction_set
    for i in range(forecast_horizon):
        # Predict using the model
        pred = model.predict(recursive_x.reshape((1, -1)))
        forecast_id.append(pred[0])

        # Update recursive_x by removing the oldest sales value and adding the new prediction,
        # along with updating exogenous variables for the next day in prediction_set
        recursive_x = np.concatenate([recursive_x[1:window_size], pred, exogenous_vars_pred[i]])

    # Store the forecast for the current 'id'
    forecasts[unique_id] = forecast_id

Processing ID 1000/30490
Processing ID 2000/30490
Processing ID 3000/30490
Processing ID 4000/30490
Processing ID 5000/30490
Processing ID 6000/30490
Processing ID 7000/30490
Processing ID 8000/30490
Processing ID 9000/30490
Processing ID 10000/30490
Processing ID 11000/30490
Processing ID 12000/30490
Processing ID 13000/30490
Processing ID 14000/30490
Processing ID 15000/30490
Processing ID 16000/30490
Processing ID 17000/30490
Processing ID 18000/30490
Processing ID 19000/30490
Processing ID 20000/30490
Processing ID 21000/30490
Processing ID 22000/30490
Processing ID 23000/30490
Processing ID 24000/30490
Processing ID 25000/30490
Processing ID 26000/30490
Processing ID 27000/30490
Processing ID 28000/30490
Processing ID 29000/30490
Processing ID 30000/30490


<div class ="alert alert-block alert-warning">
    
- Prediction time 0.5 hours for 30,490 series on 28 days

In [45]:
# Convert the forecasts dictionary to a DataFrame with each row as a unique 'id' and each column as a day
predictions_df = pd.DataFrame.from_dict(forecasts, orient='index')

# Set the columns as forecast dates
start_dates = validation_set.groupby('id')['date'].max()  # Get the last date for each 'id' in the validation set
forecast_dates_series = pd.date_range(start=start_dates.iloc[0] + pd.Timedelta(days=1), periods=forecast_horizon)
predictions_df.columns = forecast_dates_series

# Display the results
predictions_df

Unnamed: 0,2016-05-23,2016-05-24,2016-05-25,2016-05-26,2016-05-27,2016-05-28,2016-05-29,2016-05-30,2016-05-31,2016-06-01,...,2016-06-10,2016-06-11,2016-06-12,2016-06-13,2016-06-14,2016-06-15,2016-06-16,2016-06-17,2016-06-18,2016-06-19
HOBBIES_1_001_CA_1_evaluation,1.980397,0.530571,1.226354,0.188450,1.267549,1.680870,1.143967,1.433481,0.495281,1.504835,...,1.681446,0.006544,0.727244,2.416531,1.587321,0.428737,0.337584,2.190479,1.826833,1.317635
HOBBIES_1_002_CA_1_evaluation,0.955882,0.511593,1.221729,0.497636,1.207439,-0.300720,1.214694,0.723970,0.464345,0.211648,...,0.934022,0.335461,0.820560,0.280921,0.650593,0.169749,0.129329,-0.053959,0.162760,0.208478
HOBBIES_1_003_CA_1_evaluation,2.240394,2.141529,2.068658,0.690953,2.081914,2.806723,1.166441,-0.075991,1.078939,1.200637,...,0.890696,-0.050910,0.756897,1.959062,0.622245,0.610250,1.389075,2.092345,1.548133,1.124962
HOBBIES_1_004_CA_1_evaluation,1.143426,1.871790,0.661431,0.427182,-0.063881,1.397020,0.831369,1.142681,0.854103,1.868831,...,1.732904,3.279754,1.040538,2.542188,1.442300,1.843285,0.201803,2.644576,2.119036,1.795769
HOBBIES_1_005_CA_1_evaluation,0.388488,2.876298,2.412286,1.912555,2.363682,2.457494,3.447490,2.029407,2.475473,1.809393,...,0.714755,2.025076,0.811412,2.812938,1.165901,1.597248,0.435642,1.202814,2.570834,1.865024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FOODS_3_823_WI_3_evaluation,0.436514,0.707037,0.730026,0.062172,0.012753,1.052196,0.461896,-0.040333,1.280602,0.494761,...,2.114573,1.480678,1.483870,2.399424,1.558212,1.149403,0.469259,1.278154,2.294383,1.235600
FOODS_3_824_WI_3_evaluation,0.178979,0.170542,0.178979,0.170542,0.178979,0.178979,0.080339,0.178979,0.178979,0.178979,...,0.178979,0.178979,0.178979,0.178979,0.178979,0.147806,0.147806,0.233490,0.264662,0.264662
FOODS_3_825_WI_3_evaluation,0.816308,1.992703,0.285126,0.385599,1.572382,1.732559,0.887913,0.572109,1.082971,0.848887,...,-0.395317,0.337904,-0.022360,0.246435,0.035120,-0.605123,0.306127,0.070470,-0.523121,-0.019869
FOODS_3_826_WI_3_evaluation,0.017362,0.926105,0.129746,-0.203134,-0.007093,-0.590270,0.262346,0.314624,0.054652,0.998326,...,1.570513,2.759549,2.529721,1.408354,3.648281,2.669120,4.015867,1.877135,3.226404,2.402357


# Submission

In [46]:
# Reset index to make 'id' a column instead of an index
forecasts_df = forecasts_df.reset_index()

# Rename the 'index' column to 'id'
forecasts_df = forecasts_df.rename(columns={'index': 'id'})

# Rename forecast columns
forecasts_df.columns = ['id'] + [f'F{i}' for i in range(1, 29)]

# Modify 'id' values for submission
forecasts_df['id'] = forecasts_df['id'].str.replace('evaluation', 'validation')

In [47]:
# Reset index to make 'id' a column instead of an index
predictions_df = predictions_df.reset_index()

# Rename the 'index' column to 'id'
predictions_df = predictions_df.rename(columns={'index': 'id'})

# Rename forecast columns
predictions_df.columns = ['id'] + [f'F{i}' for i in range(1, 29)]

In [48]:
# Combine validation and evaluation for submission
submit = pd.concat([forecasts_df, predictions_df]).reset_index(drop=True)
submit.to_csv('submission.csv', index=False)

In [49]:
submit.head(10)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.782248,0.455901,1.669133,0.676679,0.941481,2.101522,1.01355,1.454617,0.517799,...,1.343592,0.612894,0.77996,0.825427,1.901905,1.292891,1.20378,1.568171,1.470731,0.658084
1,HOBBIES_1_002_CA_1_validation,-0.027618,0.316051,0.154858,0.070327,0.405225,0.458061,0.215444,0.334559,0.304985,...,0.274171,0.039894,0.148372,0.285043,0.103533,0.193095,0.38208,0.050914,0.236079,0.267731
2,HOBBIES_1_003_CA_1_validation,1.130868,1.11653,1.017858,1.026237,0.909253,1.125465,1.218235,1.068064,1.309365,...,0.849537,0.814821,0.97835,0.661341,0.631093,1.079775,0.561674,0.662148,0.578904,0.500731
3,HOBBIES_1_004_CA_1_validation,2.21279,1.358741,1.518224,1.969709,2.794819,1.762337,3.052076,2.927147,1.621849,...,-0.742399,1.131564,3.170281,3.507608,3.070291,0.544828,1.556279,2.302355,2.615113,1.047775
4,HOBBIES_1_005_CA_1_validation,3.438773,1.70265,0.144963,0.939545,2.894252,1.675452,0.6829,2.16675,1.399038,...,2.486615,0.812637,1.90914,1.167258,0.726854,0.391678,1.562825,1.310004,0.619713,0.64289
5,HOBBIES_1_006_CA_1_validation,0.357206,-0.203251,0.10619,0.006897,1.19014,2.240412,0.174715,0.09541,0.853773,...,0.653893,0.364287,-0.016966,0.263916,1.774221,1.332227,0.522398,-0.544618,1.046478,-1.181438
6,HOBBIES_1_007_CA_1_validation,0.315797,0.315797,0.315797,0.315797,0.315797,0.315797,0.315797,0.315797,0.315797,...,0.315797,0.315797,0.315797,0.315797,0.315797,0.315797,0.315797,0.315797,0.315797,0.315797
7,HOBBIES_1_008_CA_1_validation,1.345513,2.835171,15.218437,3.115035,3.385611,11.497625,9.11081,5.27522,2.678988,...,7.036349,1.995715,8.745074,13.567831,14.555518,4.334463,7.75739,2.672015,3.190825,16.686775
8,HOBBIES_1_009_CA_1_validation,0.182943,0.852255,-0.458433,0.341033,0.094121,0.608918,2.444725,-1.252387,1.705635,...,1.516817,1.282665,1.095392,0.706824,-0.094376,0.524391,1.04814,-0.193125,0.852932,0.451984
9,HOBBIES_1_010_CA_1_validation,1.729947,0.35323,0.734746,0.293921,0.692769,0.354509,0.915571,0.301302,0.956357,...,0.3307,-0.100842,-0.290168,0.086382,0.114151,0.684334,0.235217,0.567173,-0.004749,0.694094


In [50]:
submit.tail(10)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
60970,FOODS_3_818_WI_3_evaluation,1.558231,1.392938,1.509451,1.343136,1.39033,1.40751,1.905492,1.175222,1.669918,...,1.745501,0.533029,0.936379,2.763236,1.86932,1.066934,0.632496,1.986724,1.314859,0.515441
60971,FOODS_3_819_WI_3_evaluation,2.377966,2.841628,1.387599,0.147418,2.134172,2.896292,2.47245,0.298078,1.345189,...,3.096625,2.979869,1.522781,2.582776,2.964734,2.871438,2.811801,3.670805,2.521023,2.879203
60972,FOODS_3_820_WI_3_evaluation,3.097917,1.446852,0.858761,0.897792,2.707168,1.641502,1.995014,0.227751,2.073503,...,1.471773,1.654033,2.233532,2.473018,1.513677,1.849445,2.357158,2.739611,2.137666,2.180766
60973,FOODS_3_821_WI_3_evaluation,2.831577,0.300685,1.535925,2.419958,3.381337,1.000773,0.868626,3.257639,2.982647,...,1.045644,3.757915,2.695115,1.932881,0.251565,2.332099,2.383989,1.685348,-0.236435,2.208345
60974,FOODS_3_822_WI_3_evaluation,1.862819,2.151741,1.631842,1.271022,2.73936,3.546274,1.801912,0.21022,2.433905,...,1.150871,0.945093,1.681074,0.812276,0.517012,-0.161258,0.538837,-0.401289,-0.175589,0.61154
60975,FOODS_3_823_WI_3_evaluation,0.436514,0.707037,0.730026,0.062172,0.012753,1.052196,0.461896,-0.040333,1.280602,...,2.114573,1.480678,1.48387,2.399424,1.558212,1.149403,0.469259,1.278154,2.294383,1.2356
60976,FOODS_3_824_WI_3_evaluation,0.178979,0.170542,0.178979,0.170542,0.178979,0.178979,0.080339,0.178979,0.178979,...,0.178979,0.178979,0.178979,0.178979,0.178979,0.147806,0.147806,0.23349,0.264662,0.264662
60977,FOODS_3_825_WI_3_evaluation,0.816308,1.992703,0.285126,0.385599,1.572382,1.732559,0.887913,0.572109,1.082971,...,-0.395317,0.337904,-0.02236,0.246435,0.03512,-0.605123,0.306127,0.07047,-0.523121,-0.019869
60978,FOODS_3_826_WI_3_evaluation,0.017362,0.926105,0.129746,-0.203134,-0.007093,-0.59027,0.262346,0.314624,0.054652,...,1.570513,2.759549,2.529721,1.408354,3.648281,2.66912,4.015867,1.877135,3.226404,2.402357
60979,FOODS_3_827_WI_3_evaluation,2.154709,2.328849,1.632464,1.527108,1.910996,1.874826,0.895391,2.063433,1.366429,...,-0.018188,2.020487,1.729348,0.928815,0.455801,1.750325,2.680653,1.462342,1.069866,2.011468
