<div class ="alert alert-block alert-warning">
    
- This notebook aims to apply ideas (Recursive vs Direct from this Handbook https://phdinds-aim.github.io/time_series_handbook/08_WinningestMethods/lightgbm_m5_forecasting.html#recursive-forecasting
    
- Main changes include:
    - Apply on individual item than aggregated store time series
    - take into account exogenous variables provided

# Import Libraries

In [1]:
import gc
import os
import psutil

import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
import lightgbm as lgb
import warnings  # To suppress warnings
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

import random  # For generating random numbers

# Function to set a fixed random seed for reproducibility
def seed_everything(seed):
    np.random.seed(seed)  # Set numpy random seed
    random.seed(seed)  # Set built-in random seed

seed_everything(seed=2024)  # Set the seed to 2024


# Load Dataset

In [2]:
calendar = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/calendar.csv")  # Load calendar dataset
print(f"len(calendar):{len(calendar)}")  # Print the number of rows in calendar
calendar

len(calendar):1969


Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,2016-06-15,11620,Wednesday,5,6,2016,d_1965,,,,,0,1,1
1965,2016-06-16,11620,Thursday,6,6,2016,d_1966,,,,,0,0,0
1966,2016-06-17,11620,Friday,7,6,2016,d_1967,,,,,0,0,0
1967,2016-06-18,11621,Saturday,1,6,2016,d_1968,,,,,0,0,0


In [3]:
sales_train_evaluation = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv")
print(f"len(sales_train_evaluation): {len(sales_train_evaluation)}")
sales_train_evaluation.head()

len(sales_train_evaluation): 30490


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0


In [4]:
sell_prices = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sell_prices.csv")
print(f"len(sell_prices):{len(sell_prices)}")
sell_prices

len(sell_prices):6841121


Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26
...,...,...,...,...
6841116,WI_3,FOODS_3_827,11617,1.00
6841117,WI_3,FOODS_3_827,11618,1.00
6841118,WI_3,FOODS_3_827,11619,1.00
6841119,WI_3,FOODS_3_827,11620,1.00


In [5]:
# Enhanced memory optimization function with object datatype handling
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2  # Initial memory usage in MB
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:  # Downcast numerics
            c_min, c_max = df[col].min(), df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
        elif col_type == 'object':  # Handle object types
            if col == 'date':  # Convert date column to datetime
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')  # Convert other object types to category
    end_mem = df.memory_usage().sum() / 1024**2  # Final memory usage in MB
    if verbose:
        print(f'Memory usage reduced to {end_mem:5.2f} Mb ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    return df

In [6]:
# Apply the optimized memory reduction function to each dataframe
calendar = reduce_mem_usage(calendar)
sell_prices = reduce_mem_usage(sell_prices)
sales_train_evaluation = reduce_mem_usage(sales_train_evaluation)

Memory usage reduced to  0.13 Mb (40.4% reduction)
Memory usage reduced to 45.76 Mb (78.1% reduction)
Memory usage reduced to 96.30 Mb (78.7% reduction)


# Convert Sales Data to Long format

In [7]:
# Specify day columns up to d_1941 for the extended dataset
d_cols_eval = [f"d_{i}" for i in range(1, 1942)]
sales_train_evaluation_long = sales_train_evaluation.melt(
    id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
    value_vars=d_cols_eval,
    var_name="d",
    value_name="sales"
)
print(f"len(sales_train_evaluation_long): {len(sales_train_evaluation_long)}")
sales_train_evaluation_long.head()

len(sales_train_evaluation_long): 59181090


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0


# Label Endcode Calendar event

In [8]:
# Encode event-related features in the calendar dataframe
event_columns = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
le = LabelEncoder()
for col in event_columns:
    calendar[col] = le.fit_transform(calendar[col].astype(str))

# Merge with Calender data

In [9]:
sales_train_evaluation_long = sales_train_evaluation_long.merge(calendar, on="d", how="left")
sales_train_evaluation_long.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,30,4,4,2,0,0,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,30,4,4,2,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,30,4,4,2,0,0,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,30,4,4,2,0,0,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,30,4,4,2,0,0,0


# Merge with Price data

In [10]:
sales_train_evaluation_long = sales_train_evaluation_long.merge(
    sell_prices, 
    on=["store_id", "item_id", "wm_yr_wk"], 
    how="left"
)
sales_train_evaluation_long.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,30,4,4,2,0,0,0,


# Define Train, Validation and Evaluation sets

In [11]:
train_set = sales_train_evaluation_long[sales_train_evaluation_long['d'].isin([f'd_{i}' for i in range(1500, 1914)])]
print("Train set sample:")
print(f"len(Train set): {len(train_set)}")
train_set.head()

Train set sample:
len(Train set): 12622860


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
45704510,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,0,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,8.257812
45704511,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,0,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,3.970703
45704512,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,0,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,2.970703
45704513,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,10,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,4.640625
45704514,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,2,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,2.880859


In [12]:
# Filter the validation set for d_1914 to d_1941
validation_set = sales_train_evaluation_long[sales_train_evaluation_long['d'].isin([f'd_{i}' for i in range(1914, 1942)])]
print("Validation set sample:")
print(f"len(Validation set): {len(validation_set)}")
validation_set.head()


Validation set sample:
len(Validation set): 853720


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
58327370,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,8.382812
58327371,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,3.970703
58327372,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,2.970703
58327373,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,4.640625
58327374,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,1,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,2.880859


In [13]:
# Create a new DataFrame for the prediction period (d_1942 to d_1969)
forecast_days = [f'd_{i}' for i in range(1942, 1970)]
forecast_df = pd.DataFrame({'d': forecast_days})

# Generate one entry per product-store combination for each forecast day
prediction_set = sales_train_evaluation[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']].drop_duplicates()
prediction_set = prediction_set.merge(forecast_df, how='cross')

# Merge with calendar data to add date information for the forecast period
prediction_set = prediction_set.merge(calendar, on='d', how='left')

# Merge with sell_prices to add price data, aligning with the correct store, item, and week
prediction_set = prediction_set.merge(sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

print("Prediction set sample:")
print(f"len(Prediction set): {len(prediction_set)}")
prediction_set.head()


Prediction set sample:
len(Prediction set): 853720


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,date,wm_yr_wk,weekday,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,2016-05-23,11617,Monday,...,5,2016,30,4,4,2,0,0,0,8.382812
1,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1943,2016-05-24,11617,Tuesday,...,5,2016,30,4,4,2,0,0,0,8.382812
2,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1944,2016-05-25,11617,Wednesday,...,5,2016,30,4,4,2,0,0,0,8.382812
3,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1945,2016-05-26,11617,Thursday,...,5,2016,30,4,4,2,0,0,0,8.382812
4,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1946,2016-05-27,11617,Friday,...,5,2016,30,4,4,2,0,0,0,8.382812


In [14]:
# Apply the optimized memory reduction function to each dataframe
train_set = reduce_mem_usage(train_set)
validation_set = reduce_mem_usage(validation_set)
prediction_set = reduce_mem_usage(prediction_set)

Memory usage reduced to 531.02 Mb (43.5% reduction)
Memory usage reduced to 36.34 Mb (44.0% reduction)
Memory usage reduced to 28.20 Mb (50.3% reduction)


In [15]:
train_set

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
45704510,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,0,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,8.257812
45704511,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,0,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,3.970703
45704512,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,0,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,2.970703
45704513,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,10,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,4.640625
45704514,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1500,2,2015-03-08,11506,...,3,2015,30,4,4,2,1,0,1,2.880859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58327365,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1913,1,2016-04-24,11613,...,4,2016,30,4,4,2,0,0,0,2.980469
58327366,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1913,0,2016-04-24,11613,...,4,2016,30,4,4,2,0,0,0,2.480469
58327367,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1913,0,2016-04-24,11613,...,4,2016,30,4,4,2,0,0,0,3.980469
58327368,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1913,3,2016-04-24,11613,...,4,2016,30,4,4,2,0,0,0,1.280273


In [16]:
validation_set

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
58327370,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,8.382812
58327371,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,3.970703
58327372,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,2.970703
58327373,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,4.640625
58327374,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,1,2016-04-25,11613,...,4,2016,30,4,4,2,0,0,0,2.880859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59181085,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1941,1,2016-05-22,11617,...,5,2016,30,4,4,2,0,0,0,2.980469
59181086,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,...,5,2016,30,4,4,2,0,0,0,2.480469
59181087,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1941,2,2016-05-22,11617,...,5,2016,30,4,4,2,0,0,0,3.980469
59181088,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,...,5,2016,30,4,4,2,0,0,0,1.280273


In [17]:
prediction_set

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,date,wm_yr_wk,weekday,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,2016-05-23,11617,Monday,...,5,2016,30,4,4,2,0,0,0,8.382812
1,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1943,2016-05-24,11617,Tuesday,...,5,2016,30,4,4,2,0,0,0,8.382812
2,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1944,2016-05-25,11617,Wednesday,...,5,2016,30,4,4,2,0,0,0,8.382812
3,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1945,2016-05-26,11617,Thursday,...,5,2016,30,4,4,2,0,0,0,8.382812
4,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1946,2016-05-27,11617,Friday,...,5,2016,30,4,4,2,0,0,0,8.382812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853715,FOODS_3_827_WI_3_evaluation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,d_1965,2016-06-15,11620,Wednesday,...,6,2016,30,4,4,2,0,1,1,1.000000
853716,FOODS_3_827_WI_3_evaluation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,d_1966,2016-06-16,11620,Thursday,...,6,2016,30,4,4,2,0,0,0,1.000000
853717,FOODS_3_827_WI_3_evaluation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,d_1967,2016-06-17,11620,Friday,...,6,2016,30,4,4,2,0,0,0,1.000000
853718,FOODS_3_827_WI_3_evaluation,FOODS_3_827,FOODS_3,FOODS,WI_3,WI,d_1968,2016-06-18,11621,Saturday,...,6,2016,30,4,4,2,0,0,0,1.000000


# Multi-step Prediction Recursive

In [18]:
window_size = 365
forecast_horizon = 28  # Number of days to predict

In [19]:
# Placeholder to store forecasts for each 'id'
forecasts = {}

In [24]:
# Loop through each unique 'id'
for idx, unique_id in enumerate(train_set['id'].unique(), 1):
    # Print progress every 1,000 unique 'id's
    if idx % 1000 == 0:
        print(f"Processing ID {idx}/{train_set['id'].nunique()}")

    # Filter data for the current 'id' in both train and validation sets
    df_id_train = train_set[train_set['id'] == unique_id]
    df_id_val = validation_set[validation_set['id'] == unique_id]
    sales_series = df_id_train['sales'].values
    exogenous_vars_train = df_id_train[['wm_yr_wk', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'event_name_1', 
                                        'event_type_1', 'event_name_2', 'event_type_2']]
    exogenous_vars_val = df_id_val[['wm_yr_wk', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'event_name_1', 
                                    'event_type_1', 'event_name_2', 'event_type_2']]

    # Function to create the lagged data with exogenous variables
    def create_xy(series, exog, window_size, prediction_horizon):
        x, y = [], []
        for i in range(len(series) - window_size - prediction_horizon + 1):
            x.append(np.concatenate([series[i:i+window_size], exog.iloc[i + window_size - 1].values]))
            y.append(series[i+window_size:i+window_size+prediction_horizon])
        return np.array(x), np.array(y)

    # Prepare training data
    train_x, train_y = create_xy(sales_series, exogenous_vars_train, window_size, 1)
    train_y = train_y.flatten()

    # Define and train the model
    params = {
        'n_estimators': 2000,
        'max_depth': 4,
        'num_leaves': 2**4,
        'learning_rate': 0.1,
        'boosting_type': 'dart',
        'verbose': -1
    }
    model = lgb.LGBMRegressor(first_metric_only=True, **params)
    model.fit(train_x, train_y, eval_metric='l1') 

    # Recursive forecasting
    recursive_x = np.concatenate([train_x[-1, :window_size], exogenous_vars_val.iloc[0].values])
    forecast_id = []
    for i in range(forecast_horizon):
        pred = model.predict(recursive_x.reshape((1, -1)))
        forecast_id.append(pred[0])

        # Update recursive_x by removing  the oldest sales value and adding the new prediction,
        # along with shifting the exogenous variable for the next time step.
        recursive_x = np.concatenate([recursive_x[1:window_size], pred, exogenous_vars_val.iloc[i].values])

    # Store forecast for the current 'id'
    forecasts[unique_id] = forecast_id

Processing ID 1000/30490
Processing ID 2000/30490
Processing ID 3000/30490
Processing ID 4000/30490
Processing ID 5000/30490
Processing ID 6000/30490
Processing ID 7000/30490
Processing ID 8000/30490
Processing ID 9000/30490
Processing ID 10000/30490
Processing ID 11000/30490
Processing ID 12000/30490
Processing ID 13000/30490
Processing ID 14000/30490
Processing ID 15000/30490
Processing ID 16000/30490
Processing ID 17000/30490
Processing ID 18000/30490
Processing ID 19000/30490
Processing ID 20000/30490
Processing ID 21000/30490
Processing ID 22000/30490
Processing ID 23000/30490
Processing ID 24000/30490
Processing ID 25000/30490
Processing ID 26000/30490
Processing ID 27000/30490
Processing ID 28000/30490
Processing ID 29000/30490
Processing ID 30000/30490


<div class ="alert alert-block alert-warning">
    
- Training time 2.5 hours for 30,490 series with 414 days

In [26]:
# Convert the forecasts dictionary to a DataFrame with each row as a unique 'id' and each column as a day
forecasts_df = pd.DataFrame.from_dict(forecasts, orient='index')

# Set the columns as forecast dates
start_dates = train_set.groupby('id')['date'].max()  # Get the last date for each 'id' in the training set
forecast_dates = {unique_id: pd.date_range(start=start_dates[unique_id] + pd.Timedelta(days=1), periods=forecast_horizon)
                  for unique_id in forecasts.keys()}

# Apply the dates as columns based on the forecast horizon
forecast_dates_series = pd.date_range(start=start_dates[unique_id] + pd.Timedelta(days=1), periods=forecast_horizon)
forecasts_df.columns = forecast_dates_series

# Display results
forecasts_df

Unnamed: 0,2016-04-25,2016-04-26,2016-04-27,2016-04-28,2016-04-29,2016-04-30,2016-05-01,2016-05-02,2016-05-03,2016-05-04,...,2016-05-13,2016-05-14,2016-05-15,2016-05-16,2016-05-17,2016-05-18,2016-05-19,2016-05-20,2016-05-21,2016-05-22
HOBBIES_1_001_CA_1_evaluation,0.896860,0.743664,1.799107,0.879383,0.797335,2.429927,0.917900,1.723976,0.332424,0.630405,...,1.266216,0.422858,0.845927,0.904761,2.193636,1.531346,1.349766,1.826801,1.617795,0.582744
HOBBIES_1_002_CA_1_evaluation,-0.037426,0.342138,0.166013,0.086315,0.478650,0.501046,0.245068,0.374460,0.369053,0.259028,...,0.273505,-0.001916,0.146119,0.324685,0.100000,0.214485,0.425174,0.071001,0.251275,0.299957
HOBBIES_1_003_CA_1_evaluation,1.198257,1.180051,1.039823,0.860706,0.880732,0.942748,1.260835,1.016968,1.375899,0.620891,...,0.651654,0.605649,0.919020,0.617061,0.715135,1.130323,0.452690,0.747335,0.844404,0.355035
HOBBIES_1_004_CA_1_evaluation,2.115262,1.249462,1.824596,2.381181,3.005254,1.460841,3.265046,2.971128,1.837193,3.277673,...,-0.942774,1.034544,3.647268,3.885244,3.184662,0.339811,1.817646,2.457074,2.508920,1.125987
HOBBIES_1_005_CA_1_evaluation,3.646475,1.869064,0.410620,0.773286,3.123956,1.473342,0.619252,2.146015,1.461428,0.461071,...,2.401506,0.673245,1.987814,1.194576,1.071755,0.168287,1.661688,1.550208,0.518443,0.655068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FOODS_3_823_WI_3_evaluation,0.927854,0.216142,-0.049001,0.716626,0.076834,-0.203675,0.457916,0.478570,-0.428976,-0.198558,...,0.584799,0.156273,0.787699,0.129066,0.657628,0.363182,-0.292536,-0.054630,0.043515,-0.223043
FOODS_3_824_WI_3_evaluation,0.361640,0.361640,0.361640,0.361640,0.361640,0.361640,0.361640,0.361640,0.361640,0.361640,...,0.361640,0.361640,0.361640,0.361640,0.361640,0.361640,0.361640,0.361640,0.361640,0.361640
FOODS_3_825_WI_3_evaluation,0.053666,-0.047116,0.475777,1.017185,1.109042,0.360602,0.255030,-0.396484,-0.014289,1.835619,...,0.988033,2.648449,0.804524,1.646532,1.543349,0.149547,-0.298178,0.126250,0.873423,1.334672
FOODS_3_826_WI_3_evaluation,2.803075,0.513823,1.611855,0.300031,1.754735,2.067423,0.965500,0.823828,-0.293262,0.841840,...,1.692842,1.454906,1.565227,1.460578,1.741406,-0.057347,1.556651,0.864577,1.156111,-0.019041


# Plotly plot against actual validation data for one sample

In [27]:
import plotly.graph_objects as go

In [31]:
# Define the specific sample id to plot
sample_id = "FOODS_3_586_TX_1_evaluation"

In [32]:
# Extract predicted values from forecasts_df
predicted_values = forecasts_df.loc[sample_id]

# Extract actual values from validation_set for the forecast horizon dates
actual_values = validation_set[validation_set['id'] == sample_id].set_index('date')['sales'].reindex(forecasts_df.columns)

# Create a Plotly figure
fig = go.Figure()

# Add the predicted sales line
fig.add_trace(go.Scatter(
    x=forecasts_df.columns,
    y=predicted_values,
    mode='lines',
    name="Predicted Sales",
    line=dict(dash='dash', color='blue')
))

# Add the actual sales line
fig.add_trace(go.Scatter(
    x=forecasts_df.columns,
    y=actual_values,
    mode='lines',
    name="Actual Sales",
    line=dict(color='red')
))

# Update layout
fig.update_layout(
    title=f"Forecasted vs Actual Sales for {sample_id}",
    xaxis_title="Date",
    yaxis_title="Sales",
    legend_title="Sales Type",
    template="plotly_dark"
)

# Show the figure
fig.show()


# Predict for Prediction set

In [34]:
window_size = 365
forecast_horizon = 28  # Number of days to predict

In [37]:
# Merge train_set and validation_set for a full history of sales data
full_history_set = pd.concat([train_set, validation_set]).reset_index(drop=True)

In [38]:
# Placeholder to store forecasts for each 'id'
forecasts = {}

In [39]:
# Loop through each unique 'id' in the prediction_set
for idx, unique_id in enumerate(prediction_set['id'].unique(), 1):
    # Print progress every 1,000 unique 'id's
    if idx % 1000 == 0:
        print(f"Processing ID {idx}/{prediction_set['id'].nunique()}")

    # Get the last window of data from full_history_set for the current 'id'
    history_window = full_history_set[full_history_set['id'] == unique_id].tail(window_size)['sales'].values

    # Ensure history_window has the expected length (window_size)
    if len(history_window) < window_size:
        print(f"Warning: Not enough data for {unique_id} to create a full window size.")
        continue

    # Get the exogenous variables for the current 'id' in prediction_set
    exogenous_vars_pred = prediction_set[prediction_set['id'] == unique_id][['wm_yr_wk', 'snap_CA', 'snap_TX', 
                                                                             'snap_WI', 'sell_price', 'event_name_1', 
                                                                             'event_type_1', 'event_name_2', 
                                                                             'event_type_2']].values

    # Initialize recursive_x with the last window of sales from the full history set
    recursive_x = np.concatenate([history_window, exogenous_vars_pred[0]])

    # Placeholder for predictions for this 'id'
    forecast_id = []

    # Recursive forecasting for each day in prediction_set
    for i in range(forecast_horizon):
        # Predict using the model
        pred = model.predict(recursive_x.reshape((1, -1)))
        forecast_id.append(pred[0])

        # Update recursive_x by removing the oldest sales value and adding the new prediction,
        # along with updating exogenous variables for the next day in prediction_set
        recursive_x = np.concatenate([recursive_x[1:window_size], pred, exogenous_vars_pred[i]])

    # Store the forecast for the current 'id'
    forecasts[unique_id] = forecast_id

Processing ID 1000/30490
Processing ID 2000/30490
Processing ID 3000/30490
Processing ID 4000/30490
Processing ID 5000/30490
Processing ID 6000/30490
Processing ID 7000/30490
Processing ID 8000/30490
Processing ID 9000/30490
Processing ID 10000/30490
Processing ID 11000/30490
Processing ID 12000/30490
Processing ID 13000/30490
Processing ID 14000/30490
Processing ID 15000/30490
Processing ID 16000/30490
Processing ID 17000/30490
Processing ID 18000/30490
Processing ID 19000/30490
Processing ID 20000/30490
Processing ID 21000/30490
Processing ID 22000/30490
Processing ID 23000/30490
Processing ID 24000/30490
Processing ID 25000/30490
Processing ID 26000/30490
Processing ID 27000/30490
Processing ID 28000/30490
Processing ID 29000/30490
Processing ID 30000/30490


<div class ="alert alert-block alert-warning">
    
- Prediction time 0.5 hours for 30,490 series on 28 days

In [40]:
# Convert the forecasts dictionary to a DataFrame with each row as a unique 'id' and each column as a day
predictions_df = pd.DataFrame.from_dict(forecasts, orient='index')

# Set the columns as forecast dates
start_dates = validation_set.groupby('id')['date'].max()  # Get the last date for each 'id' in the validation set
forecast_dates_series = pd.date_range(start=start_dates.iloc[0] + pd.Timedelta(days=1), periods=forecast_horizon)
predictions_df.columns = forecast_dates_series

# Display the results
predictions_df

Unnamed: 0,2016-05-23,2016-05-24,2016-05-25,2016-05-26,2016-05-27,2016-05-28,2016-05-29,2016-05-30,2016-05-31,2016-06-01,...,2016-06-10,2016-06-11,2016-06-12,2016-06-13,2016-06-14,2016-06-15,2016-06-16,2016-06-17,2016-06-18,2016-06-19
HOBBIES_1_001_CA_1_evaluation,1.759239,0.181754,0.968985,0.192730,1.230885,1.494630,1.134558,1.298377,0.206707,0.765715,...,1.010026,-0.284433,0.691259,2.006059,1.075714,0.388053,0.217778,1.855601,0.974828,0.936770
HOBBIES_1_002_CA_1_evaluation,0.870627,0.316195,0.858901,0.366225,1.104492,-0.563856,1.315251,0.901486,0.090949,-0.127926,...,1.036592,0.071268,0.560052,0.055292,0.373293,-0.045213,-0.041026,-0.388464,-0.322824,0.056779
HOBBIES_1_003_CA_1_evaluation,2.042634,2.180575,2.140569,0.575197,2.029289,2.620319,0.657481,-0.400414,0.992481,0.944535,...,0.506769,-0.256488,1.107457,1.999014,0.507849,0.506099,1.531006,2.037780,1.384524,1.071563
HOBBIES_1_004_CA_1_evaluation,1.187973,1.772150,0.887172,0.626501,-0.296346,1.271244,0.767849,1.192011,0.963758,1.991226,...,2.225026,3.423223,1.082705,2.304974,1.842021,1.611837,0.439279,2.979653,2.792632,1.814024
HOBBIES_1_005_CA_1_evaluation,0.386226,3.056662,2.277919,1.797884,2.415119,2.587564,3.679303,2.011487,2.258919,2.003639,...,0.870947,1.735492,0.785166,2.923574,1.079755,1.506807,0.849171,1.032197,2.579641,1.838061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FOODS_3_823_WI_3_evaluation,0.309111,0.360147,0.775272,-0.485882,0.071914,0.782581,0.419743,-0.251868,1.140253,0.395177,...,2.354522,1.030979,1.508661,2.533730,1.805721,1.377529,0.853489,1.060206,2.420045,1.343376
FOODS_3_824_WI_3_evaluation,-0.098704,-0.160038,-0.098704,-0.160038,-0.160038,-0.160038,-0.310387,-0.160038,-0.160038,-0.160038,...,-0.160038,-0.160038,-0.160038,-0.160038,-0.160038,-0.174761,-0.174761,-0.144131,-0.129408,-0.129408
FOODS_3_825_WI_3_evaluation,0.772765,1.814701,-0.212700,0.154961,1.271586,1.355094,0.418253,0.305926,0.311625,0.180128,...,-0.642649,0.029958,-0.416848,-0.080992,-0.183278,-0.966023,0.092726,-0.392832,-1.012384,-0.426488
FOODS_3_826_WI_3_evaluation,0.001014,1.099261,0.435773,-0.091697,-0.119767,-0.703036,0.339743,0.377836,-0.096056,0.849153,...,1.411584,2.553083,2.340068,1.446881,3.236645,2.653755,4.114277,1.687275,2.779421,2.423747


# Submission

In [41]:
# Reset index to make 'id' a column instead of an index
forecasts_df = forecasts_df.reset_index()

# Rename the 'index' column to 'id'
forecasts_df = forecasts_df.rename(columns={'index': 'id'})

# Rename forecast columns
forecasts_df.columns = ['id'] + [f'F{i}' for i in range(1, 29)]

# Modify 'id' values for submission
forecasts_df['id'] = forecasts_df['id'].str.replace('evaluation', 'validation')

In [42]:
# Reset index to make 'id' a column instead of an index
predictions_df = predictions_df.reset_index()

# Rename the 'index' column to 'id'
predictions_df = predictions_df.rename(columns={'index': 'id'})

# Rename forecast columns
predictions_df.columns = ['id'] + [f'F{i}' for i in range(1, 29)]

In [44]:
# Combine validation and evaluation for submission
submit = pd.concat([forecasts_df, predictions_df]).reset_index(drop=True)
submit.to_csv('submission.csv', index=False)

In [45]:
submit.head(10)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.89686,0.743664,1.799107,0.879383,0.797335,2.429927,0.9179,1.723976,0.332424,...,1.266216,0.422858,0.845927,0.904761,2.193636,1.531346,1.349766,1.826801,1.617795,0.582744
1,HOBBIES_1_002_CA_1_validation,-0.037426,0.342138,0.166013,0.086315,0.47865,0.501046,0.245068,0.37446,0.369053,...,0.273505,-0.001916,0.146119,0.324685,0.1,0.214485,0.425174,0.071001,0.251275,0.299957
2,HOBBIES_1_003_CA_1_validation,1.198257,1.180051,1.039823,0.860706,0.880732,0.942748,1.260835,1.016968,1.375899,...,0.651654,0.605649,0.91902,0.617061,0.715135,1.130323,0.45269,0.747335,0.844404,0.355035
3,HOBBIES_1_004_CA_1_validation,2.115262,1.249462,1.824596,2.381181,3.005254,1.460841,3.265046,2.971128,1.837193,...,-0.942774,1.034544,3.647268,3.885244,3.184662,0.339811,1.817646,2.457074,2.50892,1.125987
4,HOBBIES_1_005_CA_1_validation,3.646475,1.869064,0.41062,0.773286,3.123956,1.473342,0.619252,2.146015,1.461428,...,2.401506,0.673245,1.987814,1.194576,1.071755,0.168287,1.661688,1.550208,0.518443,0.655068
5,HOBBIES_1_006_CA_1_validation,0.279083,-0.551719,0.066367,-0.270517,1.264685,2.479624,-0.120195,-0.296355,0.71253,...,1.108306,0.38758,-0.61228,0.27118,1.825313,0.760886,-0.508873,-0.069271,1.335887,-2.066605
6,HOBBIES_1_007_CA_1_validation,0.317199,0.317199,0.317199,0.317199,0.317199,0.317199,0.317199,0.317199,0.317199,...,0.317199,0.317199,0.317199,0.317199,0.317199,0.317199,0.317199,0.317199,0.317199,0.317199
7,HOBBIES_1_008_CA_1_validation,1.099013,1.962817,14.880924,5.433526,3.23769,14.035446,10.092994,6.293682,3.780167,...,6.806573,3.111273,8.569112,11.2118,16.448706,3.918249,7.686645,1.058005,4.000542,20.039171
8,HOBBIES_1_009_CA_1_validation,0.124954,1.055959,-0.591114,0.457508,0.215951,0.515417,2.794622,-1.431809,1.971615,...,1.609532,1.517241,1.177979,0.926389,0.059689,0.609916,1.085001,-0.440772,0.86278,0.426215
9,HOBBIES_1_010_CA_1_validation,1.871106,0.262809,0.926503,0.568262,0.649374,0.226318,1.080133,0.05106,1.136688,...,0.44451,-0.236037,-0.51704,-0.016949,0.028394,0.595831,0.17779,0.736101,-0.105626,0.6225


In [46]:
submit.tail(10)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
60970,FOODS_3_818_WI_3_evaluation,1.609946,1.344119,1.687884,1.60809,1.378299,1.750722,2.063808,1.587872,1.998274,...,1.700788,1.066192,1.201423,3.378485,1.577408,1.63093,0.623353,2.325231,1.612607,1.294572
60971,FOODS_3_819_WI_3_evaluation,2.552474,2.873403,1.196139,0.237266,2.151935,3.060229,2.232291,0.589045,1.421613,...,3.38418,2.949152,1.402672,2.597705,3.248123,2.921052,2.702321,3.706242,2.613239,2.824249
60972,FOODS_3_820_WI_3_evaluation,3.45062,1.528736,0.865307,1.124285,2.911804,2.454471,2.13897,0.045297,2.17575,...,1.590011,1.71582,2.23569,2.957405,2.080571,1.585953,2.402487,3.068258,2.526178,2.318987
60973,FOODS_3_821_WI_3_evaluation,2.913403,0.32229,1.76217,2.694564,3.222871,1.117887,0.737627,3.297508,2.930617,...,0.888101,3.72266,2.916801,1.749776,0.337196,2.128353,2.734975,1.571426,-0.324001,2.28878
60974,FOODS_3_822_WI_3_evaluation,1.658106,2.334168,1.800254,1.263298,2.747332,3.466878,1.800529,0.258597,2.480986,...,1.160557,0.917239,1.500744,0.485254,0.442017,-0.207802,0.476575,-0.825703,-0.500445,0.556536
60975,FOODS_3_823_WI_3_evaluation,0.309111,0.360147,0.775272,-0.485882,0.071914,0.782581,0.419743,-0.251868,1.140253,...,2.354522,1.030979,1.508661,2.53373,1.805721,1.377529,0.853489,1.060206,2.420045,1.343376
60976,FOODS_3_824_WI_3_evaluation,-0.098704,-0.160038,-0.098704,-0.160038,-0.160038,-0.160038,-0.310387,-0.160038,-0.160038,...,-0.160038,-0.160038,-0.160038,-0.160038,-0.160038,-0.174761,-0.174761,-0.144131,-0.129408,-0.129408
60977,FOODS_3_825_WI_3_evaluation,0.772765,1.814701,-0.2127,0.154961,1.271586,1.355094,0.418253,0.305926,0.311625,...,-0.642649,0.029958,-0.416848,-0.080992,-0.183278,-0.966023,0.092726,-0.392832,-1.012384,-0.426488
60978,FOODS_3_826_WI_3_evaluation,0.001014,1.099261,0.435773,-0.091697,-0.119767,-0.703036,0.339743,0.377836,-0.096056,...,1.411584,2.553083,2.340068,1.446881,3.236645,2.653755,4.114277,1.687275,2.779421,2.423747
60979,FOODS_3_827_WI_3_evaluation,2.029897,2.541785,1.610183,1.540914,2.089993,1.968743,1.024512,2.072964,1.192298,...,-0.062904,1.830338,1.651854,1.104005,0.317422,1.861736,2.681552,1.321441,0.853256,2.165554
