<div class ="alert alert-block alert-warning">
    
- This notebook is taking reference from: https://www.kaggle.com/code/yunsuxiaozi/m5-forecasting-top7-study-notebook/notebook
- Attempt to translate it into english and also set a baseline model in which we could improve on

# Import Libraries

In [1]:
import pickle
import os
import gc

import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
import lightgbm as lgb  # LightGBM for machine learning model
import warnings  # To suppress warnings

warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

import random  # For generating random numbers

# Function to set a fixed random seed for reproducibility
def seed_everything(seed):
    np.random.seed(seed)  # Set numpy random seed
    random.seed(seed)  # Set built-in random seed

seed_everything(seed=2024)  # Set the seed to 2024


# Load Dataset

In [2]:
calendar = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/calendar.csv")  # Load calendar dataset
print(f"len(calendar):{len(calendar)}")  # Print the number of rows in calendar
calendar.head()  # Display the first few rows of the calendar dataset


len(calendar):1969


Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [3]:
sales_train_evaluation = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv")
print(f"len(sales_train_evaluation): {len(sales_train_evaluation)}")
sales_train_evaluation.head()


len(sales_train_evaluation): 30490


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0


In [4]:
sell_prices = pd.read_csv("/kaggle/input/m5-forecasting-accuracy/sell_prices.csv")
print(f"len(sell_prices):{len(sell_prices)}")
sell_prices.head()


len(sell_prices):6841121


Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


# Downcasting to save memory

In [5]:
# Enhanced memory optimization function with object datatype handling
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2  # Initial memory usage in MB
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:  # Downcast numerics
            c_min, c_max = df[col].min(), df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
        elif col_type == 'object':  # Handle object types
            if col == 'date':  # Convert date column to datetime
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')  # Convert other object types to category
    end_mem = df.memory_usage().sum() / 1024**2  # Final memory usage in MB
    if verbose:
        print(f'Memory usage reduced to {end_mem:5.2f} Mb ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    return df


In [6]:
# Apply the optimized memory reduction function to each dataframe
calendar = reduce_mem_usage(calendar)
sell_prices = reduce_mem_usage(sell_prices)
sales_train_evaluation = reduce_mem_usage(sales_train_evaluation)

Memory usage reduced to  0.13 Mb (40.4% reduction)
Memory usage reduced to 45.76 Mb (78.1% reduction)
Memory usage reduced to 96.30 Mb (78.7% reduction)


# Convert Sales data to long format

In [7]:
# Specify day columns up to d_1941 for the extended dataset
d_cols_eval = [f"d_{i}" for i in range(1, 1942)]
sales_train_evaluation_long = sales_train_evaluation.melt(
    id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
    value_vars=d_cols_eval,
    var_name="d",
    value_name="sales"
)
print(f"len(sales_train_evaluation_long): {len(sales_train_evaluation_long)}")
sales_train_evaluation_long.head()


len(sales_train_evaluation_long): 59181090


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0


# Merge Calendar data

In [8]:
sales_train_evaluation_long = sales_train_evaluation_long.merge(calendar, on="d", how="left")
sales_train_evaluation_long.head()


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,,,,,0,0,0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,,,,,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,,,,,0,0,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,,,,,0,0,0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,1,2011,,,,,0,0,0


# Merge Price Data

In [9]:
sales_train_evaluation_long = sales_train_evaluation_long.merge(
    sell_prices, 
    on=["store_id", "item_id", "wm_yr_wk"], 
    how="left"
)
sales_train_evaluation_long.head()


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,,,,,0,0,0,
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,,,,,0,0,0,
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,,,,,0,0,0,
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,,,,,0,0,0,
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,1,2011,,,,,0,0,0,


# Feature Engineering (Lags + Rolling Windows)

In [10]:
# Compute lag and rolling window features for the entire dataset
for lag in [7, 28]:
    sales_train_evaluation_long[f'sales_lag_{lag}'] = sales_train_evaluation_long.groupby('id')['sales'].shift(lag)

for window in [7, 28]:
    sales_train_evaluation_long[f'rolling_sales_mean_{window}'] = sales_train_evaluation_long.groupby('id')['sales'].transform(lambda x: x.shift(1).rolling(window).mean())
    sales_train_evaluation_long[f'rolling_sales_std_{window}'] = sales_train_evaluation_long.groupby('id')['sales'].transform(lambda x: x.shift(1).rolling(window).std())


# Feature Engineering (Trends)

In [11]:
sales_train_evaluation_long['daily_avg_sold'] = sales_train_evaluation_long.groupby(['id','d'])['sales'].transform('mean').astype(np.float16)
sales_train_evaluation_long['avg_sold'] = sales_train_evaluation_long.groupby(['id'])['sales'].transform('mean').astype(np.float16)
sales_train_evaluation_long['selling_trend'] = (sales_train_evaluation_long['daily_avg_sold'] - sales_train_evaluation_long['avg_sold']).astype(np.float16)
sales_train_evaluation_long.drop(['daily_avg_sold','avg_sold'],axis=1,inplace=True)

In [12]:
sales_train_evaluation_long

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,snap_TX,snap_WI,sell_price,sales_lag_7,sales_lag_28,rolling_sales_mean_7,rolling_sales_std_7,rolling_sales_mean_28,rolling_sales_std_28,selling_trend
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,0,0,,,,,,,,-0.326172
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,0,0,,,,,,,,-0.257568
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,0,0,,,,,,,,-0.159180
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,0,0,,,,,,,,-1.718750
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29,11101,...,0,0,,,,,,,,-0.972656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59181085,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1941,1,2016-05-22,11617,...,0,0,2.980469,3.0,1.0,0.857143,1.069045,0.642857,0.826160,0.464355
59181086,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,...,0,0,2.480469,0.0,0.0,0.285714,0.487950,0.285714,0.534522,-0.375000
59181087,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,d_1941,2,2016-05-22,11617,...,0,0,3.980469,1.0,0.0,0.714286,0.755929,0.714286,0.896790,1.106445
59181088,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,d_1941,0,2016-05-22,11617,...,0,0,1.280273,1.0,3.0,2.000000,2.160247,1.428571,1.317365,-0.380615


# Define Train, Validation and Evaluation sets

In [13]:
train_set = sales_train_evaluation_long[sales_train_evaluation_long['d'].isin([f'd_{i}' for i in range(750, 1914)])]
print("Train set sample:")
print(f"len(Train set): {len(train_set)}")
train_set.head()

Train set sample:
len(Train set): 35490360


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,snap_TX,snap_WI,sell_price,sales_lag_7,sales_lag_28,rolling_sales_mean_7,rolling_sales_std_7,rolling_sales_mean_28,rolling_sales_std_28,selling_trend
22837010,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_750,0,2013-02-16,11304,...,0,0,,0.0,0.0,0.0,0.0,0.0,0.0,-0.326172
22837011,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_750,0,2013-02-16,11304,...,0,0,3.970703,0.0,1.0,0.285714,0.48795,0.321429,0.547964,-0.257568
22837012,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_750,0,2013-02-16,11304,...,0,0,,0.0,0.0,0.0,0.0,0.0,0.0,-0.15918
22837013,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_750,2,2013-02-16,11304,...,0,0,4.339844,1.0,0.0,1.0,0.57735,1.0,1.333333,0.28125
22837014,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_750,2,2013-02-16,11304,...,0,0,3.080078,0.0,1.0,0.285714,0.755929,1.214286,1.548168,1.027344


In [15]:
# Filter the validation set for d_1914 to d_1941
validation_set = sales_train_evaluation_long[sales_train_evaluation_long['d'].isin([f'd_{i}' for i in range(1914, 1942)])]
print("Validation set sample:")
print(f"len(Validation set): {len(validation_set)}")
validation_set.head()


Validation set sample:
len(Validation set): 853720


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,snap_TX,snap_WI,sell_price,sales_lag_7,sales_lag_28,rolling_sales_mean_7,rolling_sales_std_7,rolling_sales_mean_28,rolling_sales_std_28,selling_trend
58327370,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,0,0,8.382812,1.0,1.0,1.142857,0.899735,0.964286,1.137969,-0.326172
58327371,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,0,0,3.970703,0.0,1.0,0.142857,0.377964,0.071429,0.262265,-0.257568
58327372,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,0,0,2.970703,1.0,0.0,0.857143,0.377964,0.571429,0.690066,-0.15918
58327373,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,0,2016-04-25,11613,...,0,0,4.640625,4.0,0.0,2.571429,2.370453,1.821429,1.88667,-1.71875
58327374,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1914,1,2016-04-25,11613,...,0,0,2.880859,0.0,1.0,1.714286,1.253566,1.357143,1.282771,0.027344


In [16]:
# Create a new DataFrame for the prediction period (d_1942 to d_1969)
forecast_days = [f'd_{i}' for i in range(1942, 1970)]
forecast_df = pd.DataFrame({'d': forecast_days})

# Generate one entry per product-store combination for each forecast day
prediction_set = sales_train_evaluation[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']].drop_duplicates()
prediction_set = prediction_set.merge(forecast_df, how='cross')

# Merge with calendar data to add date information for the forecast period
prediction_set = prediction_set.merge(calendar, on='d', how='left')

# Merge with sell_prices to add price data, aligning with the correct store, item, and week
prediction_set = prediction_set.merge(sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

print("Prediction set sample:")
print(f"len(Prediction set): {len(prediction_set)}")
prediction_set.head()


Prediction set sample:
len(Prediction set): 853720


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,date,wm_yr_wk,weekday,...,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1942,2016-05-23,11617,Monday,...,5,2016,,,,,0,0,0,8.382812
1,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1943,2016-05-24,11617,Tuesday,...,5,2016,,,,,0,0,0,8.382812
2,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1944,2016-05-25,11617,Wednesday,...,5,2016,,,,,0,0,0,8.382812
3,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1945,2016-05-26,11617,Thursday,...,5,2016,,,,,0,0,0,8.382812
4,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1946,2016-05-27,11617,Friday,...,5,2016,,,,,0,0,0,8.382812


In [17]:
# Apply the optimized memory reduction function to each dataframe
train_set = reduce_mem_usage(train_set)
validation_set = reduce_mem_usage(validation_set)
prediction_set = reduce_mem_usage(prediction_set)

Memory usage reduced to 1964.45 Mb (42.0% reduction)
Memory usage reduced to 47.74 Mb (42.3% reduction)
Memory usage reduced to 28.20 Mb (16.8% reduction)


In [18]:
del calendar, sell_prices, sales_train_evaluation
gc.collect()

0

# Train model with LGBM on train and validation set

In [19]:
# Define constants
TARGET = 'sales'  # Assuming the target column is 'sales'
STORES_IDS = train_set['store_id'].unique()  # List of unique store IDs
model_dir = './models'  # Directory to save models

# Train a separate model for each store
for store_id in STORES_IDS:
    print(f"\nStarting training for store: {store_id}")

    # Filter the pre-defined train and validation sets by store
    print("  - Filtering data by store...")
    store_train_set = train_set[train_set['store_id'] == store_id].copy()
    store_valid_set = validation_set[validation_set['store_id'] == store_id].copy()

    # Drop 'date' column from training and validation sets to avoid DTypePromotionError
    X_train = store_train_set.drop(columns=[TARGET, 'store_id', 'date'])  # Remove date column
    y_train = store_train_set[TARGET]
    X_valid = store_valid_set.drop(columns=[TARGET, 'store_id', 'date'])  # Remove date column
    y_valid = store_valid_set[TARGET]

    
    # Define features and LightGBM parameters
    MODEL_FEATURES = X_train.columns.tolist()
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'tweedie',
        'tweedie_variance_power': 1.1,
        'metric': 'rmse',
        'subsample': 0.5,
        'subsample_freq': 1,
        'learning_rate': 0.1, # Adjusted up from 0.015
        'num_leaves': 128,
        'min_data_in_leaf': 500,
        'feature_fraction': 0.5,
        'max_bin': 50,
        'n_estimators': 500, # Adjusted down from 3000
        'boost_from_average': False,
        'verbose': -1,
    }

    # Initialize and train the model with LightGBM
    model = lgb.LGBMRegressor(**lgb_params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='rmse',
        callbacks=[
            lgb.log_evaluation(20),  # Logs every 50 rounds
            lgb.early_stopping(stopping_rounds=10)  # Early stopping if no improvement
        ]
    )

    # Save the model
    print(f"  - Saving model for store: {store_id}")
    model_path = os.path.join(model_dir, f'lgb_model_{store_id}.bin')
    os.makedirs(model_dir, exist_ok=True)
    pickle.dump(model, open(model_path, 'wb'))

    # Clean up to free memory
    print(f"  - Cleaning up resources for store: {store_id}")
    del store_train_set, store_valid_set, X_train, y_train, X_valid, y_valid, model
    gc.collect()
    print(f"Finished training for store: {store_id}\n" + "="*50)



Starting training for store: CA_1
  - Filtering data by store...
Training until validation scores don't improve for 10 rounds
[20]	valid_0's rmse: 1.26939
[40]	valid_0's rmse: 1.03443
[60]	valid_0's rmse: 0.998037
[80]	valid_0's rmse: 0.977496
[100]	valid_0's rmse: 0.963118
[120]	valid_0's rmse: 0.95439
[140]	valid_0's rmse: 0.943438
[160]	valid_0's rmse: 0.936103
Early stopping, best iteration is:
[165]	valid_0's rmse: 0.93486
  - Saving model for store: CA_1
  - Cleaning up resources for store: CA_1
Finished training for store: CA_1

Starting training for store: CA_2
  - Filtering data by store...
Training until validation scores don't improve for 10 rounds
[20]	valid_0's rmse: 1.17141
[40]	valid_0's rmse: 0.942469
[60]	valid_0's rmse: 0.903147
[80]	valid_0's rmse: 0.883384
[100]	valid_0's rmse: 0.872896
[120]	valid_0's rmse: 0.867689
[140]	valid_0's rmse: 0.862083
[160]	valid_0's rmse: 0.860806
[180]	valid_0's rmse: 0.857165
[200]	valid_0's rmse: 0.855493
[220]	valid_0's rmse: 0.85

# Visualisation on validation set

In [20]:
validation_set[15000:15010]

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,snap_TX,snap_WI,sell_price,sales_lag_7,sales_lag_28,rolling_sales_mean_7,rolling_sales_std_7,rolling_sales_mean_28,rolling_sales_std_28,selling_trend
58342370,FOODS_3_580_TX_1_evaluation,FOODS_3_580,FOODS_3,FOODS,TX_1,TX,d_1914,4,2016-04-25,11613,...,0,0,0.25,9.0,2.0,4.429688,3.408203,2.607422,2.513672,-1.667969
58342371,FOODS_3_581_TX_1_evaluation,FOODS_3_581,FOODS_3,FOODS,TX_1,TX,d_1914,2,2016-04-25,11613,...,0,0,3.480469,1.0,2.0,0.285645,0.488037,0.821289,1.466797,1.242188
58342372,FOODS_3_582_TX_1_evaluation,FOODS_3_582,FOODS_3,FOODS,TX_1,TX,d_1914,0,2016-04-25,11613,...,0,0,1.980469,4.0,1.0,1.286133,1.379883,1.357422,1.496094,-0.644043
58342373,FOODS_3_583_TX_1_evaluation,FOODS_3_583,FOODS_3,FOODS,TX_1,TX,d_1914,1,2016-04-25,11613,...,0,0,1.5,2.0,0.0,1.286133,1.253906,0.893066,0.875,0.53125
58342374,FOODS_3_584_TX_1_evaluation,FOODS_3_584,FOODS_3,FOODS,TX_1,TX,d_1914,0,2016-04-25,11613,...,0,0,1.0,0.0,1.0,0.0,0.0,0.214233,0.498779,-1.708984
58342375,FOODS_3_585_TX_1_evaluation,FOODS_3_585,FOODS_3,FOODS,TX_1,TX,d_1914,5,2016-04-25,11613,...,0,0,3.980469,2.0,2.0,1.571289,1.272461,1.071289,1.844727,3.396484
58342376,FOODS_3_586_TX_1_evaluation,FOODS_3_586,FOODS_3,FOODS,TX_1,TX,d_1914,61,2016-04-25,11613,...,0,0,1.679688,72.0,54.0,60.15625,12.453125,53.9375,13.84375,2.25
58342377,FOODS_3_587_TX_1_evaluation,FOODS_3_587,FOODS_3,FOODS,TX_1,TX,d_1914,10,2016-04-25,11613,...,0,0,2.480469,23.0,22.0,23.140625,5.519531,23.328125,6.640625,-15.125
58342378,FOODS_3_588_TX_1_evaluation,FOODS_3_588,FOODS_3,FOODS,TX_1,TX,d_1914,1,2016-04-25,11613,...,0,0,3.939453,0.0,1.0,1.0,1.291016,0.643066,0.911621,0.513184
58342379,FOODS_3_589_TX_1_evaluation,FOODS_3_589,FOODS_3,FOODS,TX_1,TX,d_1914,0,2016-04-25,11613,...,0,0,1.280273,0.0,0.0,0.0,0.0,0.0,0.0,-1.069336


In [21]:
# Specify the item_id, dept_id, cat_id, store_id you want to filter
item_id = 'FOODS_3_580'
dept_id = 'FOODS_3'
cat_id = 'FOODS'
store_id = 'TX_1'

In [22]:
# Define the target range of `d` values for the validation period
valid_d_range = [f'd_{i}' for i in range(1914, 1942)]

# Filter the validation data for the specified item-store combination and `d` range
valid_data = validation_set[
    (validation_set['item_id'] == item_id) &
    (validation_set['dept_id'] == dept_id) &
    (validation_set['cat_id'] == cat_id) &
    (validation_set['store_id'] == store_id) &
    (validation_set['d'].isin(valid_d_range))
].copy()

# Ensure consistent feature selection by dropping only non-feature columns
non_feature_columns = ['sales', 'date', 'store_id']  # Adjust as necessary
X_valid = valid_data.drop(columns=non_feature_columns, errors='ignore')

# Load the trained model for the specified store
model_path = os.path.join(model_dir, f'lgb_model_{store_id}.bin')
model = pickle.load(open(model_path, 'rb'))

# Generate predictions for the specified validation data
valid_data['pred_sales'] = model.predict(X_valid)


In [23]:
import plotly.graph_objects as go

# Create a Plotly figure for comparison
fig = go.Figure()

# Add the actual values from validation data
fig.add_trace(go.Scatter(x=valid_data['d'], y=valid_data['sales'], mode='lines+markers', name='Actual Sales'))

# Add the predicted values for the validation period
fig.add_trace(go.Scatter(x=valid_data['d'], y=valid_data['pred_sales'], mode='lines+markers', name='Predicted Sales'))

# Update layout for better visualization
fig.update_layout(
    title=f"Actual vs Predicted Sales for (item_id={item_id}, dept_id={dept_id}, cat_id={cat_id}, store_id={store_id})",
    xaxis_title="Day",
    yaxis_title="Sales",
    legend_title="Legend",
    template="plotly_dark"
)

# Show the plot
fig.show()

# Recursive Forecasting on Prediction set

In [24]:
# Initialize an empty dictionary to store predictions for each day and each item-store combination
predicted_sales = {store_id: [] for store_id in STORES_IDS}

# Loop over each day in the prediction period (28 days)
for day in range(28):
    day_col = f'd_{1942 + day}'  # Current day in prediction period
    print(f"\nProcessing predictions for day: {day_col}")

    # Loop over each store model to generate predictions
    for store_id in STORES_IDS:
        print(f"  - Predicting for store: {store_id}")

        # Load the model for the store
        model_path = os.path.join(model_dir, f'lgb_model_{store_id}.bin')
        model = pickle.load(open(model_path, 'rb'))
        
        # Filter the prediction data for the specific store and day
        store_prediction_set = prediction_set[
            (prediction_set['store_id'] == store_id) &
            (prediction_set['d'] == day_col)
        ].copy()
        
        # Prepare features for the current day by removing unnecessary columns
        X_pred = store_prediction_set.drop(columns=['date', 'store_id'], errors='ignore')

        # Ensure X_pred contains the exact same features as used during training
        X_pred = X_pred.reindex(columns=model.booster_.feature_name(), fill_value=0)
        
        # Make predictions for the current day
        store_prediction_set['pred_sales'] = model.predict(X_pred)
        
        # Append predictions for each item-store combination in the store
        predicted_sales[store_id].append(store_prediction_set[['id', 'd', 'pred_sales']])
        
        # Update the prediction_set with predicted sales for recursive feature generation
        prediction_set.loc[store_prediction_set.index, 'sales'] = store_prediction_set['pred_sales']
        
        # Update lag and rolling window features based on the updated sales column
        for lag in [7, 28]:
            prediction_set[f'sales_lag_{lag}'] = prediction_set.groupby('id')['sales'].shift(lag)

        for window in [7, 28]:
            prediction_set[f'rolling_sales_mean_{window}'] = prediction_set.groupby('id')['sales'].transform(lambda x: x.shift(1).rolling(window).mean())
            prediction_set[f'rolling_sales_std_{window}'] = prediction_set.groupby('id')['sales'].transform(lambda x: x.shift(1).rolling(window).std())
        
        # compute trends for prediction_set
        prediction_set['daily_avg_sold'] = prediction_set.groupby(['id','d'])['sales'].transform('mean').astype(np.float16)
        prediction_set['avg_sold'] = prediction_set.groupby(['id'])['sales'].transform('mean').astype(np.float16)
        prediction_set['selling_trend'] = (prediction_set['daily_avg_sold'] - prediction_set['avg_sold']).astype(np.float16)
        prediction_set.drop(['daily_avg_sold','avg_sold'],axis=1,inplace=True)
        
        # Free up memory
        del model
        gc.collect()
        print(f"    Completed predictions for store: {store_id} on day: {day_col}")

# Convert the predictions dictionary into a DataFrame for easier manipulation and visualization
final_predictions = pd.concat([pd.concat(preds) for preds in predicted_sales.values()], ignore_index=True)

print("\nAll predictions completed and consolidated into final_predictions DataFrame.")



Processing predictions for day: d_1942
  - Predicting for store: CA_1
    Completed predictions for store: CA_1 on day: d_1942
  - Predicting for store: CA_2
    Completed predictions for store: CA_2 on day: d_1942
  - Predicting for store: CA_3
    Completed predictions for store: CA_3 on day: d_1942
  - Predicting for store: CA_4
    Completed predictions for store: CA_4 on day: d_1942
  - Predicting for store: TX_1
    Completed predictions for store: TX_1 on day: d_1942
  - Predicting for store: TX_2
    Completed predictions for store: TX_2 on day: d_1942
  - Predicting for store: TX_3
    Completed predictions for store: TX_3 on day: d_1942
  - Predicting for store: WI_1
    Completed predictions for store: WI_1 on day: d_1942
  - Predicting for store: WI_2
    Completed predictions for store: WI_2 on day: d_1942
  - Predicting for store: WI_3
    Completed predictions for store: WI_3 on day: d_1942

Processing predictions for day: d_1943
  - Predicting for store: CA_1
    Compl

In [25]:
final_predictions.head()

Unnamed: 0,id,d,pred_sales
0,HOBBIES_1_001_CA_1_evaluation,d_1942,0.061938
1,HOBBIES_1_002_CA_1_evaluation,d_1942,0.056958
2,HOBBIES_1_003_CA_1_evaluation,d_1942,0.050321
3,HOBBIES_1_004_CA_1_evaluation,d_1942,0.073827
4,HOBBIES_1_005_CA_1_evaluation,d_1942,0.063657


# Visualise Predictions

In [27]:
id=item_id+"_"+store_id+"_evaluation"
id

'FOODS_3_580_WI_3_evaluation'

In [28]:
# Merge with final_predictions for the forecast period d_1942 to d_1969
forecast_d_range = [f'd_{i}' for i in range(1942, 1970)]
forecast_data = final_predictions[
    (final_predictions['id'] == id) & 
    (final_predictions['d'].isin(forecast_d_range))
]

In [29]:
# Select necessary columns from valid_data (up to d_1941)
valid_data_subset = valid_data[['id', 'd', 'sales', 'pred_sales']].copy()

# Rename columns in forecast_data to match valid_data format for consistency
forecast_data_subset = forecast_data[['id', 'd', 'pred_sales']]

# Concatenate valid_data and forecast_data
combined_data = pd.concat([valid_data_subset, forecast_data_subset], ignore_index=True)

# Convert the day identifiers to a numeric format for plotting
combined_data['day'] = combined_data['d'].str.extract('(\d+)').astype(int)


In [30]:
# Create Plotly figure
fig = go.Figure()

# Plot actual sales
fig.add_trace(go.Scatter(
    x=combined_data[combined_data['day'] <= 1941]['day'],
    y=combined_data[combined_data['day'] <= 1941]['sales'],
    mode='lines+markers',
    name='Actual Sales'
))

# Plot predicted sales
fig.add_trace(go.Scatter(
    x=combined_data[combined_data['day'] > 1913]['day'],
    y=combined_data[combined_data['day'] > 1913]['pred_sales'],
    mode='lines+markers',
    name='Predicted Sales'
))

# Update layout for better visualization
fig.update_layout(
    title=f"Actual vs Predicted Sales for (item_id={item_id}, dept_id={dept_id}, cat_id={cat_id}, store_id={store_id})",
    xaxis_title="Day",
    yaxis_title="Sales",
    legend_title="Legend",
    template="plotly_dark"
)

# Show the plot
fig.show()

# Prepare for submission

In [32]:
# Load actual sales data for validation from the original file
sales_train_validation = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv')

# Filter columns to get the relevant days for validation (d_1914 to d_1941)
validation = sales_train_validation[['id'] + [f'd_{i}' for i in range(1914, 1942)]]

# Rename columns to 'id' and 'F1' to 'F28' for submission format
validation.columns = ['id'] + [f'F{i + 1}' for i in range(28)]


In [33]:
# Transform the 'final_predictions' DataFrame to the required submission format
# Pivot to get columns F1 to F28 for each id
final_predictions_pivot = final_predictions.pivot(index='id', columns='d', values='pred_sales').reset_index()

# Rename columns to match submission format (F1 to F28)
final_predictions_pivot.columns = ['id'] + [f'F{i+1}' for i in range(28)]

# Modify 'id' to reflect evaluation (for the actual submission format in the competition)
final_predictions_pivot['id'] = final_predictions_pivot['id'].str.replace('evaluation', 'validation')


In [34]:
# Combine validation and evaluation for submission
submit = pd.concat([validation, final_predictions_pivot]).reset_index(drop=True)
submit.to_csv('submission.csv', index=False)