In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from darts import TimeSeries
from darts.models import LightGBMModel
from sklearn.metrics import mean_squared_error, mean_squared_log_error

In [2]:
# Load the training data; the "date" column is parsed into datetimes on read.
train_csv_path = "../../Data/Kaggle/StoreSales/train_darts.csv"
df = pd.read_csv(train_csv_path, parse_dates=["date"])

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,date,store_nbr,family,sales,onpromotion,typeholiday,dcoilwtico,city,state,typestores,cluster,day_of_week,day,month,year
0,2013-01-01,1,0,0.0,0,3,93.14,18,12,3,13,2,1,1,2013
1,2013-01-01,1,1,0.0,0,3,93.14,18,12,3,13,2,1,1,2013
2,2013-01-01,1,2,0.0,0,3,93.14,18,12,3,13,2,1,1,2013
3,2013-01-01,1,3,0.0,0,3,93.14,18,12,3,13,2,1,1,2013
4,2013-01-01,1,4,0.0,0,3,93.14,18,12,3,13,2,1,1,2013


In [4]:
# Every distinct (store_nbr, family) pair present in the data, one row per pair.
key_columns = ['store_nbr', 'family']
unique_combinations = df[key_columns].drop_duplicates()

In [5]:
# Running sums of the per-model validation metrics and the number of models
# that contributed to them; used later to compute the averages.
total_rmse, total_rmsle, num_models = 0, 0, 0

In [6]:
# Registry of fitted models, keyed by (store_nbr, family).
trained_models = dict()

In [7]:
# Only series from these stores are modelled ...
included_stores = set((5, 8, 16, 19, 22, 25, 33, 37, 41, 47, 51))
# ... and series belonging to these (integer-encoded) families are skipped.
excluded_families = set((1, 4, 14, 17, 19, 20, 31))

In [8]:
# Train, evaluate and persist one LightGBM forecaster per selected
# (store_nbr, family) series.
#
# The running totals are (re)initialised here so that re-running this cell on
# its own does not accumulate metrics on top of a previous run.
total_rmse = 0
total_rmsle = 0
num_models = 0

VALIDATION_STEPS = 30  # trailing observations of each series held out for validation
MODEL_DIR = "SavedModels/LightGBM"

# Create the output directory up front so saving the first model cannot fail
# on a fresh checkout where the folder does not exist yet.
os.makedirs(MODEL_DIR, exist_ok=True)

# Apply the store/family filter once instead of masking the full frame for
# every combination.
selected_rows = df[
    df['store_nbr'].isin(included_stores) & ~df['family'].isin(excluded_families)
]

# sort=False yields the groups in order of first appearance, i.e. the same
# order the drop_duplicates()-based iteration produced.
for (store_nbr, family), df_subset in selected_rows.groupby(['store_nbr', 'family'], sort=False):

    print(f"\nTraining model for store {store_nbr} and family {family}...")

    # NOTE(review): no time_col is passed, so the series is indexed by the
    # frame's row index rather than by the "date" column - confirm this is intended.
    ts = TimeSeries.from_dataframe(df_subset, value_cols=["sales"])

    # Hold out the last VALIDATION_STEPS observations.
    training_size = len(ts) - VALIDATION_STEPS
    train = ts[:training_size]
    val = ts[training_size:]

    # The model only sees the two previous target values as features.
    model = LightGBMModel(lags=2)
    model.fit(train)

    trained_models[(store_nbr, family)] = model

    predictions = model.predict(n=len(val))

    # univariate_values() returns the raw 1-D values without going through
    # the pandas export helpers.
    val = val.univariate_values().tolist()
    # Clip negative forecasts to zero; this also keeps log1p below well-defined.
    predictions = np.maximum(predictions.univariate_values(), 0)

    rmse = np.sqrt(mean_squared_error(val, predictions))
    rmsle = np.sqrt(mean_squared_error(np.log1p(val), np.log1p(predictions)))

    total_rmse += rmse
    total_rmsle += rmsle
    num_models += 1

    model.save(f"{MODEL_DIR}/LGBM_S{store_nbr}P{family}.pkl")

    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"Root Mean Squared Logarithmic Error (RMSLE): {rmsle}")


Training model for store 16 and family 0...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000275 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 42
[LightGBM] [Info] Number of data points in the train set: 1652, number of used features: 2
[LightGBM] [Info] Start training from score 5.775424
Root Mean Squared Error (RMSE): 2.9969725891365484
Root Mean Squared Logarithmic Error (RMSLE): 0.6067764825178769

Training model for store 16 and family 2...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16
[LightGBM] [Info] Number of data points in the train set: 1652, number of used features: 2
[LightGBM] [Info] Start training from score 0.863196
Root Mean Squared Error (RMSE): 1.827462471947187
Root 

In [9]:
# Mean validation metrics over all trained models.
# If no (store, family) combination was trained, report NaN instead of
# raising ZeroDivisionError.
if num_models:
    avg_rmse = total_rmse / num_models
    avg_rmsle = total_rmsle / num_models
else:
    avg_rmse = avg_rmsle = float("nan")

In [10]:
# Report both averaged validation metrics, one per line.
print(
    f"Average Root Mean Squared Error (RMSE) across all models: {avg_rmse}",
    f"Average Root Mean Squared Logarithmic Error (RMSLE) across all models: {avg_rmsle}",
    sep="\n",
)

Average Root Mean Squared Error (RMSE) across all models: 179.91202849927575
Average Root Mean Squared Logarithmic Error (RMSLE) across all models: 0.47825641231131444
