## Daily SARIMA

In [1]:
import warnings
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_percentage_error
from itertools import product

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Input CSV for each cluster, keyed by cluster id (0, 1, 2).
cluster_files = dict(
    enumerate(["cluster_0.csv", "cluster_1.csv", "cluster_2.csv"])
)

# Result containers, keyed by cluster id:
#   final_mape_per_cluster -> lowest hold-out MAPE found for the cluster
#   best_params            -> (order, seasonal_order) that achieved it
final_mape_per_cluster, best_params = {}, {}

# Grid search parameters for daily SARIMA.
# Non-seasonal candidates (p, d, q) ...
p_values, d_values, q_values = [1, 2], [0, 1], [1, 2]
# ... and seasonal candidates (P, D, Q).
P_values, D_values, Q_values = [1, 2], [0, 1], [1, 2]

# Full cartesian product: 2**6 = 64 tuples ordered as (p, d, q, P, D, Q).
param_grid = [
    *product(p_values, d_values, q_values, P_values, D_values, Q_values)
]

# Per-cluster daily SARIMA grid search.
#
# For each cluster file: aggregate revenue per calendar date, hold out the last
# 20% of the observed dates, fit every (p, d, q) x (P, D, Q, 7) candidate on the
# remaining dates, and keep the candidate with the lowest hold-out MAPE.
# Results land in `final_mape_per_cluster` and `best_params`.
#
# NOTE(review): the winning orders are selected on the same hold-out window
# whose MAPE is reported, so the reported figure is an optimistic estimate.
# NOTE(review): the grouped series only contains dates that occur in the CSV.
# If calendar days are missing, a seasonal period of 7 no longer lines up with
# weekdays -- confirm whether the series should be reindexed to daily frequency.
for cluster, file_path in cluster_files.items():
    cluster_data = pd.read_csv(file_path, parse_dates=["InvoiceDate"])
    # Derive revenue per row when the file does not already carry it.
    if "total_sales" not in cluster_data.columns:
        cluster_data["total_sales"] = cluster_data["Quantity"] * cluster_data["Price"]

    # Drop the time-of-day so all rows of one day share the same key.
    cluster_data["Date"] = pd.to_datetime(cluster_data["InvoiceDate"].dt.date)
    cluster_data.sort_values(by="Date", inplace=True)

    daily_sales = cluster_data.groupby("Date")["total_sales"].sum()

    if len(daily_sales) < 30:
        print(f"  ⚠️ Skipped cluster {cluster}: only {len(daily_sales)} days")
        continue

    # Chronological 80/20 split: the most recent observations are the hold-out.
    split_index = int(len(daily_sales) * 0.8)
    train_series = daily_sales.iloc[:split_index]
    test_series = daily_sales.iloc[split_index:]

    best_mape = float("inf")
    best_model = None
    best_order = None
    best_seasonal_order = None
    # One human-readable entry per candidate whose fit/forecast/scoring raised.
    # Stored as strings so no exception objects (and their frames) are retained.
    failed_fits = []

    for (p, d, q, P, D, Q) in param_grid:
        try:
            model = SARIMAX(
                train_series,
                order=(p, d, q),
                seasonal_order=(P, D, Q, 7),
                enforce_stationarity=False,
                enforce_invertibility=False
            )
            model_fit = model.fit(disp=False)
            preds = model_fit.forecast(steps=len(test_series))
            # NOTE(review): MAPE divides by the actual values; days with zero
            # total_sales in the hold-out would inflate it -- confirm the data.
            mape = mean_absolute_percentage_error(test_series, preds)
        except Exception as exc:
            # Best-effort search: one bad candidate must not abort the grid,
            # but keep the reason instead of discarding it silently.
            failed_fits.append(
                f"order={(p, d, q)}, seasonal_order={(P, D, Q, 7)}: "
                f"{type(exc).__name__}: {exc}"
            )
        else:
            if mape < best_mape:
                best_mape = mape
                best_model = model_fit
                best_order = (p, d, q)
                best_seasonal_order = (P, D, Q, 7)

    # Surface failures (warnings are globally suppressed, so this is the only
    # trace a broken candidate leaves behind).
    if failed_fits:
        print(
            f"  ⚠️ Cluster {cluster}: {len(failed_fits)} of {len(param_grid)} "
            f"candidate fits failed (last: {failed_fits[-1]})"
        )

    if best_model is None:
        print(f"  ❌ No valid model found for cluster {cluster}")
        continue

    final_mape_per_cluster[cluster] = best_mape
    best_params[cluster] = (best_order, best_seasonal_order)

# Summary: one line per successfully fitted cluster, showing the best hold-out
# MAPE as a percentage together with the orders that produced it.
print("SARIMA Daily Forecast (Final MAPE per Cluster):")
for cluster, (order, seasonal_order) in best_params.items():
    # Both dicts are filled together in the search loop, so they share keys.
    mape = final_mape_per_cluster[cluster]
    print(f"Cluster {cluster}: {mape:.2%} (order={order}, seasonal_order={seasonal_order})")

SARIMA Daily Forecast (Final MAPE per Cluster):
Cluster 0: 46.82% (order=(1, 0, 2), seasonal_order=(1, 0, 1, 7))
Cluster 1: 43.88% (order=(1, 1, 2), seasonal_order=(1, 0, 1, 7))
Cluster 2: 60.07% (order=(1, 1, 2), seasonal_order=(1, 1, 2, 7))


## Weekly SARIMA

In [3]:
import warnings
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_percentage_error
from itertools import product

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# File paths per cluster, keyed by cluster id (0, 1, 2).
cluster_files = dict(
    enumerate(["cluster_0.csv", "cluster_1.csv", "cluster_2.csv"])
)

# Result containers, keyed by cluster id:
#   final_mape_per_cluster -> lowest hold-out MAPE found for the cluster
#   best_params            -> (order, seasonal_order) that achieved it
final_mape_per_cluster, best_params = {}, {}

# Grid search parameters for weekly SARIMA.
# Non-seasonal candidates (p, d, q) ...
p_values, d_values, q_values = [1, 2], [0, 1], [1, 2]
# ... and seasonal candidates (P, D, Q).
P_values, D_values, Q_values = [1, 2], [0, 1], [1, 2]

# Full cartesian product: 2**6 = 64 tuples ordered as (p, d, q, P, D, Q).
param_grid = [
    *product(p_values, d_values, q_values, P_values, D_values, Q_values)
]

# Per-cluster weekly SARIMA grid search.
#
# For each cluster file: aggregate revenue per week (keyed by the week's start
# timestamp), hold out the last 20% of the observed weeks, fit every
# (p, d, q) x (P, D, Q, 52) candidate on the remaining weeks, and keep the
# candidate with the lowest hold-out MAPE. Results land in
# `final_mape_per_cluster` and `best_params`.
#
# NOTE(review): the winning orders are selected on the same hold-out window
# whose MAPE is reported, so the reported figure is an optimistic estimate.
# NOTE(review): the 20-week minimum below does not guarantee even one full
# 52-week season of training data -- confirm the intended minimum history.
for cluster, file_path in cluster_files.items():
    cluster_data = pd.read_csv(file_path, parse_dates=["InvoiceDate"])
    # Derive revenue per row when the file does not already carry it.
    if "total_sales" not in cluster_data.columns:
        cluster_data["total_sales"] = cluster_data["Quantity"] * cluster_data["Price"]

    # Map every invoice to the start timestamp of its weekly period.
    cluster_data["Week"] = cluster_data["InvoiceDate"].dt.to_period("W").dt.start_time
    cluster_data.sort_values(by="Week", inplace=True)

    weekly_sales = cluster_data.groupby("Week")["total_sales"].sum()

    if len(weekly_sales) < 20:
        print(f"  ⚠️ Skipped cluster {cluster}: only {len(weekly_sales)} weeks")
        continue

    # Chronological 80/20 split: the most recent observations are the hold-out.
    split_index = int(len(weekly_sales) * 0.8)
    train_series = weekly_sales.iloc[:split_index]
    test_series = weekly_sales.iloc[split_index:]

    best_mape = float("inf")
    best_model = None
    best_order = None
    best_seasonal_order = None
    # One human-readable entry per candidate whose fit/forecast/scoring raised.
    # Stored as strings so no exception objects (and their frames) are retained.
    failed_fits = []

    for (p, d, q, P, D, Q) in param_grid:
        try:
            model = SARIMAX(
                train_series,
                order=(p, d, q),
                seasonal_order=(P, D, Q, 52),
                enforce_stationarity=False,
                enforce_invertibility=False
            )
            model_fit = model.fit(disp=False)
            preds = model_fit.forecast(steps=len(test_series))
            # NOTE(review): MAPE divides by the actual values; weeks with zero
            # total_sales in the hold-out would inflate it -- confirm the data.
            mape = mean_absolute_percentage_error(test_series, preds)
        except Exception as exc:
            # Best-effort search: one bad candidate must not abort the grid,
            # but keep the reason instead of discarding it silently.
            failed_fits.append(
                f"order={(p, d, q)}, seasonal_order={(P, D, Q, 52)}: "
                f"{type(exc).__name__}: {exc}"
            )
        else:
            if mape < best_mape:
                best_mape = mape
                best_model = model_fit
                best_order = (p, d, q)
                best_seasonal_order = (P, D, Q, 52)

    # Surface failures (warnings are globally suppressed, so this is the only
    # trace a broken candidate leaves behind).
    if failed_fits:
        print(
            f"  ⚠️ Cluster {cluster}: {len(failed_fits)} of {len(param_grid)} "
            f"candidate fits failed (last: {failed_fits[-1]})"
        )

    if best_model is None:
        print(f"  ❌ No valid model found for cluster {cluster}")
        continue

    final_mape_per_cluster[cluster] = best_mape
    best_params[cluster] = (best_order, best_seasonal_order)

# Summary: one line per successfully fitted cluster, showing the best hold-out
# MAPE as a percentage together with the orders that produced it.
print("SARIMA Weekly Forecast (Final MAPE per Cluster):")
for cluster, (order, seasonal_order) in best_params.items():
    # Both dicts are filled together in the search loop, so they share keys.
    mape = final_mape_per_cluster[cluster]
    print(f"Cluster {cluster}: {mape:.2%} (order={order}, seasonal_order={seasonal_order})")


SARIMA Weekly Forecast (Final MAPE per Cluster):
Cluster 0: 18.03% (order=(1, 1, 1), seasonal_order=(1, 0, 2, 52))
Cluster 1: 18.83% (order=(1, 1, 2), seasonal_order=(1, 0, 1, 52))
Cluster 2: 21.28% (order=(2, 0, 1), seasonal_order=(1, 1, 2, 52))
