In [73]:
# General packages
import pandas as pd
import joblib
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Load the pre-filtered sales dataset (last six months, judging by the file name).
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code; only load this file from a trusted source.
# Later cells read the columns 'year', 'week', 'store_id', 'product_id',
# 'sales' and 'price' from this frame.
df_sales_filtered_last_6_month = joblib.load('df_sales_filtered_last_6_month.pkl')

## 1 Create a date field based on the year and the week of the year

### 1.1 For the date to make sense, it should be the last day of that specific week

In [74]:
import datetime
# Function to get the last day of the week using ISO calendar
def get_last_day_of_iso_week(year, week):
    """Return the last day (Sunday) of ISO week ``week`` in ISO year ``year``.

    Parameters
    ----------
    year, week : int-like
        ISO year and ISO week number. Both are coerced with ``int()`` so that
        NumPy integers (e.g. values read from an integer DataFrame column) are
        accepted; ``datetime.timedelta`` rejects ``numpy.int64`` arguments.

    Returns
    -------
    datetime.datetime
        Midnight on the Sunday that closes the requested ISO week.
    """
    year, week = int(year), int(week)
    # 4 January is always in ISO week 1, so the Monday on or before it starts week 1.
    jan_4 = datetime.datetime(year, 1, 4)
    week_1_monday = jan_4 - datetime.timedelta(days=jan_4.weekday())
    # Jump to the Monday of the requested week, then 6 more days to its Sunday.
    return week_1_monday + datetime.timedelta(weeks=week - 1, days=6)

# Derive the week-ending date of every row from its (year, week) pair
def _row_week_end(row):
    """Return the last day of the ISO week described by one DataFrame row."""
    return get_last_day_of_iso_week(row['year'], row['week'])

week_end_dates = df_sales_filtered_last_6_month.apply(_row_week_end, axis=1)
df_sales_filtered_last_6_month['last_day_of_week'] = week_end_dates

In [75]:
#df_sales_filtered_last_6_month = df_sales_filtered_last_6_month[(df_sales_filtered_last_6_month['store_id'] == 'S0097') & (df_sales_filtered_last_6_month['product_id'].isin(['P0704', 'P0001']))]

### 1.2 Convert to a time series by setting the date as the index and sorting by it

In [76]:
# Index the frame by the week-ending date ('last_day_of_week') and order it chronologically
df_sales_filtered_last_6_month = (
    df_sales_filtered_last_6_month
    .set_index('last_day_of_week')
    .sort_index()
)

In [77]:
# Display the frame, now indexed by 'last_day_of_week' and sorted on that index
df_sales_filtered_last_6_month

Unnamed: 0_level_0,store_id,product_id,year,week,sales,revenue,stock,price
last_day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-04-07,S0020,P0001,2019,14,2.0,18.56,9.0,10.95
2019-04-07,S0085,P0548,2019,14,39.0,86.67,79.0,1.95
2019-04-07,S0062,P0022,2019,14,1.0,60.45,9.0,101.90
2019-04-07,S0020,P0137,2019,14,0.0,0.00,16.0,56.90
2019-04-07,S0062,P0026,2019,14,2.0,9.16,6.0,4.95
...,...,...,...,...,...,...,...,...
2019-09-29,S0085,P0709,2019,39,88.0,567.07,287.0,7.00
2019-09-29,S0085,P0711,2019,39,0.0,0.00,56.0,12.90
2019-09-29,S0026,P0508,2019,39,1.0,6.02,12.0,6.50
2019-09-29,S0026,P0517,2019,39,0.0,0.00,3.0,199.95


### 1.3 Prepare to apply ARIMAX

In [78]:
# Split the sales history into one sub-frame per (store, product) pair
series_keys = ['store_id', 'product_id']
grouped = df_sales_filtered_last_6_month.groupby(series_keys)

# Empty result tables: one collects the forecasts, the other the
# store/product pairs for which no forecast could be produced
forecast_columns = ['store_id', 'product_id', 'forecast_week_1', 'forecast_week_2', 'forecast_week_3', 'ARIMAX']
error_columns = ['store_id', 'product_id', 'error_message']

df_forecasts = pd.DataFrame(columns=forecast_columns)
df_product_error = pd.DataFrame(columns=error_columns)

In [79]:
import warnings
from itertools import product

# Function to perform grid search for ARIMAX parameters
def optimize_arimax(series, exog, p_values, d_value, q_values):
    """Grid-search ARIMAX orders and keep the fit with the lowest AIC.

    Every (p, q) combination from ``p_values`` x ``q_values`` is fitted with
    the fixed differencing order ``d_value``. Candidates that raise while
    being built or fitted are skipped.

    Parameters
    ----------
    series : endogenous series handed to ``SARIMAX``.
    exog : exogenous regressor(s) handed to ``SARIMAX``.
    p_values : iterable of int, candidate AR orders.
    d_value : int, differencing order used for every candidate.
    q_values : iterable of int, candidate MA orders.

    Returns
    -------
    tuple
        ``(best_order, best_model)`` where ``best_order`` is ``(p, d, q)`` and
        ``best_model`` is the fitted result with the lowest AIC, or
        ``(None, None)`` if no candidate could be fitted.
    """
    best_aic = float("inf")
    best_order = None
    best_model = None
    for p, q in product(p_values, q_values):
        order = (p, d_value, q)
        try:
            model = SARIMAX(series, exog=exog, order=order)
            # disp=False silences the optimizer's convergence output for each fit
            model_fit = model.fit(disp=False)
            aic = model_fit.aic
            if aic < best_aic:
                best_aic = aic
                best_order = order
                best_model = model_fit
        except Exception:
            # Skip orders that cannot be fitted. Catching Exception rather than
            # using a bare except lets KeyboardInterrupt/SystemExit stop the search.
            continue
    return best_order, best_model

# Candidate orders for the grid search: AR (p) and MA (q) terms each run
# over 0..2, while the differencing order (d) is held fixed at 1
d_value = 1
p_values = range(3)
q_values = range(3)

# Iterate over each (store, product) group: pick the best ARIMAX order and
# forecast the next 3 weeks.
#
# Result rows are gathered in plain lists and turned into DataFrames once,
# after the loop. Calling pd.concat on every iteration re-copies the whole
# table each time (quadratic), and it made re-running this cell append
# duplicate rows to the tables built by the previous run.
forecast_rows = []
error_rows = []

# Silence model-fitting warnings for the duration of this loop only, instead
# of switching warnings off for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for (store_id, product_id), group in grouped:
        # Reindex to ensure complete weekly intervals; missing weeks are
        # forward-filled from the previous observation
        group = group.asfreq('W-SUN', method='pad')

        # Ensure there are enough data points to fit the model
        if len(group) < 3:
            error_rows.append({
                'store_id': store_id,
                'product_id': product_id,
                'error_message': 'Not enough data points to fit ARIMAX model'
            })
            continue

        # Fit ARIMAX model
        try:
            exog = group['price']
            best_order, best_model = optimize_arimax(group['sales'], exog, p_values, d_value, q_values)

            if best_model is None:
                error_rows.append({
                    'store_id': store_id,
                    'product_id': product_id,
                    'error_message': 'Failed to find suitable ARIMAX model'
                })
                continue

            # Forecast future sales (next 3 weeks).
            # Future prices are not known here, so the last 3 observed prices
            # are reused as the exogenous values for the forecast horizon.
            forecast = best_model.forecast(steps=3, exog=exog.iloc[-3:])

            # Use .iloc for positional access: the forecast is indexed by date,
            # and integer keys on a non-integer index are deprecated in pandas.
            forecast_rows.append({
                'store_id': store_id,
                'product_id': product_id,
                'forecast_week_1': forecast.iloc[0],
                'forecast_week_2': forecast.iloc[1],
                'forecast_week_3': forecast.iloc[2],
                'ARIMAX': best_order
            })
        except Exception as e:
            print(f"Error fitting ARIMAX for Store: {store_id}, Product: {product_id}")
            print(str(e))
            error_rows.append({
                'store_id': store_id,
                'product_id': product_id,
                'error_message': str(e)
            })

# Explicit column lists keep the schema stable even when a list is empty
df_forecasts = pd.DataFrame(
    forecast_rows,
    columns=['store_id', 'product_id', 'forecast_week_1', 'forecast_week_2', 'forecast_week_3', 'ARIMAX'],
)
df_product_error = pd.DataFrame(
    error_rows,
    columns=['store_id', 'product_id', 'error_message'],
)

In [80]:
# Display the collected forecasts: one row per store/product pair that got a fitted model
df_forecasts

Unnamed: 0,store_id,product_id,forecast_week_1,forecast_week_2,forecast_week_3,ARIMAX
0,S0020,P0001,3.073147,3.073147,3.073147,"(0, 1, 1)"
1,S0020,P0005,0.076918,0.076918,0.076918,"(0, 1, 1)"
2,S0020,P0007,0.000000,0.000000,0.000000,"(0, 1, 0)"
3,S0020,P0008,-0.000009,-0.000009,-0.000009,"(0, 1, 1)"
4,S0020,P0009,1.774972,1.774972,1.774972,"(0, 1, 1)"
...,...,...,...,...,...,...
1859,S0097,P0739,4.388958,4.388958,4.388958,"(0, 1, 1)"
1860,S0097,P0740,-2.678929,0.490527,2.064109,"(2, 1, 2)"
1861,S0097,P0741,0.646155,0.646155,0.646155,"(0, 1, 1)"
1862,S0097,P0747,29.466444,29.466444,29.466444,"(0, 1, 1)"


In [81]:
# Display the store/product pairs without a forecast, together with the recorded reason
df_product_error

Unnamed: 0,store_id,product_id,error_message
0,S0020,P0053,Not enough data points to fit ARIMAX model
1,S0020,P0077,Not enough data points to fit ARIMAX model
2,S0020,P0104,Not enough data points to fit ARIMAX model
3,S0020,P0210,Not enough data points to fit ARIMAX model
4,S0020,P0270,Not enough data points to fit ARIMAX model
...,...,...,...
74,S0097,P0591,Not enough data points to fit ARIMAX model
75,S0097,P0634,Not enough data points to fit ARIMAX model
76,S0097,P0646,Not enough data points to fit ARIMAX model
77,S0097,P0657,Not enough data points to fit ARIMAX model


In [82]:
# Persist both result tables (forecasts and failures) as CSV, without the row index
csv_targets = (
    (df_forecasts, './Files/last_6_month_forecasts_arimax.csv'),
    (df_product_error, './Files/last_6_month_forecast_errors_arimax.csv'),
)
for frame, csv_path in csv_targets:
    frame.to_csv(csv_path, index=False)