In [1]:
# packages gerais
import pandas as pd
import joblib
from statsmodels.tsa.statespace.sarimax import SARIMAX

# df_sales_ dataset
df_sales_filtered_2019 = joblib.load('df_sales_filtered_2019.pkl')

## 1 Create a date field based in Year and Week of the year

### 1.1 But the date to make sense should be the last day in that specific week

In [2]:
import datetime
# Function to get the last day of the week using ISO calendar
def get_last_day_of_iso_week(year, week):
    first_day_of_year = datetime.datetime(year, 1, 4)  # 4th January is always in the first ISO week
    first_monday_of_year = first_day_of_year - datetime.timedelta(days=first_day_of_year.weekday())
    week_start_date = first_monday_of_year + datetime.timedelta(weeks=week-1)
    return week_start_date + datetime.timedelta(days=6)

# Applying function to DataFrame
df_sales_filtered_2019['last_day_of_week'] = df_sales_filtered_2019.apply(
    lambda x: get_last_day_of_iso_week(x['year'], x['week']), axis=1
)

In [3]:
#df_sales_filtered_2019 = df_sales_filtered_2019[(df_sales_filtered_2019['store_id'] == 'S0097') & (df_sales_filtered_2019['product_id'].isin(['P0704', 'P0001']))]

### 1.2 Convert date to time series by set as index and sort that

In [4]:
# Set 'date' column as index and sort by date
df_sales_filtered_2019.set_index('last_day_of_week', inplace=True)
df_sales_filtered_2019.sort_index(inplace=True)

In [5]:
df_sales_filtered_2019

Unnamed: 0_level_0,store_id,product_id,year,week,sales,revenue,stock,price
last_day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-04-07,S0020,P0001,2019,14,2.0,18.56,9.0,10.95
2019-04-07,S0085,P0548,2019,14,39.0,86.67,79.0,1.95
2019-04-07,S0062,P0022,2019,14,1.0,60.45,9.0,101.90
2019-04-07,S0020,P0137,2019,14,0.0,0.00,16.0,56.90
2019-04-07,S0062,P0026,2019,14,2.0,9.16,6.0,4.95
...,...,...,...,...,...,...,...,...
2019-09-29,S0085,P0709,2019,39,88.0,567.07,287.0,7.00
2019-09-29,S0085,P0711,2019,39,0.0,0.00,56.0,12.90
2019-09-29,S0026,P0508,2019,39,1.0,6.02,12.0,6.50
2019-09-29,S0026,P0517,2019,39,0.0,0.00,3.0,199.95


### 1.3 Prepare to apply ARIMAX

In [6]:
# Group by 'store_id' and 'product_id'
grouped = df_sales_filtered_2019.groupby(['store_id', 'product_id'])

# Create a DataFrame to store forecasts and a data frame to have those product/store with error
df_forecasts = pd.DataFrame(columns=['store_id', 'product_id', 'forecast_week_1', 'forecast_week_2', 'forecast_week_3', 'ARIMAX'])
df_product_error = pd.DataFrame(columns=['store_id', 'product_id', 'error_message'])

In [7]:
from statsmodels.tsa.arima.model import ARIMA
import warnings
from itertools import product
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import pandas as pd

# Grid search parameters
p_values = range(0, 3)
d_value = 1
q_values = range(0, 3)

# DataFrames to store results
df_forecasts = pd.DataFrame(columns=['store_id', 'product_id', 'forecast_week_1', 'forecast_week_2', 'forecast_week_3', 'ARIMA'])
df_product_error = pd.DataFrame(columns=['store_id', 'product_id', 'error_message'])
df_metrics = pd.DataFrame(columns=['store_id', 'product_id', 'mse', 'rmse', 'mae', 'mape'])

# Iterate over each group
for (store_id, product_id), group in df_sales_filtered_2019.groupby(['store_id', 'product_id']):
    # Reindex to ensure complete weekly intervals
    group = group.asfreq('W-SUN', method='pad')
    
    # Ensure there are suficientes data points to fit the model
    if len(group) < 2:
        df_product_error = pd.concat([df_product_error, pd.DataFrame([{
            'store_id': store_id,
            'product_id': product_id,
            'error_message': 'Not enough data points to fit ARIMA model'
        }])], ignore_index=True)
        continue
    
    warnings.filterwarnings("ignore")  # specify to ignore warning messages

    # Fit ARIMA model
    try:
        best_aic = float("inf")
        best_order = None
        best_model = None

        for p, q in product(p_values, q_values):
            try:
                model = ARIMA(group['sales'], order=(p, d_value, q))
                model_fit = model.fit()
                aic = model_fit.aic
                if aic < best_aic:
                    best_aic = aic
                    best_order = (p, d_value, q)
                    best_model = model_fit
            except:
                continue
        
        if best_model is not None:
            # Forecast future sales (next 3 weeks)
            forecast = best_model.forecast(steps=3)
            
            # Append the forecast to the DataFrame
            df_forecasts = pd.concat([df_forecasts, pd.DataFrame([{
                'store_id': store_id,
                'product_id': product_id,
                'forecast_week_1': forecast[0],
                'forecast_week_2': forecast[1],
                'forecast_week_3': forecast[2],
                'ARIMA': best_order
            }])], ignore_index=True)
            
            # Calculate metrics (assuming you have actual future sales for comparison)
            actual_future_sales = group['sales'][-3:]  # Adjust based on actual data availability
            
            if len(actual_future_sales) == 3:
                mse = mean_squared_error(actual_future_sales, forecast)
                rmse = mean_squared_error(actual_future_sales, forecast, squared=False)
                mae = mean_absolute_error(actual_future_sales, forecast)
                mape = mean_absolute_percentage_error(actual_future_sales, forecast)
                
                # Append the metrics to the DataFrame
                df_metrics = pd.concat([df_metrics, pd.DataFrame([{
                    'store_id': store_id,
                    'product_id': product_id,
                    'mse': mse,
                    'rmse': rmse,
                    'mae': mae,
                    'mape': mape
                }])], ignore_index=True)
            else:
                df_product_error = pd.concat([df_product_error, pd.DataFrame([{
                    'store_id': store_id,
                    'product_id': product_id,
                    'error_message': 'Not enough actual future data to calculate metrics'
                }])], ignore_index=True)
        else:
            df_product_error = pd.concat([df_product_error, pd.DataFrame([{
                'store_id': store_id,
                'product_id': product_id,
                'error_message': 'Failed to find suitable ARIMA model'
            }])], ignore_index=True)
    except Exception as e:
        print(f"Error fitting ARIMA for Store: {store_id}, Product: {product_id}")
        print(str(e))
        df_product_error = pd.concat([df_product_error, pd.DataFrame([{
            'store_id': store_id,
            'product_id': product_id,
            'error_message': str(e)
        }])], ignore_index=True)

# Save the DataFrames to CSV files
df_forecasts.to_csv('forecasts_arimax.csv', index=False)
df_product_error.to_csv('product_errors_arimax.csv', index=False)
df_metrics.to_csv('metrics_results_arimax.csv', index=False)


In [8]:
df_forecasts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1900 entries, 0 to 1899
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   store_id         1900 non-null   object 
 1   product_id       1900 non-null   object 
 2   forecast_week_1  1900 non-null   float64
 3   forecast_week_2  1900 non-null   float64
 4   forecast_week_3  1900 non-null   float64
 5   ARIMA            1900 non-null   object 
dtypes: float64(3), object(3)
memory usage: 89.2+ KB


In [9]:
df_forecasts

Unnamed: 0,store_id,product_id,forecast_week_1,forecast_week_2,forecast_week_3,ARIMA
0,S0020,P0001,3.073147,3.073147,3.073147,"(0, 1, 1)"
1,S0020,P0005,0.076913,0.076913,0.076913,"(0, 1, 1)"
2,S0020,P0007,0.000000,0.000000,0.000000,"(0, 1, 0)"
3,S0020,P0008,0.000000,0.000000,0.000000,"(0, 1, 0)"
4,S0020,P0009,1.769237,1.769237,1.769237,"(0, 1, 1)"
...,...,...,...,...,...,...
1895,S0097,P0739,4.384613,4.384613,4.384613,"(0, 1, 1)"
1896,S0097,P0740,0.690476,0.690476,0.690476,"(0, 1, 1)"
1897,S0097,P0741,0.645679,0.645679,0.645679,"(0, 1, 1)"
1898,S0097,P0747,15.662993,15.662993,15.662993,"(0, 1, 1)"


In [10]:
df_product_error

Unnamed: 0,store_id,product_id,error_message
0,S0020,P0053,Not enough actual future data to calculate met...
1,S0020,P0077,Not enough data points to fit ARIMA model
2,S0020,P0104,Not enough actual future data to calculate met...
3,S0020,P0210,Not enough data points to fit ARIMA model
4,S0020,P0270,Not enough data points to fit ARIMA model
...,...,...,...
74,S0097,P0591,Not enough data points to fit ARIMA model
75,S0097,P0634,Not enough actual future data to calculate met...
76,S0097,P0646,Not enough actual future data to calculate met...
77,S0097,P0657,Not enough data points to fit ARIMA model


In [11]:
df_metrics

Unnamed: 0,store_id,product_id,mse,rmse,mae,mape
0,S0020,P0001,6.956586,2.637534,2.357716,1.056900e+00
1,S0020,P0005,0.005916,0.076913,0.076913,3.463870e+14
2,S0020,P0007,0.000000,0.000000,0.000000,0.000000e+00
3,S0020,P0008,0.000000,0.000000,0.000000,0.000000e+00
4,S0020,P0009,2.053252,1.432917,1.410254,2.655978e+15
...,...,...,...,...,...,...
1859,S0097,P0739,12.942790,3.597609,3.128204,1.316436e+16
1860,S0097,P0740,0.889456,0.943110,0.896825,2.073084e+15
1861,S0097,P0741,0.792210,0.890062,0.784774,9.692930e+14
1862,S0097,P0747,235.220696,15.336906,15.329660,4.702657e+16
