In [19]:
# packages gerais
import pandas as pd
import joblib
from statsmodels.tsa.statespace.sarimax import SARIMAX

# df_sales_ dataset
df_sales_filtered_2019 = joblib.load('df_sales_filtered_2019.pkl')

## 1 Create a date field based in Year and Week of the year

### 1.1 But the date to make sense should be the last day in that specific week

In [20]:
import datetime
# Function to get the last day of the week using ISO calendar
def get_last_day_of_iso_week(year, week):
    first_day_of_year = datetime.datetime(year, 1, 4)  # 4th January is always in the first ISO week
    first_monday_of_year = first_day_of_year - datetime.timedelta(days=first_day_of_year.weekday())
    week_start_date = first_monday_of_year + datetime.timedelta(weeks=week-1)
    return week_start_date + datetime.timedelta(days=6)

# Applying function to DataFrame
df_sales_filtered_2019['last_day_of_week'] = df_sales_filtered_2019.apply(
    lambda x: get_last_day_of_iso_week(x['year'], x['week']), axis=1
)

In [21]:
#df_sales_filtered_2019 = df_sales_filtered_2019[(df_sales_filtered_2019['store_id'] == 'S0097') & (df_sales_filtered_2019['product_id'].isin(['P0001', 'P0704', 'P0702','P0747']))]

### 1.2 Convert date to time series by set as index and sort that

In [22]:
# Set 'date' column as index and sort by date
df_sales_filtered_2019.set_index('last_day_of_week', inplace=True)
df_sales_filtered_2019.sort_index(inplace=True)

In [23]:
#df_sales_filtered_2019.head(60)

### 1.3 Prepare to apply ARIMAX

In [24]:
# Group by 'store_id' and 'product_id'
grouped = df_sales_filtered_2019.groupby(['store_id', 'product_id'])

# Create a DataFrame to store forecasts and a data frame to have those product/store with error
df_forecasts = pd.DataFrame(columns=['store_id', 'product_id', 'forecast_week_1', 'forecast_week_2', 'forecast_week_3', 'ARIMAX'])
df_product_error = pd.DataFrame(columns=['store_id', 'product_id', 'error_message'])

In [25]:
from statsmodels.tsa.arima.model import ARIMA
import warnings
from itertools import product
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import pandas as pd

# Grid search parameters
p_values = range(0, 3)
d_value = 1
q_values = range(0, 3)

# DataFrames to store results
df_forecasts = pd.DataFrame(columns=['store_id', 'product_id', 'forecast_week_1', 'forecast_week_2', 'forecast_week_3', 'ARIMAX'])
df_product_error = pd.DataFrame(columns=['store_id', 'product_id', 'error_message'])
df_metrics = pd.DataFrame(columns=['store_id', 'product_id', 'mse', 'rmse', 'mae', 'mape'])

# Iterate over each group
for (store_id, product_id), group in df_sales_filtered_2019.groupby(['store_id', 'product_id']):
    # Reindex to ensure complete weekly intervals
    group = group.asfreq('W-SUN', method='pad')
    
    # Ensure there are suficientes data points to fit the model
    if len(group) < 2:
        df_product_error = pd.concat([df_product_error, pd.DataFrame([{
            'store_id': store_id,
            'product_id': product_id,
            'error_message': 'Not enough data points to fit ARIMA model'
        }])], ignore_index=True)
        continue
    
    warnings.filterwarnings("ignore")  # specify to ignore warning messages

    # Fit ARIMA model
    try:
        best_aic = float("inf")
        best_order = None
        best_model = None

        for p, q in product(p_values, q_values):
            try:
                model = ARIMA(group['sales'], order=(p, d_value, q))
                model_fit = model.fit()
                aic = model_fit.aic
                if aic < best_aic:
                    best_aic = aic
                    best_order = (p, d_value, q)
                    best_model = model_fit
            except:
                continue
        
        if best_model is not None:
            # Forecast future sales (next 3 weeks)
            forecast = best_model.forecast(steps=3)
            
            # Append the forecast to the DataFrame
            df_forecasts = pd.concat([df_forecasts, pd.DataFrame([{
                'store_id': store_id,
                'product_id': product_id,
                'forecast_week_1': forecast[0],
                'forecast_week_2': forecast[1],
                'forecast_week_3': forecast[2],
                'ARIMA': best_order
            }])], ignore_index=True)
            
            # Calculate metrics (assuming you have actual future sales for comparison)
            actual_future_sales = group['sales'][-3:]  # Adjust based on actual data availability
            
            if len(actual_future_sales) == 3:
                mse = mean_squared_error(actual_future_sales, forecast)
                rmse = mean_squared_error(actual_future_sales, forecast, squared=False)
                mae = mean_absolute_error(actual_future_sales, forecast)
                mape = mean_absolute_percentage_error(actual_future_sales, forecast)
                
                # Append the metrics to the DataFrame
                df_metrics = pd.concat([df_metrics, pd.DataFrame([{
                    'store_id': store_id,
                    'product_id': product_id,
                    'mse': mse,
                    'rmse': rmse,
                    'mae': mae,
                    'mape': mape
                }])], ignore_index=True)
            else:
                df_product_error = pd.concat([df_product_error, pd.DataFrame([{
                    'store_id': store_id,
                    'product_id': product_id,
                    'error_message': 'Not enough actual future data to calculate metrics'
                }])], ignore_index=True)
        else:
            df_product_error = pd.concat([df_product_error, pd.DataFrame([{
                'store_id': store_id,
                'product_id': product_id,
                'error_message': 'Failed to find suitable ARIMA model'
            }])], ignore_index=True)
    except Exception as e:
        print(f"Error fitting ARIMA for Store: {store_id}, Product: {product_id}")
        print(str(e))
        df_product_error = pd.concat([df_product_error, pd.DataFrame([{
            'store_id': store_id,
            'product_id': product_id,
            'error_message': str(e)
        }])], ignore_index=True)

# Save the DataFrames to CSV files
df_product_error.to_csv('product_errors_arimax.csv', index=False)
df_metrics.to_csv('metrics_results_arimax.csv', index=False)


In [26]:
df_forecasts

Unnamed: 0,store_id,product_id,forecast_week_1,forecast_week_2,forecast_week_3,ARIMAX,ARIMA
0,S0020,P0001,2.902114,2.902114,2.902114,,"(0, 1, 1)"
1,S0020,P0005,0.128138,0.128138,0.128138,,"(0, 1, 1)"
2,S0020,P0007,0.000000,0.000000,0.000000,,"(0, 1, 0)"
3,S0020,P0008,0.210146,0.210146,0.210146,,"(0, 1, 1)"
4,S0020,P0009,1.774194,1.774194,1.774194,,"(0, 1, 1)"
...,...,...,...,...,...,...,...
1620,S0097,P0739,4.712004,4.712004,4.712004,,"(0, 1, 1)"
1621,S0097,P0740,0.666549,0.666549,0.666549,,"(0, 1, 1)"
1622,S0097,P0741,0.567435,0.567435,0.567435,,"(0, 1, 1)"
1623,S0097,P0747,14.408705,14.408705,14.408705,,"(0, 1, 1)"


In [35]:
# This input and merge the data predicted with the sales already done
import numpy as np

# Pivot the forecasts DataFrame
df_forecasts_melted = df_forecasts.melt(id_vars=['store_id', 'product_id', 'ARIMAX'], 
                                        value_vars=['forecast_week_1', 'forecast_week_2', 'forecast_week_3'], 
                                        var_name='week', value_name='forecast')

# Extract the week number from the 'week' column
df_forecasts_melted['week'] = df_forecasts_melted['week'].str.extract('(\d+)').astype(int)
df_forecasts_melted

# Resetting index to ensure last_day_of_week is a regular column
df_sales_filtered_2019.reset_index(drop=False, inplace=True)

# Step 1: Find the latest year and week number for each store_id and product_id
latest_weeks = df_sales_filtered_2019.groupby(['store_id', 'product_id'])[['year', 'week']].max().reset_index()
latest_weeks.columns = ['store_id', 'product_id', 'latest_year', 'latest_week']

# Step 2: Merge the latest year and week numbers with the forecast DataFrame
df_combined = df_forecasts_melted.merge(latest_weeks, on=['store_id', 'product_id'], how='left')

# Step 3: Add the forecast weeks to the latest week numbers, adjusting for year transition
def adjust_year_week(row):
    new_week = row['latest_week'] + row['week']
    new_year = row['latest_year']
    while new_week > 52:  # Assuming 52 weeks in a year
        new_week -= 52
        new_year += 1
    return new_year, new_week

df_combined[['forecast_year', 'forecast_week']] = df_combined.apply(
    lambda row: adjust_year_week(row), axis=1, result_type="expand"
)

# Step 4: Calculate the forecast's last_day_of_week based on forecast_year and forecast_week
def get_last_day_of_iso_week(year, week):
    first_day_of_year = pd.Timestamp(year, 1, 4)
    first_monday_of_year = first_day_of_year - pd.Timedelta(days=first_day_of_year.weekday())
    week_start_date = first_monday_of_year + pd.Timedelta(weeks=week-1)
    return week_start_date + pd.Timedelta(days=6)

df_combined['last_day_of_week'] = df_combined.apply(
    lambda row: get_last_day_of_iso_week(row['forecast_year'], row['forecast_week']), axis=1
)

# Step 5: Select and rename the necessary columns to match the desired format, using forecast as sales
df_combined_final = df_combined[['store_id', 'product_id', 'forecast_year', 'forecast_week', 'forecast', 'last_day_of_week', 'ARIMAX']]
df_combined_final.columns = ['store_id', 'product_id', 'year', 'week', 'sales', 'last_day_of_week', 'ARIMAX']

# Step 6: Concatenate with the original sales DataFrame
df_sales_final = df_sales_filtered_2019[['store_id', 'product_id', 'year', 'week', 'sales', 'last_day_of_week']]
df_sales_final['ARIMAX'] = np.nan  # Adding ARIMAX column with NaN for actual sales

df_final = pd.concat([df_sales_final, df_combined_final], ignore_index=True).sort_values(by=['store_id', 'product_id', 'year', 'week'])

# Display the final DataFrame
df_final

Unnamed: 0,store_id,product_id,year,week,sales,last_day_of_week,ARIMAX
0,S0020,P0001,2019,1,1.000000,2019-01-06,
2196,S0020,P0001,2019,2,2.000000,2019-01-13,
3115,S0020,P0001,2019,3,3.000000,2019-01-20,
3603,S0020,P0001,2019,4,1.000000,2019-01-27,
4727,S0020,P0001,2019,5,0.000000,2019-02-03,
...,...,...,...,...,...,...,...
50267,S0097,P0748,2019,38,1.000000,2019-09-22,
52905,S0097,P0748,2019,39,1.000000,2019-09-29,
54530,S0097,P0748,2019,40,0.999998,2019-10-06,
56155,S0097,P0748,2019,41,0.999998,2019-10-13,


In [28]:
df_product_error

Unnamed: 0,store_id,product_id,error_message
0,S0020,P0270,Not enough data points to fit ARIMA model
1,S0020,P0460,Not enough data points to fit ARIMA model
2,S0020,P0581,Not enough data points to fit ARIMA model
3,S0020,P0634,Not enough actual future data to calculate met...
4,S0020,P0676,Not enough actual future data to calculate met...
5,S0020,P0696,Not enough data points to fit ARIMA model
6,S0026,P0270,Not enough data points to fit ARIMA model
7,S0026,P0460,Not enough data points to fit ARIMA model
8,S0026,P0581,Not enough actual future data to calculate met...
9,S0026,P0595,Not enough actual future data to calculate met...


In [29]:
# Save the forecast DataFrame and the error DataFrame to CSV files
df_final.to_csv('./Files/df_final.csv', index=False)
df_product_error.to_csv('./Files/2019_forecast_errors_arimax.csv', index=False)

In [36]:
df_final.head(60)

Unnamed: 0,store_id,product_id,year,week,sales,last_day_of_week,ARIMAX
0,S0020,P0001,2019,1,1.0,2019-01-06,
2196,S0020,P0001,2019,2,2.0,2019-01-13,
3115,S0020,P0001,2019,3,3.0,2019-01-20,
3603,S0020,P0001,2019,4,1.0,2019-01-27,
4727,S0020,P0001,2019,5,0.0,2019-02-03,
6934,S0020,P0001,2019,6,1.0,2019-02-10,
7781,S0020,P0001,2019,7,3.0,2019-02-17,
9221,S0020,P0001,2019,8,2.0,2019-02-24,
10129,S0020,P0001,2019,9,3.0,2019-03-03,
10825,S0020,P0001,2019,10,6.0,2019-03-10,


In [37]:
df_final_2 = df_final[(df_final['store_id'] == 'S0020') & (df_final['product_id'].isin(['P0005', '', '','']))]
df_final_2.head(60)

Unnamed: 0,store_id,product_id,year,week,sales,last_day_of_week,ARIMAX
1057,S0020,P0005,2019,1,0.0,2019-01-06,
2019,S0020,P0005,2019,2,0.0,2019-01-13,
2781,S0020,P0005,2019,3,0.0,2019-01-20,
4577,S0020,P0005,2019,4,0.0,2019-01-27,
4771,S0020,P0005,2019,5,0.0,2019-02-03,
6899,S0020,P0005,2019,6,0.0,2019-02-10,
7641,S0020,P0005,2019,7,1.0,2019-02-17,
9013,S0020,P0005,2019,8,0.0,2019-02-24,
10405,S0020,P0005,2019,9,0.0,2019-03-03,
11057,S0020,P0005,2019,10,1.0,2019-03-10,
