In [1]:
# packages gerais
import pandas as pd
import joblib
from statsmodels.tsa.statespace.sarimax import SARIMAX

# df_sales_ dataset
#df_sales_filtered_2019 = joblib.load('df_sales_filtered_2019.pkl')
# Load the sales DataFrame used by every later cell from a joblib dump in the
# working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load dumps that come from a trusted source.
df_sales_filtered_all = joblib.load('df_sales_filtered_all.pkl')

## 1 Create a date field based on the year and the week of the year

### 1.1 For the date to be meaningful, it should be the last day of that specific week

In [2]:
import datetime
# Function to get the last day of the week using ISO calendar
def get_last_day_of_iso_week(year, week):
    """Return the Sunday that closes ISO week ``week`` of ISO year ``year``.

    Parameters
    ----------
    year : int-like
        ISO year. Plain ``int`` as well as NumPy / pandas integer scalars
        are accepted.
    week : int-like
        ISO week number, counted from 1.

    Returns
    -------
    datetime.datetime
        Midnight of the last day (Sunday) of the requested week.
    """
    # datetime.timedelta rejects NumPy integer scalars (TypeError), and values
    # pulled out of a DataFrame are frequently NumPy integers, so normalise.
    year, week = int(year), int(week)
    jan_4 = datetime.datetime(year, 1, 4)  # 4th January is always in the first ISO week
    monday_of_week_one = jan_4 - datetime.timedelta(days=jan_4.weekday())
    # Jump to the Monday of the requested week, then 6 days on to its Sunday.
    return monday_of_week_one + datetime.timedelta(weeks=week - 1, days=6)

# Compute, row by row, the date of the last day of the ISO week described by
# the 'year' and 'week' columns, then store it as a new column.
week_end_dates = df_sales_filtered_all.apply(
    lambda row: get_last_day_of_iso_week(row['year'], row['week']),
    axis=1,
)
df_sales_filtered_all['last_day_of_week'] = week_end_dates

In [3]:
#df_sales_filtered_2019 = df_sales_filtered_2019[(df_sales_filtered_2019['store_id'] == 'S0097') & (df_sales_filtered_2019['product_id'].isin(['P0001', 'P0704', 'P0702','P0747']))]

### 1.2 Convert the data to a time series by setting the date as the index and sorting by it

In [4]:
# Use the 'last_day_of_week' column as the index and sort the rows by it.
# The guard keeps the cell re-runnable: once the column has been moved into the
# index, calling set_index again would raise a KeyError.
if df_sales_filtered_all.index.name != 'last_day_of_week':
    df_sales_filtered_all.set_index('last_day_of_week', inplace=True)
df_sales_filtered_all.sort_index(inplace=True)

In [5]:
#df_sales_filtered_2019.head(60)

### 1.3 Prepare to apply ARIMAX

In [6]:
# One group of rows per (store, product) pair
grouped = df_sales_filtered_all.groupby(by=['store_id', 'product_id'])

# Empty result tables: one collects the store/product pairs that could not be
# modelled, the other the three forecast values plus the chosen ARIMAX order
df_product_error = pd.DataFrame(columns=['store_id', 'product_id', 'error_message'])
df_forecasts = pd.DataFrame(
    columns=[
        'store_id', 'product_id',
        'forecast_week_1', 'forecast_week_2', 'forecast_week_3',
        'ARIMAX',
    ]
)

In [7]:
import warnings
import pandas as pd
from itertools import product
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Function to perform grid search for ARIMAX parameters
def optimize_arimax(series, exog, p_values, d_value, q_values):
    """Grid-search ARIMAX(p, d, q) orders and keep the fit with the lowest AIC.

    Parameters
    ----------
    series : array-like
        Endogenous series passed to ``SARIMAX``.
    exog : array-like or None
        Exogenous regressor(s) passed to ``SARIMAX``.
    p_values, q_values : iterable of int
        Candidate AR and MA orders; every (p, q) combination is tried.
    d_value : int
        Differencing order shared by all candidates.

    Returns
    -------
    tuple
        ``(best_order, best_model)`` where ``best_order`` is ``(p, d, q)`` and
        ``best_model`` is the fitted result with the smallest AIC. Both are
        ``None`` when no candidate could be fitted.
    """
    best_aic = float("inf")
    best_order = None
    best_model = None
    for p, q in product(p_values, q_values):
        order = (p, d_value, q)
        try:
            model_fit = SARIMAX(series, exog=exog, order=order).fit(disp=False)
            aic = model_fit.aic
        except Exception:
            # A candidate that cannot be fitted is skipped. Only Exception is
            # caught so that KeyboardInterrupt / SystemExit still stop a long
            # grid search instead of being silently swallowed.
            continue
        if aic < best_aic:
            best_aic = aic
            best_order = order
            best_model = model_fit
    return best_order, best_model

# Function to calculate metrics
def calculate_metrics(actual, forecast):
    """Compute MSE, RMSE, MAE and MAPE between actual and forecast values.

    Parameters
    ----------
    actual : array-like
        Observed values.
    forecast : array-like
        Predicted values, in the same order as ``actual``.

    Returns
    -------
    tuple
        ``(mse, rmse, mae, mape)``. ``mape`` is expressed in percent and is
        averaged only over the positions where ``actual`` is non-zero; it is
        ``nan`` when every actual value is zero.
    """
    # Work on plain float arrays so the comparison is positional even when a
    # pandas Series (which would otherwise align on its index) is passed in.
    actual = np.asarray(actual, dtype=float)
    forecast = np.asarray(forecast, dtype=float)

    mse = mean_squared_error(actual, forecast)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, forecast)

    # A percentage error is undefined where the actual value is 0; dividing
    # there would turn the whole average into inf/nan.
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(np.abs((actual[nonzero] - forecast[nonzero]) / actual[nonzero])) * 100
    else:
        mape = np.nan
    return mse, rmse, mae, mape

# Column layouts of the three result tables (kept explicit so each table has
# its columns even when no row was collected for it)
forecast_columns = ['store_id', 'product_id', 'forecast_week_1', 'forecast_week_2', 'forecast_week_3', 'ARIMAX']
error_columns = ['store_id', 'product_id', 'error_message']
metric_columns = ['store_id', 'product_id', 'mse', 'rmse', 'mae', 'mape']

# Rows are collected in plain lists and converted to DataFrames once after the
# loop; concatenating DataFrames inside the loop copies every previous row on
# each iteration.
forecast_rows = []
error_rows = []
metric_rows = []

# Grid search parameters
p_values = range(0, 3)
d_value  = 1
q_values = range(0, 3)

# Number of weeks to forecast; the same number of trailing weeks is held out
# when the accuracy metrics are computed
horizon = 3

# Model fitting emits many warnings; silence them once for the whole run
warnings.filterwarnings("ignore")  # specify to ignore warning messages

# Iterate over each group
for (store_id, product_id), group in grouped:
    # Reindex to ensure complete weekly intervals (gaps are forward-filled)
    group = group.asfreq('W-SUN', method='pad')

    # Ensure there are enough data points to fit the model
    if len(group) < 6:  # Need at least 6 data points to train and test
        error_rows.append({
            'store_id': store_id,
            'product_id': product_id,
            'error_message': 'Not enough data points to fit ARIMAX model'
        })
        continue

    # Fit ARIMAX model
    try:
        sales = group['sales']
        exog = group['price']
        best_order, best_model = optimize_arimax(sales, exog, p_values, d_value, q_values)

        if best_model is None:
            error_rows.append({
                'store_id': store_id,
                'product_id': product_id,
                'error_message': 'Failed to find suitable ARIMAX model'
            })
            continue

        # Forecast future sales (next 3 weeks). Future prices are not known,
        # so the last observed prices are reused as the exogenous input.
        # The result is read by position from an array: integer keys on a
        # date-indexed Series rely on a deprecated positional fallback.
        forecast = np.asarray(
            best_model.forecast(steps=horizon, exog=exog.iloc[-horizon:]), dtype=float
        )

        # Accuracy metrics from a hold-out backtest: refit the selected order
        # without the last `horizon` weeks and predict exactly those weeks.
        # (Comparing the future forecast above with the last observed weeks
        # would compare values that belong to different dates.)
        try:
            holdout_fit = SARIMAX(
                sales.iloc[:-horizon], exog=exog.iloc[:-horizon], order=best_order
            ).fit(disp=False)
            holdout_forecast = holdout_fit.forecast(steps=horizon, exog=exog.iloc[-horizon:])
            mse, rmse, mae, mape = calculate_metrics(
                sales.iloc[-horizon:].values, np.asarray(holdout_forecast, dtype=float)
            )
        except Exception as backtest_error:
            # Keep the forecast even if the shortened series cannot be fitted;
            # the metrics are then reported as missing.
            print(f"Backtest failed for Store: {store_id}, Product: {product_id}")
            print(str(backtest_error))
            mse = rmse = mae = mape = np.nan

        # Record the forecast and the metrics
        forecast_rows.append({
            'store_id': store_id,
            'product_id': product_id,
            'forecast_week_1': forecast[0],
            'forecast_week_2': forecast[1],
            'forecast_week_3': forecast[2],
            'ARIMAX': best_order
        })
        metric_rows.append({
            'store_id': store_id,
            'product_id': product_id,
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'mape': mape
        })
    except Exception as e:
        print(f"Error fitting ARIMAX for Store: {store_id}, Product: {product_id}")
        print(str(e))
        error_rows.append({
            'store_id': store_id,
            'product_id': product_id,
            'error_message': str(e)
        })

# DataFrames to store results
df_forecasts = pd.DataFrame(forecast_rows, columns=forecast_columns)
df_product_error = pd.DataFrame(error_rows, columns=error_columns)
df_metrics = pd.DataFrame(metric_rows, columns=metric_columns)

# Save the DataFrames to CSV files
df_product_error.to_csv('product_errors_arimax.csv', index=False)
df_metrics.to_csv('metrics_results_arimax.csv', index=False)
df_forecasts.to_csv('forecasts_arimax.csv', index=False)

In [8]:
# Display the forecast table: one row per store/product with the three
# forecast values and the selected ARIMAX order
df_forecasts

Unnamed: 0,store_id,product_id,forecast_week_1,forecast_week_2,forecast_week_3,ARIMAX
0,S0020,P0001,2.543150,2.543150,2.543150,"(0, 1, 1)"
1,S0020,P0005,0.139137,0.139137,0.139137,"(0, 1, 1)"
2,S0020,P0007,0.000000,0.000000,0.000000,"(0, 1, 0)"
3,S0020,P0008,0.300674,0.300674,0.300674,"(0, 1, 1)"
4,S0020,P0009,1.717164,1.717164,1.717164,"(0, 1, 1)"
...,...,...,...,...,...,...
1568,S0097,P0739,5.274460,3.208062,3.007870,"(2, 1, 1)"
1569,S0097,P0740,0.869525,0.639088,0.592115,"(1, 1, 1)"
1570,S0097,P0741,0.483696,0.483696,0.483696,"(0, 1, 1)"
1571,S0097,P0747,10.401855,10.401855,10.401855,"(0, 1, 1)"


In [9]:
# Merge the predicted sales with the sales already recorded
import numpy as np

# Unpivot the forecasts DataFrame: one row per store/product and forecast step
df_forecasts_melted = df_forecasts.melt(
    id_vars=['store_id', 'product_id', 'ARIMAX'],
    value_vars=['forecast_week_1', 'forecast_week_2', 'forecast_week_3'],
    var_name='week',
    value_name='forecast',
)

# Extract the step number (1, 2 or 3) from names such as 'forecast_week_2'.
# The pattern is a raw string because '\d' is not a valid escape sequence in a
# normal string literal; expand=False makes str.extract return a Series, which
# is what a single-column assignment expects.
df_forecasts_melted['week'] = (
    df_forecasts_melted['week'].str.extract(r'(\d+)', expand=False).astype(int)
)

# Resetting index to ensure last_day_of_week is a regular column.
# Guarded so that re-running the cell does not push an extra 'index' column
# into the frame once the reset has already happened.
if df_sales_filtered_all.index.name == 'last_day_of_week':
    df_sales_filtered_all.reset_index(drop=False, inplace=True)

# Step 1: Find the latest (year, week) pair for each store_id and product_id.
# Year and week must be taken from the same row: the maximum of each column on
# its own can pair the last year with a high week number of an earlier year.
latest_weeks = (
    df_sales_filtered_all
    .dropna(subset=['year', 'week'])
    .sort_values(['store_id', 'product_id', 'year', 'week'])
    .drop_duplicates(subset=['store_id', 'product_id'], keep='last')
    [['store_id', 'product_id', 'year', 'week']]
    .rename(columns={'year': 'latest_year', 'week': 'latest_week'})
    .reset_index(drop=True)
)

# Step 2: Merge the latest year and week numbers with the forecast DataFrame
df_combined = df_forecasts_melted.merge(latest_weeks, on=['store_id', 'product_id'], how='left')

# Step 3: Add the forecast weeks to the latest week numbers, adjusting for year transition
def adjust_year_week(row):
    """Return the ISO (year, week) that lies ``row['week']`` weeks after the latest one.

    Parameters
    ----------
    row : mapping
        Must provide ``'latest_year'``, ``'latest_week'`` and ``'week'`` (the
        number of weeks to move forward).

    Returns
    -------
    tuple of int
        ``(year, week)`` after rolling over as many year boundaries as needed.
        The length of each ISO year (52 or 53 weeks) is taken into account.
    """
    import datetime  # local import keeps this cell self-contained

    def iso_weeks_in_year(year):
        # 28 December always falls in the last ISO week of its year.
        return datetime.date(year, 12, 28).isocalendar()[1]

    new_year = int(row['latest_year'])
    new_week = int(row['latest_week']) + int(row['week'])
    weeks_this_year = iso_weeks_in_year(new_year)
    while new_week > weeks_this_year:
        new_week -= weeks_this_year
        new_year += 1
        weeks_this_year = iso_weeks_in_year(new_year)
    return new_year, new_week

# Run adjust_year_week on every row; result_type="expand" spreads the returned
# (year, week) pair over the two new columns.
df_combined[['forecast_year', 'forecast_week']] = df_combined.apply(
    adjust_year_week,
    axis=1,
    result_type="expand",
)

# Step 4: Calculate the forecast's last_day_of_week based on forecast_year and forecast_week
def get_last_day_of_iso_week(year, week):
    """Return the last day (Sunday) of ISO week ``week`` of ``year`` as a ``pd.Timestamp``.

    NOTE(review): this redefinition replaces the ``datetime``-based function of
    the same name declared earlier in the notebook.
    """
    # 4 January always belongs to ISO week 1, so stepping back to its Monday
    # gives the first day of that week.
    jan_4 = pd.Timestamp(year, 1, 4)
    monday_of_week_one = jan_4 - pd.Timedelta(days=jan_4.weekday())
    monday_of_target_week = monday_of_week_one + pd.Timedelta(weeks=week-1)
    sunday_of_target_week = monday_of_target_week + pd.Timedelta(days=6)
    return sunday_of_target_week

# Week-ending date of every forecast row
df_combined['last_day_of_week'] = df_combined.apply(
    lambda row: get_last_day_of_iso_week(row['forecast_year'], row['forecast_week']), axis=1
)

# Step 5: Select and rename the necessary columns to match the desired format, using forecast as sales.
# rename() returns a new frame, so nothing is written onto a slice of df_combined.
df_combined_final = (
    df_combined[['store_id', 'product_id', 'forecast_year', 'forecast_week', 'forecast', 'last_day_of_week', 'ARIMAX']]
    .rename(columns={'forecast_year': 'year', 'forecast_week': 'week', 'forecast': 'sales'})
)

# Step 6: Concatenate with the original sales DataFrame.
# assign() builds a new frame with the extra column (NaN marks rows that are
# actual sales); adding the column to a column-subset selection in place
# triggers pandas' SettingWithCopyWarning.
df_sales_final = (
    df_sales_filtered_all[['store_id', 'product_id', 'year', 'week', 'sales', 'last_day_of_week']]
    .assign(ARIMAX=np.nan)
)

df_final = pd.concat([df_sales_final, df_combined_final], ignore_index=True).sort_values(by=['store_id', 'product_id', 'year', 'week'])

df_final['sales'] = df_final['sales'].round(2)
# Display the final DataFrame
df_final

Unnamed: 0,store_id,product_id,year,week,sales,last_day_of_week,ARIMAX
36246,S0020,P0001,2017,45,0.0,2017-11-12,
36466,S0020,P0001,2017,46,1.0,2017-11-19,
37690,S0020,P0001,2017,47,2.0,2017-11-26,
38446,S0020,P0001,2017,48,0.0,2017-12-03,
39771,S0020,P0001,2017,49,1.0,2017-12-10,
...,...,...,...,...,...,...,...
147058,S0097,P0748,2019,38,1.0,2019-09-22,
148795,S0097,P0748,2019,39,1.0,2019-09-29,
150368,S0097,P0748,2019,40,1.0,2019-10-06,"(0, 1, 1)"
151941,S0097,P0748,2019,41,1.0,2019-10-13,"(0, 1, 1)"


In [10]:
# Display the store/product pairs for which no forecast was produced, with the reason
df_product_error

Unnamed: 0,store_id,product_id,error_message
0,S0020,P0012,Not enough data points to fit ARIMAX model
1,S0020,P0117,Not enough data points to fit ARIMAX model
2,S0020,P0270,Not enough data points to fit ARIMAX model
3,S0020,P0314,Not enough data points to fit ARIMAX model
4,S0020,P0326,Not enough data points to fit ARIMAX model
...,...,...,...
62,S0097,P0570,Not enough data points to fit ARIMAX model
63,S0097,P0595,Not enough data points to fit ARIMAX model
64,S0097,P0634,Not enough data points to fit ARIMAX model
65,S0097,P0675,Not enough data points to fit ARIMAX model


In [11]:
# Write the combined sales/forecast table and the error table to ./Files,
# without the DataFrame index
for frame, csv_path in (
    (df_final, './Files/df_final.csv'),
    (df_product_error, './Files/2019_forecast_errors_arimax.csv'),
):
    frame.to_csv(csv_path, index=False)

In [12]:
# Display the combined table of actual sales and forecast rows again
df_final

Unnamed: 0,store_id,product_id,year,week,sales,last_day_of_week,ARIMAX
36246,S0020,P0001,2017,45,0.0,2017-11-12,
36466,S0020,P0001,2017,46,1.0,2017-11-19,
37690,S0020,P0001,2017,47,2.0,2017-11-26,
38446,S0020,P0001,2017,48,0.0,2017-12-03,
39771,S0020,P0001,2017,49,1.0,2017-12-10,
...,...,...,...,...,...,...,...
147058,S0097,P0748,2019,38,1.0,2019-09-22,
148795,S0097,P0748,2019,39,1.0,2019-09-29,
150368,S0097,P0748,2019,40,1.0,2019-10-06,"(0, 1, 1)"
151941,S0097,P0748,2019,41,1.0,2019-10-13,"(0, 1, 1)"


In [13]:
# Summarise df_final: column dtypes, non-null counts and memory usage
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 153515 entries, 36246 to 153514
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   store_id          153515 non-null  object        
 1   product_id        153515 non-null  object        
 2   year              153515 non-null  Int64         
 3   week              153515 non-null  Int64         
 4   sales             153515 non-null  float64       
 5   last_day_of_week  153515 non-null  datetime64[ns]
 6   ARIMAX            4719 non-null    object        
dtypes: Int64(2), datetime64[ns](1), float64(1), object(3)
memory usage: 9.7+ MB
