In [1]:
# General packages
import pandas as pd
import joblib

# Pre-filtered sales datasets, one pickle file per time window.
# NOTE: joblib.load unpickles its input, so only load files from a trusted source.
(
    df_sales_filtered_all,
    df_sales_filtered_2018_2019,
    df_sales_filtered_2019,
    df_sales_filtered_last_6_month,
) = map(
    joblib.load,
    (
        'df_sales_filtered_all.pkl',
        'df_sales_filtered_2018_2019.pkl',
        'df_sales_filtered_2019.pkl',
        'df_sales_filtered_last_6_month.pkl',
    ),
)

## 1 Create a date field based on the year and the week of the year

### 1.1 For the date to make sense, it should be the last day of that specific week

In [2]:
import datetime
# Function to get the last day of the week using ISO calendar
def get_last_day_of_iso_week(year, week):
    """Return the Sunday that closes ISO week ``week`` of ISO year ``year``.

    Parameters
    ----------
    year, week : int or whole-number float
        ISO year and ISO week number. Whole-number floats (for example
        ``2019.0``, as produced when pandas upcasts a row to float) and
        NumPy integer scalars are accepted and converted to ``int``.
        No range check is applied to ``week``: values outside the year's
        ISO weeks are extrapolated from week 1 rather than rejected.

    Returns
    -------
    datetime.datetime
        Midnight on the last day (Sunday) of the requested week.

    Raises
    ------
    ValueError
        If ``year`` or ``week`` is not a whole number (e.g. ``14.5`` or NaN).
    """
    whole_numbers = []
    for label, value in (('year', year), ('week', week)):
        as_int = int(value)  # int() itself rejects NaN with a ValueError
        if as_int != value:
            raise ValueError(f"{label} must be a whole number, got {value!r}")
        whole_numbers.append(as_int)
    year, week = whole_numbers

    # 4th January is always in the first ISO week
    anchor_day = datetime.datetime(year, 1, 4)
    # Step back to the Monday that opens ISO week 1
    first_monday = anchor_day - datetime.timedelta(days=anchor_day.weekday())
    # Monday of the requested week, then forward six days to its Sunday
    week_start_date = first_monday + datetime.timedelta(weeks=week - 1)
    return week_start_date + datetime.timedelta(days=6)

# Compute the week-ending date of every row from its 'year' and 'week' values,
# then store the result as a new column
week_end_dates = df_sales_filtered_last_6_month.apply(
    lambda row: get_last_day_of_iso_week(row['year'], row['week']),
    axis=1,
)
df_sales_filtered_last_6_month['last_day_of_week'] = week_end_dates

In [3]:
# Spot-check the computed dates on one store/product pair (last 30 rows)
is_store = df_sales_filtered_last_6_month['store_id'] == 'S0097'
is_product = df_sales_filtered_last_6_month['product_id'] == 'P0704'
df_sales_filtered_last_6_month[is_store & is_product].tail(30)

Unnamed: 0,store_id,product_id,year,week,sales,revenue,stock,price,last_day_of_week
39826,S0097,P0704,2019,14,6.0,18.89,39.0,3.4,2019-04-07
39827,S0097,P0704,2019,15,4.0,12.6,35.0,3.4,2019-04-14
39828,S0097,P0704,2019,16,20.0,47.85,15.0,3.4,2019-04-21
39829,S0097,P0704,2019,17,15.0,33.04,20.0,3.4,2019-04-28
39830,S0097,P0704,2019,18,13.0,33.36,47.0,3.4,2019-05-05
39831,S0097,P0704,2019,19,7.0,15.75,37.0,3.4,2019-05-12
39832,S0097,P0704,2019,20,5.0,12.6,32.0,3.4,2019-05-19
39833,S0097,P0704,2019,21,3.0,9.45,29.0,3.4,2019-05-26
39834,S0097,P0704,2019,22,9.0,22.66,20.0,3.4,2019-06-02
39835,S0097,P0704,2019,23,4.0,8.81,16.0,3.4,2019-06-09


### 1.2 Convert to a time series by setting the date as the index and sorting by it

In [4]:
# Use the week-ending date ('last_day_of_week') as the index and order the rows
# chronologically so the frame can be treated as a time series.
# The guard keeps this cell re-runnable: once the column has been moved into
# the index, calling set_index again would raise a KeyError. If the column is
# missing and the index is not the date either, set_index still raises, so a
# skipped earlier cell is not hidden.
if (
    'last_day_of_week' in df_sales_filtered_last_6_month.columns
    or df_sales_filtered_last_6_month.index.name != 'last_day_of_week'
):
    df_sales_filtered_last_6_month = df_sales_filtered_last_6_month.set_index('last_day_of_week')
df_sales_filtered_last_6_month = df_sales_filtered_last_6_month.sort_index()

In [5]:
# Display the frame to check the new date index and the chronological ordering
df_sales_filtered_last_6_month

Unnamed: 0_level_0,store_id,product_id,year,week,sales,revenue,stock,price
last_day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-04-07,S0020,P0001,2019,14,2.0,18.56,9.0,10.95
2019-04-07,S0020,P0664,2019,14,4.0,5.56,10.0,1.50
2019-04-07,S0097,P0377,2019,14,0.0,0.00,7.0,14.50
2019-04-07,S0085,P0137,2019,14,0.0,0.00,6.0,56.90
2019-04-07,S0085,P0132,2019,14,1.0,13.88,0.0,14.99
...,...,...,...,...,...,...,...,...
2019-10-06,S0085,P0299,2019,40,0.0,0.00,5.0,2.00
2019-10-06,S0097,P0598,2019,40,0.0,0.00,41.0,4.50
2019-10-06,S0020,P0546,2019,40,0.0,0.00,4.0,17.90
2019-10-06,S0085,P0282,2019,40,0.0,0.00,31.0,4.90


### 1.3 Prepare to apply ARIMA

In [6]:
# Split the weekly sales by (store, product) pair so each pair is handled on its own
group_keys = ['store_id', 'product_id']
grouped = df_sales_filtered_last_6_month.groupby(group_keys)

# Empty result tables: one collects the forecasts, the other collects the
# store/product pairs for which no forecast could be produced
forecast_columns = group_keys + ['forecast_week_1', 'forecast_week_2', 'forecast_week_3', 'ARIMA']
error_columns = group_keys + ['error_message']
df_forecasts = pd.DataFrame(columns=forecast_columns)
df_product_error = pd.DataFrame(columns=error_columns)

In [7]:
from statsmodels.tsa.arima.model import ARIMA
import warnings
from itertools import product

# Function to perform grid search for ARIMA parameters
def optimize_arima(series, p_values, d_value, q_values):
    """Grid-search ARIMA(p, d, q) orders and keep the fit with the lowest AIC.

    Parameters
    ----------
    series : array-like
        Observations passed unchanged to ``ARIMA``.
    p_values, q_values : iterable of int
        Candidate autoregressive (p) and moving-average (q) orders; every
        (p, q) combination is tried.
    d_value : int
        Differencing order shared by all candidates.

    Returns
    -------
    tuple
        ``(best_order, best_model)`` where ``best_order`` is the ``(p, d, q)``
        tuple with the lowest AIC and ``best_model`` is its fitted result.
        Both are ``None`` when no candidate could be fitted.
    """
    best_aic = float("inf")
    best_order = None
    best_model = None
    for p, q in product(p_values, q_values):
        order = (p, d_value, q)
        try:
            model_fit = ARIMA(series, order=order).fit()
            aic = model_fit.aic
        except Exception:
            # A candidate that cannot be fitted is skipped so the search can
            # continue. Catching Exception (not a bare except) lets
            # KeyboardInterrupt/SystemExit propagate, so the search can still
            # be interrupted from the kernel.
            continue
        if aic < best_aic:
            best_aic = aic
            best_order = order
            best_model = model_fit
    return best_order, best_model



# Candidate orders explored by the grid search: the AR (p) and MA (q) terms
# each range over 0, 1 and 2, with a single fixed differencing order d
d_value = 1
p_values, q_values = range(3), range(3)

# Fit one ARIMA model per (store, product) pair and forecast the next 3 weeks.
# Rows are collected in plain lists and turned into DataFrames once at the end:
# this avoids the quadratic cost of pd.concat inside the loop and makes the
# cell idempotent (re-running it rebuilds the tables instead of appending
# duplicate rows to them).
forecast_columns = ['store_id', 'product_id', 'forecast_week_1', 'forecast_week_2', 'forecast_week_3', 'ARIMA']
error_columns = ['store_id', 'product_id', 'error_message']
forecast_records = []
error_records = []

# Silence the warnings raised while fitting, but only for the duration of the
# loop, so warnings from the rest of the notebook are not hidden.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    for (store_id, product_id), group in grouped:
        try:
            # Reindex to complete weekly (Sunday-ending) intervals; missing
            # weeks are forward-filled with the previous week's values.
            # Done inside the try so that a group which cannot be reindexed
            # is recorded as an error instead of aborting the whole run.
            group = group.asfreq('W-SUN', method='pad')

            # Ensure there are enough data points to fit the model
            if len(group) < 2:
                error_records.append({
                    'store_id': store_id,
                    'product_id': product_id,
                    'error_message': 'Not enough data points to fit ARIMA model'
                })
                continue

            best_order, best_model = optimize_arima(group['sales'], p_values, d_value, q_values)

            if best_model is None:
                error_records.append({
                    'store_id': store_id,
                    'product_id': product_id,
                    'error_message': 'Failed to find suitable ARIMA model'
                })
                continue

            # Forecast future sales (next 3 weeks). The values are read by
            # position: the forecast is indexed by date, so forecast[0] would
            # rely on the deprecated integer-as-position fallback.
            forecast_values = list(best_model.forecast(steps=3))
            forecast_records.append({
                'store_id': store_id,
                'product_id': product_id,
                'forecast_week_1': forecast_values[0],
                'forecast_week_2': forecast_values[1],
                'forecast_week_3': forecast_values[2],
                'ARIMA': best_order
            })
        except Exception as e:
            print(f"Error fitting ARIMA for Store: {store_id}, Product: {product_id}")
            print(str(e))
            error_records.append({
                'store_id': store_id,
                'product_id': product_id,
                'error_message': str(e)
            })

# Explicit columns keep the expected layout even when a table ends up empty
df_forecasts = pd.DataFrame(forecast_records, columns=forecast_columns)
df_product_error = pd.DataFrame(error_records, columns=error_columns)


In [8]:
# Persist both result tables as CSV files, without writing the row index
for frame, csv_path in (
    (df_forecasts, './Files/last_6_month_forecasts.csv'),
    (df_product_error, './Files/last_6_month_forecast_errors.csv'),
):
    frame.to_csv(csv_path, index=False)

In [9]:
# Display the forecast table: one row per store/product pair with the three
# weekly forecasts and the selected ARIMA order
df_forecasts

Unnamed: 0,store_id,product_id,forecast_week_1,forecast_week_2,forecast_week_3,ARIMA
0,S0020,P0001,3.073147,3.073147,3.073147,"(0, 1, 1)"
1,S0020,P0005,0.074072,0.074072,0.074072,"(0, 1, 1)"
2,S0020,P0007,0.000000,0.000000,0.000000,"(0, 1, 0)"
3,S0020,P0008,0.000000,0.000000,0.000000,"(0, 1, 0)"
4,S0020,P0009,1.703702,1.703702,1.703702,"(0, 1, 1)"
...,...,...,...,...,...,...
1912,S0097,P0739,2.666837,5.104150,3.586975,"(1, 1, 2)"
1913,S0097,P0740,0.622303,0.622303,0.622303,"(0, 1, 1)"
1914,S0097,P0741,0.453026,0.453026,0.453026,"(0, 1, 1)"
1915,S0097,P0747,15.086548,15.086548,15.086548,"(0, 1, 1)"
