In [1]:
# General packages
import pandas as pd
import joblib


# Sales datasets, one joblib pickle per filtered variant.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files that come from a trusted source.
(
    df_sales_filtered_all,
    df_sales_filtered_2018_2019,
    df_sales_filtered_2019,
    df_sales_filtered_last_6_month,
) = (
    joblib.load(pickle_name)
    for pickle_name in (
        'df_sales_filtered_all.pkl',
        'df_sales_filtered_2018_2019.pkl',
        'df_sales_filtered_2019.pkl',
        'df_sales_filtered_last_6_month.pkl',
    )
)

## 1 Create a date field based on the year and the week of the year

### 1.1 For the date to be meaningful, it should be the last day of that specific week

In [2]:
import datetime
# Function to get the last day of the week using ISO calendar
def get_last_day_of_iso_week(year, week):
    """Return the last day (Sunday) of ISO week ``week`` in ISO year ``year``.

    Args:
        year: ISO year; any integer-like value (Python or NumPy integer).
        week: 1-based ISO week number; any integer-like value.

    Returns:
        A ``datetime.datetime`` at midnight on the Sunday that closes the week.
    """
    # datetime.timedelta rejects NumPy integer scalars, which is what values
    # read out of integer columns can be, so normalise to plain ints first.
    year, week = int(year), int(week)
    jan_4 = datetime.datetime(year, 1, 4)  # 4th January is always in the first ISO week
    first_monday = jan_4 - datetime.timedelta(days=jan_4.weekday())
    week_start = first_monday + datetime.timedelta(weeks=week - 1)
    return week_start + datetime.timedelta(days=6)

# Derive the week-ending date of every row from its (year, week) pair
def _row_week_end(row):
    """Week-ending date for one sales row."""
    return get_last_day_of_iso_week(row['year'], row['week'])

df_sales_filtered_2019['last_day_of_week'] = df_sales_filtered_2019.apply(_row_week_end, axis=1)

In [3]:
# Spot-check the derived dates on one store/product pair (last 30 rows)
df_sales_filtered_2019.loc[
    lambda frame: frame['store_id'].eq('S0097') & frame['product_id'].eq('P0704')
].tail(30)

Unnamed: 0,store_id,product_id,year,week,sales,revenue,stock,price,last_day_of_week
52139,S0097,P0704,2019,10,10.0,31.48,22.0,3.4,2019-03-10
52140,S0097,P0704,2019,11,10.0,24.87,12.0,3.4,2019-03-17
52141,S0097,P0704,2019,12,13.0,28.64,17.0,3.4,2019-03-24
52142,S0097,P0704,2019,13,13.0,31.48,27.0,3.4,2019-03-31
52143,S0097,P0704,2019,14,8.0,25.19,39.0,3.4,2019-04-07
52144,S0097,P0704,2019,15,4.0,12.6,35.0,3.4,2019-04-14
52145,S0097,P0704,2019,16,20.0,47.85,15.0,3.4,2019-04-21
52146,S0097,P0704,2019,17,15.0,33.04,20.0,3.4,2019-04-28
52147,S0097,P0704,2019,18,13.0,33.36,47.0,3.4,2019-05-05
52148,S0097,P0704,2019,19,7.0,15.75,37.0,3.4,2019-05-12


### 1.2 Convert the data to a time series by setting the date as the index and sorting by it

In [4]:
# Use the 'last_day_of_week' column as the index and sort chronologically.
# Guarded so the cell can be re-run: once the column has been moved into the
# index, a second set_index on the same column would raise KeyError.
if df_sales_filtered_2019.index.name != 'last_day_of_week':
    df_sales_filtered_2019.set_index('last_day_of_week', inplace=True)
df_sales_filtered_2019.sort_index(inplace=True)

In [5]:
# Inspect the index type, column dtypes and non-null counts
df_sales_filtered_2019.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 52906 entries, 2019-01-06 to 2019-09-29
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   store_id    52906 non-null  object 
 1   product_id  52906 non-null  object 
 2   year        52906 non-null  UInt32 
 3   week        52906 non-null  UInt32 
 4   sales       52906 non-null  float64
 5   revenue     52906 non-null  float64
 6   stock       52906 non-null  float64
 7   price       52906 non-null  float64
dtypes: UInt32(2), float64(4), object(2)
memory usage: 3.3+ MB


### 1.3 Prepare to apply ARIMA

In [6]:
# One group per (store, product) pair
grouped = df_sales_filtered_2019.groupby(by=['store_id', 'product_id'])

# Empty result tables: one for the three-week forecasts,
# one for the store/product pairs that ended in an error
df_forecasts = pd.DataFrame(
    columns=['store_id', 'product_id', 'forecast_week_1', 'forecast_week_2', 'forecast_week_3']
)
df_product_error = pd.DataFrame(
    columns=['store_id', 'product_id', 'error_message']
)

In [7]:
from statsmodels.tsa.arima.model import ARIMA
import warnings
from itertools import product
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import pandas as pd
import joblib


# One time series per (store_id, product_id) pair
grouped = df_sales_filtered_2019.groupby(by=['store_id', 'product_id'])

# Function to perform grid search for ARIMA parameters
def optimize_arima(series, p_values, d_value, q_values):
    """Grid-search ARIMA(p, d, q) orders and keep the fit with the lowest AIC.

    Args:
        series: Observations handed to ``ARIMA`` as the series to model.
        p_values: Iterable of candidate AR orders ``p``.
        d_value: Differencing order ``d`` shared by every candidate.
        q_values: Iterable of candidate MA orders ``q``.

    Returns:
        ``(best_order, best_model)``: the winning ``(p, d, q)`` tuple and its
        fitted results object, or ``(None, None)`` when no candidate could be
        fitted. On equal AIC the first candidate tried wins.
    """
    best_aic = float("inf")
    best_order = None
    best_model = None
    for p, q in product(p_values, q_values):
        order = (p, d_value, q)
        try:
            model_fit = ARIMA(series, order=order).fit()
            aic = model_fit.aic
        except Exception:
            # Best effort: a candidate that cannot be fitted is skipped.
            # Catching Exception (not a bare except) lets Ctrl-C / a kernel
            # interrupt stop the search instead of being silently swallowed.
            continue
        if aic < best_aic:
            best_aic = aic
            best_order = order
            best_model = model_fit
    return best_order, best_model

# Function to calculate MSE, RMSE, MAE, MAPE
def calculate_metrics(actual, predicted):
    """Compute four error metrics between observed and predicted values.

    Args:
        actual: Observed values.
        predicted: Predicted values, aligned by position with ``actual``.

    Returns:
        Tuple ``(mse, rmse, mae, mape)``.
    """
    mse = mean_squared_error(actual, predicted)
    # RMSE is derived from the MSE instead of calling
    # mean_squared_error(..., squared=False): that keyword is deprecated and
    # no longer accepted by recent scikit-learn releases.
    rmse = mse ** 0.5
    mae = mean_absolute_error(actual, predicted)
    mape = mean_absolute_percentage_error(actual, predicted)
    return mse, rmse, mae, mape

# Candidate orders for the grid search: p and q each in {0, 1, 2}, d fixed at 1
p_values = range(3)
d_value = 1
q_values = range(3)

# Empty result tables: forecasts, failed store/product pairs, error metrics
df_forecasts = pd.DataFrame(
    columns=[
        'store_id', 'product_id',
        'forecast_week_1', 'forecast_week_2', 'forecast_week_3',
        'ARIMA',
    ]
)
df_product_error = pd.DataFrame(
    columns=['store_id', 'product_id', 'error_message']
)
df_mse = pd.DataFrame(
    columns=['store_id', 'product_id', 'mse', 'rmse', 'mae', 'mape']
)

# Register the filter once for the whole run rather than on every iteration
warnings.filterwarnings("ignore")  # specify to ignore warning messages

# Rows are collected in plain lists and converted to DataFrames once at the
# end; concatenating inside the loop re-copies the whole table per iteration.
forecast_rows = []
error_rows = []
metric_rows = []

# Iterate over each group
for (store_id, product_id), group in grouped:
    # Reindex to ensure complete weekly intervals (missing weeks are forward-filled)
    group = group.asfreq('W-SUN', method='pad')

    # Ensure there are enough data points to fit the model
    if len(group) < 2:
        error_rows.append({
            'store_id': store_id,
            'product_id': product_id,
            'error_message': 'Not enough data points to fit ARIMA model'
        })
        continue

    # Fit ARIMA model
    try:
        best_order, best_model = optimize_arima(group['sales'], p_values, d_value, q_values)

        if best_model is None:
            error_rows.append({
                'store_id': store_id,
                'product_id': product_id,
                'error_message': 'Failed to find suitable ARIMA model'
            })
            continue

        # Forecast future sales (next 3 weeks)
        forecast = best_model.forecast(steps=3)

        # Unpack by position: integer keys such as forecast[0] on a
        # date-indexed Series are deprecated positional lookups in pandas 2.x.
        week_1, week_2, week_3 = tuple(forecast)
        forecast_rows.append({
            'store_id': store_id,
            'product_id': product_id,
            'forecast_week_1': week_1,
            'forecast_week_2': week_2,
            'forecast_week_3': week_3,
            'ARIMA': best_order
        })

        # NOTE(review): the forecast covers the 3 weeks AFTER the series, but it
        # is scored against the LAST 3 OBSERVED weeks, which the model was also
        # fitted on. These metrics are a placeholder, not a hold-out error;
        # replace with real future sales or a train/test split before relying
        # on them.
        actual_future_sales = group['sales'].iloc[-3:]
        if len(actual_future_sales) == 3:
            mse, rmse, mae, mape = calculate_metrics(actual_future_sales, forecast)
            metric_rows.append({
                'store_id': store_id,
                'product_id': product_id,
                'mse': mse,
                'rmse': rmse,
                'mae': mae,
                'mape': mape
            })
        else:
            error_rows.append({
                'store_id': store_id,
                'product_id': product_id,
                'error_message': 'Not enough actual future data to calculate metrics'
            })
    except Exception as e:
        print(f"Error fitting ARIMA for Store: {store_id}, Product: {product_id}")
        print(str(e))
        error_rows.append({
            'store_id': store_id,
            'product_id': product_id,
            'error_message': str(e)
        })

# Build the result tables in one go; explicit columns keep the layout stable
# even when a list is empty.
df_forecasts = pd.DataFrame(
    forecast_rows,
    columns=['store_id', 'product_id', 'forecast_week_1', 'forecast_week_2', 'forecast_week_3', 'ARIMA'],
)
df_product_error = pd.DataFrame(
    error_rows,
    columns=['store_id', 'product_id', 'error_message'],
)
df_mse = pd.DataFrame(
    metric_rows,
    columns=['store_id', 'product_id', 'mse', 'rmse', 'mae', 'mape'],
)

# Persist the three result tables as CSV files, without the row index
for _frame, _csv_name in (
    (df_forecasts, 'forecasts.csv'),
    (df_product_error, 'product_errors.csv'),
    (df_mse, '2019_mse_results.csv'),
):
    _frame.to_csv(_csv_name, index=False)


In [None]:
# Reshape the forecasts and merge them with the sales already recorded
import numpy as np

# Reshape the forecasts from wide (one column per horizon) to long (one row per horizon)
df_forecasts_melted = df_forecasts.melt(
    id_vars=['store_id', 'product_id', 'ARIMA'],
    value_vars=['forecast_week_1', 'forecast_week_2', 'forecast_week_3'],
    var_name='week',
    value_name='forecast',
)

# Keep only the horizon number (1, 2 or 3) from names such as 'forecast_week_2'.
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
# expand=False yields a Series rather than a one-column DataFrame.
df_forecasts_melted['week'] = (
    df_forecasts_melted['week'].str.extract(r'(\d+)', expand=False).astype(int)
)

# Resetting index to ensure last_day_of_week is a regular column.
# Guarded so that re-running the cell does not insert a spurious 'index' column.
if 'last_day_of_week' not in df_sales_filtered_2019.columns:
    df_sales_filtered_2019.reset_index(drop=False, inplace=True)

# Step 1: Find the latest (year, week) pair for each store_id and product_id.
# The pair is taken from the chronologically last row; taking max(year) and
# max(week) independently would combine values from different rows when the
# data spans more than one year (e.g. 2018-W52 and 2019-W05 -> 2019-W52).
latest_weeks = (
    df_sales_filtered_2019
    .sort_values(['year', 'week'])
    .drop_duplicates(subset=['store_id', 'product_id'], keep='last')
    .loc[:, ['store_id', 'product_id', 'year', 'week']]
    .rename(columns={'year': 'latest_year', 'week': 'latest_week'})
    .sort_values(['store_id', 'product_id'])
    .reset_index(drop=True)
)

# Step 2: Merge the latest year and week numbers with the forecast DataFrame
df_combined = df_forecasts_melted.merge(latest_weeks, on=['store_id', 'product_id'], how='left')

# Step 3: Add the forecast weeks to the latest week numbers, adjusting for year transition
def adjust_year_week(row):
    """Return the ISO (year, week) that lies ``row['week']`` weeks after the latest observed week.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``'latest_year'``, ``'latest_week'`` and ``'week'`` (weeks ahead).

    Returns:
        Tuple ``(year, week)`` of plain ints, rolled over into the following
        year(s) when the week number exceeds the length of the ISO year.
    """
    import datetime  # local import keeps the function self-contained

    new_year = int(row['latest_year'])
    new_week = int(row['latest_week']) + int(row['week'])
    while True:
        # ISO years have 52 or 53 weeks; 28 December always falls in the last one.
        weeks_in_year = datetime.date(new_year, 12, 28).isocalendar()[1]
        if new_week <= weeks_in_year:
            break
        new_week -= weeks_in_year
        new_year += 1
    return new_year, new_week

# Expand the (year, week) tuple returned for each row into two new columns
df_combined[['forecast_year', 'forecast_week']] = df_combined.apply(
    adjust_year_week,
    axis=1,
    result_type="expand",
)

# Step 4: Calculate the forecast's last_day_of_week based on forecast_year and forecast_week
def get_last_day_of_iso_week(year, week):
    """Return the Sunday closing ISO week ``week`` of ``year`` as a ``pd.Timestamp``.

    This re-defines the earlier ``datetime``-based function of the same name
    using pandas types.
    """
    # 4 January always belongs to ISO week 1, so the Monday on or before it opens week 1.
    jan_fourth = pd.Timestamp(year=year, month=1, day=4)
    week_one_monday = jan_fourth - pd.Timedelta(days=jan_fourth.weekday())
    # Jump to the Monday of the requested week, then move on to its Sunday.
    return week_one_monday + pd.Timedelta(weeks=week - 1) + pd.Timedelta(days=6)

# Week-ending date of every forecast row, from its (forecast_year, forecast_week) pair
df_combined['last_day_of_week'] = df_combined.apply(
    lambda forecast_row: get_last_day_of_iso_week(
        forecast_row['forecast_year'],
        forecast_row['forecast_week'],
    ),
    axis=1,
)

# Step 5: Select and rename the necessary columns to match the desired format, using forecast as sales
df_combined_final = df_combined[
    ['store_id', 'product_id', 'forecast_year', 'forecast_week', 'forecast', 'last_day_of_week', 'ARIMA']
].rename(columns={'forecast_year': 'year', 'forecast_week': 'week', 'forecast': 'sales'})

# Step 6: Concatenate with the original sales DataFrame.
# Actual sales rows carry NaN in the ARIMA column. .assign returns a new frame,
# so the column is not written into a slice of df_sales_filtered_2019
# (which is what triggers SettingWithCopyWarning).
df_sales_final_arima = df_sales_filtered_2019[
    ['store_id', 'product_id', 'year', 'week', 'sales', 'last_day_of_week']
].assign(ARIMA=np.nan)

df_final_arima = pd.concat(
    [df_sales_final_arima, df_combined_final], ignore_index=True
).sort_values(by=['store_id', 'product_id', 'year', 'week'])

# Display the final DataFrame
df_final_arima

In [None]:
# Show the forecast table (three forecast weeks and the selected ARIMA order per store/product)
df_forecasts

In [None]:
# Show the error-metric table (mse, rmse, mae, mape per store/product)
df_mse