In [None]:
# Example with statistical forecasts

In [1]:
import pandas as pd

In [16]:
import ray
import pandas as pd
from DeepRetail.forecasting.extras import for_ray
from DeepRetail.preprocessing.converters import transaction_df
from DeepRetail.forecasting.statistical import StatisticalForecaster
from statsforecast import StatsForecast
from statsforecast.models import SeasonalNaive, WindowAverage, Naive, ETS, AutoARIMA
import numpy as np

In [28]:
@ray.remote
def for_ray(forecaster, horizon, n_windows_cv=None):
    """Ray remote task that runs a forecaster, optionally with cross-validation.

    Args:
        forecaster (StatsForecast object): The forecaster to run.
        horizon (int): Forecasting horizon, passed on as ``h``.
        n_windows_cv (int, optional): Number of cross-validation windows.
                                      When None a plain forecast is produced.
                                      Defaults to None.

    Returns:
        pd.DataFrame: The result of ``forecaster.forecast`` or
                      ``forecaster.cross_validation``.
    """

    # Cross-validation was requested
    if n_windows_cv is not None:
        return forecaster.cross_validation(h=horizon, n_windows=n_windows_cv)

    # Otherwise produce a plain forecast
    return forecaster.forecast(h=horizon)



def transaction_df(df, keep_zeros=False):
    """Converts a pivoted df to a transaction df. A transaction df has 3 columns:
        - unique_id: Sales location of each time series.
        - date: The date
        - y: The value for the time series

    Args:
        df (pd.DataFrame): The pivoted Dataframe.
                        Each row is a time series and columns are the dates.
        keep_zeros (bool, optional): If to keep periods with zero sales.
                                    When False, rows with y == 0 are dropped.
                                    Defaults to False.

    Returns:
        pd.DataFrame: The melted dataframe with columns unique_id, date and y.
    """

    # Move the index into a regular column called "unique_id"
    trans_df = df.reset_index(names="unique_id")

    # Melt: one row per (unique_id, date) pair
    trans_df = pd.melt(trans_df, id_vars="unique_id", value_name="y", var_name="date")

    # Drop zero-sales periods unless the caller asked to keep them.
    # (The condition used to be inverted: zeros were dropped when keep_zeros=True.)
    if not keep_zeros:
        trans_df = trans_df[trans_df["y"] != 0]

    return trans_df


class StatisticalForecaster(object):
    """Wrapper that filters time series and forecasts them with StatsForecast.

    Typical use: build the object, call ``fit`` on a pivoted dataframe
    (one row per series, one column per date), then call ``predict``.
    """

    def __init__(self, models, seasonal_length, window_length=None, n_jobs=-1):
        """Stores the configuration of the forecaster.

        Args:
            models (list of str): Names of the models to fit. Recognised names are
                "Naive", "SNaive", "MovingAverage", "ETS" and "ARIMA".
            seasonal_length (int): Season length given to the seasonal models.
            window_length (int, optional): Window size for "MovingAverage".
                Required when "MovingAverage" is in ``models``. Defaults to None.
            n_jobs (int, optional): Forwarded to ``StatsForecast``. Defaults to -1.
        """

        self.models = models
        self.seasonal_length = seasonal_length
        self.window_length = window_length
        self.n_jobs = n_jobs

    @staticmethod
    def _sample_ids(ids, total_to_forecast):
        """Draws up to ``total_to_forecast`` distinct ids at random."""
        n_requested = int(total_to_forecast)
        if n_requested < 1:
            raise ValueError("total_to_forecast must be 'all' or a positive integer")

        # Never ask for more ids than exist; sample without replacement so the
        # number of selected series matches the request.
        n_sample = min(n_requested, len(ids))
        return np.random.choice(ids, n_sample, replace=False)

    def fit(
        self,
        df,
        freq,
        observation_threshold,
        trailing_zeros_threshold,
        total_to_forecast="all",
    ):
        """Selects the series to forecast and prepares the StatsForecast object.

        Args:
            df (pd.DataFrame): Pivoted dataframe. Each row is a time series and
                the columns are the dates.
            freq (str): Frequency passed to ``StatsForecast``.
            observation_threshold (int): A series is kept only if it has more
                than this many non-zero observations.
            trailing_zeros_threshold (int): Number of final periods inspected.
                A series is discarded when all of them are zero.
            total_to_forecast (int or "all", optional): Number of series to
                sample at random for forecasting. "all" (or None) keeps every
                selected series. Defaults to "all".

        Raises:
            ValueError: If "MovingAverage" is requested without a window_length,
                or if total_to_forecast is not "all" or a positive integer.
        """

        # trailing zeros threshold:
        # how many successive zeros at the end so we discard the time series
        # On observation threshold consider the test set as well
        # (fitted vals + test set)
        # -> Rule of thumb: we need at least 3 times the size of the forecast horizon
        # -> Rule of thumb 2: at least 2-3 full seasonal circles to capture seasonality

        # Fail early instead of building a WindowAverage without a window size
        if "MovingAverage" in self.models and self.window_length is None:
            raise ValueError(
                "window_length must be set when 'MovingAverage' is in models"
            )

        # Generate the model list
        models_to_fit = []

        # Append to the list
        if "Naive" in self.models:
            models_to_fit.append(Naive())
        if "SNaive" in self.models:
            models_to_fit.append(SeasonalNaive(season_length=self.seasonal_length))
        if "MovingAverage" in self.models:
            models_to_fit.append(WindowAverage(self.window_length))
        if "ETS" in self.models:
            models_to_fit.append(ETS(season_length=self.seasonal_length))
        if "ARIMA" in self.models:
            models_to_fit.append(AutoARIMA(season_length=self.seasonal_length))

        # Estimate number of non-zero observations and trailing zeros
        obs_count = pd.DataFrame(df.shape[1] - df.isin([0]).sum(axis=1)).rename(
            columns={0: "Total_Observations"}
        )
        # Zeros within the last `trailing_zeros_threshold` columns
        obs_count["Trailing_Zeros"] = (
            df.iloc[:, -trailing_zeros_threshold:].isin([0]).sum(axis=1)
        )

        # filter
        obs_count_f = obs_count[
            (obs_count["Total_Observations"] > observation_threshold)
            & (obs_count["Trailing_Zeros"] < trailing_zeros_threshold)
        ]
        # Read the ids from the index itself so the index does not have to be
        # named "unique_id"
        ids = obs_count_f.index.unique()
        fc_df = df.loc[ids]

        # Give a summary of the selection
        print(
            f"From a total of {df.shape[0]}, {fc_df.shape[0]}  fullfill the conditions for forecasting"
        )

        # convert to the right format for stats forecasts
        # simply renaming
        fc_df = transaction_df(fc_df, keep_zeros=False)

        # Prepare the date column
        fc_df = fc_df.rename(columns={"date": "ds"})
        fc_df["ds"] = pd.to_datetime(fc_df["ds"])

        if total_to_forecast is not None and total_to_forecast != "all":
            # Take a sample of the requested size
            sample = self._sample_ids(fc_df["unique_id"].unique(), total_to_forecast)
            fc_df = fc_df[fc_df["unique_id"].isin(sample)]

        # Define the forecaster
        forecaster = StatsForecast(
            df=fc_df, models=models_to_fit, freq=freq, n_jobs=self.n_jobs
        )
        # Complete the fit
        self.forecaster = forecaster

    def predict(self, fh, cv=None, parallel=True):
        """Produces forecasts with the forecaster prepared by ``fit``.

        Args:
            fh (int): Forecasting horizon.
            cv (int, optional): Number of cross-validation windows. When None a
                plain forecast is produced. Defaults to None.
            parallel (bool, optional): If True the work is submitted through the
                ray task ``for_ray``. Defaults to True.

        Returns:
            pd.DataFrame: The forecast (or cross-validation) results.
        """

        if parallel:
            # For parallelism use ray
            res_df = ray.get(for_ray.remote(self.forecaster, fh, cv))

        # for no parallelism just forecast
        else:
            if cv is None:
                res_df = self.forecaster.forecast(h=fh)
            else:
                res_df = self.forecaster.cross_validation(h=fh, n_windows=cv)

        return res_df


In [7]:
# Load the pivoted sales data; the first CSV column becomes the index.
# NOTE(review): absolute, machine-specific path - adjust it before running elsewhere.
df = pd.read_csv('/home/filtheo/DeepRetail/aldi_month.csv', index_col = 0)

In [25]:
# Initialize some parameters
# Model names as recognised by StatisticalForecaster.fit
models = ['ETS', 'Naive', 'SNaive']
# Season length handed to the seasonal models
seasonal_length = 12
# Forwarded to StatsForecast as n_jobs
n_jobs = 1

In [29]:
# Define the forecaster
# Ideally I want to get rid of the seasonal length and only define the frequency.
# That needs some work, so I have not implemented it yet.
forecaster = StatisticalForecaster(models = models, seasonal_length = seasonal_length, n_jobs = n_jobs)

In [11]:
# Number of columns in the pivoted frame, i.e. the number of periods available
print(df.shape[1])

39


In [None]:
# A total of 39 months!
# For ETS to capture seasonality we need at least 3 complete periods (12 * 3 = 36)
# I will keep a month as a test set here -> seasonality won't be captured

# Since I have h = 12, I will need at least 2*h non-zero observations for adequate forecasts
# (My rule of thumb based on experience with these!)
# Also, I don't want to forecast items whose demand has stopped, to save time

In [13]:
# Initialize
freq = 'M'
ot = 24 # a time series needs more than this many non-zero observations
                    # Some time series were very intermittent => not forecastable
tzt = 10 # series whose last tzt periods are all zero are discarded from forecasting
ttf = 15 # Just a small sample (passed as total_to_forecast)

In [30]:
# Fit the forecaster
# (filters the series, converts them to long format and builds the StatsForecast object)
forecaster.fit(df, freq = freq, observation_threshold = ot, trailing_zeros_threshold = tzt, total_to_forecast = ttf )

From a total of 387, 387  fullfill the conditions for forecasting


In [31]:
# Variables for forecasting
# Forecast horizon: number of periods to predict
h = 12
cv = None # no cross-validation
parallel = False # no parallelism

In [32]:
# Forecast
# With parallel=False and cv=None this calls the forecaster's forecast(h=h) directly
pred = forecaster.predict(fh = h, cv = cv, parallel = parallel)

In [34]:
# Preview the first rows of the forecasts
pred.head()

Unnamed: 0_level_0,ds,Naive,SeasonalNaive,ETS
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
101,2022-04-30,24097.0,24052.0,24872.392578
101,2022-05-31,24097.0,22603.0,24872.392578
101,2022-06-30,24097.0,24097.0,24872.392578
101,2022-07-31,24097.0,30265.0,24872.392578
101,2022-08-31,24097.0,39653.0,24872.392578


In [None]:
# Next steps:
# Evaluation
# AutoML Forecaster