# Databricks notebook source

In [None]:
# Darts
from darts import TimeSeries
from darts.models.forecasting.lgbm import LightGBMModel
from darts import TimeSeries
from darts import metrics
from darts.dataprocessing.transformers import Scaler

# Analysis
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Utilities
import pandas as pd
import numpy as np
import datetime
import mlflow
import mlflow.sklearn
from numpy import savetxt
from datetime import timedelta
import holidays
from scipy import stats
from sklearn.preprocessing import MinMaxScaler, PowerTransformer

# Notebook configuration
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Apply the seaborn "whitegrid" style with the pastel palette to every plot.
sns.set_theme(style="whitegrid", palette="pastel")
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation warnings raised by the libraries used below — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Importing sales series
# Query that pulls every column of the aggregated sales-orders table.
str_select_sales = '''
                        SELECT *
                        FROM analytics.refined_sales_orders_agg
                        '''

# Run the query through Spark, then convert the result to a pandas DataFrame.
sdf_series_sales = spark.sql(str_select_sales)
df_series_sales = sdf_series_sales.toPandas()

# Column names, dtypes and non-null counts of the loaded frame.
df_series_sales.info()

In [None]:
# Sets SYSTEM_TIMESTAMP column as DataFrame index.
# Guarded (and not done in place) so that re-running this cell does not raise
# a KeyError once the column has already been moved into the index.
if 'SYSTEM_TIMESTAMP' in df_series_sales.columns:
    df_series_sales = df_series_sales.set_index('SYSTEM_TIMESTAMP')
df_series_sales = df_series_sales.sort_index()

# Converts granularity to 30 minutes ('30min' is the non-deprecated spelling of '30T')
df_series_sales_resampled = df_series_sales.resample('30min').sum()

# Creates a date range with 30-minute granularity between start and end date.
# Both bounds are floored onto the 30-minute grid so they coincide with the
# first and last resampled bins. Formatting the bounds as HH:00 / HH:30 (as
# was done before) reported a spurious missing point whenever the first
# observation fell in the second half of an hour or the last one in the
# first half.
date_range = pd.date_range(
    start=df_series_sales.index.min().floor('30min'),
    end=df_series_sales.index.max().floor('30min'),
    freq='30min',
)

# Checking if original data has all time points - must be 0
date_range.difference(df_series_sales_resampled.index).shape[0]

In [None]:
# Column names, dtypes and non-null counts of the 30-minute resampled frame.
df_series_sales_resampled.info()

In [None]:
# Function to calculate hourly representations for historical weekdays
def add_pct_columns(df, method='mean', years=(2024,), months=(2, 3, 4), col='NET_VALUE_clean'):
    """
    Adds each row's share of its day total and the historical share for the
    same weekday / time-of-day slot.

    Args:
        df (pd.DataFrame): Frame indexed by a DatetimeIndex and containing
            column `col`. It is not modified; the work is done on a copy.
        method (str): Statistic used for the historical share, 'mean' or 'median'.
        years (iterable of int): Calendar years that make up the history window.
        months (iterable of int): Calendar months that make up the history window.
        col (str): Name of the value column to turn into daily shares.

    Returns:
        tuple: (df_out, history_df)
            df_out: copy of `df` with the added columns 'day_of_week',
                'time_of_day', 'day_total', 'pct_current' and 'pct_hist'.
                It comes out of a merge, so it carries a fresh integer index
                instead of the original DatetimeIndex; row order is preserved.
            history_df: rows inside the history window (DatetimeIndex kept),
                with the added columns except 'pct_hist'.

    Raises:
        ValueError: if `method` is neither 'mean' nor 'median'.
    """
    # Validate up front so a bad method fails even when the history window is empty.
    if method not in ('mean', 'median'):
        raise ValueError("Method must be 'mean' or 'median'")

    # Work on a copy so the caller's frame does not silently gain helper columns.
    df = df.copy()

    # Add columns for day of week and time of day
    df['day_of_week'] = df.index.dayofweek
    df['time_of_day'] = df.index.time

    # Calculate current percentage representation.
    # A day whose total is exactly zero has no defined share: use NaN as the
    # denominator so the result is NaN rather than +/-inf, which would
    # otherwise poison the historical mean.
    df['day_total'] = df.groupby(df.index.date)[col].transform('sum')
    nonzero_day_total = df['day_total'].where(df['day_total'] != 0)
    df['pct_current'] = df[col] / nonzero_day_total

    # Calculate historical mean or median for each day of week and time of day combination
    # Filters specific months to follow the baseline
    in_history = df.index.year.isin(list(years)) & df.index.month.isin(list(months))
    history_df = df[in_history]

    # agg() yields a well-formed (possibly empty) result, so the column layout
    # below holds even when no row falls inside the history window.
    pct_hist = (
        history_df.groupby(['day_of_week', 'time_of_day'])['pct_current']
        .agg(method)
        .rename('pct_hist')
        .reset_index()
    )

    # Merge historical statistics to original dataframe
    df = df.merge(pct_hist, on=['day_of_week', 'time_of_day'], how='left')

    return df, history_df

In [None]:
# Removing outliers: points whose absolute z-score reaches the threshold are
# blanked out and rebuilt by interpolation from their neighbours.
ZSCORE_THRESHOLD = 4
df_series_sales_resampled['zscore'] = stats.zscore(df_series_sales_resampled[['NET_VALUE']])
is_outlier = df_series_sales_resampled['zscore'].abs() >= ZSCORE_THRESHOLD
outliers_index = df_series_sales_resampled[is_outlier].index
# mask() blanks the outliers without writing NaN into an existing column, and
# limit_direction='both' also fills an outlier sitting at the very start of
# the series, which the default forward-only interpolation leaves as NaN.
df_series_sales_resampled['NET_VALUE_clean'] = (
    df_series_sales_resampled['NET_VALUE']
    .mask(is_outlier)
    .interpolate(limit_direction='both')
)

# Generating calculated data
df_calculation, history_df = add_pct_columns(df_series_sales_resampled, method='mean', col='NET_VALUE_clean')

# add_pct_columns returns the rows in their original order but with an integer
# index, so the timestamps are restored positionally from the resampled index.
# This replaces a string round-trip plus a row-wise Timestamp.combine that
# rebuilt exactly the same timestamp from its own date and time parts
# (assumes a timezone-naive index — TODO confirm).
df_calculation['SYSTEM_TIMESTAMP'] = df_series_sales_resampled.index

# Filtering only the specific hours checked by the analyst
ANALYST_CHECK_TIMES = [
    datetime.time(12, 0),
    datetime.time(15, 0),
    datetime.time(17, 30),
    datetime.time(23, 30),
]
df_calculation_filt = df_calculation[df_calculation.time_of_day.isin(ANALYST_CHECK_TIMES)].copy()
df_calculation = df_calculation.drop(columns=['time_of_day'])

In [None]:
# Show the calculated rows whose timestamp is on or after 2024-05-01.
display(df_calculation[df_calculation.SYSTEM_TIMESTAMP >= '2024-05-01 00:00:00'])

In [None]:
# Generating series for test period
dt_test = '2023-12-18 00:00:00'
in_test_period = df_calculation.SYSTEM_TIMESTAMP >= dt_test
df_test = df_calculation[in_test_period].copy()

# Both series share the time column and frequency; only the value column differs.
series_kwargs = dict(time_col='SYSTEM_TIMESTAMP', freq='30T')

# Observed share of the day total.
series_real = TimeSeries.from_dataframe(df_test, value_cols='pct_current', **series_kwargs)

# Historical share, used here as the prediction.
series_pred = TimeSeries.from_dataframe(df_test, value_cols='pct_hist', **series_kwargs)

In [None]:
# Factor to add to series to be measured, avoiding zeros
# Following the same proportion as the factor used in model metrics
max_net_value = df_test['NET_VALUE_clean'].max()
factor = 1 / max_net_value

In [None]:
# Calculating metrics

def calculate_wape(y, yhat):
    """
    Calculates Weighted Absolute Percentage Error (WAPE), in percent.

        WAPE = sum(|y_i - yhat_i|) / sum(y_i) * 100

    This is the average of the absolute percentage errors |y_i - yhat_i| / y_i
    weighted by the actual values y_i: the weight cancels the per-point
    denominator, so larger actual values (e.g. seasonal peaks) contribute
    proportionally more to the metric, and a single zero actual value does
    not invalidate the result.

    Args:
        y: Actual values. Any sequence that supports len() and integer
            indexing and whose items support subtraction, abs() and addition.
        yhat: Predicted values, same length as `y`.

    Returns:
        The WAPE value as a percentage, in whatever type the element
        arithmetic produces (a float for plain numbers). np.inf is returned
        when the actual values sum to zero, which includes empty input.

    Raises:
        ValueError: if `y` and `yhat` have different lengths.
    """
    n_points = len(y)
    if len(yhat) != n_points:
        raise ValueError("y and yhat must have the same length")

    # Running totals of absolute error and of the weights (the actual values)
    total_abs_error = 0
    total_weight = 0

    for i in range(n_points):
        actual = y[i]
        predicted = yhat[i]

        # |error| / actual * actual reduces to |error|; accumulating it
        # directly avoids dividing by each individual actual value.
        total_abs_error += abs(actual - predicted)
        total_weight += actual

    # Plain numeric totals: numpy scalars would yield inf/nan with only a
    # warning, so the zero denominator is checked explicitly.
    if isinstance(total_weight, (int, float, np.number)) and total_weight == 0:
        return np.inf

    try:
        return total_abs_error / total_weight * 100
    except ZeroDivisionError:
        # Element types that raise on a zero denominator end up here
        return np.inf

# Shift both series once by the same small offset so the percentage-based
# metrics never divide by an exact zero.
series_real_shifted = series_real + factor
series_pred_shifted = series_pred + factor

mape = metrics.mape(series_real_shifted, series_pred_shifted)
smape = metrics.smape(series_real_shifted, series_pred_shifted)
ope = metrics.ope(series_real_shifted, series_pred_shifted)
r2 = metrics.r2_score(series_real_shifted, series_pred_shifted)
rmse = metrics.rmse(series_real_shifted, series_pred_shifted)

# calculate_wape hands back a series-like object for series inputs but a plain
# np.inf on its fallback path; unwrap the scalar only when there is one to
# unwrap, so the fallback no longer crashes on `.values()`.
wape_result = calculate_wape(series_real_shifted, series_pred_shifted)
wape = wape_result.values().flatten()[0] if hasattr(wape_result, 'values') else wape_result

print('WAPE: ', round(wape, 2))
print('R²: ', round(r2, 2))
print('\n')
print('MAPE: ', round(mape, 2))
# SMAPE was computed but never reported
print('SMAPE: ', round(smape, 2))
print('OPE: ', round(ope, 2))
print('RMSE: ', round(rmse, 2))

In [None]:
# Display the unrounded OPE value computed in the metrics cell.
ope

In [None]:
# Plot predicted and actual shares together for the last 80 half-hour points.
plt.figure(figsize=(20, 7))
last_pred = series_pred[-80:]
last_real = series_real[-80:]
last_pred.stack(last_real).plot()

In [None]:
# Cumulative metrics
# Display the per-interval frame that the cumulative shares are derived from.
# NOTE(review): this renders the whole frame; a .head()/.tail() slice may be enough.
df_calculation

In [None]:
# Running (cumulative) sum of the historical and current shares within each calendar day.
rows_by_day = df_calculation.groupby(df_calculation.SYSTEM_TIMESTAMP.dt.date)
df_calculation_acc = rows_by_day[['pct_hist', 'pct_current']].cumsum()

# cumsum keeps the original row index, so the timestamps line up row by row.
df_calculation_acc['SYSTEM_TIMESTAMP'] = df_calculation['SYSTEM_TIMESTAMP']

In [None]:
# Show the last 1000 rows of the cumulative-share frame.
display(df_calculation_acc.iloc[-1000:])

In [None]:
# Generating series for test period (cumulative shares this time)
dt_test = '2023-12-18 00:00:00'
df_test = df_calculation_acc[df_calculation_acc.SYSTEM_TIMESTAMP >= dt_test].copy()

series_real = TimeSeries.from_dataframe(df_test,
                                       time_col='SYSTEM_TIMESTAMP',
                                       value_cols='pct_current',
                                       freq='30T')

series_pred = TimeSeries.from_dataframe(df_test,
                                       time_col='SYSTEM_TIMESTAMP',
                                       value_cols='pct_hist',
                                       freq='30T')

# Shift both series once by the same small offset so the percentage-based
# metrics never divide by an exact zero. `factor` is the value computed in the
# earlier cell; it cannot be recomputed from this df_test, which only holds
# the cumulative share columns.
series_real_shifted = series_real + factor
series_pred_shifted = series_pred + factor

mape = metrics.mape(series_real_shifted, series_pred_shifted)
smape = metrics.smape(series_real_shifted, series_pred_shifted)
ope = metrics.ope(series_real_shifted, series_pred_shifted)
r2 = metrics.r2_score(series_real_shifted, series_pred_shifted)
rmse = metrics.rmse(series_real_shifted, series_pred_shifted)

# calculate_wape hands back a series-like object for series inputs but a plain
# np.inf on its fallback path; unwrap the scalar only when there is one to
# unwrap, so the fallback no longer crashes on `.values()`.
wape_result = calculate_wape(series_real_shifted, series_pred_shifted)
wape = wape_result.values().flatten()[0] if hasattr(wape_result, 'values') else wape_result

print('WAPE: ', round(wape, 2))
print('R²: ', round(r2, 2))
print('\n')
print('MAPE: ', round(mape, 2))
# SMAPE was computed but never reported
print('SMAPE: ', round(smape, 2))
print('OPE: ', round(ope, 2))
print('RMSE: ', round(rmse, 2))