# Databricks notebook source

## 1. Libraries

In [None]:
# Analysis
from statsmodels.tsa.seasonal import seasonal_decompose
from scipy.stats import variation
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, pacf
from scipy import stats
from scipy.stats import f_oneway
from scipy.stats import shapiro
from statsmodels.tsa.stattools import adfuller
from scipy.stats import pearsonr, spearmanr
from sklearn.preprocessing import MinMaxScaler, PowerTransformer

# Utilities
import pandas as pd
import numpy as np
import datetime
from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta
import holidays
from sklearn.preprocessing import MinMaxScaler
import pywt
from scipy.fftpack import fft

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as ply

# Notebook configuration
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set_theme(style="whitegrid", palette="pastel")
import warnings
warnings.filterwarnings('ignore')


## 2. Data Import

In [None]:
# Monthly Budget
# Reads a 6-row block (sheet columns AR:BE, after skipping the first 2 sheet rows) with no header;
# the first row that is read is promoted to the column names right below.
df_monthly_budget = pd.read_excel('/Workspace/Repos/DataScience/FORECAST_PROJECT/src/data/2024_Budget_Revenue_Adjusted.xlsx',
                   sheet_name='Target_Adjustment', skiprows=2, header=None, usecols="AR:BE", nrows=6)
df_monthly_budget.columns = df_monthly_budget.iloc[0]
df_monthly_budget = df_monthly_budget[1:]
# Keep only the 'Total' row of the 'BU' column, turn its columns into rows, and trim the first and
# last transposed rows (presumably the 'BU' label and a yearly total column - TODO confirm against the sheet).
df_monthly_budget = df_monthly_budget[df_monthly_budget['BU'] == 'Total'].transpose().reset_index()[1:-1]
df_monthly_budget.columns = ['month', 'total']

# Values arrive as objects because the frame was read without a header row
df_monthly_budget['total'] = df_monthly_budget['total'].astype(float)

df_monthly_budget.info()

In [None]:
# Sales Series
# Pulls every column of the aggregated sales table; later cells rely on at least
# SYSTEM_TIMESTAMP (used as the index) and NET_VALUE (the analysed measure).
str_select_sales = '''
                        SELECT *
                        FROM analytics.refined_sales_orders_agg
                        '''

# The whole table is collected to the driver as a pandas DataFrame
df_series_sales = spark.sql(str_select_sales).toPandas()

df_series_sales.info()

In [None]:
df_series_sales.head()


## 3. Feature Analysis

In [None]:
'''
General
  Distribution
  Outliers
  Components
  ACF/PACF
  Day of month
  Day of week
  Holidays
  Monthly budget

Hourly cumulative processed series
  Distribution  
  Outliers
  Components
  ACF/PACF
  Day of month
  Day of week
  Holidays
  Monthly budget

Project cumulatives or 30-minute granularity series????
Project components?
Project stationary series?

'''


### 3.1 30-min Granularity Series


#### 3.1.1 General Parameters

In [None]:
# Setting SYSTEM_TIMESTAMP column as DataFrame index.
# The guard keeps this cell re-runnable: once the column has been moved into the index,
# calling set_index again would raise KeyError.
if 'SYSTEM_TIMESTAMP' in df_series_sales.columns:
    df_series_sales.set_index('SYSTEM_TIMESTAMP', inplace=True)
df_series_sales.sort_index(inplace=True)

# Converting granularity to 30 minutes ('30min' is the non-deprecated spelling of '30T')
df_series_sales_resampled = df_series_sales.resample('30min').sum()

# Expected 30-minute grid between the first and last observation.
# Both endpoints are floored to the half hour so they coincide with the resample bins;
# formatting them as '%H:00' / '%H:30' strings could add a slot before the first bin or
# after the last one and report it as missing.
date_range = pd.date_range(start=df_series_sales.index.min().floor('30min'),
                           end=df_series_sales.index.max().floor('30min'),
                           freq='30min')

In [None]:
df_series_sales_resampled.head()

In [None]:
# Checking if original data has all time points.
# The resampled index is gap-free by construction (empty bins are summed to 0), so comparing the
# grid against it can never reveal a gap. Compare against the raw timestamps, floored to their
# 30-minute slot, to count the slots that have no source record at all.
date_range.difference(df_series_sales.index.floor('30min')).shape[0]

In [None]:
df_series_sales_resampled.describe()

In [None]:
display(df_series_sales_resampled.reset_index()[-50:])

In [None]:
df_series_sales_resampled.plot.line(figsize=(20, 7))

In [None]:
plt.figure(figsize=(10, 7))
sns.histplot(df_series_sales_resampled)

In [None]:
plt.figure(figsize=(10, 7))
sns.boxplot(df_series_sales_resampled, showfliers=False)

In [None]:
# Calculating zscore to check for outliers
df_series_sales_resampled['zscore'] = stats.zscore(df_series_sales_resampled[['NET_VALUE']])

In [None]:
df_series_sales_resampled.describe()

In [None]:
# Series without outliers
display(df_series_sales_resampled[df_series_sales_resampled.zscore.abs() < 3].reset_index())

In [None]:
# Distribution without outliers
plt.figure(figsize=(10, 7))
sns.histplot(df_series_sales_resampled[df_series_sales_resampled.zscore.abs() < 3][['NET_VALUE']])

In [None]:
# How many points are zero.
# The mean of the boolean mask is the share of zero-valued slots; the percent format spec avoids
# float artefacts such as '7.000000000000001%' that round(x, 2) * 100 can produce.
f'{(df_series_sales_resampled.NET_VALUE == 0).mean():.0%}'

In [None]:
# Shapiro-Wilk normality test on the raw 30-minute NET_VALUE series
net_value_sample = df_series_sales_resampled['NET_VALUE'].values
statistic, p_value = shapiro(net_value_sample)

# Normality is rejected at the 5% level
normality_verdict = (
    'The series does not follow a normal curve.'
    if p_value < 0.05
    else 'The series follows a normal curve.\n'
)
print(normality_verdict)
print('Test Statistic:', statistic)
print('P-value:', p_value)

In [None]:
# Creating power-transformed values (column keeps its historical "boxcox" name).
# The method is spelled out: 'yeo-johnson' is what PowerTransformer uses when none is given.
boxcox_transformer = PowerTransformer(method='yeo-johnson')
net_value_frame = df_series_sales_resampled[['NET_VALUE']]
df_series_sales_resampled['NET_VALUE_boxcox'] = boxcox_transformer.fit_transform(net_value_frame)

In [None]:
# Checking box-cox distribution
plt.figure(figsize=(10, 7))
sns.histplot(df_series_sales_resampled['NET_VALUE_boxcox'])

In [None]:
# Shapiro-Wilk normality test on the power-transformed column
transformed_sample = df_series_sales_resampled['NET_VALUE_boxcox']
statistic, p_value = shapiro(transformed_sample)

# Normality is rejected at the 5% level
normality_verdict = (
    'The series does not follow a normal curve.'
    if p_value < 0.05
    else 'The series follows a normal curve.\n'
)
print(normality_verdict)
print('Test Statistic:', statistic)
print('P-value:', p_value)

In [None]:
# Checking distribution for differenced values
plt.figure(figsize=(10, 7))
sns.histplot(df_series_sales_resampled['NET_VALUE'].diff())

In [None]:
# Shapiro-Wilk normality test for differenced series.
# diff() leaves NaN in the first position; with it in the sample the test returns NaN and the
# comparison below falls through to "follows a normal curve", so the NaN is dropped first.
statistic, p_value = shapiro(df_series_sales_resampled['NET_VALUE'].diff().dropna())

if p_value < 0.05:
    print('The series does not follow a normal curve.')
else:
    print('The series follows a normal curve.\n')
print('Test Statistic:', statistic)
print('P-value:', p_value)

In [None]:
# Stationarity Test (Augmented Dickey-Fuller) - required for Granger Causality Test
#
# adfuller returns, in order:
#   test statistic,
#   p-value,
#   number of lags used,
#   number of observations used for regression and calculation of critical values,
#   {critical values for different significance levels},
#   maximized information criterion
#
# This description used to be a trailing string literal, which became the cell's output and hid
# the test result; as comments, the result below is the last expression and is displayed.
stationarity_test_result = adfuller(df_series_sales_resampled[['NET_VALUE']])
stationarity_test_result

In [None]:
# Stationarity verdict: the unit-root null hypothesis is rejected at the 5% level
adf_p_value = stationarity_test_result[1]
stationarity_verdict = (
    f'The series is stationary'
    if adf_p_value <= 0.05
    else f'The series is NOT stationary'
)
print(stationarity_verdict)
print(f'p-value: {round(adf_p_value, 5)}')

In [None]:
# Plotting time series decomposition
# The series has 30-minute granularity, so one daily cycle spans 2 * 24 = 48 observations.
# A period of 7 would describe a 3.5-hour cycle here (it is the weekly period of a *daily* series).
samples_per_day = 2 * 24
decomposition = seasonal_decompose(df_series_sales_resampled['NET_VALUE'],
                                  model = 'additive',
                                  period = samples_per_day,
                                  two_sided = True)

# One interactive figure per component
fig_observed = go.Figure()
fig_trend = go.Figure()
fig_seasonal = go.Figure()
fig_residual = go.Figure()

fig_observed.add_trace(go.Scatter(x=decomposition.observed.index, y=decomposition.observed.values, name='Observed'))
fig_trend.add_trace(go.Scatter(x=decomposition.trend.index, y=decomposition.trend.values, name='Trend'))
fig_seasonal.add_trace(go.Scatter(x=decomposition.seasonal.index, y=decomposition.seasonal.values, name='Seasonality'))
fig_residual.add_trace(go.Scatter(x=decomposition.resid.index, y=decomposition.resid.values, name='Residual'))

fig_observed.update_layout(title='Observed')
fig_trend.update_layout(title='Trend')
fig_seasonal.update_layout(title='Seasonality')
fig_residual.update_layout(title='Residual')

fig_observed.show()
fig_trend.show()
fig_seasonal.show()
fig_residual.show()

In [None]:
sns.histplot(decomposition.resid.values)

In [None]:
# Shapiro-Wilk normality test on the decomposition residuals (edge NaNs removed)
residual_sample = decomposition.resid.dropna().values
statistic, p_value = shapiro(residual_sample)

# Normality is rejected at the 5% level
normality_verdict = (
    'The series does not follow a normal curve.'
    if p_value < 0.05
    else 'The series follows a normal curve.\n'
)
print(normality_verdict)
print('Test Statistic:', statistic)
print('P-value:', p_value)

In [None]:
# Residual stationarity test (ADF on the decomposition residuals, edge NaNs removed)
residual_values = decomposition.resid.dropna().values
stationarity_test_result = adfuller(residual_values)

# The unit-root null hypothesis is rejected at the 5% level
residual_p_value = stationarity_test_result[1]
residual_verdict = (
    f'The series is stationary'
    if residual_p_value <= 0.05
    else f'The series is NOT stationary'
)
print(residual_verdict)
print(f'p-value: {round(residual_p_value, 5)}')


#### 3.1.2 Seasonalities

In [None]:
# Analyzing frequencies present in the series

# Scaling data
scale = MinMaxScaler()
df_series_sales_resampled['net_value_scld'] = scale.fit_transform(df_series_sales_resampled[['NET_VALUE']]).squeeze()


# Functions for signal analysis
def get_ave_values(xvalues, yvalues, n = 5):
    """Down-sample a signal by averaging consecutive, non-overlapping windows of ``n`` samples.

    Trailing samples that do not fill a complete window are discarded. The inputs are
    never modified.

    Parameters
    ----------
    xvalues : array-like
        Sample positions (e.g. time); its length defines how many windows are built.
    yvalues : array-like
        Sample values; must contain at least as many elements as the windows cover.
    n : int, default 5
        Window length in samples.

    Returns
    -------
    x_ave : numpy.ndarray
        First position of every window.
    y_ave : numpy.ndarray
        Mean of every window, ignoring NaN values.
    """
    n_windows = len(xvalues) // n
    usable = n_windows * n
    # Slice + reshape instead of an in-place ndarray.resize, which can raise when other
    # references to the array exist; the unused padding computation was dropped.
    x_windows = np.asarray(xvalues)[:usable].reshape(n_windows, n)
    y_windows = np.asarray(yvalues)[:usable].reshape(n_windows, n)
    x_ave = x_windows[:, 0]
    y_ave = np.nanmean(y_windows, axis=1)
    return x_ave, y_ave

def plot_signal_plus_average(time, signal, average_over = 5):
    """Plot a signal together with its windowed time average.

    Parameters
    ----------
    time : sequence
        Sample positions; the first and last elements set the x-axis limits.
    signal : sequence
        Sample values.
    average_over : int, default 5
        Window length passed to ``get_ave_values``.
    """
    fig, ax = plt.subplots(figsize=(20, 5))
    time_ave, signal_ave = get_ave_values(time, signal, average_over)
    ax.plot(time, signal, label='signal')
    # The legend reports the window actually used (it was hard-coded to 5)
    ax.plot(time_ave, signal_ave, label = 'time average (n={})'.format(average_over))
    ax.set_xlim([time[0], time[-1]])
    ax.set_ylabel('Signal Amplitude', fontsize=18)
    ax.set_title('Signal + Time Average', fontsize=18)
    ax.set_xlabel('Time', fontsize=18)
    ax.legend()
    plt.show()

def get_fft_values(y_values, T, N, f_s):
    """Return the single-sided FFT amplitude spectrum of a uniformly sampled signal.

    Parameters
    ----------
    y_values : array-like
        Signal samples.
    T : float
        Sampling interval (spacing between consecutive samples).
    N : int
        Number of samples.
    f_s : float
        Sampling frequency. Unused; kept so existing callers keep working.

    Returns
    -------
    f_values : numpy.ndarray
        Frequencies of the first ``N // 2`` FFT bins, in cycles per unit of ``T``.
    fft_values : numpy.ndarray
        Magnitudes of those bins scaled by ``2 / N`` (every bin, including bin 0).
    """
    n_bins = N // 2
    # Bin k of an N-point FFT sampled every T sits at k / (N * T). A linspace up to
    # 1 / (2 * T) stretches the axis: the last kept bin is (N//2 - 1) / (N * T), not Nyquist.
    f_values = np.arange(n_bins) / (N * T)
    fft_values_ = fft(y_values)
    fft_values = 2.0/N * np.abs(fft_values_[0:n_bins])
    return f_values, fft_values

def plot_fft_plus_power(time, signal, n=None, xlabel='Frequency [Hz / hour]'):
    """Plot the single-sided FFT amplitude spectrum of a uniformly sampled signal.

    Despite the name, only the amplitude spectrum is drawn; the power spectrum that used to be
    computed here was never plotted and has been removed.

    Parameters
    ----------
    time : sequence
        Sample positions; the spacing of the first two elements is taken as the sampling interval.
    signal : sequence
        Sample values.
    n : int, optional
        When truthy, only the first ``n`` frequency bins are shown.
    xlabel : str, default 'Frequency [Hz / hour]'
        X-axis label. Frequencies are in cycles per unit of ``time``, so pass a label that
        matches the unit of the ``time`` argument.
    """
    dt = time[1] - time[0]
    N = len(signal)
    fs = 1/dt

    f_values, fft_values = get_fft_values(signal, dt, N, fs)
    if n:
        # Restrict the view to the lowest n bins
        f_values, fft_values = f_values[:n], fft_values[:n]

    fig, ax = plt.subplots(figsize=(20, 5))
    ax.plot(f_values, fft_values, 'r-', label='Fourier Transform')

    # X-axis ticks adjustment: 20 evenly spaced ticks over the displayed range
    num_ticks = 20
    ax.set_xticks(np.linspace(f_values[0], f_values[-1], num_ticks))

    ax.set_xlabel(xlabel, fontsize=18)
    ax.set_ylabel('Amplitude', fontsize=18)
    ax.legend()
    plt.show()


def plot_wavelet_serie(time, signal, scales,
                        waveletname='cmor',
                        cmap=plt.cm.viridis,
                        title='Wavelet Transform (Intensity Spectrum)',
                        ylabel='Period [hours]',
                        xlabel='Time',
                        vlines=False,
                        ylim=None):
  """Draw a series and, below it, the scalogram of its continuous wavelet transform.

  Parameters
  ----------
  time : numpy.ndarray
      Sample positions; the spacing of the first two elements is the sampling period handed to pywt.
  signal : array-like
      Sample values.
  scales : array-like
      Wavelet scales passed to ``pywt.cwt``.
  waveletname : str
      Name of the continuous wavelet.
  cmap : matplotlib colormap
      Colormap of the scalogram.
  title, ylabel, xlabel : str
      Labels of the scalogram axes.
  vlines : bool
      When True, a dashed vertical line is drawn at every distinct integer part of ``time``.
  ylim : sequence, optional
      Y-axis limits of the scalogram.

  NOTE(review): the y values are ``period * 8760 * 2``. With ``time`` in decimal years that is the
  period expressed in 30-minute steps, while the default ``ylabel`` says hours - confirm which
  unit is intended.
  """
  sampling_step = time[1] - time[0]
  coefficients, frequencies = pywt.cwt(signal, scales, waveletname, sampling_step)
  log_power = np.log2(abs(coefficients) ** 2)
  period = 1.0 / frequencies
  # Contour boundaries are given on a linear power scale and mapped to log2 like the data
  contour_levels = np.log2([0.1, 0.5, 1, 10, 20, 30, 40, 50, 60, 70, 80, 90])

  fig, (series_ax, scalogram_ax) = plt.subplots(
      2, 1, figsize=(15, 15), sharex=True, gridspec_kw={'height_ratios': [2, 5]}
  )

  # Upper panel: the series itself
  series_ax.plot(time, signal, label='Series')
  series_ax.set_title('Time Series', fontsize=15)
  series_ax.set_ylabel('Series (scaled)', fontsize=12)

  # Lower panel: scalogram (see the unit note in the docstring for the 8760 * 2 factor)
  im = scalogram_ax.contourf(time, period * 8760 * 2, log_power, contour_levels,
                             extend='both', cmap=cmap)
  scalogram_ax.set_title(title, fontsize=15)
  scalogram_ax.set_ylabel(ylabel, fontsize=12)
  scalogram_ax.set_xlabel(xlabel, fontsize=12)

  if ylim is not None:
      scalogram_ax.set_ylim(ylim)

  # Candidate marker positions: every distinct integer part of the time axis
  integer_marks = np.unique(time.astype(int)).tolist()
  if vlines:
      for mark in integer_marks:
          scalogram_ax.axvline(x=mark, color='black', linestyle='--', linewidth=1)

  # Dedicated colorbar axes placed to the right of the panels: [left, bottom, width, height]
  colorbar_ax = fig.add_axes([0.95, 0.25, 0.03, 0.25])
  fig.colorbar(im, cax=colorbar_ax, orientation="vertical", pad=0.1)

  plt.show()

In [None]:
def calculate_year_decimal(date):
    """Express a timestamp as a decimal year.

    The fractional part is the share of the calendar year (measured in seconds, so leap
    years are handled) that has elapsed at ``date``.

    Parameters
    ----------
    date : pandas.Timestamp
        Moment to convert.

    Returns
    -------
    float
        ``date.year`` plus the elapsed fraction of that year.
    """
    jan_first = pd.Timestamp(year=date.year, month=1, day=1)
    next_jan_first = pd.Timestamp(year=date.year + 1, month=1, day=1)

    seconds_in_year = (next_jan_first - jan_first).total_seconds()
    seconds_elapsed = (date - jan_first).total_seconds()

    return date.year + (seconds_elapsed / seconds_in_year)

# Time axis in decimal years and the min-max scaled signal for the frequency analysis
time = df_series_sales_resampled.index.to_series().map(calculate_year_decimal).to_numpy()
signal = df_series_sales_resampled['net_value_scld'].to_numpy()

plot_signal_plus_average(time, signal)
# Only the 2000 lowest frequency bins are shown
plot_fft_plus_power(time, signal, n=2000)

In [None]:
# Available Wavelets
(', ').join(pywt.wavelist(kind='continuous'))

In [None]:
# Wavelet scalogram restricted to observations from 2024 onwards
scales = np.arange(1, 300)
from_2024 = df_series_sales_resampled.index.year >= 2024
df_filt_wavelet = df_series_sales_resampled.loc[from_2024]
time = df_filt_wavelet.index.to_series().map(calculate_year_decimal).to_numpy()
signal = df_filt_wavelet['net_value_scld'].to_numpy()
plot_wavelet_serie(time, signal, scales, waveletname='morl', vlines=False)


#### 3.1.3 Auto Correlations

In [None]:
# Autocorrelation of the 30-minute NET_VALUE series
nlags = (2*24) * 2
corr_array = acf(df_series_sales_resampled[['NET_VALUE']], alpha=0.05, nlags=nlags)
acf_coefs, acf_confint = corr_array[0], corr_array[1]
# Centre the confidence band on zero so coefficients can be compared against it directly
lower_y = acf_confint[:, 0] - acf_coefs
upper_y = acf_confint[:, 1] - acf_coefs
lag_axis = np.arange(len(acf_coefs))

fig = go.Figure()
# One vertical stem per lag
for lag in range(len(acf_coefs)):
    fig.add_scatter(x=(lag, lag), y=(0, acf_coefs[lag]), mode='lines', line_color='#3f3f3f')
# Coefficient markers
fig.add_scatter(x=lag_axis, y=acf_coefs, mode='markers', marker_color='#1f77b4', marker_size=6)
# Shaded band: invisible upper edge, then the lower edge filled up to it
fig.add_scatter(x=lag_axis, y=upper_y, mode='lines', line_color='rgba(255,255,255,0)')
fig.add_scatter(x=lag_axis, y=lower_y, mode='lines', fillcolor='rgba(32, 146, 230,0.3)', fill='tonexty', line_color='rgba(255,255,255,0)')
fig.update_traces(showlegend=False)
fig.update_xaxes(range=[-1, nlags])
fig.update_yaxes(zerolinecolor='#000000')

title = 'Autocorrelation (ACF)'
fig.update_layout(title=title, width=1600, height=600)
fig.show()

# Lags whose coefficient falls outside the confidence band
indices_out_of_bounds = np.flatnonzero((acf_coefs < lower_y) | (acf_coefs > upper_y)).tolist()
print('Lags with significant autocorrelation:')
# The first flagged index is skipped (presumably lag 0); lags are reported as negative offsets, ascending
lags_acf = sorted(-lag for lag in indices_out_of_bounds[1:])
print(lags_acf)

In [None]:
# Partial autocorrelation of the 30-minute NET_VALUE series (nlags comes from the ACF cell)
corr_array = pacf(df_series_sales_resampled[['NET_VALUE']], alpha=0.05, nlags=nlags)
pacf_coefs, pacf_confint = corr_array[0], corr_array[1]
# Centre the confidence band on zero so coefficients can be compared against it directly
lower_y = pacf_confint[:, 0] - pacf_coefs
upper_y = pacf_confint[:, 1] - pacf_coefs
lag_axis = np.arange(len(pacf_coefs))

fig = go.Figure()
# One vertical stem per lag
for lag in range(len(pacf_coefs)):
    fig.add_scatter(x=(lag, lag), y=(0, pacf_coefs[lag]), mode='lines', line_color='#3f3f3f')
# Coefficient markers
fig.add_scatter(x=lag_axis, y=pacf_coefs, mode='markers', marker_color='#1f77b4', marker_size=6)
# Shaded band: invisible upper edge, then the lower edge filled up to it
fig.add_scatter(x=lag_axis, y=upper_y, mode='lines', line_color='rgba(255,255,255,0)')
fig.add_scatter(x=lag_axis, y=lower_y, mode='lines', fillcolor='rgba(32, 146, 230,0.3)', fill='tonexty', line_color='rgba(255,255,255,0)')
fig.update_traces(showlegend=False)
fig.update_xaxes(range=[-1, nlags])
fig.update_yaxes(zerolinecolor='#000000')

title = 'Partial Autocorrelation (PACF)'
fig.update_layout(title=title, width=1600, height=600)
fig.show()

# Lags whose coefficient falls outside the confidence band
indices_out_of_bounds = np.flatnonzero((pacf_coefs < lower_y) | (pacf_coefs > upper_y)).tolist()
print('Lags with significant partial autocorrelation:')
# The first flagged index is skipped (presumably lag 0); lags are reported as negative offsets, ascending
lags_pacf = sorted(-lag for lag in indices_out_of_bounds[1:])
print(lags_pacf)


#### 3.1.4 Date Attributes

In [None]:
# Calendar features derived from the timestamp index
ts_index = df_series_sales_resampled.index
df_series_sales_resampled['year'] = ts_index.year
df_series_sales_resampled['month'] = ts_index.month
df_series_sales_resampled['week_of_year'] = ts_index.isocalendar().week
df_series_sales_resampled['day_of_week'] = ts_index.day_of_week  # Monday=0 ... Sunday=6
df_series_sales_resampled['hour'] = ts_index.hour

# Human-readable weekday name
day_week_name = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
))
df_series_sales_resampled['day_week_name'] = df_series_sales_resampled['day_of_week'].map(day_week_name)

# Brazilian public holidays: holiday name where the timestamp's day is one, None otherwise
brazil_holidays = holidays.Brazil()
df_series_sales_resampled['event'] = ts_index.map(lambda stamp: brazil_holidays.get(stamp, None))

# Function to determine custom events
def custom_events(date):
    """Return the name of the commercial event that falls on ``date``, or None.

    Recognised events (computed for the year of ``date``):
      * 'Mothers Day'      - second Sunday of May
      * 'Mothers Day Eve'  - the day before Mother's Day
      * 'Valentines Day'   - June 12th (Brazilian date)
      * 'Black Friday'     - the day after the fourth Thursday of November

    Parameters
    ----------
    date : pandas.Timestamp or datetime
        Moment to classify; only its calendar day is considered.

    Returns
    -------
    str or None
    """
    day = pd.Timestamp(date).date()
    year = day.year

    # weeks=1 moves to May 8th, weekday=6 then rolls forward to the first Sunday on/after it
    mothers_day = pd.Timestamp(year, 5, 1) + pd.DateOffset(weekday=6, weeks=1)
    # Black Friday follows the fourth Thursday of November. It is NOT always the fourth Friday:
    # when November 1st is a Friday (e.g. 2019, 2024) the fourth Friday falls one week earlier.
    fourth_thursday = pd.Timestamp(year, 11, 1) + pd.DateOffset(weekday=3, weeks=3)
    black_friday = fourth_thursday + pd.Timedelta(days=1)

    events_by_day = {
        mothers_day.date(): 'Mothers Day',
        (mothers_day - pd.Timedelta(days=1)).date(): 'Mothers Day Eve',
        pd.Timestamp(year, 6, 12).date(): 'Valentines Day',
        black_friday.date(): 'Black Friday',
    }
    return events_by_day.get(day)

# Row-wise helper: fall back to the custom events when no holiday is set
def include_events(row):
    """Return the row's existing 'event', or the custom event for the row's label when it is missing.

    ``row`` is a DataFrame row as passed by ``DataFrame.apply(axis=1)``; ``row.name`` is
    handed to ``custom_events``.
    """
    existing_event = row['event']
    return custom_events(row.name) if pd.isna(existing_event) else existing_event

# Adding custom holidays
df_series_sales_resampled['event'] = df_series_sales_resampled.apply(include_events, axis=1)

# Creating holiday flag.
# notna() treats None and NaN alike (an `is not None` test would flag NaN as an event), and
# excluding the 'Normal Day' placeholder keeps the flag correct if this cell is run again.
has_event = df_series_sales_resampled['event'].notna() & (df_series_sales_resampled['event'] != 'Normal Day')
df_series_sales_resampled['event_flag'] = has_event.astype(int)

# Plain assignment: fillna(inplace=True) on a column selection may act on a temporary copy
# and leave the frame unchanged.
df_series_sales_resampled['event'] = df_series_sales_resampled['event'].fillna('Normal Day')

In [None]:
# Checking created holidays and events
df_series_sales_resampled['event'].unique().tolist()

In [None]:
df_series_sales_resampled.describe()

In [None]:
df_series_sales_resampled.head()

In [None]:
display(df_series_sales_resampled)

In [None]:
# Rows flagged as Black Friday.
# NOTE(review): an earlier cell fills missing 'event' values with 'Normal Day', so the isna()
# branch and the trailing fillna('normal day') look like no-ops here - confirm the intent.
display(df_series_sales_resampled[(df_series_sales_resampled['event'] == 'Black Friday') | (df_series_sales_resampled['event'].isna())][['NET_VALUE', 'event']].fillna('normal day'))

In [None]:
# Verification of existence of statistically significant differences between variables on NET_VALUE
# (one-way ANOVA of NET_VALUE across the levels of each categorical variable)
categorical_columns = ['event',
                        'event_flag',
                        'month',
                        'week_of_year',
                        'day_of_week',
                        'hour']

# List to store results
results = []

# .copy() gives an independent frame, so the dtype conversion below is not an assignment into a
# slice of the source frame (SettingWithCopy)
categorical_df = df_series_sales_resampled[categorical_columns + ['NET_VALUE']].copy()
categorical_df[categorical_columns] = categorical_df[categorical_columns].astype('category')

# Iterate through categorical columns and perform ANOVA test
for col in categorical_columns:
    # A single groupby pass instead of one boolean scan of the frame per category
    grouped_data = [group.values for _, group in categorical_df.groupby(col, observed=True)['NET_VALUE']]

    f_statistic, p_value = f_oneway(*grouped_data)

    significant_difference = "Yes" if p_value < 0.05 else "No"

    results.append({'Variable': col, 'Significant Difference': significant_difference, 'p-value': p_value})

# Create DataFrame with results
results_df = pd.DataFrame(results)

display(results_df)

In [None]:
# Including date variables
# NOTE(review): this cell is not re-runnable as written - it replaces the original 'month' column
# with the integer month number, so a second run would feed those integers to pd.to_datetime,
# and (after the merge below) would copy a non-datetime index into 'data'.
df_monthly_budget['data'] = pd.to_datetime(df_monthly_budget['month'])
df_monthly_budget.drop(columns=['month'], axis=1, inplace=True)
df_monthly_budget['year'] = df_monthly_budget.data.dt.year
df_monthly_budget['month'] = df_monthly_budget.data.dt.month

# Including monthly budget
# The timestamp index is copied into a 'data' column first because the column-based merge returns
# a frame with a fresh integer index. Left join: rows whose (year, month) has no budget entry
# get NaN in the budget column.
df_series_sales_resampled['data'] = df_series_sales_resampled.index
df_series_sales_resampled = df_series_sales_resampled.merge(df_monthly_budget[['year', 'month','total']], on=['year', 'month'], how='left')
df_series_sales_resampled.rename(columns={'total': 'month_budget'}, inplace=True)

In [None]:
df_series_sales_resampled.describe()

In [None]:
# Pearson correlation heatmap annotated with coefficients and significance stars
corr_input = df_series_sales_resampled.drop(columns=['data', 'event', 'day_week_name', 'zscore'])
df_corr = corr_input.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df_corr, square=True, cbar=True, cmap='crest', ax=ax)

# Statistical significance annotations
rho = corr_input.corr()
# corr() with a callable puts 1.0 on the diagonal; subtracting the identity turns it into 0
pval = corr_input.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)
# One star per threshold the p-value does not exceed
p = pval.applymap(lambda x: '*' * sum(x <= t for t in [.05, .01, .001]))
p = rho.round(2).astype(str) + p

# Write every annotation in the centre of its heatmap cell
for (i, j), cell_text in np.ndenumerate(p.values):
    ax.text(j + 0.5, i + 0.5, cell_text, ha='center', va='center', fontsize=8)

# Adjusting layout
plt.title('Correlation Matrix - Pearson')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()

# Displaying heatmap
plt.show()

In [None]:
# Spearman correlation heatmap annotated with coefficients and significance stars.
# 'data' (the timestamp column) is excluded like in the Pearson heatmap: it is not a numeric
# feature, and recent pandas versions refuse to correlate a datetime column.
corr_input = df_series_sales_resampled.drop(columns=['data', 'event', 'day_week_name', 'zscore'])
df_corr = corr_input.corr(method='spearman')

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df_corr,
            square = True,
            cbar = True,
            cmap = 'crest',
            ax = ax)

# Statistical significance annotations.
# The printed coefficients must be the Spearman ones shown by the colours and tested by
# spearmanr; a plain corr() here would print Pearson coefficients instead.
rho = df_corr
# corr() with a callable puts 1.0 on the diagonal; subtracting the identity turns it into 0
pval = corr_input.corr(method=lambda x, y: spearmanr(x, y)[1]) - np.eye(*rho.shape)
p = pval.applymap(lambda x: ''.join(['*' for t in [.05, .01, .001] if x<=t]))
p = rho.round(2).astype(str) + p

for i in range(p.shape[0]):
    for j in range(p.shape[1]):
        ax.text(j + 0.5, i + 0.5, p.iloc[i, j], ha='center', va='center', fontsize=8)

# Adjusting layout
plt.title('Correlation Matrix - Spearman')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()

# Displaying heatmap
plt.show()


### 3.2 Daily Cumulative Series

In [None]:
# Creating daily cumulative series
df_series_sales_resampled['NET_VALUE_cumulative_day'] = df_series_sales_resampled.groupby(df_series_sales_resampled.data.dt.date)['NET_VALUE'].cumsum()

In [None]:
display(df_series_sales_resampled.tail(1000))

In [None]:
# Autocorrelation of the daily cumulative NET_VALUE series
nlags = (2*24) * 2
corr_array = acf(df_series_sales_resampled[['NET_VALUE_cumulative_day']], alpha=0.05, nlags=nlags)
acf_coefs, acf_confint = corr_array[0], corr_array[1]
# Centre the confidence band on zero so coefficients can be compared against it directly
lower_y = acf_confint[:, 0] - acf_coefs
upper_y = acf_confint[:, 1] - acf_coefs
lag_axis = np.arange(len(acf_coefs))

fig = go.Figure()
# One vertical stem per lag
for lag in range(len(acf_coefs)):
    fig.add_scatter(x=(lag, lag), y=(0, acf_coefs[lag]), mode='lines', line_color='#3f3f3f')
# Coefficient markers
fig.add_scatter(x=lag_axis, y=acf_coefs, mode='markers', marker_color='#1f77b4', marker_size=6)
# Shaded band: invisible upper edge, then the lower edge filled up to it
fig.add_scatter(x=lag_axis, y=upper_y, mode='lines', line_color='rgba(255,255,255,0)')
fig.add_scatter(x=lag_axis, y=lower_y, mode='lines', fillcolor='rgba(32, 146, 230,0.3)', fill='tonexty', line_color='rgba(255,255,255,0)')
fig.update_traces(showlegend=False)
fig.update_xaxes(range=[-1, nlags])
fig.update_yaxes(zerolinecolor='#000000')

title = 'Autocorrelation (ACF) - NET_VALUE_cumulative_day'
fig.update_layout(title=title, width=1600, height=600)
fig.show()

# Lags whose coefficient falls outside the confidence band
indices_out_of_bounds = np.flatnonzero((acf_coefs < lower_y) | (acf_coefs > upper_y)).tolist()
print('Lags with significant autocorrelation:')
# The first flagged index is skipped (presumably lag 0); lags are reported as negative offsets, ascending
lags_acf = sorted(-lag for lag in indices_out_of_bounds[1:])
print(lags_acf)

In [None]:
# Partial autocorrelation of the daily cumulative NET_VALUE series (nlags comes from the ACF cell)
corr_array = pacf(df_series_sales_resampled[['NET_VALUE_cumulative_day']], alpha=0.05, nlags=nlags)
pacf_coefs, pacf_confint = corr_array[0], corr_array[1]
# Centre the confidence band on zero so coefficients can be compared against it directly
lower_y = pacf_confint[:, 0] - pacf_coefs
upper_y = pacf_confint[:, 1] - pacf_coefs
lag_axis = np.arange(len(pacf_coefs))

fig = go.Figure()
# One vertical stem per lag
for lag in range(len(pacf_coefs)):
    fig.add_scatter(x=(lag, lag), y=(0, pacf_coefs[lag]), mode='lines', line_color='#3f3f3f')
# Coefficient markers
fig.add_scatter(x=lag_axis, y=pacf_coefs, mode='markers', marker_color='#1f77b4', marker_size=6)
# Shaded band: invisible upper edge, then the lower edge filled up to it
fig.add_scatter(x=lag_axis, y=upper_y, mode='lines', line_color='rgba(255,255,255,0)')
fig.add_scatter(x=lag_axis, y=lower_y, mode='lines', fillcolor='rgba(32, 146, 230,0.3)', fill='tonexty', line_color='rgba(255,255,255,0)')
fig.update_traces(showlegend=False)
fig.update_xaxes(range=[-1, nlags])
fig.update_yaxes(zerolinecolor='#000000')

title = 'Partial Autocorrelation (PACF) - NET_VALUE_cumulative_day'
fig.update_layout(title=title, width=1600, height=600)
fig.show()

# Lags whose coefficient falls outside the confidence band
indices_out_of_bounds = np.flatnonzero((pacf_coefs < lower_y) | (pacf_coefs > upper_y)).tolist()
print('Lags with significant partial autocorrelation:')
# The first flagged index is skipped (presumably lag 0); lags are reported as negative offsets, ascending
lags_pacf = sorted(-lag for lag in indices_out_of_bounds[1:])
print(lags_pacf)

In [None]:
# Pearson correlation heatmap (now including the daily cumulative column), with significance stars
corr_input = df_series_sales_resampled.drop(columns=['data', 'event', 'day_week_name', 'zscore'])
df_corr = corr_input.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df_corr, square=True, cbar=True, cmap='crest', ax=ax)

# Statistical significance annotations
rho = corr_input.corr()
# corr() with a callable puts 1.0 on the diagonal; subtracting the identity turns it into 0
pval = corr_input.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)
# One star per threshold the p-value does not exceed
p = pval.applymap(lambda x: '*' * sum(x <= t for t in [.05, .01, .001]))
p = rho.round(2).astype(str) + p

# Write every annotation in the centre of its heatmap cell
for (i, j), cell_text in np.ndenumerate(p.values):
    ax.text(j + 0.5, i + 0.5, cell_text, ha='center', va='center', fontsize=8)

# Adjusting layout
plt.title('Correlation Matrix - Pearson')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()

# Displaying heatmap
plt.show()

In [None]:
# Spearman correlation heatmap (now including the daily cumulative column), with significance stars
corr_input = df_series_sales_resampled.drop(columns=['data', 'event', 'day_week_name', 'zscore'])
df_corr = corr_input.corr(method='spearman')

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df_corr,
            square = True,
            cbar = True,
            cmap = 'crest',
            ax = ax)

# Statistical significance annotations.
# The printed coefficients must be the Spearman ones shown by the colours and tested by
# spearmanr; a plain corr() here would print Pearson coefficients instead.
rho = df_corr
# corr() with a callable puts 1.0 on the diagonal; subtracting the identity turns it into 0
pval = corr_input.corr(method=lambda x, y: spearmanr(x, y)[1]) - np.eye(*rho.shape)
p = pval.applymap(lambda x: ''.join(['*' for t in [.05, .01, .001] if x<=t]))
p = rho.round(2).astype(str) + p

for i in range(p.shape[0]):
    for j in range(p.shape[1]):
        ax.text(j + 0.5, i + 0.5, p.iloc[i, j], ha='center', va='center', fontsize=8)

# Adjusting layout
plt.title('Correlation Matrix - Spearman')
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()

# Displaying heatmap
plt.show()



### 3.3 3-Hour Granularity Series

In [None]:
# Setting SYSTEM_TIMESTAMP column as DataFrame index.
# An earlier cell already moves SYSTEM_TIMESTAMP into the index; calling set_index again would
# raise KeyError, so it is only done when the column is still present.
if 'SYSTEM_TIMESTAMP' in df_series_sales.columns:
    df_series_sales.set_index('SYSTEM_TIMESTAMP', inplace=True)
df_series_sales.sort_index(inplace=True)

# Converting granularity to 3 hours
df_series_sales_resampled_3h = df_series_sales.resample('3H').sum()

In [None]:
display(df_series_sales_resampled_3h.reset_index().iloc[-100:])

In [None]:
# Autocorrelation of the 3-hour NET_VALUE series
nlags = (8*24) * 2
corr_array = acf(df_series_sales_resampled_3h[['NET_VALUE']], alpha=0.05, nlags=nlags)
acf_coefs, acf_confint = corr_array[0], corr_array[1]
# Centre the confidence band on zero so coefficients can be compared against it directly
lower_y = acf_confint[:, 0] - acf_coefs
upper_y = acf_confint[:, 1] - acf_coefs
lag_axis = np.arange(len(acf_coefs))

fig = go.Figure()
# One vertical stem per lag
for lag in range(len(acf_coefs)):
    fig.add_scatter(x=(lag, lag), y=(0, acf_coefs[lag]), mode='lines', line_color='#3f3f3f')
# Coefficient markers
fig.add_scatter(x=lag_axis, y=acf_coefs, mode='markers', marker_color='#1f77b4', marker_size=6)
# Shaded band: invisible upper edge, then the lower edge filled up to it
fig.add_scatter(x=lag_axis, y=upper_y, mode='lines', line_color='rgba(255,255,255,0)')
fig.add_scatter(x=lag_axis, y=lower_y, mode='lines', fillcolor='rgba(32, 146, 230,0.3)', fill='tonexty', line_color='rgba(255,255,255,0)')
fig.update_traces(showlegend=False)
fig.update_xaxes(range=[-1, nlags])
fig.update_yaxes(zerolinecolor='#000000')

title = 'Autocorrelation (ACF)'
fig.update_layout(title=title, width=1600, height=600)
fig.show()

# Lags whose coefficient falls outside the confidence band
indices_out_of_bounds = np.flatnonzero((acf_coefs < lower_y) | (acf_coefs > upper_y)).tolist()
print('Lags with significant autocorrelation:')
# The first flagged index is skipped (presumably lag 0); lags are reported as negative offsets, ascending
lags_acf = sorted(-lag for lag in indices_out_of_bounds[1:])
print(lags_acf)


### 3.4 Daily Series

In [None]:
# Setting SYSTEM_TIMESTAMP column as DataFrame index.
# An earlier cell already moves SYSTEM_TIMESTAMP into the index; calling set_index again would
# raise KeyError, so it is only done when the column is still present.
if 'SYSTEM_TIMESTAMP' in df_series_sales.columns:
    df_series_sales.set_index('SYSTEM_TIMESTAMP', inplace=True)
df_series_sales.sort_index(inplace=True)

# Converting granularity to Daily
df_series_sales_resampled_dia = df_series_sales.resample('D').sum()

In [None]:
display(df_series_sales_resampled_dia.reset_index())

In [None]:
# Autocorrelation of the daily NET_VALUE series
nlags = 180
corr_array = acf(df_series_sales_resampled_dia[['NET_VALUE']], alpha=0.05, nlags=nlags)
acf_coefs, acf_confint = corr_array[0], corr_array[1]
# Centre the confidence band on zero so coefficients can be compared against it directly
lower_y = acf_confint[:, 0] - acf_coefs
upper_y = acf_confint[:, 1] - acf_coefs
lag_axis = np.arange(len(acf_coefs))

fig = go.Figure()
# One vertical stem per lag
for lag in range(len(acf_coefs)):
    fig.add_scatter(x=(lag, lag), y=(0, acf_coefs[lag]), mode='lines', line_color='#3f3f3f')
# Coefficient markers
fig.add_scatter(x=lag_axis, y=acf_coefs, mode='markers', marker_color='#1f77b4', marker_size=6)
# Shaded band: invisible upper edge, then the lower edge filled up to it
fig.add_scatter(x=lag_axis, y=upper_y, mode='lines', line_color='rgba(255,255,255,0)')
fig.add_scatter(x=lag_axis, y=lower_y, mode='lines', fillcolor='rgba(32, 146, 230,0.3)', fill='tonexty', line_color='rgba(255,255,255,0)')
fig.update_traces(showlegend=False)
fig.update_xaxes(range=[-1, nlags])
fig.update_yaxes(zerolinecolor='#000000')

title = 'Autocorrelation (ACF)'
fig.update_layout(title=title, width=1600, height=600)
fig.show()

# Lags whose coefficient falls outside the confidence band
indices_out_of_bounds = np.flatnonzero((acf_coefs < lower_y) | (acf_coefs > upper_y)).tolist()
print('Lags with significant autocorrelation:')
# The first flagged index is skipped (presumably lag 0); lags are reported as negative offsets, ascending
lags_acf = sorted(-lag for lag in indices_out_of_bounds[1:])
print(lags_acf)

In [None]:
# Partial autocorrelation of the daily NET_VALUE series (nlags comes from the ACF cell)
corr_array = pacf(df_series_sales_resampled_dia[['NET_VALUE']], alpha=0.05, nlags=nlags)
pacf_coefs, pacf_confint = corr_array[0], corr_array[1]
# Centre the confidence band on zero so coefficients can be compared against it directly
lower_y = pacf_confint[:, 0] - pacf_coefs
upper_y = pacf_confint[:, 1] - pacf_coefs
lag_axis = np.arange(len(pacf_coefs))

fig = go.Figure()
# One vertical stem per lag
for lag in range(len(pacf_coefs)):
    fig.add_scatter(x=(lag, lag), y=(0, pacf_coefs[lag]), mode='lines', line_color='#3f3f3f')
# Coefficient markers
fig.add_scatter(x=lag_axis, y=pacf_coefs, mode='markers', marker_color='#1f77b4', marker_size=6)
# Shaded band: invisible upper edge, then the lower edge filled up to it
fig.add_scatter(x=lag_axis, y=upper_y, mode='lines', line_color='rgba(255,255,255,0)')
fig.add_scatter(x=lag_axis, y=lower_y, mode='lines', fillcolor='rgba(32, 146, 230,0.3)', fill='tonexty', line_color='rgba(255,255,255,0)')
fig.update_traces(showlegend=False)
fig.update_xaxes(range=[-1, nlags])
fig.update_yaxes(zerolinecolor='#000000')

title = 'Partial Autocorrelation (PACF)'
fig.update_layout(title=title, width=1600, height=600)
fig.show()

# Lags whose coefficient falls outside the confidence band
indices_out_of_bounds = np.flatnonzero((pacf_coefs < lower_y) | (pacf_coefs > upper_y)).tolist()
print('Lags with significant partial autocorrelation:')
# The first flagged index is skipped (presumably lag 0); lags are reported as negative offsets, ascending
lags_pacf = sorted(-lag for lag in indices_out_of_bounds[1:])
print(lags_pacf)