In [27]:
def run(session) -> str:
    """Handler-style entry point that receives a ``session`` object.

    As written, the body only imports the libraries listed below. Nothing is
    computed and there is no ``return`` statement, so a call yields ``None``
    even though the signature is annotated ``-> str``.

    NOTE(review): these imports are local to the function, so the names they
    bind (``io``, ``base64``, ``pd``, ``plt``, ``ARIMA``, ``plot_acf``,
    ``plot_pacf``) are not visible to code outside it. Presumably the analysis
    was meant to live inside this function and return the base64-encoded
    chart -- confirm the intent before relying on it.
    NOTE(review): ``session`` is presumably a Snowpark-style session (a
    commented-out ``session.table(...).to_pandas()`` call appears later in
    the notebook) -- TODO confirm.
    """
    import io         # in-memory byte buffer, for image conversion
    import base64          # text encoding of the image bytes
    import pandas as pd      # data manipulation
    import matplotlib.pyplot as plt      # plotting
    from statsmodels.tsa.arima.model import ARIMA       # time series forecasting
    from statsmodels.graphics.tsaplots import plot_acf, plot_pacf     # autocorrelation / partial autocorrelation plots (not used in the visible code)
    


In [29]:
    import pandas as pd
# Load the sales data from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='data.csv')

# Alternative source (disabled): read the SALES_CLEAN table through `session` instead.
# data = session.table("SALES_CLEAN").to_pandas()

In [31]:
# Parse TRANS_DATE into pandas datetimes; re-running this cell is harmless
# because converting an already-datetime column leaves it unchanged.
data['TRANS_DATE'] = data['TRANS_DATE'].pipe(pd.to_datetime)

In [32]:
# Total SALES_UNITS per (OUTLET_CODE, PRODUCT_CODE, TRANS_DATE) combination,
# returned as a flat frame with the three keys as ordinary columns.
data_grouped = (
    data
    .groupby(['OUTLET_CODE', 'PRODUCT_CODE', 'TRANS_DATE'])['SALES_UNITS']
    .sum()
    .reset_index()
)


In [62]:
from statsmodels.tsa.arima.model import ARIMA

# Takes the sales data as input and filters it to the specified outlet and product,
# producing a time series of sales units indexed by transaction date.

def forecast_sales(data, outlet_code, product_code, forecast_periods=12, order=(1, 1, 1)):
    """Forecast future SALES_UNITS for one outlet/product pair with an ARIMA model.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns OUTLET_CODE, PRODUCT_CODE, TRANS_DATE and
        SALES_UNITS. Presumably one row per outlet/product/date (e.g. the
        grouped frame) -- duplicate dates are not aggregated here.
    outlet_code, product_code
        Values matched by equality against OUTLET_CODE and PRODUCT_CODE.
    forecast_periods : int, default 12
        Number of steps ahead to forecast.
    order : tuple, default (1, 1, 1)
        The ARIMA (p, d, q) order handed to the model.

    Returns
    -------
    The output of ``model_fit.forecast(steps=forecast_periods)``, or ``None``
    (after printing a message) when no rows match the outlet/product pair.
    """
    matches = (data['OUTLET_CODE'] == outlet_code) & (data['PRODUCT_CODE'] == product_code)
    sales_units_ts = data.loc[matches].set_index('TRANS_DATE')['SALES_UNITS']

    # Nothing to model: report it and let the caller decide what to do.
    if sales_units_ts.empty:
        print(f"The time series for outlet {outlet_code} and product {product_code} is empty. Skipping.")
        return None

    # ARIMA treats the rows as consecutive observations, so make the order
    # chronological regardless of how the caller's frame was sorted.
    # mergesort is stable, so rows sharing a date keep their relative order.
    sales_units_ts = sales_units_ts.sort_index(kind='mergesort')

    # NOTE(review): the TRANS_DATE index carries no explicit frequency here;
    # statsmodels may warn and label the forecast with integer positions
    # instead of dates -- confirm and set a frequency upstream if needed.
    model_fit = ARIMA(sales_units_ts, order=order).fit()

    return model_fit.forecast(steps=forecast_periods)


In [76]:
# Example usage: forecast one outlet/product pair and emit the chart as base64 PNG text.
# These modules are imported here because the only other visible imports of
# them are local to `run`, which leaves the names undefined at notebook scope.
import base64
import io

import matplotlib.pyplot as plt

outlet_code = 'OUTLET_1'  # placeholder: replace with an outlet code present in the data
product_code = 'PRODUCT_1'  # placeholder: replace with a product code present in the data
forecast = forecast_sales(data_grouped, outlet_code, product_code)

if forecast is not None:
    # Explicit figure/axes instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(forecast, label='Forecasted Sales')
    ax.set_title(f'Sales Forecast for {product_code} at {outlet_code}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Sales Units')
    ax.legend()

    # Render the figure to PNG in memory and encode it as base64 text; the
    # context manager releases the buffer even if saving fails.
    with io.BytesIO() as buf:
        fig.savefig(buf, format='png')
        response = base64.b64encode(buf.getvalue()).decode('utf-8')

    print(response)  # or return response if within a function
else:
    print("No forecast available for the specified outlet and product.")


The time series for outlet OUTLET_1 and product PRODUCT_1 is empty. Skipping.
No forecast available for the specified outlet and product.


In [80]:
# Summary statistics of the loaded frame, shown as the cell's rich output.
# In the captured output, SALES_UNITS and SALES_VOLUME have one fewer
# non-null value than MNTH_CODE and TRANS_DATE, i.e. one row has missing sales figures.
data.describe()

Unnamed: 0,MNTH_CODE,TRANS_DATE,SALES_UNITS,SALES_VOLUME
count,387369.0,387369,387368.0,387368.0
mean,202381.769705,2024-03-01 08:50:29.148951552,12.228338,0.000929
min,202310.0,2023-10-02 00:00:00,1.0,1.1e-05
25%,202401.0,2024-01-05 00:00:00,2.0,0.00036
50%,202403.0,2024-02-28 00:00:00,6.0,0.000475
75%,202405.0,2024-05-07 00:00:00,16.0,0.0009
max,202406.0,2024-07-02 00:00:00,5120.0,0.2755
std,38.982133,,37.797226,0.002054
