In [1]:
pip install matplotlib

You should consider upgrading via the 'pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import io
import base64

def forecast_sales(data, outlet_code, product_code, forecast_periods=12):
    """Forecast sales units for one outlet/product pair with a random forest.

    The only feature is the transaction date, converted to a proleptic
    Gregorian ordinal via ``pd.Timestamp.toordinal``. The model is fitted on
    the first 80% of the matching rows (ordered split, no shuffling) and then
    asked to predict the ``forecast_periods`` ordinals that follow the largest
    date seen in the matching rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``OUTLET_CODE``, ``PRODUCT_CODE``,
        ``TRANS_DATE`` and ``SALES_UNITS``. ``TRANS_DATE`` values must be
        ``pd.Timestamp``-compatible because ``toordinal`` is mapped over them.
    outlet_code, product_code
        Values compared for equality against ``OUTLET_CODE`` / ``PRODUCT_CODE``
        to select the rows to model.
    forecast_periods : int, default 12
        Number of consecutive day ordinals after the last observed date to
        predict.

    Returns
    -------
    tuple or None
        ``None`` (after printing a notice) when no row matches the
        outlet/product pair. Otherwise ``(forecast, future_dates)`` where
        ``forecast`` is the model's prediction for each future day and
        ``future_dates`` is an integer array of shape ``(forecast_periods, 1)``
        holding the corresponding date ordinals (NOT Unix-epoch day counts).
    """
    selected = (data['OUTLET_CODE'] == outlet_code) & (data['PRODUCT_CODE'] == product_code)
    history = data.loc[selected, ['TRANS_DATE', 'SALES_UNITS']].reset_index(drop=True)

    if history.empty:
        print(f"The time series for outlet {outlet_code} and product {product_code} is empty. Skipping.")
        return None

    # Prepare the data for regression: a single numeric feature (date ordinal).
    X = pd.DataFrame({'TRANS_DATE': history['TRANS_DATE'].map(pd.Timestamp.toordinal)})
    y = history['SALES_UNITS']

    if len(X) < 2:
        # With one observation a 20% hold-out would leave nothing to train on
        # (train_test_split raises in that case), so fit on the lone row.
        X_train, y_train = X, y
    else:
        # Ordered split: the last 20% of rows are held out and not used here.
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Train the Random Forest model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Forecast future sales for the days right after the last observed date.
    last_date = X['TRANS_DATE'].max()
    future_dates = (last_date + np.arange(1, forecast_periods + 1, dtype=np.int64)).reshape(-1, 1)

    # Predict through a DataFrame carrying the same column name used at fit
    # time, so the estimator's feature-name check does not emit a warning.
    forecast = model.predict(pd.DataFrame(future_dates, columns=['TRANS_DATE']))

    return forecast, future_dates

def run(session) -> str:
    """Build a sales forecast chart and return it as a base64-encoded PNG.

    Reads the ``SALES_CLEAN`` table through ``session.table(...).to_pandas()``
    (presumably a Snowpark session -- confirm against the caller), sums
    ``SALES_UNITS`` per outlet, product and date, forecasts one hard-coded
    outlet/product pair with ``forecast_sales`` and plots the result.

    Returns
    -------
    str
        The PNG image encoded as a base64 UTF-8 string, or a plain message
        when no forecast could be produced for the outlet/product pair.
    """
    data = session.table("SALES_CLEAN").to_pandas()

    # Ensure TRANS_DATE is in datetime format
    data['TRANS_DATE'] = pd.to_datetime(data['TRANS_DATE'])

    # Group by OUTLET_CODE, PRODUCT_CODE, and TRANS_DATE, and calculate total SALES_UNITS
    data_grouped = data.groupby(['OUTLET_CODE', 'PRODUCT_CODE', 'TRANS_DATE'])['SALES_UNITS'].sum().reset_index()

    # Example usage: Forecast for a specific outlet and product
    outlet_code = 'OUTLET_1'  # replace with a specific outlet code
    product_code = 'PRODUCT_1'  # replace with a specific product code

    # forecast_sales returns None (not a tuple) when there is no data, so the
    # result must be checked before it is unpacked.
    result = forecast_sales(data_grouped, outlet_code, product_code)
    forecast, future_dates = result if result is not None else (None, None)
    if forecast is None:
        return "No forecast available for the specified outlet and product."

    # future_dates holds proleptic Gregorian ordinals (Timestamp.toordinal),
    # not days since the Unix epoch, so decode them with fromordinal.
    forecast_days = [pd.Timestamp.fromordinal(int(ordinal)) for ordinal in np.ravel(future_dates)]

    # Plot the forecast
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(forecast_days, forecast, label='Forecasted Sales')
        ax.set_title(f'Sales Forecast for {product_code} at {outlet_code}')
        ax.set_xlabel('Date')
        ax.set_ylabel('Sales Units')
        ax.legend()

        # Convert the graph to a base64 string
        with io.BytesIO() as buf:
            fig.savefig(buf, format='png')
            return base64.b64encode(buf.getvalue()).decode('utf-8')
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

# Example call to the function
# session = ...  # Initialize your session object here
# print(run(session))


In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import io
import base64