In [1]:
pip install --upgrade fosforml

Requirement already up-to-date: fosforml in /opt/conda/lib/python3.9/site-packages (1.1.8)
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install statsmodels

You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

import fosforml
from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()

In [4]:
my_session.connection.database

'ASSORTMENT_PLANNING'

In [5]:
my_session.connection.schema

'CPG_BRONZE'

In [6]:
data = "ASSORTMENT_PLANNING.CPG_BRONZE.SALES_CLEAN_WITH_CLUSTER_SEP23TOJUL24"

In [7]:
sf_df = my_session.sql("select * from {}".format(data))

In [8]:
type(sf_df)

snowflake.snowpark.dataframe.DataFrame

In [9]:
df=sf_df.to_pandas()

In [10]:
type(df)

pandas.core.frame.DataFrame

In [11]:
df.head()

Unnamed: 0,OUTLET_CODE,PRODUCT_CODE,TRANS_DATE,UNIT_PTR,MNTH_CODE,START_DATE,SALES_VALUE,SALES_UNITS,SALES_VOLUME,SALES_PTR_VALUE,...,MONTH,DAY,DAYOFWEEK,QUARTER,IS_MONTH_START,IS_MONTH_END,FREQUENCY,CATEGORY_ENCODED,SUBCATEGORY_ENCODED,CLUSTER
0,OL10328,PRD0014,2023-12-08,17.857143,202312,2023-11-27,107.14,6,0.000312,107.142857,...,12,8,4,4,False,False,1,0,20,2
1,OL10328,PRD0058,2023-10-06,4.464286,202310,2023-10-02,53.57,12,0.000132,53.571429,...,10,6,4,4,False,False,1,5,11,0
2,OL10328,PRD0064,2024-04-21,99.090909,202404,2024-04-03,198.18,2,0.000214,198.181818,...,4,21,6,2,False,False,1,1,4,1
3,OL10328,PRD0064,2024-07-14,104.545455,202407,2024-07-03,101.41,1,0.000107,104.545455,...,7,14,6,3,False,False,2,1,4,1
4,OL10328,PRD0064,2024-07-21,104.545455,202407,2024-07-03,209.09,2,0.000214,209.090909,...,7,21,6,3,False,False,3,1,4,1


In [12]:
print(df.dtypes)

OUTLET_CODE             object
PRODUCT_CODE            object
TRANS_DATE              object
UNIT_PTR               float64
MNTH_CODE                int32
START_DATE              object
SALES_VALUE            float64
SALES_UNITS              int16
SALES_VOLUME            object
SALES_PTR_VALUE        float64
OC_CODE                  int32
DISTRIBUTOR_CODE        object
CITY                    object
STATE                   object
COUNTY                  object
STREET                  object
CATEGORY                object
SUBCATEGORY             object
BRAND                   object
YEAR                     int16
MONTH                     int8
DAY                       int8
DAYOFWEEK                 int8
QUARTER                   int8
IS_MONTH_START            bool
IS_MONTH_END              bool
FREQUENCY                 int8
CATEGORY_ENCODED          int8
SUBCATEGORY_ENCODED       int8
CLUSTER                   int8
dtype: object


In [31]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
 
from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()
 
table_name = '"ASSORTMENT_PLANNING.CPG_BRONZE.SALES_CLEAN_WITH_CLUSTER_SEP23TOJUL24"'
 
sf_df = my_session.sql("select * from {}".format(table_name))
df = sf_df.to_pandas()
 
df['TRANS_DATE'] = pd.to_datetime(df['TRANS_DATE'])
df['MONTH'] = df['TRANS_DATE'].dt.month
 
# Sort the DataFrame by 'TRANS_MONTH' to maintain the time series order
df = df.sort_values(by=['CATEGORY_ENCODED', 'MONTH'])
 
# Function to create multiple lag features for SALES_UNITS
def create_lags(df, target_column, num_lags):
    for lag in range(1, num_lags + 1):
        df[f'LAG_{lag}'] = df[target_column].shift(lag)
    return df
 
# Define a function that applies the prediction process for each product, including RMSE and R² calculation
def train_single_model(df, num_lags=6):
    # Create lag features for SALES_UNITS
    df = create_lags(df, 'SALES_UNITS', num_lags)
 
    # Drop rows with NaN values (due to shifting from lag features)
    df = df.dropna()
 
    # Convert 'TRANS_MONTH' to an ordinal number to use as a feature
    df['TRANS_MONTH_ORDINAL'] = df['MONTH'].apply(lambda x: x.toordinal())
 
    # Define the features (Lag_1, Lag_2, ..., Lag_n, UNIT_PRICE, LEAD_TIME_IN_WEEKS, TRANS_MONTH_ORDINAL)
    lag_columns = [f'LAG_{lag}' for lag in range(1, num_lags + 1)]
    feature_columns = lag_columns + ['UNIT_PRICE', 'WEEKS', 'TRANS_MONTH_ORDINAL']
 
    # Define X (features) and y (target)
    X = df[feature_columns]
    Y = df['SALES_UNITS']
 
    # Split the data into training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)
 
    # Initialize the Linear Regression model
    model = LinearRegression()
 
    # Fit the model to the training data
    model.fit(X_train, Y_train)
 
    # Make predictions on the test data
    Y_pred = model.predict(X_test)
 
    # Convert y_pred to DataFrame
    Y_pred = pd.DataFrame(Y_pred, columns=['SALES_UNITS_PRED'])
 
    # Ensure column names are in uppercase and conform to allowed characters
    X_train.columns = X_train.columns.str.upper()
    Y_train = pd.DataFrame(Y_train, columns=['SALES_UNITS']).apply(lambda x: x.astype(str).str.upper())
    X_test.columns = X_test.columns.str.upper()
    Y_test = pd.DataFrame(Y_test, columns=['SALES_UNITS']).apply(lambda x: x.astype(str).str.upper())
    Y_pred.columns = Y_pred.columns.str.upper()
    '''
    print(type(X_train))
    print(type(X_test))
    print(type(Y_train))
    print(type(Y_test))
    print(type(Y_pred))
    '''
    from fosforml import register_model
    register_model(
        model_obj=model,
        session=my_session,
        x_train=X_train,
        x_test=X_test,
        y_train=Y_train,
        y_test=Y_test,
        y_pred=Y_pred,
        source="Notebook",
        dataset_name="ASSORTMENT_PLANNING.CPG_BRONZE.SALES_CLEAN_WITH_CLUSTER_SEP23TOJUL24",
        dataset_source="Snowflake",
        name="PCBA",
        description="This is a model for order forecast",
        flavour="sklearn",
        model_type="Regression",
        conda_dependencies=["scikit-learn==1.3.0"]
    )
    # Calculate RMSE
    rmse = mean_squared_error(Y_test, Y_pred, squared=False)
 
    # Calculate R²
    r2 = r2_score(Y_test, Y_pred)
 
    return model, rmse, r2, X_test, Y_test, df, feature_columns
 
def forecast_for_each_product(model, product_df, df, feature_columns, num_lags=6, forecast_periods=25):
    # Create lag features for SALES_UNITS
    product_df = create_lags(product_df, 'SALES_UNITS', num_lags)
    # Drop rows with NaN values (due to shifting from lag features)
    product_df = product_df.dropna()
 
    # Convert 'TRANS_MONTH' to an ordinal number to use as a feature
    product_df['TRANS_MONTH_ORDINAL'] = product_df['MONTH'].apply(lambda x: x.toordinal())
 
    # Define X (features) for this product
    X_product = product_df[feature_columns]
 
    # Forecast for the next 25 months
    future_months = pd.date_range(product_df['MONTH'].max(), periods=forecast_periods, freq='MS')
 
    # Initialize the last known lag values, UNIT_PRICE, LEAD_TIME_IN_WEEKS, and TRANS_MONTH_ORDINAL
    lag_columns = [f'LAG_{lag}' for lag in range(1, num_lags + 1)]
    last_lags = list(product_df[lag_columns].iloc[-1])
    last_unit_price = product_df['UNIT_PRICE'].iloc[-1]
    last_lead_time = product_df['WEEKS'].iloc[-1]
    last_TRANS_MONTH_ordinal = product_df['TRANS_MONTH_ORDINAL'].iloc[-1]
 
    # Create an empty list to store predictions
    future_preds = []
 
    for i in range(forecast_periods):
        # Increment the TRANS_MONTH_ORDINAL for the next future month
        last_TRANS_MONTH_ordinal += 30  # Assuming average month is ~30 days
 
        # Prepare input features for the next prediction
        future_X = np.array([last_lags + [last_unit_price, last_lead_time, last_TRANS_MONTH_ordinal]])
 
        # Predict the next SALES_UNITS
        future_pred = model.predict(future_X)[0]
 
        # Append the prediction to the list
        future_preds.append(future_pred)
 
        # Update lag values for the next iteration
        last_lags = [future_pred] + last_lags[:-1]  # Shift the lags with the new prediction
 
    # Format future_months as 'YYYY-MMM'
    future_months_formatted = future_months.strftime('%Y-%m-01')
 
    # Create a DataFrame for this product's future SALES_UNITS predictions
    future_forecast_df = pd.DataFrame({
        'MONTH': future_months_formatted,
        'PREDICTED_SALES_UNITS': future_preds,
        'CATEGORY_ENCODED': product_df['CATEGORY_ENCODED'].iloc[0],  # Add the PRODUCT_ID to each row
    })
 
    return future_forecast_df
 
# Example usage:
# Train the model once on the entire dataset
model, rmse, r2, X_test, Y_test, df, feature_columns = train_single_model(df)
 
# Loop through each product and forecast its future SALES_UNITS
all_forecasts = []
 
for product_id, product_df in df.groupby('CATEGORY_ENCODED'):
    forecast_df = forecast_for_each_product(model, product_df, df, feature_columns)
    all_forecasts.append(forecast_df)
 
# Combine all forecasts into one DataFrame
final_forecast_df = pd.concat(all_forecasts)
final_forecast_df

SnowparkSQLException: (1304): 01b7ae5b-0003-dee5-0000-31ef0025c01a: 002003 (42S02): SQL compilation error:
Object '"ASSORTMENT_PLANNING.CPG_BRONZE.SALES_CLEAN_WITH_CLUSTER_SEP23TOJUL24"' does not exist or not authorized.