In [1]:
pip install darts

Collecting statsforecast>=1.4 (from darts)
  Using cached statsforecast-1.7.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (28 kB)
Using cached statsforecast-1.7.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (314 kB)
Installing collected packages: statsforecast
  Attempting uninstall: statsforecast
    Found existing installation: statsforecast 0.6.0
    Uninstalling statsforecast-0.6.0:
      Successfully uninstalled statsforecast-0.6.0
Successfully installed statsforecast-1.7.8
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install statsforecast==0.6.0


In [None]:
pip install --upgrade pip

In [None]:
conda install -c conda-forge lightgbm

In [None]:
import fosforml
from fosforml.model_manager.snowflakesession import get_session
# Obtain the session object from fosforml's snowflakesession helper; the query
# cell further down runs its SQL through this object.
my_session = get_session()

In [None]:
# Display the `database` attribute of the session's connection.
my_session.connection.database

In [None]:
# Display the `schema` attribute of the session's connection.
my_session.connection.schema

In [None]:
table_name = "ASSORTMENT_PLANNING.CPG_BRONZE.SALES_CLEAN"

In [None]:
sf_df = my_session.sql("select * from {}".format(table_name))
type(sf_df)
df=sf_df.to_pandas()
type(df)

In [None]:
df.head(5)

In [None]:
df.info()

## Generate Additional Features

In [None]:
import pandas as pd

def generate_features(df):
    """Return a copy of ``df`` with calendar features and a per-unit price column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``TRANS_DATE`` (anything ``pd.to_datetime`` can parse),
        ``SALES_PTR_VALUE`` and ``SALES_UNITS``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified, so the calling cell
        can be re-run without compounding changes. ``TRANS_DATE`` is converted
        to datetime and these columns are added: ``year``, ``month``, ``day``,
        ``dayofweek``, ``quarter``, ``is_month_start``, ``is_month_end`` and
        ``UNIT_PTR`` (``SALES_PTR_VALUE / SALES_UNITS``).
    """
    out = df.copy()

    # Ensure the TRANS_DATE column is in datetime format
    out['TRANS_DATE'] = pd.to_datetime(out['TRANS_DATE'])
    trans_date = out['TRANS_DATE'].dt

    # Extract temporal features
    out['year'] = trans_date.year
    out['month'] = trans_date.month
    out['day'] = trans_date.day
    out['dayofweek'] = trans_date.dayofweek
    out['quarter'] = trans_date.quarter
    out['is_month_start'] = trans_date.is_month_start
    out['is_month_end'] = trans_date.is_month_end

    # Create UNIT_PTR feature. A row with zero SALES_UNITS divides to +/-inf,
    # which would propagate into any later mean or scaling of this column, so
    # such rows are reported as missing (NaN) instead.
    unit_ptr = out['SALES_PTR_VALUE'] / out['SALES_UNITS']
    out['UNIT_PTR'] = unit_ptr.replace([float('inf'), float('-inf')], float('nan'))

    return out

In [None]:
df = generate_features(df)

In [None]:
df.head()

In [None]:
df = df.sort_values(by='TRANS_DATE')

In [None]:
# Convert all column names to uppercase
df.columns = [col.upper() for col in df.columns]

In [None]:
df.info()

In [None]:
df_sorted = df.sort_values(by=['OUTLET_CODE', 'PRODUCT_CODE', 'TRANS_DATE'])

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
# List of columns in the desired order: the identifying columns first, then
# every remaining column exactly once. All leading columns must be excluded
# from the remainder; excluding only OUTLET_CODE and PRODUCT_CODE would select
# TRANS_DATE and UNIT_PTR twice and leave df_sorted with duplicate column labels.
leading_columns = ['OUTLET_CODE', 'PRODUCT_CODE', 'TRANS_DATE', 'UNIT_PTR']
columns_order = leading_columns + [col for col in df_sorted.columns if col not in leading_columns]

# Reorder the DataFrame columns
df_sorted = df_sorted[columns_order]

In [None]:
df_sorted['FREQUENCY'] = df_sorted.groupby(['OUTLET_CODE', 'PRODUCT_CODE']).cumcount() + 1

In [None]:
df_sorted.head()

Split the dataset into test and train

## K-means clustering of outlets (`OUTLET_CODE`)

In [None]:
# #Label Encoding

# import pandas as pd
# from sklearn.preprocessing import LabelEncoder

# # Initialize the label encoder
# label_encoder = LabelEncoder()

# # List of columns to encode
# columns_to_encode = ['PRODUCT_CODE', 'CATEGORY', 'SUBCATEGORY', 'BRAND']

# # Apply label encoding to each column
# for column in columns_to_encode:
#     train_data[column + '_encoded'] = label_encoder.fit_transform(train_data[column])

# # Convert all column names to uppercase and replace spaces with underscores
# train_data.columns = train_data.columns.str.upper().str.replace(' ', '_')

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def label_encode_dataframe(df, columns_to_encode):
    """Return a copy of ``df`` with a label-encoded companion for each column.

    For every name in ``columns_to_encode`` an integer-coded column called
    ``<NAME>_ENCODED`` is added (name upper-cased, spaces replaced by
    underscores). All column labels of the result are then normalised the same
    way.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``columns_to_encode``.
    columns_to_encode : iterable of str
        Labels of the columns to encode, as they currently appear in ``df``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified.
    """
    out = df.copy()

    # Apply label encoding to each column
    for column in columns_to_encode:
        # A fresh encoder per column so no fitted state is carried from one
        # column to the next.
        codes = LabelEncoder().fit_transform(out[column])
        # Write directly to the normalised label. If the frame already holds
        # that column (the function was run on its own output), it is
        # overwritten instead of producing a second, identically named column
        # once the labels are upper-cased below.
        encoded_name = (column + '_encoded').upper().replace(' ', '_')
        out[encoded_name] = codes

    # Convert all column names to uppercase and replace spaces with underscores
    out.columns = out.columns.str.upper().str.replace(' ', '_')

    return out

In [None]:
# Columns that receive a label-encoded `<NAME>_ENCODED` companion column.
columns_to_encode = ['PRODUCT_CODE', 'CATEGORY', 'SUBCATEGORY', 'BRAND']
# label_encode_dataframe also upper-cases every column label and replaces
# spaces with underscores, so later cells refer to e.g. PRODUCT_CODE_ENCODED.
df_sorted = label_encode_dataframe(df_sorted, columns_to_encode)

In [None]:
# Check the resulting columns and dtypes.
df_sorted.info()

In [None]:
# Aggregation dictionary
aggregation_dict = {
    'SALES_UNITS': 'mean',
    'UNIT_PTR': 'mean',
    'FREQUENCY': 'count',
    'PRODUCT_CODE_ENCODED': 'mean'
}

# Aggregate data by OUTLET_CODE
aggregated_df = df_sorted.groupby('OUTLET_CODE').agg(aggregation_dict).reset_index()

In [None]:
# Select features for clustering
features = ['SALES_UNITS', 'UNIT_PTR', 'FREQUENCY', 'PRODUCT_CODE_ENCODED']

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Define preprocessing steps: standardise the four aggregated outlet-level
# columns that are used as clustering inputs.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['SALES_UNITS', 'UNIT_PTR', 'FREQUENCY', 'PRODUCT_CODE_ENCODED']),
        # NOTE(review): an earlier comment here mentioned one-hot encoded category
        # columns, but this notebook creates none; the transformer is fitted on
        # aggregated_df[features], which holds exactly the four columns above.
    ],
    remainder='passthrough'  # Columns not listed above would pass through unscaled; none are supplied in this notebook
)

In [None]:
df_sorted = preprocessor.fit_transform(aggregated_df[features])

In [None]:
# Calculate silhouette scores for different numbers of clusters
silhouette_scores = []
k_range = range(2, 11)  # Silhouette score is not defined for k=1
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(df_sorted)
    silhouette_avg = silhouette_score(df_sorted, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Plot the silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(k_range, silhouette_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score For Optimal k')
plt.show()

In [None]:
# Apply K-Means clustering with the chosen number of clusters 
optimal_k = k_range[silhouette_scores.index(max(silhouette_scores))]  # Choose the k with the highest silhouette score
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
aggregated_df['CLUSTER'] = kmeans.fit_predict(train_data_preprocessed)

# Display the first few rows to verify
print(aggregated_df.head())

In [None]:
aggregated_df.to_csv('outlet_to_cluster_mapping.csv', index=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Example visualization Clusters of SALES_UNITS VS SALES_VALUE
sns.scatterplot(x='SALES_UNITS', y='FREQUENCY', hue='CLUSTER', data=aggregated_df)
plt.title('Clusters of OUTLET_CODE')
plt.show()

In [None]:
# Attach each outlet's cluster label to the transaction-level rows.
# A CLUSTER column left behind by a previous run of this cell is dropped first,
# so re-running does not produce CLUSTER_x / CLUSTER_y. validate= makes the
# merge fail loudly if the mapping ever holds an OUTLET_CODE more than once,
# which would otherwise silently duplicate transactions.
df_sorted = df_sorted.drop(columns='CLUSTER', errors='ignore').merge(
    aggregated_df[['OUTLET_CODE', 'CLUSTER']],
    on='OUTLET_CODE',
    how='left',
    validate='many_to_one',
)

In [None]:
# Forecasting cell: builds lag features, converts the data to darts series,
# splits on MNTH_CODE, fits ExponentialSmoothing and prints MSE / R^2.
# NOTE(review): as written this cell mixes a darts time-series workflow with
# sklearn-style evaluation names (y_test / y_pred) that are never assigned;
# see the notes on individual lines below before relying on it.
import pandas as pd
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
from darts import TimeSeries
# NOTE(review): RegressionModel is documented under darts.models rather than the
# top-level darts package — confirm this import resolves in the installed version.
from darts import RegressionModel
from darts.models import ExponentialSmoothing
from darts.utils.missing_values import fill_missing_values
from sklearn.ensemble import RandomForestRegressor

# Create lagged columns within each group of OUTLET_CODE and PRODUCT_CODE
# (shift(n) takes the value n rows earlier in the group's current row order).
df_sorted['SALES_UNIT_LAG_1'] = df_sorted.groupby(['OUTLET_CODE', 'PRODUCT_CODE'])['SALES_UNITS'].shift(1)
df_sorted['SALES_UNIT_LAG_2'] = df_sorted.groupby(['OUTLET_CODE', 'PRODUCT_CODE'])['SALES_UNITS'].shift(2)
df_sorted['SALES_UNIT_LAG_3'] = df_sorted.groupby(['OUTLET_CODE', 'PRODUCT_CODE'])['SALES_UNITS'].shift(3)

# NOTE(review): dropna() without `subset` removes rows with a missing value in
# ANY column, not only the three lag columns — confirm that is intended.
df_sorted = df_sorted.dropna()

# Define features and target
features = [
    'YEAR', 'MONTH', 'DAY', 'DAYOFWEEK', 'QUARTER',
    'PRODUCT_CODE_ENCODED', 'CATEGORY_ENCODED', 'SUBCATEGORY_ENCODED', 'BRAND_ENCODED',
    'CLUSTER', 'UNIT_PTR', 'SALES_UNIT_LAG_1', 'SALES_UNIT_LAG_2', 'SALES_UNIT_LAG_3'
]
target = 'SALES_UNITS'

# Convert pandas timeseries dataframe into darts timeseries object
# NOTE(review): per the darts docs from_group_dataframe appears to return a list
# of TimeSeries (one per group), not a single series — verify before calling
# TimeSeries methods on `series`.
series = TimeSeries.from_group_dataframe(df_sorted, group_cols = ['OUTLET_CODE','PRODUCT_CODE'], time_col='TRANS_DATE', value_cols=target)

feature_series = [TimeSeries.from_group_dataframe(df_sorted, group_cols = ['OUTLET_CODE','PRODUCT_CODE'], time_col='TRANS_DATE', value_cols=feature) for feature in features]

# # Create Timeseries object for lag features
# lag_1_series = Time.from_group_dataframe(df_sorted, group_cols = ['OUTLET_CODE','PRODUCT_CODE'], time_col='TRANS_DATE', value_cols='SALES_UNIT_LAG_1')
# lag_2_series = Time.from_group_dataframe(df_sorted, group_cols = ['OUTLET_CODE','PRODUCT_CODE'], time_col='TRANS_DATE', value_cols='SALES_UNIT_LAG_2')
# lag_3_series = Time.from_group_dataframe(df_sorted, group_cols = ['OUTLET_CODE','PRODUCT_CODE'], time_col='TRANS_DATE', value_cols='SALES_UNIT_LAG_3')

# # Stack the original series with lagged features
# multi_series = series.stack([lag_1_series, lag_2_series, lag_3_series])

# Combine the target series with the feature series
# NOTE(review): `combined_series` is not used again in this cell, and this call
# depends on what `series` / `feature_series` actually are (see note above).
combined_series = series.stack(*feature_series)

# Split the data: rows whose MNTH_CODE is 202408 form the test set, all other
# rows form the training set.
# NOTE(review): `!=` also places any months after 202408 in the training set,
# if such rows exist — confirm the data ends at 202408.
train_data = df_sorted[df_sorted['MNTH_CODE'] != 202408]
test_data = df_sorted[df_sorted['MNTH_CODE'] == 202408]

# NOTE(review): train_data is a pandas DataFrame here, not one of the darts
# series built above; darts models are documented as fitting on TimeSeries —
# confirm what fit() should receive. None of `features` is passed to the model.
model=ExponentialSmoothing()
model.fit(train_data)

# Forecast horizon is the number of test rows across all outlet/product pairs.
forecast = model.predict(len(test_data))

# Not the last expression of the cell, so its value is not displayed.
forecast.head(10)

# Evaluate the model
# NOTE(review): y_test and y_pred are not assigned anywhere in the executable
# code of this notebook (only in a commented-out cell below), so these lines
# raise NameError on a fresh kernel.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

In [None]:
# train_data.info()

In [None]:
# import pandas as pd
# from darts import TimeSeries
# from darts.models import LightGBMModel
# from darts.dataprocessing.transformers import Scaler
# from darts.metrics import mape

# # Define features and target
# features = [
#     'YEAR', 'MONTH', 'DAY', 'DAYOFWEEK', 'QUARTER',
#     'PRODUCT_CODE_ENCODED', 'CLUSTER', 'UNIT_PTR'
# ]
# target = 'SALES_UNITS'

# # Create a TimeSeries object for the target variable
# series = TimeSeries.from_group_dataframe(train_data, group_cols = ['OUTLET_CODE','PRODUCT_CODE'], time_col='TRANS_DATE', value_cols=target)

# # Create TimeSeries objects for the features
# feature_series = [TimeSeries.from_group_dataframe(train_data, group_cols = ['OUTLET_CODE','PRODUCT_CODE'], time_col='TRANS_DATE', value_cols=feature) for feature in features]

# # Combine the target series with the feature series
# combined_series = series.stack(*feature_series)

# # Do the above 3 steps for train dataset
# series_test = TimeSeries.from_group_dataframe(test_data, group_cols = ['OUTLET_CODE','PRODUCT_CODE'], time_col='TRANS_DATE', value_cols=target)
# feature_series_test = [TimeSeries.from_group_dataframe(test_data, group_cols = ['OUTLET_CODE','PRODUCT_CODE'], time_col='TRANS_DATE', value_cols=feature) for feature in features]
# combined_series_test = series_test.stack(*feature_series_test)

# # Scale the data
# scaler = Scaler()
# scaled_series = scaler.fit_transform(combined_series)
# scaled_series_test = scaler.fit_transform(combined_series_test)

# # Initialize and train the LightGBMModel
# model = LightGBMModel(lags=3)
# model.fit(scaled_series)

# # Make predictions
# pred = model.predict(len(scaled_series_test))

# # Inverse transform the predictions
# pred = scaler.inverse_transform(pred)

# # Evaluate the model
# print(f"MAPE: {mape(scaled_series_test, pred)}")

# # Plot the results
# series.plot(label='actual')
# pred.plot(label='forecast')

In [None]:
# import pandas as pd
# from darts import TimeSeries
# from darts.models import LightGBMModel
# from darts.dataprocessing.transformers import Scaler
# from darts.metrics import mape

# # Define features and target
# features = [
#     'YEAR', 'MONTH', 'DAY', 'DAYOFWEEK', 'QUARTER',
#     'PRODUCT_CODE_ENCODED', 'CLUSTER', 'UNIT_PTR'
# ]
# target = 'SALES_UNITS'

# # Create a TimeSeries object for the target variable
# series = TimeSeries.from_group_dataframe(train_data, group_cols = ['OUTLET_CODE','PRODUCT_CODE'], time_col='TRANS_DATE', value_cols=target)

# # Create TimeSeries objects for the features
# feature_series = [TimeSeries.from_group_dataframe(train_data, group_cols = ['OUTLET_CODE','PRODUCT_CODE'], time_col='TRANS_DATE', value_cols=feature) for feature in features]

# # # Combine the target series with the feature series
# # combined_series = series.stack(*feature_series)

# # Do the above 3 steps for train dataset
# series_test = TimeSeries.from_group_dataframe(test_data, group_cols = ['OUTLET_CODE','PRODUCT_CODE'], time_col='TRANS_DATE', value_cols=target)
# feature_series_test = [TimeSeries.from_group_dataframe(test_data, group_cols = ['OUTLET_CODE','PRODUCT_CODE'], time_col='TRANS_DATE', value_cols=feature) for feature in features]

# # combined_series_test = series_test.stack(*feature_series_test)

# # # Scale the data
# # scaler = Scaler()
# # scaled_series = scaler.fit_transform(series)
# # scaled_series_test = scaler.fit_transform(series_test)
# # scaled_feature_series = scaler.fit_transform(feature_series)
# # scaled_feature_series_test = scaler.fit_transform(feature_series_test)

# model = RegressionModel(
#                         lags[-1,-2,-3],
#     lags_features=[0], model=NBEATSModel()
# )

# model.fit(series,feature_series)
# y_pred = model.predict(series_test,feature_series_test)

# # Initialize and train the LightGBMModel
# model = LightGBMModel(lags=3)
# model.fit(scaled_series)

# # Make predictions
# pred = model.predict(len(scaled_series_test))

# # Inverse transform the predictions
# pred = scaler.inverse_transform(pred)

# # Evaluate the model
# print(f"MAPE: {mape(scaled_series_test, pred)}")

# # Plot the results
# series.plot(label='actual')
# pred.plot(label='forecast')