In [None]:
pip install --upgrade fosforml

Collecting fosforml
[?25l  Downloading https://files.pythonhosted.org/packages/94/2e/3613fd0ccdbf3709dec86f87fe7624737a6f08bd1a813c88e65e7352dfde/fosforml-1.1.8-py3-none-any.whl (42kB)
[K     |████████████████████████████████| 51kB 4.6MB/s eta 0:00:011
[?25hCollecting snowflake-ml-python==1.5.0; python_version <= "3.9"
[?25l  Downloading https://files.pythonhosted.org/packages/80/72/c0fa5a9bc811a59a5a1c7113ff89676ed1629d7d6463db8c1a8c97a8b5f6/snowflake_ml_python-1.5.0-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 9.9MB/s eta 0:00:01
[?25hCollecting scikit-learn==1.3.2
[?25l  Downloading https://files.pythonhosted.org/packages/25/89/dce01a35d354159dcc901e3c7e7eb3fe98de5cb3639c6cd39518d8830caa/scikit_learn-1.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9MB)
[K     |████████████████████████████████| 10.9MB 29.6MB/s eta 0:00:01
[?25hCollecting cloudpickle==2.2.1
  Downloading https://files.pythonhosted.org/packages/15/80/44286939ca215

In [None]:
pip install --upgrade numpy

In [None]:
pip install --upgrade seaborn

In [None]:
!pip install pandas
!pip install snowflake-ml-python
!pip install requests

In [None]:
# pip install ydata-profiling --upgrade

In [None]:
import pandas as pd
import numpy as np
import fosforml
from fosforml.model_manager.snowflakesession import get_session
# Session object used below to run SQL and to read connection metadata.
# NOTE(review): presumably a Snowpark session pre-configured by the platform - confirm.
my_session = get_session()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt 
# Use matplotlib's 'fivethirtyeight' style sheet for every figure drawn later.
plt.style.use('fivethirtyeight')

In [None]:
my_session.connection.database

In [None]:
my_session.connection.schema

In [None]:
table_name = "ASSORTMENT_PLANNING.CPG_BRONZE.SALES_CLEAN"

In [None]:
sf_df = my_session.sql("select * from {}".format(table_name))

In [None]:
type(sf_df)

In [None]:
df=sf_df.to_pandas()

In [None]:
type(df)

In [None]:
df.head(5)

In [None]:
df.shape

In [None]:
df.info()

## Converting to datetime data type

In [None]:
# Parse both date columns into pandas datetimes, in place on df.
for date_column in ('TRANS_DATE', 'START_DATE'):
    df[date_column] = pd.to_datetime(df[date_column])

In [None]:
# Descriptive statistics of the loaded frame.
df.describe()

In [None]:
# Column labels.
df.columns

In [None]:
# Count of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Find unique dates
df['TRANS_DATE'].nunique()

In [None]:
# Calendar features derived from the transaction date.
trans_date_parts = df['TRANS_DATE'].dt
df['YEAR'] = trans_date_parts.year
df['MONTH'] = trans_date_parts.month
df['DAY'] = trans_date_parts.day
# pandas numbers Monday as 0; shift so Monday is 1 and Sunday is 7.
df['DAY_OF_WEEK'] = trans_date_parts.dayofweek + 1
df['QUARTER'] = trans_date_parts.quarter
df['DAY_OF_YEAR'] = trans_date_parts.day_of_year

In [None]:
# Set option to display all columns
pd.set_option('display.max_columns', None)

In [None]:
df.head(5)

In [None]:
# Disabled automated EDA report. Kept as real comments: a bare triple-quoted
# string at the end of a cell is an expression and gets echoed as cell output.
# from ydata_profiling import ProfileReport
# profile=ProfileReport(df,explorative=True)
# profile.to_file("autoeda_v1.html")

In [None]:
df_dup = df[df.duplicated()].sort_values(by=['TRANS_DATE','SALES_UNITS','OUTLET_CODE'])

In [None]:
df_unique = df.drop_duplicates()

In [None]:
df_unique.shape

In [None]:
df_check = df_unique.sort_values(by='TRANS_DATE')
df_check.head()

In [None]:
df_all_dates = pd.date_range(start='2023-08-29', end='2024-08-27').tolist()

In [None]:
missing_dates=set(df_all_dates) - set(df_unique['TRANS_DATE'])
len(missing_dates)

In [None]:
# Calculate the week number for each date
df_check['WEEK_NUMBER'] = df_check['TRANS_DATE'].dt.isocalendar().week

In [None]:
df_check.head()

In [None]:
# Calendar-year offset from the earliest transaction year.
df_check['YEAR_DIFF'] = df_check['TRANS_DATE'].dt.year - df_check['TRANS_DATE'].dt.year.min()

# Monday that opens the week containing the earliest transaction.
first_day = df_check['TRANS_DATE'].min().normalize()
first_week_start = first_day - pd.Timedelta(days=first_day.dayofweek)

# Continuous week number: whole Monday-to-Sunday weeks elapsed since that first
# Monday, starting at 1. Counting days instead of combining ISO week numbers
# with year offsets needs no hard-coded starting week, keeps increasing across
# year boundaries (including 53-week ISO years), and is fully vectorised.
df_check['CONTINUOUS_WEEK_NUMBER'] = (
    (df_check['TRANS_DATE'].dt.normalize() - first_week_start).dt.days // 7 + 1
)

In [None]:
import matplotlib.pyplot as plt

# One figure per entry: total SALES_UNITS grouped by the given column.
# (group-by column, plot kind, colour, title, x-axis label)
sales_plot_specs = [
    ('CONTINUOUS_WEEK_NUMBER', 'bar', 'orange',
     'SALES_UNITS over CONTINUOUS_WEEK_NUMBER', 'CONTINUOUS_WEEK_NUMBER'),
    ('DAY_OF_WEEK', 'bar', 'salmon',
     'SALES_UNITS over DAY_OF_WEEK', 'Day of the Week'),
    ('TRANS_DATE', 'line', 'lightgreen',
     'SALES_UNITS over DAY', 'TRANSACTION DATE'),
    ('MONTH', 'bar', 'skyblue',
     'SALES_UNITS over MONTH', 'Month'),
]

for group_col, plot_kind, plot_color, plot_title, x_label in sales_plot_specs:
    plt.figure(figsize=(10, 5))
    totals = df_check.groupby(group_col)['SALES_UNITS'].sum()
    totals.plot(kind=plot_kind, color=plot_color)
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel('SALES_UNITS')
    plt.grid(True)
    plt.show()

In [None]:
df_final = df_check.drop(['WEEK_NUMBER'],axis=1)

In [None]:
df_final.tail()

In [None]:
df_final['OUTLET_CODE'].nunique()

In [None]:
# # In case cluster column exists, uncomment this
#df_final = df_final.drop(columns=['CLUSTER'])


In [None]:
## Additional Features

df_final['FREQUENCY'] = df_final.groupby(['OUTLET_CODE', 'PRODUCT_CODE']).cumcount() + 1

df_final['PROFIT_PER_UNIT'] = (df_final['SALES_PTR_VALUE'] - df_final['SALES_VALUE']) / df_final['SALES_UNITS']

df_final['DAYS_BETWEEN'] = df_final.groupby(['OUTLET_CODE', 'PRODUCT_CODE'])['TRANS_DATE'].diff().dt.days
df_final['DAYS_BETWEEN'] = df_final['DAYS_BETWEEN'].fillna(0)

df_final['UNIT_PTR'] = df_final['SALES_PTR_VALUE']/df_final['SALES_UNITS']


In [None]:
df_final[(df_final['PRODUCT_CODE'] == 'PRD0147') & (df_final['OUTLET_CODE'] == 'OL160188')].head()

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# List of columns to encode
columns_to_encode = ['BRAND', 'PRODUCT_CODE', 'SUBCATEGORY', 'CITY','STATE', 'COUNTY']

# One fitted encoder per source column. A single shared encoder would be
# refitted on every iteration and only remember the last column's classes,
# making it impossible to map the integer codes back with inverse_transform().
label_encoders = {}
for column in columns_to_encode:
    label_encoder = LabelEncoder()
    df_final[column + '_encoded'] = label_encoder.fit_transform(df_final[column])
    label_encoders[column] = label_encoder

# One-hot encode the remaining categoricals (the original columns are replaced).
df_final = pd.get_dummies(df_final, columns=['DISTRIBUTOR_CODE', 'CATEGORY'])

# Convert all column names to uppercase and replace spaces with underscores
df_final.columns = df_final.columns.str.upper().str.replace(' ', '_')

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Normalise column labels: upper case, spaces replaced by underscores.
df_final.columns = df_final.columns.str.upper().str.replace(' ', '_')

# One-hot category indicator columns expected in df_final.
category_columns = [
    'CATEGORY_DENTAL',
    'CATEGORY_HAIR_CARE',
    'CATEGORY_KIDS_CARE',
    'CATEGORY_LOTION',
    'CATEGORY_PERFUME_AND_DEODRANTS',
    'CATEGORY_SOAP',
    'CATEGORY_WIPES',
]

# Continuous per-outlet measures that get standardised.
numeric_features = ['SALES_UNITS', 'PROFIT_PER_UNIT', 'FREQUENCY']

# Per-outlet summary: mean units, mean profit per unit, number of rows, and for
# each category the maximum of its indicator column.
aggregation_dict = {
    'SALES_UNITS': 'mean',
    'PROFIT_PER_UNIT': 'mean',
    'FREQUENCY': 'count',
    **{col: 'max' for col in category_columns},
}
aggregated_df = df_final.groupby('OUTLET_CODE').agg(aggregation_dict).reset_index()

# Columns fed to the clustering step, numeric measures first.
features = numeric_features + category_columns

# Standardise the numeric measures; every other selected column passes through
# unchanged.
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numeric_features)],
    remainder='passthrough',
)

# Feature matrix used for clustering.
df_preprocessed = preprocessor.fit_transform(aggregated_df[features])


In [None]:
# Calculate silhouette scores for different numbers of clusters
silhouette_scores = []
k_range = range(2, 11)  # Silhouette score is not defined for k=1
for k in k_range:
    # n_init is set explicitly: scikit-learn 1.3 warns that the default changes
    # from 10 to 'auto' in 1.4, which would silently alter the fitted clusters
    # (and therefore the selected k) after a library upgrade.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(df_preprocessed)
    silhouette_avg = silhouette_score(df_preprocessed, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Plot the silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(k_range, silhouette_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score For Optimal k')
plt.show()

In [None]:
# Apply K-Means clustering with the chosen number of clusters
# Choose the k with the highest silhouette score (first one on ties).
optimal_k = k_range[silhouette_scores.index(max(silhouette_scores))]
# n_init is set explicitly so the result does not depend on the scikit-learn
# version's default (10 up to 1.3, 'auto' from 1.4).
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
aggregated_df['CLUSTER'] = kmeans.fit_predict(df_preprocessed)

# Display the first few rows to verify (rich table rendering instead of print).
aggregated_df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Outlet clusters: mean SALES_UNITS (x) against row count FREQUENCY (y),
# coloured by assigned cluster.
sns.scatterplot(
    data=aggregated_df,
    x='SALES_UNITS',
    y='FREQUENCY',
    hue='CLUSTER',
)
plt.title('Clusters of OUTLET_CODE')
plt.show()

In [None]:
df_final_with_clusters = df_final.merge(aggregated_df[['OUTLET_CODE', 'CLUSTER']], on='OUTLET_CODE', how='left')
df_final_with_clusters.head()

In [None]:
df_featureset = df_final_with_clusters.copy()

In [None]:
# df_featureset.drop(['OC_CODE','OUTLET_CODE','PRODUCT_CODE','STREET','CITY','STATE','COUNTY','PRODUCT_CODE','SUBCATEGORY','BRAND'], axis=1, inplace=True)

In [None]:
df_featureset.info()

In [None]:
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

# Target series, and every other column as the candidate feature frame.
y = df_featureset['SALES_UNITS']
X = df_featureset.drop(columns='SALES_UNITS')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Model inputs (order matters for the importance table printed at the end).
# Currently disabled candidates:
#   'COUNTY_ENCODED',
#   'DISTRIBUTOR_CODE_DB0110', 'DISTRIBUTOR_CODE_DB0209', 'DISTRIBUTOR_CODE_DB0652',
#   'DISTRIBUTOR_CODE_DB0655', 'DISTRIBUTOR_CODE_DB0706',
#   'CATEGORY_DENTAL', 'CATEGORY_HAIR_CARE', 'CATEGORY_KIDS_CARE', 'CATEGORY_LOTION',
#   'CATEGORY_PERFUME_AND_DEODRANTS', 'CATEGORY_SOAP', 'CATEGORY_WIPES'
features = [
    'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'DAY_OF_YEAR',
    'FREQUENCY',
    'BRAND_ENCODED', 'PRODUCT_CODE_ENCODED', 'SUBCATEGORY_ENCODED',
    'CITY_ENCODED', 'STATE_ENCODED',
    'CLUSTER', 'UNIT_PTR',
]
target = 'SALES_UNITS'

# Hold out rows whose MNTH_CODE equals 202408 for testing; train on the rest.
# NOTE(review): assumes MNTH_CODE is stored as an integer like 202408 - confirm dtype.
is_holdout = df_featureset['MNTH_CODE'] == 202408
train_data = df_featureset[~is_holdout]
test_data = df_featureset[is_holdout]

X_train, y_train = train_data[features], train_data[target]
X_test, y_test = test_data[features], test_data[target]

# Fit a 100-tree random forest with a fixed seed.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict the held-out rows and score the predictions.
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Importance of each input according to the fitted forest, largest first.
feature_importances = pd.Series(rf.feature_importances_, index=features).sort_values(ascending=False)
print("Feature Importances:\n", feature_importances)

In [None]:
# Create a DataFrame to compare actual and predicted values
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

comparison_df.tail()

In [None]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score

# Assuming y_test and y_pred are your actual and predicted values respectively

# Define a threshold to convert continuous predictions to binary labels
threshold = 10  # Example threshold

# Convert to binary labels based on the threshold
y_test_binary = (y_test >= threshold).astype(int)
y_pred_binary = (y_pred >= threshold).astype(int)

# Compute precision and recall
precision = precision_score(y_test_binary, y_pred_binary)
recall = recall_score(y_test_binary, y_pred_binary)

print(f"Precision: {precision}")
print(f"Recall: {recall}")

In [None]:
# import pandas as pd

# # Assuming X_test is your test features DataFrame
# # And comparison_df is your DataFrame containing actual and predicted values

# # Ensure the indices match
# comparison_df.index = X_test.index

# # Concatenate X_test with comparison_df
# merged_df = pd.concat([X_test, comparison_df], axis=1)

# # Display the first few rows to verify
# merged_df.head()


In [None]:
# import pandas as pd
# from sklearn.metrics import precision_score, recall_score

# # Assuming y_test and y_pred are your actual and predicted values respectively
# # And comparison_df is your DataFrame containing actual and predicted values

# # Ensure y_test and y_pred are integers representing class labels
# y_test_multiclass = y_test.astype(int)
# y_pred_multiclass = y_pred.astype(int)

# # Calculate precision and recall for multiclass
# precision = precision_score(y_test_multiclass, y_pred_multiclass, average='macro')
# recall = recall_score(y_test_multiclass, y_pred_multiclass, average='macro')

# # Add precision and recall as columns to comparison_df
# comparison_df['PRECISION'] = precision
# comparison_df['RECALL'] = recall

# # Display the first few rows to verify
# comparison_df.head()


In [None]:
# # Assuming the one-hot encoded columns are in merged_df
# # Create a mapping for distributor codes and categories
# distributor_columns = ['DISTRIBUTOR_CODE_DB0110', 'DISTRIBUTOR_CODE_DB0209', 'DISTRIBUTOR_CODE_DB0652', 'DISTRIBUTOR_CODE_DB0655', 'DISTRIBUTOR_CODE_DB0706']
# category_columns = ['CATEGORY_DENTAL', 'CATEGORY_HAIR_CARE', 'CATEGORY_KIDS_CARE', 'CATEGORY_LOTION', 'CATEGORY_PERFUME_AND_DEODRANTS', 'CATEGORY_SOAP', 'CATEGORY_WIPES']

# # Decode distributor codes
# merged_df['DISTRIBUTOR_CODE'] = merged_df[distributor_columns].idxmax(axis=1).str.replace('DISTRIBUTOR_CODE_', '')

# # Decode categories
# merged_df['CATEGORY'] = merged_df[category_columns].idxmax(axis=1).str.replace('CATEGORY_', '').str.replace('_', ' ')

# # Drop the one-hot encoded columns if no longer needed
# merged_df.drop(columns=distributor_columns + category_columns, inplace=True)

# # Display the first few rows to verify
# merged_df.head()


In [None]:
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder

# # Assuming df_final is your DataFrame before encoding
# # Initialize the label encoder
# label_encoders = {}

# # List of columns to encode
# columns_to_encode = ['BRAND', 'PRODUCT_CODE', 'SUBCATEGORY', 'CITY', 'STATE', 'COUNTY']

# # Fit the label encoders with the original data
# for column in columns_to_encode:
#     le = LabelEncoder()
#     le.fit(df_final[column])
#     label_encoders[column] = le

# # Inverse transform the encoded columns in merged_df
# merged_df['BRAND'] = label_encoders['BRAND'].inverse_transform(merged_df['BRAND_ENCODED'])
# merged_df['PRODUCT_CODE'] = label_encoders['PRODUCT_CODE'].inverse_transform(merged_df['PRODUCT_CODE_ENCODED'])
# merged_df['SUBCATEGORY'] = label_encoders['SUBCATEGORY'].inverse_transform(merged_df['SUBCATEGORY_ENCODED'])
# merged_df['CITY'] = label_encoders['CITY'].inverse_transform(merged_df['CITY_ENCODED'])
# merged_df['STATE'] = label_encoders['STATE'].inverse_transform(merged_df['STATE_ENCODED'])
# merged_df['COUNTY'] = label_encoders['COUNTY'].inverse_transform(merged_df['COUNTY_ENCODED'])

In [None]:
# # List of columns to drop
# columns_to_drop = ['BRAND_ENCODED', 'PRODUCT_CODE_ENCODED', 'SUBCATEGORY_ENCODED', 'CITY_ENCODED', 'STATE_ENCODED', 'COUNTY_ENCODED']

# # Drop the specified columns
# merged_df = merged_df.drop(columns=columns_to_drop)

# merged_df = merged_df.merge(df_final_with_clusters[['CITY', 'STATE', 'COUNTY', 'OUTLET_CODE']], on=['CITY', 'STATE', 'COUNTY'], how='left')

# merged_df.head()


In [None]:
### Create a future dataset covering every possible outlet code / product code combination

In [None]:
import pandas as pd
import itertools

# Distinct outlet and product codes present in the loaded data.
outlet_codes = df['OUTLET_CODE'].unique()
product_codes = df['PRODUCT_CODE'].unique()

# Cartesian product: each outlet paired with each product, outlets varying slowest.
combinations = [pair for pair in itertools.product(outlet_codes, product_codes)]

# One row per (outlet, product) pair.
future_df = pd.DataFrame(combinations, columns=['OUTLET_CODE', 'PRODUCT_CODE'])

# Display the first few rows to verify
print(future_df.head())



In [None]:
# Group by OUTLET_CODE and count unique DISTRIBUTOR_CODE
distributor_counts = df.groupby('OUTLET_CODE')['DISTRIBUTOR_CODE'].nunique().reset_index()

# Rename the column for clarity
distributor_counts.rename(columns={'DISTRIBUTOR_CODE': 'UNIQUE_DISTRIBUTOR_COUNT'}, inplace=True)

# Determine if each OUTLET_CODE has multiple or a single DISTRIBUTOR_CODE
distributor_counts['DISTRIBUTOR_TYPE'] = distributor_counts['UNIQUE_DISTRIBUTOR_COUNT'].apply(lambda x: 'Multiple' if x > 1 else 'Single')
distributor_counts.head()


In [None]:
print(distributor_counts['UNIQUE_DISTRIBUTOR_COUNT'].nunique())

In [None]:
future_df.info()

In [None]:
future_df = future_df.merge(df_final_with_clusters[['OUTLET_CODE', 'CLUSTER']], on='OUTLET_CODE', how='left')

In [None]:
future_df = future_df.drop(columns=['OUTLET_CODE'])

In [None]:
future_df.info()

In [None]:
future_df= future_df.drop_duplicates()

In [None]:
product_cluster_df = df_final_with_clusters[['PRODUCT_CODE', 'CLUSTER']]

In [None]:
product_cluster_df.info()

In [None]:
# import pandas as pd

# # Assuming df and future_df are your DataFrames

# # Merge the DISTRIBUTOR_CODE column from df with future_df
# future_df = future_df.merge(df[['OUTLET_CODE', 'DISTRIBUTOR_CODE']], on='OUTLET_CODE', how='left')
# future_df = future_df.merge(df_final[['PRODUCT_CODE', 'UNIT_PTR']], on='PRODUCT_CODE', how='left')

# # Display the first few rows to verify
# print(future_df.head())

In [None]:
# # Merge the CLUSTER column from df with future_df
# future_df = future_df.merge(df_final_with_clusters[['OUTLET_CODE','CLUSTER']], on='OUTLET_CODE', how='left')

# # Display the first few rows to verify
# print(future_df.head())