In [None]:
pip install fosforml

In [None]:
# Data handling plus the fosforml helper that provides the session object.
import pandas as pd
import fosforml
from fosforml.model_manager.snowflakesession import get_session
# Session used by the SQL query and the connection look-ups in later cells.
my_session = get_session()
# Numerics, modelling and plotting libraries used further down.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# NOTE(review): seaborn is imported here, but the cell that installs it comes later in the notebook.
import seaborn as sns
from scipy import stats

In [None]:
pip install seaborn

In [None]:
import seaborn as sns

In [None]:
# Display the `database` attribute of the session's connection (presumably the active database).
my_session.connection.database

In [None]:
# Display the `schema` attribute of the session's connection (presumably the active schema).
my_session.connection.schema

In [None]:
# Name of the table queried in the next cell (three dot-separated parts; presumably database.schema.table).
table_name = "ASSORTMENT_PLANNING.CPG_BRONZE.SALES_CLEAN"

In [None]:
# Select every column of the configured table through the session.
sf_df = my_session.sql(f"select * from {table_name}")

In [None]:
# Inspect what kind of object the session returned for the query.
type(sf_df)

In [None]:
# Convert the query result to a pandas object via its to_pandas() method; `df` is the working frame for all later cells.
df=sf_df.to_pandas()

In [None]:
# Column dtypes of the loaded frame.
df.dtypes

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Share of missing values per column, expressed in percent.
missing_percentage = df.isna().sum().div(len(df)).mul(100)
missing_percentage

In [None]:
# Summary statistics of the sales columns for rows that have no DISTRIBUTOR_CODE.
# Author's observation (depends on the data, not on the code): no transactions occur
# without a distributor, and these rows show no outliers/NULLs in sales.
df[df["DISTRIBUTOR_CODE"].isna()][["SALES_VOLUME", "SALES_UNITS"]].describe()

In [None]:
# Summary statistics of the sales columns for rows that have no OUTLET_CODE.
# Author's observation (depends on the data, not on the code): no such transactions
# show outliers in sales. The original comment said "distributer" here; presumably
# "outlet" was meant - confirm.
df[df["OUTLET_CODE"].isna()][["SALES_VOLUME", "SALES_UNITS"]].describe()

In [None]:
# Number of distinct OUTLET_CODE values.
df.OUTLET_CODE.nunique()

In [None]:
# Number of distinct DISTRIBUTOR_CODE values.
df.DISTRIBUTOR_CODE.nunique()

In [None]:
# Count negative entries in every numeric column. A column of zeros means no negative
# sales were recorded. Checking the raw values avoids comparing summary statistics
# (count, std, percentiles) against zero, which says nothing about individual rows.
(df.select_dtypes(include="number") < 0).sum()

In [None]:
# Absolute number of missing values per column.
df.isnull().sum()

In [None]:
# Number of rows flagged as duplicates of another row.
df.duplicated().sum()

In [None]:
# Pairwise correlation matrix restricted to the numeric columns.
df.corr(numeric_only=True)

In [None]:

# Total SALES_VOLUME per CATEGORY, keeping the 30 categories with the highest totals.
productby_sales = df.groupby('CATEGORY')['SALES_VOLUME'].sum().sort_values(ascending=False).head(30)

# Bar chart of the totals; labels name what is actually plotted (categories, sales volume).
fig, ax = plt.subplots(figsize=(20, 5))
sns.barplot(x=productby_sales.index, y=productby_sales.values, palette="Purples_r", ax=ax)
ax.set_xlabel("Category")
ax.set_ylabel("Total Sales Volume")
ax.set_title("Which categories have the highest sales volume?")
ax.tick_params(axis="x", labelrotation=90)
plt.show()


In [None]:

# Total SALES_UNITS per CATEGORY, keeping the 30 categories with the highest totals.
productby_qty = df.groupby('CATEGORY')['SALES_UNITS'].sum().sort_values(ascending=False).head(30)

# Bar chart of the totals; the y axis shows units sold, not sales value.
fig, ax = plt.subplots(figsize=(20, 5))
sns.barplot(x=productby_qty.index, y=productby_qty.values, palette="Greens_r", ax=ax)
ax.set_xlabel("Category")
ax.set_ylabel("Total Units Sold")
ax.set_title("Which categories sold the most units?")
ax.tick_params(axis="x", labelrotation=90)
plt.show()


In [None]:
# Number of distinct COUNTY values.
df.COUNTY.nunique()

In [None]:
# Count the distinct TRANS_DATE values (the number of unique dates, not the dates themselves).
df['TRANS_DATE'].nunique()

In [None]:
# Row counts for the 20 most frequent STATE values, drawn with a log-scaled y axis.
state_counts = df["STATE"].value_counts().sort_values(ascending=False).head(20)

plt.figure(figsize=(20, 5))
sns.barplot(x=state_counts.index, y=state_counts.values, palette="Blues_r")
plt.title("Which State made the most transactions?")
plt.ylabel("Counts")
plt.xticks(rotation=90)
plt.yscale("log")

In [None]:
# Percentage of all rows whose STATE is "California".
len(df[df["STATE"] == "California"]) / len(df) * 100

In [None]:
# Parse both date columns into pandas datetimes (overwrites the columns in place).
for date_col in ('TRANS_DATE', 'START_DATE'):
    df[date_col] = pd.to_datetime(df[date_col])

In [None]:
# Calendar features derived from TRANS_DATE, added to df as new columns.
trans_dt = df['TRANS_DATE'].dt
df['YEAR'] = trans_dt.year
df['MONTH'] = trans_dt.month
df['DAY'] = trans_dt.day
df['DAY_OF_WEEK'] = trans_dt.dayofweek + 1  # shift 0-6 to 1-7: Monday is 1, Sunday is 7
df['QUARTER'] = trans_dt.quarter
df['DAY_OF_YEAR'] = trans_dt.day_of_year

In [None]:
# Summary statistics of df, which now also contains the derived date features.
df.describe()

In [None]:
# De-duplicated frame used by the quarter/month plots; df itself is left unchanged.
df_unique = df.drop_duplicates()

In [None]:
import matplotlib.pyplot as plt

# One bar chart of total SALES_UNITS per period, first by quarter and then by month.
for period_col, axis_label, bar_color in (
    ('QUARTER', 'Quarter', 'orange'),
    ('MONTH', 'Month', 'skyblue'),
):
    plt.figure(figsize=(10, 5))
    period_totals = df_unique.groupby(period_col)['SALES_UNITS'].sum()
    period_totals.plot(kind='bar', color=bar_color)
    plt.title(f'SALES_UNITS over {period_col}')
    plt.xlabel(axis_label)
    plt.ylabel('SALES_UNITS')
    plt.grid(True)
    plt.show()


In [None]:
# Summary statistics of SALES_UNITS.
# Author's observation from the output: most products are sold in quantities from 1 to 16.
df.SALES_UNITS.describe()

In [None]:
# Distribution of SALES_UNITS: raw values with log-scaled counts (left) next to the
# natural log of the values (right).
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
units = df["SALES_UNITS"].dropna()
# The logarithm is undefined for zero or negative quantities (it yields -inf/NaN, which
# breaks the histogram binning), so only strictly positive values are transformed.
log_units = np.log(units[units > 0])
# histplot replaces the deprecated distplot(kde=False).
sns.histplot(units, ax=ax[0], color="limegreen")
sns.histplot(log_units, ax=ax[1], bins=20, color="limegreen")
ax[0].set_title("Quantity distribution")
ax[0].set_yscale("log")
ax[1].set_title("Log-Quantity distribution")
ax[1].set_xlabel("Natural-Log Quantity");

In [None]:
import datetime
      
# Both values are parsed from the same literal, so startdate == enddate here.
# NOTE(review): the following cell reassigns startdate and enddate before they are used,
# so this cell looks like leftover code - confirm whether it can be removed.
startdate = datetime.datetime.strptime('2011-11-01', "%Y-%m-%d").date()
enddate = datetime.datetime.strptime('2011-11-01', "%Y-%m-%d").date()

In [None]:
import pandas as pd

# Cut-off dates, kept as strings so they are easy to edit.
# NOTE(review): as written, the "start" date (2024-01-01) is later than the "end" date
# (2023-08-29). Rows dated between the two end up in neither split, and nothing here
# guarantees a 1-month / 11-month division - confirm the intended boundaries.
startdate_str = '2024-01-01'  # test split: rows strictly after this date
enddate_str = '2023-08-29'    # train split: rows on or before this date

startdate, enddate = (pd.to_datetime(text) for text in (startdate_str, enddate_str))

# Date-based split of df into test and train frames.
df_test = df.loc[df['TRANS_DATE'].gt(startdate)]
df_train = df.loc[df['TRANS_DATE'].le(enddate)]


In [None]:
# (rows, columns) of the training split.
df_train.shape

In [None]:
# (rows, columns) of the test split.
df_test.shape

In [None]:
# Preview the first rows of the training split.
df_train.head()

In [None]:
# Columns excluded from the model inputs. SALES_UNITS is the prediction target, so it
# is removed from the features as well; leaving it in would leak the answer to the model.
# NOTE(review): OUTLET_CODE and CATEGORY stay in the features; if they are non-numeric
# they must be encoded before an sklearn estimator can be fitted - confirm.
non_feature_columns = [
    'MNTH_CODE', 'TRANS_DATE', 'START_DATE', 'SALES_PTR_VALUE', 'OC_CODE',
    'DISTRIBUTOR_CODE', 'CITY', 'STATE', 'COUNTY', 'STREET', 'PRODUCT_CODE',
    'SUBCATEGORY', 'BRAND', 'DAY', 'DAY_OF_WEEK', 'DAY_OF_YEAR',
    'SALES_UNITS',
]

X_train = df_train.drop(columns=non_feature_columns)
y_train = df_train['SALES_UNITS']

# The evaluation data comes from the held-out df_test rows, never from the training rows.
X_test = df_test.drop(columns=non_feature_columns)
y_test = df_test['SALES_UNITS']



In [None]:
from sklearn import preprocessing

# Fit a label encoder on OUTLET_CODE and display the value -> integer code mapping.
# Only the mapping is shown; df is not modified by this cell.
col_1 = df['OUTLET_CODE']
le = preprocessing.LabelEncoder()
le.fit(col_1)
set_col_1 = list(set(col_1))
dict(zip(set_col_1, le.transform(set_col_1)))

In [None]:
from sklearn import preprocessing

# Fit a label encoder on CATEGORY and display the value -> integer code mapping.
# Only the mapping is shown; df is not modified by this cell.
col_2 = df['CATEGORY']
le = preprocessing.LabelEncoder()
le.fit(col_2)
set_col_2 = list(set(col_2))
dict(zip(set_col_2, le.transform(set_col_2)))

In [None]:
# Column labels of the training features.
X_train.columns

In [None]:
# Column labels of the test features.
X_test.columns

In [None]:
# Print the shapes first, then a preview of the training features and target.
train_parts = (("X_train", X_train), ("y_train", y_train))

for part_name, part in train_parts:
    print(f"{part_name} shape:", part.shape)

for part_name, part in train_parts:
    print(f"{part_name} head:\n", part.head())

In [None]:
# Empty frame for collecting model scores; its index is labelled 'Scoring'.
score = pd.DataFrame()
score.index.name = 'Scoring'
# Timing column, created empty for now.
score['Training+Test Time(sec)'] = None
# NOTE(review): neither `score` nor `time_Frame` is filled or read anywhere in the visible cells.
time_Frame = []

In [None]:
# Column dtypes of df after the date parsing and feature creation cells.
df.dtypes

In [None]:
# Data splitting check: print the shape of every train/test piece.
for part_name, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{part_name} shape:", part.shape)

In [None]:
# Verify data loading: show the first five rows of the training features and target.
print("First few rows of X_train:\n", X_train.head(5))
print("First few rows of y_train:\n", y_train.head(5))

In [None]:
#Random Forest
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 50, random_state = 0)
%time regressor.fit(X_train, y_train)
%time regressor.fit(X_test, y_test)
y_pred_train = regressor.predict(X_train)
print('Train RSME :', np.sqrt(mean_squared_error(y_train, y_pred_train)))


y_pred_test = regressor.predict(X_test)
print('Test RSME :', np.sqrt(mean_squared_error(y_test, y_pred_test)))

In [None]:
# Column labels of df.
df.columns

In [None]:
# Column dtypes of df, shown again ahead of the second modelling pass.
df.dtypes

In [None]:
from sklearn import preprocessing

# Fit a label encoder on OUTLET_CODE and display the value -> integer code mapping.
# Only the mapping is shown; df is not modified by this cell.
# NOTE(review): this repeats an earlier cell of the notebook verbatim.
col_1 = df['OUTLET_CODE']
le = preprocessing.LabelEncoder()
le.fit(col_1)
set_col_1 = list(set(col_1))
dict(zip(set_col_1, le.transform(set_col_1)))

In [None]:
from sklearn import preprocessing

# Fit a label encoder on CATEGORY and display the value -> integer code mapping.
# Only the mapping is shown; df is not modified by this cell.
# NOTE(review): this repeats an earlier cell of the notebook verbatim.
col_2 = df['CATEGORY']
le = preprocessing.LabelEncoder()
le.fit(col_2)
set_col_2 = list(set(col_2))
dict(zip(set_col_2, le.transform(set_col_2)))

In [None]:
# Display the CATEGORY column.
df.CATEGORY

In [None]:
# Target: SALES_VOLUME, the column removed from the features and the quantity the
# evaluation cells report on. A regressor cannot be trained on the CATEGORY labels.
y = df['SALES_VOLUME']

# Features: every remaining numeric column. The datetime and text columns are left out
# because the random-forest regressor only accepts numeric input.
# NOTE(review): encode OUTLET_CODE / CATEGORY into numeric columns first if they are
# meant to be model inputs.
X = df.drop(columns='SALES_VOLUME').select_dtypes(include='number')

In [None]:
from sklearn.model_selection import train_test_split

# Random 70/30 split of X and y with a fixed seed; replaces the earlier date-based
# X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 1000-tree random forest with a fixed seed, fitted on the training split.
# fit() returns the estimator itself, so the fitted model is bound and then displayed.
regressor = RandomForestRegressor(n_estimators=1000, random_state=42).fit(X_train, y_train)
regressor

In [None]:
# Predictions of the fitted regressor for the test features.
y_pred = regressor.predict(X_test)

In [None]:
# Side-by-side view of actual and predicted values. A separate name is used so the main
# dataset `df` is not overwritten, which would break re-running any earlier cell.
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# Show a sample only instead of dumping the whole frame.
predictions_df.head(10)

In [None]:
from sklearn import metrics

# Error metrics of the test predictions: MAE, MSE and RMSE (the square root of the MSE).
# Author's remark from the original cell, not verifiable from the code: the error is less
# than 1 percent of the mean of the 'sales_volume' column.
mae_value = metrics.mean_absolute_error(y_test, y_pred)
mse_value = metrics.mean_squared_error(y_test, y_pred)
for metric_label, metric_value in (
    ('Mean Absolute Error:', mae_value),
    ('Mean Squared Error:', mse_value),
    ('Root Mean Squared Error:', np.sqrt(mse_value)),
):
    print(metric_label, metric_value)

In [None]:
# Accuracy of the prediction, expressed as 100 - MAPE.
# Absolute error of every test prediction (aligned on the index of y_test).
errors = abs(y_pred - y_test)
# Mean absolute error, in the units of the target (no temperature unit applies here).
print('Mean Absolute Error:', round(np.mean(errors), 2))

# Mean absolute percentage error (MAPE). It is undefined where the actual value is 0
# (division by zero gives inf/NaN), so those rows are excluded; the magnitude of the
# actual value is used as the denominator so negative actuals cannot flip the sign.
nonzero_actual = y_test != 0
mape = 100 * (errors[nonzero_actual] / y_test[nonzero_actual].abs())
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
import seaborn as sns

# Density curves of the actual and the predicted values on shared axes.
plt.figure(figsize=(5, 7))

# kdeplot replaces the deprecated distplot(hist=False).
ax = sns.kdeplot(x=y_test, color="r", label="Actual Value")
sns.kdeplot(x=y_pred, color="y", label="Fitted Values", ax=ax)

plt.title('Actual vs Predicted Values Sales Volume')
# Without a legend the two labelled curves cannot be told apart.
plt.legend()

plt.show()
plt.close()