# Import Libraries

In [None]:
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import tqdm as tqdm
import warnings

warnings.filterwarnings('ignore')

# Data Preparation & Exploratory Data Analysis (EDA)

In [None]:
# Read csv file into Pandas DataFrame

data = pd.read_csv('dataCo.csv', encoding='ISO-8859-1')
data.head().round(2)

In [None]:
# General information about each data in the dataset

data.info()

In [None]:
# Display statistical information for numerical variables

data.describe()

In [None]:
# Check missing values
data.isnull().sum()

In [None]:
numCols = data.select_dtypes(include=['int64', 'float64']).columns

In [None]:
for numericCol in numCols:
    # One figure per numeric column: histogram with a KDE curve on top
    _, histAxis = plt.subplots(figsize=(8, 4))
    sns.histplot(data[numericCol], kde=True, ax=histAxis)
    histAxis.set_title(f'Histogram of {numericCol}')
    plt.show()

In [None]:
catCols = data.select_dtypes(include=['object']).columns

In [None]:
for categoryCol in catCols:
    # Bars follow the order returned by value_counts() for the column
    barOrder = data[categoryCol].value_counts().index
    plt.figure(figsize=(8, 4))
    sns.countplot(x=categoryCol, data=data, order=barOrder)
    plt.xticks(rotation=45)
    plt.title(f'Count of {categoryCol}')
    plt.show()


In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap
correlationmatrix = data[numCols].corr()
_, heatmapAxis = plt.subplots(figsize=(30, 18))
sns.heatmap(correlationmatrix, annot=True, cmap="coolwarm", ax=heatmapAxis)
heatmapAxis.set_title("Correlation Heatmap")
plt.show()

# Data Preprocessing

## Remove Unrelated Columns

In [None]:
# List of columns to drop
colsToDrop = [
    'Type',
    'Days for shipment (scheduled)',
    'Days for shipping (real)',
    'Delivery Status',
    'Late_delivery_risk',
    'Category Id',
    'Customer City',
    'Customer Country',
    'Customer Email',
    'Customer Fname',
    'Customer Id',
    'Customer Lname',
    'Customer Password',
    'Customer State',
    'Customer Street',
    'Customer Zipcode',
    'Department Id',
    'Department Name',
    'Latitude',
    'Longitude',
    'Order City',
    'Order Country',
    'Order Item Cardprod Id',
    'Order Item Id',
    'Order State',
    'Product Description',
    'Product Image',
    'Product Status',
    'Order Item Product Price',
    'Order Item Total',
    'Shipping Mode',
    'Order Status',
    'Order Zipcode',
    'shipping date (DateOrders)',
    'Customer Segment',
    'Market',
    'Order Item Discount',
    'Order Item Discount Rate',
    'Order Region',
    'Product Card Id',
    'Product Category Id',
    'Product Name',
    'Product Price'
]
# Build the working DataFrame without the columns named in colsToDrop
newData = data.drop(colsToDrop, axis=1)
newData.head().round(2)

# Data Transformation

In [None]:
newData['order date (DateOrders)'] = pd.to_datetime(newData['order date (DateOrders)'])
newData.info()

In [None]:
# Find the date of most recent purchase - for Recency

newData['order date (DateOrders)'].max()

In [None]:
# Set present date as the day after the most recent purchase (at midnight).
# It is derived from the data rather than hard-coded, so it stays correct
# if the dataset is refreshed.

present = (newData['order date (DateOrders)'].max().normalize() + pd.Timedelta(days=1)).to_pydatetime()

# Customer Segmentation

### Recency = No. of days between 1/2/2018 and date of last purchase (per customer)
### Frequency = No. of orders per customer
### Monetary = Total purchase price per customer

In [None]:
# Recency = Subtract maximum date from 1/2/2018 -> result converted into number of days since last purchase
# Frequency = Count number of orders for each customer
# Monetary = Sum of all prices paid by each customer
# Convert columns into RFM aspects

rfm = newData.groupby('Order Customer Id').agg({
    # Recency: whole days between the reference date and the customer's latest order
    'order date (DateOrders)': lambda x: (present - x.max()).days,
    # Frequency: count DISTINCT orders. The raw data carries 'Order Item Id', so an
    # order can span several rows; counting rows would count items, not orders.
    'Order Id': 'nunique',
    # Monetary: total amount paid by the customer
    'Sales per customer': 'sum',
})

rfm['order date (DateOrders)'] = rfm['order date (DateOrders)'].astype(int)

rfm = rfm.rename(columns={'order date (DateOrders)': 'Recency (R)',
                          'Order Id': 'Frequency (F)',
                          'Sales per customer': 'Monetary (M)'})

In [None]:
# Compute the 20% / 40% / 60% / 80% quantile cut points that split each RFM metric into five bins

# Only the three RFM metrics are used, so this cell can be re-run safely after the
# (non-numeric) score and segment columns have been added to rfm
quantiles = rfm[['Recency (R)', 'Frequency (F)', 'Monetary (M)']].quantile(q=[0.2,0.4,0.6,0.8])
quantiles = quantiles.to_dict()
quantiles

In [None]:
# Define functions for score assignment

# Recency is best at minimum so 1st percentile = 5
def recency(x, y, z):
    """Return the recency score (5 down to 1) for the value ``x``.

    ``z[y]`` must map the quantile levels 0.20, 0.40, 0.60 and 0.80 to their
    cut points. A value at or below the 0.20 cut point scores 5; each later
    cut point scores one less, and anything above the 0.80 cut point scores 1.
    """
    for level, score in ((0.20, 5), (0.40, 4), (0.60, 3), (0.80, 2)):
        if x <= z[y][level]:
            return score
    return 1
    
# F and M are better when higher, so the lowest quintile scores 1
def frequencyAndMonetary(a, b, c):
    """Return the frequency / monetary score (1 up to 5) for the value ``a``.

    ``c[b]`` must map the quantile levels 0.20, 0.40, 0.60 and 0.80 to their
    cut points. The score is the 1-based position of the first cut point that
    ``a`` does not exceed; a value above every cut point scores 5.
    """
    cutPoints = c[b]
    levels = (0.20, 0.40, 0.60, 0.80)
    return next(
        (score for score, level in enumerate(levels, start=1) if a <= cutPoints[level]),
        5,
    )

In [None]:
# Add one 1-to-5 score column per RFM metric, each scored against that metric's own cut points
for scoreCol, metricCol, scorer in (
    ('R Score', 'Recency (R)', recency),
    ('F Score', 'Frequency (F)', frequencyAndMonetary),
    ('M Score', 'Monetary (M)', frequencyAndMonetary),
):
    rfm[scoreCol] = rfm[metricCol].apply(scorer, args=(metricCol, quantiles))

rfm

In [None]:
# Combined RFM score: the three scores written side by side as text (R, then F, then M)
scoreText = rfm[['R Score', 'F Score', 'M Score']].astype(str)
rfm['RFM Score'] = scoreText['R Score'] + scoreText['F Score'] + scoreText['M Score']
rfm

# Exploratory Data Analysis (EDA)

In [None]:
# Plot the distribution of each RFM attribute in three stacked panels

plt.figure(figsize=(8, 6))

rfmPanels = (
    ('Recency (R)', 'Recency'),
    ('Frequency (F)', 'Frequency'),
    ('Monetary (M)', 'Monetary'),
)
for panelNumber, (metricCol, axisLabel) in enumerate(rfmPanels, start=1):
    plt.subplot(3, 1, panelNumber)
    sns.histplot(rfm[metricCol], kde=True)
    plt.xlabel(axisLabel)

plt.tight_layout()
plt.show()


# Model Building - RFM

In [None]:
# Print all unique values and total count

count=rfm['RFM Score'].unique()

print(count)

len(count)

In [None]:
# Total Score = R Score + F Score + M Score (used for segmentation)

rfm['Total Score'] = rfm['R Score'] + rfm['F Score'] + rfm['M Score']
rfm['Total Score'].unique()

In [None]:
# Perform segmentation based on the total RFM score.
# Three scores of 1-5 each give a total between 3 and 15.
# Assign customers into 3 segments - high, mid, low value
# 3 - 6 -> low value customers
# 7 - 9 -> mid value customers
# 10 - 15 -> high value customers

def RFMSegmentation(df):
    """Return the segment label for one row of the RFM table.

    Parameters
    ----------
    df : mapping or pandas.Series
        A single row exposing the key 'Total Score'.

    Returns
    -------
    str
        'Low Value Customers' (3-6), 'Mid Value Customers' (7-9),
        'High Value Customers' (10-15), or 'Invalid score' for anything else.
    """
    totalScore = df['Total Score']
    if 3 <= totalScore <= 6:
        return 'Low Value Customers'
    if 7 <= totalScore <= 9:
        return 'Mid Value Customers'
    # The upper bound is 15 (5 + 5 + 5), so the best possible customers
    # (totals of 14 and 15) are no longer reported as 'Invalid score'
    if 10 <= totalScore <= 15:
        return 'High Value Customers'
    return 'Invalid score'

In [None]:
# Create new column indicating each customer segments based on RFM

rfm['Customer Segmentation'] =rfm.apply(RFMSegmentation, axis=1)
rfm.head().round(2)

In [None]:
# Plot customer segment distribution in a pie chart.
# No `explode` argument is passed: an all-zero explode changes nothing, and a fixed
# 3-element tuple fails as soon as the number of segments present is not exactly three.

rfm['Customer Segmentation'].value_counts().plot.pie(figsize=(6,6), startangle=0,
                                                     autopct='%.1f%%',shadow=False, colormap ='coolwarm')
plt.title("Customer Segmentation (RFM Model)", size=10, fontweight = 'bold')
plt.ylabel(" ")
plt.axis('equal')
plt.show()

# Model Building - K-Means Clustering

In [None]:
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
# Standardize RFM values

scaler = StandardScaler()
rfmScaled = scaler.fit_transform(rfm[['Recency (R)', 'Frequency (F)', 'Monetary (M)']])

In [None]:
# Construct the elbow chart used to choose the number of clusters k

inertia = []
kValues = range(1, 11)
for k in kValues:
    # Fixed seed and an explicit number of restarts make the curve reproducible between runs
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(rfmScaled)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(8, 6))
plt.plot(kValues, inertia, marker='o', linestyle='-')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal K')
plt.grid(True)
plt.show()

In [None]:
# Create K-means clustering model with 3 clusters.
# The seed is fixed so the cluster ids (0, 1, 2) refer to the same groups on every run;
# later cells address the clusters by these ids.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit model to the scaled data
kmeans.fit(rfmScaled)

# Get the cluster labels
cluster_labels = kmeans.labels_

# Create new column in rfm named Cluster
rfm['Cluster'] = cluster_labels

rfm.head()


In [None]:
# 3D plot containing all three attributes of RFM.
# Cluster ids are nominal labels, so they are passed as strings to get one discrete
# colour per cluster instead of a continuous colour bar.

fig = px.scatter_3d(rfm, x='Monetary (M)', y='Recency (R)', z='Frequency (F)',
                    color=rfm['Cluster'].astype(str), opacity=0.9, size_max=4)

fig.update_layout(title='Clustering of Recency, Frequency, and Monetary',
                  legend_title_text='Cluster',
                  scene=dict(xaxis_title='Monetary',
                             yaxis_title='Recency',
                             zaxis_title='Frequency'))
fig.show()

In [None]:
clusterCounts = rfm['Cluster'].value_counts()

# Pie chart of the number of customers in each cluster
_, clusterPieAxis = plt.subplots(figsize=(8, 6))
clusterPieAxis.pie(clusterCounts, labels=clusterCounts.index, autopct='%1.1f%%', startangle=140)
clusterPieAxis.set_title('Distribution of Clusters')
clusterPieAxis.axis('equal')
plt.show()


In [None]:
# Evaluation of clusters

silhouette = silhouette_score(rfmScaled, cluster_labels)
print("Silhouette Score:", silhouette)

In [None]:
# Merge results with dataset for further interpretation

rfmMerged = pd.merge(newData, rfm[['Customer Segmentation', 'Cluster']], left_on='Order Customer Id', right_index=True, how='right')
rfmMerged.round(2).head()

In [None]:
rfmMerged.to_parquet('rfmMerged.parquet')
rfmMerged.to_csv('rfmMerged.csv')

# Trend Analysis

In [None]:
# Copy rfmMerged into a new DataFrame so the merged data itself stays untouched
forecastData = rfmMerged.copy()
# normalize() drops the time of day but keeps a real datetime column
# (.dt.date would produce Python date objects in an object-dtype column)
forecastData['orderDate'] = forecastData['order date (DateOrders)'].dt.normalize()
forecastData.head().round(2)

In [None]:
forecastData.info()

In [None]:
# Plot monthly profit per product category over time
# One chart per cluster, one line per category

import plotly.graph_objects as go

forecastData['orderDate'] = pd.to_datetime(forecastData['orderDate'])

clusters = forecastData['Cluster'].unique()

for clusterValue in clusters:
    belongsToCluster = forecastData['Cluster'] == clusterValue
    clusterData = forecastData[belongsToCluster]

    # Sum 'Benefit per order' per category and calendar month, then spread categories into columns
    monthlyProfit = clusterData.groupby(['Category Name', pd.Grouper(key='orderDate', freq='M')])['Benefit per order'].sum()
    groupedData = monthlyProfit.reset_index()
    pivotData = groupedData.pivot(index='orderDate', columns='Category Name', values='Benefit per order')

    fig = go.Figure()
    for column, categorySeries in pivotData.items():
        fig.add_trace(go.Scatter(x=categorySeries.index, y=categorySeries, mode='lines', name=column))

    fig.update_layout(
        title=f'Profit by Category Over Time (Cluster {clusterValue})',
        xaxis={'title': 'Order Date', 'tickangle': 45},
        yaxis={'title': 'Profit'},
        legend={'title': 'Category'},
        width=1350,
        height=700,
    )

    fig.show()


In [None]:
# Total profit ('Benefit per order') and total sales per cluster and category, side by side
clusterCategoryKeys = ['Cluster', 'Category Name']

sales = forecastData.groupby(clusterCategoryKeys, as_index=False)['Sales per customer'].sum()

clusterCategoryProfit = (
    forecastData.groupby(clusterCategoryKeys, as_index=False)['Benefit per order'].sum()
    .merge(sales, on=clusterCategoryKeys, how='left')
    .rename(columns={'Benefit per order': 'Profit', 'Sales per customer': 'Sales'})
)
clusterCategoryProfit.tail().round(2)

In [None]:
# Split data according to cluster assignment

def splitData(data, cluster):
    """Return the rows of ``data`` whose 'Cluster' column equals ``cluster``."""
    inCluster = data['Cluster'].eq(cluster)
    return data.loc[inCluster]

# One DataFrame per cluster id
cluster0, cluster1, cluster2 = (splitData(forecastData, clusterId) for clusterId in (0, 1, 2))

for clusterId, clusterFrame in enumerate((cluster0, cluster1, cluster2)):
    header = f"Cluster {clusterId}:"
    # Every header after the first is preceded by a blank line
    print(header if clusterId == 0 else "\n" + header)
    print(clusterFrame)

In [None]:
# Function to aggregate profit, remove outliers and add period columns

def processClusterData(clusterData, iqrFactor=1.5):
    """Aggregate one cluster's profit, drop profit outliers and add period columns.

    Parameters
    ----------
    clusterData : pandas.DataFrame
        Must contain 'Cluster', 'Sales per customer', 'Category Name',
        'orderDate' and 'Benefit per order'. The input is not modified.
    iqrFactor : float, default 1.5
        Rows whose profit lies more than ``iqrFactor`` * IQR below the first
        quartile or above the third quartile are removed.

    Returns
    -------
    pandas.DataFrame
        One row per (Cluster, Sales, Category Name, orderDate) with the summed
        'Profit', plus the period columns 'yearMonth', 'yearWeek' and 'yearDay'.
    """
    clusterData = clusterData.groupby(['Cluster', 'Sales per customer','Category Name','orderDate'])['Benefit per order'].sum().reset_index()
    clusterData = clusterData.rename(columns={'Benefit per order': 'Profit', 'Sales per customer': 'Sales'})

    # Calculate and remove outliers using IQR method
    Q1 = clusterData['Profit'].quantile(0.25)
    Q3 = clusterData['Profit'].quantile(0.75)
    IQR = Q3 - Q1
    lowerBound = Q1 - iqrFactor * IQR
    upperBound = Q3 + iqrFactor * IQR
    # .copy() makes the filtered result an independent frame, so the column
    # assignments below never write into a slice of another DataFrame
    clusterData = clusterData[clusterData['Profit'].between(lowerBound, upperBound)].copy()

    # Convert date into year + month, year + week and single-day periods
    clusterData['orderDate'] = pd.to_datetime(clusterData['orderDate'])
    clusterData['yearMonth'] = clusterData['orderDate'].dt.to_period('M')
    clusterData['yearWeek'] = clusterData['orderDate'].dt.to_period('W')
    clusterData['yearDay'] = clusterData['orderDate'].dt.to_period('D')

    return clusterData

# Run the same processing on the data of clusters 0, 1 and 2
cluster0, cluster1, cluster2 = (
    processClusterData(clusterFrame) for clusterFrame in (cluster0, cluster1, cluster2)
)

In [None]:
# Sum profit per category, sales value and week for each cluster
# (as_index=False keeps the grouping keys as ordinary columns of a DataFrame)

weeklyProfitKeys = ['Category Name', 'Sales', 'yearWeek']
cluster0Profit, cluster1Profit, cluster2Profit = (
    clusterFrame.groupby(weeklyProfitKeys, as_index=False)['Profit'].sum()
    for clusterFrame in (cluster0, cluster1, cluster2)
)

## Trend Analysis for Cluster 0

In [None]:
# Keep only categories with at least 14 rows in cluster0Profit,
# as time series decomposition could not be produced for shorter ones

grouped = cluster0Profit.groupby('Category Name')

categoryCounts = grouped.size()

validcategories = categoryCounts.loc[categoryCounts.ge(14)]

hasEnoughRows = cluster0Profit['Category Name'].isin(validcategories.index)
cluster0Profit = cluster0Profit.loc[hasEnoughRows]

In [None]:
# Perform time series decomposition using STL (one decomposition per category of cluster 0)

from statsmodels.tsa.seasonal import STL

trendList = []

for category_name, groupData in cluster0Profit.groupby('Category Name'):
    # Copy first: the group is a slice of cluster0Profit and must not be mutated in place
    groupData = groupData.copy()
    groupData['yearWeek'] = groupData['yearWeek'].dt.to_timestamp()
    groupData = groupData.set_index('yearWeek')
    # NOTE(review): cluster0Profit is grouped by Sales as well as by week, so a week can
    # occur several times; only the first row per week survives this line - confirm that
    # dropping the other rows (rather than summing them) is intended
    groupData = groupData[~groupData.index.duplicated()]
    # Resample to daily frequency and forward-fill the new rows
    # (.ffill() replaces the deprecated fillna(method='ffill'))
    groupData = groupData.resample('D').asfreq().ffill()

    # Perform STL decomposition
    stl = STL(groupData['Profit'], seasonal=7)  # seasonal parameter depends on seasonality of data
    result = stl.fit()

    # Save the decomposed components
    trend = result.trend
    seasonal = result.seasonal
    residual = result.resid

    # Determine trend type from the average day-to-day change of the trend component
    meanSlope = trend.diff().dropna().mean()
    if meanSlope > 0.1:
        trendType = 'Increasing'
    elif meanSlope < -0.1:
        trendType = 'Decreasing'
    else:
        trendType = 'Stable'

    # Store the trend, trend type, seasonal, and residual data
    trendData = pd.DataFrame({
        'Trend': trend,
        'Trend Type': trendType,
        'Seasonal': seasonal,
        'Residual': residual
    })

    combinedData = pd.concat([groupData, trendData], axis=1)
    trendList.append(combinedData)

cluster0Trend = pd.concat(trendList)
cluster0Trend['Cluster'] = 0
# NOTE(review): index=False leaves the yearWeek date index out of this file -
# confirm that consumers of cluster0Trend.csv do not need the dates
cluster0Trend.to_csv('cluster0Trend.csv', index=False)

## Trend Analysis for Cluster 1

In [None]:
# Keep only categories with at least 14 rows in cluster1Profit
grouped = cluster1Profit.groupby('Category Name')

categoryCounts = grouped.size()

validcategories = categoryCounts.loc[categoryCounts.ge(14)]

hasEnoughRows = cluster1Profit['Category Name'].isin(validcategories.index)
cluster1Profit = cluster1Profit.loc[hasEnoughRows]

In [None]:
# STL decomposition per category of cluster 1
trendList = []

for category_name, groupData in cluster1Profit.groupby('Category Name'):
    # Copy first: the group is a slice of cluster1Profit and must not be mutated in place
    groupData = groupData.copy()
    groupData['yearWeek'] = groupData['yearWeek'].dt.to_timestamp()
    groupData = groupData.set_index('yearWeek')
    # NOTE(review): cluster1Profit is grouped by Sales as well as by week, so a week can
    # occur several times; only the first row per week survives this line - confirm that
    # dropping the other rows (rather than summing them) is intended
    groupData = groupData[~groupData.index.duplicated()]
    # Resample to daily frequency and forward-fill the new rows
    # (.ffill() replaces the deprecated fillna(method='ffill'))
    groupData = groupData.resample('D').asfreq().ffill()

    stl = STL(groupData['Profit'], seasonal=7)
    result = stl.fit()

    trend = result.trend
    seasonal = result.seasonal
    residual = result.resid

    # Classify the trend by the average day-to-day change of the trend component
    meanSlope = trend.diff().dropna().mean()
    if meanSlope > 0.1:
        trendType = 'Increasing'
    elif meanSlope < -0.1:
        trendType = 'Decreasing'
    else:
        trendType = 'Stable'

    trendData = pd.DataFrame({
        'Trend': trend,
        'Trend Type': trendType,
        'Seasonal': seasonal,
        'Residual': residual
    })

    combinedData = pd.concat([groupData, trendData], axis=1)

    trendList.append(combinedData)

cluster1Trend = pd.concat(trendList)
cluster1Trend['Cluster'] = 1
# NOTE(review): index=False leaves the yearWeek date index out of this file -
# confirm that consumers of cluster1Trend.csv do not need the dates
cluster1Trend.to_csv('cluster1Trend.csv', index=False)


## Trend Analysis for Cluster 2

In [None]:
# Keep only categories with at least 14 rows in cluster2Profit
grouped = cluster2Profit.groupby('Category Name')

categoryCounts = grouped.size()

validcategories = categoryCounts.loc[categoryCounts.ge(14)]

hasEnoughRows = cluster2Profit['Category Name'].isin(validcategories.index)
cluster2Profit = cluster2Profit.loc[hasEnoughRows]

In [None]:
# STL decomposition per category of cluster 2
trendList = []

for category_name, groupData in cluster2Profit.groupby('Category Name'):
    # Copy first: the group is a slice of cluster2Profit and must not be mutated in place
    groupData = groupData.copy()
    groupData['yearWeek'] = groupData['yearWeek'].dt.to_timestamp()
    groupData = groupData.set_index('yearWeek')
    # NOTE(review): cluster2Profit is grouped by Sales as well as by week, so a week can
    # occur several times; only the first row per week survives this line - confirm that
    # dropping the other rows (rather than summing them) is intended
    groupData = groupData[~groupData.index.duplicated()]
    # Resample to daily frequency and forward-fill the new rows
    # (.ffill() replaces the deprecated fillna(method='ffill'))
    groupData = groupData.resample('D').asfreq().ffill()

    stl = STL(groupData['Profit'], seasonal=7)
    result = stl.fit()

    trend = result.trend
    seasonal = result.seasonal
    residual = result.resid

    # Classify the trend by the average day-to-day change of the trend component
    meanSlope = trend.diff().dropna().mean()
    if meanSlope > 0.1:
        trendType = 'Increasing'
    elif meanSlope < -0.1:
        trendType = 'Decreasing'
    else:
        trendType = 'Stable'

    trendData = pd.DataFrame({
        'Trend': trend,
        'Trend Type': trendType,
        'Seasonal': seasonal,
        'Residual': residual
    })

    combinedData = pd.concat([groupData, trendData], axis=1)

    trendList.append(combinedData)

cluster2Trend = pd.concat(trendList)
cluster2Trend['Cluster'] = 2
# NOTE(review): index=False leaves the yearWeek date index out of this file -
# confirm that consumers of cluster2Trend.csv do not need the dates
cluster2Trend.to_csv('cluster2Trend.csv', index=False)


## Combination of All Clusters' Trends

In [None]:
combine = pd.concat([cluster0Trend, cluster1Trend, cluster2Trend], ignore_index=False)
combine

In [None]:
combine.rename(columns={'Category Name': 'categoryName'}, inplace=True)

In [None]:
def categorizeMainCategory(mainCat):
    """Add a 'mainCategory' column derived from 'categoryName' and return the frame.

    The DataFrame passed in is modified in place and also returned. Category
    names that are not listed in the table below are labelled 'Other'.
    """
    # Main category -> the category names that belong to it
    mainCategories = {
        'Apparel': [
            'Baby', "Men's Apparel", "Men's Clothing", "Women's Clothing", "Women's Apparel",
            "Girls' Apparel", "Children's Clothing", 'Baby ', "Men's Footwear", "Women's Footwear",
            'Accessories', 'Fitness Accessories', 'Health and Beauty',
        ],
        'Sports': [
            'Sporting Goods', 'Shop By Sport', 'Baseball & Softball', 'Boxing & MMA',
            'Camping & Hiking', 'Cardio Equipment', 'Cleats', 'Fishing', 'Golf Apparel',
            'Golf Balls', 'Golf Gloves', 'Golf Shoes', 'Golf Bags & Carts', 'Hockey',
            'Hunting & Shooting', 'Indoor/Outdoor Games', 'Lacrosse', 'Tennis & Racquet',
            'Water Sports', 'Soccer', 'Basketball', 'Strength Training', "Kids' Golf Clubs",
            "Men's Golf Clubs", "Women's Golf Clubs", 'Garden',
        ],
        'Electronics': ['Electronics', 'Cameras ', 'Computers', 'Consumer Electronics'],
        'Entertainment': [
            'As Seen on  TV!', 'Books ', 'CDs ', 'DVDs', 'Music', 'Video Games', 'Toys',
            'Crafts', 'As Seen on TV!', 'Trade-In', 'Pet Supplies',
        ],
    }

    # Invert the table once: category name -> main category (the first listing wins)
    mainCategoryOf = {}
    for mainCategory, categories in mainCategories.items():
        for listedName in categories:
            mainCategoryOf.setdefault(listedName, mainCategory)

    mainCat['mainCategory'] = mainCat['categoryName'].apply(
        lambda categoryName: mainCategoryOf.get(categoryName, 'Other')
    )
    return mainCat

In [None]:
categorizeMainCategory(combine)

In [None]:
combine.to_csv('trendAnalysis.csv')
combine.to_parquet('trendAnalysis.parquet')

# Profit Forecasting

In [None]:
from pycaret.time_series import *
from tqdm import tqdm

In [None]:
data = pd.read_csv('trendAnalysis.csv')
data['yearWeek'] = pd.to_datetime(data['yearWeek'])
data.head()

In [None]:
# 'trendAnalysis.csv' is written after 'Category Name' has been renamed to 'categoryName',
# so grouping by "Category Name" raises a KeyError. The rename below is a no-op for the
# current file and only keeps older exports (with the original column name) loadable.
group = ["Cluster", "categoryName", "yearWeek"]

aggData = data.rename(columns={'Category Name': 'categoryName'}).groupby(group).sum()
aggData.reset_index(inplace=True)
aggData.info()

In [None]:
aggData.rename(columns={'Category Name': 'categoryName'}, inplace=True)

## ARIMA

In [None]:
# ARIMA forecast: one PyCaret experiment per (Cluster, categoryName) group of aggData.
# The statement order inside the loop matters (setup -> compare_models -> pull -> predict_model).
from pycaret.time_series import *
from tqdm import tqdm

# Accumulators for history + forecasts and for the per-group score tables
resultsCompilation = pd.DataFrame()
evaluationMetrics = pd.DataFrame()

# The progress bar is sized by the number of groups and advanced once per group
with tqdm(range(len(aggData.groupby(["Cluster", "categoryName"])))) as pbar:
    # NOTE: the loop variable rebinds the notebook-level name `data`
    for _, data in aggData.groupby(["Cluster", "categoryName"]):
        data.reset_index(inplace=True, drop=True)
        target_df = data.set_index('yearWeek')[['Profit']]
        # After the index reset, label 0 is the first row of the group
        cluster = data.Cluster[0]
        category = data.categoryName[0]
        # Horizon handed to setup(): 10% of the group's rows, rounded
        num = len(data)* 0.1
        num = round(num)

        s = setup(target_df, target='Profit', verbose=False, fh = num)

        best = compare_models(include = ['arima'], verbose=False)
        # presumably the score grid produced by compare_models above - confirm against the PyCaret docs
        evaluationMetric = pull()

        # Request 365 future periods from the selected model
        predictions = predict_model(best, fh = 365)
        predictions.reset_index(inplace=True)
        # assumes the forecast index is an unnamed period index, exposed as column 'index' - TODO confirm
        predictions['yearWeek'] = predictions['index'].dt.to_timestamp()
        predictions.drop(columns=['index'], inplace=True)
        predictions["Cluster"] = cluster
        predictions["categoryName"] = category
        predictions["algorithm"] = str(best)

        # Stack the group's history and its forecast, then append to the running tables
        results = pd.concat([data, predictions])
        resultsCompilation = pd.concat([resultsCompilation, results], ignore_index=True)
        evaluationMetrics = pd.concat([evaluationMetrics, evaluationMetric], ignore_index=True)

        pbar.update(1)

# Saving error metrics and score to a CSV file
evaluationMetrics.to_csv('evaluationMetrics_arimaa.csv', index=False)
resultsCompilation.to_csv('resultsCompilation_arimaa.csv')
resultsCompilation.to_parquet('resultsCompilation_arimaa.parquet')

## Light Gradient Boosting

In [None]:
# LightGBM forecast: one PyCaret experiment per (Cluster, categoryName) group of aggData.
# The statement order inside the loop matters (setup -> compare_models -> pull -> predict_model).
from pycaret.time_series import *
from tqdm import tqdm

# Accumulators for history + forecasts and for the per-group score tables
resultsCompilation = pd.DataFrame()
evaluationMetrics = pd.DataFrame()

# The progress bar is sized by the number of groups and advanced once per group
with tqdm(range(len(aggData.groupby(["Cluster", "categoryName"])))) as pbar:
    # NOTE: the loop variable rebinds the notebook-level name `data`
    for _, data in aggData.groupby(["Cluster", "categoryName"]):
        data.reset_index(inplace=True, drop=True)
        target_df = data.set_index('yearWeek')[['Profit']]
        # After the index reset, label 0 is the first row of the group
        cluster = data.Cluster[0]
        category = data.categoryName[0]
        # Horizon handed to setup(): 10% of the group's rows, rounded
        num = len(data)* 0.1
        num = round(num)

        s = setup(target_df, target='Profit', verbose=False, fh = num)

        best = compare_models(include = ['lightgbm_cds_dt'], verbose=False)
        # presumably the score grid produced by compare_models above - confirm against the PyCaret docs
        evaluationMetric = pull()

        # Request 365 future periods from the selected model
        predictions = predict_model(best, fh = 365)
        predictions.reset_index(inplace=True)
        # assumes the forecast index is an unnamed period index, exposed as column 'index' - TODO confirm
        predictions['yearWeek'] = predictions['index'].dt.to_timestamp()
        predictions.drop(columns=['index'], inplace=True)
        predictions["Cluster"] = cluster
        predictions["categoryName"] = category
        predictions["algorithm"] = str(best)

        # Stack the group's history and its forecast, then append to the running tables
        results = pd.concat([data, predictions])
        resultsCompilation = pd.concat([resultsCompilation, results], ignore_index=True)
        evaluationMetrics = pd.concat([evaluationMetrics, evaluationMetric], ignore_index=True)

        pbar.update(1)

# Saving error metrics and score to a CSV file
evaluationMetrics.to_csv('evaluationMetrics_gbr.csv', index=False)
resultsCompilation.to_csv('resultsCompilation_gbr.csv')
resultsCompilation.to_parquet('resultsCompilation_gbr.parquet')

## Random Forest

In [None]:
# Random Forest forecast: one PyCaret experiment per (Cluster, categoryName) group of aggData.
# The statement order inside the loop matters (setup -> compare_models -> pull -> predict_model).
from pycaret.time_series import *
from tqdm import tqdm

# Accumulators for history + forecasts and for the per-group score tables
resultsCompilation = pd.DataFrame()
evaluationMetrics = pd.DataFrame()

# The progress bar is sized by the number of groups and advanced once per group
with tqdm(range(len(aggData.groupby(["Cluster", "categoryName"])))) as pbar:
    # NOTE: the loop variable rebinds the notebook-level name `data`
    for _, data in aggData.groupby(["Cluster", "categoryName"]):
        data.reset_index(inplace=True, drop=True)
        target_df = data.set_index('yearWeek')[['Profit']]
        # After the index reset, label 0 is the first row of the group
        cluster = data.Cluster[0]
        category = data.categoryName[0]
        # Horizon handed to setup(): 10% of the group's rows, rounded
        num = len(data)* 0.1
        num = round(num)

        s = setup(target_df, target='Profit', verbose=False, fh = num)

        best = compare_models(include = ['rf_cds_dt'], verbose=False)
        # presumably the score grid produced by compare_models above - confirm against the PyCaret docs
        evaluationMetric = pull()

        # Request 365 future periods from the selected model
        predictions = predict_model(best, fh = 365)
        predictions.reset_index(inplace=True)
        # assumes the forecast index is an unnamed period index, exposed as column 'index' - TODO confirm
        predictions['yearWeek'] = predictions['index'].dt.to_timestamp()
        predictions.drop(columns=['index'], inplace=True)
        predictions["Cluster"] = cluster
        predictions["categoryName"] = category
        predictions["algorithm"] = str(best)

        # Stack the group's history and its forecast, then append to the running tables
        results = pd.concat([data, predictions])
        resultsCompilation = pd.concat([resultsCompilation, results], ignore_index=True)
        evaluationMetrics = pd.concat([evaluationMetrics, evaluationMetric], ignore_index=True)

        pbar.update(1)

# Saving error metrics and score to a CSV file
evaluationMetrics.to_csv('evaluationMetrics_rf.csv', index=False)
resultsCompilation.to_csv('resultsCompilation_rf.csv')
resultsCompilation.to_parquet('resultsCompilation_rf.parquet')

## Linear Regression

In [None]:
# Linear Regression forecast: one PyCaret experiment per (Cluster, categoryName) group of aggData.
# The statement order inside the loop matters (setup -> compare_models -> pull -> predict_model).
from pycaret.time_series import *
from tqdm import tqdm

# Accumulators for history + forecasts and for the per-group score tables
resultsCompilation = pd.DataFrame()
evaluationMetrics = pd.DataFrame()

# The progress bar is sized by the number of groups and advanced once per group
with tqdm(range(len(aggData.groupby(["Cluster", "categoryName"])))) as pbar:
    # NOTE: the loop variable rebinds the notebook-level name `data`
    for _, data in aggData.groupby(["Cluster", "categoryName"]):
        data.reset_index(inplace=True, drop=True)
        target_df = data.set_index('yearWeek')[['Profit']]
        # After the index reset, label 0 is the first row of the group
        cluster = data.Cluster[0]
        category = data.categoryName[0]
        # Horizon handed to setup(): 10% of the group's rows, rounded
        num = len(data)* 0.1
        num = round(num)

        s = setup(target_df, target='Profit', verbose=False, fh = num)

        best = compare_models(include = ['lr_cds_dt'], verbose=False)
        # presumably the score grid produced by compare_models above - confirm against the PyCaret docs
        evaluationMetric = pull()

        # Request 365 future periods from the selected model
        predictions = predict_model(best, fh = 365)
        predictions.reset_index(inplace=True)
        # assumes the forecast index is an unnamed period index, exposed as column 'index' - TODO confirm
        predictions['yearWeek'] = predictions['index'].dt.to_timestamp()
        predictions.drop(columns=['index'], inplace=True)
        predictions["Cluster"] = cluster
        predictions["categoryName"] = category
        predictions["algorithm"] = str(best)

        # Stack the group's history and its forecast, then append to the running tables
        results = pd.concat([data, predictions])
        resultsCompilation = pd.concat([resultsCompilation, results], ignore_index=True)
        evaluationMetrics = pd.concat([evaluationMetrics, evaluationMetric], ignore_index=True)

        pbar.update(1)

# Saving error metrics and score to a CSV file
evaluationMetrics.to_csv('evaluationMetrics_lr.csv', index=False)
resultsCompilation.to_csv('resultsCompilation_lr.csv')
resultsCompilation.to_parquet('resultsCompilation_lr.parquet')

## Model Evaluation

### Forecasting of Product Categories

In [None]:
# Load the per-model evaluation metrics. The file names must match the ones the
# forecasting cells write: the ARIMA cell saves 'evaluationMetrics_arimaa.csv' and
# the LightGBM cell saves 'evaluationMetrics_gbr.csv'.
rf = pd.read_csv('evaluationMetrics_rf.csv')
lr = pd.read_csv('evaluationMetrics_lr.csv')
arima = pd.read_csv('evaluationMetrics_arimaa.csv')
lgb = pd.read_csv('evaluationMetrics_gbr.csv')

In [None]:
# Tag every metrics table with the model it belongs to
rf['Model'] = 'Random Forest'
lr['Model'] = 'Linear Regression'
arima['Model'] = 'ARIMA'
lgb['Model'] = 'LightGBM'

# Concatenate all DataFrames into one
combinedError = pd.concat([rf, lgb, lr, arima])

# Group by 'Model' and calculate the mean error per model.
# Several columns must be selected with a list; selecting them with a bare
# tuple (['MAE', 'RMSE'] without the inner brackets) is rejected by pandas 2.x.
eva = combinedError.groupby('Model')[['MAE', 'RMSE']].mean()
eva.round(2)