# Prophet Recursive model
Overall process:
- Iteratively run Prophet model on 100 unique store chains
- Analyse data preprocessing, MAPE, SMAPE and Prophet prediction time

### 1. Create table to store Prophet model performance

In [None]:
-- Per-store evaluation metrics and timings; rows are inserted by the
-- run_prophet_forecasting procedure.
CREATE OR REPLACE TABLE prophet_model_performance (
    store_chain_id          NUMBER(38, 0),  -- store chain the row belongs to
    mape                    FLOAT,          -- mean absolute percentage error on the hold-out days
    smape                   FLOAT,          -- symmetric MAPE, scaled by 100
    prophet_elapsed_time    FLOAT,          -- seconds spent fitting the Prophet model
    data_preprocessing_time FLOAT,          -- seconds spent in the preprocessing call
    overall_time            FLOAT           -- seconds from handler start until metrics are computed
);


### 2. Create stored procedure for data preprocessing
- Extract transactions for the specified store chain id
- Aggregate total sales per day
- Impute any missing dates

In [None]:
-- Builds the daily sales series for a single store chain and stores it in
-- prophet_preprocessed_transactions (columns: date, total_sales).
--
-- Steps:
--   1. Keep only the transactions of the requested store chain.
--   2. If the store has any offer_date, keep only rows dated before the
--      earliest offer_date; otherwise keep every row.
--   3. Sum purchase_amount per day.
--   4. Generate every calendar day between the first and last sales day and
--      fill days without sales with 0.
--
-- Parameters:
--   store_id  store_chain_id to extract.
-- Returns:
--   'SUCCESS' once the table has been (re)created.
--
-- NOTE: the output table name is fixed, so each call overwrites the result of
-- the previous call.
CREATE OR REPLACE PROCEDURE data_preprocessing_procedure(store_id INT)
RETURNS STRING
LANGUAGE SQL
AS
$$
    BEGIN
    -- Extract daily sales dataset for specified store_id
    CREATE OR REPLACE TABLE prophet_preprocessed_transactions AS
    WITH store_preprocessed_transactions AS (
        SELECT store_chain_id, purchase_amount, date, offer_date
        FROM preprocessed_transactions
        WHERE store_chain_id = :store_id
    ),
    prophet_dataset AS (
        WITH filtered_transactions AS (
            SELECT
                store_chain_id,
                date,
                purchase_amount,
                offer_date
            FROM store_preprocessed_transactions
            WHERE
                -- No offer recorded for this store: keep everything.
                (SELECT COUNT(offer_date) FROM store_preprocessed_transactions) = 0
                -- Otherwise keep only the period before the first offer.
                OR date < (SELECT MIN(offer_date) FROM store_preprocessed_transactions)
        )
        SELECT
            store_chain_id,
            date,
            SUM(purchase_amount) AS total_sales
        FROM filtered_transactions
        GROUP BY store_chain_id, date
    ),
    date_range AS (
        SELECT
            MIN(date) AS min_date,
            MAX(date) AS max_date
        FROM prophet_dataset
    ),

    -- Create date table: recursive CTE emitting one row per calendar day
    -- from min_date up to and including max_date.
    date_table AS (
        SELECT
            min_date AS date,
            max_date
        FROM date_range
        UNION ALL
        SELECT
            DATEADD(day, 1, date),
            max_date
        FROM date_table
        WHERE date_table.date < date_table.max_date
    )

    -- Left join with transactions table so days without sales become 0.
    SELECT
        dt.date,
        COALESCE(s.total_sales, 0) AS total_sales
    FROM date_table dt
    LEFT JOIN prophet_dataset s
        ON dt.date = s.date
    -- Sort on the calendar side of the join: s.date is NULL for imputed days.
    ORDER BY dt.date;
    RETURN 'SUCCESS';
    END
$$;

### 3. Create stored procedure for Prophet model
- Call upon data preprocessing stored procedure before fitting the Prophet model

In [None]:
-- Runs preprocessing, fits a Prophet model on the most recent training window
-- of one store chain, evaluates it on the last 14 days and records the
-- metrics and timings in prophet_model_performance.
CREATE OR REPLACE PROCEDURE run_prophet_forecasting(store_id INT)
RETURNS STRING
LANGUAGE PYTHON 
RUNTIME_VERSION = '3.9'
HANDLER = 'prophet_main'
PACKAGES = (
    'snowflake-snowpark-python',
    'pandas',
    'prophet',
    'scikit-learn',
    'numpy'
)
AS
$$
def prophet_main(session, store_id):
    """Fit and evaluate a Prophet model for one store chain.

    Args:
        session: Snowpark session supplied by the stored-procedure runtime.
        store_id: store_chain_id to preprocess and forecast.

    Returns:
        "SUCCESS" after the performance row has been written.

    Raises:
        ValueError: if the preprocessed series is too short to split into a
            training window and a hold-out set, or if the resulting metrics
            are not finite (nothing is written to the performance table in
            either case).
    """
    import time
    import numpy as np
    import pandas as pd
    from prophet import Prophet
    from sklearn.metrics import mean_absolute_percentage_error as MAPE_metrics

    TEST_SIZE = 14               # hold-out length in days
    TRAIN_SIZE = TEST_SIZE * 4   # training window length in days
    MIN_TRAIN_ROWS = 2           # fewest training rows the fit is attempted with
    FLOOR = 0                    # lower saturation bound for logistic growth

    # The value is interpolated into SQL text below; make sure it is an int.
    store_id = int(store_id)

    overall_start_time = time.time()

    # Call data preprocessing stored procedure
    preprocessing_start_time = time.time()
    session.sql(f"CALL data_preprocessing_procedure({store_id});").collect()
    # NOTE(review): the result of this count is unused; it is kept because it
    # is part of the measured preprocessing time - confirm it is still needed.
    session.sql("SELECT COUNT(*) FROM prophet_preprocessed_transactions;").collect()
    preprocessing_end_time = time.time()

    # Read preprocessed dataset and order it chronologically
    transactions_df = session.table("prophet_preprocessed_transactions").to_pandas()
    transactions_df = transactions_df[["DATE", "TOTAL_SALES"]]
    transactions_df["DATE"] = pd.to_datetime(transactions_df["DATE"])
    transactions_df = transactions_df.sort_values(by="DATE", ignore_index=True)

    required_rows = TEST_SIZE + MIN_TRAIN_ROWS
    if len(transactions_df) < required_rows:
        raise ValueError(
            f"Store chain {store_id} has only {len(transactions_df)} daily rows; "
            f"at least {required_rows} are required."
        )

    # Rename to the column names Prophet expects
    transactions_df = transactions_df.rename(columns={"DATE": "ds", "TOTAL_SALES": "y"})

    # Shift by one so that days imputed as 0 sales are not exactly zero when
    # the percentage errors divide by the actual value.
    transactions_df["y"] = transactions_df["y"] + 1

    # NOTE(review): the cap is the maximum over the whole series, including
    # the hold-out days, so the model sees information from the test period.
    CAP = transactions_df["y"].max()

    transactions_df["floor"] = FLOOR
    transactions_df["cap"] = CAP

    # Split train test set: the last TEST_SIZE days are held out, the
    # TRAIN_SIZE days immediately before them are used for fitting.
    test_set = transactions_df.iloc[-TEST_SIZE:]
    transactions_df = transactions_df.iloc[:-TEST_SIZE]
    train_data = transactions_df[-TRAIN_SIZE:]

    # Fit model (only the fit is included in the Prophet timing)
    prophet_start_time = time.time()
    model = Prophet(
        seasonality_mode="multiplicative",
        growth="logistic"
    )
    model.fit(train_data)
    prophet_end_time = time.time()

    # Forecast the hold-out period
    df_future = model.make_future_dataframe(periods=TEST_SIZE, freq="D")
    df_future["floor"] = FLOOR
    df_future["cap"] = CAP
    prophet_predictions = model.predict(df_future)

    predictions = prophet_predictions["yhat"].iloc[-TEST_SIZE:].values
    actuals = test_set["y"].to_numpy(dtype=float)

    # Compute metrics
    mape = float(MAPE_metrics(actuals, predictions))
    smape = float(
        100 / len(actuals)
        * np.sum(2 * np.abs(predictions - actuals) / (np.abs(actuals) + np.abs(predictions)))
    )

    overall_end_time = time.time()

    prophet_time = float(prophet_end_time - prophet_start_time)
    preprocessing_time = float(preprocessing_end_time - preprocessing_start_time)
    overall_time = float(overall_end_time - overall_start_time)

    # NaN/inf cannot be written as plain numeric literals; fail before the
    # DELETE so an earlier result for this store is not lost.
    if not (np.isfinite(mape) and np.isfinite(smape)):
        raise ValueError(
            f"Non-finite metrics for store chain {store_id}: mape={mape}, smape={smape}"
        )

    # Store results, replacing any earlier row for this store chain
    session.sql(f"DELETE FROM prophet_model_performance WHERE store_chain_id = {store_id}").collect()
    session.sql(f"INSERT INTO prophet_model_performance (store_chain_id, mape, smape, prophet_elapsed_time, data_preprocessing_time, overall_time) VALUES ({store_id}, {mape}, {smape}, {prophet_time}, {preprocessing_time}, {overall_time})").collect()

    return "SUCCESS"
$$;

### 4. Get 100 unique store ids and run the Prophet model

In [None]:
# NOTE(review): get_active_session is not imported in this cell; presumably it
# comes from snowflake.snowpark.context in the notebook session - confirm.
session = get_active_session()
session.use_database("ml")
session.use_schema("retail_store")

# Collect every distinct store chain id in ascending order.
store_chain_rows = session.sql("SELECT DISTINCT store_chain_id FROM transactions;").to_pandas()
store_chain_list = sorted(store_chain_rows["STORE_CHAIN_ID"].to_list())

# Run default Prophet model on 100 unique store chain daily sales data.
# A store whose run fails does not count towards the target: the loop moves on
# to the next store id until the target is reached or the list is exhausted.
TARGET_STORE_COUNT = 100
successful_runs = 0
for store_chain_id in store_chain_list:
    if successful_runs >= TARGET_STORE_COUNT:
        break

    print(f"Step {successful_runs + 1}/{TARGET_STORE_COUNT}: store #{store_chain_id}")
    try:
        session.sql(f"CALL run_prophet_forecasting ({store_chain_id})").collect()
    except Exception as e:
        # Best effort: report the failure and continue with the next store.
        print(f"ERROR (Store chain {store_chain_id}): {e}")
    else:
        successful_runs += 1
        print("Success")

if successful_runs < TARGET_STORE_COUNT:
    print(f"Only {successful_runs}/{TARGET_STORE_COUNT} store chains were forecast successfully")


### 5. Average performance

- Average MAPE: 1.819
- Average SMAPE: 26.215
- Average Prophet fitting time (`prophet_elapsed_time`, timed around `model.fit` only): 0.197s
- Average Preprocessing time: 2.471s
- Average Overall time: 3.116s

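Time analysis: Overall time is slightly higher than the sum of the Prophet and preprocessing times because it also covers loading the preprocessed table into pandas, generating the predictions and computing the metrics. None of the timers include the SQL DML commands (deleting and inserting rows in the performance table), which run after the overall timer has stopped.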

In [None]:
-- Inspect every recorded metric row.
SELECT
    store_chain_id,
    mape,
    smape,
    prophet_elapsed_time,
    data_preprocessing_time,
    overall_time
FROM prophet_model_performance;

In [None]:
-- Averages of each metric/timing over all recorded store chains, plus the
-- total overall run time.
SELECT
    AVG(mape)                    AS avg_mape,
    AVG(smape)                   AS avg_smape,
    AVG(prophet_elapsed_time)    AS avg_prophet_time,
    AVG(data_preprocessing_time) AS avg_preprocessing_time,
    AVG(overall_time)            AS avg_overall_time,
    SUM(overall_time)            AS total_time
FROM prophet_model_performance;

### 6. Generate Boxplot for mape and overall time

Packages:
1. matplotlib
2. snowflake-snowpark-python
3. pandas
4. scikit-learn
5. prophet
6. numpy
7. seaborn

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Point the active session at the database/schema holding the results.
session = get_active_session()
session.use_database("ml")
session.use_schema("retail_store")

# Pull the whole performance table into pandas for plotting.
performance = session.table("prophet_model_performance").to_pandas()

- Distribution plot for mape values

In [None]:
# Histogram with KDE overlay of every recorded MAPE value.
mape_values = performance["MAPE"]
sns.displot(mape_values, kde=True)
plt.show()

- Distribution plot for mape values below 0.6, followed by a boxplot for all mape values

In [None]:
# Same distribution plot, restricted to MAPE values below 0.6.
low_mape = performance.loc[performance["MAPE"] < 0.6, "MAPE"]
sns.displot(low_mape, kde=True)
plt.show()

In [None]:
# Small boxplot of every recorded MAPE value.
_, mape_axes = plt.subplots(figsize=(3, 3))
mape_axes.boxplot(performance["MAPE"])
plt.show()

- Boxplot for overall time

In [None]:
# Small boxplot of the overall run time per store chain.
_, time_axes = plt.subplots(figsize=(3, 3))
time_axes.boxplot(performance["OVERALL_TIME"])
plt.show()

### 7. Isolate highest MAPE outlier

Analysis: Store chains with outlier MAPE values are mainly caused by missing/negative sales data that is imputed as 0, which degrades the Prophet model's performance

In [None]:
-- Store chains whose MAPE lies strictly between 1 and 2.
SELECT *
FROM prophet_model_performance
WHERE mape > 1
  AND mape < 2;

In [None]:
-- Daily sales series for store chain 3 (hard-coded), built with the same
-- steps as the preprocessing procedure: filter to the store, keep only rows
-- before the earliest offer_date (or all rows if there is none), sum
-- purchase_amount per day, then fill days without sales with 0.
CREATE OR REPLACE TABLE outlier_isolation_prophet_model AS
WITH store_preprocessed_transactions AS (
    SELECT store_chain_id, purchase_amount, date, offer_date
    FROM preprocessed_transactions
    WHERE store_chain_id = 3
),
prophet_dataset AS (
    WITH filtered_transactions AS (
        SELECT
            store_chain_id,
            date,
            purchase_amount,
            offer_date
        FROM store_preprocessed_transactions
        WHERE
            -- No offer recorded for this store: keep everything.
            (SELECT COUNT(offer_date) FROM store_preprocessed_transactions) = 0
            -- Otherwise keep only the period before the first offer.
            OR date < (SELECT MIN(offer_date) FROM store_preprocessed_transactions)
    )
    SELECT
        store_chain_id,
        date,
        SUM(purchase_amount) AS total_sales
    FROM filtered_transactions
    GROUP BY store_chain_id, date
),
date_range AS (
    SELECT
        MIN(date) AS min_date,
        MAX(date) AS max_date
    FROM prophet_dataset
),

-- Create date table: recursive CTE emitting one row per calendar day from
-- min_date up to and including max_date.
date_table AS (
    SELECT
        min_date AS date,
        max_date
    FROM date_range
    UNION ALL
    SELECT
        DATEADD(day, 1, date),
        max_date
    FROM date_table
    WHERE date_table.date < date_table.max_date
)

-- Left join with transactions table so days without sales become 0.
SELECT
    dt.date,
    COALESCE(s.total_sales, 0) AS total_sales
FROM date_table dt
LEFT JOIN prophet_dataset s
    ON dt.date = s.date
-- Sort on the calendar side of the join: s.date is NULL for imputed days.
ORDER BY dt.date;

In [None]:
import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error as MAPE_metrics
import matplotlib.pyplot as plt

TEST_SIZE = 14               # hold-out length in days
TRAIN_SIZE = TEST_SIZE * 4   # training window length in days

# Read preprocessed dataset
session = get_active_session()
session.use_database("ml")
session.use_schema("retail_store")

transactions_df = session.table("outlier_isolation_prophet_model").to_pandas()
transactions_df = transactions_df[["DATE", "TOTAL_SALES"]]
transactions_df["DATE"] = pd.to_datetime(transactions_df["DATE"])
transactions_df = transactions_df.sort_values(by="DATE", ignore_index=True)

# Rename to the column names Prophet expects
transactions_df = transactions_df.rename(columns={"DATE": "ds", "TOTAL_SALES": "y"})

# Shift by one so that days imputed as 0 sales are not exactly zero.
transactions_df["y"] = transactions_df["y"] + 1

# Saturation bounds for logistic growth.
# NOTE(review): the cap is the maximum over the whole series, including the
# hold-out days.
FLOOR = 0
CAP = transactions_df["y"].max()

transactions_df["floor"] = FLOOR
transactions_df["cap"] = CAP

# Split train test set: the last TEST_SIZE days are held out.
test_set = transactions_df.iloc[-TEST_SIZE:]
transactions_df = transactions_df.iloc[:-TEST_SIZE]

# Fit model on the TRAIN_SIZE days immediately before the hold-out period
train_data = transactions_df[-TRAIN_SIZE:]

model = Prophet(
    seasonality_mode="multiplicative",
    growth="logistic"
)
model.fit(train_data)

# Forecast the hold-out period
df_future = model.make_future_dataframe(periods=TEST_SIZE, freq="D")
df_future["floor"] = FLOOR
df_future["cap"] = CAP
prophet_predictions = model.predict(df_future)

predictions = prophet_predictions["yhat"].iloc[-TEST_SIZE:].values

# Actual values of the hold-out days. transactions_df no longer contains them
# at this point, so they must come from test_set.
val_data = test_set["y"]

In [None]:
def plot_graph(train_values, actual_values, predictions):
    """Plot the training series followed by actual and predicted values.

    The x axis is a day index: training points occupy 0..len(train_values)-1
    and the actual/predicted points continue from len(train_values) with a
    spacing of exactly one per observation.

    Args:
        train_values: sequence of training observations.
        actual_values: sequence of observed values for the forecast period.
        predictions: sequence of forecast values, same length as
            actual_values.
    """
    train_len = len(train_values)
    x_train = np.arange(train_len)
    x_forecast = np.arange(train_len, train_len + len(actual_values))

    plt.plot(x_train, train_values)
    plt.plot(x_forecast, actual_values)
    plt.plot(x_forecast, predictions)
    plt.legend(["Train Data", "Actual Sales", "Predictions"])
    plt.show()


# Show the training window next to the forecast period.
plot_graph(train_data["y"].iloc[-TRAIN_SIZE:], val_data, predictions)

In [None]:
# Close the Snowpark session used by the cells above.
session.close()