# ARIMA ML Model

Overall Process:
- PMDarima auto arima model with time series cross validation 
- PMDarima auto arima model with expanding window


Packages:
1. scikit-learn
2. scipy
3. snowflake-snowpark-python
4. pandas
5. numpy
6. matplotlib
7. statsmodels
8. pmdarima (for auto arima)

In [None]:
import time
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_percentage_error as MAPE_metrics
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.arima.model import ARIMA
from pmdarima import auto_arima

# Prediction timeframe: 14 days
# Training timeframe: 56 days (8 weeks)
TEST_SIZE = 14
TRAIN_SIZE = TEST_SIZE * 4

# Get preprocessed transaction data for store chain = 2
# NOTE(review): get_active_session is not imported in the visible cells --
# confirm the notebook environment provides it.
session = get_active_session()
session.use_database("ML")
session.use_schema("RETAIL_STORE")

# Keep only the columns used for modelling; copy so the column assignment
# below never writes into a view of the frame returned by to_pandas().
data = session.table("store_2_preprocessed_transactions").to_pandas()
data = data[["DATE", "TOTAL_SALES"]].copy()
data["DATE"] = pd.to_datetime(data["DATE"])

# Sort chronologically BEFORE deriving either frame: the train/test split and
# the cross-validation folds slice `data` by position, so an unsorted table
# scan would put arbitrary dates into the "last 14 days".
data = data.sort_values("DATE")
ew_data = data.copy()  # Preserve dataframe (DATE still a column) for expanding window
data.set_index("DATE", inplace=True)

- Helper functions (plotting and SMAPE) used in the later cells.

In [None]:
def plot_graph(train_values, actual_values, predictions):
    """
    Plot training data, actual test values and predictions on one time axis.

    The function plots three lines:
    1. Training data values
    2. Actual test values
    3. Predicted values

    The x-axis is an integer time-step axis: training points sit at steps
    0 .. len(train_values) - 1 and the actual/predicted points continue at
    the steps immediately after, so the last training point and the first
    test point do not share an x-coordinate.

    Args:
        train_values: Array-like of training data values to plot
        actual_values: Array-like of actual test values to plot
        predictions: Array-like of predicted values to plot; it is drawn on
            the same x positions as actual_values, so it must have the same
            length
    """
    n_train = len(train_values)
    n_test = len(actual_values)

    # Integer steps; the test axis starts right after the last training step.
    train_steps = np.arange(n_train)
    test_steps = np.arange(n_train, n_train + n_test)

    plt.plot(train_steps, train_values)
    plt.plot(test_steps, actual_values)
    plt.plot(test_steps, predictions)
    plt.legend(["Train Data", "Actual Sales", "Predictions"])
    plt.show()


def calculate_smape(actual_values, predictions):
    """
    Calculate Symmetric Mean Absolute Percentage Error (SMAPE) between actual and predicted values.

    Each term is 2 * |prediction - actual| / (|actual| + |prediction|). A term
    where both the actual and the predicted value are zero is a perfect
    prediction and contributes 0 (instead of producing NaN from 0/0).

    Args:
        actual_values: Array-like of actual values
        predictions: Array-like of predicted values

    Returns:
        float: SMAPE score as a percentage between 0 and 200

    Raises:
        ValueError: If actual_values is empty
    """
    # Coerce to float arrays so lists and pandas objects are handled uniformly.
    actual = np.atleast_1d(np.asarray(actual_values, dtype=float))
    predicted = np.atleast_1d(np.asarray(predictions, dtype=float))

    if len(actual) == 0:
        raise ValueError("actual_values must contain at least one value")

    abs_error = 2 * np.abs(predicted - actual)
    scale = np.abs(actual) + np.abs(predicted)

    # Only divide where the denominator is non-zero; the pre-filled zeros
    # cover the 0/0 case.
    ratios = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)

    return 100 / len(actual) * np.sum(ratios)

### 1. Split dataset into train and test set

In [None]:
# Hold out the final TEST_SIZE rows as the test set; all earlier rows are training data
test_data = data.iloc[-TEST_SIZE:]
X_train = data.iloc[:-TEST_SIZE]


### 2. Time series cross validation index split

In [None]:
# Time series cross validation index split
# One fold per ~20 training rows; each fold trains on at most TRAIN_SIZE rows
# and validates on the TEST_SIZE rows that follow.
tscv = TimeSeriesSplit(n_splits=math.floor(len(X_train)/20), max_train_size=TRAIN_SIZE, test_size=TEST_SIZE)

# split() returns a one-shot generator. Materialize it so the folds can be
# iterated again when a later cell is re-run, instead of silently yielding nothing.
tscv_split_index = list(tscv.split(X_train))

### 3. Fit Pmdarima auto arima model with time series cross validation

Optimal results:
- MAPE value: 0.299
- SMAPE value: 26.751
- Tuning time: 29.768s
- Fitting time: 0.3192s

In [None]:
start_time = time.time()

# Use Pmdarima to select best parameters for ARIMA model (p,d,q) and seasonal components.
# m=7 sets a seasonal period of 7 observations (weekly, assuming one row per day -- TODO confirm).
# The differencing order has no "start" argument in auto_arima (only `d` and
# `max_d`), so the former `start_d=1` was forwarded as an unused fit argument
# and has been removed; d is still selected automatically up to max_d.
AUTO_ARIMA_model_tscv = auto_arima(
    X_train,
    m=7,
    start_p=1, max_p=7,
    max_d=2,
    start_q=1, max_q=7,
    seasonal=True,
    stepwise=True,
    suppress_warnings=True,
    error_action="ignore"
)

end_time = time.time()


In [None]:
# Report how long the auto_arima search in the previous cell took
print(f"ARIMA autotuning time: {end_time - start_time} seconds")

# Show the summary of the model that auto_arima selected
tscv_model_summary = AUTO_ARIMA_model_tscv.summary()
print(tscv_model_summary)

- Test accuracy of ARIMA model using time series cross validation

In [None]:
# Score the tuned ARIMA model on every time series cross validation fold
MAPE_values_tscv = []

for fold_train_idx, fold_val_idx in tscv_split_index:
    # Refit the selected model on this fold's training rows, then forecast TEST_SIZE steps ahead
    AUTO_ARIMA_model_tscv.fit(y=X_train.iloc[fold_train_idx])
    fold_forecast = AUTO_ARIMA_model_tscv.predict(n_periods=TEST_SIZE)

    # Mean Absolute Percentage Error (MAPE) against the fold's validation rows
    fold_mape = MAPE_metrics(X_train.iloc[fold_val_idx], fold_forecast)
    MAPE_values_tscv.append(fold_mape)

- Plot MAPE values across time series cross validation split

In [None]:
# Line plot of the per-fold MAPE values from time series cross validation
_, tscv_axes = plt.subplots()
tscv_axes.plot(MAPE_values_tscv)
tscv_axes.set_title("MAPE values (TSCV)")
plt.show()

### 4. Predict last 14 days (using last 2 months of data)

In [None]:
# Time the refit on the most recent TRAIN_SIZE training rows plus the TEST_SIZE-step forecast
start_time = time.time()
AUTO_ARIMA_model_tscv.fit(y=X_train.iloc[-TRAIN_SIZE:])
predictions = AUTO_ARIMA_model_tscv.predict(n_periods=TEST_SIZE)
end_time = time.time()

print(f"ARIMA fitting time: {end_time - start_time} seconds")

# Score the forecast against the held-out test set
mape = MAPE_metrics(test_data, predictions)
print(f"Last 14 days prediction MAPE values: {mape}")

actual_sales = test_data["TOTAL_SALES"].values
smape = calculate_smape(actual_sales, predictions)
print(f"Last 14 days prediction SMAPE values: {smape}")

In [None]:
# Plot the fitting window, the held-out sales and the forecast together
plot_graph(X_train.iloc[-TRAIN_SIZE:], test_data, predictions)

### 5. Fit ARIMA Expanding Window model
Optimal results:
- MAPE value: 0.120
- SMAPE value: 11.407
- Tuning time: -
- Fitting time: 0.2426s

In [None]:
EXPANDING_WINDOW_FREQ = "14D"

# Expanding window split
ew_data["DATE"] = pd.to_datetime(ew_data["DATE"])
ew_data_sales = ew_data.set_index("DATE")

# The first window needs TRAIN_SIZE days of history; the last window must
# leave TEST_SIZE days after its start for validation.
start_date = ew_data["DATE"].min() + pd.Timedelta(days=TRAIN_SIZE)
end_date = ew_data["DATE"].max() - pd.Timedelta(days=TEST_SIZE)

# Create training and validation sets and record its performance
MAPE_values_ew = []

for date in pd.date_range(start_date, end_date, freq=EXPANDING_WINDOW_FREQ):

    # Train on everything strictly before `date`; validate on the TEST_SIZE days from `date`
    ew_train = ew_data_sales.loc[:date - pd.offsets.Day(1)]
    ew_val = ew_data_sales.loc[date:date + pd.offsets.Day(TEST_SIZE-1)]

    # Re-tune and re-train the ARIMA model on each expanded training window.
    # auto_arima has no `start_d` argument (only `d` / `max_d`), so the former
    # `start_d=1` was an unused fit argument and has been removed.
    AUTO_ARIMA_model_ew = auto_arima(
        ew_train,
        m=7,
        start_p=1, max_p=7,
        max_d=2,
        start_q=1, max_q=7,
        seasonal=True,
        stepwise=True,
        suppress_warnings=True,
        error_action="ignore"
    )

    # auto_arima already returns the selected model fitted on ew_train, so no
    # second fit on the same data is needed before forecasting.
    # Obtain Mean Absolute Percentage Error (MAPE) for expanding window sets
    predictions = AUTO_ARIMA_model_ew.predict(n_periods=TEST_SIZE)
    MAPE = MAPE_metrics(ew_val, predictions)
    MAPE_values_ew.append(MAPE)
    

- Model summary

In [None]:
# Summary of the model left over from the last expanding-window iteration
ew_model_summary = AUTO_ARIMA_model_ew.summary()
print(ew_model_summary)

- Plot MAPE values over time

In [None]:
# Line plot of the MAPE recorded for each expanding-window step
_, ew_axes = plt.subplots()
ew_axes.plot(MAPE_values_ew)
ew_axes.set_title("Overall MAPE values (Expanding Window)")
plt.show()

### 6. Predict last 14 days (using all historical data)

In [None]:
# Time the refit of the expanding-window model on the full training set plus the TEST_SIZE-step forecast
start_time = time.time()
AUTO_ARIMA_model_ew.fit(y=X_train)
predictions = AUTO_ARIMA_model_ew.predict(n_periods=TEST_SIZE)
end_time = time.time()

print(f"ARIMA EW model fitting time: {end_time - start_time} seconds")

# Score the forecast against the held-out test set
mape = MAPE_metrics(test_data, predictions)
print(f"Last 14 days prediction MAPE values: {mape}")

actual_sales = test_data["TOTAL_SALES"].values
smape = calculate_smape(actual_sales, predictions)
print(f"Last 14 days prediction SMAPE values: {smape}")

In [None]:
# Plot the last 40 training rows, the held-out sales and the forecast together
plot_graph(X_train.iloc[-40:], test_data, predictions)

In [None]:
# Close the session obtained via get_active_session() at the top of the notebook
session.close()