# Setup

In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be read.
# NOTE: this cell assumes a Google Colab runtime (it imports from `google.colab`).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch the working directory to the project folder on Drive so that the
# relative file names used later in the notebook resolve against it.
# NOTE: the path reflects one specific Drive layout (including the space before
# the "/FINAL PROJECT" segment) and must be adjusted when run elsewhere.
%cd /content/drive/MyDrive/Python - Time Series Forecasting/Advanced Content for Time Series /FINAL PROJECT - Build an Automated Forecasting Pipeline

/content/drive/MyDrive/Python - Time Series Forecasting/Advanced Content for Time Series /FINAL PROJECT - Build an Automated Forecasting Pipeline


In [None]:
# Install the greykite library.
# `%pip` (rather than `!pip`) installs into the environment of the running
# kernel, and the version is pinned to the release this notebook was last run
# with so that a fresh runtime resolves the same dependency set.
%pip install greykite==1.0.0

Collecting greykite
  Downloading greykite-1.0.0-py2.py3-none-any.whl.metadata (9.4 kB)
Collecting dill>=0.3.1.1 (from greykite)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting holidays<0.25 (from greykite)
  Downloading holidays-0.24-py3-none-any.whl.metadata (16 kB)
Collecting holidays-ext>=0.0.7 (from greykite)
  Downloading holidays_ext-0.0.8-py3-none-any.whl.metadata (1.3 kB)
Collecting numpy<1.25.0,>=1.22.0 (from greykite)
  Downloading numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting overrides>=2.8.0 (from greykite)
  Downloading overrides-7.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting pandas<2.0.0,>=1.5.0 (from greykite)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting pmdarima<=1.8.5,>=1.8.0 (from greykite)
  Downloading pmdarima-1.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (7.7 kB)
Co

In [None]:
# File names of the two input datasets, resolved against the working directory:
# `data` is the file loaded as the main dataframe, `future` is the file that is
# appended to it during data processing.
data, future = "nyc_data.csv", "nyc_data_future.csv"

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

# Greykite functions
from greykite.framework.templates.autogen.forecast_config import *
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.common.features.timeseries_features import *
from greykite.common.evaluation import EvaluationMetricEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results
from plotly.offline import iplot

# Suppress warning
import warnings
warnings.filterwarnings("ignore", message = "Requested holiday")
warnings.filterwarnings("ignore",
                        message = "The following Fourier series terms are removed due to collinearity")

# Load the main dataset and preview its first rows.
# Ending the cell on the bare expression lets the notebook render the frame as
# a table; `print` produced line-wrapped plain text for this many columns.
df = pd.read_csv(data)
df.head()

       Date      Demand  Easter  Thanksgiving  Christmas  Temperature  \
0  1/1/2015  720.000885       0             0          0         3.68   
1  1/2/2015  581.276773       0             0          0         4.73   
2  1/3/2015  754.117039       0             0          0         7.23   
3  1/4/2015  622.252774       0             0          0        10.96   
4  1/5/2015  785.373319       0             0          0         6.92   

   Marketing  
0     41.305  
1    131.574  
2    162.700  
3    160.281  
4     51.077  


In [None]:
# Inputs for configuring the forecasting setup.
# Column and country names are stripped of surrounding whitespace so that a
# stray space typed at the prompt does not break the column lookups that
# follow (the regressor names are stripped the same way when they are parsed).
time_series_var = input("Enter the name of the time series variable: ").strip()  # Name of the target time series column
date_var = input("Enter the name of the time / date variable: ").strip()  # Name of the date or time column
country = input("Enter the country (e.g., US for United States): ").strip()  # Country code for holiday and event modeling
regressor_cols_input = input("Enter the regressor columns (comma-separated): ")  # Names of additional regressor columns, comma-separated
forecast_horizon = int(input("Enter the number of periods for the forecast horizon (e.g., 11, 34): "))  # Number of periods to forecast

# The forecast rows are later extracted with a negative slice of this length;
# zero or a negative number would silently select the wrong rows, so fail early.
if forecast_horizon <= 0:
    raise ValueError(f"The forecast horizon must be a positive integer, got {forecast_horizon}")

Enter the name of the time series variable: Demand
Enter the name of the time / date variable: Date
Enter the country (e.g, US for United States): US
Enter the regressor columns (comma-separated): Temperature, Marketing
Enter the number of periods for the forecast horizon (e.g., 11, 34): 31


In [None]:
# Data Processing
# The history is read from `data` here rather than taken from `df`, because
# this cell rebinds `df` to the combined (history + future) frame below.
# Building on `df` itself meant that running the cell a second time appended
# the future rows again and renamed columns on an already-renamed frame.
history_df = pd.read_csv(data)

# Date-indexed copy of the history, used to derive the frequency and the train end date
df_copy = history_df.copy()
df_copy[date_var] = pd.to_datetime(df_copy[date_var])  # Convert the date column to datetime format
df_copy = df_copy.set_index(date_var)  # Use the dates as the index
inferred_freq = pd.infer_freq(df_copy.index)  # Infer the frequency of the time series data
print(f"The inferred frequency is: {inferred_freq}")

# train end date
train_end_date = df_copy.index.max()  # Latest date in the history, i.e. the end of the training period
print(f"The train end date is: {train_end_date}")

# Loading the future data and appending it below the history with a fresh 0..n-1 index.
# The target and date columns are renamed to the fixed names "y" and "Date"
# that the rest of the notebook refers to.
future_df = pd.read_csv(future)
df = (
    pd.concat([history_df, future_df], ignore_index=True)
    .rename(columns={time_series_var: "y", date_var: "Date"})
)

# Process regressor columns: split on commas, strip whitespace and drop empty
# entries (an empty answer at the prompt therefore means "no regressors").
regressor_cols = [name.strip() for name in regressor_cols_input.split(",") if name.strip()]
print(f"The regressor columns are: {regressor_cols}")

# Clean the target and regressor columns so that they hold numeric values.
# The earlier pattern r'[^a-zA-Z0-9\s]' also deleted "." and "-", which turned
# a text value such as "720.5" into "7205" and left the column as text.
def _coerce_numeric(series):
    """Return `series` as numbers.

    Columns that are already numeric are returned unchanged. Otherwise every
    character that cannot be part of a number (anything except digits, sign,
    decimal point and exponent marker) is removed, e.g. currency symbols or
    thousands separators, and the result is parsed; values that still cannot
    be parsed become NaN.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series
    stripped = series.astype(str).str.replace(r"[^0-9eE+\-.]", "", regex=True)
    return pd.to_numeric(stripped, errors="coerce")


df["y"] = _coerce_numeric(df["y"])  # Clean the target variable
for col in regressor_cols:
    # Requested regressors that are missing from df are skipped here so that
    # the regressor validation further down can report them instead of this
    # loop failing with a KeyError.
    if col in df.columns:
        df[col] = _coerce_numeric(df[col])  # Clean each regressor column

The inferred frequency is: D
The train end date is: 2020-12-31 00:00:00
The regressor columns are: ['Temperature', 'Marketing']


In [None]:
# Silverkite parameters
# Metadata describing the input frame: which columns hold the timestamps and
# the target, the sampling frequency, and the last date used for training.
metadata = MetadataParam(
    time_col="Date",  # Column name for time/date (the date column is renamed to "Date" during data processing)
    value_col="y",  # Column name for the target variable (the target is renamed to "y" during data processing)
    freq=inferred_freq,  # Frequency of the time series, as inferred from the date index
    train_end_date=train_end_date  # End date for training data (latest date in the historical file)
)

# Growth: the candidate growth terms to consider
growth = {"growth_term": ["linear", "quadratic", "sqrt"]}

# Seasonality: every component is left on "auto"
seasonality = {
    "yearly_seasonality": "auto",
    "quarterly_seasonality": "auto",
    "monthly_seasonality": "auto",
    "weekly_seasonality": "auto",
    "daily_seasonality": "auto",
}

# Holidays: look them up for the country entered above and also consider the
# two days before and the two days after each holiday
events = {
    "holiday_lookup_countries": [country],
    "holiday_pre_num_days": 2,
    "holiday_post_num_days": 2,
}

# Changepoints: use the "auto" detection method
changepoints = {"changepoints_dict": {"method": "auto"}}

# Keep only the requested regressors that exist in df and hold numeric data;
# every rejected name is reported.
valid_regressors = []
for col in regressor_cols:
    usable = col in df.columns and pd.api.types.is_numeric_dtype(df[col])
    if not usable:
        print(f"{col} is not included")
        continue
    valid_regressors.append(col)

# Build the regressor settings, or disable them when nothing usable remains
if valid_regressors:
    regressors = {"regressor_cols": valid_regressors}
    # Every valid regressor also gets "auto" lagged terms
    lagged_regressors = {
        "lagged_regressor_dict": {name: "auto" for name in valid_regressors}
    }
else:
    print("No valid regressors found")
    regressors = None
    lagged_regressors = None

# Autoregression is left on "auto"
autoregression = {"autoreg_dict": "auto"}

# Fit algorithms to consider: linear regression, ridge regression and gradient boosting
custom = {
    "fit_algorithm_dict": [
        {"fit_algorithm": algorithm}
        for algorithm in ("linear", "ridge", "gradient_boosting")
    ]
}

# Bundle all component settings defined above into the Silverkite model
# components: trend-related settings first, then calendar effects, then the
# external/lagged inputs and finally the fit algorithms.
model = ModelComponentsParam(
    growth=growth,
    changepoints=changepoints,
    seasonality=seasonality,
    events=events,
    regressors=regressors,
    lagged_regressors=lagged_regressors,
    autoregression=autoregression,
    custom=custom,
)

# Define evaluation metric: cross-validation selects on RMSE
evaluation_metric = EvaluationMetricParam(
    cv_selection_metric=EvaluationMetricEnum.RootMeanSquaredError.name  # Metric for cross-validation
)

# Define evaluation period parameters.
# Only rows dated up to train_end_date count as training data: df also holds
# the appended future rows, and counting those (as df.shape[0] did) inflates
# the minimum training size by the length of the future file.
n_train_periods = int((pd.to_datetime(df["Date"]) <= train_end_date).sum())

# Hold back ten horizons' worth of data for the CV splits, but never request
# fewer than two horizons of training data; this also prevents a negative
# value when the series is short.
cv_min_train_periods = max(n_train_periods - 10 * forecast_horizon, 2 * forecast_horizon)
cv_periods_between_splits = (forecast_horizon // 2) + 1  # Number of periods between splits
evaluation_period = EvaluationPeriodParam(
    cv_min_train_periods=cv_min_train_periods,  # Minimum training periods
    cv_expanding_window=True,  # Use expanding window for CV
    cv_periods_between_splits=cv_periods_between_splits,  # Periods between CV splits
    cv_max_splits=20  # Maximum number of splits for CV
)

# Assemble the forecast configuration from the pieces defined above, using the
# SILVERKITE model template
config = ForecastConfig(
    metadata_param=metadata,
    model_template=ModelTemplateEnum.SILVERKITE.name,
    model_components_param=model,
    forecast_horizon=forecast_horizon,
    evaluation_period_param=evaluation_period,
    evaluation_metric_param=evaluation_metric,
)

# Run the configuration on the combined dataframe
forecaster = Forecaster()
result = forecaster.run_forecast_config(config=config, df=df)

from tabulate import tabulate

# Summarize the grid search, scored by RMSE and rounded to one decimal place
cv_results = summarize_grid_search_results(
    grid_search=result.grid_search,
    decimals=1,
    score_func=EvaluationMetricEnum.RootMeanSquaredError.name,
)

# Use the stringified parameter sets as the index
cv_results = cv_results.assign(
    params=cv_results["params"].astype(str)
).set_index("params", drop=True)

# Columns shown in the report: rank, mean test RMSE, fit algorithm, growth term
report_columns = [
    "rank_test_RMSE",
    "mean_test_RMSE",
    "param_estimator__fit_algorithm_dict",
    "param_estimator__growth_term",
]
best_results = cv_results[report_columns]

# Print the selection as a plain-text table without the index column
print(tabulate(best_results, headers="keys", tablefmt="pretty", showindex=False))

# Take the timestamp and forecast columns for the last `forecast_horizon` rows
# of the forecast frame and write them to CSV
forecast = result.forecast.df[["ts", "forecast"]].iloc[-forecast_horizon:, :]
forecast.to_csv("forecast_silverkite.csv", index=False)

# Show the backtest plot (kept as the last expression so the cell renders it)
result.backtest.plot()

Fitting 14 folds for each of 9 candidates, totalling 126 fits



There is a high number of CV splits (14). If training is slow, increase `periods_between_splits` or `min_train_periods`, or decrease `max_splits`


The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly', 'sin2_toq_quarterly', 'cos2_toq_quarterly', 'sin3_toq_quarterly', 'cos3_toq_quarterly', 'cos4_tow_weekly']


The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly', 'sin2_toq_quarterly', 'cos2_toq_quarterly', 'sin3_toq_quarterly', 'cos3_toq_quarterly', 'cos4_tow_weekly']


The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly', 'sin2_toq_quarterly', 'cos2_toq_quarterly', 'sin3_toq_quarterly', 'cos3_toq_quarterly', 'cos4_tow_weekly']


The following Fourier series terms are removed due to collinearity:
['sin1_toq_quarterly', 'cos1_toq_quarterly', 'sin2_toq_quarterly', 'cos2_toq_quarterly', 'sin3_toq_quarterly', 'cos3_

+----------------+----------------+----------------------------------------+------------------------------+
| rank_test_RMSE | mean_test_RMSE |  param_estimator__fit_algorithm_dict   | param_estimator__growth_term |
+----------------+----------------+----------------------------------------+------------------------------+
|       5        |      57.2      |      {'fit_algorithm': 'linear'}       |            linear            |
|       4        |      57.1      |      {'fit_algorithm': 'linear'}       |          quadratic           |
|       6        |      57.3      |      {'fit_algorithm': 'linear'}       |             sqrt             |
|       2        |      55.7      |       {'fit_algorithm': 'ridge'}       |            linear            |
|       1        |      54.9      |       {'fit_algorithm': 'ridge'}       |          quadratic           |
|       3        |      55.7      |       {'fit_algorithm': 'ridge'}       |             sqrt             |
|       7        |      60.0