In [None]:
# Mount Google Drive to access files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Change to the working directory
%cd /content/drive/MyDrive/Python - Time Series Forecasting/Advanced Content for Time Series /FINAL PROJECT - Build an Automated Forecasting Pipeline
# Install greykite library
!pip install greykite

/content/drive/MyDrive/Python - Time Series Forecasting/Advanced Content for Time Series /FINAL PROJECT - Build an Automated Forecasting Pipeline


In [1]:
!pip freeze > requirements.txt

In [None]:
# Name of the datasets
data = "electricity-BE.csv"
future = "electricity-future-BE.csv"

In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from plotly.offline import iplot
import re

# Greykite functions
from greykite.framework.templates.autogen.forecast_config import *
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.common.features.timeseries_features import *
from greykite.common.evaluation import EvaluationMetricEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results

# Suppress specific warnings
import warnings
warnings.filterwarnings("ignore", message="Requested holiday")
warnings.filterwarnings("ignore", message="The following Fourier series terms are removed due to collinearity")

# Load the data
df = pd.read_csv(data)

# Display the first few rows of the dataset
print("Preview of the data:")
print(df.head())

# Ask user for the name of the time series variable and the date variable
time_series_var = input("Enter the name of the time series variable: ")
date_var = input("Enter the name of the date variable: ")
country = input("Enter the country code for the data (e.g., 'US' for the United States): ")
# Ask the user for the regressor columns
regressor_cols_input = input("Enter the regressor columns separated by commas (e.g., 'Easter, Temperature, Marketing'): ")
regressor_cols = [col.strip() for col in regressor_cols_input.split(",")]

# Ask the user for the forecasting horizon
forecast_horizon = int(input("Enter the forecasting horizon (number of periods to forecast): "))

# Determine the frequency of the data without setting the index
df_copy = df.copy()
df_copy[date_var] = pd.to_datetime(df_copy[date_var])
df_copy = df_copy.set_index(date_var)
inferred_freq = pd.infer_freq(df_copy.index)
print(f"Detected frequency: {inferred_freq}")

# Determine the training end date
train_end_date = df_copy.index.max()
print(f"Training end date: {train_end_date}")

# Load the future data
future_df = pd.read_csv(future)

# Rename columns as per user input
df = df.rename(columns={time_series_var: 'y', date_var: 'Date'})
future_df = future_df.rename(columns={time_series_var: 'y', date_var: 'Date'})

# Merge both datasets
df = pd.concat([df, future_df]).reset_index(drop=True)

# Clean the time series and regressor columns by removing non-alphanumeric characters, except letters and digits
df[time_series_var] = df[time_series_var].replace({r'[^a-zA-Z0-9\s]': ''}, regex=True)

for col in regressor_cols:
    df[col] = df[col].replace({r'[^a-zA-Z0-9\s]': ''}, regex=True)

# Silverkite Preparations
metadata = MetadataParam(time_col="Date",
                         value_col="y",
                         freq=inferred_freq,
                         train_end_date=train_end_date)

# Define model components
growth = dict(growth_term=["linear",
                         #  "quadratic",
                           "sqrt"])
seasonality = dict(yearly_seasonality="auto",
                   quarterly_seasonality="auto",
                   monthly_seasonality="auto",
                   weekly_seasonality="auto",
                   daily_seasonality="auto")

events = dict(holiday_lookup_countries=["US"],
              holiday_pre_num_days=2,
              holiday_post_num_days=2)

changepoints = dict(changepoints_dict=dict(method="auto"))

# Check if the regressor columns exist and are numeric, discard any invalid ones
valid_regressors = []
for col in regressor_cols:
    if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
        valid_regressors.append(col)
    else:
        print(f"Discarding invalid or non-numeric regressor column: {col}")

# If no valid regressors are found, notify the user and continue without them
if not valid_regressors:
    print("No valid regressor columns found. Continuing without regressors.")
    regressors = None
    lagged_regressors = None
else:
    regressors = dict(regressor_cols=valid_regressors)
    lagged_regressors = dict(lagged_regressor_dict={col: "auto" for col in valid_regressors})

# Print the valid regressors for verification
print(f"Valid regressor columns: {valid_regressors}")

# Define autoregression settings
autoregression = dict(autoreg_dict="auto")
# Define custom fit algorithms
custom = dict(fit_algorithm_dict=[dict(fit_algorithm="linear"),
                               #   dict(fit_algorithm="ridge"),
                                  dict(fit_algorithm="gradient_boosting")])

# Build the Silverkite model
model = ModelComponentsParam(growth=growth,
                             seasonality=seasonality,
                             events=events,
                             changepoints=changepoints,
                             regressors=regressors,
                             lagged_regressors=lagged_regressors,
                             autoregression=autoregression,
                             custom=custom)

# Define evaluation metric
evaluation_metric = EvaluationMetricParam(
    cv_selection_metric=EvaluationMetricEnum.RootMeanSquaredError.name)

# Calculate CV parameters
cv_periods_between_splits = (forecast_horizon // 2) + 1   # Ensure at least 1 period between splits
cv_min_train_periods = 10 * forecast_horizon  # Ensure at least 10 CV periods

# CV parameters
evaluation_period = EvaluationPeriodParam(cv_min_train_periods=df.shape[0] -cv_min_train_periods,
                                          cv_expanding_window=True,
                                          cv_periods_between_splits=cv_periods_between_splits)

# Configuration of the CV
config = ForecastConfig(model_template=ModelTemplateEnum.SILVERKITE.name,
                        forecast_horizon=forecast_horizon,
                        metadata_param=metadata,
                        model_components_param=model,
                        evaluation_metric_param=evaluation_metric,
                        evaluation_period_param=evaluation_period)

# Initialize and run the forecaster
forecaster = Forecaster()
result = forecaster.run_forecast_config(df=df, config=config)

# Summarize cross-validation results
cv_results = summarize_grid_search_results(
    grid_search=result.grid_search,
    decimals=1,
    score_func=EvaluationMetricEnum.RootMeanSquaredError.name)

# Set the index
cv_results["params"] = cv_results["params"].astype(str)
cv_results.set_index("params", drop=True, inplace=True)

from tabulate import tabulate

# Assuming cv_results is already defined and has the necessary data
best_results = cv_results[["rank_test_RMSE", "mean_test_RMSE",
                           "param_estimator__fit_algorithm_dict",
                           "param_estimator__growth_term"]]

# Print best results using tabulate for a nicer output
print(tabulate(best_results, headers='keys', tablefmt='pretty', showindex=False))

# Retrieve and save the forecast
forecast = result.forecast.df[["ts", "forecast"]]
forecast_silverkite = forecast.iloc[-len(future_df):, :]
forecast_silverkite.to_csv("forecast_silverkite.csv", index=False)

# Visualize the backtest results
result.backtest.plot()


Preview of the data:
  unique_id               ds      y  Exogenous1  Exogenous2
0        BE  10/22/2016 0:00  70.00       49593       57253
1        BE  10/22/2016 1:00  37.10       46073       51887
2        BE  10/22/2016 2:00  37.10       44927       51896
3        BE  10/22/2016 3:00  44.75       44483       48428
4        BE  10/22/2016 4:00  37.10       44338       46721
Enter the name of the time series variable: y
Enter the name of the date variable: ds
Enter the country code for the data (e.g., 'US' for the United States): BE
Enter the regressor columns separated by commas (e.g., 'Easter, Temperature, Marketing'): Exogenous1, Exogenous2
Enter the forecasting horizon (number of periods to forecast): 24
Detected frequency: H
Training end date: 2016-12-30 23:00:00
Valid regressor columns: ['Exogenous1', 'Exogenous2']
Fitting 3 folds for each of 4 candidates, totalling 12 fits



Trained model's `min_lagged_regressor_order` (24) is less than the size of `fut_df` (1476), NaN values (if there are any) in lagged regressor columns have been imputed. More info: {'Exogenous1': {'lag_dict': {'orders': [168]}, 'agg_lag_dict': {'orders_list': [[168, 336, 504]], 'interval_list': [(24, 191)]}, 'series_na_fill_func': <function SilverkiteForecast.__get_default_lagged_regressor_dict.<locals>.<lambda> at 0x78ca570bb5b0>}, 'Exogenous2': {'lag_dict': {'orders': [168]}, 'agg_lag_dict': {'orders_list': [[168, 336, 504]], 'interval_list': [(24, 191)]}, 'series_na_fill_func': <function SilverkiteForecast.__get_default_lagged_regressor_dict.<locals>.<lambda> at 0x78ca570ba3b0>}}.


Trained model's `min_lagged_regressor_order` (24) is less than the size of `fut_df` (1619), NaN values (if there are any) in lagged regressor columns have been imputed. More info: {'Exogenous1': {'lag_dict': {'orders': [168]}, 'agg_lag_dict': {'orders_list': [[168, 336, 504]], 'interval_list': [(24, 191)

+----------------+----------------+----------------------------------------+------------------------------+
| rank_test_RMSE | mean_test_RMSE |  param_estimator__fit_algorithm_dict   | param_estimator__growth_term |
+----------------+----------------+----------------------------------------+------------------------------+
|       4        |      24.8      |      {'fit_algorithm': 'linear'}       |            linear            |
|       3        |      13.8      |      {'fit_algorithm': 'linear'}       |             sqrt             |
|       1        |      8.5       | {'fit_algorithm': 'gradient_boosting'} |            linear            |
|       2        |      8.5       | {'fit_algorithm': 'gradient_boosting'} |             sqrt             |
+----------------+----------------+----------------------------------------+------------------------------+
Forecasting and visualizations complete. Files saved.
