## Model Training & Validation 

In [1]:
# Data Manipulation
# ======================================================
import pandas as pd 
import numpy as np
from helper_functions import *
import math

# Statistics
# ======================================================
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterSarimax import ForecasterSarimax
from skforecast.Sarimax import Sarimax
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster
from skforecast.model_selection_sarimax import backtesting_sarimax, grid_search_sarimax
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from skforecast.model_selection import bayesian_search_forecaster
from skforecast.ForecasterBaseline import ForecasterEquivalentDate

# Warnings Config
# ======================================================
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Loading the dataset, adding lagged predictors, splitting by date
# ======================================================

df = fetch_preprocess_dataset()

# Columns for which lagged copies are created below.
variables = [
    'GS_cold', 'GS_cough', 'GS_fever', 'GS_flu',
    'AWND', 'PRCP', 'SNOW', 'TAVG', 'TMAX', 'TMIN',
    'Overall AQI Value',
    'CO', 'Ozone', 'PM10', 'PM25', 'Days Moderate',
    'Days Unhealthy', 'visits', 'Main Pollutant_CO',
    'Main Pollutant_NO2', 'Main Pollutant_PM2.5',
]

# One column per (variable, lag) pair, named '<variable>_L<lag>', holding the
# variable shifted down by 1, 2 and 3 rows (three weeks, per the original
# note) to capture potential lagged effects.
lagged_columns = {
    f'{name}_L{lag}': df[name].shift(lag)
    for name in variables
    for lag in range(1, 4)
}

# Shifting leaves missing values in the leading rows; rows with any missing
# value are dropped.
df = df.assign(**lagged_columns).dropna()

# Project helper: returns the two cut-off dates and the three partitions.
end_train, end_validation, df_train, df_val, df_test = train_test_validate_split(
    df,
    end_train=date(2015, 12, 31),
    end_validation=date(2017, 12, 31),
)

Dates train      : 2005-01-16 00:00:00 --- 2015-12-27 00:00:00  (n=572)
Dates validacion : 2016-01-03 00:00:00 --- 2017-12-31 00:00:00  (n=105)
Dates test       : 2017-12-31 00:00:00 --- 2019-12-29 00:00:00  (n=105)


---

# Elastic Net

In [29]:
# Elastic Net: build and fit the autoregressive forecaster
# ======================================================
# The target is modelled on the log1p scale (inverted with expm1) and the
# exogenous columns are standardized; lags 1, 2 and 52 of the target are used.

forecaster1 = ForecasterAutoreg(
    regressor=ElasticNetCV(
        l1_ratio=.5,
        selection='cyclic',
        random_state=123,
        verbose=False,
    ),
    lags=[1, 2, 52],
    transformer_y=FunctionTransformer(
        func=np.log1p,
        inverse_func=np.expm1,
        validate=True,
    ),
    transformer_exog=StandardScaler(),
)

# Every column except the target is passed as an exogenous predictor.
features_train = df_train.drop(columns=['cases'])
forecaster1.fit(y=df_train['cases'], exog=features_train)

# Backtest on the data up to end_validation
# ======================================================
# The first len(df_train) rows form the initial training window; the model is
# not refitted between folds and MAE is the reported metric.

target_through_val = df['cases'][:end_validation]
features_through_val = df.drop(columns=['cases'])[:end_validation]

met, preds = backtesting_forecaster(
    forecaster=forecaster1,
    y=target_through_val,
    exog=features_through_val,
    initial_train_size=len(df_train),
    steps=2,
    metric='mean_absolute_error',
    refit=False,
)

100%|██████████| 53/53 [00:00<00:00, 196.45it/s]


In [31]:
# Elastic Net: grid search over lags and regressor parameters
# ======================================================

# 7 lag sets x 5 l1_ratio values x 2 selection modes = 70 candidates.
lags_grid = [[52], [1], [1, 52], [1, 2], [1, 2, 52], [1, 2, 3], [1, 2, 3, 52]]
param_grid = {
    'l1_ratio': [.001, .05, 0.10, 0.50, 0.70],
    'selection': ['cyclic', 'random'],
}

# Arguments shared by the search and by the follow-up backtest so both are
# evaluated on exactly the same span and fold layout.
validation_span = dict(
    y=df['cases'][:end_validation],
    exog=df.drop(columns=['cases'])[:end_validation],
    initial_train_size=len(df_train),
    steps=2,
    refit=False,
)

# NOTE(review): candidates are ranked by mean squared error here, while the
# backtest below (and the other models) report mean absolute error -- confirm
# this difference is intentional.
# NOTE(review): fixed_train_size presumably has no effect while refit=False.
# return_best=True updates forecaster1 in place with the winning configuration.
results_grid = grid_search_forecaster(
    forecaster=forecaster1,
    param_grid=param_grid,
    lags_grid=lags_grid,
    metric='mean_squared_error',
    fixed_train_size=True,
    return_best=True,
    verbose=False,
    show_progress=True,
    **validation_span,
)

# Re-score the selected configuration with MAE.
met, preds = backtesting_forecaster(
    forecaster=forecaster1,
    metric='mean_absolute_error',
    **validation_span,
)

print(met)

Number of models compared: 70.


lags grid: 100%|██████████| 7/7 [00:27<00:00,  3.89s/it]


`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1 52] 
  Parameters: {'l1_ratio': 0.1, 'selection': 'random'}
  Backtesting metric: 104937.6564688119



100%|██████████| 53/53 [00:00<00:00, 196.02it/s]

216.4302285042319





---
# SARIMAX

In [36]:
# Preliminary modeling for feature selection
# ======================================================
# A plain statsmodels SARIMAX is fitted on the standardized training exog and
# the log1p-transformed target; exogenous features whose coefficient magnitude
# reaches COEF_THRESHOLD are kept for the skforecast models below.

COEF_THRESHOLD = .05

exog_train = df_train.drop(columns=['cases'])

# Scaling exog features; column names and index are preserved so the fitted
# coefficients can be looked up by feature name afterwards.
exog = pd.DataFrame(
    StandardScaler().fit_transform(exog_train),
    columns=exog_train.columns,
    index=exog_train.index,
)

# 'epiweek' and 'Days Good' are left out of the preliminary model.
prelim_exog = exog.drop(columns=['epiweek', 'Days Good'])

sarimaxmod2 = SARIMAX(
    endog=df_train['cases'].apply(np.log1p),
    exog=prelim_exog,
    order=(1, 1, 1),
    seasonal_order=(0, 1, 0, 52),
)

sarimaxres2 = sarimaxmod2.fit()

# Restrict to the exogenous coefficients by name. Slicing a fixed number of
# trailing entries off the filtered parameter list is unsafe: how many of the
# non-exog parameters (AR, MA, variance) pass the threshold depends on their
# fitted values, so a real feature could be dropped or a non-column name such
# as 'ar.L1' could leak into the list.
exog_coefs = sarimaxres2.params.loc[prelim_exog.columns]

# Compare magnitudes so that strong negative effects are kept as well.
sarimax_exogs = exog_coefs.loc[exog_coefs.abs() >= COEF_THRESHOLD].index.to_list()

In [None]:
# SARIMAX: initial forecaster on the pre-selected exogenous features
# ======================================================
# Same orders as the preliminary statsmodels fit: (1, 1, 1) x (0, 1, 0, 52).

sarimax_initial = Sarimax(
    order=(1, 1, 1),
    seasonal_order=(0, 1, 0, 52),
    maxiter=100,
)

forecaster2 = ForecasterSarimax(
    regressor=sarimax_initial,
    # Target handled on the log1p scale and inverted with expm1.
    transformer_y=FunctionTransformer(
        func=np.log1p,
        inverse_func=np.expm1,
        validate=True,
    ),
    transformer_exog=StandardScaler(),
)

forecaster2.fit(
    y=df_train['cases'],
    exog=df_train[sarimax_exogs],
)

In [None]:
# Grid search for hyperparameter tuning
# note that the original 45-candidate search (run without exog) took over 6 hours
# ======================================================

param_grid = {
    'order': [(0, 1, 0), (0, 1, 1), (1, 1, 0), (1, 1, 1), (2, 1, 1)],
    'seasonal_order': [(0, 0, 0, 0), (0, 1, 0, 52), (1, 1, 1, 52)],
    # statsmodels treats trend=None and trend='n' identically (no trend term),
    # so listing both only repeated a third of the candidates.
    'trend': ['n', 'c']
}

# The exogenous regressors forecaster2 was fitted with are passed here too, so
# the orders are tuned for the model that is actually used (SARIMAX rather
# than a SARIMA without regressors).
results_grid = grid_search_sarimax(
                forecaster            = forecaster2,
                y                     = df['cases'].loc[:end_validation],
                exog                  = df[sarimax_exogs].loc[:end_validation],
                param_grid            = param_grid,
                steps                 = 2,
                refit                 = False,
                metric                = 'mean_absolute_error',
                initial_train_size    = len(df_train),
                fixed_train_size      = False,
                # Results are inspected manually; forecaster2 is left untouched.
                return_best           = False,
                n_jobs                = 'auto',
                suppress_warnings_fit = True,
                verbose               = False,
                show_progress         = True
            )

In [None]:
# Refit with new hyperparameters
# ======================================================
# Same orders as before, now with a constant trend term and the epiweek
# features added to the exogenous set.

# Built once and reused for fit and backtest so both always see the same
# columns. dict.fromkeys keeps the original order while removing repeats:
# 'epiweek_sin' / 'epiweek_cos' were candidates in the preliminary selection,
# so they may already be in sarimax_exogs, and selecting a column twice would
# hand the model duplicated regressors.
sarimax_exog_cols = list(dict.fromkeys(
    sarimax_exogs + ['epiweek_sin', 'epiweek_cos', 'epiweek']
))

forecaster3 = ForecasterSarimax(
    regressor=Sarimax(
        order=(1, 1, 1),
        seasonal_order=(0, 1, 0, 52),
        maxiter=100,
        trend='c'
    ),
    # Target handled on the log1p scale and inverted with expm1.
    transformer_y=FunctionTransformer(func = np.log1p, inverse_func = np.expm1, validate=True),
    transformer_exog=StandardScaler())

forecaster3.fit(y=df_train['cases'], exog=df_train[sarimax_exog_cols])

# Model optimization on validation dataset
# ======================================================
# Initial training window of len(df_train) rows, no refit between folds,
# MAE as the reported metric.

met3, preds3 = backtesting_sarimax(
    forecaster = forecaster3,
    y = df['cases'][:end_validation],
    exog = df[sarimax_exog_cols][:end_validation],
    initial_train_size = len(df_train),
    steps = 2,
    metric = 'mean_absolute_error',
    refit = False,
    verbose = False,
    show_progress=True,
    suppress_warnings_fit=True
)

---
# Random Forest

In [None]:
# Random Forest: build and fit on all exogenous columns
# ======================================================

random_forest_full = RandomForestRegressor(
    n_estimators=400,
    max_depth=10,
    random_state=123,
)

forecaster4 = ForecasterAutoreg(
    regressor=random_forest_full,
    lags=[1, 2, 52],
    # Target handled on the log1p scale and inverted with expm1.
    transformer_y=FunctionTransformer(
        func=np.log1p,
        inverse_func=np.expm1,
        validate=True,
    ),
    transformer_exog=StandardScaler(),
)

# Every column except the target is passed as an exogenous predictor.
rf_features_train = df_train.drop(columns=['cases'])
forecaster4.fit(y=df_train['cases'], exog=rf_features_train)

# Backtest on the data up to end_validation
# ======================================================
# Initial training window of len(df_train) rows, no refit between folds,
# MAE as the reported metric.

rf_target_through_val = df['cases'][:end_validation]
rf_features_through_val = df.drop(columns=['cases'])[:end_validation]

met4, preds4 = backtesting_forecaster(
    forecaster=forecaster4,
    y=rf_target_through_val,
    exog=rf_features_through_val,
    initial_train_size=len(df_train),
    steps=2,
    metric='mean_absolute_error',
    refit=False,
    verbose=False,
    show_progress=True,
)

In [None]:
# Feature selection: recursive elimination with cross-validation
# ======================================================

# Training matrices as forecaster4 builds them from the training partition.
X_train, y_train = forecaster4.create_train_X_y(
    y=df_train['cases'],
    exog=df_train.drop(columns=['cases']),
)

# Seeded draw, without replacement, of half of the training rows; the
# selector is fitted on this subsample only.
rng = np.random.default_rng(seed=785412)
n_sampled_rows = int(len(X_train) * 0.5)
sample = rng.choice(X_train.index, size=n_sampled_rows, replace=False)
X_train_sample = X_train.loc[sample, :]
y_train_sample = y_train.loc[sample]

# Default-sized forest as the ranking estimator; one feature is removed per
# step, scored with 3-fold CV, and at least 15 features are always retained.
regressor = RandomForestRegressor(random_state=123)
selector = RFECV(
    estimator=regressor,
    step=1,
    cv=3,
    min_features_to_select=15,
    n_jobs=-1,
)
selector.fit(X_train_sample, y_train_sample)

selected_features_rfe = selector.get_feature_names_out()

# Names starting with "lag_" are filtered out so that only exogenous column
# names remain.
selected_exog_features = [
    name for name in selected_features_rfe if not name.startswith("lag_")
]

In [None]:
# Random Forest: refit using only the RFECV-selected exogenous columns
# ======================================================
# Same forest settings and lags as forecaster4; only the exog set differs.

random_forest_selected = RandomForestRegressor(
    n_estimators=400,
    max_depth=10,
    random_state=123,
)

forecaster5 = ForecasterAutoreg(
    regressor=random_forest_selected,
    lags=[1, 2, 52],
    # Target handled on the log1p scale and inverted with expm1.
    transformer_y=FunctionTransformer(
        func=np.log1p,
        inverse_func=np.expm1,
        validate=True,
    ),
    transformer_exog=StandardScaler(),
)

forecaster5.fit(
    y=df_train['cases'],
    exog=df_train[selected_exog_features],
)

# Backtest on the data up to end_validation
# ======================================================
# Initial training window of len(df_train) rows, no refit between folds,
# MAE as the reported metric.

selected_features_through_val = df[selected_exog_features][:end_validation]

met5, preds5 = backtesting_forecaster(
    forecaster=forecaster5,
    y=df['cases'][:end_validation],
    exog=selected_features_through_val,
    initial_train_size=len(df_train),
    steps=2,
    metric='mean_absolute_error',
    refit=False,
    verbose=False,
    show_progress=True,
)

---
# Seasonal Naive
**Baseline model**

In [None]:
# Model instantiation
# ======================================================
# Seasonal-naive baseline: configured with an offset of 52 rows, two offsets,
# and np.mean as the aggregation over them.

baseline_settings = dict(offset=52, n_offsets=2, agg_func=np.mean)
baselineforecaster = ForecasterEquivalentDate(**baseline_settings)

baselineforecaster.fit(df_train['cases'])

In [None]:
# Optimizing with validation dataset
# ======================================================
# The series is cut at end_validation, as for every other model, so the
# baseline MAE covers the same span and the test period stays untouched.
# Passing the full series would also score the baseline on the test set.
# NOTE(review): steps stays at 1 here while the other models use steps=2 --
# confirm whether the horizons are meant to match.

mb1, baseline_preds1 = backtesting_forecaster(
    forecaster=baselineforecaster,
    y=df['cases'][:end_validation],
    initial_train_size=len(df_train),
    refit=True,
    steps=1,
    verbose=False,
    metric='mean_absolute_error',
)

---
# Saving models

In [64]:
from skforecast.utils import save_forecaster, load_forecaster

In [None]:
# Write the final forecaster of each model family to disk, in this order.
# NOTE(review): the targets use a '.py' extension and contain a space,
# although they presumably hold serialized objects rather than Python
# source -- confirm the naming is intended before relying on these paths.
for fitted_forecaster, target_file in (
    (forecaster1, 'ElasticNet_001 hide.py'),
    (forecaster3, 'SARIMAX_001 hide.py'),
    (forecaster5, 'RandomForest_001 hide.py'),
    (baselineforecaster, 'baseline_001 hide.py'),
):
    save_forecaster(fitted_forecaster, file_name=target_file, verbose=False)