## Rolling Origin Model Testing 

In [189]:
# Data Manipulation
# ======================================================
import pandas as pd 
import numpy as np
from os import path
import configparser
from epiweeks import Week
from datetime import date, datetime
from calendar import month_name, month_abbr
from helper_functions import *
import math
from skforecast.utils import save_forecaster, load_forecaster

# Reading Secrets
# ======================================================
cfg = configparser.ConfigParser()
cfg.read('secrets.ini')
ROOT_PATH = path.abspath(cfg.get('default','root'))
DATA_PATH = path.join(ROOT_PATH, 'datasets/data')

# Modeling
# ======================================================
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.linear_model import ElasticNetCV
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterSarimax import ForecasterSarimax
from skforecast.Sarimax import Sarimax
from statsmodels.tsa.statespace.sarimax import SARIMAX
from skforecast.model_selection import backtesting_forecaster, grid_search_forecaster
from skforecast.model_selection_sarimax import backtesting_sarimax, grid_search_sarimax
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from skforecast.ForecasterBaseline import ForecasterEquivalentDate

# Warnings Config
# ======================================================
import warnings
warnings.filterwarnings('ignore')

In [190]:
# Loading & Manipulating Dataset
# ======================================================

df = pd.read_csv(path.join(DATA_PATH, 'raw_dataset.csv'))

df['epiweek'] = df['epiweek']\
                .map(lambda x: Week.fromstring(str(x)))

df['weekstart'] = pd.to_datetime(
                                df['epiweek']\
                                .map(lambda x: Week.startdate(x))
                                )

df.set_index('weekstart', inplace=True)

df['epiweek'] = df['epiweek']\
                .map(lambda x: int(str(x)[4:]))

col_ordered = (
                'cases','visits','GS_cold', 
                'GS_cough', 'GS_fever', 'GS_flu', 
                'AWND', 'PRCP', 'SNOW', 
                'TMAX', 'TMIN', 'TAVG',
                'Overall AQI Value',
                'CO', 'Ozone', 'PM10', 
                'PM25', 'Days Good',
                'Days Moderate', 'Days Unhealthy'
                )

means = df[list(col_ordered)].groupby(df.index.month).mean()
means.index.name = 'month'
df['month'] = df.index.month

df = df.reset_index().set_index(['weekstart','month'])

df['TAVG'] = df[['TAVG']].fillna(
                            pd.DataFrame(
                            (df['TMAX'] + df['TMIN']) / 2)\
                            .rename(columns={0:'TAVG'})
                                )

df[list(col_ordered)] = df[list(col_ordered)].fillna(means)

# One-hot encoding 'Main Pollutant'
df['Main Pollutant'] = df['Main Pollutant'].astype('category')
df = pd.get_dummies(df)\
    .drop(columns=["Main Pollutant_['Ozone' 'PM2.5']"])

df = df.astype({col: np.float32 for col in df.select_dtypes("number").columns})

epiweek_encoded = cyclical_encoding(df['epiweek'].apply(lambda x: x-1), cycle_length=52)

df = pd.concat([df, epiweek_encoded], axis=1)
df = df.reset_index()\
    .set_index('weekstart')\
    .drop(columns=['month'])\
    .resample('W').first()\
    .fillna(method='ffill')

variables = ['GS_cold', 'GS_cough', 'GS_fever', 'GS_flu', 
            'AWND', 'PRCP','SNOW', 'TAVG','TMAX', 'TMIN',
            'Overall AQI Value', 
            'CO', 'Ozone', 'PM10', 'PM25', 'Days Moderate', 
            'Days Unhealthy', 'visits','Main Pollutant_CO', 
            'Main Pollutant_NO2', 'Main Pollutant_PM2.5']

for v in variables:
    for i in range(1,4):
        df[f'{v}_L{i}'] = df[v].shift(i)

df = df.dropna()

end_train, end_validation, df_train, df_val, df_test = train_test_validate_split(
                                                        df, end_train = date(2015,12,31), 
                                                        end_validation = date(2017,12,31)
                                                                                )

Dates train      : 2005-01-16 00:00:00 --- 2015-12-27 00:00:00  (n=572)
Dates validacion : 2016-01-03 00:00:00 --- 2017-12-31 00:00:00  (n=105)
Dates test       : 2017-12-31 00:00:00 --- 2019-12-29 00:00:00  (n=105)


---

## 1-Week Predictions

In [14]:
# elastic net 1-week
# ==============================================================================

ENF = load_forecaster('ElasticNet_001 hide.py',verbose=False)

elasticnet_backtesting_met1, elasticnet_backtesting_preds1 = backtesting_forecaster(
    forecaster=ENF,
    y=df['cases'],
    exog=df[ENF.exog_col_names],
    initial_train_size=len(df[:end_validation]),
    steps=1,
    fixed_train_size=True,
    metric='mean_absolute_error',
    refit=True,
    verbose=False,
    show_progress=True
) 

100%|██████████| 104/104 [00:12<00:00,  8.39it/s]


In [16]:
# random forest 1-week
# ==============================================================================

RFF = load_forecaster("RandomForest_001 hide.py", verbose=False)

randomforest_backtesting_met1, randomforest_backtesting_preds1 = backtesting_forecaster(
    forecaster=RFF,
    y=df['cases'],
    exog=df[RFF.exog_col_names],
    initial_train_size=len(df[:end_validation]),
    steps=1,
    fixed_train_size=True,
    metric='mean_absolute_error',
    refit=True,
    verbose=False,
    show_progress=True
) 

100%|██████████| 104/104 [07:03<00:00,  4.08s/it]


In [18]:
# sarimax 1-week
# ==============================================================================

SMF = load_forecaster('SARIMAX_001 hide.py',verbose=False)

sarimax_backtesting_met1, sarimax_backtesting_preds1 = backtesting_sarimax(
    forecaster=SMF,
    y=df['cases'],
    exog=df[SMF.exog_col_names],
    initial_train_size=len(df[:end_validation]),
    steps=1,
    fixed_train_size=True,
    metric='mean_absolute_error',
    refit=True,
    verbose=False,
    show_progress=True
) 

100%|██████████| 104/104 [3:57:05<00:00, 136.78s/it]


In [194]:
# seasonal naive 1-week
# ==============================================================================

base = load_forecaster('baseline_001 hide.py',verbose=False)

mb1, baseline_preds1 = backtesting_forecaster(
    forecaster=base,
    y=df['cases'],
    initial_train_size=len(df_train),
    refit=True,
    steps=1,
    verbose=False,
    metric='mean_absolute_error',
)

100%|██████████| 209/209 [00:00<00:00, 503.87it/s]

255.94498





In [205]:
# saving 1-week predictions
# ==============================================================================

elasticnet_backtesting_preds1.to_csv('elasticnetbacktestingpreds1.csv')
sarimax_backtesting_preds1.to_csv('sarimxbacktestingpreds1.csv')
randomforest_backtesting_preds1.to_csv('randomforestbacktestingpreds1.csv')
baseline_preds1.to_csv('baseline_preds1.csv')

---

## 2-Week Predictions

In [None]:
# elastic net 2-week
# ==============================================================================

ENF = load_forecaster('ElasticNet_001 hide.py',verbose=False)

elasticnet_backtesting_met2, elasticnet_backtesting_preds2 = backtesting_forecaster(
    forecaster=ENF,
    y=df['cases'],
    exog=df[ENF.exog_col_names],
    initial_train_size=len(df[:end_validation]),
    steps=2,
    fixed_train_size=True,
    metric='mean_absolute_error',
    refit=True,
    verbose=False,
    show_progress=True
) 

100%|██████████| 52/52 [00:08<00:00,  6.31it/s]


In [None]:
# random forest 2-week
# ==============================================================================

RFF = load_forecaster("RandomForest_001 hide.py", verbose=False)

randomforest_backtesting_met2, randomforest_backtesting_preds2 = backtesting_forecaster(
    forecaster=RFF,
    y=df['cases'],
    exog=df[RFF.exog_col_names],
    initial_train_size=len(df[:end_validation]),
    steps=2,
    fixed_train_size=True,
    metric='mean_absolute_error',
    refit=True,
    verbose=False,
    show_progress=True
) 

100%|██████████| 52/52 [03:06<00:00,  3.59s/it]


In [None]:
# sarimax 2-week
# ==============================================================================

SMF = load_forecaster('SARIMAX_001 hide.py',verbose=False)

sarimax_backtesting_met2, sarimax_backtesting_preds2 = backtesting_sarimax(
    forecaster=SMF,
    y=df['cases'],
    exog=df[SMF.exog_col_names],
    initial_train_size=len(df[:end_validation]),
    steps=2,
    fixed_train_size=True,
    metric='mean_absolute_error',
    refit=True,
    verbose=False,
    show_progress=True
) 

100%|██████████| 52/52 [2:14:13<00:00, 154.88s/it]


In [None]:
# seasonal naive 2-week
# ==============================================================================

base = load_forecaster('baseline_001 hide.py',verbose=False)

mb2, baseline_preds2 = backtesting_forecaster(
    forecaster=base,
    y=df['cases'],
    initial_train_size=len(df_train),
    refit=True,
    steps=2,
    verbose=False,
    metric='mean_absolute_error',
)

100%|██████████| 105/105 [00:00<00:00, 568.05it/s]

255.94498





In [201]:
# saving 2-week predictions
# ==============================================================================

elasticnet_backtesting_preds2.to_csv('elasticnetbacktestingpreds2.csv')
sarimax_backtesting_preds2.to_csv('sarimxbacktestingpreds2.csv')
randomforest_backtesting_preds2.to_csv('randomforestbacktestingpreds2.csv')
baseline_preds2.to_csv('baseline_preds2.csv')