# Model Selection with WBIC

In [2]:
from datetime import timedelta

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import orbit
print(orbit.__version__)
from orbit.models import DLT
from orbit.utils.simulation import make_trend, make_regression

1.1.0dev


In [3]:
# Show the installed orbit version as this cell's output.
orbit.__version__

'1.1.0dev'

In [4]:
# Load the autoreload extension and enable mode 2 for this session.
%load_ext autoreload
%autoreload 2

Generate a regression problem with a trend and `8` regressors, of which only `3` are effective. First, generate the `3` effective regressors.

In [5]:
# problem dimensions and the seed used to draw the true coefficients
NUM_OF_REGRESSORS = 8
NUM_OF_EFFECTIVE_REGRESSORS = 3
SERIES_LEN = 100
SEED = 20210101

# draw one true coefficient per effective regressor, uniformly from [-1, 1)
COEFS = np.random.default_rng(SEED).uniform(
    low=-1,
    high=1,
    size=NUM_OF_EFFECTIVE_REGRESSORS,
)

# simulate the trend component, then the regressors and their combined effect
trend = make_trend(SERIES_LEN, rw_loc=0.01, rw_scale=0.1)
x, regression, coefs = make_regression(series_len=SERIES_LEN, coefs=COEFS)

# the response is the sum of the trend and the regression component
y = trend + regression

print(regression.shape, x.shape)

(100,) (100, 3)


Next, we add `5` irrelevant regressors to the dataset to make selecting the best model more challenging.

In [6]:
# Draw the irrelevant regressors from a seeded generator so the notebook is
# reproducible (SEED + 1 keeps this stream distinct from the one used for COEFS).
num_of_extra_regressors = NUM_OF_REGRESSORS - NUM_OF_EFFECTIVE_REGRESSORS
x_extra = np.random.default_rng(SEED + 1).normal(0, 1, (SERIES_LEN, num_of_extra_regressors))
# Keep only the effective columns before appending, so re-running this cell
# does not keep widening x.
x = np.concatenate([x[:, :NUM_OF_EFFECTIVE_REGRESSORS], x_extra], axis=-1)
print(x.shape)

(100, 8)


In [7]:
# regressor column names x1, x2, ...; the loop variable is named i so it does
# not shadow the regressor matrix x
x_cols = [f"x{i}" for i in range(1, NUM_OF_REGRESSORS + 1)]
response_col = "y"
dt_col = "date"
# response goes in the first column, regressors in the remaining ones
obs_matrix = np.concatenate([y.reshape(-1, 1), x], axis=1)
# make a data frame for orbit inputs
df = pd.DataFrame(obs_matrix, columns=[response_col] + x_cols)
# make some dummy date stamp
dt = pd.date_range(start='2016-01-04', periods=SERIES_LEN, freq="1W")
# store the dates under dt_col so the column name is defined in one place only
df[dt_col] = dt
df.shape

(100, 10)

In [8]:
 # Select the first four regressor columns (slice end is 3 + 1 = 4).
 # NOTE(review): this looks like the idx = 3 iteration of the model-selection
 # loop further down, run by hand — confirm whether this scratch cell should stay.
 regressor_col = x_cols[:3 + 1]


In [11]:
dlt_mod = DLT(
    response_col=response_col,
    date_col=dt_col,
    regressor_col=regressor_col,
    seed=2020,
    # fixing the smoothing parameters to learn regression coefficients more effectively
    level_sm_input=0.01,
    slope_sm_input=0.01,
    num_warmup=4000,
    num_sample=4000,
)

# WBIC is computed from draws taken at temperature log(n), where n is the
# number of observations; derive n from the data instead of hard-coding 100.0.
dlt_mod.fit(df=df, sampling_temperature=np.log(len(df)))


Gradient evaluation took 0.000301 seconds
1000 transitions using 10 leapfrog steps per transition would take 3.01 seconds.
Adjust your expectations accordingly!


Iteration:    1 / 2000 [  0%]  (Warmup)

Gradient evaluation took 0.000238 seconds
1000 transitions using 10 leapfrog steps per transition would take 2.38 seconds.
Adjust your expectations accordingly!



Gradient evaluation took 0.000247 seconds
1000 transitions using 10 leapfrog steps per transition would take 2.47 seconds.
Adjust your expectations accordingly!


Iteration:    1 / 2000 [  0%]  (Warmup)

Gradient evaluation took 0.000243 seconds
1000 transitions using 10 leapfrog steps per transition would take 2.43 seconds.
Adjust your expectations accordingly!


Iteration:    1 / 2000 [  0%]  (Warmup)
Iteration:    1 / 2000 [  0%]  (Warmup)
Iteration:  200 / 2000 [ 10%]  (Warmup)
Iteration:  200 / 2000 [ 10%]  (Warmup)
Iteration:  400 / 2000 [ 20%]  (Warmup)
Iteration:  400 / 2000 [ 20%]  (Warmup)
Iteration:  600 / 2000 [



<orbit.forecaster.full_bayes.FullBayesianForecaster at 0x13887ba90>

In [13]:
# Ask the fitted model for its WBIC and display the value as the cell output.
A = dlt_mod.get_WBIC()
A

-165.74165349090822

In [None]:
# NOTE(review): looks like a leftover post-mortem debugging cell — confirm and
# remove before sharing, since it can block a Restart & Run All.
%debug

In [None]:
# Recorded error message, kept for reference (commented out because it is not executable code):
# ForecasterException: Model class: <class 'orbit.template.dlt.DLTModel'> is incompatible with
#         Estimator: <class 'orbit.estimators.stan_estimator.StanEstimatorMCMC'>.
#             Estimator Support: [<class 'orbit.estimators.stan_estimator.StanEstimatorMAP'>,
#                                 <class 'orbit.estimators.stan_estimator.StanEstimatorMCMC'>
#                                 <class 'orbit.estimators.stan_estimator.StanEstimatorMCMC'>]

Now we can calculate the WBIC for each candidate model and compare the values across models.

In [None]:
%%time
wbics = np.empty(NUM_OF_REGRESSORS)

for idx in range(NUM_OF_REGRESSORS):
    regressor_col = x_cols[:idx + 1]

    dlt_mod = DLT(
        response_col=response_col,
        date_col=dt_col,
        regressor_col=regressor_col,
        seed=2020,
        # fixing the smoothing parameters to learn regression coefficients more effectively
        level_sm_input=0.01,
        slope_sm_input=0.01,
        num_warmup=4000,
        num_sample=4000,
    )
    dlt_mod.fit(df=df)
    wbic = dlt.get_training_metrics()['WBIC']
    print('Regressors:{} WBIC:{:.5f}'.format(regressor_col, wbic))
    wbics[idx] = wbic

We plot the WBIC against the number of regressors included. As we can see, the WBIC is lowest when the included regressors match the true set exactly.

In [None]:
# WBIC of each candidate model against the number of regressors it includes
fig, ax = plt.subplots(1, 1, figsize=(16, 8))
ax.plot(np.arange(1, NUM_OF_REGRESSORS + 1), wbics, color='dodgerblue', label='WBICs')
# mark the true number of effective regressors from the config constant
# instead of a hard-coded 3
ax.axvline(x=NUM_OF_EFFECTIVE_REGRESSORS, linestyle='--', color='orange', label='truth')
ax.set_xlabel('Number of Regressors')
ax.set_ylabel('WBIC')
ax.set_title('Model Selection with WBIC')
# trailing semicolon suppresses the Legend repr in the cell output
fig.legend();