In [1]:
import pandas as pd
import numpy as np
import orbit
import matplotlib.pyplot as plt

from orbit.utils.dataset import load_iclaims
from orbit.diagnostics.plot import plot_predicted_data, plot_predicted_components
from orbit.utils.plot import get_orbit_style
plt.style.use(get_orbit_style())
from orbit.models import ETS, LGT, DLT

from orbit.diagnostics.metrics import smape

In [2]:
# Display the installed Orbit version so the outputs below can be tied to it.
orbit.__version__

'1.1.0dev'

In [3]:
# Enable IPython's autoreload extension (mode 2) so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Data

In [4]:
# Load the iclaims example dataset through Orbit's dataset helper.
# transform=True is used here; transform=False is the alternative to consider
# (presumably it returns the untransformed series -- TODO confirm against load_iclaims).
raw_df = load_iclaims(transform=True)
# Show the dtype of every column as this cell's output.
raw_df.dtypes

week              datetime64[ns]
claims                   float64
trend.unemploy           float64
trend.filling            float64
trend.job                float64
sp500                    float64
vix                      float64
dtype: object

In [5]:
# Work on a copy so that raw_df itself is never modified by later cells.
df = raw_df.copy()

In [6]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0,week,claims,trend.unemploy,trend.filling,trend.job,sp500,vix
0,2010-01-03,13.386595,0.219882,-0.318452,0.1175,-0.417633,0.122654
1,2010-01-10,13.624218,0.219882,-0.194838,0.168794,-0.42548,0.110445
2,2010-01-17,13.398741,0.236143,-0.292477,0.1175,-0.465229,0.532339
3,2010-01-24,13.137549,0.203353,-0.194838,0.106918,-0.481751,0.428645
4,2010-01-31,13.19676,0.13436,-0.242466,0.074483,-0.488929,0.487404


In [7]:
# Hold out the last `test_size` rows as the test set; everything before them
# is used for training.
test_size = 52

train_df = df.iloc[:-test_size]
test_df = df.iloc[-test_size:]

Next, we manually create a copy of the training data in which a few values of the response variable are missing.

In [8]:
# Fix NumPy's global seed so the same rows are selected on every run.
np.random.seed(123)

# Draw 10 distinct row positions from the training frame, then order them
# ascending so they line up with the row order of the frame.
n_train_rows = len(train_df)
na_idx = np.random.choice(np.arange(n_train_rows), size=10, replace=False)
na_idx.sort()
na_idx

array([ 33, 134, 147, 220, 226, 245, 263, 266, 327, 381])

In [9]:
# Copy the training frame so the original (complete) responses stay available
# in train_df.
train_df_na = train_df.copy()

# Blank out the response at the sampled row positions. The column is located
# by name instead of a hard-coded position, so the cell keeps targeting
# 'claims' even if the column order of the dataset changes.
claims_pos = train_df_na.columns.get_loc('claims')
train_df_na.iloc[na_idx, claims_pos] = np.nan

## ETS

In [10]:
# ETS model on the weekly 'claims' series with a 52-period seasonality,
# estimated with the 'stan-mcmc' estimator; the seed is fixed for repeatability.
ets = ETS(
    response_col='claims',
    date_col='week',
    estimator='stan-mcmc',
    seasonality=52,
    seed=2020,
)

In [11]:
# Fit ETS on the training frame that contains the missing responses.
ets.fit(df=train_df_na)



<orbit.forecaster.full_bayes.FullBayesianForecaster at 0x140d37090>

In [12]:
# Predict over the training period, then show only the rows whose response
# was blanked out, i.e. the values the model filled in.
predicted_df = ets.predict(df=train_df_na)
predicted_df.iloc[na_idx]

Unnamed: 0,week,prediction_5,prediction,prediction_95
33,2010-08-22,12.684338,12.8567,13.052488
134,2012-07-29,12.572014,12.679722,12.790795
147,2012-10-28,12.672589,12.770397,12.877641
220,2014-03-23,12.516778,12.617131,12.728789
226,2014-05-04,12.481102,12.605834,12.73101
245,2014-09-14,12.366648,12.495904,12.610347
263,2015-01-18,12.671526,12.797576,12.924648
266,2015-02-08,12.502506,12.614119,12.747006
327,2016-04-10,12.447682,12.553162,12.675651
381,2017-04-23,12.215289,12.320035,12.400624


In [13]:
# SMAPE between the training responses and the ETS predictions.
# NOTE(review): train_df_na['claims'] is NaN at na_idx; the finite result
# suggests smape ignores those rows -- confirm. To score the imputed values
# themselves, compare against train_df['claims'] at na_idx instead.
smape(
    train_df_na['claims'].to_numpy(),
    predicted_df['prediction'].to_numpy(),
)

0.002865783307036313

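Note that the very first value of the response variable cannot be missing, because it serves as the starting point of the time series fit. Trying to fit a dataset whose first response is missing raises a `DataInputException`, as the next cell demonstrates.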

In [14]:
# Build a second training frame in which the very first response (row 0) is
# missing in addition to the previously sampled rows.
na_idx2 = list(na_idx) + [0]
train_df_na2 = train_df.copy()
# Locate the response column by name rather than by a hard-coded position.
train_df_na2.iloc[na_idx2, train_df_na2.columns.get_loc('claims')] = np.nan

# Fitting is expected to fail here. The error is the point of this cell, so it
# is caught and displayed instead of being left uncaught, which would stop
# "Restart & Run All" before the LGT and DLT sections are reached.
try:
    ets.fit(train_df_na2)
except orbit.exceptions.DataInputException as err:
    print(f"{type(err).__name__}: {err}")

  ss[idx] = np.nanmean(adjusted_response[idx::self._seasonality])


DataInputException: The first value of response column claims cannot be missing..

## LGT

In [15]:
# LGT model on the weekly 'claims' series with a 52-period seasonality,
# estimated with the 'stan-mcmc' estimator; the seed is fixed for repeatability.
lgt = LGT(
    response_col='claims',
    date_col='week',
    seasonality=52,
    estimator='stan-mcmc',
    seed=8888,
)

In [16]:
# Fit LGT on the training frame that contains the missing responses.
lgt.fit(df=train_df_na)

To run all diagnostics call pystan.check_hmc_diagnostics(fit)


<orbit.forecaster.full_bayes.FullBayesianForecaster at 0x141078dd0>

In [17]:
# Predict over the training period with LGT, then show only the rows whose
# response was blanked out.
predicted_df = lgt.predict(df=train_df_na)
predicted_df.iloc[na_idx]

Unnamed: 0,week,prediction_5,prediction,prediction_95
33,2010-08-22,12.750985,12.871307,12.981723
134,2012-07-29,12.534918,12.646589,12.767774
147,2012-10-28,12.621442,12.734111,12.870138
220,2014-03-23,12.482266,12.571192,12.696406
226,2014-05-04,12.468498,12.572705,12.685813
245,2014-09-14,12.353079,12.479061,12.553327
263,2015-01-18,12.68267,12.769886,12.876856
266,2015-02-08,12.497793,12.61228,12.718102
327,2016-04-10,12.456949,12.535072,12.658977
381,2017-04-23,12.223128,12.313617,12.405959


## DLT

In [18]:
# DLT model on the weekly 'claims' series with a 52-period seasonality,
# estimated with the 'stan-mcmc' estimator; the seed is fixed for repeatability.
dlt = DLT(
    response_col='claims',
    date_col='week',
    seasonality=52,
    estimator='stan-mcmc',
    seed=8888,
)

In [19]:
# Fit DLT on the training frame that contains the missing responses.
dlt.fit(df=train_df_na)

To run all diagnostics call pystan.check_hmc_diagnostics(fit)


<orbit.forecaster.full_bayes.FullBayesianForecaster at 0x1427d4490>

In [20]:
# Predict over the training period with DLT, then show only the rows whose
# response was blanked out.
predicted_df = dlt.predict(df=train_df_na)
predicted_df.iloc[na_idx]

Unnamed: 0,week,prediction_5,prediction,prediction_95
33,2010-08-22,12.750169,12.858053,13.007971
134,2012-07-29,12.530594,12.645414,12.764947
147,2012-10-28,12.635962,12.737997,12.854073
220,2014-03-23,12.478771,12.571726,12.696811
226,2014-05-04,12.478803,12.568907,12.704323
245,2014-09-14,12.37048,12.478977,12.551105
263,2015-01-18,12.676262,12.753837,12.860741
266,2015-02-08,12.482016,12.602319,12.710764
327,2016-04-10,12.448289,12.528239,12.653225
381,2017-04-23,12.228236,12.315597,12.405317
