Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
tgsmith61591 committed Jun 10, 2017
2 parents 1545afc + 8bcd987 commit 97242a6
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 15 deletions.
2 changes: 1 addition & 1 deletion pyramid/__init__.py
Expand Up @@ -4,7 +4,7 @@
#
# The pyramid module

__version__ = '0.2-alpha'
__version__ = '0.3'

try:
# this var is injected in the setup build to enable
Expand Down
54 changes: 51 additions & 3 deletions pyramid/arima/arima.py
Expand Up @@ -8,22 +8,30 @@
from __future__ import print_function, absolute_import, division

from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_array, check_is_fitted
from sklearn.utils.validation import check_array, check_is_fitted, column_or_1d
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.utils.metaestimators import if_delegate_has_method
from statsmodels.tsa.arima_model import ARIMA as _ARIMA
from statsmodels.tsa.base.tsa_model import TimeSeriesModelResults
from statsmodels import api as sm
import numpy as np
import datetime
import warnings
import os

# DTYPE for arrays
from ..compat.numpy import DTYPE
from ..utils import get_callable

__all__ = [
'ARIMA'
]

# Maps each accepted ``scoring`` name to the sklearn metric that is used
# to score the held-out tail of the series when ``out_of_sample_size`` > 0.
VALID_SCORING = {
'mse': mean_squared_error,
'mae': mean_absolute_error
}


class ARIMA(BaseEstimator):
"""An ARIMA, or autoregressive integrated moving average, is a generalization of an autoregressive
Expand Down Expand Up @@ -123,6 +131,17 @@ class ARIMA(BaseEstimator):
Many warnings might be thrown inside of statsmodels. If ``suppress_warnings``
is True, all of these warnings will be squelched.
out_of_sample_size : int, optional (default=0)
The number of examples from the tail of the time series to use as validation
examples.
scoring : str, optional (default='mse')
If performing validation (i.e., if ``out_of_sample_size`` > 0), the metric
to use for scoring the out-of-sample data. One of {'mse', 'mae'}.
scoring_args : dict, optional (default=None)
A dictionary of keyword arguments to be passed to the ``scoring`` metric.
Notes
-----
Expand All @@ -141,7 +160,8 @@ class ARIMA(BaseEstimator):
"""
def __init__(self, order, seasonal_order=None, start_params=None, trend='c',
method=None, transparams=True, solver='lbfgs', maxiter=50,
disp=0, callback=None, suppress_warnings=False):
disp=0, callback=None, suppress_warnings=False, out_of_sample_size=0,
scoring='mse', scoring_args=None):
super(ARIMA, self).__init__()

self.order = order
Expand All @@ -155,6 +175,9 @@ def __init__(self, order, seasonal_order=None, start_params=None, trend='c',
self.disp = disp
self.callback = callback
self.suppress_warnings = suppress_warnings
self.out_of_sample_size = out_of_sample_size
self.scoring = scoring
self.scoring_args = dict() if not scoring_args else scoring_args

def fit(self, y, exogenous=None, **fit_args):
"""Fit an ARIMA to a vector, ``y``, of observations with an
Expand All @@ -171,13 +194,19 @@ def fit(self, y, exogenous=None, **fit_args):
include a constant or trend. If provided, these variables are
used as additional features in the regression operation.
"""
y = check_array(y, ensure_2d=False, force_all_finite=False, copy=True, dtype=DTYPE)
y = column_or_1d(check_array(y, ensure_2d=False, force_all_finite=False, copy=True, dtype=DTYPE))
n_samples = y.shape[0]

# if exog was included, check the array...
if exogenous is not None:
exogenous = check_array(exogenous, ensure_2d=True, force_all_finite=False,
copy=False, dtype=DTYPE)

# determine the CV args, if any
cv = self.out_of_sample_size
scoring = get_callable(self.scoring, VALID_SCORING)
cv = max(min(cv, n_samples), 0) # don't allow negative, don't allow > n_samples

def _fit_wrapper():
# these might change depending on which one
method = self.method
Expand Down Expand Up @@ -227,6 +256,14 @@ def _fit_wrapper():
# if the model is fit with an exogenous array, it must be predicted with one as well.
self.fit_with_exog_ = exogenous is not None

# now make a prediction if we're validating to save the out-of-sample value
if cv > 0:
# get the predictions
pred = self.arima_res_.predict(exog=exogenous, typ='linear')[-cv:]
self.oob_ = scoring(y[-cv:], pred, **self.scoring_args)
else:
self.oob_ = np.nan

return self

def predict(self, n_periods=10, exogenous=None):
Expand Down Expand Up @@ -500,6 +537,17 @@ def maroots(self):
"""
return self.arima_res_.maroots

def oob(self):
    """Return the validation score that was stored while fitting.

    When the model is fit with ``out_of_sample_size`` > 0, the tail of the
    series is scored with the configured ``scoring`` metric and the result
    is kept on the estimator; otherwise ``np.nan`` is stored instead.

    Returns
    -------
    oob_ : float
        The "out-of-bag" score.
    """
    score = self.oob_
    return score

@if_delegate_has_method('arima_res_')
def params(self):
"""Get the parameters of the model. The order of variables is the trend
Expand Down
41 changes: 30 additions & 11 deletions pyramid/arima/auto.py
Expand Up @@ -24,7 +24,7 @@
]

# The valid information criteria
VALID_CRITERIA = {'aic', 'bic', 'hqic'}
VALID_CRITERIA = {'aic', 'bic', 'hqic', 'oob'}


def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2, max_q=5,
Expand All @@ -33,7 +33,8 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2
seasonal_test='ch', n_jobs=1, start_params=None, trend='c', method=None, transparams=True,
solver='lbfgs', maxiter=50, disp=0, callback=None, offset_test_args=None, seasonal_test_args=None,
suppress_warnings=False, error_action='warn', trace=False, random=False, random_state=None,
n_fits=10, return_valid_fits=False, **fit_args):
n_fits=10, return_valid_fits=False, out_of_sample_size=0, scoring='mse', scoring_args=None,
**fit_args):
"""The ``auto_arima`` function seeks to identify the optimal parameters for an ``ARIMA`` model,
and returns a fitted ARIMA model. This function is based on the commonly-used R function,
``forecast::auto.arima`` [3].
Expand All @@ -45,8 +46,9 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2
conducting the Canova-Hansen to determine the optimal order of seasonal differencing, ``D``.
In order to find the best model, ``auto_arima`` optimizes for a given ``information_criterion``, one of
{'aic', 'bic', 'hqic'} (Akaike Information Criterion, Bayesian Information Criterion or Hannan-Quinn
Information Criterion, respectively) and returns the ARIMA which minimizes the value.
{'aic', 'bic', 'hqic', 'oob'} (Akaike Information Criterion, Bayesian Information Criterion, Hannan-Quinn
Information Criterion, or "out of bag"--for validation scoring--respectively) and returns the ARIMA which
minimizes the value.
Note that due to stationarity issues, ``auto_arima`` might not find a suitable model that will converge. If this
is the case, a ``ValueError`` will be thrown suggesting stationarity-inducing measures be taken prior
Expand Down Expand Up @@ -127,8 +129,7 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2
information_criterion : str, optional (default='aic')
The information criterion used to select the best ARIMA model. One of
``pyramid.arima.auto_arima.VALID_CRITERIA``, ('aic', 'bic'). Note that if
n_samples <= 3, AIC will be used.
``pyramid.arima.auto_arima.VALID_CRITERIA``, ('aic', 'bic', 'hqic', 'oob').
alpha : float, optional (default=0.05)
Level of the test for testing significance.
Expand Down Expand Up @@ -224,6 +225,17 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2
If True, will return all valid ARIMA fits. If False (by default), will only
return the best fit.
out_of_sample_size : int, optional (default=0)
The number of examples from the tail of the time series to use as validation
examples.
scoring : str, optional (default='mse')
If performing validation (i.e., if ``out_of_sample_size`` > 0), the metric
to use for scoring the out-of-sample data. One of {'mse', 'mae'}.
scoring_args : dict, optional (default=None)
A dictionary of keyword arguments to be passed to the ``scoring`` metric.
**fit_args : dict, optional (default=None)
A dictionary of keyword arguments to pass to the :func:`ARIMA.fit` method.
Expand Down Expand Up @@ -282,7 +294,9 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2
transparams=transparams, solver=solver, maxiter=maxiter,
disp=disp, callback=callback, fit_params=fit_args,
suppress_warnings=suppress_warnings, trace=trace,
error_action=error_action)),
error_action=error_action, scoring=scoring,
out_of_sample_size=out_of_sample_size,
scoring_args=scoring_args)),
return_valid_fits)

# test ic, and use AIC if n <= 3
Expand Down Expand Up @@ -396,7 +410,9 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2
transparams=transparams, solver=solver, maxiter=maxiter,
disp=disp, callback=callback, fit_params=fit_args,
suppress_warnings=suppress_warnings, trace=trace,
error_action=error_action)),
error_action=error_action, scoring=scoring,
out_of_sample_size=out_of_sample_size,
scoring_args=scoring_args)),
return_valid_fits)

# seasonality issues
Expand Down Expand Up @@ -442,7 +458,8 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2
start_params=start_params, trend=trend, method=method, transparams=transparams,
solver=solver, maxiter=maxiter, disp=disp, callback=callback,
fit_params=fit_args, suppress_warnings=suppress_warnings,
trace=trace, error_action=error_action)
trace=trace, error_action=error_action, out_of_sample_size=out_of_sample_size,
scoring=scoring, scoring_args=scoring_args)
for order, seasonal_order in gen)

# filter the non-successful ones
Expand All @@ -461,12 +478,14 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2

def _fit_arima(x, xreg, order, seasonal_order, start_params, trend, method, transparams,
solver, maxiter, disp, callback, fit_params, suppress_warnings, trace,
error_action):
error_action, out_of_sample_size, scoring, scoring_args):
try:
fit = ARIMA(order=order, seasonal_order=seasonal_order, start_params=start_params,
trend=trend, method=method, transparams=transparams,
solver=solver, maxiter=maxiter, disp=disp,
callback=callback, suppress_warnings=suppress_warnings)\
callback=callback, suppress_warnings=suppress_warnings,
out_of_sample_size=out_of_sample_size, scoring=scoring,
scoring_args=scoring_args)\
.fit(x, exogenous=xreg, **fit_params)

# for non-stationarity errors, return None
Expand Down
24 changes: 24 additions & 0 deletions pyramid/arima/tests/test_arima.py
Expand Up @@ -101,6 +101,21 @@ def test_basic_arima():
assert_array_almost_equal(preds, expected_preds)


def test_with_oob():
    """Check that ``oob()`` reflects whether a validation tail was requested."""
    order = (2, 1, 2)

    # a positive out_of_sample_size must produce a real (non-NaN) score
    model = ARIMA(order=order, suppress_warnings=True, out_of_sample_size=10)
    model = model.fit(y=hr)
    assert not np.isnan(model.oob())  # show this works

    # a negative out_of_sample_size still fits, but the score stays NaN
    model = ARIMA(order=order, suppress_warnings=True, out_of_sample_size=-1)
    model = model.fit(y=hr)
    assert np.isnan(model.oob())

    # same as the first case, this time with a random exogenous array
    model = ARIMA(order=order, suppress_warnings=True, out_of_sample_size=10)
    exog = rs.rand(hr.shape[0], 4)
    model = model.fit(y=hr, exogenous=exog)
    assert not np.isnan(model.oob())


def _try_get_attrs(arima):
# show we can get all these attrs without getting an error
attrs = {
Expand Down Expand Up @@ -294,6 +309,15 @@ def test_with_seasonality6():
# FIXME: we get an IndexError from statsmodels summary if (0, 0, 0)


def test_with_seasonality7():
    """Smoke test: a seasonal random search selected on the 'oob' criterion."""
    # the fitted model is not inspected; the call only has to succeed
    auto_arima(wineind,
               start_p=1, start_q=1, max_p=2, max_q=2,
               m=12, start_P=0, seasonal=True,
               n_jobs=1, d=1, D=1,
               out_of_sample_size=10, information_criterion='oob',
               # do raise so it fails fast
               suppress_warnings=True, error_action='raise',
               random=True, random_state=42, n_fits=3)


def test_corner_cases():
assert_raises(ValueError, auto_arima, wineind, error_action='some-bad-string')

Expand Down

0 comments on commit 97242a6

Please sign in to comment.