Merge branch 'develop'

alkaline-ml · Jun 10, 2017 · 97242a6 · 97242a6
2 parents 1545afc + 8bcd987
commit 97242a6
Show file tree

Hide file tree

Showing 4 changed files with 106 additions and 15 deletions.
diff --git a/pyramid/__init__.py b/pyramid/__init__.py
@@ -4,7 +4,7 @@
 #
 # The pyramid module
 
-__version__ = '0.2-alpha'
+__version__ = '0.3'
 
 try:
     # this var is injected in the setup build to enable

diff --git a/pyramid/arima/arima.py b/pyramid/arima/arima.py
@@ -8,22 +8,30 @@
 from __future__ import print_function, absolute_import, division
 
 from sklearn.base import BaseEstimator
-from sklearn.utils.validation import check_array, check_is_fitted
+from sklearn.utils.validation import check_array, check_is_fitted, column_or_1d
+from sklearn.metrics import mean_absolute_error, mean_squared_error
 from sklearn.utils.metaestimators import if_delegate_has_method
 from statsmodels.tsa.arima_model import ARIMA as _ARIMA
 from statsmodels.tsa.base.tsa_model import TimeSeriesModelResults
 from statsmodels import api as sm
+import numpy as np
 import datetime
 import warnings
 import os
 
 # DTYPE for arrays
 from ..compat.numpy import DTYPE
+from ..utils import get_callable
 
 __all__ = [
     'ARIMA'
 ]
 
+VALID_SCORING = {
+    'mse': mean_squared_error,
+    'mae': mean_absolute_error
+}
+
 
 class ARIMA(BaseEstimator):
     """An ARIMA, or autoregressive integrated moving average, is a generalization of an autoregressive
@@ -123,6 +131,17 @@ class ARIMA(BaseEstimator):
         Many warnings might be thrown inside of statsmodels. If ``suppress_warnings``
         is True, all of these warnings will be squelched.
 
+    out_of_sample_size : int, optional (default=0)
+        The number of examples from the tail of the time series to use as validation
+        examples. Values outside the range [0, n_samples] are clipped to that range.
+
+    scoring : str, optional (default='mse')
+        If performing validation (i.e., if ``out_of_sample_size`` > 0), the metric
+        to use for scoring the out-of-sample data. One of {'mse', 'mae'}
+
+    scoring_args : dict, optional (default=None)
+        A dictionary of key-word arguments to be passed to the ``scoring`` metric.
+
 
     Notes
     -----
@@ -141,7 +160,8 @@ class ARIMA(BaseEstimator):
     """
     def __init__(self, order, seasonal_order=None, start_params=None, trend='c',
                  method=None, transparams=True, solver='lbfgs', maxiter=50,
-                 disp=0, callback=None, suppress_warnings=False):
+                 disp=0, callback=None, suppress_warnings=False, out_of_sample_size=0,
+                 scoring='mse', scoring_args=None):
         super(ARIMA, self).__init__()
 
         self.order = order
@@ -155,6 +175,9 @@ def __init__(self, order, seasonal_order=None, start_params=None, trend='c',
         self.disp = disp
         self.callback = callback
         self.suppress_warnings = suppress_warnings
+        self.out_of_sample_size = out_of_sample_size
+        self.scoring = scoring
+        self.scoring_args = dict() if not scoring_args else scoring_args
 
     def fit(self, y, exogenous=None, **fit_args):
         """Fit an ARIMA to a vector, ``y``, of observations with an
@@ -171,13 +194,19 @@ def fit(self, y, exogenous=None, **fit_args):
             include a constant or trend. If provided, these variables are
             used as additional features in the regression operation.
         """
-        y = check_array(y, ensure_2d=False, force_all_finite=False, copy=True, dtype=DTYPE)
+        y = column_or_1d(check_array(y, ensure_2d=False, force_all_finite=False, copy=True, dtype=DTYPE))
+        n_samples = y.shape[0]
 
         # if exog was included, check the array...
         if exogenous is not None:
             exogenous = check_array(exogenous, ensure_2d=True, force_all_finite=False,
                                     copy=False, dtype=DTYPE)
 
+        # determine the CV args, if any
+        cv = self.out_of_sample_size
+        scoring = get_callable(self.scoring, VALID_SCORING)
+        cv = max(min(cv, n_samples), 0)  # don't allow negative, don't allow > n_samples
+
         def _fit_wrapper():
             # these might change depending on which one
             method = self.method
@@ -227,6 +256,14 @@ def _fit_wrapper():
         # if the model is fit with an exogenous array, it must be predicted with one as well.
         self.fit_with_exog_ = exogenous is not None
 
+        # now make a prediction if we're validating to save the out-of-sample value
+        if cv > 0:
+            # get the predictions
+            pred = self.arima_res_.predict(exog=exogenous, typ='linear')[-cv:]
+            self.oob_ = scoring(y[-cv:], pred, **self.scoring_args)
+        else:
+            self.oob_ = np.nan
+
         return self
 
     def predict(self, n_periods=10, exogenous=None):
@@ -500,6 +537,17 @@ def maroots(self):
         """
         return self.arima_res_.maroots
 
+    def oob(self):
+        """If the model was fit with ``out_of_sample_size`` > 0, return the validation
+        score computed during ``fit``. Otherwise, the returned value is ``np.nan``.
+
+        Returns
+        -------
+        oob_ : float
+            The "out-of-bag" score.
+        """
+        return self.oob_
+
     @if_delegate_has_method('arima_res_')
     def params(self):
         """Get the parameters of the model. The order of variables is the trend

diff --git a/pyramid/arima/auto.py b/pyramid/arima/auto.py
@@ -24,7 +24,7 @@
 ]
 
 # The valid information criteria
-VALID_CRITERIA = {'aic', 'bic', 'hqic'}
+VALID_CRITERIA = {'aic', 'bic', 'hqic', 'oob'}
 
 
 def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2, max_q=5,
@@ -33,7 +33,8 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2
                seasonal_test='ch', n_jobs=1, start_params=None, trend='c', method=None, transparams=True,
                solver='lbfgs', maxiter=50, disp=0, callback=None, offset_test_args=None, seasonal_test_args=None,
                suppress_warnings=False, error_action='warn', trace=False, random=False, random_state=None,
-               n_fits=10, return_valid_fits=False, **fit_args):
+               n_fits=10, return_valid_fits=False, out_of_sample_size=0, scoring='mse', scoring_args=None,
+               **fit_args):
     """The ``auto_arima`` function seeks to identify the most optimal parameters for an ``ARIMA`` model,
     and returns a fitted ARIMA model. This function is based on the commonly-used R function,
     `forecase::auto.arima``[3].
@@ -45,8 +46,9 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2
     conducting the Canova-Hansen to determine the optimal order of seasonal differencing, ``D``.
 
     In order to find the best model, ``auto_arima`` optimizes for a given ``information_criterion``, one of
-    {'aic', 'bic', 'hqic'} (Akaine Information Criterion, Bayesian Information Criterion or Hannan-Quinn
-    Information Criterion, respectively) and returns the ARIMA which minimizes the value.
+    {'aic', 'bic', 'hqic', 'oob'} (Akaike Information Criterion, Bayesian Information Criterion, Hannan-Quinn
+    Information Criterion, or "out of bag"--for validation scoring--respectively) and returns the ARIMA which
+    minimizes the value.
 
     Note that due to stationarity issues, ``auto_arima`` might not find a suitable model that will converge. If this
     is the case, a ``ValueError`` will be thrown suggesting stationarity-inducing measures be taken prior
@@ -127,8 +129,7 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2
 
     information_criterion : str, optional (default='aic')
         The information criterion used to select the best ARIMA model. One of
-        ``pyramid.arima.auto_arima.VALID_CRITERIA``, ('aic', 'bic'). Note that if
-        n_samples <= 3, AIC will be used.
+        ``pyramid.arima.auto.VALID_CRITERIA``, ('aic', 'bic', 'hqic', 'oob').
 
     alpha : float, optional (default=0.05)
         Level of the test for testing significance.
@@ -224,6 +225,17 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2
         If True, will return all valid ARIMA fits. If False (by default), will only
         return the best fit.
 
+    out_of_sample_size : int, optional (default=0)
+        The number of examples from the tail of the time series to use as validation
+        examples.
+
+    scoring : str, optional (default='mse')
+        If performing validation (i.e., if ``out_of_sample_size`` > 0), the metric
+        to use for scoring the out-of-sample data. One of {'mse', 'mae'}
+
+    scoring_args : dict, optional (default=None)
+        A dictionary of key-word arguments to be passed to the ``scoring`` metric.
+
     **fit_args : dict, optional (default=None)
         A dictionary of keyword arguments to pass to the :func:`ARIMA.fit` method.
 
@@ -282,7 +294,9 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2
                        transparams=transparams, solver=solver, maxiter=maxiter,
                        disp=disp, callback=callback, fit_params=fit_args,
                        suppress_warnings=suppress_warnings, trace=trace,
-                       error_action=error_action)),
+                       error_action=error_action, scoring=scoring,
+                       out_of_sample_size=out_of_sample_size,
+                       scoring_args=scoring_args)),
             return_valid_fits)
 
     # test ic, and use AIC if n <= 3
@@ -396,7 +410,9 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2
                                        transparams=transparams, solver=solver, maxiter=maxiter,
                                        disp=disp, callback=callback, fit_params=fit_args,
                                        suppress_warnings=suppress_warnings, trace=trace,
-                                       error_action=error_action)),
+                                       error_action=error_action, scoring=scoring,
+                                       out_of_sample_size=out_of_sample_size,
+                                       scoring_args=scoring_args)),
             return_valid_fits)
 
     # seasonality issues
@@ -442,7 +458,8 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2
                             start_params=start_params, trend=trend, method=method, transparams=transparams,
                             solver=solver, maxiter=maxiter, disp=disp, callback=callback,
                             fit_params=fit_args, suppress_warnings=suppress_warnings,
-                            trace=trace, error_action=error_action)
+                            trace=trace, error_action=error_action, out_of_sample_size=out_of_sample_size,
+                            scoring=scoring, scoring_args=scoring_args)
         for order, seasonal_order in gen)
 
     # filter the non-successful ones
@@ -461,12 +478,14 @@ def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2
 
 def _fit_arima(x, xreg, order, seasonal_order, start_params, trend, method, transparams,
                solver, maxiter, disp, callback, fit_params, suppress_warnings, trace,
-               error_action):
+               error_action, out_of_sample_size, scoring, scoring_args):
     try:
         fit = ARIMA(order=order, seasonal_order=seasonal_order, start_params=start_params,
                     trend=trend, method=method, transparams=transparams,
                     solver=solver, maxiter=maxiter, disp=disp,
-                    callback=callback, suppress_warnings=suppress_warnings)\
+                    callback=callback, suppress_warnings=suppress_warnings,
+                    out_of_sample_size=out_of_sample_size, scoring=scoring,
+                    scoring_args=scoring_args)\
             .fit(x, exogenous=xreg, **fit_params)
 
     # for non-stationarity errors, return None

diff --git a/pyramid/arima/tests/test_arima.py b/pyramid/arima/tests/test_arima.py
@@ -101,6 +101,21 @@ def test_basic_arima():
     assert_array_almost_equal(preds, expected_preds)
 
 
+def test_with_oob():
+    # show we can fit with CV (kinda)
+    arima = ARIMA(order=(2, 1, 2), suppress_warnings=True, out_of_sample_size=10).fit(y=hr)
+    assert not np.isnan(arima.oob())  # show this works
+
+    # show we can fit if out_of_sample_size < 0, in which case oob will be nan
+    arima = ARIMA(order=(2, 1, 2), suppress_warnings=True, out_of_sample_size=-1).fit(y=hr)
+    assert np.isnan(arima.oob())
+
+    # can we do one with an exogenous array, too?
+    arima = ARIMA(order=(2, 1, 2), suppress_warnings=True, out_of_sample_size=10).fit(
+        y=hr, exogenous=rs.rand(hr.shape[0], 4))
+    assert not np.isnan(arima.oob())
+
+
 def _try_get_attrs(arima):
     # show we can get all these attrs without getting an error
     attrs = {
@@ -294,6 +309,15 @@ def test_with_seasonality6():
     # FIXME: we get an IndexError from statsmodels summary if (0, 0, 0)
 
 
+def test_with_seasonality7():
+    # show we can fit one with OOB as the criterion
+    _ = auto_arima(wineind, start_p=1, start_q=1, max_p=2, max_q=2, m=12,
+                   start_P=0, seasonal=True, n_jobs=1, d=1, D=1,
+                   out_of_sample_size=10, information_criterion='oob',
+                   suppress_warnings=True, error_action='raise',  # do raise so it fails fast
+                   random=True, random_state=42, n_fits=3)
+
+
 def test_corner_cases():
     assert_raises(ValueError, auto_arima, wineind, error_action='some-bad-string')