Add more tests, coverage, fix some bugs

alkaline-ml · Jun 1, 2017 · a63f7b7 · a63f7b7
1 parent 09ee745
commit a63f7b7
Show file tree

Hide file tree

Showing 8 changed files with 206 additions and 140 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,9 @@
 .DS_Store
 .idea/
 
+# custom extension
+*.pmdpkl
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/pyramid/arima/approx.py b/pyramid/arima/approx.py
@@ -98,15 +98,16 @@ def approx(x, y, xout, method='linear', rule=1, n=50, f=0, yleft=None,
     xout = c(xout).astype(np.float64)  # ensure double
 
     # check method
-    method = get_callable(method, VALID_APPROX)  # not a callable, actually...
+    method_key = method
+    method = get_callable(method_key, VALID_APPROX)  # not a callable, actually...
 
     # copy/regularize vectors
     x, y = _regularize(x, y, ties)
     nx = x.shape[0]
 
     # if len 1?
     if nx <= 1:
-        if method == 'linear':
+        if method_key == 'linear':
             raise ValueError('need at least two points to linearly interpolate')
         if nx == 0:
             raise ValueError('empty array')

diff --git a/pyramid/arima/arima.py b/pyramid/arima/arima.py
@@ -11,8 +11,11 @@
 from sklearn.utils.validation import check_array, check_is_fitted
 from sklearn.utils.metaestimators import if_delegate_has_method
 from statsmodels.tsa.arima_model import ARIMA as _ARIMA
+from statsmodels.tsa.base.tsa_model import TimeSeriesModelResults
 from statsmodels import api as sm
+import datetime
 import warnings
+import os
 
 # The DTYPE we'll use for everything here. Since there are
 # lots of spots where we define the DTYPE in a numpy array,
@@ -204,19 +207,17 @@ def _fit_wrapper():
                                     disp=self.disp, callback=self.callback,
                                     **fit_args)
 
-
         # sometimes too many warnings...
         if self.suppress_warnings:
             with warnings.catch_warnings(record=False):
                 warnings.simplefilter('ignore')
-                self.arima_, self.arima_res_ = _fit_wrapper()
+                _, self.arima_res_ = _fit_wrapper()
         else:
-            self.arima_, self.arima_res_ = _fit_wrapper()
+            _, self.arima_res_ = _fit_wrapper()
 
         return self
 
-    def predict(self, n_periods=10, exogenous=None, alpha=0.05,
-                include_std_err=False, include_conf_int=False):
+    def predict(self, n_periods=10, exogenous=None):
         """Generate predictions (forecasts) ``n_periods`` in the future. Note that unless
         ``include_std_err`` or ``include_conf_int`` are True, only the forecast
         array will be returned (otherwise, a tuple with the corresponding elements
@@ -246,32 +247,21 @@ def predict(self, n_periods=10, exogenous=None, alpha=0.05,
         -------
         forecasts : array-like, shape=(n_periods,)
             The array of fore-casted values.
-
-        stderr : array-like, shape=(n_periods,)
-            The array of standard errors (NOTE this is only returned
-            if ``include_std_err`` is True).
-
-        conf_int : array-like, shape=(n_periods, 2)
-            The array of confidence intervals (NOTE this is only returned
-            if ``include_conf_int`` is True).
         """
         check_is_fitted(self, 'arima_res_')
 
-        # use the results wrapper to predict so it injects its own params
-        # (also if I was 0, ARMA will not have a forecast method natively)
-        f, s, c = self.arima_res_.forecast(steps=n_periods, exog=exogenous, alpha=alpha)
+        # ARIMA predicts differently...
+        if self.seasonal_order is None:
+            # use the results wrapper to predict so it injects its own params
+            # (also if I was 0, ARMA will not have a forecast method natively)
+            f, _, _ = self.arima_res_.forecast(steps=n_periods, exog=exogenous)
+        else:
+            f = self.arima_res_.forecast(steps=n_periods, exog=exogenous)
 
         # different variants of the stats someone might want returned...
-        if include_std_err and include_conf_int:
-            return f, s, c
-        elif include_conf_int:
-            return f, c
-        elif include_std_err:
-            return f, s
         return f
 
-    def fit_predict(self, y, exogenous=None, n_periods=10, alpha=0.05,
-                    include_std_err=False, include_conf_int=False, **fit_args):
+    def fit_predict(self, y, exogenous=None, n_periods=10, **fit_args):
         """Fit an ARIMA to a vector, ``y``, of observations, and then
         generate predictions.
 
@@ -300,8 +290,56 @@ def fit_predict(self, y, exogenous=None, n_periods=10, alpha=0.05,
             Any keyword args to pass to the fit method.
         """
         self.fit(y, exogenous, **fit_args)
-        return self.predict(n_periods=n_periods, exogenous=exogenous, alpha=alpha,
-                            include_std_err=include_std_err, include_conf_int=include_conf_int)
+        return self.predict(n_periods=n_periods, exogenous=exogenous)
+
+    def _get_pickle_hash_file(self):
+        # Mmmm, pickle hash...
+        return '.%s-%i.pmdpkl' % (str(datetime.datetime.now()).replace(' ', '_'), hash(self))
+
+    def __getstate__(self):
+        """I am being pickled..."""
+        loc = self.__dict__.get('tmp_pkl_', None)
+
+        # if this already contains a pointer to a "saved state" model,
+        # delete that model and replace it with a new one
+        if loc is not None:
+            os.unlink(loc)
+
+        # get the new location for where to save the results
+        new_loc = self._get_pickle_hash_file()
+        cwd = os.path.abspath(os.getcwd())
+        new_loc = os.path.join(cwd, new_loc)
+
+        # save the results - but only if it's fit...
+        if hasattr(self, 'arima_res_'):
+            # statsmodels result views work by caching metrics. If they
+            # are not cached prior to pickling, we might hit issues. This is
+            # a bug documented here: https://github.com/statsmodels/statsmodels/issues/3290
+            _ = self.arima_res_.summary()
+            self.arima_res_.save(fname=new_loc, remove_data=False)
+
+            # point to the location of the saved MLE model
+            self.tmp_pkl_ = new_loc
+
+        return self.__dict__
+
+    def __setstate__(self, state):
+        # I am being unpickled...
+        self.__dict__ = state
+
+        # re-set the results class
+        loc = state.get('tmp_pkl_', None)
+        if loc is not None:
+            self.arima_res_ = TimeSeriesModelResults.load(loc)
+
+        return self
+
+    def _clear_cached_state(self):
+        # when fit in an auto-arima, a lot of cached .pmdpkl files
+        # are generated if fit in parallel... this removes the tmp file
+        loc = self.__dict__.get('tmp_pkl_', None)
+        if loc is not None:
+            os.unlink(loc)
 
     @if_delegate_has_method('arima_res_')
     def aic(self):
@@ -376,19 +414,6 @@ def bse(self):
         """
         return self.arima_res_.bse
 
-    @if_delegate_has_method('arima_res_')
-    def df_model(self):
-        """Get the model degrees of freedom:
-
-            k_exog + k_trend + k_ar + k_ma
-
-        Returns
-        -------
-        df_model : array-like
-            The degrees of freedom.
-        """
-        return self.arima_res_.df_model
-
     @if_delegate_has_method('arima_res_')
     def df_resid(self):
         """Get the residual degrees of freedom:
@@ -418,40 +443,6 @@ def hqic(self):
         """
         return self.arima_res_.hqic
 
-    @if_delegate_has_method('arima_res_')
-    def k_ar(self):
-        """Get the number of AR coefficients in the model.
-
-        Returns
-        -------
-        k_ar : int
-            The number of AR coefficients.
-        """
-        return self.arima_res_.k_ar
-
-    @if_delegate_has_method('arima_res_')
-    def k_exog(self):
-        """Get the number of exogenous variables included in the model. Does not
-        include the constant.
-
-        Returns
-        -------
-        k_exog : int
-            The number of features in the exogenous variables.
-        """
-        return self.arima_res_.k_exog
-
-    @if_delegate_has_method('arima_res_')
-    def k_ma(self):
-        """Get the number of MA coefficients in the model.
-
-        Returns
-        -------
-        k_ma : int
-            The number of MA coefficients.
-        """
-        return self.arima_res_.k_ma
-
     @if_delegate_has_method('arima_res_')
     def maparams(self):
         """Get the value of the moving average coefficients.
@@ -518,18 +509,3 @@ def resid(self):
             The model residuals.
         """
         return self.arima_res_.resid
-
-    @if_delegate_has_method('arima_res_')
-    def sigma2(self):
-        """Get the variance of the residuals. If the model is fit by 'css',
-        sigma2 = ssr/nobs, where ssr is the sum of squared residuals. If
-        the model is fit by 'mle', then sigma2 = 1/nobs * sum(v^2 / F)
-        where v is the one-step forecast error and F is the forecast error
-        variance.
-
-        Returns
-        -------
-        sigma2 : float
-            The variance of the residuals
-        """
-        return self.arima_res_.sigma2
diff --git a/pyramid/arima/auto.py b/pyramid/arima/auto.py
@@ -37,7 +37,7 @@
 def auto_arima(y, exogenous=None, start_p=2, d=None, start_q=2, max_p=5, max_d=2, max_q=5,
                start_P=1, D=None, start_Q=1, max_P=2, max_D=1, max_Q=2, max_order=None, m=1,
                seasonal=True, stationary=False, information_criterion='aic', alpha=0.05, test='kpss',
-               seasonal_test='ch', n_jobs=1, start_params=None, trend='c', method="css-mle", transparams=True,
+               seasonal_test='ch', n_jobs=1, start_params=None, trend='c', method=None, transparams=True,
                solver='lbfgs', maxiter=50, disp=0, callback=None, offset_test_args=None, seasonal_test_args=None,
                suppress_warnings=False, error_action='warn', **fit_args):
     """The ``AutoARIMA`` function seeks to identify the most optimal parameters for an ``ARIMA`` model,
@@ -380,12 +380,13 @@ def generator():
         )
 
     # get results in parallel
+    gen = generator()  # the combos we need to fit
     all_res = Parallel(n_jobs=n_jobs)(
         delayed(_fit_arima)(y, xreg=exogenous, order=order, seasonal_order=seasonal_order,
                             start_params=start_params, trend=trend, method=method, transparams=transparams,
                             solver=solver, maxiter=maxiter, disp=disp, callback=callback,
                             fit_params=fit_args, suppress_warnings=suppress_warnings, error_action=error_action)
-        for order, seasonal_order in generator())
+        for order, seasonal_order in gen)
 
     # filter the non-successful ones
     filtered = [m for m in all_res if m is not None]
@@ -395,7 +396,13 @@ def generator():
 
     # sort by the criteria
     sorted_res = sorted(filtered, key=(lambda model: getattr(model, information_criterion)()), reverse=True)
-    return sorted_res[0]
+    best = sorted_res[0]
+
+    # remove all the cached .pmdpkl files...
+    for model in sorted_res:
+        model._clear_cached_state()
+
+    return best
 
 
 def _fit_arima(x, xreg, order, seasonal_order, start_params, trend, method, transparams,
@@ -411,10 +418,16 @@ def _fit_arima(x, xreg, order, seasonal_order, start_params, trend, method, tran
     # for non-stationarity errors, return None
     except ValueError as v:
         if error_action == 'warn':
-            warnings.warn('Unable to fit ARIMA for order=(%i, %i, %i); data is likely non-stationary. '
-                          '(if you do not want to see these warnings, run with error_action="ignore")'
-                          % (order[0], order[1], order[2]))
+            warnings.warn(_fmt_warning_str(order, seasonal_order))
         elif error_action == 'raise':
             raise v
         # otherwise it's 'ignore'
         return None
+
+
+def _fmt_warning_str(order, seasonal_order):
+    """This is just so we can test that the string will format even if we don't want the warnings in the tests"""
+    return ('Unable to fit ARIMA for order=(%i, %i, %i)%s; data is likely non-stationary. '
+            '(if you do not want to see these warnings, run with error_action="ignore")'
+            % (order[0], order[1], order[2], '' if seasonal_order is None else ' seasonal_order=(%i, %i, %i, %i)'
+               % (seasonal_order[0], seasonal_order[1], seasonal_order[2], seasonal_order[3])))
diff --git a/pyramid/arima/tests/test_approx.py b/pyramid/arima/tests/test_approx.py
@@ -4,6 +4,7 @@
 from pyramid.arima.approx import approx, _regularize
 from pyramid.utils.array import c
 from numpy.testing import assert_array_almost_equal
+from nose.tools import assert_raises
 import numpy as np
 
 table = c(0.216, 0.176, 0.146, 0.119)
@@ -31,3 +32,17 @@ def test_approx_rule2():
     x, y = approx(table, tablep, stat, rule=2)
     assert_array_almost_equal(x, c(1.01))
     assert_array_almost_equal(y, c(0.01))
+
+
+def test_corners():
+    # fails for length differences
+    assert_raises(ValueError, approx, x=[1, 2, 3], y=[1, 2], xout=1.0)
+
+    # fails for bad string
+    assert_raises(ValueError, approx, x=table, y=table, xout=1.0, method='bad-string')
+
+    # fails for bad length
+    assert_raises(ValueError, approx, x=[], y=[], xout=[], ties='mean')
+
+    # fails for linear when < 2 samples
+    assert_raises(ValueError, approx, x=[1], y=[1], xout=[], method='linear', ties='ordered')