Amend doc, add is_iterable func, compute machine eps dynamically

alkaline-ml · Jan 6, 2018 · 55fc876 · 55fc876
1 parent 4029c77
commit 55fc876
Show file tree

Hide file tree

Showing 7 changed files with 135 additions and 16 deletions.
diff --git a/benchmarks/benchmark_funcs.py b/benchmarks/benchmark_funcs.py
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+#
+# Benchmark various approaches to functions to speed things up.
+# ... hopefully.
+
+from __future__ import absolute_import
+
+import numpy as np
+
+import time
+
+
+def _do_time(func, n_iter=10, *args, **kwargs):
+    times = []
+    for _ in range(n_iter):
+        start = time.time()
+        func(*args, **kwargs)
+        times.append(time.time() - start)
+
+    times = np.asarray(times)
+    print("Completed %i iterations (avg=%.6f, min=%.6f, max=%.6f)"
+          % (n_iter, times.mean(), times.min(), times.max()))
+
+
+def benchmark_is_constant():
+    """This benchmarks the "is_constant" function from ``pyramid.arima.utils``.
+    This was added in 0.6.2.
+    """
+    # WINNER!
+    def is_const1(x):
+        """This is the version in Pyramid 0.6.2.
+
+        Parameters
+        ----------
+        x : np.ndarray
+            This is the array.
+        """
+        return (x == x[0]).all()
+
+    def is_const2(x):
+        """This was expected to beat its predecessor's two passes (compare,
+        then ``all``), though ``np.unique`` sorts internally. But we'll see...
+
+        Parameters
+        ----------
+        x : np.ndarray
+            This is the array.
+        """
+        return np.unique(x).shape[0] == 1
+
+    x = np.random.choice(np.arange(10), 1000000, replace=True)
+    _do_time(is_const1, 25, x)
+    _do_time(is_const2, 25, x)
+
+
+if __name__ == '__main__':
+    benchmark_is_constant()
diff --git a/pyramid/arima/auto.py b/pyramid/arima/auto.py
@@ -5,17 +5,20 @@
 # Automatically find optimal parameters for an ARIMA
 
 from __future__ import absolute_import
+
 from sklearn.utils.validation import check_array, column_or_1d
 from sklearn.utils import check_random_state
 from sklearn.externals.joblib import Parallel, delayed
 from sklearn.linear_model import LinearRegression
+
 from numpy.linalg import LinAlgError
 import numpy as np
+
 import warnings
 import time
 
 from .utils import ndiffs, is_constant, nsdiffs
-from ..utils import diff
+from ..utils import diff, is_iterable
 from .arima import ARIMA
 
 # for python 3 compat
@@ -874,8 +877,9 @@ def _post_ppc_arima(a):
         The list of ARIMAs, or an ARIMA
     """
     # if it's a result of making it to the end, it will
-    # be a list of ARIMA models.
-    if hasattr(a, '__iter__'):
+    # be a list of ARIMA models. Filter out the Nones
+    # (the failed models)...
+    if is_iterable(a):
         a = [m for m in a if m is not None]
 
     # if the list is empty, or if it was an ARIMA and it's None
@@ -906,7 +910,7 @@ def _return_wrapper(fits, return_all, start, trace):
         Whether to return all.
     """
     # make sure it's an iterable
-    if not hasattr(fits, '__iter__'):
+    if not is_iterable(fits):
         fits = [fits]
 
     # whether to print the final runtime

diff --git a/pyramid/arima/seasonality.py b/pyramid/arima/seasonality.py
@@ -150,12 +150,22 @@ def _sd_test(wts, s):
 
         # UPDATE 01/04/2018 - we can get away without computing u, v
         # (this is also MUCH MUCH faster!!!)
-        sv = svd(tmp, compute_uv=False)
-        if sv.min() < 2.220446e-16:  # machine min eps
+        sv = svd(tmp, compute_uv=False)  # type: np.ndarray
+
+        # From R:
+        # double.eps: the smallest positive floating-point number ‘x’ such that
+        # ‘1 + x != 1’.  It equals ‘double.base ^ ulp.digits’ if either
+        # ‘double.base’ is 2 or ‘double.rounding’ is 0; otherwise, it
+        # is ‘(double.base ^ double.ulp.digits) / 2’.  Normally
+        # ‘2.220446e-16’.
+        # Numpy's float64 has an eps of 2.2204460492503131e-16
+        if sv.min() < np.finfo(sv.dtype).eps:  # machine min eps
             return 0
 
         # solve against the identity matrix, then produce
-        # a nasty mess of dot products...
+        # a nasty mess of dot products... this is the (horrendous) R code:
+        # (1/N^2) * sum(diag(solve(tmp) %*% t(A) %*% t(Fhat) %*% Fhat %*% A))
+        # https://github.com/robjhyndman/forecast/blob/master/R/arima.R#L321
         solved = solve(tmp, np.identity(tmp.shape[0]))
         return (1.0 / n ** 2) * solved.dot(A.T).dot(
             Fhat.T).dot(Fhat).dot(A).diagonal().sum()

diff --git a/pyramid/arima/utils.py b/pyramid/arima/utils.py
@@ -5,6 +5,7 @@
 # Common ARIMA functions
 
 from __future__ import absolute_import
+
 from sklearn.utils.validation import check_array, column_or_1d
 import numpy as np
 

diff --git a/pyramid/utils/array.py b/pyramid/utils/array.py
@@ -5,12 +5,16 @@
 # Array utilities
 
 from __future__ import absolute_import, division
+
 from sklearn.utils.validation import check_array
+from sklearn.externals import six
+
 import numpy as np
 
 __all__ = [
     'c',
-    'diff'
+    'diff',
+    'is_iterable'
 ]
 
 
@@ -20,16 +24,27 @@ def c(*args):
     that wraps ``numpy.concatenate``? Similar to R, this works with scalars,
     iterables, and any mix therein.
 
+    Note that using the ``c`` function on multi-nested lists or iterables
+    will fail!
+
     Examples
     --------
-    >>> from pyramid.utils import c
+    Using ``c`` with *args will yield a single array:
     >>> c(1, 2, 3, 4)
     array([1, 2, 3, 4])
 
-    >>> from pyramid.utils import c
+    Using ``c`` with nested lists and scalars will also yield a single array:
     >>> c([1, 2], 4, c(5, 4))
     array([1, 2, 4, 5, 4])
 
+    However, using ``c`` with multi-level lists will fail!
+    >>> c([1, 2, 3], [[1, 2]])
+    Traceback (most recent call last):
+      File "<stdin>", line 1, in <module>
+      File "pyramid/utils/array.py", line 64, in c
+        return np.concatenate([a if is_iterable(a) else [a] for a in args])
+    ValueError: all the input arrays must have same number of dimensions
+
     References
     ----------
     .. [1] https://stat.ethz.ch/R-manual/R-devel/library/base/html/c.html
@@ -43,14 +58,14 @@ def c(*args):
         element = args[0]
 
         # if it's iterable, make it an array
-        if hasattr(element, '__iter__'):
+        if is_iterable(element):
             return np.asarray(element)
 
         # otherwise it's not iterable, put it in an array
         return np.asarray([element])
 
     # concat all
-    return np.concatenate([a if hasattr(a, '__iter__') else [a] for a in args])
+    return np.concatenate([a if is_iterable(a) else [a] for a in args])
 
 
 def _diff_vector(x, lag):
@@ -107,7 +122,7 @@ def diff(x, lag=1, differences=1):
     >>> diff(x, 6, 1)
     array([], dtype=float32)
 
-    >>> from pyramid.utils import c, diff
+    >>> from pyramid.utils import diff
     >>> import numpy as np
     >>>
     >>> x = np.arange(1, 10).reshape((3, 3)).T
@@ -138,7 +153,7 @@ def diff(x, lag=1, differences=1):
     if any(v < 1 for v in (lag, differences)):
         raise ValueError('lag and differences must be positive (> 0) integers')
 
-    x = check_array(x, ensure_2d=False, dtype=np.float32)
+    x = check_array(x, ensure_2d=False, dtype=np.float32)  # type: np.ndarray
     fun = _diff_vector if len(x.shape) == 1 else _diff_matrix
     res = x
 
@@ -150,3 +165,25 @@ def diff(x, lag=1, differences=1):
             return res
 
     return res
+
+
+def is_iterable(x):
+    """Determine whether an object ``x`` is iterable. In Python 2, strings
+    had no ``__iter__`` attribute, so checking for it was enough. However, in
+    Python 3, strings gained ``__iter__``. Therefore, this function checks for
+    the ``__iter__`` attribute, returning True if present (except for strings,
+    for which it will return False).
+
+    Parameters
+    ----------
+    x : str, iterable or object
+        The object in question.
+
+    Returns
+    -------
+    isiter : bool
+        True if iterable, else False.
+    """
+    if isinstance(x, six.string_types):
+        return False
+    return hasattr(x, '__iter__')
diff --git a/pyramid/utils/metaestimators.py b/pyramid/utils/metaestimators.py
@@ -2,7 +2,8 @@
 #
 # Author: Taylor Smith <taylor.smith@alkaline-ml.com>
 #
-# Metaestimators for the ARIMA class
+# Metaestimators for the ARIMA class. These classes are derived from the
+# sklearn metaestimators, but adapted for more specific use with pyramid.
 
 from __future__ import absolute_import
 from operator import attrgetter

diff --git a/pyramid/utils/tests/test_array.py b/pyramid/utils/tests/test_array.py
@@ -1,7 +1,9 @@
 
 from __future__ import absolute_import
-from pyramid.utils.array import diff, c
+
+from pyramid.utils.array import diff, c, is_iterable
 from pyramid.utils import get_callable
+
 from numpy.testing import assert_array_equal
 from nose.tools import assert_raises
 import numpy as np
@@ -42,3 +44,10 @@ def test_corner_in_callable():
 def test_corner():
     # fails because lag < 1
     assert_raises(ValueError, diff, x=x, lag=0)
+
+
+def test_is_iterable():
+    assert not is_iterable("this string")
+    assert is_iterable(["this", "list"])
+    assert not is_iterable(None)
+    assert is_iterable(np.array([1, 2]))