
Commit

Merge branch 'python3.5-taylor' into python3.5
tgsmith61591 committed Nov 29, 2016
2 parents 2d6cdf6 + a5a5e8f commit 7acf30e
Showing 23 changed files with 3,771 additions and 3,331 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -24,3 +24,6 @@ gh-pages/

# pycharm
.idea/

# model directory
*.mdl
16 changes: 8 additions & 8 deletions .travis.yml
@@ -27,14 +27,14 @@ env:
WITH_MATPLOTLIB="true" WITH_SEABORN="true" COVERAGE="true"
- DISTRIB="conda" PYTHON_VERSION="2.7" SCIKIT_LEARN_VERSION="0.18"
WITH_MATPLOTLIB="false" WITH_SEABORN="false" COVERAGE="true"
- DISTRIB="conda" PYTHON_VERSION="3.5" SCIKIT_LEARN_VERSION="0.17.1"
WITH_MATPLOTLIB="true" WITH_SEABORN="true" COVERAGE="true"
- DISTRIB="conda" PYTHON_VERSION="3.5" SCIKIT_LEARN_VERSION="0.17.1"
WITH_MATPLOTLIB="false" WITH_SEABORN="false" COVERAGE="true"
- DISTRIB="conda" PYTHON_VERSION="3.5" SCIKIT_LEARN_VERSION="0.18"
WITH_MATPLOTLIB="true" WITH_SEABORN="true" COVERAGE="true"
- DISTRIB="conda" PYTHON_VERSION="3.5" SCIKIT_LEARN_VERSION="0.18"
WITH_MATPLOTLIB="false" WITH_SEABORN="false" COVERAGE="true"
# - DISTRIB="conda" PYTHON_VERSION="3.5" SCIKIT_LEARN_VERSION="0.17.1"
# WITH_MATPLOTLIB="true" WITH_SEABORN="true" COVERAGE="true"
# - DISTRIB="conda" PYTHON_VERSION="3.5" SCIKIT_LEARN_VERSION="0.17.1"
# WITH_MATPLOTLIB="false" WITH_SEABORN="false" COVERAGE="true"
# - DISTRIB="conda" PYTHON_VERSION="3.5" SCIKIT_LEARN_VERSION="0.18"
# WITH_MATPLOTLIB="true" WITH_SEABORN="true" COVERAGE="true"
# - DISTRIB="conda" PYTHON_VERSION="3.5" SCIKIT_LEARN_VERSION="0.18"
# WITH_MATPLOTLIB="false" WITH_SEABORN="false" COVERAGE="true"

matrix:
allow_failures:
35 changes: 35 additions & 0 deletions skutil/base.py
@@ -4,10 +4,12 @@
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import six
from abc import ABCMeta
import re
import warnings

__all__ = [
'overrides',
'since',
'suppress_warnings',
'BaseSkutil',
'SelectiveMixin'
@@ -46,6 +48,8 @@ def overrides(interface_class):
Traceback (most recent call last):
AssertionError: A.b must override a super method!
.. versionadded:: 0.1.0
"""

def overrider(method):
@@ -56,6 +60,34 @@ def overrider(method):
return overrider


def since(version):
"""A decorator that annotates a function to append the version
of skutil in which the function was added. This decorator is an adaptation of PySpark's.
Examples
--------
>>> @since('0.1.5')
... def some_fun():
... '''Some docstring'''
... return None
...
>>>
>>> some_fun.__doc__ # doctest: +SKIP
'Some docstring\n\n.. versionadded:: 0.1.5'
.. versionadded:: 0.1.5
"""
indent_p = re.compile(r'\n( +)')
def deco(f):
indents = indent_p.findall(f.__doc__)
indent = ' ' * (min(len(m) for m in indents) if indents else 0)
f.__doc__ = f.__doc__.rstrip() + "\n\n%s.. versionadded:: %s" % (indent, version)
return f
return deco
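
For illustration only (not part of this commit), the new decorator appends a .. versionadded:: directive at the minimum indentation found in the docstring body, so Sphinx renders it inside the same block. Assuming the skutil.base.since added above:

from skutil.base import since

@since('0.1.5')
def example():
    """Summary line.

    Indented detail paragraph.
    """
    return None

# The directive is appended after the docstring is right-stripped,
# aligned with the body's 4-space indentation:
print(example.__doc__)
# Summary line.
#
#     Indented detail paragraph.
#
#     .. versionadded:: 0.1.5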


def suppress_warnings(func):
"""Decorator that forces a method to suppress
all warnings it may raise. This should be used with caution,
@@ -87,6 +119,9 @@ def suppress_warnings(func):
>>>
>>> fun_that_warns()
1
.. versionadded:: 0.1.0
"""

def suppressor(*args, **kwargs):
17 changes: 10 additions & 7 deletions skutil/decomposition/decompose.py
@@ -11,7 +11,7 @@
from skutil.base import *
from skutil.base import overrides
from ..utils import *
from ..utils.fixes import _cols_if_none
from ..utils.fixes import _cols_if_none, _as_numpy

__all__ = [
'SelectivePCA',
@@ -187,7 +187,7 @@ def fit(self, X, y=None):
# fails thru if names don't exist:
self.pca_ = PCA(
n_components=self.n_components,
whiten=self.whiten).fit(X[cols])
whiten=self.whiten).fit(X[cols].as_matrix())

return self

@@ -216,7 +216,7 @@ def transform(self, X):
cols = _cols_if_none(X, self.cols)

other_nms = [nm for nm in X.columns if nm not in cols]
transform = self.pca_.transform(X[cols])
transform = self.pca_.transform(X[cols].as_matrix())

# do weighting if necessary
if self.weight:
@@ -281,7 +281,7 @@ def score(self, X, y=None):
X, _ = validate_is_pd(X, self.cols)
cols = X.columns if not self.cols else self.cols

ll = self.pca_.score(X[cols], y)
ll = self.pca_.score(X[cols].as_matrix(), _as_numpy(y))
return ll


@@ -377,7 +377,7 @@ def fit(self, X, y=None):
self.svd_ = TruncatedSVD(
n_components=self.n_components,
algorithm=self.algorithm,
n_iter=self.n_iter).fit(X[cols])
n_iter=self.n_iter).fit(X[cols].as_matrix())

return self

@@ -406,9 +406,12 @@ def transform(self, X):
cols = _cols_if_none(X, self.cols)

other_nms = [nm for nm in X.columns if nm not in cols]
transform = self.svd_.transform(X[cols])
transform = self.svd_.transform(X[cols].as_matrix())
left = pd.DataFrame.from_records(data=transform,
columns=[('Concept%i' % (i + 1)) for i in range(transform.shape[1])])
columns=[
('Concept%i' % (i + 1))
for i in range(transform.shape[1])
])

# concat if needed
x = pd.concat([left, X[other_nms]], axis=1) if other_nms else left
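
For context, an illustrative sketch (not repository code): the recurring change in decompose.py is to hand scikit-learn raw numpy arrays rather than pandas objects. DataFrame.as_matrix() returns the underlying ndarray (equivalent to .values, which later superseded it), and the _as_numpy helper imported from ..utils.fixes is applied to y, presumably for the same reason; passing plain arrays presumably keeps fit/transform/score behaviour uniform across the sklearn 0.17.1 and 0.18 builds exercised in .travis.yml. A minimal standalone equivalent:

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

X = pd.DataFrame(np.random.rand(10, 3), columns=['a', 'b', 'c'])
cols = ['a', 'b']

# Fit and transform on the raw ndarray; same data as X[cols].values,
# but no DataFrame-specific handling inside sklearn comes into play.
pca = PCA(n_components=1).fit(X[cols].as_matrix())
scores = pca.transform(X[cols].as_matrix())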
4 changes: 2 additions & 2 deletions skutil/feature_selection/base.py
@@ -80,14 +80,14 @@ def transform(self, X):
# check on state of X and cols
X, _ = validate_is_pd(X, self.cols)

if self.drop_ is None:
if not self.drop_: # empty or None
return X if self.as_df else X.as_matrix()
else:
# what if we don't want to throw this key error for a non-existent
# column that we hope to drop anyways? We need to at least inform the
# user...
drops = [x for x in self.drop_ if x in X.columns]
if not len(drops) == len(self.drop_):
if len(drops) != len(self.drop_):
warnings.warn('one or more features to drop not contained '
'in input data feature names', UserWarning)
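
A quick illustration of the changed guard (hypothetical snippet, not from the repository): the filterers now store an empty list rather than None when nothing is dropped, and the expression not self.drop_ is true for either representation, so the early return keeps working under both conventions:

import pandas as pd

def drop_columns(X, drop):
    # Mirrors the updated check: `not drop` is true for the old None
    # sentinel and for the new empty-list convention alike.
    if not drop:
        return X
    return X.drop([c for c in drop if c in X.columns], axis=1)

X = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
assert drop_columns(X, None).equals(X)
assert drop_columns(X, []).equals(X)
assert list(drop_columns(X, ['b']).columns) == ['a']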

16 changes: 3 additions & 13 deletions skutil/feature_selection/select.py
@@ -134,7 +134,7 @@ def fit(self, X, y=None):
# assess sparsity
self.sparsity_ = X[cols].apply(lambda x: x.isnull().sum() / x.shape[0]).values # numpy array
mask = self.sparsity_ > thresh # numpy boolean array
self.drop_ = X.columns[mask].tolist() if mask.sum() > 0 else None
self.drop_ = X.columns[mask].tolist()
return self


@@ -289,7 +289,7 @@ def transform(self, X):
X, _ = validate_is_pd(X, self.cols) # copy X
cols = X.columns if self.cols is None else self.cols

retained = X[cols] # if cols is None, returns all
retained = X[cols] # if not cols, returns all
return retained if self.as_df else retained.as_matrix()


@@ -521,10 +521,7 @@ def fit(self, X, y=None):
c = X[cols].corr(method=self.method).apply(lambda x: np.abs(x))

# get drops list
d, mac, crz = filter_collinearity(c, self.threshold)
self.drop_ = d if d else None
self.mean_abs_correlations_ = mac if mac else None
self.correlations_ = crz if crz else None
self.drop_, self.mean_abs_correlations_, self.correlations_ = filter_collinearity(c, self.threshold)

return self

@@ -702,11 +699,4 @@ def fit(self, X, y=None):
self.drop_ = np.asarray(cols)[drop_mask].tolist()
self.var_ = dict(zip(self.drop_, matrix[drop_mask, 0].tolist())) # just retain the variances

# I don't like making this None; it opens up bugs in pd.drop,
# but it was the precedent the API set from early on, so don't
# want to change it without a warning. TODO: in future versions,
# don't do this...
if not self.drop_:
self.drop_ = None

return self
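
For context, a hedged sketch (not repository code): with the None coercion removed, drop_ is always a list, which avoids the pd.drop pitfalls the deleted comment alludes to; dropping an empty list is simply a no-op:

import pandas as pd

X = pd.DataFrame({'a': [1, 2, 3], 'b': [1, 1, 1]})

drop_ = []                        # new convention when nothing is filtered
retained = X.drop(drop_, axis=1)  # dropping an empty list is a no-op
assert retained.equals(X)
# A None sentinel, by contrast, forces every caller to special-case it
# before reaching pandas.DataFrame.drop.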
2 changes: 1 addition & 1 deletion skutil/feature_selection/tests/test_select.py
@@ -232,7 +232,7 @@ def test_multi_collinearity():

def test_nzv_filterer():
transformer = NearZeroVarianceFilterer().fit(X)
assert transformer.drop_ is None
assert not transformer.drop_

y = X.copy()
y['zeros'] = np.zeros(150)
4 changes: 4 additions & 0 deletions skutil/h2o/balance.py
@@ -144,6 +144,8 @@ class H2OOversamplingClassBalancer(_BaseH2OBalancer):
2 50
Name: A, dtype: int64
.. versionadded:: 0.1.0
"""

def __init__(self, target_feature, ratio=BalancerMixin._def_ratio, shuffle=True):
@@ -241,6 +243,8 @@ class (1) is represented at a ratio of 0.5.
2 10
Name: A, dtype: int64
.. versionadded:: 0.1.0
"""

_min_version = '3.8.2.9'
12 changes: 9 additions & 3 deletions skutil/h2o/base.py
@@ -3,9 +3,12 @@
import h2o
import os
from ..utils.fixes import is_iterable
from ..base import since
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import six
from h2o.frame import H2OFrame
from pkg_resources import parse_version
from ..utils import is_numeric

# in different versions, we get different exceptions
try:
@@ -18,9 +21,6 @@
except ImportError as e:
H2OConnectionError = EnvironmentError

from pkg_resources import parse_version
from ..utils import is_numeric

try:
import cPickle as pickle
except ImportError as e:
@@ -350,6 +350,9 @@ class BaseH2OFunctionWrapper(BaseEstimator):
max_version : str or float, optional (default=None)
The maximum version of h2o that is compatible with the transformer
.. versionadded:: 0.1.0
"""

def __init__(self, target_feature=None, min_version='any', max_version=None):
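
As an aside (hypothetical snippet, not part of the commit), the min_version/max_version gate documented above can be expressed with pkg_resources.parse_version, which skutil/h2o/base.py imports in the hunk above:

from pkg_resources import parse_version

min_version, max_version = '3.8.2.9', None   # e.g. the _min_version seen in balance.py
h2o_version = '3.10.0.3'                     # hypothetical installed h2o build

# parse_version gives numeric-aware ordering, so '3.10.x' sorts above '3.8.x'.
ok = parse_version(h2o_version) >= parse_version(min_version)
if max_version is not None:
    ok = ok and parse_version(h2o_version) <= parse_version(max_version)
print(ok)  # True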
@@ -527,6 +530,9 @@ class BaseH2OTransformer(BaseH2OFunctionWrapper, TransformerMixin):
max_version : str or float, optional (default=None)
The maximum version of h2o that is compatible with the transformer
.. versionadded:: 0.1.0
"""

def __init__(self, feature_names=None, target_feature=None, exclude_features=None,
