Merge branch 'rbind' into develop

tgsmith61591 · Jan 31, 2017 · 41172c5 · 41172c5
2 parents 2c75a2f + 45e7594
commit 41172c5
Show file tree

Hide file tree

Showing 8 changed files with 189 additions and 143 deletions.
diff --git a/skutil/h2o/__init__.py b/skutil/h2o/__init__.py
@@ -7,6 +7,7 @@ class balancers.
 from .base import *
 from .balance import *
 from .encode import *
+from .fixes import *
 from .frame import *
 from .grid_search import *
 from .metrics import *

diff --git a/skutil/h2o/balance.py b/skutil/h2o/balance.py
@@ -1,11 +1,11 @@
 from __future__ import absolute_import, division, print_function
-from collections import Counter
 import pandas as pd
 from abc import ABCMeta
 import warnings
 from sklearn.externals import six
 from skutil.base import overrides
-from .util import reorder_h2o_frame
+from .transform import _flatten_one
+from .util import reorder_h2o_frame, _gen_optimized_chunks, h2o_col_to_numpy
 from .base import check_frame, BaseH2OFunctionWrapper
 from ..preprocessing.balance import (_validate_ratio, _validate_target, _validate_num_classes,
                                      _OversamplingBalancePartitioner, _UndersamplingBalancePartitioner,
@@ -58,45 +58,34 @@ def _validate_x_y_ratio(X, y, ratio):
     # validate ratio, if the current ratio is >= the ratio, it's "balanced enough"
     ratio = _validate_ratio(ratio)
     y = _validate_target(y)  # cast to string type
+    is_factor = _flatten_one(X[y].isfactor())  # is the target a factor?
 
-    # generate cts. Have to get kludgier in h2o...
-    unq_vals = X[y].unique()
-    unq_vals = unq_vals.as_data_frame(use_pandas=True)[unq_vals.columns[0]].values  # numpy array of unique vals
-    unq_cts = dict([(val, X[y][X[y] == val].shape[0]) for val in unq_vals])
+    # if the target is a factor, we might have an issue here...
+    """
+    if is_factor:
+        warnings.warn('Balancing with the target as a factor can cause unpredictable '
+                      'sampling behavior (H2O makes it difficult to assess equality '
+                      'between two factors). Balancing works best when the target '
+                      'is an int. If possible, consider using `asnumeric`.', UserWarning)
+    """
 
-    # validate is < max classes
-    cts = pd.Series(unq_cts).sort_values(ascending=True)
+    # generate cts. Have to get kludgier in h2o... then validate is < max classes
+    # we have to do it this way, because H2O might treat the vals as enum, and we cannot
+    # slice based on equality (dernit, H2O).
+    target_col = pd.Series(h2o_col_to_numpy(X[y]))
+    cts = target_col.value_counts().sort_values(ascending=True)
     n_classes = _validate_num_classes(cts)
     needs_balancing = (cts.values[0] / cts.values[-1]) < ratio
 
-    out_tup = (cts, n_classes, needs_balancing)
+    index = cts.index if not is_factor else cts.index.astype('str')
+    out_tup = (dict(zip(index, cts.values)),  # cts
+               index,  # labels sorted ascending by commonality
+               target_col.values if not is_factor else target_col.astype('str').values,  # the target
+               n_classes,
+               needs_balancing)
     return out_tup
 
 
-def _gen_optimized_chunks(idcs):
-    """Given the list of indices, create more efficient chunks to minimize
-    the number of rbind operations required for the H2OFrame ExprNode cache.
-    """
-    idcs = sorted(idcs)
-    counter = Counter(idcs)
-    counts = counter.most_common()  # order desc
-
-    # the first index is the number of chunks we'll need to create.
-    n_chunks = counts[0][1]
-    chunks = [[] for _ in range(n_chunks)]  # gen the number of chunks we'll need
-
-    # 1. populate the chunks each with their first idx (the most common)
-    # 2. pop from the counter
-    # 3. re-generate the most_common(), repeat
-    while counts:
-        val, n_iter = counts[0]  # the one at the head of the list is the most common
-        for i in range(n_iter):
-            chunks[i].append(val)
-        counts.pop(0)  # pop out the first idx...
-    # sort them
-    return [sorted(chunk) for chunk in chunks]
-
-
 class _BaseH2OBalancer(six.with_metaclass(ABCMeta, 
                                           BaseH2OFunctionWrapper, 
                                           BalancerMixin)):

diff --git a/skutil/h2o/fixes.py b/skutil/h2o/fixes.py
@@ -0,0 +1,73 @@
+from __future__ import print_function, absolute_import, division
+import h2o
+import numpy as np
+from pkg_resources import parse_version
+from .base import check_frame
+
+_h2ov = h2o.__version__
+
+__all__ = [
+    'rbind_all'
+]
+
+if parse_version(_h2ov) < parse_version('3.10.0.7'):
+    def rbind_all(*args):
+        """Row-bind a variable number of H2OFrames into a single H2OFrame.
+
+        This definition is selected when the installed H2O version is
+        older than 3.10.0.7. Frames are checked up front and then bound
+        one at a time, left to right.
+
+        Parameters
+        ----------
+
+        array1, array2, ... : H2OFrame, shape=(n_samples, n_features)
+            The H2OFrames to rbind. All should match in column
+            dimensionality. A single list or tuple of H2OFrames is also
+            accepted in place of separate positional arguments.
+
+
+        Returns
+        -------
+
+        f : H2OFrame
+            The row-bound H2OFrame. If only one frame is given, that
+            frame itself is returned.
+
+
+        Raises
+        ------
+
+        ValueError
+            If no frames are provided, or if the frames do not all
+            have the same number of columns.
+        """
+        # accept one list/tuple of frames as well as varargs, so both
+        # version-specific definitions can be called the same way
+        if len(args) == 1 and isinstance(args[0], (tuple, list)):
+            args = tuple(args[0])
+
+        # fail with a clear message instead of the misleading
+        # 'inconsistent column dimensions' on empty input
+        if not args:
+            raise ValueError('no H2OFrames provided to rbind')
+
+        # check all are H2OFrames
+        for x in args:
+            check_frame(x, copy=False)
+
+        # check col dim
+        if np.unique([x.shape[1] for x in args]).shape[0] != 1:
+            raise ValueError('inconsistent column dimensions')
+
+        # bind sequentially; the first frame seeds the accumulator
+        f = None
+        for x in args:
+            f = x if f is None else f.rbind(x)
+
+        return f
+
+else:
+    def rbind_all(*args):
+        """Row-bind a variable number of H2OFrames into a single H2OFrame.
+
+        This definition is selected when the installed H2O version is
+        3.10.0.7 or newer. All frames after the first are handed to a
+        single ``rbind`` call; no type or dimension checks are done here.
+
+        Parameters
+        ----------
+
+        array1, array2, ... : H2OFrame, shape=(n_samples, n_features)
+            The H2OFrames to rbind. All should match in column
+            dimensionality. A single list or tuple of H2OFrames is also
+            accepted in place of separate positional arguments.
+
+
+        Returns
+        -------
+
+        f : H2OFrame
+            The row-bound H2OFrame. If only one frame is given, that
+            frame itself is returned.
+
+
+        Raises
+        ------
+
+        ValueError
+            If no frames are provided.
+        """
+        # ``args`` itself is always a tuple, so look at its first element
+        # to detect a caller that passed one list/tuple of frames rather
+        # than separate positional arguments
+        frames = args
+        if len(args) == 1 and isinstance(args[0], (tuple, list)):
+            frames = args[0]
+
+        if not frames:
+            raise ValueError('no H2OFrames provided to rbind')
+
+        # lazily evaluate type on the h2o side
+        if len(frames) == 1:  # there's only one element
+            return frames[0]
+
+        # pass the remaining frames as a real list, whether the caller
+        # supplied varargs (a tuple) or a list
+        return frames[0].rbind(list(frames[1:]))
diff --git a/skutil/h2o/one_way_fs.py b/skutil/h2o/one_way_fs.py
@@ -6,7 +6,8 @@
 from sklearn.externals import six
 from .split import *
 from .select import BaseH2OFeatureSelector
-from .util import _unq_vals_col, rbind_all
+from .util import _unq_vals_col
+from .fixes import rbind_all
 from ..utils import is_integer
 from .base import (check_frame, _frame_from_x_y)
 from ..base import overrides, since

diff --git a/skutil/h2o/tests/test_h2o.py b/skutil/h2o/tests/test_h2o.py
@@ -22,7 +22,6 @@
 from skutil.h2o.grid_search import *
 from skutil.h2o.base import BaseH2OFunctionWrapper
 from skutil.h2o.one_way_fs import h2o_f_classif, H2OFScorePercentileSelector, H2OFScoreKBestSelector
-from skutil.preprocessing.balance import _pd_frame_to_np
 from skutil.h2o.util import (h2o_frame_memory_estimate, h2o_corr_plot, h2o_bincount,
                              load_iris_h2o, load_breast_cancer_h2o, load_boston_h2o,
                              shuffle_h2o_frame, h2o_col_to_numpy)
@@ -205,7 +204,7 @@ def valid_use():
 
                 try:
                     dfh = new_h2o_frame(df)
-                except Exception as e:
+                except Exception:
                     dfh = None
                     return
 
@@ -298,7 +297,7 @@ def nzv():
             # test with strategy == ratio
             if X is not None:
                 transformer = H2ONearZeroVarianceFilterer(strategy='ratio', threshold=0.1)
-                assert_fails(transformer.fit, ValueError, Y) # will fail because thresh must be greater than 1.0
+                assert_fails(transformer.fit, ValueError, Y)  # will fail because thresh must be greater than 1.0
 
                 x = np.array([
                     [1, 2, 3],
@@ -328,15 +327,15 @@ def pipeline():
             X_train, X_test, y_train, y_test = train_test_split(f, targ, train_size=0.7)
 
             # add the y into the matrix for h2o's sake -- pandas will throw a warning here...
-            with warnings.catch_warnings(record=True) as w:
+            with warnings.catch_warnings(record=True):
                 warnings.simplefilter("ignore")
                 X_train['species'] = y_train
                 X_test['species'] = y_test
 
             try:
                 train = new_h2o_frame(X_train)
                 test = new_h2o_frame(X_test)
-            except Exception as e:
+            except Exception:
                 train = None
                 test = None
 
@@ -362,8 +361,8 @@ def pipeline():
                     pipe.predict(test)
 
                     # coverage:
-                    fe = pipe._final_estimator
-                    ns = pipe.named_steps
+                    _ = pipe._final_estimator
+                    _ = pipe.named_steps
 
                     # test pojo
                     assert not pipe.download_pojo()
@@ -408,7 +407,7 @@ def pipeline():
                     excepted = False
                     try:
                         pipe.fit(train)
-                    except (TypeError, ValueError, EnvironmentError) as e:
+                    except (TypeError, ValueError, EnvironmentError):
                         excepted = True
                     assert excepted, 'expected failure for y=%s' % str(y)
 
@@ -456,14 +455,14 @@ def pipeline():
 
                     # won't even get here...
                     # pipe.fit(train)
-                except TypeError as t:
+                except TypeError:
                     failed = True
                 assert failed
 
                 # type error for non-h2o estimators
                 failed = False
                 try:
-                    pipe = H2OPipeline([
+                    _ = H2OPipeline([
                         ('nzv', H2ONearZeroVarianceFilterer()),
                         ('mc', H2OMulticollinearityFilterer(threshold=0.9)),
                         ('est', RandomForestClassifier())
@@ -474,7 +473,7 @@ def pipeline():
 
                     # won't even get here...
                     # pipe.fit(train)
-                except TypeError as t:
+                except TypeError:
                     failed = True
                 assert failed
 
@@ -497,7 +496,7 @@ def pipeline():
                 ],
                     feature_names=F.columns.tolist(),
                     target_feature='species',
-                    exclude_from_fit=['sepal width (cm)'] # will not be included in the final fit
+                    exclude_from_fit=['sepal width (cm)']  # will not be included in the final fit
                 )
 
                 # fit pipe, predict...
@@ -526,7 +525,7 @@ def grid():
             # try uploading...
             try:
                 frame = new_h2o_frame(f)
-            except Exception as e:
+            except Exception:
                 frame = None
 
             def get_param_grid(est):
@@ -627,7 +626,8 @@ def get_param_grid(est):
                                         if not do_pipe:
                                             # we're just testing the search on actual estimators
                                             grid = grid_module(estimator=estimator,
-                                                               feature_names=F.columns.tolist(), target_feature='species',
+                                                               feature_names=F.columns.tolist(),
+                                                               target_feature='species',
                                                                param_grid=get_param_grid(estimator),
                                                                scoring=scoring, iid=iid, verbose=verbose,
                                                                cv=which_cv, minimize=minimize)
@@ -651,7 +651,8 @@ def get_param_grid(est):
                                                 }
 
                                             grid = grid_module(pipe, param_grid=params,
-                                                               feature_names=F.columns.tolist(), target_feature='species',
+                                                               feature_names=F.columns.tolist(),
+                                                               target_feature='species',
                                                                scoring=scoring, iid=iid, verbose=verbose,
                                                                cv=which_cv, minimize=minimize)
 
@@ -660,8 +661,8 @@ def get_param_grid(est):
                                             grid.n_iter = n_folds
 
                                         # sometimes we'll expect it to fail...
-                                        expect_failure = scoring is None or (
-                                        isinstance(scoring, str) and scoring in ('bad'))
+                                        expect_failure = scoring is None or (isinstance(scoring, str) and
+                                                                             scoring in ('bad'))
                                         try:
                                             # fit the grid
                                             grid.fit(frame)
@@ -671,10 +672,10 @@ def get_param_grid(est):
                                             expect_failure = False
 
                                             # predict on the grid
-                                            p = grid.predict(frame)
+                                            _ = grid.predict(frame)
 
                                             # score on the frame
-                                            s = grid.score(frame)
+                                            _ = grid.score(frame)
                                         except ValueError as v:
                                             if expect_failure:
                                                 pass
@@ -1333,35 +1334,34 @@ def cust_add(a, b):
         def balance():
             if X is not None:
                 # test that we can turn a frame's first col into a np array
-                x = _pd_frame_to_np(X)  # just gets back the first col...
-                assert isinstance(x, np.ndarray)
-
                 # upload to cloud with the target
                 f = F.copy()
                 f['species'] = iris.target
 
                 try:
                     Y = from_pandas(f)
-                except Exception as e:
+                except Exception:
                     Y = None
 
                 if Y is not None:
                     # assert undersampling the balance changes nothing:
-                    b = H2OUndersamplingClassBalancer(target_feature='species').balance(Y)
+                    b = H2OUndersamplingClassBalancer(target_feature='species', shuffle=False).balance(Y)
                     assert b.shape[0] == Y.shape[0]
 
                     # do a real undersample
                     x = Y[:60, :]  # 50 zeros, 10 ones
-                    b = H2OUndersamplingClassBalancer(target_feature='species', ratio=0.5).balance(x).as_data_frame(
-                        use_pandas=True)
+                    b = H2OUndersamplingClassBalancer(
+                                target_feature='species', shuffle=False, ratio=0.5)\
+                            .balance(x).as_data_frame(use_pandas=True)
                     assert b.shape[0] == 30
                     cts = b.species.value_counts()
                     assert cts[0] == 20
                     assert cts[1] == 10
 
                     # assert oversampling works
                     y = Y[:105, :]
-                    d = H2OOversamplingClassBalancer(target_feature='species', ratio=1.0).balance(y).as_data_frame(
+                    d = H2OOversamplingClassBalancer(
+                                target_feature='species', ratio=1.0, shuffle=False).balance(y).as_data_frame(
                         use_pandas=True)
                     assert d.shape[0] == 150
 
@@ -1726,14 +1726,13 @@ def log_loss():
 
         # run the tests -- put new or commonly failing tests
         # up front as smoke tests. i.e., act, persist and grid
-        auc()
-        log_loss()
+        balance()
+        grid()
         val_counts()
         impute()
         fscore()
         persist()
         act_search()
-        grid()
         encoder()
         bincount()
         metrics()
@@ -1748,12 +1747,13 @@ def log_loss():
         if CAN_CHART_MPL:
             corr()
         interactions()
-        balance()
         encode()
         feature_dropper()
         scale()
         load_frames()
         isinteger_isfloat()
         shuffle()
         valid_use()
+        auc()
+        log_loss()
         feature_dropper_coverage()