Skip to content

Commit

Permalink
Merge branch 'rbind' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
tgsmith61591 committed Jan 31, 2017
2 parents 2c75a2f + 45e7594 commit 41172c5
Show file tree
Hide file tree
Showing 8 changed files with 189 additions and 143 deletions.
1 change: 1 addition & 0 deletions skutil/h2o/__init__.py
Expand Up @@ -7,6 +7,7 @@ class balancers.
from .base import *
from .balance import *
from .encode import *
from .fixes import *
from .frame import *
from .grid_search import *
from .metrics import *
Expand Down
55 changes: 22 additions & 33 deletions skutil/h2o/balance.py
@@ -1,11 +1,11 @@
from __future__ import absolute_import, division, print_function
from collections import Counter
import pandas as pd
from abc import ABCMeta
import warnings
from sklearn.externals import six
from skutil.base import overrides
from .util import reorder_h2o_frame
from .transform import _flatten_one
from .util import reorder_h2o_frame, _gen_optimized_chunks, h2o_col_to_numpy
from .base import check_frame, BaseH2OFunctionWrapper
from ..preprocessing.balance import (_validate_ratio, _validate_target, _validate_num_classes,
_OversamplingBalancePartitioner, _UndersamplingBalancePartitioner,
Expand Down Expand Up @@ -58,45 +58,34 @@ def _validate_x_y_ratio(X, y, ratio):
# validate ratio, if the current ratio is >= the ratio, it's "balanced enough"
ratio = _validate_ratio(ratio)
y = _validate_target(y) # cast to string type
is_factor = _flatten_one(X[y].isfactor()) # is the target a factor?

# generate cts. Have to get kludgier in h2o...
unq_vals = X[y].unique()
unq_vals = unq_vals.as_data_frame(use_pandas=True)[unq_vals.columns[0]].values # numpy array of unique vals
unq_cts = dict([(val, X[y][X[y] == val].shape[0]) for val in unq_vals])
# if the target is a factor, we might have an issue here...
"""
if is_factor:
warnings.warn('Balancing with the target as a factor can cause unpredictable '
'sampling behavior (H2O makes it difficult to assess equality '
'between two factors). Balancing works best when the target '
'is an int. If possible, consider using `asnumeric`.', UserWarning)
"""

# validate is < max classes
cts = pd.Series(unq_cts).sort_values(ascending=True)
# Generate the class counts, then validate that the number of classes is below the
# allowed maximum. We have to pull the target column out of H2O to count it, because
# H2O might treat the values as enum (factor), and we cannot slice based on equality.
target_col = pd.Series(h2o_col_to_numpy(X[y]))
cts = target_col.value_counts().sort_values(ascending=True)
n_classes = _validate_num_classes(cts)
needs_balancing = (cts.values[0] / cts.values[-1]) < ratio

out_tup = (cts, n_classes, needs_balancing)
index = cts.index if not is_factor else cts.index.astype('str')
out_tup = (dict(zip(index, cts.values)), # cts
index, # labels sorted ascending by commonality
target_col.values if not is_factor else target_col.astype('str').values, # the target
n_classes,
needs_balancing)
return out_tup


def _gen_optimized_chunks(idcs):
"""Given the list of indices, create more efficient chunks to minimize
the number of rbind operations required for the H2OFrame ExprNode cache.
"""
idcs = sorted(idcs)
counter = Counter(idcs)
counts = counter.most_common() # order desc

# the first index is the number of chunks we'll need to create.
n_chunks = counts[0][1]
chunks = [[] for _ in range(n_chunks)] # gen the number of chunks we'll need

# 1. populate the chunks each with their first idx (the most common)
# 2. pop from the counter
# 3. re-generate the most_common(), repeat
while counts:
val, n_iter = counts[0] # the one at the head of the list is the most common
for i in range(n_iter):
chunks[i].append(val)
counts.pop(0) # pop out the first idx...
# sort them
return [sorted(chunk) for chunk in chunks]


class _BaseH2OBalancer(six.with_metaclass(ABCMeta,
BaseH2OFunctionWrapper,
BalancerMixin)):
Expand Down
73 changes: 73 additions & 0 deletions skutil/h2o/fixes.py
@@ -0,0 +1,73 @@
from __future__ import print_function, absolute_import, division
import h2o
import numpy as np
from pkg_resources import parse_version
from .base import check_frame

_h2ov = h2o.__version__

__all__ = [
'rbind_all'
]

def _unpack_frames(args):
    """Normalize the varargs of ``rbind_all`` into a flat list of frames.

    Supports both calling conventions: ``rbind_all(a, b, c)`` and
    ``rbind_all([a, b, c])``.

    Parameters
    ----------
    args : tuple
        The ``*args`` tuple received by ``rbind_all``.

    Returns
    -------
    frames : list
        The frames to rbind, in order.

    Raises
    ------
    ValueError
        If no frames were provided.
    """
    # a single list/tuple argument is treated as the collection of frames
    if len(args) == 1 and isinstance(args[0], (tuple, list)):
        frames = list(args[0])
    else:
        frames = list(args)

    if not frames:
        raise ValueError('rbind_all requires at least one H2OFrame')
    return frames


if parse_version(_h2ov) < parse_version('3.10.0.7'):
    def rbind_all(*args):
        """Given a variable set of H2OFrames,
        rbind all of them into a single H2OFrame.

        This variant is selected for H2O versions before 3.10.0.7 and
        binds the frames one at a time.

        Parameters
        ----------
        *args : H2OFrame, shape=(n_samples, n_features)
            The H2OFrames to rbind, given either as separate positional
            arguments or as a single list/tuple. All should match in
            column dimensionality.

        Returns
        -------
        f : H2OFrame
            The rbound H2OFrame

        Raises
        ------
        ValueError
            If no frames are given, or if the frames do not all have
            the same number of columns.
        """
        frames = _unpack_frames(args)

        # check all are H2OFrames
        for x in frames:
            check_frame(x, copy=False)

        # check col dim
        if np.unique([x.shape[1] for x in frames]).shape[0] != 1:
            raise ValueError('inconsistent column dimensions')

        # bind pairwise, accumulating into the first frame
        f = frames[0]
        for x in frames[1:]:
            f = f.rbind(x)

        return f

else:
    def rbind_all(*args):
        """Given a variable set of H2OFrames,
        rbind all of them into a single H2OFrame.

        This variant is selected for H2O 3.10.0.7 and later and hands
        all remaining frames to a single ``rbind`` call.

        Parameters
        ----------
        *args : H2OFrame, shape=(n_samples, n_features)
            The H2OFrames to rbind, given either as separate positional
            arguments or as a single list/tuple. All should match in
            column dimensionality.

        Returns
        -------
        f : H2OFrame
            The rbound H2OFrame

        Raises
        ------
        ValueError
            If no frames are given.
        """
        # NOTE: ``args`` itself is always a tuple, so the list-vs-varargs
        # decision must be made on its contents (see ``_unpack_frames``).
        frames = _unpack_frames(args)

        if len(frames) == 1:  # there's only one element
            return frames[0]

        # lazily evaluate type on the h2o side; pass a list (not a tuple slice)
        # NOTE(review): assumes H2OFrame.rbind accepts a list of frames in
        # this version range -- confirm against the H2O API docs.
        return frames[0].rbind(frames[1:])
3 changes: 2 additions & 1 deletion skutil/h2o/one_way_fs.py
Expand Up @@ -6,7 +6,8 @@
from sklearn.externals import six
from .split import *
from .select import BaseH2OFeatureSelector
from .util import _unq_vals_col, rbind_all
from .util import _unq_vals_col
from .fixes import rbind_all
from ..utils import is_integer
from .base import (check_frame, _frame_from_x_y)
from ..base import overrides, since
Expand Down
62 changes: 31 additions & 31 deletions skutil/h2o/tests/test_h2o.py
Expand Up @@ -22,7 +22,6 @@
from skutil.h2o.grid_search import *
from skutil.h2o.base import BaseH2OFunctionWrapper
from skutil.h2o.one_way_fs import h2o_f_classif, H2OFScorePercentileSelector, H2OFScoreKBestSelector
from skutil.preprocessing.balance import _pd_frame_to_np
from skutil.h2o.util import (h2o_frame_memory_estimate, h2o_corr_plot, h2o_bincount,
load_iris_h2o, load_breast_cancer_h2o, load_boston_h2o,
shuffle_h2o_frame, h2o_col_to_numpy)
Expand Down Expand Up @@ -205,7 +204,7 @@ def valid_use():

try:
dfh = new_h2o_frame(df)
except Exception as e:
except Exception:
dfh = None
return

Expand Down Expand Up @@ -298,7 +297,7 @@ def nzv():
# test with strategy == ratio
if X is not None:
transformer = H2ONearZeroVarianceFilterer(strategy='ratio', threshold=0.1)
assert_fails(transformer.fit, ValueError, Y) # will fail because thresh must be greater than 1.0
assert_fails(transformer.fit, ValueError, Y) # will fail because thresh must be greater than 1.0

x = np.array([
[1, 2, 3],
Expand Down Expand Up @@ -328,15 +327,15 @@ def pipeline():
X_train, X_test, y_train, y_test = train_test_split(f, targ, train_size=0.7)

# add the y into the matrix for h2o's sake -- pandas will throw a warning here...
with warnings.catch_warnings(record=True) as w:
with warnings.catch_warnings(record=True):
warnings.simplefilter("ignore")
X_train['species'] = y_train
X_test['species'] = y_test

try:
train = new_h2o_frame(X_train)
test = new_h2o_frame(X_test)
except Exception as e:
except Exception:
train = None
test = None

Expand All @@ -362,8 +361,8 @@ def pipeline():
pipe.predict(test)

# coverage:
fe = pipe._final_estimator
ns = pipe.named_steps
_ = pipe._final_estimator
_ = pipe.named_steps

# test pojo
assert not pipe.download_pojo()
Expand Down Expand Up @@ -408,7 +407,7 @@ def pipeline():
excepted = False
try:
pipe.fit(train)
except (TypeError, ValueError, EnvironmentError) as e:
except (TypeError, ValueError, EnvironmentError):
excepted = True
assert excepted, 'expected failure for y=%s' % str(y)

Expand Down Expand Up @@ -456,14 +455,14 @@ def pipeline():

# won't even get here...
# pipe.fit(train)
except TypeError as t:
except TypeError:
failed = True
assert failed

# type error for non-h2o estimators
failed = False
try:
pipe = H2OPipeline([
_ = H2OPipeline([
('nzv', H2ONearZeroVarianceFilterer()),
('mc', H2OMulticollinearityFilterer(threshold=0.9)),
('est', RandomForestClassifier())
Expand All @@ -474,7 +473,7 @@ def pipeline():

# won't even get here...
# pipe.fit(train)
except TypeError as t:
except TypeError:
failed = True
assert failed

Expand All @@ -497,7 +496,7 @@ def pipeline():
],
feature_names=F.columns.tolist(),
target_feature='species',
exclude_from_fit=['sepal width (cm)'] # will not be included in the final fit
exclude_from_fit=['sepal width (cm)'] # will not be included in the final fit
)

# fit pipe, predict...
Expand Down Expand Up @@ -526,7 +525,7 @@ def grid():
# try uploading...
try:
frame = new_h2o_frame(f)
except Exception as e:
except Exception:
frame = None

def get_param_grid(est):
Expand Down Expand Up @@ -627,7 +626,8 @@ def get_param_grid(est):
if not do_pipe:
# we're just testing the search on actual estimators
grid = grid_module(estimator=estimator,
feature_names=F.columns.tolist(), target_feature='species',
feature_names=F.columns.tolist(),
target_feature='species',
param_grid=get_param_grid(estimator),
scoring=scoring, iid=iid, verbose=verbose,
cv=which_cv, minimize=minimize)
Expand All @@ -651,7 +651,8 @@ def get_param_grid(est):
}

grid = grid_module(pipe, param_grid=params,
feature_names=F.columns.tolist(), target_feature='species',
feature_names=F.columns.tolist(),
target_feature='species',
scoring=scoring, iid=iid, verbose=verbose,
cv=which_cv, minimize=minimize)

Expand All @@ -660,8 +661,8 @@ def get_param_grid(est):
grid.n_iter = n_folds

# sometimes we'll expect it to fail...
expect_failure = scoring is None or (
isinstance(scoring, str) and scoring in ('bad'))
expect_failure = scoring is None or (isinstance(scoring, str) and
scoring in ('bad'))
try:
# fit the grid
grid.fit(frame)
Expand All @@ -671,10 +672,10 @@ def get_param_grid(est):
expect_failure = False

# predict on the grid
p = grid.predict(frame)
_ = grid.predict(frame)

# score on the frame
s = grid.score(frame)
_ = grid.score(frame)
except ValueError as v:
if expect_failure:
pass
Expand Down Expand Up @@ -1333,35 +1334,34 @@ def cust_add(a, b):
def balance():
if X is not None:
# test that we can turn a frame's first col into a np array
x = _pd_frame_to_np(X) # just gets back the first col...
assert isinstance(x, np.ndarray)

# upload to cloud with the target
f = F.copy()
f['species'] = iris.target

try:
Y = from_pandas(f)
except Exception as e:
except Exception:
Y = None

if Y is not None:
# assert undersampling the balance changes nothing:
b = H2OUndersamplingClassBalancer(target_feature='species').balance(Y)
b = H2OUndersamplingClassBalancer(target_feature='species', shuffle=False).balance(Y)
assert b.shape[0] == Y.shape[0]

# do a real undersample
x = Y[:60, :] # 50 zeros, 10 ones
b = H2OUndersamplingClassBalancer(target_feature='species', ratio=0.5).balance(x).as_data_frame(
use_pandas=True)
b = H2OUndersamplingClassBalancer(
target_feature='species', shuffle=False, ratio=0.5)\
.balance(x).as_data_frame(use_pandas=True)
assert b.shape[0] == 30
cts = b.species.value_counts()
assert cts[0] == 20
assert cts[1] == 10

# assert oversampling works
y = Y[:105, :]
d = H2OOversamplingClassBalancer(target_feature='species', ratio=1.0).balance(y).as_data_frame(
d = H2OOversamplingClassBalancer(
target_feature='species', ratio=1.0, shuffle=False).balance(y).as_data_frame(
use_pandas=True)
assert d.shape[0] == 150

Expand Down Expand Up @@ -1726,14 +1726,13 @@ def log_loss():

# run the tests -- put new or commonly failing tests
# up front as smoke tests. i.e., act, persist and grid
auc()
log_loss()
balance()
grid()
val_counts()
impute()
fscore()
persist()
act_search()
grid()
encoder()
bincount()
metrics()
Expand All @@ -1748,12 +1747,13 @@ def log_loss():
if CAN_CHART_MPL:
corr()
interactions()
balance()
encode()
feature_dropper()
scale()
load_frames()
isinteger_isfloat()
shuffle()
valid_use()
auc()
log_loss()
feature_dropper_coverage()

0 comments on commit 41172c5

Please sign in to comment.