Skip to content

Commit

Permalink
Merge branch 'safe_qcut' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
tgsmith61591 committed Oct 7, 2016
2 parents f2da5cc + aab3186 commit 32f6cd2
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 4 deletions.
9 changes: 8 additions & 1 deletion skutil/h2o/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@
except ImportError as e:
H2OServerError = EnvironmentError


try:
    from h2o.exceptions import H2OConnectionError
except ImportError:
    # Fall back to EnvironmentError so that the name H2OConnectionError is
    # always defined in this module, even when the import above fails.
    H2OConnectionError = EnvironmentError


from pkg_resources import parse_version
from ..utils import is_numeric

Expand Down Expand Up @@ -262,7 +269,7 @@ def __init__(self, target_feature=None, min_version='any', max_version=None):
# test connection, warn where needed
try:
g = h2o.frames() # returns a dict of frames
except (EnvironmentError, ValueError, H2OServerError) as v:
except (EnvironmentError, ValueError, H2OServerError, H2OConnectionError) as v:
warnings.warn('h2o has not been started; '
'initializing an H2O transformer without '
'a connection will not cause any issues, '
Expand Down
2 changes: 2 additions & 0 deletions skutil/h2o/tests/test_h2o.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ def test_h2o_with_conn():
#F = load_iris_df(include_tgt=False)
X = None


try:
h2o.init()
#h2o.init(ip='localhost', port=54321) # this might throw a warning
Expand All @@ -123,6 +124,7 @@ def test_h2o_with_conn():
warnings.warn('could not successfully start H2O instance, tried %d times' % max_tries, UserWarning)



def catch_warning_assert_thrown(fun, kwargs):
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
Expand Down
3 changes: 2 additions & 1 deletion skutil/metrics/_act.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
import numpy as np
import abc
from ..utils import safe_qcut


__all__ = [
Expand Down Expand Up @@ -129,7 +130,7 @@ def _compute_stats(self, pred, expo, loss, prem):
pred_ser = pd.Series(pred)
loss_to_returns = np.sum(loss) / np.sum(prem)

rank = pd.qcut(pred_ser, n_groups, labels=False)
rank = safe_qcut(pred_ser, n_groups, labels=False)
n_groups = np.amax(rank) + 1
groups = np.arange(n_groups)

Expand Down
140 changes: 138 additions & 2 deletions skutil/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,18 @@
import warnings
import numbers
import scipy.stats as st

from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix as cm
from sklearn.datasets import load_iris
from sklearn.externals import six

import pandas.core.algorithms as algos
import pandas.core.nanops as nanops
from pandas.core.api import Series
from pandas.core.categorical import Categorical
from pandas.tools.tile import _format_levels

from ..base import SelectiveWarning, ModuleImportWarning

try:
Expand Down Expand Up @@ -54,11 +62,21 @@
'pd_stats',
'report_confusion_matrix',
'report_grid_score_detail',
'safe_qcut',
'shuffle_dataframe',
'validate_is_pd'
'validate_is_pd',
'QCutWarning'
]


## Classes
class QCutWarning(UserWarning):
    """Warning category associated with the ``safe_qcut`` function.

    It subclasses ``UserWarning``, so any warnings filter that matches
    ``UserWarning`` also matches this category.
    """



######## MATHEMATICAL UTILITIES #############
def _log_single(x):
Expand Down Expand Up @@ -554,14 +572,20 @@ def is_entirely_numeric(X):
return X.shape[1] == len(get_numeric(X))


def is_integer(x):
    """Determines whether the arg is an integer

    Parameters
    ----------
    x : anytype
        The object to test.

    Returns
    -------
    bool
        True if ``x`` is an instance of ``numbers.Integral``. Note that
        ``bool`` is a subclass of ``int`` and is therefore reported as
        an integer.
    """
    # numbers.Integral covers the builtin int (and Python 2's long) as well
    # as numpy integer scalars, which numpy registers with this ABC. Naming
    # ``long``, ``np.int`` or ``np.long`` explicitly adds nothing and fails
    # with NameError/AttributeError wherever those names do not exist.
    return isinstance(x, numbers.Integral)

def is_float(x):
    """Determines whether the arg is a floating point number

    Parameters
    ----------
    x : anytype
        The object to test.

    Returns
    -------
    bool
        True if ``x`` is a builtin ``float`` or a numpy floating point
        scalar (e.g. ``np.float32``, ``np.float64``); False otherwise.
    """
    # ``np.float`` was only an alias of the builtin float and is absent from
    # newer numpy releases; ``np.floating`` is the abstract base of all numpy
    # float scalars, so narrower types such as np.float32 are recognised too.
    return isinstance(x, (float, np.floating))

def is_numeric(x):
    """Determines whether the arg is numeric

    Parameters
    ----------
    x : anytype
        The object to test.

    Returns
    -------
    bool
        True if either ``is_float(x)`` or ``is_integer(x)`` is True.
    """
    # Single source of truth: delegate to the two narrower predicates rather
    # than repeating their type tuples here.
    return is_float(x) or is_integer(x)


def load_iris_df(include_tgt=True, tgt_name="Species"):
Expand All @@ -586,6 +610,118 @@ def load_iris_df(include_tgt=True, tgt_name="Species"):
return X


def safe_qcut(x, q, labels=None, retbins=False, precision=3):
    """Quantile-based discretization that tolerates duplicate bin edges.

    A lenient counterpart to the Pandas qcut function: when the computed
    quantile edges are not unique, a warning is emitted instead of an
    Exception being raised.

    Parameters
    ----------
    x : ndarray or Series
        The values to discretize.

    q : integer or array of quantiles
        Number of quantiles (10 for deciles, 4 for quartiles, etc.), or
        an explicit array of quantiles, e.g. [0, .25, .5, .75, 1.] for
        quartiles.

    labels : array or boolean, default None
        Labels for the resulting bins; must have the same length as the
        resulting bins. If False, only integer indicators of the bins
        are returned.

    retbins : bool, optional
        Whether to also return the bin edges. Can be useful when ``q``
        is given as a scalar.

    precision : int
        The precision at which to store and display the bin labels.

    Returns
    -------
    out : Categorical or Series or array of integers if labels is False
        A Series of type category if the input is a Series, else a
        Categorical. Bins are represented as categories when categorical
        data is returned.

    bins : ndarray of floats
        Returned only if `retbins` is True.
    """
    # an integer q means "this many equal-width quantile steps on [0, 1]"
    quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q
    edges = algos.quantile(x, quantiles)

    return _bins_to_cuts(x, edges, labels=labels, retbins=retbins,
                         precision=precision, include_lowest=True)



def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
precision=3, name=None, include_lowest=False):
x_is_series = isinstance(x, Series)
series_index = None

if x_is_series:
series_index = x.index
if name is None:
name = x.name

x = np.asarray(x)

side = 'left' if right else 'right'
ids = bins.searchsorted(x, side=side)

if len(algos.unique(bins)) < len(bins):
warnings.warn('Bin edges must be unique: %s'
% repr(bins), UserWarning)

if include_lowest:
ids[x == bins[0]] = 1

na_mask = pd.isnull(x) | (ids == len(bins)) | (ids == 0)
has_nas = na_mask.any()

if labels is not False:
if labels is None:
increases = 0
while True:
try:
levels = _format_levels(bins, precision, right=right,
include_lowest=include_lowest)
except ValueError:
increases += 1
precision += 1
if increases >= 20:
raise
else:
break

else:
if len(labels) != len(bins) - 1:
raise ValueError('Bin labels must be one fewer than '
'the number of bin edges')
levels = labels

levels = np.asarray(levels, dtype=object)
np.putmask(ids, na_mask, 0)
fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
else:
fac = ids - 1
if has_nas:
fac = fac.astype(np.float64)
np.putmask(fac, na_mask, np.nan)

if x_is_series:
fac = Series(fac, index=series_index, name=name)

if not retbins:
return fac

return fac, bins



def report_grid_score_detail(random_search, charts=True, sort_results=True,
ascending=True, percentile=0.975, y_axis='score', sort_by='score',
highlight_best=True):
Expand Down

0 comments on commit 32f6cd2

Please sign in to comment.