Merge branch 'safe_qcut' into develop

tgsmith61591 · Oct 7, 2016 · 32f6cd2 · 32f6cd2
2 parents f2da5cc + aab3186
commit 32f6cd2
Show file tree

Hide file tree

Showing 4 changed files with 150 additions and 4 deletions.
diff --git a/skutil/h2o/base.py b/skutil/h2o/base.py
@@ -17,6 +17,13 @@
 except ImportError as e:
     H2OServerError = EnvironmentError
 
+
+try:
+    from h2o.exceptions import H2OConnectionError
+except ImportError as e:
+    H2OConnectionError = EnvironmentError
+
+
 from pkg_resources import parse_version
 from ..utils import is_numeric
 
@@ -262,7 +269,7 @@ def __init__(self, target_feature=None, min_version='any', max_version=None):
         # test connection, warn where needed
         try:
             g = h2o.frames() # returns a dict of frames
-        except (EnvironmentError, ValueError, H2OServerError) as v:
+        except (EnvironmentError, ValueError, H2OServerError, H2OConnectionError) as v:
             warnings.warn('h2o has not been started; '
                           'initializing an H2O transformer without '
                           'a connection will not cause any issues, '

diff --git a/skutil/h2o/tests/test_h2o.py b/skutil/h2o/tests/test_h2o.py
@@ -97,6 +97,7 @@ def test_h2o_with_conn():
     #F = load_iris_df(include_tgt=False)
     X = None
 
+
     try:
         h2o.init()
         #h2o.init(ip='localhost', port=54321) # this might throw a warning
@@ -123,6 +124,7 @@ def test_h2o_with_conn():
             warnings.warn('could not successfully start H2O instance, tried %d times' % max_tries, UserWarning)
 
 
+
     def catch_warning_assert_thrown(fun, kwargs):
         with warnings.catch_warnings(record=True) as w:
             warnings.simplefilter("always")

diff --git a/skutil/metrics/_act.py b/skutil/metrics/_act.py
@@ -2,6 +2,7 @@
 import pandas as pd
 import numpy as np
 import abc
+from ..utils import safe_qcut
 
 
 __all__ = [
@@ -129,7 +130,7 @@ def _compute_stats(self, pred, expo, loss, prem):
 		pred_ser = pd.Series(pred)
 		loss_to_returns = np.sum(loss) / np.sum(prem)
 
-		rank = pd.qcut(pred_ser, n_groups, labels=False)
+		rank = safe_qcut(pred_ser, n_groups, labels=False)
 		n_groups = np.amax(rank) + 1
 		groups = np.arange(n_groups)
 

diff --git a/skutil/utils/util.py b/skutil/utils/util.py
@@ -4,10 +4,18 @@
 import warnings
 import numbers
 import scipy.stats as st
+
 from sklearn.linear_model import LinearRegression
 from sklearn.metrics import confusion_matrix as cm
 from sklearn.datasets import load_iris
 from sklearn.externals import six
+
+import pandas.core.algorithms as algos
+import pandas.core.nanops as nanops
+from pandas.core.api import Series
+from pandas.core.categorical import Categorical
+from pandas.tools.tile import _format_levels
+
 from ..base import SelectiveWarning, ModuleImportWarning
 
 try:
@@ -54,11 +62,21 @@
     'pd_stats',
     'report_confusion_matrix',
     'report_grid_score_detail',
+    'safe_qcut',
     'shuffle_dataframe',
-    'validate_is_pd'
+    'validate_is_pd',
+    'QCutWarning'
 ]
 
 
+## Classes
+class QCutWarning(UserWarning):
+    """Warning category (a UserWarning subclass) associated
+    with the safe_qcut function.
+    """
+
+
 
 ######## MATHEMATICAL UTILITIES #############    
 def _log_single(x):
@@ -554,14 +572,20 @@ def is_entirely_numeric(X):
     return X.shape[1] == len(get_numeric(X))
 
 
+def is_integer(x):
+    """Determines whether the arg is an integer
+
+    Parameters
+    ----------
+    x : anytype
+        The object to test.
+
+    Returns
+    -------
+    bool
+        True if ``x`` is an instance of ``numbers.Integral``
+        (note that this includes ``bool``, a subclass of ``int``).
+    """
+    # int, Python 2's long and the NumPy integer scalars are all
+    # registered as numbers.Integral, so this single check accepts the
+    # same objects as the former (Integral, int, long, np.int, np.long)
+    # tuple without naming ``long``/``np.int``/``np.long``, which are
+    # undefined on Python 3 and on recent NumPy releases.
+    return isinstance(x, numbers.Integral)
+
+def is_float(x):
+    """Determines whether the arg is a float
+
+    Parameters
+    ----------
+    x : anytype
+        The object to test.
+
+    Returns
+    -------
+    bool
+        True if ``x`` is an instance of the builtin ``float``
+        (which also covers ``np.float64``, a ``float`` subclass).
+    """
+    # ``np.float`` was merely an alias of the builtin ``float`` and no
+    # longer exists in recent NumPy releases; checking against ``float``
+    # alone accepts exactly the same objects.
+    return isinstance(x, float)
+
 def is_numeric(x):
     """Determines whether the arg is numeric
 
     Parameters
     ----------
     x : anytype
         The object to test.
 
     Returns
     -------
     bool
         True if either ``is_float(x)`` or ``is_integer(x)``
         is True; False otherwise.
     """
-    return isinstance(x, (numbers.Integral, int, float, long, np.int, np.float, np.long))
+    return is_float(x) or is_integer(x)
 
 
 def load_iris_df(include_tgt=True, tgt_name="Species"):
@@ -586,6 +610,118 @@ def load_iris_df(include_tgt=True, tgt_name="Species"):
     return X
 
 
+def safe_qcut(x, q, labels=None, retbins=False, precision=3):
+    """Quantile-based binning modelled on the Pandas qcut function.
+    Non-unique bin edges produce a warning rather than an Exception.
+
+    Parameters
+    ----------
+    x : ndarray or Series
+        The values to bin.
+
+    q : integer or array of quantiles
+        When an integer, the number of quantiles (10 for deciles,
+        4 for quartiles, etc.). Otherwise the quantiles themselves,
+        e.g. [0, .25, .5, .75, 1.] for quartiles.
+
+    labels : array or boolean, default None
+        Labels for the resulting bins; there must be exactly one
+        label per bin. If False, only integer indicators of the
+        bins are returned.
+
+    retbins : bool, optional
+        Whether to also return the computed bin edges.
+
+    precision : int
+        The precision at which to store and display the bins labels
+
+
+    Returns
+    -------
+    out : Categorical or Series or array of integers if labels is False
+        A Series when the input is a Series; otherwise a Categorical,
+        or a plain array of bin indicators when labels is False.
+
+    bins : ndarray of floats
+        Returned only if `retbins` is True.
+    """
+    # an integer q is expanded into q + 1 evenly spaced quantile points
+    quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q
+    edges = algos.quantile(x, quantiles)
+
+    return _bins_to_cuts(x, edges, labels=labels, retbins=retbins,
+                         precision=precision, include_lowest=True)
+
+
+
+def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
+                  precision=3, name=None, include_lowest=False):
+    """Assign each value of ``x`` to a bin delimited by ``bins``,
+    warning (instead of failing) when the bin edges are not unique.
+
+    Parameters
+    ----------
+    x : array-like or Series
+        The values to bin. A Series has its index (and, when ``name``
+        is None, its name) carried over to the result.
+
+    bins : ndarray
+        The bin edges; looked up with ``bins.searchsorted``.
+        NOTE(review): presumably expected to be sorted -- confirm.
+
+    right : bool, default True
+        Selects the ``searchsorted`` side ('left' when True, 'right'
+        otherwise) and is forwarded to ``_format_levels``.
+
+    labels : array, False or None, default None
+        None generates labels with ``_format_levels``; False returns
+        integer bin indicators; otherwise a sequence holding one label
+        per bin (``len(bins) - 1`` entries).
+
+    retbins : bool, default False
+        Whether to return ``bins`` along with the result.
+
+    precision : int, default 3
+        Starting precision handed to ``_format_levels``.
+
+    name : object, default None
+        Name for the returned Series (only used for Series input).
+
+    include_lowest : bool, default False
+        If True, values equal to ``bins[0]`` are put in the first bin.
+
+    Returns
+    -------
+    fac : Categorical, Series or ndarray
+        The binned values.
+
+    bins : ndarray
+        Returned only if ``retbins`` is True.
+
+    Raises
+    ------
+    ValueError
+        If ``labels`` does not hold ``len(bins) - 1`` entries, or if
+        ``_format_levels`` still fails after 20 precision increases.
+    """
+    x_is_series = isinstance(x, Series)
+    series_index = None
+
+    if x_is_series:
+        series_index = x.index
+        if name is None:
+            name = x.name
+
+    x = np.asarray(x)
+
+    side = 'left' if right else 'right'
+    ids = bins.searchsorted(x, side=side)
+
+    # Duplicate edges only trigger a warning here. QCutWarning is the
+    # category this module exports for that purpose; as a UserWarning
+    # subclass it is still matched by filters written for UserWarning.
+    if len(algos.unique(bins)) < len(bins):
+        warnings.warn('Bin edges must be unique: %s' 
+                      % repr(bins), QCutWarning)
+
+    if include_lowest:
+        ids[x == bins[0]] = 1
+
+    # an id of 0 or len(bins) means the value fell outside the edges
+    na_mask = pd.isnull(x) | (ids == len(bins)) | (ids == 0)
+    has_nas = na_mask.any()
+
+    if labels is not False:
+        if labels is None:
+            # retry label generation with one more digit of precision
+            # after each ValueError, re-raising on the 20th failure
+            increases = 0
+            while True:
+                try:
+                    levels = _format_levels(bins, precision, right=right,
+                                            include_lowest=include_lowest)
+                except ValueError:
+                    increases += 1
+                    precision += 1
+                    if increases >= 20:
+                        raise
+                else:
+                    break
+
+        else:
+            if len(labels) != len(bins) - 1:
+                raise ValueError('Bin labels must be one fewer than '
+                                 'the number of bin edges')
+            levels = labels
+
+        levels = np.asarray(levels, dtype=object)
+        # masked entries are zeroed so they become code -1 below
+        np.putmask(ids, na_mask, 0)
+        fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
+    else:
+        fac = ids - 1
+        if has_nas:
+            # switch to float so the masked entries can hold NaN
+            fac = fac.astype(np.float64)
+            np.putmask(fac, na_mask, np.nan)
+
+    if x_is_series:
+        fac = Series(fac, index=series_index, name=name)
+
+    if not retbins:
+        return fac
+
+    return fac, bins
+
+
+
 def report_grid_score_detail(random_search, charts=True, sort_results=True, 
         ascending=True, percentile=0.975, y_axis='score', sort_by='score',
         highlight_best=True):