Skip to content

Commit

Permalink
v 0.0.43
Browse files Browse the repository at this point in the history
  - Added pd_stats
  • Loading branch information
tgsmith61591 committed Aug 30, 2016
1 parent e335dcb commit c13a750
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 3 deletions.
2 changes: 1 addition & 1 deletion skutil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import sys, warnings
from .utils import log, exp # want these visible at module level

__version__ = '0.0.42'
__version__ = '0.0.43'

try:
# This variable is injected in the __builtins__ by the build
Expand Down
4 changes: 2 additions & 2 deletions skutil/h2o/transform.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from __future__ import print_function, division, absolute_import
from .base import BaseH2OTransformer, _frame_from_x_y, _check_is_frame
from ..utils import is_numeric, flatten_all
import numpy as np
import numbers
from .base import BaseH2OTransformer, _frame_from_x_y, _check_is_frame
from ..utils import is_numeric, flatten_all
from sklearn.utils.validation import check_is_fitted

__all__ = [
Expand Down
4 changes: 4 additions & 0 deletions skutil/h2o/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,10 @@ def h2o_corr_plot(X, plot_type='cor', cmap='Blues_d', n_levels=5,
X.columns = cols # set the cols to the same names
X.index = cols
corr = 'precomputed'

else:
# WARNING! This pulls everything into memory...
X = X.as_data_frame(use_pandas=True)

corr_plot(X, plot_type=plot_type, cmap=cmap, n_levels=n_levels,
figsize=figsize, cmap_a=cmap_a, cmap_b=cmap_b,
Expand Down
29 changes: 29 additions & 0 deletions skutil/utils/tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,35 @@ def test_safe_log_exp():
assert_fails(exp, ValueError, 'A')


def test_pd_stats():
    """Exercise pd_stats for each supported col_type and for an invalid one."""

    def _as_label(code):
        # 0 -> 'a', 1 -> 'b', anything else -> 'c'
        if code == 0:
            return 'a'
        if code == 1:
            return 'b'
        return 'c'

    iris = load_iris_df()

    # a float-typed duplicate of the species column
    iris['species_float'] = iris.Species.astype('float')

    # an object-typed (string) duplicate of the species column
    iris['species_factor'] = [_as_label(code) for code in iris.Species]

    # every column should be reported when nothing is filtered out
    report = pd_stats(iris, col_type='all')
    assert all(name in report.columns for name in iris.columns)
    # the float copy only holds whole numbers, so it is reported as an int
    assert report['species_float']['dtype'].startswith('int')

    # numeric-only: the single object column must be dropped
    report = pd_stats(iris, col_type='numeric')
    assert 'species_factor' not in report.columns
    assert report.shape[1] == (iris.shape[1] - 1)

    # object-only: just the single object column remains
    report = pd_stats(iris, col_type='object')
    assert 'species_factor' in report.columns
    assert report.shape[1] == 1

    # an unrecognized col_type must be rejected
    assert_fails(pd_stats, ValueError, iris, 'bad_type')



def test_corr():
with warnings.catch_warnings():
warnings.simplefilter("ignore")
Expand Down
117 changes: 117 additions & 0 deletions skutil/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix as cm
from sklearn.datasets import load_iris
from sklearn.externals import six
from ..base import SelectiveWarning, ModuleImportWarning

try:
Expand Down Expand Up @@ -49,6 +50,7 @@
'is_numeric',
'load_iris_df',
'log',
'pd_stats',
'report_confusion_matrix',
'report_grid_score_detail',
'shuffle_dataframe',
Expand Down Expand Up @@ -375,6 +377,121 @@ def df_memory_estimate(X, bit_est=32, unit='MB', index=False):
return human_bytes(X.memory_usage(index=index).sum(), unit)


def _is_int(x, tp):
"""Determine whether a column can be cast to int
without loss of data
"""
if not any([tp.startswith(i) for i in ('float', 'int')]):
return False

# if there's no difference between the two, then it's an int.
return (x - x.astype('int')).abs().sum() == 0

def pd_stats(X, col_type='all'):
    """Get a descriptive report of the elements in the data frame.
    Builds on existing pandas `describe` method.

    Parameters
    ----------
    X : pd.DataFrame
        The DataFrame

    col_type : str, optional (default='all')
        The types of columns to analyze. One of ('all',
        'numeric', 'object'). If not all, will only return
        corresponding typed columns.

    Returns
    -------
    pd.DataFrame
        One column per analyzed column of ``X``. The rows are the
        statistics from ``describe`` plus 'dtype', 'unique_ct',
        'min_max_class_ratio', 'skewness', 'skewness rating' and
        'kurtosis'. Cells that do not apply to a column hold '--'.

    Raises
    ------
    ValueError
        If ``col_type`` is not one of the valid types.
    """
    X, _ = validate_is_pd(X, None, False)

    # validate col_type up front so we fail before doing any real work
    valid_types = ('all', 'numeric', 'object')
    if col_type not in valid_types:
        raise ValueError('expected col_type in (%s), but got %s'
                         % (','.join(valid_types), col_type))

    raw_stats = X.describe()
    stats = raw_stats.to_dict()
    dtypes = X.dtypes

    # if user only wants part back, we can use this...
    type_dict = {}

    # the string to use when we don't want to populate a cell
    _nastr = '--'

    # objects are going to get dropped in the describe() call,
    # so we need to add them back in with dicts of nastr for all...
    for nm in dtypes[dtypes == 'object'].index.values:
        stats[nm] = {stat: _nastr for stat in raw_stats.index.values}

    # we'll add rows to the stats...
    for col, dct in six.iteritems(stats):
        # add the dtype
        _dtype = str(dtypes[col])
        is_numer = _dtype.startswith(('int', 'float'))
        dct['dtype'] = _dtype

        # populate type_dict
        type_dict[col] = 'numeric' if is_numer else 'object'

        # if the dtype is not a float, we can
        # get the count of uniques, then do a
        # ratio of minority : majority
        if _is_int(X[col], _dtype) or _dtype == 'object':
            _unique = len(X[col].unique())
            _val_cts = X[col].value_counts().sort_values(ascending=True)

            # with fewer than two classes (including a column with no
            # non-null values at all) there is no ratio to report
            if len(_val_cts) < 2:
                _min_max_ratio = _nastr
            else:
                # force true division so the ratio is not floored
                # to zero under Python 2 integer semantics
                _min_max_ratio = float(_val_cts.values[0]) / _val_cts.values[-1]

            # chance we didn't recognize it as an int before...
            if 'float' in dct['dtype']:
                dct['dtype'] = dct['dtype'].replace('float', 'int')

        else:
            _unique = _min_max_ratio = _nastr

        # populate the unique count and more
        dct['unique_ct'] = _unique
        dct['min_max_class_ratio'] = _min_max_ratio

        # get the skewness...
        if is_numer:
            _skew, _kurt = X[col].skew(), X[col].kurtosis()
            abs_skew = abs(_skew)

            # thresholds are contiguous: (1, inf) is high, (0.5, 1] is
            # moderate, everything else (including NaN) is symmetric
            if abs_skew > 1:
                _skew_risk = 'high skew'
            elif abs_skew > 0.5:
                _skew_risk = 'mod. skew'
            else:
                _skew_risk = 'symmetric'
        else:
            _skew = _kurt = _skew_risk = _nastr

        dct['skewness'] = _skew
        dct['skewness rating'] = _skew_risk
        dct['kurtosis'] = _kurt

    # keep only the columns whose broad type matches the filter
    if col_type != 'all':
        stat_out = {col: stats[col]
                    for col, dtype in six.iteritems(type_dict)
                    if dtype == col_type}
    else:
        stat_out = stats

    return pd.DataFrame.from_dict(stat_out)



def get_numeric(X):
"""Return list of indices of numeric dtypes variables
Expand Down

0 comments on commit c13a750

Please sign in to comment.