Permalink
Browse files

describe() outputs bool similarly to categorical data

  • Loading branch information...
vii committed Feb 26, 2016
1 parent fe584e7 commit c8b3690c4ba98299d35abd65aa90d26e927db88c
Showing with 19 additions and 27 deletions.
  1. +10 −9 pandas/core/generic.py
  2. +7 −15 pandas/tests/frame/test_analytics.py
  3. +2 −3 pandas/tests/test_generic.py
View
@@ -4875,26 +4875,27 @@ def describe_numeric_1d(series, percentiles):
def describe_categorical_1d(data):
names = ['count', 'unique']
objcounts = data.value_counts()
result = [data.count(), len(objcounts[objcounts != 0])]
count_unique = len(objcounts[objcounts != 0])
result = [data.count(), count_unique]
if result[1] > 0:
top, freq = objcounts.index[0], objcounts.iloc[0]
if (data.dtype == object or
com.is_categorical_dtype(data.dtype)):
names += ['top', 'freq']
result += [top, freq]
elif com.is_datetime64_dtype(data):
if com.is_datetime64_dtype(data):
asint = data.dropna().values.view('i8')
names += ['top', 'freq', 'first', 'last']
result += [lib.Timestamp(top), freq,
lib.Timestamp(asint.min()),
lib.Timestamp(asint.max())]
else:
names += ['top', 'freq']
result += [top, freq]
return pd.Series(result, index=names, name=data.name)
def describe_1d(data, percentiles):
if com.is_numeric_dtype(data):
if com.is_bool_dtype(data):
return describe_categorical_1d(data)
elif com.is_numeric_dtype(data):
return describe_numeric_1d(data, percentiles)
elif com.is_timedelta64_dtype(data):
return describe_numeric_1d(data, percentiles)
@@ -4906,7 +4907,7 @@ def describe_1d(data, percentiles):
elif (include is None) and (exclude is None):
if len(self._get_numeric_data()._info_axis) > 0:
# when some numerics are found, keep only numerics
data = self.select_dtypes(include=[np.number, np.bool])
data = self.select_dtypes(include=[np.number])
else:
data = self
elif include == 'all':
@@ -241,24 +241,16 @@ def test_bool_describe_in_mixed_frame(self):
'int_data': [10, 20, 30, 40, 50],
})
# Boolean data and integer data is included in .describe() output,
# string data isn't
self.assert_numpy_array_equal(df.describe().columns, [
'bool_data', 'int_data'])
# Integer data are included in .describe() output,
# Boolean and string data are not.
self.assert_numpy_array_equal(df.describe().columns, ['int_data'])
bool_describe = df.describe()['bool_data']
bool_describe = df.describe(include='all')['bool_data']
# Both the min and the max values should stay booleans
self.assertEqual(bool_describe['min'].dtype, np.bool_)
self.assertEqual(bool_describe['max'].dtype, np.bool_)
# Top value is a boolean value that is False
self.assertTrue(isinstance(bool_describe['top'] , bool))
self.assertFalse(bool_describe['top'])
self.assertFalse(bool_describe['min'])
self.assertTrue(bool_describe['max'])
# For numeric operations, like mean or median, the values True/False
# are cast to the integer values 1 and 0
assert_almost_equal(bool_describe['mean'], 0.4)
assert_almost_equal(bool_describe['50%'], 0)
def test_reduce_mixed_frame(self):
# GH 6806
@@ -955,7 +955,7 @@ def test_describe_objects(self):
s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a'])
result = s.describe()
expected = Series({'count': 7, 'unique': 4,
'top': 'a', 'freq': 3}, index=result.index)
'top': 'a', 'freq': 3,'second':'b', 'second_freq': 2}, index=result.index)
assert_series_equal(result, expected)
dt = list(self.ts.index)
@@ -1486,9 +1486,8 @@ def test_describe_typefiltering_category_bool(self):
'D_num': np.arange(24.) + .5,
'E_ts': tm.makeTimeSeries()[:24].index})
# bool is summarized like categorical data in describe, so it is left out of the default numeric-only output
desc = df.describe()
expected_cols = ['C_bool', 'D_num']
expected_cols = ['D_num']
expected = DataFrame(dict((k, df[k].describe())
for k in expected_cols),
columns=expected_cols)

0 comments on commit c8b3690

Please sign in to comment.