Skip to content

Commit

Permalink
Fix the output of df.describe on an empty categorical / object column (
Browse files Browse the repository at this point in the history
  • Loading branch information
enisnazif authored and vaibhavhrt committed Jun 6, 2019
1 parent 605476e commit a69d56f
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 1 deletion.
28 changes: 28 additions & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,34 @@ are returned. (:issue:`21521`)
df.groupby("a").ffill()
``DataFrame`` describe on an empty categorical / object column will return top and freq
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When calling :meth:`DataFrame.describe` with an empty categorical / object
column, the 'top' and 'freq' rows were previously omitted, which was inconsistent with
the output for non-empty columns. Now the 'top' and 'freq' rows will always be included,
with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397`).

.. ipython:: python
df = pd.DataFrame({"empty_col": pd.Categorical([])})
df
*Previous Behavior*:

.. code-block:: python
In [3]: df.describe()
Out[3]:
empty_col
count 0
unique 0
*New Behavior*:

.. ipython:: python
df.describe()
``__str__`` methods now call ``__repr__`` rather than vice versa
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1483,7 +1483,7 @@ def value_counts(self, dropna=True):

if dropna or clean:
obs = code if clean else code[mask]
count = bincount(obs, minlength=ncat or None)
count = bincount(obs, minlength=ncat or 0)
else:
count = bincount(np.where(mask, code, ncat))
ix = np.append(ix, -1)
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9920,6 +9920,12 @@ def describe_categorical_1d(data):
names += ['top', 'freq']
result += [top, freq]

# If the DataFrame is empty, set 'top' and 'freq' to None
# to maintain output shape consistency
else:
names += ['top', 'freq']
result += [None, None]

return pd.Series(result, index=names, name=data.name)

def describe_1d(data):
Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,16 @@ def test_describe_categorical(self):
result = df3.describe()
tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)

def test_describe_empty_categorical_column(self):
# GH 26397
# Ensure the index of an empty categorical DataFrame column
# also contains (count, unique, top, freq)
df = pd.DataFrame({"empty_col": Categorical([])})
result = df.describe()
expected = DataFrame({'empty_col': [0, 0, None, None]},
index=['count', 'unique', 'top', 'freq'])
tm.assert_frame_equal(result, expected)

def test_describe_categorical_columns(self):
# GH 11558
columns = pd.CategoricalIndex(['int1', 'int2', 'obj'],
Expand All @@ -608,6 +618,7 @@ def test_describe_categorical_columns(self):
index=['count', 'mean', 'std', 'min', '25%',
'50%', '75%', 'max'],
columns=exp_columns)

tm.assert_frame_equal(result, expected)
tm.assert_categorical_equal(result.columns.values,
expected.columns.values)
Expand Down

0 comments on commit a69d56f

Please sign in to comment.