Fix the output of df.describe on an empty categorical / object column (…

…pandas-dev#26474)
vaibhavhrt · Jun 6, 2019 · a69d56f · a69d56f
1 parent 605476e
commit a69d56f
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 1 deletion.
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -253,6 +253,34 @@ are returned. (:issue:`21521`)
 
     df.groupby("a").ffill()
 
+``DataFrame`` describe on an empty categorical / object column will return top and freq
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When calling :meth:`DataFrame.describe` with an empty categorical / object
+column, the 'top' and 'freq' rows were previously omitted, which was inconsistent with
+the output for non-empty columns. Now the 'top' and 'freq' rows will always be included,
+with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397`).
+
+.. ipython:: python
+
+   df = pd.DataFrame({"empty_col": pd.Categorical([])})
+   df
+
+*Previous Behavior*:
+
+.. code-block:: python
+
+   In [3]: df.describe()
+   Out[3]:
+           empty_col
+   count           0
+   unique          0
+
+*New Behavior*:
+
+.. ipython:: python
+
+    df.describe()
 
 ``__str__`` methods now call ``__repr__`` rather than vice versa
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -1483,7 +1483,7 @@ def value_counts(self, dropna=True):
 
         if dropna or clean:
             obs = code if clean else code[mask]
-            count = bincount(obs, minlength=ncat or None)
+            count = bincount(obs, minlength=ncat or 0)
         else:
             count = bincount(np.where(mask, code, ncat))
             ix = np.append(ix, -1)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -9920,6 +9920,12 @@ def describe_categorical_1d(data):
                     names += ['top', 'freq']
                     result += [top, freq]
 
+            # If the DataFrame is empty, set 'top' and 'freq' to None
+            # to maintain output shape consistency
+            else:
+                names += ['top', 'freq']
+                result += [None, None]
+
             return pd.Series(result, index=names, name=data.name)
 
         def describe_1d(data):

diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -588,6 +588,16 @@ def test_describe_categorical(self):
         result = df3.describe()
         tm.assert_numpy_array_equal(result["cat"].values, result["s"].values)
 
+    def test_describe_empty_categorical_column(self):
+        # GH 26397
+        # Ensure that describing an empty categorical DataFrame column
+        # still yields the full index (count, unique, top, freq)
+        df = pd.DataFrame({"empty_col": Categorical([])})
+        result = df.describe()
+        expected = DataFrame({'empty_col': [0, 0, None, None]},
+                             index=['count', 'unique', 'top', 'freq'])
+        tm.assert_frame_equal(result, expected)
+
     def test_describe_categorical_columns(self):
         # GH 11558
         columns = pd.CategoricalIndex(['int1', 'int2', 'obj'],
@@ -608,6 +618,7 @@ def test_describe_categorical_columns(self):
                              index=['count', 'mean', 'std', 'min', '25%',
                                     '50%', '75%', 'max'],
                              columns=exp_columns)
+
         tm.assert_frame_equal(result, expected)
         tm.assert_categorical_equal(result.columns.values,
                                     expected.columns.values)