Skip to content

Commit

Permalink
BUG: GroupBy return EA dtype (pandas-dev#23318)
Browse files Browse the repository at this point in the history
  • Loading branch information
5hirish authored and tm9k1 committed Nov 19, 2018
1 parent 09faabd commit 0e76af9
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 17 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
Expand Up @@ -855,6 +855,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`).
- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`)
- :meth:`Series.unstack` no longer converts extension arrays to object-dtype ndarrays. The output ``DataFrame`` will now have the same dtype as the input. This changes behavior for Categorical and Sparse data (:issue:`23077`).
- Bug in :meth:`DataFrame.groupby` when aggregating on an ``ExtensionArray``: the result did not preserve the actual ``ExtensionArray`` dtype (:issue:`23227`).

.. _whatsnew_0240.api.incompatibilities:

Expand Down Expand Up @@ -1090,6 +1091,7 @@ Categorical
- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
- Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`).
- Bug in :meth:`DataFrame.resample` when aggregating on categorical data: the categorical dtype was getting lost (:issue:`23227`).

Datetimelike
^^^^^^^^^^^^
Expand Down
16 changes: 14 additions & 2 deletions pandas/core/groupby/groupby.py
Expand Up @@ -24,7 +24,8 @@ class providing the base-class of operations.
from pandas.util._validators import validate_kwargs

from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import ensure_float, is_numeric_dtype, is_scalar
from pandas.core.dtypes.common import (
ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar)
from pandas.core.dtypes.missing import isna, notna

import pandas.core.algorithms as algorithms
Expand Down Expand Up @@ -754,7 +755,18 @@ def _try_cast(self, result, obj, numeric_only=False):
dtype = obj.dtype

if not is_scalar(result):
if numeric_only and is_numeric_dtype(dtype) or not numeric_only:
if is_extension_array_dtype(dtype):
# The function can return something of any type, so check
# if the type is compatible with the calling EA.
try:
result = obj.values._from_sequence(result)
except Exception:
# https://github.com/pandas-dev/pandas/issues/22850
# pandas has no control over what 3rd-party ExtensionArrays
# do in _from_sequence. We still want ops to work
# though, so we catch any regular Exception.
pass
elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
result = maybe_downcast_to_dtype(result, dtype)

return result
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/arrays/test_integer.py
Expand Up @@ -650,9 +650,10 @@ def test_preserve_dtypes(op):

# groupby
result = getattr(df.groupby("A"), op)()

expected = pd.DataFrame({
"B": np.array([1.0, 3.0]),
"C": np.array([1, 3], dtype="int64")
"C": integer_array([1, 3], dtype="Int64")
}, index=pd.Index(['a', 'b'], name='A'))
tm.assert_frame_equal(result, expected)

Expand All @@ -673,9 +674,10 @@ def test_reduce_to_float(op):

# groupby
result = getattr(df.groupby("A"), op)()

expected = pd.DataFrame({
"B": np.array([1.0, 3.0]),
"C": np.array([1, 3], dtype="float64")
"C": integer_array([1, 3], dtype="Int64")
}, index=pd.Index(['a', 'b'], name='A'))
tm.assert_frame_equal(result, expected)

Expand Down
37 changes: 24 additions & 13 deletions pandas/tests/sparse/test_groupby.py
Expand Up @@ -24,27 +24,39 @@ def test_first_last_nth(self):
sparse_grouped = self.sparse.groupby('A')
dense_grouped = self.dense.groupby('A')

sparse_grouped_first = sparse_grouped.first()
sparse_grouped_last = sparse_grouped.last()
sparse_grouped_nth = sparse_grouped.nth(1)

dense_grouped_first = dense_grouped.first().to_sparse()
dense_grouped_last = dense_grouped.last().to_sparse()
dense_grouped_nth = dense_grouped.nth(1).to_sparse()

# TODO: shouldn't these all be sparse or not?
tm.assert_frame_equal(sparse_grouped.first(),
dense_grouped.first())
tm.assert_frame_equal(sparse_grouped.last(),
dense_grouped.last())
tm.assert_frame_equal(sparse_grouped.nth(1),
dense_grouped.nth(1).to_sparse())
tm.assert_frame_equal(sparse_grouped_first,
dense_grouped_first)
tm.assert_frame_equal(sparse_grouped_last,
dense_grouped_last)
tm.assert_frame_equal(sparse_grouped_nth,
dense_grouped_nth)

def test_aggfuncs(self):
sparse_grouped = self.sparse.groupby('A')
dense_grouped = self.dense.groupby('A')

tm.assert_frame_equal(sparse_grouped.mean(),
dense_grouped.mean())
result = sparse_grouped.mean().to_sparse()
expected = dense_grouped.mean().to_sparse()

tm.assert_frame_equal(result, expected)

# ToDo: sparse sum includes str column
# tm.assert_frame_equal(sparse_grouped.sum(),
# dense_grouped.sum())

tm.assert_frame_equal(sparse_grouped.count(),
dense_grouped.count())
result = sparse_grouped.count().to_sparse()
expected = dense_grouped.count().to_sparse()

tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("fill_value", [0, np.nan])
Expand All @@ -54,6 +66,5 @@ def test_groupby_includes_fill_value(fill_value):
'b': [fill_value, 1, fill_value, fill_value]})
sdf = df.to_sparse(fill_value=fill_value)
result = sdf.groupby('a').sum()
expected = df.groupby('a').sum()
tm.assert_frame_equal(result, expected,
check_index_type=False)
expected = df.groupby('a').sum().to_sparse(fill_value=fill_value)
tm.assert_frame_equal(result, expected, check_index_type=False)
1 change: 1 addition & 0 deletions pandas/tests/test_resample.py
Expand Up @@ -1576,6 +1576,7 @@ def test_resample_categorical_data_with_timedeltaindex(self):
'Group': ['A', 'A']},
index=pd.to_timedelta([0, 10], unit='s'))
expected = expected.reindex(['Group_obj', 'Group'], axis=1)
expected['Group'] = expected['Group_obj'].astype('category')
tm.assert_frame_equal(result, expected)

def test_resample_daily_anchored(self):
Expand Down

0 comments on commit 0e76af9

Please sign in to comment.