Skip to content

Commit

Permalink
BUG: GroupBy.cummin altering nullable arrays inplace (pandas-dev#46220)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and yehoshuadimarsky committed Jul 13, 2022
1 parent 8d7165b commit d20c883
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 31 deletions.
5 changes: 3 additions & 2 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -430,8 +430,9 @@ Groupby/resample/rolling
- Bug in :meth:`.ExponentialMovingWindow.mean` with ``axis=1`` and ``engine='numba'`` when the :class:`DataFrame` has more columns than rows (:issue:`46086`)
- Bug when using ``engine="numba"`` would return the same jitted function when modifying ``engine_kwargs`` (:issue:`46086`)
- Bug in :meth:`.DataFrameGroupby.transform` fails when ``axis=1`` and ``func`` is ``"first"`` or ``"last"`` (:issue:`45986`)
- Bug in :meth:`DataFrameGroupby.cumsum` with ``skipna=False`` giving incorrect results (:issue:`??`)
- Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`??`)
- Bug in :meth:`DataFrameGroupby.cumsum` with ``skipna=False`` giving incorrect results (:issue:`46216`)
- Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`46216`)
- Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` with nullable dtypes incorrectly altering the original data in place (:issue:`46220`)
-

Reshaping
Expand Down
48 changes: 28 additions & 20 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1477,7 +1477,8 @@ def group_min(
cdef group_cummin_max(
iu_64_floating_t[:, ::1] out,
ndarray[iu_64_floating_t, ndim=2] values,
uint8_t[:, ::1] mask,
const uint8_t[:, ::1] mask,
uint8_t[:, ::1] result_mask,
const intp_t[::1] labels,
int ngroups,
bint is_datetimelike,
Expand All @@ -1496,6 +1497,9 @@ cdef group_cummin_max(
mask : np.ndarray[bool] or None
If not None, indices represent missing values,
otherwise the mask will not be used
result_mask : ndarray[bool, ndim=2], optional
If not None, these specify locations in the output that are NA.
Modified in-place.
labels : np.ndarray[np.intp]
Labels to group by.
ngroups : int
Expand Down Expand Up @@ -1558,7 +1562,7 @@ cdef group_cummin_max(

if not skipna and na_possible and seen_na[lab, j]:
if uses_mask:
mask[i, j] = 1 # FIXME: shouldn't alter inplace
result_mask[i, j] = 1
# Set to 0 ensures that we are deterministic and can
# downcast if appropriate
out[i, j] = 0
Expand Down Expand Up @@ -1595,19 +1599,21 @@ def group_cummin(
const intp_t[::1] labels,
int ngroups,
bint is_datetimelike,
uint8_t[:, ::1] mask=None,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
) -> None:
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
values,
mask,
labels,
ngroups,
is_datetimelike,
skipna,
compute_max=False
out=out,
values=values,
mask=mask,
result_mask=result_mask,
labels=labels,
ngroups=ngroups,
is_datetimelike=is_datetimelike,
skipna=skipna,
compute_max=False,
)


Expand All @@ -1619,17 +1625,19 @@ def group_cummax(
const intp_t[::1] labels,
int ngroups,
bint is_datetimelike,
uint8_t[:, ::1] mask=None,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint skipna=True,
) -> None:
"""See group_cummin_max.__doc__"""
group_cummin_max(
out,
values,
mask,
labels,
ngroups,
is_datetimelike,
skipna,
compute_max=True
out=out,
values=values,
mask=mask,
result_mask=result_mask,
labels=labels,
ngroups=ngroups,
is_datetimelike=is_datetimelike,
skipna=skipna,
compute_max=True,
)
18 changes: 9 additions & 9 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,9 +406,13 @@ def _masked_ea_wrap_cython_operation(
"""
orig_values = values

# Copy to ensure input and result masks don't end up shared
mask = values._mask.copy()
result_mask = np.zeros(ngroups, dtype=bool)
# libgroupby functions are responsible for NOT altering mask
mask = values._mask
if self.kind != "aggregate":
result_mask = mask.copy()
else:
result_mask = np.zeros(ngroups, dtype=bool)

arr = values._data

res_values = self._cython_op_ndim_compat(
Expand All @@ -427,12 +431,7 @@ def _masked_ea_wrap_cython_operation(
# dtype; last attempt ran into trouble on 32bit linux build
res_values = res_values.astype(dtype.type, copy=False)

if self.kind != "aggregate":
out_mask = mask
else:
out_mask = result_mask

return orig_values._maybe_mask_result(res_values, out_mask)
return orig_values._maybe_mask_result(res_values, result_mask)

@final
def _cython_op_ndim_compat(
Expand Down Expand Up @@ -568,6 +567,7 @@ def _call_cython_op(
ngroups=ngroups,
is_datetimelike=is_datetimelike,
mask=mask,
result_mask=result_mask,
**kwargs,
)
else:
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -843,11 +843,15 @@ def test_cummax(dtypes_for_minmax):
def test_cummin_max_skipna(method, dtype, groups, expected_data):
# GH-34047
df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
orig = df.copy()
gb = df.groupby(groups)["a"]

result = getattr(gb, method)(skipna=False)
expected = Series(expected_data, dtype=dtype, name="a")

# check we didn't accidentally alter df
tm.assert_frame_equal(df, orig)

tm.assert_series_equal(result, expected)


Expand Down

0 comments on commit d20c883

Please sign in to comment.