Skip to content

BUG: Groupby.sum, DataFrame.sum and Series.sum for object type should be NA instead of 0 for all-nan values #60458

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
@@ -750,6 +750,7 @@ Groupby/resample/rolling
- Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
- Bug in :meth:`DataFrame.sum`, :meth:`Series.sum`, :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` where, for object dtype, the result for all-NaN values was incorrectly set to 0 instead of ``nan``. (:issue:`60229`)
- Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
- Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for empty data frame with ``group_keys=False`` still creating output index using group keys. (:issue:`60471`)
- Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
14 changes: 11 additions & 3 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
@@ -725,8 +725,12 @@ def group_sum(
raise ValueError("len(index) != len(labels)")

nobs = np.zeros((<object>out).shape, dtype=np.int64)
# the below is equivalent to `np.zeros_like(out)` but faster
sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
if sum_t is object:
# For object dtype, fill value should not be 0 (#60229)
sumx = np.full((<object>out).shape, NAN, dtype=object)
else:
# the below is equivalent to `np.zeros_like(out)` but faster
sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

N, K = (<object>values).shape
@@ -760,7 +764,10 @@ def group_sum(
if uses_mask:
isna_result = result_mask[lab, j]
else:
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
isna_result = (
_treat_as_na(sumx[lab, j], is_datetimelike) and
nobs[lab, j] > 0
)

if isna_result:
# If sum is already NA, don't add to it. This is important for
@@ -795,6 +802,7 @@ def group_sum(
compensation[lab, j] = 0
sumx[lab, j] = t
elif not skipna:
nobs[lab, j] += 1
if uses_mask:
result_mask[lab, j] = True
else:
7 changes: 7 additions & 0 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
@@ -638,6 +638,13 @@ def nansum(
the_sum = values.sum(axis, dtype=dtype_sum)
the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count)

if dtype.kind == "O" and skipna and min_count == 0:
# GH#60229 For object dtype, sum of all-NA array should be nan
if isinstance(the_sum, np.ndarray):
the_sum[mask.sum(axis=axis) == mask.shape[axis]] = np.nan
elif mask.all():
the_sum = np.nan

return the_sum


2 changes: 1 addition & 1 deletion pandas/tests/frame/test_arithmetic.py
Original file line number Diff line number Diff line change
@@ -1186,7 +1186,7 @@ def test_frame_single_columns_object_sum_axis_1():
}
df = DataFrame(data)
result = df.sum(axis=1)
expected = Series(["A", 1.2, 0])
expected = Series(["A", 1.2, np.nan])
tm.assert_series_equal(result, expected)


15 changes: 15 additions & 0 deletions pandas/tests/frame/test_reductions.py
Original file line number Diff line number Diff line change
@@ -420,10 +420,25 @@ def test_stat_operators_attempt_obj_array(self, method, df, axis):
assert df.values.dtype == np.object_
result = getattr(df, method)(axis=axis)
expected = getattr(df.astype("f8"), method)(axis=axis).astype(object)
if method == "sum":
# GH#60229 in case of all-NA object array, sum should be nan
expected[df.isna().all(axis=axis)] = np.nan
if axis in [1, "columns"] and method in ["min", "max"]:
expected[expected.isna()] = None
tm.assert_series_equal(result, expected)

def test_object_sum_allna(self):
    """GH#60229: summing all-NA object columns must not produce 0."""
    columns = ["a", "b"]
    frame = DataFrame({"a": [np.nan] * 5, "b": [pd.NA] * 5}, dtype=object)

    # skipna=True: both columns are expected to sum to nan
    skipped = frame.sum(axis=0, skipna=True)
    tm.assert_series_equal(
        skipped, Series([np.nan, np.nan], index=columns, dtype=object)
    )

    # skipna=False: each column is expected to return its own NA marker
    kept = frame.sum(axis=0, skipna=False)
    tm.assert_series_equal(
        kept, Series([np.nan, pd.NA], index=columns, dtype=object)
    )

@pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
def test_mixed_ops(self, op):
# GH#16116
1 change: 1 addition & 0 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
@@ -358,6 +358,7 @@ def test_observed(request, using_infer_string, observed):
expected = cartesian_product_for_groupers(
expected, [cat1, cat2], list("AB"), fill_value=0
)
expected.loc[expected.C == 0, "C"] = np.nan

tm.assert_frame_equal(result, expected)

15 changes: 15 additions & 0 deletions pandas/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
@@ -514,6 +514,21 @@ def test_sum_skipna_object(skipna):
tm.assert_series_equal(result, expected)


def test_sum_allnan_object(skipna):
    """GH#60229: groupby sum over all-NaN object values gives nan per group."""
    frame = DataFrame({"val": [np.nan] * 10, "cat": ["A", "B"] * 5})
    frame = frame.astype({"val": object})

    result = frame.groupby("cat")["val"].sum(skipna=skipna)

    # The same expectation is asserted for every value of the skipna fixture
    group_index = pd.Index(["A", "B"], name="cat")
    expected = Series([np.nan, np.nan], index=group_index, name="val").astype(object)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"func, values, dtype, result_dtype",
[
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_timegrouper.py
Original file line number Diff line number Diff line change
@@ -113,7 +113,7 @@ def test_groupby_with_timegrouper(self):
unit=df.index.unit,
)
expected = DataFrame(
{"Buyer": 0, "Quantity": 0},
{"Buyer": np.nan, "Quantity": 0},
index=exp_dti,
)
# Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl"
14 changes: 14 additions & 0 deletions pandas/tests/series/test_reductions.py
Original file line number Diff line number Diff line change
@@ -111,6 +111,20 @@ def test_prod_numpy16_bug():
assert not isinstance(result, Series)


@pytest.mark.parametrize("nan_val", [np.nan, pd.NA])
def test_object_sum_allna(nan_val):
    """GH#60229: sum of an all-NA object Series must not be 0."""
    ser = Series([nan_val] * 5, dtype=object)

    # skipna=True: expected to be nan whichever NA marker fills the Series
    tm.assert_equal(ser.sum(axis=0, skipna=True), np.nan)

    # skipna=False: expected to be the NA marker the Series was built from
    tm.assert_equal(ser.sum(axis=0, skipna=False), nan_val)


@pytest.mark.parametrize("func", [np.any, np.all])
@pytest.mark.parametrize("kwargs", [{"keepdims": True}, {"out": object()}])
def test_validate_any_all_out_keepdims_raises(kwargs, func):
Loading
Oops, something went wrong.