pandas-dev · snitish · Dec 1, 2024 · Dec 3, 2024 · Feb 15, 2025 · Feb 16, 2025
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -750,6 +750,7 @@ Groupby/resample/rolling
 - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
 - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
 - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
+- Bug in :meth:`DataFrame.sum`, :meth:`Series.sum`, :meth:`.DataFrameGroupBy.sum` and :meth:`.SeriesGroupBy.sum` where the sum of all-NaN values with object dtype was incorrectly returned as ``0`` instead of ``nan`` (:issue:`60229`)
 - Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
 - Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for empty data frame with ``group_keys=False`` still creating output index using group keys. (:issue:`60471`)
 - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -725,8 +725,12 @@ def group_sum(
         raise ValueError("len(index) != len(labels)")
 
     nobs = np.zeros((<object>out).shape, dtype=np.int64)
-    # the below is equivalent to `np.zeros_like(out)` but faster
-    sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
+    if sum_t is object:
+        # For object dtype, fill value should not be 0 (#60229)
+        sumx = np.full((<object>out).shape, NAN, dtype=object)
+    else:
+        # the below is equivalent to `np.zeros_like(out)` but faster
+        sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
     compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
 
     N, K = (<object>values).shape
@@ -760,7 +764,10 @@ def group_sum(
                     if uses_mask:
                         isna_result = result_mask[lab, j]
                     else:
-                        isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
+                        isna_result = (
+                            _treat_as_na(sumx[lab, j], is_datetimelike) and
+                            nobs[lab, j] > 0
+                        )
 
                     if isna_result:
                         # If sum is already NA, don't add to it. This is important for
@@ -795,6 +802,7 @@ def group_sum(
                             compensation[lab, j] = 0
                         sumx[lab, j] = t
                 elif not skipna:
+                    nobs[lab, j] += 1
                     if uses_mask:
                         result_mask[lab, j] = True
                     else:

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -638,6 +638,13 @@ def nansum(
     the_sum = values.sum(axis, dtype=dtype_sum)
     the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count)
 
+    if dtype.kind == "O" and skipna and min_count == 0:
+        # GH#60229 For object dtype, sum of all-NA array should be nan
+        if isinstance(the_sum, np.ndarray):
+            the_sum[mask.sum(axis=axis) == mask.shape[axis]] = np.nan
+        elif mask.all():
+            the_sum = np.nan
+
     return the_sum
 
 

diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
@@ -1186,7 +1186,7 @@ def test_frame_single_columns_object_sum_axis_1():
     }
     df = DataFrame(data)
     result = df.sum(axis=1)
-    expected = Series(["A", 1.2, 0])
+    expected = Series(["A", 1.2, np.nan])
     tm.assert_series_equal(result, expected)
 
 

diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
@@ -420,10 +420,25 @@ def test_stat_operators_attempt_obj_array(self, method, df, axis):
         assert df.values.dtype == np.object_
         result = getattr(df, method)(axis=axis)
         expected = getattr(df.astype("f8"), method)(axis=axis).astype(object)
+        if method == "sum":
+            # GH#60229 in case of all-NA object array, sum should be nan
+            expected[df.isna().all(axis=axis)] = np.nan
         if axis in [1, "columns"] and method in ["min", "max"]:
             expected[expected.isna()] = None
         tm.assert_series_equal(result, expected)
 
+    def test_object_sum_allna(self):
+        # GH#60229
+        df = DataFrame({"a": [np.nan] * 5, "b": [pd.NA] * 5}, dtype=object)
+
+        result = df.sum(axis=0, skipna=True)
+        expected = Series([np.nan, np.nan], index=["a", "b"], dtype=object)
+        tm.assert_series_equal(result, expected)
+
+        result = df.sum(axis=0, skipna=False)
+        expected = Series([np.nan, pd.NA], index=["a", "b"], dtype=object)
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
     def test_mixed_ops(self, op):
         # GH#16116

diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -358,6 +358,7 @@ def test_observed(request, using_infer_string, observed):
         expected = cartesian_product_for_groupers(
             expected, [cat1, cat2], list("AB"), fill_value=0
         )
+        expected.loc[expected.C == 0, "C"] = np.nan
 
     tm.assert_frame_equal(result, expected)
 

diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
@@ -514,6 +514,21 @@ def test_sum_skipna_object(skipna):
     tm.assert_series_equal(result, expected)
 
 
+def test_sum_allnan_object(skipna):
+    # GH#60229
+    df = DataFrame(
+        {
+            "val": [np.nan] * 10,
+            "cat": ["A", "B"] * 5,
+        }
+    ).astype({"val": object})
+    expected = Series(
+        [np.nan, np.nan], index=pd.Index(["A", "B"], name="cat"), name="val"
+    ).astype(object)
+    result = df.groupby("cat")["val"].sum(skipna=skipna)
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "func, values, dtype, result_dtype",
     [

diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
@@ -113,7 +113,7 @@ def test_groupby_with_timegrouper(self):
                 unit=df.index.unit,
             )
             expected = DataFrame(
-                {"Buyer": 0, "Quantity": 0},
+                {"Buyer": np.nan, "Quantity": 0},
                 index=exp_dti,
             )
             # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl"

diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py
@@ -111,6 +111,20 @@ def test_prod_numpy16_bug():
     assert not isinstance(result, Series)
 
 
+@pytest.mark.parametrize("nan_val", [np.nan, pd.NA])
+def test_object_sum_allna(nan_val):
+    # GH#60229
+    ser = Series([nan_val] * 5, dtype=object)
+
+    result = ser.sum(axis=0, skipna=True)
+    expected = np.nan
+    tm.assert_equal(result, expected)
+
+    result = ser.sum(axis=0, skipna=False)
+    expected = nan_val
+    tm.assert_equal(result, expected)
+
+
 @pytest.mark.parametrize("func", [np.any, np.all])
 @pytest.mark.parametrize("kwargs", [{"keepdims": True}, {"out": object()}])
 def test_validate_any_all_out_keepdims_raises(kwargs, func):