BUG SeriesGroupBy.mean() overflowed on some integer array (pandas-dev…

…#22487) When integer arrays contained integers that were outside the range of int64, the conversion would overflow. Instead, only allow safe casting, and if a safe cast cannot be done, cast to float64.
troels · Sep 9, 2018 · 6796bf4 · 6796bf4
1 parent 0976e12
commit 6796bf4
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 1 deletion.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -754,6 +754,7 @@ Groupby/Resample/Rolling
 - Bug in :meth:`Resampler.apply` when passing postiional arguments to applied func (:issue:`14615`).
 - Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to `loffset` kwarg (:issue:`7687`).
 - Bug in :meth:`Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`).
+- Bug in :meth:`SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead (:issue:`22487`).
 
 Sparse
 ^^^^^^

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -471,7 +471,12 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1,
             if (values == iNaT).any():
                 values = ensure_float64(values)
             else:
-                values = values.astype('int64', copy=False)
+                try:
+                    values = values.astype('int64', copy=False, casting='safe')
+                except TypeError:
+                    # The values cannot be safely cast to int64 (e.g. uint64
+                    # data may exceed its range). Convert to float64 instead.
+                    values = values.astype('float64', copy=False)
         elif is_numeric and not is_complex_dtype(values):
             values = ensure_float64(values)
         else:

diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py
@@ -603,6 +603,15 @@ def test_groupby_mean_included():
     tm.assert_frame_equal(result, expected)
 
 
+def test_groupby_mean_no_overflow():
+    # GH 22487: mean of integers beyond the int64 range must not overflow
+    df = pd.DataFrame({
+        "user": ["A", "A", "A", "A", "A"],
+        "connections": [4970, 4749, 4719, 4704, 18446744073699999744]
+    })
+    assert df.groupby('user')['connections'].mean()['A'] == 3689348814740003840
+
+
 def test_astype_nansafe():
     # https://github.com/pandas-dev/pandas/pull/22343
     arr = integer_array([np.nan, 1, 2], dtype="Int8")