Skip to content

Commit 781182c

Browse files
authored
ENH: Add Rolling.nunique() (#61087)
* ENH: Add Rolling.nunique()
* Add docstring for Expanding.nunique()
* Add a test for float precision issues
1 parent dab1b88 commit 781182c

File tree

11 files changed

+254
-2
lines changed

11 files changed

+254
-2
lines changed

asv_bench/benchmarks/rolling.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,19 @@ class Methods:
1010
["DataFrame", "Series"],
1111
[("rolling", {"window": 10}), ("rolling", {"window": 1000}), ("expanding", {})],
1212
["int", "float"],
13-
["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum", "sem"],
13+
[
14+
"median",
15+
"mean",
16+
"max",
17+
"min",
18+
"std",
19+
"count",
20+
"skew",
21+
"kurt",
22+
"sum",
23+
"sem",
24+
"nunique",
25+
],
1426
)
1527
param_names = ["constructor", "window_kwargs", "dtype", "method"]
1628

doc/source/reference/window.rst

+2
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ Rolling window functions
4242
Rolling.quantile
4343
Rolling.sem
4444
Rolling.rank
45+
Rolling.nunique
4546

4647
.. _api.functions_window:
4748

@@ -86,6 +87,7 @@ Expanding window functions
8687
Expanding.quantile
8788
Expanding.sem
8889
Expanding.rank
90+
Expanding.nunique
8991

9092
.. _api.functions_ewm:
9193

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ Other enhancements
6262
- :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
6363
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
6464
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
65+
- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
6566
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
6667
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
6768
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)

pandas/_libs/window/aggregations.pyi

+6
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,12 @@ def roll_rank(
8989
method: WindowingRankType,
9090
ascending: bool,
9191
) -> np.ndarray: ... # np.ndarray[float]
def roll_nunique(
    values: np.ndarray,  # const float64_t[:]
    start: np.ndarray,  # np.ndarray[np.int64]
    end: np.ndarray,  # np.ndarray[np.int64]
    minp: int,  # int64_t
) -> np.ndarray: ...  # np.ndarray[float]
9298
def roll_apply(
9399
obj: object,
94100
start: np.ndarray, # np.ndarray[np.int64]

pandas/_libs/window/aggregations.pyx

+61
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ from libc.math cimport (
66
sqrt,
77
)
88
from libcpp.deque cimport deque
9+
from libcpp.unordered_map cimport unordered_map
910

1011
from pandas._libs.algos cimport TiebreakEnumType
1112

@@ -1470,6 +1471,66 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
14701471
return np.asarray(output)
14711472

14721473

def roll_nunique(const float64_t[:] values, ndarray[int64_t] start,
                 ndarray[int64_t] end, int64_t minp) -> np.ndarray:
    """
    Rolling number of unique elements in the window.

    Parameters
    ----------
    values : const float64_t[:]
        Input values; NaN entries (val != val) are skipped.
    start, end : ndarray[int64_t]
        Per-window ``[start, end)`` index bounds.
    minp : int64_t
        Minimum number of non-NaN observations required; windows with
        fewer observations produce NaN.

    Returns
    -------
    np.ndarray
        float64 array of per-window unique counts.
    """
    cdef:
        Py_ssize_t i, j, s, e, N = len(start)
        int64_t nobs = 0
        float64_t val
        float64_t[::1] output
        # value -> occurrence count inside the current window
        unordered_map[float64_t, int64_t] value_counts

    # When the bounds advance monotonically, consecutive windows overlap
    # and the counts can be updated incrementally instead of rebuilt.
    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
        start, end
    )
    output = np.empty(N, dtype=np.float64)
    value_counts = unordered_map[float64_t, int64_t]()

    with nogil:
        for i in range(N):
            s = start[i]
            e = end[i]

            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
                # No usable overlap with the previous window: start fresh.
                if i != 0:
                    nobs = 0
                    value_counts.clear()

                # setup
                for j in range(s, e):
                    val = values[j]
                    if val == val:
                        nobs += 1
                        value_counts[val] += 1

            else:
                # calculate deletes: values that dropped off the left edge
                for j in range(start[i - 1], s):
                    val = values[j]
                    if val == val:
                        value_counts[val] -= 1
                        if value_counts[val] == 0:
                            value_counts.erase(val)
                        nobs -= 1

                # calculate adds: values that entered on the right edge
                for j in range(end[i - 1], e):
                    val = values[j]
                    if val == val:
                        nobs += 1
                        value_counts[val] += 1

            if nobs >= minp:
                output[i] = value_counts.size()
            else:
                output[i] = NaN

    return np.asarray(output)
14731534
def roll_apply(object obj,
14741535
ndarray[int64_t] start, ndarray[int64_t] end,
14751536
int64_t minp,

pandas/core/window/expanding.py

+35
Original file line numberDiff line numberDiff line change
@@ -927,6 +927,41 @@ def rank(
927927
numeric_only=numeric_only,
928928
)
929929

930+
@doc(
931+
template_header,
932+
".. versionadded:: 3.0.0 \n\n",
933+
create_section_header("Parameters"),
934+
kwargs_numeric_only,
935+
create_section_header("Returns"),
936+
template_returns,
937+
create_section_header("See Also"),
938+
template_see_also,
939+
create_section_header("Examples"),
940+
dedent(
941+
"""
942+
>>> s = pd.Series([1, 4, 2, 3, 5, 3])
943+
>>> s.expanding().nunique()
944+
0 1.0
945+
1 2.0
946+
2 3.0
947+
3 4.0
948+
4 5.0
949+
5 5.0
950+
dtype: float64
951+
"""
952+
).replace("\n", "", 1),
953+
window_method="expanding",
954+
aggregation_description="nunique",
955+
agg_method="nunique",
956+
)
957+
def nunique(
958+
self,
959+
numeric_only: bool = False,
960+
):
961+
return super().nunique(
962+
numeric_only=numeric_only,
963+
)
964+
930965
@doc(
931966
template_header,
932967
create_section_header("Parameters"),

pandas/core/window/rolling.py

+47
Original file line numberDiff line numberDiff line change
@@ -1799,6 +1799,16 @@ def rank(
17991799

18001800
return self._apply(window_func, name="rank", numeric_only=numeric_only)
18011801

1802+
def nunique(
1803+
self,
1804+
numeric_only: bool = False,
1805+
):
1806+
window_func = partial(
1807+
window_aggregations.roll_nunique,
1808+
)
1809+
1810+
return self._apply(window_func, name="nunique", numeric_only=numeric_only)
1811+
18021812
def cov(
18031813
self,
18041814
other: DataFrame | Series | None = None,
@@ -2855,6 +2865,43 @@ def rank(
28552865
numeric_only=numeric_only,
28562866
)
28572867

2868+
@doc(
2869+
template_header,
2870+
".. versionadded:: 3.0.0 \n\n",
2871+
create_section_header("Parameters"),
2872+
kwargs_numeric_only,
2873+
create_section_header("Returns"),
2874+
template_returns,
2875+
create_section_header("See Also"),
2876+
template_see_also,
2877+
create_section_header("Examples"),
2878+
dedent(
2879+
"""
2880+
>>> s = pd.Series([1, 4, 2, np.nan, 3, 3, 4, 5])
2881+
>>> s.rolling(3).nunique()
2882+
0 NaN
2883+
1 NaN
2884+
2 3.0
2885+
3 NaN
2886+
4 NaN
2887+
5 NaN
2888+
6 2.0
2889+
7 3.0
2890+
dtype: float64
2891+
"""
2892+
).replace("\n", "", 1),
2893+
window_method="rolling",
2894+
aggregation_description="nunique",
2895+
agg_method="nunique",
2896+
)
2897+
def nunique(
2898+
self,
2899+
numeric_only: bool = False,
2900+
):
2901+
return super().nunique(
2902+
numeric_only=numeric_only,
2903+
)
2904+
28582905
@doc(
28592906
template_header,
28602907
create_section_header("Parameters"),

pandas/tests/window/test_cython_aggregations.py

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def _get_rolling_aggregations():
3232
("roll_min", window_aggregations.roll_min),
3333
("roll_first", window_aggregations.roll_first),
3434
("roll_last", window_aggregations.roll_last),
35+
("roll_nunique", window_aggregations.roll_nunique),
3536
]
3637
+ [
3738
(

pandas/tests/window/test_expanding.py

+37
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,43 @@ def test_rank(window, method, pct, ascending, test_data):
255255
tm.assert_series_equal(result, expected)
256256

257257

@pytest.mark.parametrize("window", [1, 3, 10, 20])
@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"])
def test_nunique(window, test_data):
    # Validate the Cython expanding-nunique kernel against a naive
    # apply-based reference across several data regimes, including
    # NaN/inf handling and float-precision near-duplicates.
    length = 20
    if test_data == "default":
        ser = Series(data=np.random.default_rng(2).random(length))
    elif test_data == "duplicates":
        ser = Series(data=np.random.default_rng(2).choice(3, length))
    elif test_data == "nans":
        ser = Series(
            data=np.random.default_rng(2).choice(
                [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length
            )
        )
    elif test_data == "precision":
        ser = Series(
            data=[
                0.3,
                0.1 * 3,  # Not necessarily exactly 0.3
                0.6,
                0.2 * 3,  # Not necessarily exactly 0.6
                0.9,
                0.3 * 3,  # Not necessarily exactly 0.9
                0.5,
                0.1 * 5,  # Not necessarily exactly 0.5
                0.8,
                0.2 * 4,  # Not necessarily exactly 0.8
            ],
            dtype=np.float64,
        )

    expected = ser.expanding(window).apply(lambda x: x.nunique())
    result = ser.expanding(window).nunique()

    tm.assert_series_equal(result, expected)
294+
258295
def test_expanding_corr(series):
259296
A = series.dropna()
260297
B = (A + np.random.default_rng(2).standard_normal(len(A)))[:-5]

pandas/tests/window/test_groupby.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def test_getitem_multiple(self, roll_frame):
9696
"count",
9797
"kurt",
9898
"skew",
99+
"nunique",
99100
],
100101
)
101102
def test_rolling(self, f, roll_frame):
@@ -1034,7 +1035,19 @@ def frame(self):
10341035
return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)})
10351036

10361037
@pytest.mark.parametrize(
1037-
"f", ["sum", "mean", "min", "max", "first", "last", "count", "kurt", "skew"]
1038+
"f",
1039+
[
1040+
"sum",
1041+
"mean",
1042+
"min",
1043+
"max",
1044+
"first",
1045+
"last",
1046+
"count",
1047+
"kurt",
1048+
"skew",
1049+
"nunique",
1050+
],
10381051
)
10391052
def test_expanding(self, f, frame):
10401053
g = frame.groupby("A", group_keys=False)

pandas/tests/window/test_rolling.py

+37
Original file line numberDiff line numberDiff line change
@@ -1586,6 +1586,43 @@ def test_rank(window, method, pct, ascending, test_data):
15861586
tm.assert_series_equal(result, expected)
15871587

15881588

@pytest.mark.parametrize("window", [1, 3, 10, 20])
@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"])
def test_nunique(window, test_data):
    # Validate the Cython rolling-nunique kernel against a naive
    # apply-based reference across several data regimes, including
    # NaN/inf handling and float-precision near-duplicates.
    length = 20
    if test_data == "default":
        ser = Series(data=np.random.default_rng(2).random(length))
    elif test_data == "duplicates":
        ser = Series(data=np.random.default_rng(2).choice(3, length))
    elif test_data == "nans":
        ser = Series(
            data=np.random.default_rng(2).choice(
                [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length
            )
        )
    elif test_data == "precision":
        ser = Series(
            data=[
                0.3,
                0.1 * 3,  # Not necessarily exactly 0.3
                0.6,
                0.2 * 3,  # Not necessarily exactly 0.6
                0.9,
                0.3 * 3,  # Not necessarily exactly 0.9
                0.5,
                0.1 * 5,  # Not necessarily exactly 0.5
                0.8,
                0.2 * 4,  # Not necessarily exactly 0.8
            ],
            dtype=np.float64,
        )

    expected = ser.rolling(window).apply(lambda x: x.nunique())
    result = ser.rolling(window).nunique()

    tm.assert_series_equal(result, expected)
1625+
15891626
def test_rolling_quantile_np_percentile():
15901627
# #9413: Tests that rolling window's quantile default behavior
15911628
# is analogous to Numpy's percentile

0 commit comments

Comments
 (0)