Skip to content

Commit 781182c

Browse files
authored
ENH: Add Rolling.nunique() (#61087)
* ENH: Add Rolling.nunique()
* Add docstring for Expanding.nunique()
* Add a test for float precision issues
1 parent dab1b88 commit 781182c

File tree

11 files changed

+254
-2
lines changed

11 files changed

+254
-2
lines changed

asv_bench/benchmarks/rolling.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,19 @@ class Methods:
1010
["DataFrame", "Series"],
1111
[("rolling", {"window": 10}), ("rolling", {"window": 1000}), ("expanding", {})],
1212
["int", "float"],
13-
["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum", "sem"],
13+
[
14+
"median",
15+
"mean",
16+
"max",
17+
"min",
18+
"std",
19+
"count",
20+
"skew",
21+
"kurt",
22+
"sum",
23+
"sem",
24+
"nunique",
25+
],
1426
)
1527
param_names = ["constructor", "window_kwargs", "dtype", "method"]
1628

doc/source/reference/window.rst

+2
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ Rolling window functions
4242
Rolling.quantile
4343
Rolling.sem
4444
Rolling.rank
45+
Rolling.nunique
4546

4647
.. _api.functions_window:
4748

@@ -86,6 +87,7 @@ Expanding window functions
8687
Expanding.quantile
8788
Expanding.sem
8889
Expanding.rank
90+
Expanding.nunique
8991

9092
.. _api.functions_ewm:
9193

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ Other enhancements
6262
- :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
6363
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
6464
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
65+
- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
6566
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
6667
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
6768
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)

pandas/_libs/window/aggregations.pyi

+6
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,12 @@ def roll_rank(
8989
method: WindowingRankType,
9090
ascending: bool,
9191
) -> np.ndarray: ... # np.ndarray[float]
def roll_nunique(
    values: np.ndarray,  # const float64_t[:]
    start: np.ndarray,  # np.ndarray[np.int64]
    end: np.ndarray,  # np.ndarray[np.int64]
    minp: int,  # int64_t
) -> np.ndarray: ...  # np.ndarray[float]
9298
def roll_apply(
9399
obj: object,
94100
start: np.ndarray, # np.ndarray[np.int64]

pandas/_libs/window/aggregations.pyx

+61
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ from libc.math cimport (
66
sqrt,
77
)
88
from libcpp.deque cimport deque
9+
from libcpp.unordered_map cimport unordered_map
910

1011
from pandas._libs.algos cimport TiebreakEnumType
1112

@@ -1470,6 +1471,66 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
14701471
return np.asarray(output)
14711472

14721473

def roll_nunique(const float64_t[:] values, ndarray[int64_t] start,
                 ndarray[int64_t] end, int64_t minp) -> np.ndarray:
    """
    Rolling number of unique elements in the window.

    Parameters
    ----------
    values : const float64_t[:]
        Input values; NaN entries (val != val) are skipped.
    start, end : ndarray[int64_t]
        Per-window ``[start, end)`` index bounds.
    minp : int64_t
        Minimum number of non-NaN observations required; windows with
        fewer observations produce NaN.

    Returns
    -------
    np.ndarray
        float64 array of per-window unique counts.
    """
    cdef:
        Py_ssize_t i, j, s, e, N = len(start)
        int64_t nobs = 0
        float64_t val
        float64_t[::1] output
        # value -> occurrence count inside the current window
        unordered_map[float64_t, int64_t] value_counts

    # When the bounds advance monotonically, consecutive windows overlap
    # and the counts can be updated incrementally instead of rebuilt.
    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
        start, end
    )
    output = np.empty(N, dtype=np.float64)
    value_counts = unordered_map[float64_t, int64_t]()

    with nogil:
        for i in range(N):
            s = start[i]
            e = end[i]

            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
                # No usable overlap with the previous window: start fresh.
                if i != 0:
                    nobs = 0
                    value_counts.clear()

                # setup
                for j in range(s, e):
                    val = values[j]
                    if val == val:
                        nobs += 1
                        value_counts[val] += 1

            else:
                # calculate deletes: values that dropped off the left edge
                for j in range(start[i - 1], s):
                    val = values[j]
                    if val == val:
                        value_counts[val] -= 1
                        if value_counts[val] == 0:
                            value_counts.erase(val)
                        nobs -= 1

                # calculate adds: values that entered on the right edge
                for j in range(end[i - 1], e):
                    val = values[j]
                    if val == val:
                        nobs += 1
                        value_counts[val] += 1

            if nobs >= minp:
                output[i] = value_counts.size()
            else:
                output[i] = NaN

    return np.asarray(output)
14731534
def roll_apply(object obj,
14741535
ndarray[int64_t] start, ndarray[int64_t] end,
14751536
int64_t minp,

pandas/core/window/expanding.py

+35
Original file line numberDiff line numberDiff line change
@@ -927,6 +927,41 @@ def rank(
927927
numeric_only=numeric_only,
928928
)
929929

930+
@doc(
931+
template_header,
932+
".. versionadded:: 3.0.0 \n\n",
933+
create_section_header("Parameters"),
934+
kwargs_numeric_only,
935+
create_section_header("Returns"),
936+
template_returns,
937+
create_section_header("See Also"),
938+
template_see_also,
939+
create_section_header("Examples"),
940+
dedent(
941+
"""
942+
>>> s = pd.Series([1, 4, 2, 3, 5, 3])
943+
>>> s.expanding().nunique()
944+
0 1.0
945+
1 2.0
946+
2 3.0
947+
3 4.0
948+
4 5.0
949+
5 5.0
950+
dtype: float64
951+
"""
952+
).replace("\n", "", 1),
953+
window_method="expanding",
954+
aggregation_description="nunique",
955+
agg_method="nunique",
956+
)
957+
def nunique(
958+
self,
959+
numeric_only: bool = False,
960+
):
961+
return super().nunique(
962+
numeric_only=numeric_only,
963+
)
964+
930965
@doc(
931966
template_header,
932967
create_section_header("Parameters"),

pandas/core/window/rolling.py

+47
Original file line numberDiff line numberDiff line change
@@ -1799,6 +1799,16 @@ def rank(
17991799

18001800
return self._apply(window_func, name="rank", numeric_only=numeric_only)
18011801

1802+
def nunique(
1803+
self,
1804+
numeric_only: bool = False,
1805+
):
1806+
window_func = partial(
1807+
window_aggregations.roll_nunique,
1808+
)
1809+
1810+
return self._apply(window_func, name="nunique", numeric_only=numeric_only)
1811+
18021812
def cov(
18031813
self,
18041814
other: DataFrame | Series | None = None,
@@ -2855,6 +2865,43 @@ def rank(
28552865
numeric_only=numeric_only,
28562866
)
28572867

2868+
@doc(
2869+
template_header,
2870+
".. versionadded:: 3.0.0 \n\n",
2871+
create_section_header("Parameters"),
2872+
kwargs_numeric_only,
2873+
create_section_header("Returns"),
2874+
template_returns,
2875+
create_section_header("See Also"),
2876+
template_see_also,
2877+
create_section_header("Examples"),
2878+
dedent(
2879+
"""
2880+
>>> s = pd.Series([1, 4, 2, np.nan, 3, 3, 4, 5])
2881+
>>> s.rolling(3).nunique()
2882+
0 NaN
2883+
1 NaN
2884+
2 3.0
2885+
3 NaN
2886+
4 NaN
2887+
5 NaN
2888+
6 2.0
2889+
7 3.0
2890+
dtype: float64
2891+
"""
2892+
).replace("\n", "", 1),
2893+
window_method="rolling",
2894+
aggregation_description="nunique",
2895+
agg_method="nunique",
2896+
)
2897+
def nunique(
2898+
self,
2899+
numeric_only: bool = False,
2900+
):
2901+
return super().nunique(
2902+
numeric_only=numeric_only,
2903+
)
2904+
28582905
@doc(
28592906
template_header,
28602907
create_section_header("Parameters"),

pandas/tests/window/test_cython_aggregations.py

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def _get_rolling_aggregations():
3232
("roll_min", window_aggregations.roll_min),
3333
("roll_first", window_aggregations.roll_first),
3434
("roll_last", window_aggregations.roll_last),
35+
("roll_nunique", window_aggregations.roll_nunique),
3536
]
3637
+ [
3738
(

pandas/tests/window/test_expanding.py

+37
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,43 @@ def test_rank(window, method, pct, ascending, test_data):
255255
tm.assert_series_equal(result, expected)
256256

257257

@pytest.mark.parametrize("window", [1, 3, 10, 20])
@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"])
def test_nunique(window, test_data):
    # Validate the Cython expanding-nunique kernel against a naive
    # apply-based reference across several data regimes, including
    # NaN/inf handling and float-precision near-duplicates.
    length = 20
    if test_data == "default":
        ser = Series(data=np.random.default_rng(2).random(length))
    elif test_data == "duplicates":
        ser = Series(data=np.random.default_rng(2).choice(3, length))
    elif test_data == "nans":
        ser = Series(
            data=np.random.default_rng(2).choice(
                [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length
            )
        )
    elif test_data == "precision":
        ser = Series(
            data=[
                0.3,
                0.1 * 3,  # Not necessarily exactly 0.3
                0.6,
                0.2 * 3,  # Not necessarily exactly 0.6
                0.9,
                0.3 * 3,  # Not necessarily exactly 0.9
                0.5,
                0.1 * 5,  # Not necessarily exactly 0.5
                0.8,
                0.2 * 4,  # Not necessarily exactly 0.8
            ],
            dtype=np.float64,
        )

    expected = ser.expanding(window).apply(lambda x: x.nunique())
    result = ser.expanding(window).nunique()

    tm.assert_series_equal(result, expected)
294+
258295
def test_expanding_corr(series):
259296
A = series.dropna()
260297
B = (A + np.random.default_rng(2).standard_normal(len(A)))[:-5]

pandas/tests/window/test_groupby.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def test_getitem_multiple(self, roll_frame):
9696
"count",
9797
"kurt",
9898
"skew",
99+
"nunique",
99100
],
100101
)
101102
def test_rolling(self, f, roll_frame):
@@ -1034,7 +1035,19 @@ def frame(self):
10341035
return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)})
10351036

10361037
@pytest.mark.parametrize(
1037-
"f", ["sum", "mean", "min", "max", "first", "last", "count", "kurt", "skew"]
1038+
"f",
1039+
[
1040+
"sum",
1041+
"mean",
1042+
"min",
1043+
"max",
1044+
"first",
1045+
"last",
1046+
"count",
1047+
"kurt",
1048+
"skew",
1049+
"nunique",
1050+
],
10381051
)
10391052
def test_expanding(self, f, frame):
10401053
g = frame.groupby("A", group_keys=False)

pandas/tests/window/test_rolling.py

+37
Original file line numberDiff line numberDiff line change
@@ -1586,6 +1586,43 @@ def test_rank(window, method, pct, ascending, test_data):
15861586
tm.assert_series_equal(result, expected)
15871587

15881588

@pytest.mark.parametrize("window", [1, 3, 10, 20])
@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"])
def test_nunique(window, test_data):
    # Validate the Cython rolling-nunique kernel against a naive
    # apply-based reference across several data regimes, including
    # NaN/inf handling and float-precision near-duplicates.
    length = 20
    if test_data == "default":
        ser = Series(data=np.random.default_rng(2).random(length))
    elif test_data == "duplicates":
        ser = Series(data=np.random.default_rng(2).choice(3, length))
    elif test_data == "nans":
        ser = Series(
            data=np.random.default_rng(2).choice(
                [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length
            )
        )
    elif test_data == "precision":
        ser = Series(
            data=[
                0.3,
                0.1 * 3,  # Not necessarily exactly 0.3
                0.6,
                0.2 * 3,  # Not necessarily exactly 0.6
                0.9,
                0.3 * 3,  # Not necessarily exactly 0.9
                0.5,
                0.1 * 5,  # Not necessarily exactly 0.5
                0.8,
                0.2 * 4,  # Not necessarily exactly 0.8
            ],
            dtype=np.float64,
        )

    expected = ser.rolling(window).apply(lambda x: x.nunique())
    result = ser.rolling(window).nunique()

    tm.assert_series_equal(result, expected)
1625+
15891626
def test_rolling_quantile_np_percentile():
15901627
# #9413: Tests that rolling window's quantile default behavior
15911628
# is analogous to Numpy's percentile

0 commit comments

Comments
 (0)