Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add Rolling.nunique() #61087

Merged
merged 3 commits into from
Mar 10, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion asv_bench/benchmarks/rolling.py
Original file line number Diff line number Diff line change
@@ -10,7 +10,19 @@ class Methods:
["DataFrame", "Series"],
[("rolling", {"window": 10}), ("rolling", {"window": 1000}), ("expanding", {})],
["int", "float"],
["median", "mean", "max", "min", "std", "count", "skew", "kurt", "sum", "sem"],
[
"median",
"mean",
"max",
"min",
"std",
"count",
"skew",
"kurt",
"sum",
"sem",
"nunique",
],
)
param_names = ["constructor", "window_kwargs", "dtype", "method"]

2 changes: 2 additions & 0 deletions doc/source/reference/window.rst
Original file line number Diff line number Diff line change
@@ -42,6 +42,7 @@ Rolling window functions
Rolling.quantile
Rolling.sem
Rolling.rank
Rolling.nunique

.. _api.functions_window:

@@ -86,6 +87,7 @@ Expanding window functions
Expanding.quantile
Expanding.sem
Expanding.rank
Expanding.nunique

.. _api.functions_ewm:

1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
@@ -62,6 +62,7 @@ Other enhancements
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` methods ``sum``, ``mean``, ``median``, ``prod``, ``min``, ``max``, ``std``, ``var`` and ``sem`` now accept ``skipna`` parameter (:issue:`15675`)
- :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`)
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
6 changes: 6 additions & 0 deletions pandas/_libs/window/aggregations.pyi
Original file line number Diff line number Diff line change
@@ -89,6 +89,12 @@ def roll_rank(
method: WindowingRankType,
ascending: bool,
) -> np.ndarray: ... # np.ndarray[float]
# Rolling count of distinct non-NaN values in each window.
def roll_nunique(
    values: np.ndarray,  # const float64_t[:]
    start: np.ndarray,  # np.ndarray[np.int64]
    end: np.ndarray,  # np.ndarray[np.int64]
    minp: int,  # int64_t; windows with fewer non-NaN obs yield NaN
) -> np.ndarray: ...  # np.ndarray[float]
def roll_apply(
obj: object,
start: np.ndarray, # np.ndarray[np.int64]
61 changes: 61 additions & 0 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
@@ -6,6 +6,7 @@ from libc.math cimport (
sqrt,
)
from libcpp.deque cimport deque
from libcpp.unordered_map cimport unordered_map

from pandas._libs.algos cimport TiebreakEnumType

@@ -1470,6 +1471,66 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
return np.asarray(output)


def roll_nunique(const float64_t[:] values, ndarray[int64_t] start,
                 ndarray[int64_t] end, int64_t minp) -> np.ndarray:
    """
    Rolling number of unique elements in the window.

    NaN entries (filtered via the ``val == val`` self-comparison) are
    excluded both from the distinct count and from the observation count
    checked against ``minp``.

    Parameters
    ----------
    values : const float64_t[:]
        Values over which the windows slide.
    start : ndarray[int64_t]
        Start index of each window.
    end : ndarray[int64_t]
        End index (exclusive) of each window.
    minp : int64_t
        Minimum number of non-NaN observations a window needs to emit a
        count; otherwise that window's output is NaN.

    Returns
    -------
    np.ndarray[float64]
    """
    cdef:
        Py_ssize_t i, j, s, e, N = len(start)
        int64_t nobs = 0  # number of non-NaN observations in current window
        float64_t val
        float64_t[::1] output
        # value -> multiplicity within the current window; its size() is
        # exactly the number of distinct non-NaN values
        unordered_map[float64_t, int64_t] value_counts

    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
        start, end
    )
    output = np.empty(N, dtype=np.float64)
    value_counts = unordered_map[float64_t, int64_t]()

    with nogil:
        for i in range(N):
            s = start[i]
            e = end[i]

            # Rebuild window state from scratch when the previous state
            # cannot be reused: first iteration, non-monotonic bounds, or
            # no overlap with the previous window (s >= end[i - 1]).
            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
                if i != 0:
                    nobs = 0
                    value_counts.clear()

                # setup: count every non-NaN value in the full window
                for j in range(s, e):
                    val = values[j]
                    if val == val:
                        nobs += 1
                        value_counts[val] += 1

            else:
                # calculate deletes: values that left the window since
                # the previous iteration
                for j in range(start[i - 1], s):
                    val = values[j]
                    if val == val:
                        value_counts[val] -= 1
                        # erase fully-removed values so size() stays exact
                        if value_counts[val] == 0:
                            value_counts.erase(val)
                        nobs -= 1

                # calculate adds: values that entered the window
                for j in range(end[i - 1], e):
                    val = values[j]
                    if val == val:
                        nobs += 1
                        value_counts[val] += 1

            if nobs >= minp:
                output[i] = value_counts.size()
            else:
                output[i] = NaN

    return np.asarray(output)


def roll_apply(object obj,
ndarray[int64_t] start, ndarray[int64_t] end,
int64_t minp,
35 changes: 35 additions & 0 deletions pandas/core/window/expanding.py
Original file line number Diff line number Diff line change
@@ -927,6 +927,41 @@ def rank(
numeric_only=numeric_only,
)

@doc(
    template_header,
    ".. versionadded:: 3.0.0 \n\n",
    create_section_header("Parameters"),
    kwargs_numeric_only,
    create_section_header("Returns"),
    template_returns,
    create_section_header("See Also"),
    template_see_also,
    create_section_header("Examples"),
    dedent(
        """
    >>> s = pd.Series([1, 4, 2, 3, 5, 3])
    >>> s.expanding().nunique()
    0    1.0
    1    2.0
    2    3.0
    3    4.0
    4    5.0
    5    5.0
    dtype: float64
    """
    ).replace("\n", "", 1),
    window_method="expanding",
    aggregation_description="nunique",
    agg_method="nunique",
)
def nunique(
    self,
    numeric_only: bool = False,
):
    # Thin public wrapper: the user-facing docstring is assembled by the
    # @doc decorator above; the computation is delegated to the shared
    # base-class implementation via super().
    return super().nunique(
        numeric_only=numeric_only,
    )

@doc(
template_header,
create_section_header("Parameters"),
47 changes: 47 additions & 0 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
@@ -1799,6 +1799,16 @@ def rank(

return self._apply(window_func, name="rank", numeric_only=numeric_only)

def nunique(
    self,
    numeric_only: bool = False,
):
    """
    Calculate the window count of distinct non-NaN values.

    Shared implementation used by both Rolling and Expanding; the public
    subclasses attach the user-facing docstring via their @doc wrappers.

    Parameters
    ----------
    numeric_only : bool, default False
        Include only float, int, boolean columns.

    Returns
    -------
    Series or DataFrame
        Same type as the caller, with per-window unique counts.
    """
    # roll_nunique takes no extra keyword arguments, so it can be handed
    # to _apply directly -- wrapping it in an argument-less partial() was
    # a no-op.
    window_func = window_aggregations.roll_nunique

    return self._apply(window_func, name="nunique", numeric_only=numeric_only)

def cov(
self,
other: DataFrame | Series | None = None,
@@ -2855,6 +2865,43 @@ def rank(
numeric_only=numeric_only,
)

@doc(
    template_header,
    ".. versionadded:: 3.0.0 \n\n",
    create_section_header("Parameters"),
    kwargs_numeric_only,
    create_section_header("Returns"),
    template_returns,
    create_section_header("See Also"),
    template_see_also,
    create_section_header("Examples"),
    dedent(
        """
    >>> s = pd.Series([1, 4, 2, np.nan, 3, 3, 4, 5])
    >>> s.rolling(3).nunique()
    0    NaN
    1    NaN
    2    3.0
    3    NaN
    4    NaN
    5    NaN
    6    2.0
    7    3.0
    dtype: float64
    """
    ).replace("\n", "", 1),
    window_method="rolling",
    aggregation_description="nunique",
    agg_method="nunique",
)
def nunique(
    self,
    numeric_only: bool = False,
):
    # Thin public wrapper: the user-facing docstring is assembled by the
    # @doc decorator above; the computation is delegated to the shared
    # base-class implementation via super().
    return super().nunique(
        numeric_only=numeric_only,
    )

@doc(
template_header,
create_section_header("Parameters"),
1 change: 1 addition & 0 deletions pandas/tests/window/test_cython_aggregations.py
Original file line number Diff line number Diff line change
@@ -32,6 +32,7 @@ def _get_rolling_aggregations():
("roll_min", window_aggregations.roll_min),
("roll_first", window_aggregations.roll_first),
("roll_last", window_aggregations.roll_last),
("roll_nunique", window_aggregations.roll_nunique),
]
+ [
(
37 changes: 37 additions & 0 deletions pandas/tests/window/test_expanding.py
Original file line number Diff line number Diff line change
@@ -255,6 +255,43 @@ def test_rank(window, method, pct, ascending, test_data):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("window", [1, 3, 10, 20])
@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"])
def test_nunique(window, test_data):
    # Compare the Cython-backed expanding nunique against a pure-Python
    # apply-based reference across several data profiles.
    length = 20
    rng = np.random.default_rng(2)

    if test_data == "duplicates":
        # Small integer alphabet -> many repeated values per window.
        ser = Series(data=rng.choice(3, length))
    elif test_data == "nans":
        # Mix of NaN (excluded from the count) with finite and infinite values.
        ser = Series(
            data=rng.choice([1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length)
        )
    elif test_data == "precision":
        # Pairs that are mathematically equal but need not be bit-identical
        # floats (e.g. 0.1 * 3 is not necessarily exactly 0.3).
        pairs = [
            (0.3, 0.1 * 3),
            (0.6, 0.2 * 3),
            (0.9, 0.3 * 3),
            (0.5, 0.1 * 5),
            (0.8, 0.2 * 4),
        ]
        ser = Series(
            data=[value for pair in pairs for value in pair],
            dtype=np.float64,
        )
    else:  # "default"
        ser = Series(data=rng.random(length))

    expected = ser.expanding(window).apply(lambda x: x.nunique())
    result = ser.expanding(window).nunique()

    tm.assert_series_equal(result, expected)


def test_expanding_corr(series):
A = series.dropna()
B = (A + np.random.default_rng(2).standard_normal(len(A)))[:-5]
15 changes: 14 additions & 1 deletion pandas/tests/window/test_groupby.py
Original file line number Diff line number Diff line change
@@ -96,6 +96,7 @@ def test_getitem_multiple(self, roll_frame):
"count",
"kurt",
"skew",
"nunique",
],
)
def test_rolling(self, f, roll_frame):
@@ -1034,7 +1035,19 @@ def frame(self):
return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)})

@pytest.mark.parametrize(
"f", ["sum", "mean", "min", "max", "first", "last", "count", "kurt", "skew"]
"f",
[
"sum",
"mean",
"min",
"max",
"first",
"last",
"count",
"kurt",
"skew",
"nunique",
],
)
def test_expanding(self, f, frame):
g = frame.groupby("A", group_keys=False)
37 changes: 37 additions & 0 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
@@ -1586,6 +1586,43 @@ def test_rank(window, method, pct, ascending, test_data):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("window", [1, 3, 10, 20])
@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"])
def test_nunique(window, test_data):
    # Compare the Cython-backed rolling nunique against a pure-Python
    # apply-based reference across several data profiles.
    length = 20
    rng = np.random.default_rng(2)

    if test_data == "duplicates":
        # Small integer alphabet -> many repeated values per window.
        ser = Series(data=rng.choice(3, length))
    elif test_data == "nans":
        # Mix of NaN (excluded from the count) with finite and infinite values.
        ser = Series(
            data=rng.choice([1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length)
        )
    elif test_data == "precision":
        # Pairs that are mathematically equal but need not be bit-identical
        # floats (e.g. 0.1 * 3 is not necessarily exactly 0.3).
        pairs = [
            (0.3, 0.1 * 3),
            (0.6, 0.2 * 3),
            (0.9, 0.3 * 3),
            (0.5, 0.1 * 5),
            (0.8, 0.2 * 4),
        ]
        ser = Series(
            data=[value for pair in pairs for value in pair],
            dtype=np.float64,
        )
    else:  # "default"
        ser = Series(data=rng.random(length))

    expected = ser.rolling(window).apply(lambda x: x.nunique())
    result = ser.rolling(window).nunique()

    tm.assert_series_equal(result, expected)


def test_rolling_quantile_np_percentile():
# #9413: Tests that rolling window's quantile default behavior
# is analogous to Numpy's percentile
Loading
Oops, something went wrong.