From 0d353ec14c79c4c5123623f00555843324041b37 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 1 Jun 2023 09:22:32 -0600 Subject: [PATCH 01/24] Bump codecov/codecov-action from 3.1.3 to 3.1.4 (#243) Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 3.1.3 to 3.1.4. - [Release notes](https://github.com/codecov/codecov-action/releases) - [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/codecov/codecov-action/compare/v3.1.3...v3.1.4) --- updated-dependencies: - dependency-name: codecov/codecov-action dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-additional.yaml | 4 ++-- .github/workflows/ci.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 5a0198307..38fba5d88 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -73,7 +73,7 @@ jobs: run: | python -m pytest --doctest-modules flox --ignore flox/tests --cov=./ --cov-report=xml - name: Upload code coverage to Codecov - uses: codecov/codecov-action@v3.1.3 + uses: codecov/codecov-action@v3.1.4 with: file: ./coverage.xml flags: unittests @@ -126,7 +126,7 @@ jobs: python -m mypy --install-types --non-interactive --cobertura-xml-report mypy_report - name: Upload mypy coverage to Codecov - uses: codecov/codecov-action@v3.1.3 + uses: codecov/codecov-action@v3.1.4 with: file: mypy_report/cobertura.xml flags: mypy diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4facd9a86..2d65e9d1d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -48,7 +48,7 @@ jobs: run: | pytest -n auto --cov=./ --cov-report=xml - name: Upload code coverage to Codecov - uses: codecov/codecov-action@v3.1.3 + uses: codecov/codecov-action@v3.1.4 with: file: ./coverage.xml flags: unittests @@ -91,7 +91,7 @@ jobs: run: | python -m pytest -n auto --cov=./ --cov-report=xml - name: Upload code coverage to Codecov - uses: codecov/codecov-action@v3.1.3 + uses: codecov/codecov-action@v3.1.4 with: file: ./coverage.xml flags: unittests From 5d223a70dc1aa45953e31662e76f31813a660a74 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 14 Jun 2023 12:06:21 -0600 Subject: [PATCH 02/24] Delete resample_reduce (#246) Closes #245 --- flox/xarray.py | 48 ++++-------------------------------- tests/test_xarray.py | 58 +------------------------------------------- 2 files changed, 6 insertions(+), 100 deletions(-) diff --git a/flox/xarray.py b/flox/xarray.py index ec06b9161..df8f773df 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -1,6 +1,5 @@ from __future__ import annotations -import warnings from typing import TYPE_CHECKING, Any, Hashable, Iterable, Sequence, Union import numpy as np @@ -21,7 +20,6 @@ from .xrutils import _contains_cftime_datetimes, _to_pytimedelta, datetime_to_numeric if TYPE_CHECKING: - from xarray.core.resample import Resample from xarray.core.types import T_DataArray, T_Dataset Dims = Union[str, Iterable[Hashable], None] @@ -450,7 +448,11 @@ def wrapper(array, *by, func, skipna, core_dims, **kwargs): actual = actual.drop_vars(name) # When grouping by MultiIndex, expect is an pd.Index wrapping # an object array of tuples - if name in ds_broad.indexes and 
isinstance(ds_broad.indexes[name], pd.MultiIndex): + if ( + name in ds_broad.indexes + and isinstance(ds_broad.indexes[name], pd.MultiIndex) + and not isinstance(expect, pd.RangeIndex) + ): levelnames = ds_broad.indexes[name].names expect = pd.MultiIndex.from_tuples(expect.values, names=levelnames) actual[name] = expect @@ -582,43 +584,3 @@ def _rechunk(func, obj, dim, labels, **kwargs): ) return obj - - -def resample_reduce( - resampler: Resample, - func: str | Aggregation, - keep_attrs: bool = True, - **kwargs, -): - warnings.warn( - "flox.xarray.resample_reduce is now deprecated. Please use Xarray's resample method directly.", - DeprecationWarning, - ) - - obj = resampler._obj - dim = resampler._group_dim - - # this creates a label DataArray since resample doesn't do that somehow - tostack = [] - for idx, slicer in enumerate(resampler._group_indices): - if slicer.stop is None: - stop = resampler._obj.sizes[dim] - else: - stop = slicer.stop - tostack.append(idx * np.ones((stop - slicer.start,), dtype=np.int32)) - by = xr.DataArray(np.hstack(tostack), dims=(dim,), name="__resample_dim__") - - result = ( - xarray_reduce( - obj, - by, - func=func, - method="blockwise", - keep_attrs=keep_attrs, - **kwargs, - ) - .rename({"__resample_dim__": dim}) - .transpose(dim, ...) - ) - result[dim] = resampler._unique_coord.data - return result diff --git a/tests/test_xarray.py b/tests/test_xarray.py index 2fce2552c..7a343d962 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -6,7 +6,7 @@ xr = pytest.importorskip("xarray") # isort: on -from flox.xarray import rechunk_for_blockwise, resample_reduce, xarray_reduce +from flox.xarray import rechunk_for_blockwise, xarray_reduce from . import assert_equal, has_dask, raise_if_dask_computes, requires_dask @@ -245,48 +245,6 @@ def test_xarray_reduce_errors(): xarray_reduce(da, by.chunk(), func="mean") -@pytest.mark.parametrize("isdask", [True, False]) -@pytest.mark.parametrize("dataarray", [True, False]) -@pytest.mark.parametrize("chunklen", [27, 4 * 31 + 1, 4 * 31 + 20]) -def test_xarray_resample(chunklen, isdask, dataarray, engine): - if isdask: - if not has_dask: - pytest.skip() - ds = xr.tutorial.open_dataset("air_temperature", chunks={"time": chunklen}) - else: - ds = xr.tutorial.open_dataset("air_temperature") - - if dataarray: - ds = ds.air - - resampler = ds.resample(time="M") - with pytest.warns(DeprecationWarning): - actual = resample_reduce(resampler, "mean", engine=engine) - expected = resampler.mean() - xr.testing.assert_allclose(actual, expected) - - with xr.set_options(use_flox=True): - actual = resampler.mean() - xr.testing.assert_allclose(actual, expected) - - -@requires_dask -def test_xarray_resample_dataset_multiple_arrays(engine): - # regression test for #35 - times = pd.date_range("2000", periods=5) - foo = xr.DataArray(range(5), dims=["time"], coords=[times], name="foo") - bar = xr.DataArray(range(1, 6), dims=["time"], coords=[times], name="bar") - ds = xr.merge([foo, bar]).chunk({"time": 4}) - - resampler = ds.resample(time="4D") - # The separate computes are necessary here to force xarray - # to compute all variables in result at the same time. 
- expected = resampler.mean().compute() - with pytest.warns(DeprecationWarning): - result = resample_reduce(resampler, "mean", engine=engine).compute() - xr.testing.assert_allclose(expected, result) - - @requires_dask @pytest.mark.parametrize( "inchunks, expected", @@ -439,20 +397,6 @@ def test_cache(): assert len(cache.data) == 2 -@pytest.mark.parametrize("use_cftime", [True, False]) -@pytest.mark.parametrize("func", ["count", "mean"]) -def test_datetime_array_reduce(use_cftime, func, engine): - time = xr.DataArray( - xr.date_range("2009-01-01", "2012-12-31", use_cftime=use_cftime), - dims=("time",), - name="time", - ) - expected = getattr(time.resample(time="YS"), func)() - with pytest.warns(DeprecationWarning): - actual = resample_reduce(time.resample(time="YS"), func=func, engine=engine) - assert_equal(expected, actual) - - @requires_dask @pytest.mark.parametrize("method", ["cohorts", "map-reduce"]) def test_groupby_bins_indexed_coordinate(method): From 6cf315a57e006dc01c801039833351240c4431fe Mon Sep 17 00:00:00 2001 From: Antonio Valentino Date: Sat, 17 Jun 2023 16:42:34 +0200 Subject: [PATCH 03/24] Fix test failure on i386 (#248) Closes: #247 --- flox/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flox/core.py b/flox/core.py index 57ea4556f..02f53b837 100644 --- a/flox/core.py +++ b/flox/core.py @@ -537,7 +537,7 @@ def factorize_( right = expect.closed_right idx = np.digitize( flat, - bins=bins.view(np.intp) if bins.dtype.kind == "M" else bins, + bins=bins.view(np.int64) if bins.dtype.kind == "M" else bins, right=right, ) idx -= 1 From 66f152b377897b481f460b6fad9c8707ead910d4 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Mon, 3 Jul 2023 18:00:42 +0200 Subject: [PATCH 04/24] typing fixes (#235) * Update core.py * Update xarray.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * avoid renaming * Update xarray.py * Update xarray.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update xarray.py * Update xarray.py * Update xarray.py * Update xarray.py * Update xarray.py * split to optional * Update xarray.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update xarray.py * convert to pd.Index instead of ndarray * Handled different slicer types? * not supported instead? 
* specify type for simple_combine * Handle None in agg.min_count * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * add overloads and rename * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * more overloads * ignore * Update core.py * Update xarray.py * Update core.py * Update core.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update core.py * Update core.py * Update core.py * Update core.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update flox/core.py * Update flox/core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * Update core.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update xarray.py * Have to add another type here because of xarray not supporting IntervalIndex * Update xarray.py * test ex instead of e * Revert "test ex instead of e" This reverts commit 8e55d3a62181153bcd011d99fc73af5b3a01eb6b. * check reveal_type * without e * try no redefinition * IF redefining ex, mypy always takes the first definition of ex. even if it has been narrowed down. * test min_count=0 * test min_count=0 * test min_count=0 * test min_count=0 * test min_count = 0 * test min_count=0 * test min_count=0 * test min_count=0 * test min_count=0 * test min_count=0 * test min_count=0 * test min_count=0 * test min_count=0 * Update asv_bench/benchmarks/combine.py --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Deepak Cherian --- asv_bench/benchmarks/combine.py | 5 +- flox/aggregations.py | 11 ++-- flox/core.py | 102 +++++++++++++++++++++----------- flox/xarray.py | 46 ++++++++------ 4 files changed, 105 insertions(+), 59 deletions(-) diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py index 286746c75..dd3d7a178 100644 --- a/asv_bench/benchmarks/combine.py +++ b/asv_bench/benchmarks/combine.py @@ -1,4 +1,5 @@ from functools import partial +from typing import Any import numpy as np @@ -43,8 +44,8 @@ class Combine1d(Combine): this is for reducting along a single dimension """ - def setup(self, *args, **kwargs): - def construct_member(groups): + def setup(self, *args, **kwargs) -> None: + def construct_member(groups) -> dict[str, Any]: return { "groups": groups, "intermediates": [ diff --git a/flox/aggregations.py b/flox/aggregations.py index 13b23fafe..e5013032a 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -185,7 +185,7 @@ def __init__( # how to aggregate results after first round of reduction self.combine: FuncTuple = _atleast_1d(combine) # simpler reductions used with the "simple combine" algorithm - self.simple_combine = None + self.simple_combine: tuple[Callable, ...] 
= () # final aggregation self.aggregate: Callable | str = aggregate if aggregate else self.combine[0] # finalize results (see mean) @@ -207,7 +207,7 @@ def __init__( # The following are set by _initialize_aggregation self.finalize_kwargs: dict[Any, Any] = {} - self.min_count: int | None = None + self.min_count: int = 0 def _normalize_dtype_fill_value(self, value, name): value = _atleast_1d(value) @@ -504,7 +504,7 @@ def _initialize_aggregation( dtype, array_dtype, fill_value, - min_count: int | None, + min_count: int, finalize_kwargs: dict[Any, Any] | None, ) -> Aggregation: if not isinstance(func, Aggregation): @@ -559,9 +559,6 @@ def _initialize_aggregation( assert isinstance(finalize_kwargs, dict) agg.finalize_kwargs = finalize_kwargs - if min_count is None: - min_count = 0 - # This is needed for the dask pathway. # Because we use intermediate fill_value since a group could be # absent in one block, but present in another block @@ -579,7 +576,7 @@ def _initialize_aggregation( else: agg.min_count = 0 - simple_combine = [] + simple_combine: list[Callable] = [] for combine in agg.combine: if isinstance(combine, str): if combine in ["nanfirst", "nanlast"]: diff --git a/flox/core.py b/flox/core.py index 02f53b837..f821df9bc 100644 --- a/flox/core.py +++ b/flox/core.py @@ -51,11 +51,15 @@ T_DuckArray = Union[np.ndarray, DaskArray] # Any ? T_By = T_DuckArray T_Bys = tuple[T_By, ...] - T_ExpectIndex = Union[pd.Index, None] - T_Expect = Union[Sequence, np.ndarray, T_ExpectIndex] + T_ExpectIndex = Union[pd.Index] T_ExpectIndexTuple = tuple[T_ExpectIndex, ...] + T_ExpectIndexOpt = Union[T_ExpectIndex, None] + T_ExpectIndexOptTuple = tuple[T_ExpectIndexOpt, ...] + T_Expect = Union[Sequence, np.ndarray, T_ExpectIndex] T_ExpectTuple = tuple[T_Expect, ...] - T_ExpectedGroups = Union[T_Expect, T_ExpectTuple] + T_ExpectOpt = Union[Sequence, np.ndarray, T_ExpectIndexOpt] + T_ExpectOptTuple = tuple[T_ExpectOpt, ...] + T_ExpectedGroups = Union[T_Expect, T_ExpectOptTuple] T_ExpectedGroupsOpt = Union[T_ExpectedGroups, None] T_Func = Union[str, Callable] T_Funcs = Union[T_Func, Sequence[T_Func]] @@ -98,7 +102,7 @@ def _is_first_last_reduction(func: T_Agg) -> bool: return isinstance(func, str) and func in ["nanfirst", "nanlast", "first", "last"] -def _get_expected_groups(by: T_By, sort: bool) -> pd.Index: +def _get_expected_groups(by: T_By, sort: bool) -> T_ExpectIndex: if is_duck_dask_array(by): raise ValueError("Please provide expected_groups if not grouping by a numpy array.") flatby = by.reshape(-1) @@ -219,8 +223,13 @@ def find_group_cohorts(labels, chunks, merge: bool = True) -> dict: raveled = labels.reshape(-1) # these are chunks where a label is present label_chunks = pd.Series(which_chunk).groupby(raveled).unique() + # These invert the label_chunks mapping so we know which labels occur together. - chunks_cohorts = tlz.groupby(lambda x: tuple(label_chunks.get(x)), label_chunks.keys()) + def invert(x) -> tuple[np.ndarray, ...]: + arr = label_chunks.get(x) + return tuple(arr) # type: ignore [arg-type] # pandas issue? + + chunks_cohorts = tlz.groupby(invert, label_chunks.keys()) if merge: # First sort by number of chunks occupied by cohort @@ -459,7 +468,7 @@ def factorize_( axes: T_Axes, *, fastpath: Literal[True], - expected_groups: tuple[pd.Index, ...] 
| None = None, + expected_groups: T_ExpectIndexOptTuple | None = None, reindex: bool = False, sort: bool = True, ) -> tuple[np.ndarray, tuple[np.ndarray, ...], tuple[int, ...], int, int, None]: @@ -471,7 +480,7 @@ def factorize_( by: T_Bys, axes: T_Axes, *, - expected_groups: tuple[pd.Index, ...] | None = None, + expected_groups: T_ExpectIndexOptTuple | None = None, reindex: bool = False, sort: bool = True, fastpath: Literal[False] = False, @@ -484,7 +493,7 @@ def factorize_( by: T_Bys, axes: T_Axes, *, - expected_groups: tuple[pd.Index, ...] | None = None, + expected_groups: T_ExpectIndexOptTuple | None = None, reindex: bool = False, sort: bool = True, fastpath: bool = False, @@ -496,7 +505,7 @@ def factorize_( by: T_Bys, axes: T_Axes, *, - expected_groups: tuple[pd.Index, ...] | None = None, + expected_groups: T_ExpectIndexOptTuple | None = None, reindex: bool = False, sort: bool = True, fastpath: bool = False, @@ -546,7 +555,7 @@ def factorize_( else: idx = np.zeros_like(flat, dtype=np.intp) - 1 - found_groups.append(expect) + found_groups.append(np.array(expect)) else: if expect is not None and reindex: sorter = np.argsort(expect) @@ -560,7 +569,7 @@ def factorize_( idx = sorter[(idx,)] idx[mask] = -1 else: - idx, groups = pd.factorize(flat, sort=sort) + idx, groups = pd.factorize(flat, sort=sort) # type: ignore # pandas issue? found_groups.append(np.array(groups)) factorized.append(idx.reshape(groupvar.shape)) @@ -853,7 +862,8 @@ def _finalize_results( """ squeezed = _squeeze_results(results, axis) - if agg.min_count > 0: + min_count = agg.min_count + if min_count > 0: counts = squeezed["intermediates"][-1] squeezed["intermediates"] = squeezed["intermediates"][:-1] @@ -864,8 +874,8 @@ def _finalize_results( else: finalized[agg.name] = agg.finalize(*squeezed["intermediates"], **agg.finalize_kwargs) - if agg.min_count > 0: - count_mask = counts < agg.min_count + if min_count > 0: + count_mask = counts < min_count if count_mask.any(): # For one count_mask.any() prevents promoting bool to dtype(fill_value) unless # necessary @@ -1283,7 +1293,7 @@ def dask_groupby_agg( array: DaskArray, by: T_By, agg: Aggregation, - expected_groups: pd.Index | None, + expected_groups: T_ExpectIndexOpt, axis: T_Axes = (), fill_value: Any = None, method: T_Method = "map-reduce", @@ -1423,9 +1433,11 @@ def dask_groupby_agg( group_chunks = ((np.nan,),) else: if expected_groups is None: - expected_groups = _get_expected_groups(by_input, sort=sort) - groups = (expected_groups.to_numpy(),) - group_chunks = ((len(expected_groups),),) + expected_groups_ = _get_expected_groups(by_input, sort=sort) + else: + expected_groups_ = expected_groups + groups = (expected_groups_.to_numpy(),) + group_chunks = ((len(expected_groups_),),) elif method == "cohorts": chunks_cohorts = find_group_cohorts( @@ -1569,7 +1581,7 @@ def _validate_reindex( return reindex -def _assert_by_is_aligned(shape: tuple[int, ...], by: T_Bys): +def _assert_by_is_aligned(shape: tuple[int, ...], by: T_Bys) -> None: assert all(b.ndim == by[0].ndim for b in by[1:]) for idx, b in enumerate(by): if not all(j in [i, 1] for i, j in zip(shape[-b.ndim :], b.shape)): @@ -1584,18 +1596,33 @@ def _assert_by_is_aligned(shape: tuple[int, ...], by: T_Bys): ) +@overload +def _convert_expected_groups_to_index( + expected_groups: tuple[None, ...], isbin: Sequence[bool], sort: bool +) -> tuple[None, ...]: + ... 
+ + +@overload def _convert_expected_groups_to_index( expected_groups: T_ExpectTuple, isbin: Sequence[bool], sort: bool ) -> T_ExpectIndexTuple: - out: list[pd.Index | None] = [] + ... + + +def _convert_expected_groups_to_index( + expected_groups: T_ExpectOptTuple, isbin: Sequence[bool], sort: bool +) -> T_ExpectIndexOptTuple: + out: list[T_ExpectIndexOpt] = [] for ex, isbin_ in zip(expected_groups, isbin): if isinstance(ex, pd.IntervalIndex) or (isinstance(ex, pd.Index) and not isbin_): if sort: - ex = ex.sort_values() - out.append(ex) + out.append(ex.sort_values()) + else: + out.append(ex) elif ex is not None: if isbin_: - out.append(pd.IntervalIndex.from_breaks(ex)) + out.append(pd.IntervalIndex.from_breaks(ex)) # type: ignore [arg-type] # TODO: what do we want here? else: if sort: ex = np.sort(ex) @@ -1613,7 +1640,7 @@ def _lazy_factorize_wrapper(*by: T_By, **kwargs) -> np.ndarray: def _factorize_multiple( by: T_Bys, - expected_groups: T_ExpectIndexTuple, + expected_groups: T_ExpectIndexOptTuple, any_by_dask: bool, reindex: bool, sort: bool = True, @@ -1668,7 +1695,17 @@ def _factorize_multiple( return (group_idx,), found_groups, grp_shape -def _validate_expected_groups(nby: int, expected_groups: T_ExpectedGroupsOpt) -> T_ExpectTuple: +@overload +def _validate_expected_groups(nby: int, expected_groups: None) -> tuple[None, ...]: + ... + + +@overload +def _validate_expected_groups(nby: int, expected_groups: T_ExpectedGroups) -> T_ExpectTuple: + ... + + +def _validate_expected_groups(nby: int, expected_groups: T_ExpectedGroupsOpt) -> T_ExpectOptTuple: if expected_groups is None: return (None,) * nby @@ -1935,21 +1972,20 @@ def groupby_reduce( # Consider np.sum([np.nan]) = np.nan, np.nansum([np.nan]) = 0 if min_count is None: if nax < by_.ndim or fill_value is not None: - min_count = 1 + min_count_: int = 1 + else: + min_count_ = 0 + else: + min_count_ = min_count # TODO: set in xarray? - if ( - min_count is not None - and min_count > 0 - and func in ["nansum", "nanprod"] - and fill_value is None - ): + if min_count_ > 0 and func in ["nansum", "nanprod"] and fill_value is None: # nansum, nanprod have fill_value=0, 1 # overwrite than when min_count is set fill_value = np.nan kwargs = dict(axis=axis_, fill_value=fill_value, engine=engine) - agg = _initialize_aggregation(func, dtype, array.dtype, fill_value, min_count, finalize_kwargs) + agg = _initialize_aggregation(func, dtype, array.dtype, fill_value, min_count_, finalize_kwargs) groups: tuple[np.ndarray | DaskArray, ...] 
if not has_dask: diff --git a/flox/xarray.py b/flox/xarray.py index df8f773df..acd3f2d6c 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -22,6 +22,8 @@ if TYPE_CHECKING: from xarray.core.types import T_DataArray, T_Dataset + from .core import T_ExpectedGroupsOpt, T_ExpectIndex, T_ExpectOpt + Dims = Union[str, Iterable[Hashable], None] @@ -63,7 +65,7 @@ def xarray_reduce( obj: T_Dataset | T_DataArray, *by: T_DataArray | Hashable, func: str | Aggregation, - expected_groups=None, + expected_groups: T_ExpectedGroupsOpt = None, isbin: bool | Sequence[bool] = False, sort: bool = True, dim: Dims | ellipsis = None, @@ -215,7 +217,7 @@ def xarray_reduce( else: isbins = (isbin,) * nby - expected_groups = _validate_expected_groups(nby, expected_groups) + expected_groups_valid = _validate_expected_groups(nby, expected_groups) if not sort: raise NotImplementedError("sort must be True for xarray_reduce") @@ -313,10 +315,10 @@ def xarray_reduce( # Set expected_groups and convert to index since we need coords, sizes # for output xarray objects - expected_groups = list(expected_groups) + expected_groups_valid_list: list[T_ExpectIndex] = [] group_names: tuple[Any, ...] = () group_sizes: dict[Any, int] = {} - for idx, (b_, expect, isbin_) in enumerate(zip(by_da, expected_groups, isbins)): + for idx, (b_, expect, isbin_) in enumerate(zip(by_da, expected_groups_valid, isbins)): group_name = ( f"{b_.name}_bins" if isbin_ or isinstance(expect, pd.IntervalIndex) else b_.name ) @@ -326,21 +328,23 @@ def xarray_reduce( raise NotImplementedError( "flox does not support binning into an integer number of bins yet." ) + + expect1: T_ExpectOpt if expect is None: if isbin_: raise ValueError( f"Please provided bin edges for group variable {idx} " f"named {group_name} in expected_groups." ) - expect_ = _get_expected_groups(b_.data, sort=sort) + expect1 = _get_expected_groups(b_.data, sort=sort) else: - expect_ = expect - expect_index = _convert_expected_groups_to_index((expect_,), (isbin_,), sort=sort)[0] + expect1 = expect + expect_index = _convert_expected_groups_to_index((expect1,), (isbin_,), sort=sort)[0] # The if-check is for type hinting mainly, it narrows down the return # type of _convert_expected_groups_to_index to pure pd.Index: if expect_index is not None: - expected_groups[idx] = expect_index + expected_groups_valid_list.append(expect_index) group_sizes[group_name] = len(expect_index) else: # This will never be reached @@ -426,7 +430,7 @@ def wrapper(array, *by, func, skipna, core_dims, **kwargs): "skipna": skipna, "engine": engine, "reindex": reindex, - "expected_groups": tuple(expected_groups), + "expected_groups": tuple(expected_groups_valid_list), "isbin": isbins, "finalize_kwargs": finalize_kwargs, "dtype": dtype, @@ -440,10 +444,15 @@ def wrapper(array, *by, func, skipna, core_dims, **kwargs): if all(d not in ds_broad[var].dims for d in dim_tuple): actual[var] = ds_broad[var] - for name, expect, by_ in zip(group_names, expected_groups, by_da): - # Can't remove this till xarray handles IntervalIndex - if isinstance(expect, pd.IntervalIndex): - expect = expect.to_numpy() + expect3: T_ExpectIndex | np.ndarray + for name, expect2, by_ in zip(group_names, expected_groups_valid_list, by_da): + # Can't remove this until xarray handles IntervalIndex: + if isinstance(expect2, pd.IntervalIndex): + # TODO: Only place where expect3 is an ndarray, remove the type if xarray + # starts supporting IntervalIndex. 
+ expect3 = expect2.to_numpy() + else: + expect3 = expect2 if isinstance(actual, xr.Dataset) and name in actual: actual = actual.drop_vars(name) # When grouping by MultiIndex, expect is an pd.Index wrapping @@ -451,15 +460,18 @@ def wrapper(array, *by, func, skipna, core_dims, **kwargs): if ( name in ds_broad.indexes and isinstance(ds_broad.indexes[name], pd.MultiIndex) - and not isinstance(expect, pd.RangeIndex) + and not isinstance(expect3, pd.RangeIndex) ): levelnames = ds_broad.indexes[name].names - expect = pd.MultiIndex.from_tuples(expect.values, names=levelnames) - actual[name] = expect + if isinstance(expect3, np.ndarray): + # TODO: workaoround for IntervalIndex issue. + raise NotImplementedError + expect3 = pd.MultiIndex.from_tuples(expect3.values, names=levelnames) + actual[name] = expect3 if Version(xr.__version__) > Version("2022.03.0"): actual = actual.set_coords(levelnames) else: - actual[name] = expect + actual[name] = expect3 if keep_attrs: actual[name].attrs = by_.attrs From 53c108cf43e9a01740bd8ae54b1db9c4284437b7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 3 Jul 2023 19:52:48 -0600 Subject: [PATCH 05/24] [pre-commit.ci] pre-commit autoupdate (#250) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - https://github.com/charliermarsh/ruff-pre-commit → https://github.com/astral-sh/ruff-pre-commit - [github.com/astral-sh/ruff-pre-commit: v0.0.260 → v0.0.276](https://github.com/astral-sh/ruff-pre-commit/compare/v0.0.260...v0.0.276) - [github.com/codespell-project/codespell: v2.2.4 → v2.2.5](https://github.com/codespell-project/codespell/compare/v2.2.4...v2.2.5) - [github.com/abravalheri/validate-pyproject: v0.12.2 → v0.13](https://github.com/abravalheri/validate-pyproject/compare/v0.12.2...v0.13) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ed74396b9..e9b1e9d6b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,9 +2,9 @@ ci: autoupdate_schedule: quarterly repos: - - repo: https://github.com/charliermarsh/ruff-pre-commit + - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. 
- rev: 'v0.0.260' + rev: 'v0.0.276' hooks: - id: ruff args: ["--fix"] @@ -44,13 +44,13 @@ repos: args: [--extra-keys=metadata.kernelspec metadata.language_info.version] - repo: https://github.com/codespell-project/codespell - rev: v2.2.4 + rev: v2.2.5 hooks: - id: codespell additional_dependencies: - tomli - repo: https://github.com/abravalheri/validate-pyproject - rev: v0.12.2 + rev: v0.13 hooks: - id: validate-pyproject From 40b300c9418b931da3cceaa9c799f55eab7d59a3 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Sat, 8 Jul 2023 14:28:59 +0200 Subject: [PATCH 06/24] Fix some typing errors in asv_bench and tests (#253) * Let mypy check asv_bench * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update pyproject.toml * Update test_core.py * Update test_core.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update test_core.py * Update cohorts.py * fix asv test * split-reduce is old name * Update test_core.py * the exclude doesn't seem to work anyway --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- asv_bench/benchmarks/cohorts.py | 6 +++--- asv_bench/benchmarks/combine.py | 2 +- pyproject.toml | 6 +++--- tests/test_core.py | 14 ++++++++------ 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/asv_bench/benchmarks/cohorts.py b/asv_bench/benchmarks/cohorts.py index ecef95d22..dbfbe8cd5 100644 --- a/asv_bench/benchmarks/cohorts.py +++ b/asv_bench/benchmarks/cohorts.py @@ -42,9 +42,9 @@ def track_num_layers(self): )[0] return len(result.dask.layers) - track_num_tasks.unit = "tasks" - track_num_tasks_optimized.unit = "tasks" - track_num_layers.unit = "layers" + track_num_tasks.unit = "tasks" # type: ignore[attr-defined] # Lazy + track_num_tasks_optimized.unit = "tasks" # type: ignore[attr-defined] # Lazy + track_num_layers.unit = "layers" # type: ignore[attr-defined] # Lazy class NWMMidwest(Cohorts): diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py index dd3d7a178..27600685f 100644 --- a/asv_bench/benchmarks/combine.py +++ b/asv_bench/benchmarks/combine.py @@ -70,7 +70,7 @@ def construct_member(groups) -> dict[str, Any]: ] self.kwargs = { "agg": flox.aggregations._initialize_aggregation( - "sum", "float64", np.float64, 0, None, {} + "sum", "float64", np.float64, 0, 0, {} ), "axis": (3,), } diff --git a/pyproject.toml b/pyproject.toml index 360be8a27..fb27ee761 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,8 +104,7 @@ known-third-party = [ [tool.mypy] allow_redefinition = true -exclude = "properties|asv_bench|doc|tests|flycheck" -files = "flox/*.py" +files = "**/*.py" show_error_codes = true warn_unused_ignores = true @@ -115,7 +114,8 @@ module=[ "cftime", "dask.*", "importlib_metadata", - "numpy_groupies", + "numba", + "numpy_groupies.*", "matplotlib.*", "pandas", "setuptools", diff --git a/tests/test_core.py b/tests/test_core.py index 5c4db9248..83b823b07 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -78,7 +78,7 @@ def dask_array_ones(*args): ) if TYPE_CHECKING: - from flox.core import T_Engine, T_ExpectedGroupsOpt, T_Func2 + from flox.core import T_Agg, T_Engine, T_ExpectedGroupsOpt, T_Method def _get_array_func(func: str) -> Callable: @@ -135,7 +135,7 @@ def test_alignment_error(): ) def test_groupby_reduce( engine: T_Engine, - func: T_Func2, + func: T_Agg, array: np.ndarray, by: np.ndarray, expected: 
list[float], @@ -992,14 +992,14 @@ def test_map_reduce_blockwise_mixed() -> None: dask.array.from_array(data.values, chunks=365), t.dt.month, func="mean", - method="split-reduce", + method="map-reduce", ) expected, _ = groupby_reduce(data, t.dt.month, func="mean") assert_equal(expected, actual) @requires_dask -@pytest.mark.parametrize("method", ["split-reduce", "blockwise", "map-reduce", "cohorts"]) +@pytest.mark.parametrize("method", ["blockwise", "map-reduce", "cohorts"]) def test_group_by_datetime(engine, method): kwargs = dict( func="mean", @@ -1356,13 +1356,15 @@ def test_validate_reindex_map_reduce( def test_validate_reindex() -> None: - for method in ["map-reduce", "cohorts"]: + methods: list[T_Method] = ["map-reduce", "cohorts"] + for method in methods: with pytest.raises(NotImplementedError): _validate_reindex( True, "argmax", method, expected_groups=None, any_by_dask=False, is_dask_array=True ) - for method in ["blockwise", "cohorts"]: + methods: list[T_Method] = ["blockwise", "cohorts"] + for method in methods: with pytest.raises(ValueError): _validate_reindex( True, "sum", method, expected_groups=None, any_by_dask=False, is_dask_array=True From 3fb613ce84bdad3d04dd385f534f579dd19a0a88 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 31 Aug 2023 04:55:46 +0200 Subject: [PATCH 07/24] replace the deprecated `provision-with-micromamba` with `setup-micromamba` (#258) * migrate the ci to `setup-micromamba` * migrate additional ci to `setup-micromamba` * migrate the benchmarks to `setup-micromamba` * migrate the upstream-dev ci to `setup-micromamba` * fix the new-ish `mypy` error * remove the apparently harmful quotes * change the yaml block used to specify multiple lines of arguments * Update flox/core.py --------- Co-authored-by: Deepak Cherian --- .github/workflows/benchmarks.yml | 7 ++++--- .github/workflows/ci-additional.yaml | 18 +++++++++------- .github/workflows/ci.yaml | 29 ++++++++++++++------------ .github/workflows/upstream-dev-ci.yaml | 8 ++++--- flox/core.py | 2 +- 5 files changed, 36 insertions(+), 28 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index c18eb162f..9521b7b68 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -22,12 +22,13 @@ jobs: fetch-depth: 0 - name: Set up conda environment - uses: mamba-org/provision-with-micromamba@v15 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ci/environment.yml environment-name: flox-tests - cache-env: true - # extra-specs: | + init-shell: bash + cache-environment: true + # create-args: | # python="${{ matrix.python-version }}" # - name: Setup some dependencies diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 38fba5d88..9449d9290 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -53,14 +53,15 @@ jobs: echo "TODAY=$(date +'%Y-%m-%d')" >> $GITHUB_ENV - name: Setup micromamba - uses: mamba-org/provision-with-micromamba@v15 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ${{env.CONDA_ENV_FILE}} environment-name: flox-tests - extra-specs: | - python=${{env.PYTHON_VERSION}} - cache-env: true + init-shell: bash + cache-environment: true cache-env-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}" + create-args: | + python=${{ env.PYTHON_VERSION }} - name: Install flox run: | @@ -102,14 +103,15 @@ jobs: run: | echo "TODAY=$(date +'%Y-%m-%d')" >> $GITHUB_ENV - 
name: Setup micromamba - uses: mamba-org/provision-with-micromamba@v15 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ${{env.CONDA_ENV_FILE}} environment-name: flox-tests - extra-specs: | - python=${{env.PYTHON_VERSION}} - cache-env: true + init-shell: bash + cache-environment: true cache-env-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}" + create-args: | + python=${{ env.PYTHON_VERSION }} - name: Install flox run: | python -m pip install --no-deps -e . diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2d65e9d1d..75aafc4b6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -34,13 +34,14 @@ jobs: run: | echo "PYTHON_VERSION=${{ matrix.python-version }}" >> $GITHUB_ENV - name: Set up conda environment - uses: mamba-org/provision-with-micromamba@v15 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ci/environment.yml environment-name: flox-tests - cache-env: true - extra-specs: | - python="${{ matrix.python-version }}" + init-shell: bash + cache-environment: true + create-args: | + python=${{ matrix.python-version }} - name: Install flox run: | python -m pip install --no-deps -e . @@ -77,13 +78,14 @@ jobs: with: fetch-depth: 0 # Fetch all history for all branches and tags. - name: Set up conda environment - uses: mamba-org/provision-with-micromamba@v15 + uses: mamba-org/setup-micromamba@v1 with: - environment-file: ci/${{ matrix.env }}.yml + environment-file: ci/environment.yml environment-name: flox-tests - cache-env: true - extra-specs: | - python="${{ matrix.python-version }}" + init-shell: bash + cache-environment: true + create-args: | + python=${{ matrix.python-version }} - name: Install flox run: | python -m pip install --no-deps -e . @@ -111,13 +113,14 @@ jobs: repository: 'pydata/xarray' fetch-depth: 0 # Fetch all history for all branches and tags. - name: Set up conda environment - uses: mamba-org/provision-with-micromamba@v15 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ci/requirements/environment.yml environment-name: xarray-tests - cache-env: true - extra-specs: | - python="3.10" + init-shell: bash + cache-environment: true + create-args: | + python=3.10 - name: Install xarray run: | python -m pip install --no-deps . diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 7bd271928..62645707e 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -35,12 +35,14 @@ jobs: run: | echo "PYTHON_VERSION=${{ matrix.python-version }}" >> $GITHUB_ENV - name: Set up conda environment - uses: mamba-org/provision-with-micromamba@v15 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ci/upstream-dev-env.yml environment-name: flox-tests - extra-specs: | - python="${{ matrix.python-version }}" + init-shell: bash + cache-environment: true + create-args: >- + python=${{ matrix.python-version }} pytest-reportlog - name: Install flox run: | diff --git a/flox/core.py b/flox/core.py index f821df9bc..f8f700f99 100644 --- a/flox/core.py +++ b/flox/core.py @@ -1622,7 +1622,7 @@ def _convert_expected_groups_to_index( out.append(ex) elif ex is not None: if isbin_: - out.append(pd.IntervalIndex.from_breaks(ex)) # type: ignore [arg-type] # TODO: what do we want here? 
+ out.append(pd.IntervalIndex.from_breaks(ex)) else: if sort: ex = np.sort(ex) From bf4901794a3387d9e2cb83eba025e82154a49ae8 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Thu, 31 Aug 2023 19:16:18 +0200 Subject: [PATCH 08/24] compatibility with `numpy>=2.0` (#257) * attempt to fix the upstream-dev CI * replace `np.unicode_` with `np.str_` * remove `PROMOTE_TO_OBJECT` entirely --- .github/workflows/upstream-dev-ci.yaml | 2 +- flox/xrdtypes.py | 11 ----------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 62645707e..0de92037f 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -19,7 +19,7 @@ jobs: upstream-dev: name: upstream-dev runs-on: ubuntu-latest - if: ${{ contains( github.event.pull_request.labels.*.name, 'test-upstream') && github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch' }} + if: ${{ (contains(github.event.pull_request.labels.*.name, 'test-upstream') && github.event_name == 'pull_request') || github.event_name == 'workflow_dispatch' }} defaults: run: shell: bash -l {0} diff --git a/flox/xrdtypes.py b/flox/xrdtypes.py index bf372f571..99dd08eb7 100644 --- a/flox/xrdtypes.py +++ b/flox/xrdtypes.py @@ -31,17 +31,6 @@ def __eq__(self, other): NINF = AlwaysLessThan() -# Pairs of types that, if both found, should be promoted to object dtype -# instead of following NumPy's own type-promotion rules. These type promotion -# rules match pandas instead. For reference, see the NumPy type hierarchy: -# https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.scalars.html -PROMOTE_TO_OBJECT = [ - {np.number, np.character}, # numpy promotes to character - {np.bool_, np.character}, # numpy promotes to character - {np.bytes_, np.unicode_}, # numpy promotes to unicode -] - - def maybe_promote(dtype): """Simpler equivalent of pandas.core.common._maybe_promote From 5ea713efb5019ff9627893060a6c707a1d14170a Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Thu, 7 Sep 2023 11:04:10 +0200 Subject: [PATCH 09/24] convert datetime: micro-optimizations (#261) --- flox/xarray.py | 4 ++-- flox/xrutils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/flox/xarray.py b/flox/xarray.py index acd3f2d6c..487850ca0 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -374,8 +374,8 @@ def wrapper(array, *by, func, skipna, core_dims, **kwargs): # xarray always uses np.datetime64[ns] for np.datetime64 data dtype = "timedelta64[ns]" array = datetime_to_numeric(array, offset) - elif _contains_cftime_datetimes(array): - offset = min(array) + elif is_cftime: + offset = array.min() array = datetime_to_numeric(array, offset, datetime_unit="us") result, *groups = groupby_reduce(array, *by, func=func, **kwargs) diff --git a/flox/xrutils.py b/flox/xrutils.py index 45cf45eec..958bd3976 100644 --- a/flox/xrutils.py +++ b/flox/xrutils.py @@ -157,7 +157,7 @@ def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float): if array.dtype.kind in "Mm": offset = _datetime_nanmin(array) else: - offset = min(array) + offset = array.min() # Compute timedelta object. # For np.datetime64, this can silently yield garbage due to overflow. 
From 11d5083e8149af1f48f6d7d97e74c1d0d762fef8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 1 Oct 2023 19:00:53 -0600 Subject: [PATCH 10/24] Bump actions/checkout from 3 to 4 (#267) Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v3...v4) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/benchmarks.yml | 2 +- .github/workflows/ci-additional.yaml | 6 +++--- .github/workflows/ci.yaml | 6 +++--- .github/workflows/pypi.yaml | 2 +- .github/workflows/testpypi-release.yaml | 2 +- .github/workflows/upstream-dev-ci.yaml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 9521b7b68..5571d27af 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -17,7 +17,7 @@ jobs: steps: # We need the full repo to avoid this issue # https://github.com/actions/checkout/issues/23 - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 9449d9290..790b751ee 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -22,7 +22,7 @@ jobs: outputs: triggered: ${{ steps.detect-trigger.outputs.trigger-found }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 2 - uses: xarray-contrib/ci-trigger@v1.2 @@ -44,7 +44,7 @@ jobs: PYTHON_VERSION: "3.10" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 # Fetch all history for all branches and tags. @@ -95,7 +95,7 @@ jobs: PYTHON_VERSION: "3.10" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 # Fetch all history for all branches and tags. diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 75aafc4b6..43985f508 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -27,7 +27,7 @@ jobs: os: ["ubuntu-latest", "windows-latest"] python-version: ["3.8", "3.10"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 # Fetch all history for all branches and tags. - name: Set environment variables @@ -74,7 +74,7 @@ jobs: "minimal-requirements", ] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 # Fetch all history for all branches and tags. - name: Set up conda environment @@ -108,7 +108,7 @@ jobs: run: shell: bash -l {0} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: repository: 'pydata/xarray' fetch-depth: 0 # Fetch all history for all branches and tags. 
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 7372dd0fa..93891fcd7 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -8,7 +8,7 @@ jobs: deploy: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v4 with: diff --git a/.github/workflows/testpypi-release.yaml b/.github/workflows/testpypi-release.yaml index 1ed0a4a56..858348e6d 100644 --- a/.github/workflows/testpypi-release.yaml +++ b/.github/workflows/testpypi-release.yaml @@ -17,7 +17,7 @@ jobs: if: ${{ contains( github.event.pull_request.labels.*.name, 'test-build') && github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch' }} runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 0de92037f..7d87be16a 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -28,7 +28,7 @@ jobs: matrix: python-version: ["3.10"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 # Fetch all history for all branches and tags. - name: Set environment variables From 528a64549874963b4a5a4f381539307c1193402d Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Mon, 2 Oct 2023 23:27:06 +0200 Subject: [PATCH 11/24] tests: move xfail out of functions (#265) * tests: move xfail out of functions * remove stray print * Update tests/test_core.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add engine in test_dtype --------- Co-authored-by: Deepak Cherian Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- tests/__init__.py | 10 +++---- tests/conftest.py | 11 ++++---- tests/test_core.py | 63 ++++++++++++++++++++++++++------------------ tests/test_xarray.py | 32 +++++++--------------- 4 files changed, 54 insertions(+), 62 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 4c04a0fc8..3e43e94d6 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,5 +1,5 @@ import importlib -from contextlib import contextmanager +from contextlib import nullcontext import numpy as np import packaging.version @@ -46,6 +46,7 @@ def LooseVersion(vstring): has_dask, requires_dask = _importorskip("dask") +has_numba, requires_numba = _importorskip("numba") has_xarray, requires_xarray = _importorskip("xarray") @@ -67,15 +68,10 @@ def __call__(self, dsk, keys, **kwargs): return dask.get(dsk, keys, **kwargs) -@contextmanager -def dummy_context(): - yield None - - def raise_if_dask_computes(max_computes=0): # return a dummy context manager so that this can be used for non-dask objects if not has_dask: - return dummy_context() + return nullcontext() scheduler = CountingScheduler(max_computes) return dask.config.set(scheduler=scheduler) diff --git a/tests/conftest.py b/tests/conftest.py index 5c3bb81f6..eb5971784 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,10 @@ import pytest +from . 
import requires_numba -@pytest.fixture(scope="module", params=["flox", "numpy", "numba"]) + +@pytest.fixture( + scope="module", params=["flox", "numpy", pytest.param("numba", marks=requires_numba)] +) def engine(request): - if request.param == "numba": - try: - import numba # noqa - except ImportError: - pytest.xfail() return request.param diff --git a/tests/test_core.py b/tests/test_core.py index 83b823b07..453958b2d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -105,7 +105,7 @@ def test_alignment_error(): @pytest.mark.parametrize("dtype", (float, int)) -@pytest.mark.parametrize("chunk", [False, True]) +@pytest.mark.parametrize("chunk", [False, pytest.param(True, marks=requires_dask)]) # TODO: make this intp when python 3.8 is dropped @pytest.mark.parametrize("expected_groups", [None, [0, 1, 2], np.array([0, 1, 2], dtype=np.int64)]) @pytest.mark.parametrize( @@ -145,7 +145,7 @@ def test_groupby_reduce( ) -> None: array = array.astype(dtype) if chunk: - if not has_dask or expected_groups is None: + if expected_groups is None: pytest.skip() array = da.from_array(array, chunks=(3,) if array.ndim == 1 else (1, 3)) by = da.from_array(by, chunks=(3,) if by.ndim == 1 else (1, 3)) @@ -166,7 +166,7 @@ def test_groupby_reduce( engine=engine, ) # we use pd.Index(expected_groups).to_numpy() which is always int64 - # for the values in this tests + # for the values in this test if expected_groups is None: g_dtype = by.dtype elif isinstance(expected_groups, np.ndarray): @@ -191,14 +191,20 @@ def gen_array_by(size, func): return array, by -@pytest.mark.parametrize("chunks", [None, -1, 3, 4]) +@pytest.mark.parametrize( + "chunks", + [ + None, + pytest.param(-1, marks=requires_dask), + pytest.param(3, marks=requires_dask), + pytest.param(4, marks=requires_dask), + ], +) @pytest.mark.parametrize("nby", [1, 2, 3]) @pytest.mark.parametrize("size", ((12,), (12, 9))) @pytest.mark.parametrize("add_nan_by", [True, False]) @pytest.mark.parametrize("func", ALL_FUNCS) def test_groupby_reduce_all(nby, size, chunks, func, add_nan_by, engine): - if chunks is not None and not has_dask: - pytest.skip() if "arg" in func and engine == "flox": pytest.skip() @@ -390,16 +396,16 @@ def test_numpy_reduce_nd_md(): def test_groupby_agg_dask(func, shape, array_chunks, group_chunks, add_nan, dtype, engine, reindex): """Tests groupby_reduce with dask arrays against groupby_reduce with numpy arrays""" - rng = np.random.default_rng(12345) - array = dask.array.from_array(rng.random(shape), chunks=array_chunks).astype(dtype) - array = dask.array.ones(shape, chunks=array_chunks) - if func in ["first", "last"]: pytest.skip() if "arg" in func and (engine == "flox" or reindex): pytest.skip() + rng = np.random.default_rng(12345) + array = dask.array.from_array(rng.random(shape), chunks=array_chunks).astype(dtype) + array = dask.array.ones(shape, chunks=array_chunks) + labels = np.array([0, 0, 2, 2, 2, 1, 1, 2, 2, 1, 1, 0]) if add_nan: labels = labels.astype(float) @@ -612,7 +618,14 @@ def test_groupby_reduce_axis_subset_against_numpy(func, axis, engine): assert_equal(actual, expected, tolerance) -@pytest.mark.parametrize("reindex,chunks", [(None, None), (False, (2, 2, 3)), (True, (2, 2, 3))]) +@pytest.mark.parametrize( + "reindex, chunks", + [ + (None, None), + pytest.param(False, (2, 2, 3), marks=requires_dask), + pytest.param(True, (2, 2, 3), marks=requires_dask), + ], +) @pytest.mark.parametrize( "axis, groups, expected_shape", [ @@ -624,8 +637,6 @@ def test_groupby_reduce_axis_subset_against_numpy(func, axis, engine): def 
test_groupby_reduce_nans(reindex, chunks, axis, groups, expected_shape, engine): def _maybe_chunk(arr): if chunks: - if not has_dask: - pytest.skip() return da.from_array(arr, chunks=chunks) else: return arr @@ -739,7 +750,14 @@ def test_npg_nanarg_bug(func): ) @pytest.mark.parametrize("method", ["cohorts", "map-reduce"]) @pytest.mark.parametrize("chunk_labels", [False, True]) -@pytest.mark.parametrize("chunks", ((), (1,), (2,))) +@pytest.mark.parametrize( + "chunks", + ( + (), + pytest.param((1,), marks=requires_dask), + pytest.param((2,), marks=requires_dask), + ), +) def test_groupby_bins(chunk_labels, kwargs, chunks, engine, method) -> None: array = [1, 1, 1, 1, 1, 1] labels = [0.2, 1.5, 1.9, 2, 3, 20] @@ -748,8 +766,6 @@ def test_groupby_bins(chunk_labels, kwargs, chunks, engine, method) -> None: pytest.xfail() if chunks: - if not has_dask: - pytest.skip() array = dask.array.from_array(array, chunks=chunks) if chunk_labels: labels = dask.array.from_array(labels, chunks=chunks) @@ -825,7 +841,7 @@ def test_rechunk_for_cohorts(chunk_at, expected): assert rechunked.chunks == expected -@pytest.mark.parametrize("chunks", [None, 3]) +@pytest.mark.parametrize("chunks", [None, pytest.param(3, marks=requires_dask)]) @pytest.mark.parametrize("fill_value", [123, np.nan]) @pytest.mark.parametrize("func", ALL_FUNCS) def test_fill_value_behaviour(func, chunks, fill_value, engine): @@ -833,8 +849,6 @@ def test_fill_value_behaviour(func, chunks, fill_value, engine): # This is used by xarray if func in ["all", "any"] or "arg" in func: pytest.skip() - if chunks is not None and not has_dask: - pytest.skip() npfunc = _get_array_func(func) by = np.array([1, 2, 3, 1, 2, 3]) @@ -1050,11 +1064,8 @@ def test_factorize_values_outside_bins(): assert_equal(expected, actual) -@pytest.mark.parametrize("chunk", [True, False]) +@pytest.mark.parametrize("chunk", [pytest.param(True, marks=requires_dask), False]) def test_multiple_groupers_bins(chunk) -> None: - if chunk and not has_dask: - pytest.skip() - xp = dask.array if chunk else np array_kwargs = {"chunks": 2} if chunk else {} array = xp.ones((5, 2), **array_kwargs, dtype=np.int64) @@ -1087,9 +1098,9 @@ def test_multiple_groupers_bins(chunk) -> None: np.arange(2, 4).reshape(1, 2), ], ) -@pytest.mark.parametrize("chunk", [True, False]) +@pytest.mark.parametrize("chunk", [pytest.param(True, marks=requires_dask), False]) def test_multiple_groupers(chunk, by1, by2, expected_groups) -> None: - if chunk and (not has_dask or expected_groups is None): + if chunk and expected_groups is None: pytest.skip() xp = dask.array if chunk else np @@ -1233,7 +1244,7 @@ def test_dtype(func, dtype, engine): pytest.skip() arr = np.ones((4, 12), dtype=dtype) labels = np.array(["a", "a", "c", "c", "c", "b", "b", "c", "c", "b", "b", "f"]) - actual, _ = groupby_reduce(arr, labels, func=func, dtype=np.float64) + actual, _ = groupby_reduce(arr, labels, func=func, dtype=np.float64, engine=engine) assert actual.dtype == np.dtype("float64") diff --git a/tests/test_xarray.py b/tests/test_xarray.py index 7a343d962..8f006e5f3 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -16,7 +16,7 @@ dask.config.set(scheduler="sync") try: - # Should test against legacy xarray implementation + # test against legacy xarray implementation xr.set_options(use_flox=False) except ValueError: pass @@ -31,15 +31,15 @@ @pytest.mark.parametrize("add_nan", [True, False]) @pytest.mark.parametrize("skipna", [True, False]) def test_xarray_reduce(skipna, add_nan, min_count, engine, reindex): + if skipna is 
False and min_count is not None: + pytest.skip() + arr = np.ones((4, 12)) if add_nan: arr[1, ...] = np.nan arr[[0, 2], [3, 4]] = np.nan - if skipna is False and min_count is not None: - pytest.skip() - labels = np.array(["a", "a", "c", "c", "c", "b", "b", "c", "c", "b", "b", "f"]) labels = np.array(labels) labels2 = np.array([1, 2, 2, 1]) @@ -77,11 +77,8 @@ def test_xarray_reduce(skipna, add_nan, min_count, engine, reindex): # TODO: sort @pytest.mark.parametrize("pass_expected_groups", [True, False]) -@pytest.mark.parametrize("chunk", (True, False)) +@pytest.mark.parametrize("chunk", (pytest.param(True, marks=requires_dask), False)) def test_xarray_reduce_multiple_groupers(pass_expected_groups, chunk, engine): - if not has_dask and chunk: - pytest.skip() - if chunk and pass_expected_groups is False: pytest.skip() @@ -126,11 +123,8 @@ def test_xarray_reduce_multiple_groupers(pass_expected_groups, chunk, engine): @pytest.mark.parametrize("pass_expected_groups", [True, False]) -@pytest.mark.parametrize("chunk", (True, False)) +@pytest.mark.parametrize("chunk", (pytest.param(True, marks=requires_dask), False)) def test_xarray_reduce_multiple_groupers_2(pass_expected_groups, chunk, engine): - if not has_dask and chunk: - pytest.skip() - if chunk and pass_expected_groups is False: pytest.skip() @@ -317,14 +311,12 @@ def test_multi_index_groupby_sum(engine): assert_equal(expected, actual) -@pytest.mark.parametrize("chunks", (None, 2)) +@pytest.mark.parametrize("chunks", (None, pytest.param(2, marks=requires_dask))) def test_xarray_groupby_bins(chunks, engine): array = xr.DataArray([1, 1, 1, 1, 1], dims="x") labels = xr.DataArray([1, 1.5, 1.9, 2, 3], dims="x", name="labels") if chunks: - if not has_dask: - pytest.skip() array = array.chunk({"x": chunks}) labels = labels.chunk({"x": chunks}) @@ -472,11 +464,8 @@ def test_alignment_error(): @pytest.mark.parametrize("add_nan", [True, False]) @pytest.mark.parametrize("dtype_out", [np.float64, "float64", np.dtype("float64")]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -@pytest.mark.parametrize("chunk", (True, False)) +@pytest.mark.parametrize("chunk", (pytest.param(True, marks=requires_dask), False)) def test_dtype(add_nan, chunk, dtype, dtype_out, engine): - if chunk and not has_dask: - pytest.skip() - xp = dask.array if chunk else np data = xp.linspace(0, 1, 48, dtype=dtype).reshape((4, 12)) @@ -508,12 +497,9 @@ def test_dtype(add_nan, chunk, dtype, dtype_out, engine): xr.testing.assert_allclose(expected, actual.transpose("labels", ...), **tolerance64) -@pytest.mark.parametrize("chunk", [True, False]) +@pytest.mark.parametrize("chunk", [pytest.param(True, marks=requires_dask), False]) @pytest.mark.parametrize("use_flox", [True, False]) def test_dtype_accumulation(use_flox, chunk): - if chunk and not has_dask: - pytest.skip() - datetimes = pd.date_range("2010-01", "2015-01", freq="6H", inclusive="left") samples = 10 + np.cos(2 * np.pi * 0.001 * np.arange(len(datetimes))) * 1 samples += np.random.randn(len(datetimes)) From 486b7c5fb4324ddad48e9a00c0b88638f3e65471 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 2 Oct 2023 17:56:38 -0600 Subject: [PATCH 12/24] Drop python 3.8, test python 3.11 (#209) * Drop py38 * bump to py 3.11 * more bump * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * revert bump for xarray tests --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .github/workflows/ci-additional.yaml | 4 ++-- 
.github/workflows/ci.yaml | 2 +- .github/workflows/testpypi-release.yaml | 4 ++-- .github/workflows/upstream-dev-ci.yaml | 2 +- flox/core.py | 8 +++----- flox/xarray.py | 3 ++- flox/xrutils.py | 3 ++- pyproject.toml | 7 +++---- 8 files changed, 16 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 790b751ee..c71fc8913 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -41,7 +41,7 @@ jobs: env: CONDA_ENV_FILE: ci/environment.yml - PYTHON_VERSION: "3.10" + PYTHON_VERSION: "3.11" steps: - uses: actions/checkout@v4 @@ -92,7 +92,7 @@ jobs: shell: bash -l {0} env: CONDA_ENV_FILE: ci/environment.yml - PYTHON_VERSION: "3.10" + PYTHON_VERSION: "3.11" steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 43985f508..b963d671d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -25,7 +25,7 @@ jobs: fail-fast: false matrix: os: ["ubuntu-latest", "windows-latest"] - python-version: ["3.8", "3.10"] + python-version: ["3.9", "3.11"] steps: - uses: actions/checkout@v4 with: diff --git a/.github/workflows/testpypi-release.yaml b/.github/workflows/testpypi-release.yaml index 858348e6d..3b5f98b17 100644 --- a/.github/workflows/testpypi-release.yaml +++ b/.github/workflows/testpypi-release.yaml @@ -24,7 +24,7 @@ jobs: - uses: actions/setup-python@v4 name: Install Python with: - python-version: "3.10" + python-version: "3.11" - name: Install dependencies run: | @@ -65,7 +65,7 @@ jobs: - uses: actions/setup-python@v4 name: Install Python with: - python-version: "3.10" + python-version: "3.11" - uses: actions/download-artifact@v3 with: name: releases diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 7d87be16a..80ad81ac6 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -26,7 +26,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.10"] + python-version: ["3.11"] steps: - uses: actions/checkout@v4 with: diff --git a/flox/core.py b/flox/core.py index f8f700f99..98c37bc6f 100644 --- a/flox/core.py +++ b/flox/core.py @@ -7,16 +7,14 @@ import sys import warnings from collections import namedtuple +from collections.abc import Mapping, Sequence from functools import partial, reduce from numbers import Integral from typing import ( TYPE_CHECKING, Any, Callable, - Dict, Literal, - Mapping, - Sequence, Union, overload, ) @@ -74,8 +72,8 @@ T_IsBins = Union[bool | Sequence[bool]] -IntermediateDict = Dict[Union[str, Callable], Any] -FinalResultsDict = Dict[str, Union["DaskArray", np.ndarray]] +IntermediateDict = dict[Union[str, Callable], Any] +FinalResultsDict = dict[str, Union["DaskArray", np.ndarray]] FactorProps = namedtuple("FactorProps", "offset_group nan_sentinel nanmask") # This dummy axis is inserted using np.expand_dims diff --git a/flox/xarray.py b/flox/xarray.py index 487850ca0..c85ad7113 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -1,6 +1,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Hashable, Iterable, Sequence, Union +from collections.abc import Hashable, Iterable, Sequence +from typing import TYPE_CHECKING, Any, Union import numpy as np import pandas as pd diff --git a/flox/xrutils.py b/flox/xrutils.py index 958bd3976..4f80ec8c8 100644 --- a/flox/xrutils.py +++ b/flox/xrutils.py @@ -2,7 +2,8 @@ # defined in xarray import datetime -from typing import Any, Iterable 
+from collections.abc import Iterable +from typing import Any import numpy as np import pandas as pd diff --git a/pyproject.toml b/pyproject.toml index fb27ee761..e41fb17eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "flox" description = "GroupBy operations for dask.array" license = {file = "LICENSE"} readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.9" keywords = ["xarray", "dask", "groupby"] classifiers = [ "Development Status :: 4 - Beta", @@ -11,7 +11,6 @@ classifiers = [ "Natural Language :: English", "Operating System :: OS Independent", "Programming Language :: Python", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -60,10 +59,10 @@ write_to_template= '__version__ = "{version}"' [tool.black] line-length = 100 -target-version = ["py38"] +target-version = ["py39"] [tool.ruff] -target-version = "py38" +target-version = "py39" builtins = ["ellipsis"] exclude = [ ".eggs", From f2945f0312339b3ad7b63b670bfd60965e804ee9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 3 Oct 2023 17:10:07 -0600 Subject: [PATCH 13/24] [pre-commit.ci] pre-commit autoupdate (#268) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.0.276 → v0.0.292](https://github.com/astral-sh/ruff-pre-commit/compare/v0.0.276...v0.0.292) - [github.com/psf/black: 23.3.0 → 23.9.1](https://github.com/psf/black/compare/23.3.0...23.9.1) - [github.com/executablebooks/mdformat: 0.7.16 → 0.7.17](https://github.com/executablebooks/mdformat/compare/0.7.16...0.7.17) - [github.com/codespell-project/codespell: v2.2.5 → v2.2.6](https://github.com/codespell-project/codespell/compare/v2.2.5...v2.2.6) - [github.com/abravalheri/validate-pyproject: v0.13 → v0.14](https://github.com/abravalheri/validate-pyproject/compare/v0.13...v0.14) * fix typos --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Deepak Cherian --- .pre-commit-config.yaml | 10 +++++----- docs/source/implementation.md | 2 +- docs/source/user-stories/custom-aggregations.ipynb | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e9b1e9d6b..c36fa38cf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ ci: repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. 
- rev: 'v0.0.276' + rev: 'v0.0.292' hooks: - id: ruff args: ["--fix"] @@ -18,12 +18,12 @@ repos: - id: check-docstring-first - repo: https://github.com/psf/black - rev: 23.3.0 + rev: 23.9.1 hooks: - id: black - repo: https://github.com/executablebooks/mdformat - rev: 0.7.16 + rev: 0.7.17 hooks: - id: mdformat additional_dependencies: @@ -44,13 +44,13 @@ repos: args: [--extra-keys=metadata.kernelspec metadata.language_info.version] - repo: https://github.com/codespell-project/codespell - rev: v2.2.5 + rev: v2.2.6 hooks: - id: codespell additional_dependencies: - tomli - repo: https://github.com/abravalheri/validate-pyproject - rev: v0.13 + rev: v0.14 hooks: - id: validate-pyproject diff --git a/docs/source/implementation.md b/docs/source/implementation.md index f3a2a87f7..29d9faf46 100644 --- a/docs/source/implementation.md +++ b/docs/source/implementation.md @@ -199,7 +199,7 @@ width: 100% 1. Group labels must be known at graph construction time, so this only works for numpy arrays. 1. This does require more tasks and a more complicated graph, but the communication overhead can be significantly lower. 1. The detection of "cohorts" is currently slow but could be improved. -1. The extra effort of detecting cohorts and mul;tiple copying of intermediate blocks may be worthwhile only if the chunk sizes are small +1. The extra effort of detecting cohorts and multiple copying of intermediate blocks may be worthwhile only if the chunk sizes are small relative to the approximate period of group labels, or small relative to the size of spatially localized groups. ### Example : sensitivity to chunking diff --git a/docs/source/user-stories/custom-aggregations.ipynb b/docs/source/user-stories/custom-aggregations.ipynb index 7b4167b98..f191c77e0 100644 --- a/docs/source/user-stories/custom-aggregations.ipynb +++ b/docs/source/user-stories/custom-aggregations.ipynb @@ -135,7 +135,7 @@ " # The next are for dask inputs and describe how to reduce\n", " # the data in parallel\n", " chunk=(\"sum\", \"nanlen\"), # first compute these blockwise : (grouped_sum, grouped_count)\n", - " combine=(\"sum\", \"sum\"), # reduce intermediate reuslts (sum the sums, sum the counts)\n", + " combine=(\"sum\", \"sum\"), # reduce intermediate results (sum the sums, sum the counts)\n", " finalize=lambda sum_, count: sum_ / count, # final mean value (divide sum by count)\n", "\n", " fill_value=(0, 0), # fill value for intermediate sums and counts when groups have no members\n", From 30d522dbaaeb84245f0496304f73272f3183bb2e Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 4 Oct 2023 22:04:36 -0600 Subject: [PATCH 14/24] Add multidimensional binning demo (#203) * Add nD binning notebook * fix. 
--- docs/source/user-stories.md | 1 + docs/source/user-stories/nD-bins.ipynb | 373 +++++++++++++++++++++++++ 2 files changed, 374 insertions(+) create mode 100644 docs/source/user-stories/nD-bins.ipynb diff --git a/docs/source/user-stories.md b/docs/source/user-stories.md index 22b37939e..0241e01dc 100644 --- a/docs/source/user-stories.md +++ b/docs/source/user-stories.md @@ -8,4 +8,5 @@ user-stories/climatology.ipynb user-stories/climatology-hourly.ipynb user-stories/custom-aggregations.ipynb + user-stories/nD-bins.ipynb ``` diff --git a/docs/source/user-stories/nD-bins.ipynb b/docs/source/user-stories/nD-bins.ipynb new file mode 100644 index 000000000..87ef942bf --- /dev/null +++ b/docs/source/user-stories/nD-bins.ipynb @@ -0,0 +1,373 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e970d800-c612-482a-bb3a-b1eb7ad53d88", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "# Binning with multi-dimensional bins\n", + "\n", + "```{warning}\n", + "This post is a proof-of-concept for discussion. Expect APIs to change to enable this use case.\n", + "```\n", + "\n", + "Here we explore a binning problem where the bins are multidimensional\n", + "([xhistogram issue](https://github.com/xgcm/xhistogram/issues/28))\n", + "\n", + "> One of such multi-dim bin applications is the ranked probability score rps we\n", + "> use in `xskillscore.rps`, where we want to know how many forecasts fell into\n", + "> which bins. Bins are often defined as terciles of the forecast distribution\n", + "> and the bins for these terciles\n", + "> (`forecast_with_lon_lat_time_dims.quantile(q=[.33,.66],dim='time')`) depend on\n", + "> `lon` and `lat`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01f1a2ef-de62-45d0-a04e-343cd78debc5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import math\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import xarray as xr\n", + "\n", + "import flox\n", + "import flox.xarray" + ] + }, + { + "cell_type": "markdown", + "id": "0be3e214-0cf0-426f-8ebb-669cc5322310", + "metadata": { + "user_expressions": [] + }, + "source": [ + "## Create test data\n" + ] + }, + { + "cell_type": "markdown", + "id": "ce239000-e053-4fc3-ad14-e9e0160da869", + "metadata": { + "user_expressions": [] + }, + "source": [ + "Data to be reduced\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7659c24e-f5a1-4e59-84c0-5ec965ef92d2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "array = xr.DataArray(\n", + " np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]),\n", + " dims=(\"space\", \"time\"),\n", + " name=\"array\",\n", + ")\n", + "array" + ] + }, + { + "cell_type": "markdown", + "id": "da0c0ac9-ad75-42cd-a1ea-99069f5bef00", + "metadata": { + "user_expressions": [] + }, + "source": [ + "Array to group by\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4601e744-5d22-447e-97ce-9644198d485e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "by = xr.DataArray(\n", + " np.array([[1, 2, 3], [3, 4, 5], [5, 6, 7], [6, 7, 9]]),\n", + " dims=(\"space\", \"time\"),\n", + " name=\"by\",\n", + ")\n", + "by" + ] + }, + { + "cell_type": "markdown", + "id": "61c21c94-7b6e-46a6-b9c2-59d7b2d40c81", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "Multidimensional bins:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "863a1991-ab8d-47c0-aa48-22b422fcea8c", + "metadata": { + "tags": 
[] + }, + "outputs": [], + "source": [ + "bins = by + 0.5\n", + "bins = xr.DataArray(\n", + " np.concatenate([bins, bins[:, [-1]] + 1], axis=-1)[:, :-1].T,\n", + " dims=(\"time\", \"nbins\"),\n", + " name=\"bins\",\n", + ")\n", + "bins" + ] + }, + { + "cell_type": "markdown", + "id": "e65ecaba-d1cc-4485-ae58-c390cb2ebfab", + "metadata": { + "user_expressions": [] + }, + "source": [ + "## Concept\n", + "\n", + "The key idea is that GroupBy is two steps:\n", + "\n", + "1. Factorize (a.k.a \"digitize\") : convert the `by` data to a set of integer\n", + " codes representing the bins.\n", + "2. Apply the reduction.\n", + "\n", + "We treat multi-dimensional binning as a slightly complicated factorization\n", + "problem. Assume that bins are a function of `time`. So we\n", + "\n", + "1. generate a set of appropriate integer codes by:\n", + " 1. Loop over \"time\" and factorize the data appropriately.\n", + " 2. Add an offset to these codes so that \"bin 0\" for `time=0` is different\n", + " from \"bin 0\" for `time=1`\n", + "2. apply the groupby reduction to the \"offset codes\"\n", + "3. reshape the output to the right shape\n", + "\n", + "We will work at the xarray level, so its easy to keep track of the different\n", + "dimensions.\n", + "\n", + "### Factorizing\n", + "\n", + "The core `factorize_` function (which wraps `pd.cut`) only handles 1D bins, so\n", + "we use `xr.apply_ufunc` to vectorize it for us.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa33ab2c-0ecf-4198-a033-2a77f5d83c99", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "factorize_loop_dim = \"time\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afcddcc1-dd57-461e-a649-1f8bcd30342f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def factorize_nd_bins_core(by, bins):\n", + " group_idx, *_, props = flox.core.factorize_(\n", + " (by,),\n", + " axes=(-1,),\n", + " expected_groups=(pd.IntervalIndex.from_breaks(bins),),\n", + " )\n", + " # Use -1 as the NaN sentinel value\n", + " group_idx[props.nanmask] = -1\n", + " return group_idx\n", + "\n", + "\n", + "codes = xr.apply_ufunc(\n", + " factorize_nd_bins_core,\n", + " by,\n", + " bins,\n", + " # TODO: avoid hardcoded dim names\n", + " input_core_dims=[[\"space\"], [\"nbins\"]],\n", + " output_core_dims=[[\"space\"]],\n", + " vectorize=True,\n", + ")\n", + "codes" + ] + }, + { + "cell_type": "markdown", + "id": "1661312a-dc61-4a26-bfd8-12c2dc01eb15", + "metadata": { + "user_expressions": [] + }, + "source": [ + "### Offset the codes\n", + "\n", + "These are integer codes appropriate for a single timestep.\n", + "\n", + "We now add an offset that changes in time, to make sure \"bin 0\" for `time=0` is\n", + "different from \"bin 0\" for `time=1` (taken from\n", + "[this StackOverflow thread](https://stackoverflow.com/questions/46256279/bin-elements-per-row-vectorized-2d-bincount-for-numpy)).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e5801cb-a79c-4670-ad10-36bb19f1a6ff", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "N = math.prod([codes.sizes[d] for d in codes.dims if d != factorize_loop_dim])\n", + "offset = xr.DataArray(np.arange(codes.sizes[factorize_loop_dim]), dims=factorize_loop_dim)\n", + "# TODO: think about N-1 here\n", + "offset_codes = (codes + offset * (N - 1)).rename(by.name)\n", + "offset_codes.data[codes == -1] = -1\n", + "offset_codes" + ] + }, + { + "cell_type": "markdown", + "id": 
"6c06c48b-316b-4a33-9bc3-921acd10bcba", + "metadata": { + "user_expressions": [] + }, + "source": [ + "### Reduce\n", + "\n", + "Now that we have appropriate codes, let's apply the reduction\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cf1295e-4585-48b9-ac2b-9e00d03b2b9a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "interim = flox.xarray.xarray_reduce(\n", + " array,\n", + " offset_codes,\n", + " func=\"sum\",\n", + " # We use RangeIndex to indicate that `-1` code can be safely ignored\n", + " # (it indicates values outside the bins)\n", + " # TODO: Avoid hardcoding 9 = sizes[\"time\"] x (sizes[\"nbins\"] - 1)\n", + " expected_groups=pd.RangeIndex(9),\n", + ")\n", + "interim" + ] + }, + { + "cell_type": "markdown", + "id": "3539509b-d9b4-4342-a679-6ada6f285dfb", + "metadata": { + "user_expressions": [] + }, + "source": [ + "## Make final result\n", + "\n", + "Now reshape that 1D result appropriately.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1389d37-d76d-4a50-9dfb-8710258de3fd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "final = (\n", + " interim.coarsen(by=3)\n", + " # bin_number dimension is last, this makes sense since it is the core dimension\n", + " # and we vectorize over the loop dims.\n", + " # So the first (Nbins-1) elements are for the first index of the loop dim\n", + " .construct({\"by\": (factorize_loop_dim, \"bin_number\")})\n", + " .transpose(..., factorize_loop_dim)\n", + " .drop_vars(\"by\")\n", + ")\n", + "final" + ] + }, + { + "cell_type": "markdown", + "id": "a98b5e60-94af-45ae-be1b-4cb47e2d77ba", + "metadata": { + "user_expressions": [] + }, + "source": [ + "I think this is the expected answer.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "053a8643-f6d9-4fd1-b014-230fa716449c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "array.isel(space=slice(1, None)).rename({\"space\": \"bin_number\"}).identical(final)" + ] + }, + { + "cell_type": "markdown", + "id": "619ba4c4-7c87-459a-ab86-c187d3a86c67", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "## TODO\n", + "\n", + "This could be extended to:\n", + "\n", + "1. handle multiple `factorize_loop_dim`\n", + "2. avoid hard coded dimension names in the `apply_ufunc` call for factorizing\n", + "3. avoid hard coded number of output elements in the `xarray_reduce` call.\n", + "4. Somehow propagate the bin edges to the final output.\n" + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 68b122e441d2f7fb1b6d70a4a2805b1f5226fe6f Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 5 Oct 2023 16:39:52 -0600 Subject: [PATCH 15/24] Support quantile, median, mode with method="blockwise". (#269) * Support quantile, median with method="blockwise". We allow method="blockwise" when grouping by a dask array. This can only work if we have expected_groups, and set reindex=True. * Update flox/core.py * fix comment * Update validate_reindex test * Fix * Fix * Raise early if `q` is not provided for quantile * WIP test * narrow type * fix type * Mode + Tests * limit tests * cleanup tests * fix bool tests * Revert "limit tests" This reverts commit d46c3aef3b9c5d9dadd6555c0e7b1b628c0633e6. 
* Small cleanup * more cleanup * fix test * ignore scipy typing * update docs --- ci/environment.yml | 1 + docs/source/aggregations.md | 7 +- .../user-stories/custom-aggregations.ipynb | 9 +- flox/aggregate_npg.py | 81 +++++++++++++ flox/aggregations.py | 48 ++++++-- flox/core.py | 54 ++++++--- flox/xarray.py | 9 +- pyproject.toml | 3 +- tests/__init__.py | 1 + tests/test_core.py | 110 ++++++++++++++---- 10 files changed, 261 insertions(+), 62 deletions(-) diff --git a/ci/environment.yml b/ci/environment.yml index 93f0e891f..9d5aa6d01 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -22,3 +22,4 @@ dependencies: - pooch - toolz - numba + - scipy diff --git a/docs/source/aggregations.md b/docs/source/aggregations.md index e6c10e4ba..d3591d2dc 100644 --- a/docs/source/aggregations.md +++ b/docs/source/aggregations.md @@ -11,8 +11,11 @@ the `func` kwarg: - `"std"`, `"nanstd"` - `"argmin"` - `"argmax"` -- `"first"` -- `"last"` +- `"first"`, `"nanfirst"` +- `"last"`, `"nanlast"` +- `"median"`, `"nanmedian"` +- `"mode"`, `"nanmode"` +- `"quantile"`, `"nanquantile"` ```{tip} We would like to add support for `cumsum`, `cumprod` ([issue](https://github.com/xarray-contrib/flox/issues/91)). Contributions are welcome! diff --git a/docs/source/user-stories/custom-aggregations.ipynb b/docs/source/user-stories/custom-aggregations.ipynb index f191c77e0..8b9be09e9 100644 --- a/docs/source/user-stories/custom-aggregations.ipynb +++ b/docs/source/user-stories/custom-aggregations.ipynb @@ -15,8 +15,13 @@ ">\n", "> A = da.groupby(['lon_bins', 'lat_bins']).mode()\n", "\n", - "This notebook will describe how to accomplish this using a custom `Aggregation`\n", - "since `mode` and `median` aren't supported by flox yet.\n" + "This notebook will describe how to accomplish this using a custom `Aggregation`.\n", + "\n", + "\n", + "```{tip}\n", + "flox now supports `mode`, `nanmode`, `quantile`, `nanquantile`, `median`, `nanmedian` using exactly the same \n", + "approach as shown below\n", + "```\n" ] }, { diff --git a/flox/aggregate_npg.py b/flox/aggregate_npg.py index 30e0eb257..966bd43b8 100644 --- a/flox/aggregate_npg.py +++ b/flox/aggregate_npg.py @@ -100,3 +100,84 @@ def _len(group_idx, array, engine, *, func, axis=-1, size=None, fill_value=None, len = partial(_len, func="len") nanlen = partial(_len, func="nanlen") + + +def median(group_idx, array, engine, *, axis=-1, size=None, fill_value=None, dtype=None): + return npg.aggregate_numpy.aggregate( + group_idx, + array, + func=np.median, + axis=axis, + size=size, + fill_value=fill_value, + dtype=dtype, + ) + + +def nanmedian(group_idx, array, engine, *, axis=-1, size=None, fill_value=None, dtype=None): + return npg.aggregate_numpy.aggregate( + group_idx, + array, + func=np.nanmedian, + axis=axis, + size=size, + fill_value=fill_value, + dtype=dtype, + ) + + +def quantile(group_idx, array, engine, *, q, axis=-1, size=None, fill_value=None, dtype=None): + return npg.aggregate_numpy.aggregate( + group_idx, + array, + func=partial(np.quantile, q=q), + axis=axis, + size=size, + fill_value=fill_value, + dtype=dtype, + ) + + +def nanquantile(group_idx, array, engine, *, q, axis=-1, size=None, fill_value=None, dtype=None): + return npg.aggregate_numpy.aggregate( + group_idx, + array, + func=partial(np.nanquantile, q=q), + axis=axis, + size=size, + fill_value=fill_value, + dtype=dtype, + ) + + +def mode_(array, nan_policy, dtype): + from scipy.stats import mode + + # npg splits `array` into object arrays for each group + # scipy.stats.mode does not like that + # 
here we cast back + return mode(array.astype(dtype, copy=False), nan_policy=nan_policy, axis=-1).mode + + +def mode(group_idx, array, engine, *, axis=-1, size=None, fill_value=None, dtype=None): + return npg.aggregate_numpy.aggregate( + group_idx, + array, + func=partial(mode_, nan_policy="propagate", dtype=array.dtype), + axis=axis, + size=size, + fill_value=fill_value, + dtype=dtype, + ) + + +def nanmode(group_idx, array, engine, *, axis=-1, size=None, fill_value=None, dtype=None): + return npg.aggregate_numpy.aggregate( + group_idx, + array, + func=partial(mode_, nan_policy="omit", dtype=array.dtype), + axis=axis, + size=size, + fill_value=fill_value, + dtype=dtype, + ) diff --git a/flox/aggregations.py b/flox/aggregations.py index e5013032a..c06ef3509 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any, Callable, TypedDict import numpy as np -import numpy_groupies as npg from numpy.typing import DTypeLike from . import aggregate_flox, aggregate_npg, xrutils @@ -35,6 +34,16 @@ class AggDtype(TypedDict): intermediate: tuple[np.dtype | type[np.intp], ...] +def get_npg_aggregation(func, *, engine): + try: + method_ = getattr(aggregate_npg, func) + method = partial(method_, engine=engine) + except AttributeError: + aggregate = aggregate_npg._get_aggregate(engine).aggregate + method = partial(aggregate, func=func) + return method + + def generic_aggregate( group_idx, array, @@ -51,14 +60,11 @@ def generic_aggregate( try: method = getattr(aggregate_flox, func) except AttributeError: - method = partial(npg.aggregate_numpy.aggregate, func=func) + method = get_npg_aggregation(func, engine="numpy") + elif engine in ["numpy", "numba"]: - try: - method_ = getattr(aggregate_npg, func) - method = partial(method_, engine=engine) - except AttributeError: - aggregate = aggregate_npg._get_aggregate(engine).aggregate - method = partial(aggregate, func=func) + method = get_npg_aggregation(func, engine=engine) + else: raise ValueError( f"Expected engine to be one of ['flox', 'numpy', 'numba']. Received {engine} instead." @@ -465,10 +471,22 @@ def _pick_second(*x): final_dtype=bool, ) -# numpy_groupies does not support median -# And the dask version is really hard! -# median = Aggregation("median", chunk=None, combine=None, fill_value=None) -# nanmedian = Aggregation("nanmedian", chunk=None, combine=None, fill_value=None) +# Support statistical quantities only blockwise +# The parallel versions will be approximate and are hard to implement! 
+median = Aggregation( + name="median", fill_value=dtypes.NA, chunk=None, combine=None, final_dtype=np.float64 +) +nanmedian = Aggregation( + name="nanmedian", fill_value=dtypes.NA, chunk=None, combine=None, final_dtype=np.float64 +) +quantile = Aggregation( + name="quantile", fill_value=dtypes.NA, chunk=None, combine=None, final_dtype=np.float64 +) +nanquantile = Aggregation( + name="nanquantile", fill_value=dtypes.NA, chunk=None, combine=None, final_dtype=np.float64 +) +mode = Aggregation(name="mode", fill_value=dtypes.NA, chunk=None, combine=None) +nanmode = Aggregation(name="nanmode", fill_value=dtypes.NA, chunk=None, combine=None) aggregations = { "any": any_, @@ -496,6 +514,12 @@ def _pick_second(*x): "nanfirst": nanfirst, "last": last, "nanlast": nanlast, + "median": median, + "nanmedian": nanmedian, + "quantile": quantile, + "nanquantile": nanquantile, + "mode": mode, + "nanmode": nanmode, } diff --git a/flox/core.py b/flox/core.py index 98c37bc6f..484303295 100644 --- a/flox/core.py +++ b/flox/core.py @@ -1307,15 +1307,14 @@ def dask_groupby_agg( assert isinstance(axis, Sequence) assert all(ax >= 0 for ax in axis) - if method == "blockwise" and not isinstance(by, np.ndarray): - raise NotImplementedError - inds = tuple(range(array.ndim)) name = f"groupby_{agg.name}" token = dask.base.tokenize(array, by, agg, expected_groups, axis) if expected_groups is None and reindex: expected_groups = _get_expected_groups(by, sort=sort) + if method == "cohorts": + assert reindex is False by_input = by @@ -1349,7 +1348,6 @@ def dask_groupby_agg( # b. "_grouped_combine": A more general solution where we tree-reduce the groupby reduction. # This allows us to discover groups at compute time, support argreductions, lower intermediate # memory usage (but method="cohorts" would also work to reduce memory in some cases) - do_simple_combine = not _is_arg_reduction(agg) if method == "blockwise": @@ -1375,7 +1373,7 @@ def dask_groupby_agg( partial( blockwise_method, axis=axis, - expected_groups=None if method == "cohorts" else expected_groups, + expected_groups=expected_groups if reindex else None, engine=engine, sort=sort, ), @@ -1468,14 +1466,24 @@ def dask_groupby_agg( elif method == "blockwise": reduced = intermediate - # Here one input chunk → one output chunks - # find number of groups in each chunk, this is needed for output chunks - # along the reduced axis - slices = slices_from_chunks(tuple(array.chunks[ax] for ax in axis)) - groups_in_block = tuple(_unique(by_input[slc]) for slc in slices) - groups = (np.concatenate(groups_in_block),) - ngroups_per_block = tuple(len(grp) for grp in groups_in_block) - group_chunks = (ngroups_per_block,) + if reindex: + if TYPE_CHECKING: + assert expected_groups is not None + # TODO: we could have `expected_groups` be a dask array with appropriate chunks + # for now, we have a numpy array that is interpreted as listing all group labels + # that are present in every chunk + groups = (expected_groups,) + group_chunks = ((len(expected_groups),),) + else: + # Here one input chunk → one output chunks + # find number of groups in each chunk, this is needed for output chunks + # along the reduced axis + # TODO: this logic is very specialized for the resampling case + slices = slices_from_chunks(tuple(array.chunks[ax] for ax in axis)) + groups_in_block = tuple(_unique(by_input[slc]) for slc in slices) + groups = (np.concatenate(groups_in_block),) + ngroups_per_block = tuple(len(grp) for grp in groups_in_block) + group_chunks = (ngroups_per_block,) else: raise 
ValueError(f"Unknown method={method}.") @@ -1547,7 +1555,7 @@ def _validate_reindex( if reindex is True and not all_numpy: if _is_arg_reduction(func): raise NotImplementedError - if method in ["blockwise", "cohorts"]: + if method == "cohorts" or (method == "blockwise" and not any_by_dask): raise ValueError( "reindex=True is not a valid choice for method='blockwise' or method='cohorts'." ) @@ -1562,7 +1570,11 @@ def _validate_reindex( # have to do the grouped_combine since there's no good fill_value reindex = False - if method == "blockwise" or _is_arg_reduction(func): + if method == "blockwise": + # for grouping by dask arrays, we set reindex=True + reindex = any_by_dask + + elif _is_arg_reduction(func): reindex = False elif method == "cohorts": @@ -1767,7 +1779,10 @@ def groupby_reduce( *by : ndarray or DaskArray Array of labels to group over. Must be aligned with ``array`` so that ``array.shape[-by.ndim :] == by.shape`` - func : str or Aggregation + func : {"all", "any", "count", "sum", "nansum", "mean", "nanmean", \ + "max", "nanmax", "min", "nanmin", "argmax", "nanargmax", "argmin", "nanargmin", \ + "quantile", "nanquantile", "median", "nanmedian", "mode", "nanmode", \ + "first", "nanfirst", "last", "nanlast"} or Aggregation Single function name or an Aggregation instance expected_groups : (optional) Sequence Expected unique labels. @@ -1835,7 +1850,7 @@ def groupby_reduce( boost in computation speed. For cases like time grouping, this may result in large intermediates relative to the original block size. Avoid that by using ``method="cohorts"``. By default, it is turned off for argreductions. finalize_kwargs : dict, optional - Kwargs passed to finalize the reduction such as ``ddof`` for var, std. + Kwargs passed to finalize the reduction such as ``ddof`` for var, std or ``q`` for quantile. Returns ------- @@ -1855,6 +1870,9 @@ def groupby_reduce( "Try engine='numpy' or engine='numba' instead." ) + if func == "quantile" and (finalize_kwargs is None or "q" not in finalize_kwargs): + raise ValueError("Please pass `q` for quantile calculations.") + bys: T_Bys = tuple(np.asarray(b) if not is_duck_array(b) else b for b in by) nby = len(bys) by_is_dask = tuple(is_duck_dask_array(b) for b in bys) @@ -2023,7 +2041,7 @@ def groupby_reduce( result, groups = partial_agg( array, by_, - expected_groups=None if method == "blockwise" else expected_groups, + expected_groups=expected_groups, agg=agg, reindex=reindex, method=method, diff --git a/flox/xarray.py b/flox/xarray.py index c85ad7113..eb35da387 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -88,8 +88,11 @@ def xarray_reduce( Xarray object to reduce *by : DataArray or iterable of str or iterable of DataArray Variables with which to group by ``obj`` - func : str or Aggregation - Reduction method + func : {"all", "any", "count", "sum", "nansum", "mean", "nanmean", \ + "max", "nanmax", "min", "nanmin", "argmax", "nanargmax", "argmin", "nanargmin", \ + "quantile", "nanquantile", "median", "nanmedian", "mode", "nanmode", \ + "first", "nanfirst", "last", "nanlast"} or Aggregation + Single function name or an Aggregation instance expected_groups : str or sequence expected group labels corresponding to each `by` variable isbin : iterable of bool @@ -164,7 +167,7 @@ def xarray_reduce( boost in computation speed. For cases like time grouping, this may result in large intermediates relative to the original block size. Avoid that by using method="cohorts". By default, it is turned off for arg reductions. 
**finalize_kwargs - kwargs passed to the finalize function, like ``ddof`` for var, std. + kwargs passed to the finalize function, like ``ddof`` for var, std or ``q`` for quantile. Returns ------- diff --git a/pyproject.toml b/pyproject.toml index e41fb17eb..e387657d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -118,7 +118,8 @@ module=[ "matplotlib.*", "pandas", "setuptools", - "toolz" + "scipy.*", + "toolz", ] ignore_missing_imports = true diff --git a/tests/__init__.py b/tests/__init__.py index 3e43e94d6..f1c8ec6bf 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -47,6 +47,7 @@ def LooseVersion(vstring): has_dask, requires_dask = _importorskip("dask") has_numba, requires_numba = _importorskip("numba") +has_scipy, requires_scipy = _importorskip("scipy") has_xarray, requires_xarray = _importorskip("xarray") diff --git a/tests/test_core.py b/tests/test_core.py index 453958b2d..440431304 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -31,6 +31,7 @@ has_dask, raise_if_dask_computes, requires_dask, + requires_scipy, ) labels = np.array([0, 0, 2, 2, 2, 1, 1, 2, 2, 1, 1, 0]) @@ -50,6 +51,9 @@ def dask_array_ones(*args): return None +DEFAULT_QUANTILE = 0.9 +SCIPY_STATS_FUNCS = ("mode", "nanmode") +BLOCKWISE_FUNCS = ("median", "nanmedian", "quantile", "nanquantile") + SCIPY_STATS_FUNCS ALL_FUNCS = ( "sum", "nansum", @@ -73,9 +77,11 @@ def dask_array_ones(*args): "any", "all", "nanlast", - pytest.param("median", marks=(pytest.mark.skip,)), - pytest.param("nanmedian", marks=(pytest.mark.skip,)), -) + "median", + "nanmedian", + "quantile", + "nanquantile", +) + tuple(pytest.param(func, marks=requires_scipy) for func in SCIPY_STATS_FUNCS) if TYPE_CHECKING: from flox.core import T_Agg, T_Engine, T_ExpectedGroupsOpt, T_Method @@ -84,12 +90,26 @@ def dask_array_ones(*args): def _get_array_func(func: str) -> Callable: if func == "count": - def npfunc(x): + def npfunc(x, **kwargs): x = np.asarray(x) return (~np.isnan(x)).sum() elif func in ["nanfirst", "nanlast"]: npfunc = getattr(xrutils, func) + + elif func in SCIPY_STATS_FUNCS: + import scipy.stats + + if "nan" in func: + func = func[3:] + nan_policy = "omit" + else: + nan_policy = "propagate" + + def npfunc(x, **kwargs): + spfunc = partial(getattr(scipy.stats, func), nan_policy=nan_policy) + return getattr(spfunc(x, **kwargs), func) + else: npfunc = getattr(np, func) @@ -205,7 +225,7 @@ def gen_array_by(size, func): @pytest.mark.parametrize("add_nan_by", [True, False]) @pytest.mark.parametrize("func", ALL_FUNCS) def test_groupby_reduce_all(nby, size, chunks, func, add_nan_by, engine): - if "arg" in func and engine == "flox": + if ("arg" in func and engine == "flox") or (func in BLOCKWISE_FUNCS and chunks != -1): pytest.skip() array, by = gen_array_by(size, func) @@ -224,6 +244,10 @@ def test_groupby_reduce_all(nby, size, chunks, func, add_nan_by, engine): finalize_kwargs = finalize_kwargs + [{"ddof": 1}, {"ddof": 0}] fill_value = np.nan tolerance = {"rtol": 1e-14, "atol": 1e-16} + elif "quantile" in func: + finalize_kwargs = [{"q": DEFAULT_QUANTILE}] + fill_value = None + tolerance = None else: fill_value = None tolerance = None @@ -246,15 +270,16 @@ def test_groupby_reduce_all(nby, size, chunks, func, add_nan_by, engine): func_ = f"nan{func}" if "nan" not in func else func array_[..., nanmask] = np.nan expected = getattr(np, func_)(array_, axis=-1, **kwargs) - # elif func in ["first", "last"]: - # expected = getattr(xrutils, f"nan{func}")(array_[..., ~nanmask], axis=-1, **kwargs) - elif func in ["nanfirst", "nanlast"]: - 
expected = getattr(xrutils, func)(array_[..., ~nanmask], axis=-1, **kwargs) else: - expected = getattr(np, func)(array_[..., ~nanmask], axis=-1, **kwargs) + array_func = _get_array_func(func) + expected = array_func(array_[..., ~nanmask], axis=-1, **kwargs) for _ in range(nby): expected = np.expand_dims(expected, -1) + if func in BLOCKWISE_FUNCS: + assert chunks == -1 + flox_kwargs["method"] = "blockwise" + actual, *groups = groupby_reduce(array, *by, **flox_kwargs) assert actual.ndim == (array.ndim + nby - 1) assert expected.ndim == (array.ndim + nby - 1) @@ -265,7 +290,7 @@ def test_groupby_reduce_all(nby, size, chunks, func, add_nan_by, engine): assert actual.dtype.kind == "i" assert_equal(actual, expected, tolerance) - if not has_dask or chunks is None: + if not has_dask or chunks is None or func in BLOCKWISE_FUNCS: continue params = list(itertools.product(["map-reduce"], [True, False, None])) @@ -396,7 +421,7 @@ def test_numpy_reduce_nd_md(): def test_groupby_agg_dask(func, shape, array_chunks, group_chunks, add_nan, dtype, engine, reindex): """Tests groupby_reduce with dask arrays against groupby_reduce with numpy arrays""" - if func in ["first", "last"]: + if func in ["first", "last"] or func in BLOCKWISE_FUNCS: pytest.skip() if "arg" in func and (engine == "flox" or reindex): @@ -551,7 +576,7 @@ def test_first_last_disallowed_dask(func): "axis", [None, (0, 1, 2), (0, 1), (0, 2), (1, 2), 0, 1, 2, (0,), (1,), (2,)] ) def test_groupby_reduce_axis_subset_against_numpy(func, axis, engine): - if "arg" in func and engine == "flox": + if ("arg" in func and engine == "flox") or func in BLOCKWISE_FUNCS: pytest.skip() if not isinstance(axis, int): @@ -847,7 +872,7 @@ def test_rechunk_for_cohorts(chunk_at, expected): def test_fill_value_behaviour(func, chunks, fill_value, engine): # fill_value = np.nan tests promotion of int counts to float # This is used by xarray - if func in ["all", "any"] or "arg" in func: + if (func in ["all", "any"] or "arg" in func) or func in BLOCKWISE_FUNCS: pytest.skip() npfunc = _get_array_func(func) @@ -903,8 +928,17 @@ def test_cohorts_map_reduce_consistent_dtypes(method, dtype, labels_dtype): @requires_dask @pytest.mark.parametrize("func", ALL_FUNCS) @pytest.mark.parametrize("axis", (-1, None)) -@pytest.mark.parametrize("method", ["blockwise", "cohorts", "map-reduce", "split-reduce"]) +@pytest.mark.parametrize("method", ["blockwise", "cohorts", "map-reduce"]) def test_cohorts_nd_by(func, method, axis, engine): + if ( + ("arg" in func and (axis is None or engine == "flox")) + or (method != "blockwise" and func in BLOCKWISE_FUNCS) + or (axis is None and ("first" in func or "last" in func)) + ): + pytest.skip() + if axis is not None and method != "map-reduce": + pytest.xfail() + o = dask.array.ones((3,), chunks=-1) o2 = dask.array.ones((2, 3), chunks=-1) @@ -915,20 +949,14 @@ def test_cohorts_nd_by(func, method, axis, engine): by[0, 4] = 31 array = np.broadcast_to(array, (2, 3) + array.shape) - if "arg" in func and (axis is None or engine == "flox"): - pytest.skip() - if func in ["any", "all"]: fill_value = False else: fill_value = -123 - if axis is not None and method != "map-reduce": - pytest.xfail() - if axis is None and ("first" in func or "last" in func): - pytest.skip() - kwargs = dict(func=func, engine=engine, method=method, axis=axis, fill_value=fill_value) + if "quantile" in func: + kwargs["finalize_kwargs"] = {"q": DEFAULT_QUANTILE} actual, groups = groupby_reduce(array, by, **kwargs) expected, sorted_groups = groupby_reduce(array.compute(), by, **kwargs) 
assert_equal(groups, sorted_groups) @@ -990,6 +1018,8 @@ def test_datetime_binning(): def test_bool_reductions(func, engine): if "arg" in func and engine == "flox": pytest.skip() + if "quantile" in func or "mode" in func: + pytest.skip() groups = np.array([1, 1, 1]) data = np.array([True, True, False]) npfunc = _get_array_func(func) @@ -1242,9 +1272,14 @@ def grouped_median(group_idx, array, *, axis=-1, size=None, fill_value=None, dty def test_dtype(func, dtype, engine): if "arg" in func or func in ["any", "all"]: pytest.skip() + + finalize_kwargs = {"q": DEFAULT_QUANTILE} if "quantile" in func else {} + arr = np.ones((4, 12), dtype=dtype) labels = np.array(["a", "a", "c", "c", "c", "b", "b", "c", "c", "b", "b", "f"]) - actual, _ = groupby_reduce(arr, labels, func=func, dtype=np.float64, engine=engine) + actual, _ = groupby_reduce( + arr, labels, func=func, dtype=np.float64, engine=engine, finalize_kwargs=finalize_kwargs + ) assert actual.dtype == np.dtype("float64") @@ -1387,6 +1422,33 @@ def test_validate_reindex() -> None: ) assert actual is False + with pytest.raises(ValueError): + _validate_reindex( + True, + "sum", + method="blockwise", + expected_groups=np.array([1, 2, 3]), + any_by_dask=False, + is_dask_array=True, + ) + + assert _validate_reindex( + True, + "sum", + method="blockwise", + expected_groups=np.array([1, 2, 3]), + any_by_dask=True, + is_dask_array=True, + ) + assert _validate_reindex( + None, + "sum", + method="blockwise", + expected_groups=np.array([1, 2, 3]), + any_by_dask=True, + is_dask_array=True, + ) + @requires_dask def test_1d_blockwise_sort_optimization(): From 9f82e19d94b168a13260365bf7d9ecd6e464c53b Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 7 Oct 2023 08:33:16 -0600 Subject: [PATCH 16/24] Add engine="numbagg" (#72) * Add engine="numbagg" * Fix. * fix CI * Add nanlen * fix env * Add numbagg * Bettter numbagg benchmarks? * Update ci/environment.yml * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * cleanup * Error on dtype specified * Don't shadow sum, mean, sum_of_squares * more skip * Fix backup npg aggregations * xfail nanmean bool * ignore numbagg for mypy * Add to upstream-dev CI * Add to optional dependencies * Fix bool reductions * fix mypy ignore * reintroduce engines * Update docstring * Update docs. * Support more aggregations * More aggregations * back to nancount * Add any, all * promote in nanstd too * Add ddof in anticipation of https://github.com/numbagg/numbagg/issues/138 * Add more benchmarks * reorder benchmark table * Fix numba compilation setup? 
* More benchmarks * Rework benchmarks * small docstring update * ignore asv typing * fix type ignoring * Guard against numbagg failures * Use released numbagg --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- asv_bench/asv.conf.json | 1 + asv_bench/benchmarks/reduce.py | 101 +++++++++++++++++++++++++++------ ci/environment.yml | 2 + ci/upstream-dev-env.yml | 1 + docs/source/engines.md | 8 ++- flox/aggregate_numbagg.py | 100 ++++++++++++++++++++++++++++++++ flox/aggregations.py | 18 +++++- flox/core.py | 34 ++++++++++- flox/xarray.py | 3 + pyproject.toml | 4 +- tests/__init__.py | 1 + tests/conftest.py | 10 +++- tests/test_core.py | 16 ++++-- tests/test_xarray.py | 4 ++ 14 files changed, 271 insertions(+), 32 deletions(-) create mode 100644 flox/aggregate_numbagg.py diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index d01f64dd4..cab53382a 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -72,6 +72,7 @@ // followed by the pip installed packages). // "matrix": { + "numbagg": [""], "numpy_groupies": [""], "numpy": [""], "pandas": [""], diff --git a/asv_bench/benchmarks/reduce.py b/asv_bench/benchmarks/reduce.py index 985658785..add77c182 100644 --- a/asv_bench/benchmarks/reduce.py +++ b/asv_bench/benchmarks/reduce.py @@ -1,49 +1,86 @@ import numpy as np -import numpy_groupies as npg import pandas as pd +from asv_runner.benchmarks.mark import parameterize, skip_for_params import flox +import flox.aggregations -from . import parameterized +N = 3000 +funcs = ["sum", "nansum", "mean", "nanmean", "max", "nanmax", "var", "count", "all"] +engines = ["flox", "numpy", "numbagg"] +expected_groups = { + "None": None, + "RangeIndex": pd.RangeIndex(5), + "bins": pd.IntervalIndex.from_breaks([1, 2, 4]), +} +expected_names = tuple(expected_groups) -N = 1000 -funcs = ["sum", "nansum", "mean", "nanmean", "max", "var", "nanvar", "count"] -engines = ["flox", "numpy"] -expected_groups = [None, pd.IntervalIndex.from_breaks([1, 2, 4])] +NUMBAGG_FUNCS = ["nansum", "nanmean", "nanmax", "count", "all"] + +numbagg_skip = [ + (func, expected_names[0], "numbagg") for func in funcs if func not in NUMBAGG_FUNCS +] + [(func, expected_names[1], "numbagg") for func in funcs if func not in NUMBAGG_FUNCS] + + +def setup_jit(): + # pre-compile jitted funcs + labels = np.ones((N), dtype=int) + array1 = np.ones((N), dtype=float) + array2 = np.ones((N, N), dtype=float) + + if "numba" in engines: + for func in funcs: + method = getattr(flox.aggregate_npg, func) + method(labels, array1, engine="numba") + if "numbagg" in engines: + for func in set(NUMBAGG_FUNCS) & set(funcs): + flox.groupby_reduce(array1, labels, func=func, engine="numbagg") + flox.groupby_reduce(array2, labels, func=func, engine="numbagg") class ChunkReduce: """Time the core reduction function.""" + min_run_count = 5 + warmup_time = 1 + def setup(self, *args, **kwargs): - # pre-compile jitted funcs - if "numba" in engines: - for func in funcs: - npg.aggregate_numba.aggregate( - np.ones((100,), dtype=int), np.ones((100,), dtype=int), func=func - ) raise NotImplementedError - @parameterized("func, engine, expected_groups", [funcs, engines, expected_groups]) - def time_reduce(self, func, engine, expected_groups): + @skip_for_params(numbagg_skip) + @parameterize({"func": funcs, "expected_name": expected_names, "engine": engines}) + def time_reduce(self, func, expected_name, engine): flox.groupby_reduce( 
self.array, self.labels, func=func, engine=engine, axis=self.axis, - expected_groups=expected_groups, + expected_groups=expected_groups[expected_name], ) - @parameterized("func, engine, expected_groups", [funcs, engines, expected_groups]) - def peakmem_reduce(self, func, engine, expected_groups): + @parameterize({"func": ["nansum", "nanmean", "nanmax", "count"], "engine": engines}) + def time_reduce_bare(self, func, engine): + flox.aggregations.generic_aggregate( + self.labels, + self.array, + axis=-1, + size=5, + func=func, + engine=engine, + fill_value=0, + ) + + @skip_for_params(numbagg_skip) + @parameterize({"func": funcs, "expected_name": expected_names, "engine": engines}) + def peakmem_reduce(self, func, expected_name, engine): flox.groupby_reduce( self.array, self.labels, func=func, engine=engine, axis=self.axis, - expected_groups=expected_groups, + expected_groups=expected_groups[expected_name], ) @@ -52,6 +89,16 @@ def setup(self, *args, **kwargs): self.array = np.ones((N,)) self.labels = np.repeat(np.arange(5), repeats=N // 5) self.axis = -1 + if "numbagg" in args: + setup_jit() + + +class ChunkReduce1DUnsorted(ChunkReduce): + def setup(self, *args, **kwargs): + self.array = np.ones((N,)) + self.labels = np.random.permutation(np.repeat(np.arange(5), repeats=N // 5)) + self.axis = -1 + setup_jit() class ChunkReduce2D(ChunkReduce): @@ -59,6 +106,15 @@ def setup(self, *args, **kwargs): self.array = np.ones((N, N)) self.labels = np.repeat(np.arange(N // 5), repeats=5) self.axis = -1 + setup_jit() + + +class ChunkReduce2DUnsorted(ChunkReduce): + def setup(self, *args, **kwargs): + self.array = np.ones((N, N)) + self.labels = np.random.permutation(np.repeat(np.arange(N // 5), repeats=5)) + self.axis = -1 + setup_jit() class ChunkReduce2DAllAxes(ChunkReduce): @@ -66,3 +122,12 @@ def setup(self, *args, **kwargs): self.array = np.ones((N, N)) self.labels = np.repeat(np.arange(N // 5), repeats=5) self.axis = None + setup_jit() + + +class ChunkReduce2DAllAxesUnsorted(ChunkReduce): + def setup(self, *args, **kwargs): + self.array = np.ones((N, N)) + self.labels = np.random.permutation(np.repeat(np.arange(N // 5), repeats=5)) + self.axis = None + setup_jit() diff --git a/ci/environment.yml b/ci/environment.yml index 9d5aa6d01..33e1b4661 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -23,3 +23,5 @@ dependencies: - toolz - numba - scipy + - pip: + - numbagg>=0.3 diff --git a/ci/upstream-dev-env.yml b/ci/upstream-dev-env.yml index 04fd7ce60..6b8e796ea 100644 --- a/ci/upstream-dev-env.yml +++ b/ci/upstream-dev-env.yml @@ -18,3 +18,4 @@ dependencies: - git+https://github.com/pandas-dev/pandas - git+https://github.com/dask/dask - git+https://github.com/ml31415/numpy-groupies + - git+https://github.com/numbagg/numbagg diff --git a/docs/source/engines.md b/docs/source/engines.md index 867979d13..68c65cab5 100644 --- a/docs/source/engines.md +++ b/docs/source/engines.md @@ -9,18 +9,20 @@ 1. `engine="numba"` wraps `numpy_groupies.aggregate_numba`. This uses `numba` kernels for the core aggregation. 1. `engine="flox"` uses the `ufunc.reduceat` method after first argsorting the array so that all group members occur sequentially. This was copied from a [gist by Stephan Hoyer](https://gist.github.com/shoyer/f538ac78ae904c936844) +1. `engine="numbagg"` uses the reductions available in [`numbagg.grouped`](https://github.com/numbagg/numbagg/blob/main/numbagg/grouped.py) + from the [numbagg](https://github.com/numbagg/numbagg) project. See [](arrays) for more details. 
## Tradeoffs -For the common case of reducing a nD array by a 1D array of group labels (e.g. `groupby("time.month")`), `engine="flox"` *can* be faster. +For the common case of reducing a nD array by a 1D array of group labels (e.g. `groupby("time.month")`), `engine="numbagg"` is almost always faster, and `engine="flox"` *can* be faster. The reason is that `numpy_groupies` converts all groupby problems to a 1D problem, this can involve [some overhead](https://github.com/ml31415/numpy-groupies/pull/46). It is possible to optimize this a bit in `flox` or `numpy_groupies`, but the work has not been done yet. The advantage of `engine="numpy"` is that it tends to work for more array types, since it appears to be more common to implement `np.bincount`, and not `np.add.reduceat`. ```{tip} -Other potential engines we could add are [`numbagg`](https://github.com/numbagg/numbagg) ([stalled PR here](https://github.com/xarray-contrib/flox/pull/72)) and [`datashader`](https://github.com/xarray-contrib/flox/issues/142). -Both use numba for high-performance aggregations. Contributions or discussion is very welcome! +One other potential engine we could add is [`datashader`](https://github.com/xarray-contrib/flox/issues/142). +Contributions or discussion is very welcome! ``` diff --git a/flox/aggregate_numbagg.py b/flox/aggregate_numbagg.py new file mode 100644 index 000000000..b0b06d86e --- /dev/null +++ b/flox/aggregate_numbagg.py @@ -0,0 +1,100 @@ +from functools import partial + +import numbagg +import numbagg.grouped +import numpy as np + + +def _numbagg_wrapper( + group_idx, + array, + *, + axis=-1, + func="sum", + size=None, + fill_value=None, + dtype=None, + numbagg_func=None, +): + return numbagg_func( + array, + group_idx, + axis=axis, + num_labels=size, + # The following are unsupported + # fill_value=fill_value, + # dtype=dtype, + ) + + +def nansum(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): + if np.issubdtype(array.dtype, np.bool_): + array = array.astype(np.in64) + return numbagg.grouped.group_nansum( + array, + group_idx, + axis=axis, + num_labels=size, + # fill_value=fill_value, + # dtype=dtype, + ) + + +def nanmean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): + if np.issubdtype(array.dtype, np.int_): + array = array.astype(np.float64) + return numbagg.grouped.group_nanmean( + array, + group_idx, + axis=axis, + num_labels=size, + # fill_value=fill_value, + # dtype=dtype, + ) + + +def nanvar(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None, ddof=0): + assert ddof != 0 + if np.issubdtype(array.dtype, np.int_): + array = array.astype(np.float64) + return numbagg.grouped.group_nanvar( + array, + group_idx, + axis=axis, + num_labels=size, + # ddof=0, + # fill_value=fill_value, + # dtype=dtype, + ) + + +def nanstd(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None, ddof=0): + assert ddof != 0 + if np.issubdtype(array.dtype, np.int_): + array = array.astype(np.float64) + return numbagg.grouped.group_nanstd( + array, + group_idx, + axis=axis, + num_labels=size, + # ddof=0, + # fill_value=fill_value, + # dtype=dtype, + ) + + +nansum_of_squares = partial(_numbagg_wrapper, numbagg_func=numbagg.grouped.group_nansum_of_squares) +nanlen = partial(_numbagg_wrapper, numbagg_func=numbagg.grouped.group_nancount) +nanprod = partial(_numbagg_wrapper, numbagg_func=numbagg.grouped.group_nanprod) +nanfirst = partial(_numbagg_wrapper, numbagg_func=numbagg.grouped.group_nanfirst) +nanlast = partial(_numbagg_wrapper, 
numbagg_func=numbagg.grouped.group_nanlast) +# nanargmax = partial(_numbagg_wrapper, numbagg_func=numbagg.grouped.group_nanargmax) +# nanargmin = partial(_numbagg_wrapper, numbagg_func=numbagg.grouped.group_nanargmin) +nanmax = partial(_numbagg_wrapper, numbagg_func=numbagg.grouped.group_nanmax) +nanmin = partial(_numbagg_wrapper, numbagg_func=numbagg.grouped.group_nanmin) +any = partial(_numbagg_wrapper, numbagg_func=numbagg.grouped.group_nanany) +all = partial(_numbagg_wrapper, numbagg_func=numbagg.grouped.group_nanall) + +# sum = nansum +# mean = nanmean +# sum_of_squares = nansum_of_squares diff --git a/flox/aggregations.py b/flox/aggregations.py index c06ef3509..ec696da53 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -62,12 +62,28 @@ def generic_aggregate( except AttributeError: method = get_npg_aggregation(func, engine="numpy") + elif engine == "numbagg": + from . import aggregate_numbagg + + try: + if ( + # numabgg hardcodes ddof=1 + ("var" in func or "std" in func) + and kwargs.get("ddof", 0) == 0 + ): + method = get_npg_aggregation(func, engine="numpy") + + else: + method = getattr(aggregate_numbagg, func) + except AttributeError: + method = get_npg_aggregation(func, engine="numpy") + elif engine in ["numpy", "numba"]: method = get_npg_aggregation(func, engine=engine) else: raise ValueError( - f"Expected engine to be one of ['flox', 'numpy', 'numba']. Received {engine} instead." + f"Expected engine to be one of ['flox', 'numpy', 'numba', 'numbagg']. Received {engine} instead." ) group_idx = np.asarray(group_idx, like=array) diff --git a/flox/core.py b/flox/core.py index 484303295..e5518b551 100644 --- a/flox/core.py +++ b/flox/core.py @@ -67,7 +67,7 @@ T_AxesOpt = Union[T_Axis, T_Axes, None] T_Dtypes = Union[np.typing.DTypeLike, Sequence[np.typing.DTypeLike], None] T_FillValues = Union[np.typing.ArrayLike, Sequence[np.typing.ArrayLike], None] - T_Engine = Literal["flox", "numpy", "numba"] + T_Engine = Literal["flox", "numpy", "numba", "numbagg"] T_Method = Literal["map-reduce", "blockwise", "cohorts"] T_IsBins = Union[bool | Sequence[bool]] @@ -1831,7 +1831,7 @@ def groupby_reduce( (for 1D ``by`` only). * ``"split-reduce"``: Same as "cohorts" and will be removed soon. - engine : {"flox", "numpy", "numba"}, optional + engine : {"flox", "numpy", "numba", "numbagg"}, optional Algorithm to compute the groupby reduction on non-dask arrays and on each dask chunk: * ``"numpy"``: Use the vectorized implementations in ``numpy_groupies.aggregate_numpy``. @@ -1843,6 +1843,9 @@ def groupby_reduce( for a reduction that is not yet implemented. * ``"numba"``: Use the implementations in ``numpy_groupies.aggregate_numba``. + * ``"numbagg"``: + Use the reductions supported by ``numbagg.grouped``. This will fall back to ``numpy_groupies.aggregate_numpy`` + for a reduction that is not yet implemented. reindex : bool, optional Whether to "reindex" the blockwise results to ``expected_groups`` (possibly automatically detected). If True, the intermediate result of the blockwise groupby-reduction has a value for all expected groups, @@ -1870,6 +1873,14 @@ def groupby_reduce( "Try engine='numpy' or engine='numba' instead." ) + if engine == "numbagg" and dtype is not None: + raise NotImplementedError( + "numbagg does not support the `dtype` kwarg. Either cast your " + "input arguments to `dtype` or use a different `engine`: " + "'flox' or 'numpy' or 'numba'. " + "See https://github.com/numbagg/numbagg/issues/121." 
+ ) + if func == "quantile" and (finalize_kwargs is None or "q" not in finalize_kwargs): raise ValueError("Please pass `q` for quantile calculations.") @@ -1878,6 +1889,23 @@ def groupby_reduce( by_is_dask = tuple(is_duck_dask_array(b) for b in bys) any_by_dask = any(by_is_dask) + if ( + engine == "numbagg" + and _is_arg_reduction(func) + and (any_by_dask or is_duck_dask_array(array)) + ): + # There is only one test that fails, but I can't figure + # out why without deep debugging. + # just disable for now. + # test_groupby_reduce_axis_subset_against_numpy + # for array is 3D dask, by is 3D dask, axis=2 + # We are falling back to numpy for the arg reduction, + # so presumably something is going wrong + raise NotImplementedError( + "argreductions not supported for engine='numbagg' yet." + "Try engine='numpy' or engine='numba' instead." + ) + if method in ["split-reduce", "cohorts"] and any_by_dask: raise ValueError(f"method={method!r} can only be used when grouping by numpy arrays.") @@ -2019,7 +2047,7 @@ def groupby_reduce( if agg.chunk[0] is None and method != "blockwise": raise NotImplementedError( f"Aggregation {agg.name!r} is only implemented for dask arrays when method='blockwise'." - f"\n\n Received: {func}" + f"Received method={method!r}" ) if method in ["blockwise", "cohorts"] and nax != by_.ndim: diff --git a/flox/xarray.py b/flox/xarray.py index eb35da387..7e8c4f2b1 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -148,6 +148,9 @@ def xarray_reduce( for a reduction that is not yet implemented. * ``"numba"``: Use the implementations in ``numpy_groupies.aggregate_numba``. + * ``"numbagg"``: + Use the reductions supported by ``numbagg.grouped``. This will fall back to ``numpy_groupies.aggregate_numpy`` + for a reduction that is not yet implemented. keep_attrs : bool, optional Preserve attrs? skipna : bool, optional diff --git a/pyproject.toml b/pyproject.toml index e387657d2..5ca8e0317 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ repository = "https://github.com/xarray-contrib/flox.git" changelog = "https://github.com/xarray-contrib/flox/releases" [project.optional-dependencies] -all = ["cachey", "dask", "numba", "xarray"] +all = ["cachey", "dask", "numba", "numbagg", "xarray"] test = ["netCDF4"] [build-system] @@ -109,11 +109,13 @@ warn_unused_ignores = true [[tool.mypy.overrides]] module=[ + "asv_runner.*", "cachey", "cftime", "dask.*", "importlib_metadata", "numba", + "numbagg.*", "numpy_groupies.*", "matplotlib.*", "pandas", diff --git a/tests/__init__.py b/tests/__init__.py index f1c8ec6bf..f46319172 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -47,6 +47,7 @@ def LooseVersion(vstring): has_dask, requires_dask = _importorskip("dask") has_numba, requires_numba = _importorskip("numba") +has_numbagg, requires_numbagg = _importorskip("numbagg") has_scipy, requires_scipy = _importorskip("scipy") has_xarray, requires_xarray = _importorskip("xarray") diff --git a/tests/conftest.py b/tests/conftest.py index eb5971784..504564b5a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,16 @@ import pytest -from . import requires_numba +from . 
import requires_numba, requires_numbagg @pytest.fixture( - scope="module", params=["flox", "numpy", pytest.param("numba", marks=requires_numba)] + scope="module", + params=[ + "flox", + "numpy", + pytest.param("numba", marks=requires_numba), + pytest.param("numbagg", marks=requires_numbagg), + ], ) def engine(request): return request.param diff --git a/tests/test_core.py b/tests/test_core.py index 440431304..09ee50902 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -225,7 +225,9 @@ def gen_array_by(size, func): @pytest.mark.parametrize("add_nan_by", [True, False]) @pytest.mark.parametrize("func", ALL_FUNCS) def test_groupby_reduce_all(nby, size, chunks, func, add_nan_by, engine): - if ("arg" in func and engine == "flox") or (func in BLOCKWISE_FUNCS and chunks != -1): + if ("arg" in func and engine in ["flox", "numbagg"]) or ( + func in BLOCKWISE_FUNCS and chunks != -1 + ): pytest.skip() array, by = gen_array_by(size, func) @@ -424,7 +426,7 @@ def test_groupby_agg_dask(func, shape, array_chunks, group_chunks, add_nan, dtyp if func in ["first", "last"] or func in BLOCKWISE_FUNCS: pytest.skip() - if "arg" in func and (engine == "flox" or reindex): + if "arg" in func and (engine in ["flox", "numbagg"] or reindex): pytest.skip() rng = np.random.default_rng(12345) @@ -576,7 +578,7 @@ def test_first_last_disallowed_dask(func): "axis", [None, (0, 1, 2), (0, 1), (0, 2), (1, 2), 0, 1, 2, (0,), (1,), (2,)] ) def test_groupby_reduce_axis_subset_against_numpy(func, axis, engine): - if ("arg" in func and engine == "flox") or func in BLOCKWISE_FUNCS: + if ("arg" in func and engine in ["flox", "numbagg"]) or func in BLOCKWISE_FUNCS: pytest.skip() if not isinstance(axis, int): @@ -893,6 +895,9 @@ def test_fill_value_behaviour(func, chunks, fill_value, engine): @pytest.mark.parametrize("func", ["mean", "sum"]) @pytest.mark.parametrize("dtype", ["float32", "float64", "int32", "int64"]) def test_dtype_preservation(dtype, func, engine): + if engine == "numbagg": + # https://github.com/numbagg/numbagg/issues/121 + pytest.skip() if func == "sum" or (func == "mean" and "float" in dtype): expected = np.dtype(dtype) elif func == "mean" and "int" in dtype: @@ -931,7 +936,7 @@ def test_cohorts_map_reduce_consistent_dtypes(method, dtype, labels_dtype): @pytest.mark.parametrize("method", ["blockwise", "cohorts", "map-reduce"]) def test_cohorts_nd_by(func, method, axis, engine): if ( - ("arg" in func and (axis is None or engine == "flox")) + ("arg" in func and (axis is None or engine in ["flox", "numbagg"])) or (method != "blockwise" and func in BLOCKWISE_FUNCS) or (axis is None and ("first" in func or "last" in func)) ): @@ -1270,6 +1275,9 @@ def grouped_median(group_idx, array, *, axis=-1, size=None, fill_value=None, dty @pytest.mark.parametrize("func", ALL_FUNCS) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) def test_dtype(func, dtype, engine): + if engine == "numbagg": + # https://github.com/numbagg/numbagg/issues/121 + pytest.skip() if "arg" in func or func in ["any", "all"]: pytest.skip() diff --git a/tests/test_xarray.py b/tests/test_xarray.py index 8f006e5f3..116937052 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -466,6 +466,10 @@ def test_alignment_error(): @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("chunk", (pytest.param(True, marks=requires_dask), False)) def test_dtype(add_nan, chunk, dtype, dtype_out, engine): + if engine == "numbagg": + # https://github.com/numbagg/numbagg/issues/121 + pytest.skip() + xp = dask.array if 
chunk else np data = xp.linspace(0, 1, 48, dtype=dtype).reshape((4, 12)) From a897034b02e1c1445a8b7a25535120205d7df4d0 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 10 Oct 2023 18:58:47 -0600 Subject: [PATCH 17/24] Significantly faster cohorts detection. (#272) * Significantly faster cohorts detection. * cleanup * add benchmark * update types * fix * single chunk optimization * more optimization * more optimization * add comment * fix benchmark --- asv_bench/benchmarks/cohorts.py | 14 +++++++++++++- flox/core.py | 21 +++++++++++++++------ 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/asv_bench/benchmarks/cohorts.py b/asv_bench/benchmarks/cohorts.py index dbfbe8cd5..21707d448 100644 --- a/asv_bench/benchmarks/cohorts.py +++ b/asv_bench/benchmarks/cohorts.py @@ -1,8 +1,10 @@ import dask import numpy as np import pandas as pd +import xarray as xr import flox +from flox.xarray import xarray_reduce class Cohorts: @@ -12,7 +14,7 @@ def setup(self, *args, **kwargs): raise NotImplementedError def time_find_group_cohorts(self): - flox.core.find_group_cohorts(self.by, self.array.chunks) + flox.core.find_group_cohorts(self.by, [self.array.chunks[ax] for ax in self.axis]) # The cache clear fails dependably in CI # Not sure why try: @@ -125,3 +127,13 @@ class PerfectMonthlyRechunked(PerfectMonthly): def setup(self, *args, **kwargs): super().setup() super().rechunk() + + +def time_cohorts_era5_single(): + TIME = 900 # 92044 in Google ARCO ERA5 + da = xr.DataArray( + dask.array.ones((TIME, 721, 1440), chunks=(1, -1, -1)), + dims=("time", "lat", "lon"), + coords=dict(time=pd.date_range("1959-01-01", freq="6H", periods=TIME)), + ) + xarray_reduce(da, da.time.dt.day, method="cohorts", func="any") diff --git a/flox/core.py b/flox/core.py index e5518b551..e2784ae99 100644 --- a/flox/core.py +++ b/flox/core.py @@ -208,15 +208,20 @@ def find_group_cohorts(labels, chunks, merge: bool = True) -> dict: # 1. First subset the array appropriately axis = range(-labels.ndim, 0) # Easier to create a dask array and use the .blocks property - array = dask.array.ones(tuple(sum(c) for c in chunks), chunks=chunks) + array = dask.array.empty(tuple(sum(c) for c in chunks), chunks=chunks) labels = np.broadcast_to(labels, array.shape[-labels.ndim :]) # Iterate over each block and create a new block of same shape with "chunk number" shape = tuple(array.blocks.shape[ax] for ax in axis) - blocks = np.empty(math.prod(shape), dtype=object) - for idx, block in enumerate(array.blocks.ravel()): - blocks[idx] = np.full(tuple(block.shape[ax] for ax in axis), idx) - which_chunk = np.block(blocks.reshape(shape).tolist()).reshape(-1) + # Use a numpy object array to enable assignment in the loop + # TODO: is it possible to just use a nested list? + # That is what we need for `np.block` + blocks = np.empty(shape, dtype=object) + array_chunks = tuple(np.array(c) for c in array.chunks) + for idx, blockindex in enumerate(np.ndindex(array.numblocks)): + chunkshape = tuple(c[i] for c, i in zip(array_chunks, blockindex)) + blocks[blockindex] = np.full(chunkshape, idx) + which_chunk = np.block(blocks.tolist()).reshape(-1) raveled = labels.reshape(-1) # these are chunks where a label is present @@ -229,7 +234,11 @@ def invert(x) -> tuple[np.ndarray, ...]: chunks_cohorts = tlz.groupby(invert, label_chunks.keys()) - if merge: + # If our dataset has chunksize one along the axis, + # then no merging is possible. 
+ single_chunks = all((ac == 1).all() for ac in array_chunks) + + if merge and not single_chunks: # First sort by number of chunks occupied by cohort sorted_chunks_cohorts = dict( sorted(chunks_cohorts.items(), key=lambda kv: len(kv[0]), reverse=True) From 0e1b0d8e46471f5eb534f1a018b9b9a40a32750b Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 11 Oct 2023 11:41:15 -0600 Subject: [PATCH 18/24] repo-review comments (#270) * repo-review comments xref #264 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update pyproject.toml --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- .readthedocs.yml | 18 ++++++++++++++++++ flox/visualize.py | 2 +- pyproject.toml | 5 ++++- 4 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 .readthedocs.yml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c36fa38cf..f7e20ea85 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ repos: rev: 'v0.0.292' hooks: - id: ruff - args: ["--fix"] + args: ["--fix", "--show-fixes"] - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.4.0 @@ -17,7 +17,7 @@ repos: - id: end-of-file-fixer - id: check-docstring-first - - repo: https://github.com/psf/black + - repo: https://github.com/psf/black-pre-commit-mirror rev: 23.9.1 hooks: - id: black diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 000000000..7e4965745 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,18 @@ +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.11" +sphinx: + configuration: docs/conf.py + +python: + install: + - method: pip + path: . + extra_requirements: + - docs diff --git a/flox/visualize.py b/flox/visualize.py index 7d44c7d91..17c0a6596 100644 --- a/flox/visualize.py +++ b/flox/visualize.py @@ -179,7 +179,7 @@ def _visualize_cohorts(by, cohorts, ax=None): def visualize_groups_2d(labels, y0=0, **kwargs): colors = mpl.cm.tab10_r - for i, chunk in enumerate(labels): + for _i, chunk in enumerate(labels): chunk = np.atleast_2d(chunk) draw_mesh( *chunk.shape, diff --git a/pyproject.toml b/pyproject.toml index 5ca8e0317..721661c91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,6 @@ requires = [ "numpy_groupies>=0.9.19", "toolz", "setuptools>=61.0.0", - "wheel", "setuptools_scm[toml]>=7.0", ] build-backend = "setuptools.build_meta" @@ -77,6 +76,8 @@ ignore = [ "E731", ] select = [ + # Bugbear + # "B", # Pyflakes "F", # Pycodestyle @@ -127,6 +128,8 @@ ignore_missing_imports = true [tool.pytest.ini_options] addopts = "--tb=short" +minversion = "7" +testpaths = ["tests"] [tool.codespell] From 9dd126cbaa285a4a3a2ef60726d69767888104cb Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 11 Oct 2023 16:40:35 -0600 Subject: [PATCH 19/24] benchmarks updates (#273) * Switch to mamba * Switch to mamba * fix/ * Add mamba to env * reduce * fix? * fix skipping * Speedup? * Remove custom bench env yaml * [skip-ci] fix bench env * [skip-ci]fix? * [skip-ci] again * Revert "[skip-ci] again" This reverts commit b440a4eaa9a281beaea629ac28e305846c220f6a. * Revert "[skip-ci]fix?" This reverts commit f045a6403b3d9bbc8e176a087e1924ab9f81058a. * Revert "[skip-ci] fix bench env" This reverts commit 2b5add3380ea4a74518db3c43b188d6f1aaa3cc3. 
* Revert "Remove custom bench env yaml" This reverts commit 518ff1a562523db3369526116548cd0b2581f5a3. * add back custom bench env * fix reduce bare with 2D arrays * try avoiding env file again * small cleanups * try again --- .github/workflows/benchmarks.yml | 20 +++----- asv_bench/asv.conf.json | 75 ++---------------------------- asv_bench/benchmarks/reduce.py | 80 ++++++++++++++++---------------- ci/benchmark.yml | 15 ++++++ 4 files changed, 66 insertions(+), 124 deletions(-) create mode 100644 ci/benchmark.yml diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 5571d27af..d8c39cdfd 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -24,18 +24,13 @@ jobs: - name: Set up conda environment uses: mamba-org/setup-micromamba@v1 with: - environment-file: ci/environment.yml - environment-name: flox-tests + environment-name: flox-bench + create-args: >- + python=3.10 + asv + mamba init-shell: bash cache-environment: true - # create-args: | - # python="${{ matrix.python-version }}" - - # - name: Setup some dependencies - # shell: bash -l {0} - # run: | - # pip install asv - # sudo apt-get update -y - name: Run benchmarks shell: bash -l {0} @@ -47,14 +42,11 @@ jobs: ASV_FACTOR: 1.5 ASV_SKIP_SLOW: 1 run: | - set -x + # set -x # ID this runner asv machine --yes echo "Baseline: ${{ github.event.pull_request.base.sha }} (${{ github.event.pull_request.base.label }})" echo "Contender: ${GITHUB_SHA} (${{ github.event.pull_request.head.label }})" - # Use mamba for env creation - # export CONDA_EXE=$(which mamba) - export CONDA_EXE=$(which conda) # Run benchmarks for current commit against base ASV_OPTIONS="--split --show-stderr --factor $ASV_FACTOR" asv continuous $ASV_OPTIONS ${{ github.event.pull_request.base.sha }} ${GITHUB_SHA} \ diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index cab53382a..cfcbdf230 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -31,21 +31,8 @@ // List of branches to benchmark. If not provided, defaults to "master" // (for git) or "default" (for mercurial). "branches": ["main"], // for git - // "branches": ["default"], // for mercurial - - // The DVCS being used. If not set, it will be automatically - // determined from "repo" by looking at the protocol in the URL - // (if remote), or by looking for special directories, such as - // ".git" (if local). "dvcs": "git", - // The tool to use to create environments. May be "conda", - // "virtualenv" or other value depending on the plugins in use. - // If missing or the empty string, the tool will be automatically - // determined by looking for tools on the PATH environment - // variable. - "environment_type": "conda", - // timeout in seconds for installing any dependencies in environment // defaults to 10 min "install_timeout": 600, @@ -55,63 +42,11 @@ // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. - "pythons": ["3.9"], - - // The list of conda channel names to be searched for benchmark - // dependency packages in the specified order - "conda_channels": ["conda-forge", "nodefaults"], - - // The matrix of dependencies to test. Each key is the name of a - // package (in PyPI) and the values are version numbers. An empty - // list or empty string indicates to just test against the default - // (latest) version. null indicates that the package is to not be - // installed. 
If the package to be tested is only available from - // PyPi, and the 'environment_type' is conda, then you can preface - // the package name by 'pip+', and the package will be installed via - // pip (with all the conda available packages installed first, - // followed by the pip installed packages). - // - "matrix": { - "numbagg": [""], - "numpy_groupies": [""], - "numpy": [""], - "pandas": [""], - "dask-core": [""], - "xarray": [""], - }, - - // Combinations of libraries/python versions can be excluded/included - // from the set to test. Each entry is a dictionary containing additional - // key-value pairs to include/exclude. - // - // An exclude entry excludes entries where all values match. The - // values are regexps that should match the whole string. - // - // An include entry adds an environment. Only the packages listed - // are installed. The 'python' key is required. The exclude rules - // do not apply to includes. - // - // In addition to package names, the following keys are available: - // - // - python - // Python version, as in the *pythons* variable above. - // - environment_type - // Environment type, as above. - // - sys_platform - // Platform, as in sys.platform. Possible values for the common - // cases: 'linux2', 'win32', 'cygwin', 'darwin'. - // - // "exclude": [ - // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows - // {"environment_type": "conda", "six": null}, // don't run without six on conda - // ], - // - // "include": [ - // // additional env for python2.7 - // {"python": "2.7", "numpy": "1.8"}, - // // additional env if run on windows+conda - // {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""}, - // ], + // "pythons": ["3.9"], + + "environment_type": "mamba", + "conda_channels": ["conda-forge"], + "conda_environment_file": "../ci/benchmark.yml", // The directory (relative to the current directory) that benchmarks are // stored in. 
If not provided, defaults to "benchmarks" diff --git a/asv_bench/benchmarks/reduce.py b/asv_bench/benchmarks/reduce.py index add77c182..326b73566 100644 --- a/asv_bench/benchmarks/reduce.py +++ b/asv_bench/benchmarks/reduce.py @@ -6,20 +6,20 @@ import flox.aggregations N = 3000 -funcs = ["sum", "nansum", "mean", "nanmean", "max", "nanmax", "var", "count", "all"] +funcs = ["sum", "nansum", "mean", "nanmean", "max", "nanmax", "count"] engines = ["flox", "numpy", "numbagg"] expected_groups = { "None": None, - "RangeIndex": pd.RangeIndex(5), "bins": pd.IntervalIndex.from_breaks([1, 2, 4]), } expected_names = tuple(expected_groups) NUMBAGG_FUNCS = ["nansum", "nanmean", "nanmax", "count", "all"] - -numbagg_skip = [ - (func, expected_names[0], "numbagg") for func in funcs if func not in NUMBAGG_FUNCS -] + [(func, expected_names[1], "numbagg") for func in funcs if func not in NUMBAGG_FUNCS] +numbagg_skip = [] +for name in expected_names: + numbagg_skip.extend( + list((func, expected_names[0], "numbagg") for func in funcs if func not in NUMBAGG_FUNCS) + ) def setup_jit(): @@ -42,7 +42,7 @@ class ChunkReduce: """Time the core reduction function.""" min_run_count = 5 - warmup_time = 1 + warmup_time = 0.5 def setup(self, *args, **kwargs): raise NotImplementedError @@ -59,18 +59,6 @@ def time_reduce(self, func, expected_name, engine): expected_groups=expected_groups[expected_name], ) - @parameterize({"func": ["nansum", "nanmean", "nanmax", "count"], "engine": engines}) - def time_reduce_bare(self, func, engine): - flox.aggregations.generic_aggregate( - self.labels, - self.array, - axis=-1, - size=5, - func=func, - engine=engine, - fill_value=0, - ) - @skip_for_params(numbagg_skip) @parameterize({"func": funcs, "expected_name": expected_names, "engine": engines}) def peakmem_reduce(self, func, expected_name, engine): @@ -92,13 +80,18 @@ def setup(self, *args, **kwargs): if "numbagg" in args: setup_jit() - -class ChunkReduce1DUnsorted(ChunkReduce): - def setup(self, *args, **kwargs): - self.array = np.ones((N,)) - self.labels = np.random.permutation(np.repeat(np.arange(5), repeats=N // 5)) - self.axis = -1 - setup_jit() + @parameterize({"func": ["nansum", "nanmean", "nanmax", "count"], "engine": engines}) + def time_reduce_bare(self, func, engine): + # TODO: migrate to the other test cases, but we'll have to setup labels + # appropriately ;( + flox.aggregations.generic_aggregate( + self.labels, + self.array, + axis=self.axis, + func=func, + engine=engine, + fill_value=0, + ) class ChunkReduce2D(ChunkReduce): @@ -109,14 +102,6 @@ def setup(self, *args, **kwargs): setup_jit() -class ChunkReduce2DUnsorted(ChunkReduce): - def setup(self, *args, **kwargs): - self.array = np.ones((N, N)) - self.labels = np.random.permutation(np.repeat(np.arange(N // 5), repeats=5)) - self.axis = -1 - setup_jit() - - class ChunkReduce2DAllAxes(ChunkReduce): def setup(self, *args, **kwargs): self.array = np.ones((N, N)) @@ -125,9 +110,24 @@ def setup(self, *args, **kwargs): setup_jit() -class ChunkReduce2DAllAxesUnsorted(ChunkReduce): - def setup(self, *args, **kwargs): - self.array = np.ones((N, N)) - self.labels = np.random.permutation(np.repeat(np.arange(N // 5), repeats=5)) - self.axis = None - setup_jit() +# class ChunkReduce2DUnsorted(ChunkReduce): +# def setup(self, *args, **kwargs): +# self.array = np.ones((N, N)) +# self.labels = np.random.permutation(np.repeat(np.arange(N // 5), repeats=5)) +# self.axis = -1 +# setup_jit() + +# class ChunkReduce1DUnsorted(ChunkReduce): +# def setup(self, *args, **kwargs): +# self.array 
= np.ones((N,)) +# self.labels = np.random.permutation(np.repeat(np.arange(5), repeats=N // 5)) +# self.axis = -1 +# setup_jit() + + +# class ChunkReduce2DAllAxesUnsorted(ChunkReduce): +# def setup(self, *args, **kwargs): +# self.array = np.ones((N, N)) +# self.labels = np.random.permutation(np.repeat(np.arange(N // 5), repeats=5)) +# self.axis = None +# setup_jit() diff --git a/ci/benchmark.yml b/ci/benchmark.yml new file mode 100644 index 000000000..e4987000b --- /dev/null +++ b/ci/benchmark.yml @@ -0,0 +1,15 @@ +name: flox-bench +channels: + - conda-forge +dependencies: + - asv + - cachey + - dask-core + - numpy>=1.20 + - mamba + - pip + - python=3.10 + - xarray + - numpy_groupies>=0.9.19 + - numbagg>=0.3 + - wheel From a2b46b0dd786f431a3fa39960a65aa734ad0ee6f Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 14 Oct 2023 15:13:38 -0600 Subject: [PATCH 20/24] Cleanups (#276) - prettier - stricter mypy - remove split-reduce --- .github/workflows/benchmarks.yml | 2 +- .github/workflows/ci.yaml | 20 +- .github/workflows/pypi.yaml | 2 +- .gitignore | 1 + .pre-commit-config.yaml | 101 +- asv_bench/asv.conf.json | 192 +- asv_bench/benchmarks/README_CI.md | 2 +- ci/benchmark.yml | 2 +- ci/docs.yml | 4 +- ci/environment.yml | 7 +- ci/minimal-requirements.yml | 2 +- ci/no-dask.yml | 2 +- ci/no-numba.yml | 24 + ci/no-xarray.yml | 2 +- ci/upstream-dev-env.yml | 10 +- codecov.yml | 6 +- docs/source/_static/style.css | 4 +- docs/source/engines.md | 2 +- docs/source/implementation.md | 12 +- docs/source/index.md | 4 +- .../user-stories/hourly-climatology.html | 15025 +++++++++++++++- flox/aggregations.py | 2 +- flox/cache.py | 2 +- flox/core.py | 39 +- flox/xarray.py | 6 +- flox/xrutils.py | 2 +- pyproject.toml | 4 +- readthedocs.yml | 6 +- tests/__init__.py | 4 +- tests/test_core.py | 2 +- 30 files changed, 15194 insertions(+), 299 deletions(-) create mode 100644 ci/no-numba.yml diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index d8c39cdfd..3a4215d40 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -8,7 +8,7 @@ on: jobs: benchmark: # if: ${{ contains( github.event.pull_request.labels.*.name, 'run-benchmark') && github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch' }} # Run if the PR has been labelled correctly. - if: ${{ github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch' }} # Always run. + if: ${{ github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch' }} # Always run. 
name: Linux runs-on: ubuntu-20.04 env: diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b963d671d..51777368c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -59,20 +59,20 @@ jobs: optional-deps: name: ${{ matrix.env }} - runs-on: ${{ matrix.os }} + runs-on: "ubuntu-latest" defaults: run: shell: bash -l {0} strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] - env: - [ - "no-xarray", - "no-dask", - "minimal-requirements", - ] + python-version: ["3.11"] + env: ["no-xarray", "no-dask"] + include: + - env: "no-numba" + python-version: "3.12" + - env: "minimal-requirements" + python-version: "3.9" steps: - uses: actions/checkout@v4 with: @@ -80,7 +80,7 @@ jobs: - name: Set up conda environment uses: mamba-org/setup-micromamba@v1 with: - environment-file: ci/environment.yml + environment-file: ci/${{ matrix.env }}.yml environment-name: flox-tests init-shell: bash cache-environment: true @@ -110,7 +110,7 @@ jobs: steps: - uses: actions/checkout@v4 with: - repository: 'pydata/xarray' + repository: "pydata/xarray" fetch-depth: 0 # Fetch all history for all branches and tags. - name: Set up conda environment uses: mamba-org/setup-micromamba@v1 diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 93891fcd7..1cd8b20f2 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -12,7 +12,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.x' + python-version: "3.x" - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.gitignore b/.gitignore index dbfcb2ef4..04f8d580d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ docs/source/generated/ html/ .asv/ +asv_bench/pkgs/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f7e20ea85..080a08bab 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,56 +1,61 @@ ci: - autoupdate_schedule: quarterly + autoupdate_schedule: quarterly repos: - - repo: https://github.com/astral-sh/ruff-pre-commit - # Ruff version. - rev: 'v0.0.292' - hooks: - - id: ruff - args: ["--fix", "--show-fixes"] - - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 - hooks: - - id: check-yaml - - id: trailing-whitespace - - id: end-of-file-fixer - - id: check-docstring-first - - - repo: https://github.com/psf/black-pre-commit-mirror - rev: 23.9.1 - hooks: - - id: black - - - repo: https://github.com/executablebooks/mdformat - rev: 0.7.17 - hooks: + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. 
+ rev: "v0.0.292" + hooks: + - id: ruff + args: ["--fix", "--show-fixes"] + + - repo: https://github.com/pre-commit/mirrors-prettier + rev: "v3.0.3" + hooks: + - id: prettier + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-yaml + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-docstring-first + + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 23.9.1 + hooks: + - id: black + + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.17 + hooks: - id: mdformat additional_dependencies: - mdformat-black - mdformat-myst - - repo: https://github.com/nbQA-dev/nbQA - rev: 1.7.0 - hooks: - - id: nbqa-black - - id: nbqa-ruff - args: [--fix] - - - repo: https://github.com/kynan/nbstripout - rev: 0.6.1 - hooks: - - id: nbstripout - args: [--extra-keys=metadata.kernelspec metadata.language_info.version] - - - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 - hooks: - - id: codespell - additional_dependencies: - - tomli - - - repo: https://github.com/abravalheri/validate-pyproject - rev: v0.14 - hooks: - - id: validate-pyproject + - repo: https://github.com/nbQA-dev/nbQA + rev: 1.7.0 + hooks: + - id: nbqa-black + - id: nbqa-ruff + args: [--fix] + + - repo: https://github.com/kynan/nbstripout + rev: 0.6.1 + hooks: + - id: nbstripout + args: [--extra-keys=metadata.kernelspec metadata.language_info.version] + + - repo: https://github.com/codespell-project/codespell + rev: v2.2.6 + hooks: + - id: codespell + additional_dependencies: + - tomli + + - repo: https://github.com/abravalheri/validate-pyproject + rev: v0.15 + hooks: + - id: validate-pyproject diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index cfcbdf230..207572fce 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -1,98 +1,98 @@ { - // The version of the config file format. Do not change, unless - // you know what you are doing. - "version": 1, - - // The name of the project being benchmarked - "project": "flox", - - // The project's homepage - "project_url": "http://flox.readthedocs.io/", - - // The URL or local path of the source code repository for the - // project being benchmarked - "repo": "..", - - // The Python project's subdirectory in your repo. If missing or - // the empty string, the project is assumed to be located at the root - // of the repository. - // "repo_subdir": "", - - // Customizable commands for building, installing, and - // uninstalling the project. See asv.conf.json documentation. - // - // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"], - // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], - // "build_command": [ - // "python setup.py build", - // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" - // ], - - // List of branches to benchmark. If not provided, defaults to "master" - // (for git) or "default" (for mercurial). - "branches": ["main"], // for git - "dvcs": "git", - - // timeout in seconds for installing any dependencies in environment - // defaults to 10 min - "install_timeout": 600, - - // the base URL to show a commit for the project. - "show_commit_url": "http://github.com/xarray-contrib/flox/commit/", - - // The Pythons you'd like to test against. If not provided, defaults - // to the current version of Python used to run `asv`. 
- // "pythons": ["3.9"], - - "environment_type": "mamba", - "conda_channels": ["conda-forge"], - "conda_environment_file": "../ci/benchmark.yml", - - // The directory (relative to the current directory) that benchmarks are - // stored in. If not provided, defaults to "benchmarks" - "benchmark_dir": "benchmarks", - - // The directory (relative to the current directory) to cache the Python - // environments in. If not provided, defaults to "env" - "env_dir": ".asv/env", - - // The directory (relative to the current directory) that raw benchmark - // results are stored in. If not provided, defaults to "results". - "results_dir": ".asv/results", - - // The directory (relative to the current directory) that the html tree - // should be written to. If not provided, defaults to "html". - "html_dir": ".asv/html", - - // The number of characters to retain in the commit hashes. - // "hash_length": 8, - - // `asv` will cache results of the recent builds in each - // environment, making them faster to install next time. This is - // the number of builds to keep, per environment. - // "build_cache_size": 2, - - // The commits after which the regression search in `asv publish` - // should start looking for regressions. Dictionary whose keys are - // regexps matching to benchmark names, and values corresponding to - // the commit (exclusive) after which to start looking for - // regressions. The default is to start from the first commit - // with results. If the commit is `null`, regression detection is - // skipped for the matching benchmark. - // - // "regressions_first_commits": { - // "some_benchmark": "352cdf", // Consider regressions only after this commit - // "another_benchmark": null, // Skip regression detection altogether - // }, - - // The thresholds for relative change in results, after which `asv - // publish` starts reporting regressions. Dictionary of the same - // form as in ``regressions_first_commits``, with values - // indicating the thresholds. If multiple entries match, the - // maximum is taken. If no entry matches, the default is 5%. - // - // "regressions_thresholds": { - // "some_benchmark": 0.01, // Threshold of 1% - // "another_benchmark": 0.5, // Threshold of 50% - // }, + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "flox", + + // The project's homepage + "project_url": "http://flox.readthedocs.io/", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": "..", + + // The Python project's subdirectory in your repo. If missing or + // the empty string, the project is assumed to be located at the root + // of the repository. + // "repo_subdir": "", + + // Customizable commands for building, installing, and + // uninstalling the project. See asv.conf.json documentation. + // + // "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"], + // "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + // "build_command": [ + // "python setup.py build", + // "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" + // ], + + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "default" (for mercurial). 
+ "branches": ["main"], // for git + "dvcs": "git", + + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + "install_timeout": 600, + + // the base URL to show a commit for the project. + "show_commit_url": "http://github.com/xarray-contrib/flox/commit/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. + // "pythons": ["3.9"], + + "environment_type": "mamba", + "conda_channels": ["conda-forge"], + "conda_environment_file": "../ci/benchmark.yml", + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + "env_dir": ".asv/env", + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + "results_dir": ".asv/results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + "html_dir": ".asv/html" + + // The number of characters to retain in the commit hashes. + // "hash_length": 8, + + // `asv` will cache results of the recent builds in each + // environment, making them faster to install next time. This is + // the number of builds to keep, per environment. + // "build_cache_size": 2, + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. + // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // }, + + // The thresholds for relative change in results, after which `asv + // publish` starts reporting regressions. Dictionary of the same + // form as in ``regressions_first_commits``, with values + // indicating the thresholds. If multiple entries match, the + // maximum is taken. If no entry matches, the default is 5%. + // + // "regressions_thresholds": { + // "some_benchmark": 0.01, // Threshold of 1% + // "another_benchmark": 0.5, // Threshold of 50% + // }, } diff --git a/asv_bench/benchmarks/README_CI.md b/asv_bench/benchmarks/README_CI.md index f306736ab..a2e04e3ac 100644 --- a/asv_bench/benchmarks/README_CI.md +++ b/asv_bench/benchmarks/README_CI.md @@ -13,7 +13,7 @@ The `asv` suite can be run for any PR on GitHub Actions (check workflow `.github We use `asv continuous` to run the job, which runs a relative performance measurement. This means that there's no state to be saved and that regressions are only caught in terms of performance ratio (absolute numbers are available but they are not useful since we do not use stable hardware over time). `asv continuous` will: - Compile `scikit-image` for _both_ commits. We use `ccache` to speed up the process, and `mamba` is used to create the build environments. -- Run the benchmark suite for both commits, _twice_ (since `processes=2` by default). 
+- Run the benchmark suite for both commits, _twice_ (since `processes=2` by default). - Generate a report table with performance ratios: - `ratio=1.0` -> performance didn't change. - `ratio<1.0` -> PR made it slower. diff --git a/ci/benchmark.yml b/ci/benchmark.yml index e4987000b..80667a4f6 100644 --- a/ci/benchmark.yml +++ b/ci/benchmark.yml @@ -5,7 +5,7 @@ dependencies: - asv - cachey - dask-core - - numpy>=1.20 + - numpy>=1.21 - mamba - pip - python=3.10 diff --git a/ci/docs.yml b/ci/docs.yml index c3359e4f1..ad8feee08 100644 --- a/ci/docs.yml +++ b/ci/docs.yml @@ -5,7 +5,7 @@ dependencies: - dask-core - pip - xarray - - numpy>=1.20 + - numpy>=1.21 - numpydoc - numpy_groupies>=0.9.19 - toolz @@ -18,4 +18,4 @@ dependencies: - jupyter - sphinx-codeautolink - pip: - - -e .. + - -e .. diff --git a/ci/environment.yml b/ci/environment.yml index 33e1b4661..ca9111105 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -8,8 +8,8 @@ dependencies: - dask-core - netcdf4 - pandas - - numpy>=1.20 - - lxml # for mypy coverage report + - numpy>=1.21 + - lxml # for mypy coverage report - matplotlib - pip - pytest @@ -18,10 +18,9 @@ dependencies: - pytest-xdist - xarray - pre-commit + - numbagg>=0.3 - numpy_groupies>=0.9.19 - pooch - toolz - numba - scipy - - pip: - - numbagg>=0.3 diff --git a/ci/minimal-requirements.yml b/ci/minimal-requirements.yml index eeb075194..40b287f64 100644 --- a/ci/minimal-requirements.yml +++ b/ci/minimal-requirements.yml @@ -9,7 +9,7 @@ dependencies: - pytest-cov - pytest-pretty - pytest-xdist - - numpy==1.20 + - numpy==1.21 - numpy_groupies==0.9.19 - pandas - pooch diff --git a/ci/no-dask.yml b/ci/no-dask.yml index 8876a84cf..f74d522b6 100644 --- a/ci/no-dask.yml +++ b/ci/no-dask.yml @@ -5,7 +5,7 @@ dependencies: - codecov - netcdf4 - pandas - - numpy>=1.20 + - numpy>=1.21 - pip - pytest - pytest-cov diff --git a/ci/no-numba.yml b/ci/no-numba.yml new file mode 100644 index 000000000..1d846d117 --- /dev/null +++ b/ci/no-numba.yml @@ -0,0 +1,24 @@ +name: flox-tests +channels: + - conda-forge +dependencies: + - asv + - cachey + - codecov + - dask-core + - netcdf4 + - pandas + - numpy>=1.21 + - lxml # for mypy coverage report + - matplotlib + - pip + - pytest + - pytest-cov + - pytest-pretty + - pytest-xdist + - xarray + - pre-commit + - numpy_groupies>=0.9.19 + - pooch + - toolz + - scipy diff --git a/ci/no-xarray.yml b/ci/no-xarray.yml index 491d7ba8e..9eb75cf96 100644 --- a/ci/no-xarray.yml +++ b/ci/no-xarray.yml @@ -5,7 +5,7 @@ dependencies: - codecov - netcdf4 - pandas - - numpy>=1.20 + - numpy>=1.21 - pip - pytest - pytest-cov diff --git a/ci/upstream-dev-env.yml b/ci/upstream-dev-env.yml index 6b8e796ea..47e6202ff 100644 --- a/ci/upstream-dev-env.yml +++ b/ci/upstream-dev-env.yml @@ -14,8 +14,8 @@ dependencies: - pytest-xdist - pip - pip: - - git+https://github.com/pydata/xarray - - git+https://github.com/pandas-dev/pandas - - git+https://github.com/dask/dask - - git+https://github.com/ml31415/numpy-groupies - - git+https://github.com/numbagg/numbagg + - git+https://github.com/pydata/xarray + - git+https://github.com/pandas-dev/pandas + - git+https://github.com/dask/dask + - git+https://github.com/ml31415/numpy-groupies + - git+https://github.com/numbagg/numbagg diff --git a/codecov.yml b/codecov.yml index 90c354ae2..6228abe13 100644 --- a/codecov.yml +++ b/codecov.yml @@ -5,9 +5,9 @@ codecov: comment: false ignore: - - 'benchmarks/*.py' - - 'tests/*.py' - - 'setup.py' + - "benchmarks/*.py" + - "tests/*.py" + - "setup.py" coverage: precision: 2 diff --git 
a/docs/source/_static/style.css b/docs/source/_static/style.css index 5d9d45bb7..d97a63fad 100644 --- a/docs/source/_static/style.css +++ b/docs/source/_static/style.css @@ -4,7 +4,9 @@ padding-left: 1.25em; border-left: thin var(--color-foreground-muted) solid; } -.xr-array-wrap, .xr-var-data, .xr-var-preview { +.xr-array-wrap, +.xr-var-data, +.xr-var-preview { font-size: 0.9em; } .gp { diff --git a/docs/source/engines.md b/docs/source/engines.md index 68c65cab5..8a33c675c 100644 --- a/docs/source/engines.md +++ b/docs/source/engines.md @@ -16,7 +16,7 @@ See [](arrays) for more details. ## Tradeoffs -For the common case of reducing a nD array by a 1D array of group labels (e.g. `groupby("time.month")`), `engine="numbagg"` is almost always faster, and `engine="flox"` *can* be faster. +For the common case of reducing a nD array by a 1D array of group labels (e.g. `groupby("time.month")`), `engine="numbagg"` is almost always faster, and `engine="flox"` _can_ be faster. The reason is that `numpy_groupies` converts all groupby problems to a 1D problem, this can involve [some overhead](https://github.com/ml31415/numpy-groupies/pull/46). It is possible to optimize this a bit in `flox` or `numpy_groupies`, but the work has not been done yet. diff --git a/docs/source/implementation.md b/docs/source/implementation.md index 29d9faf46..9a66a6c8d 100644 --- a/docs/source/implementation.md +++ b/docs/source/implementation.md @@ -43,7 +43,7 @@ width: 100% --- ``` -The first step is to extract all members of a group, which involves a *lot* of +The first step is to extract all members of a group, which involves a _lot_ of communication and is quite expensive (in dataframe terminology, this is a "shuffle"). This is fundamentally why many groupby reductions don't work well right now with big datasets. @@ -129,7 +129,7 @@ width: 100% --- ``` -*Tradeoffs* +_Tradeoffs_ 1. Only works for certain groupings. 1. Group labels must be known at graph construction time, so this only works for numpy arrays @@ -146,14 +146,14 @@ width: 100% The `map-reduce` strategy is quite effective but can involve some unnecessary communication. It can be possible to exploit patterns in how group labels are distributed across chunks (similar to `method="blockwise"` above). Two cases are illustrative: -1. Groups labels can be *approximately-periodic*: e.g. `time.dayofyear` (period 365 or 366) or `time.month` (period 12). +1. Groups labels can be _approximately-periodic_: e.g. `time.dayofyear` (period 365 or 366) or `time.month` (period 12). Consider our earlier example, `groupby("time.month")` with monthly frequency data and chunksize of 4 along `time`. ![cohorts-schematic](/../diagrams/cohorts-month-chunk4.png) Because a chunksize of 4 evenly divides the number of groups (12) all we need to do is index out blocks 0, 3, 7 and then apply the `"map-reduce"` strategy to form the final result for months Jan-Apr. Repeat for the remaining groups of months (May-Aug; Sep-Dec) and then concatenate. -1. Groups can be *spatially localized* like the blockwise case above, for example grouping by country administrative boundaries like +1. Groups can be _spatially localized_ like the blockwise case above, for example grouping by country administrative boundaries like counties or districts. In this case, concatenating the result for the northwesternmost county or district and the southeasternmost district can involve a lot of wasteful communication (again depending on chunking). 
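A minimal sketch of the `"cohorts"` strategy described above may help here. The dataset shape, chunking, and variable names below are invented for illustration; only `xarray_reduce`, `func`, and `method="cohorts"` follow the flox API used elsewhere in this series (e.g. the ERA5-style benchmark): 6-hourly data with one day per chunk, so each day-of-year label recurs in a predictable cohort of chunks.

```python
import dask.array
import pandas as pd
import xarray as xr

from flox.xarray import xarray_reduce

# Toy stand-in for a climate-style dataset: 6-hourly timesteps, one day per
# chunk, so each day-of-year label occupies a small, predictable set of chunks.
time = pd.date_range("2000-01-01", freq="6H", periods=4 * 730)
da = xr.DataArray(
    dask.array.ones((len(time), 10, 10), chunks=(4, -1, -1)),
    dims=("time", "lat", "lon"),
    coords={"time": time},
)

# "cohorts" indexes out the blocks belonging to each group of labels and runs
# the map-reduce strategy on every cohort separately, instead of reindexing
# every block to all ~366 day-of-year groups up front.
climatology = xarray_reduce(da, da.time.dt.dayofyear, func="mean", method="cohorts")
```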
@@ -172,7 +172,7 @@ Consider our earlier example, `groupby("time.month")` with monthly frequency dat ![cohorts-schematic](/../diagrams/cohorts-month-chunk4.png) With `method="map-reduce", reindex=True`, each block will become 3x its original size at the blockwise step: input blocks have 4 timesteps while output block -has a value for all 12 months. Note that the blockwise groupby-reduction *does not reduce* the data since there is only one element in each +has a value for all 12 months. Note that the blockwise groupby-reduction _does not reduce_ the data since there is only one element in each group. In addition, since `map-reduce` will make the final result have only one chunk of size 12 along the new `month` dimension, the final result has chunk sizes 3x that of the input, which may not be ideal. @@ -184,7 +184,7 @@ remaining groups of months (May-Aug; Sep-Dec) and then concatenate. This is the We can generalize this idea for more complicated problems (inspired by the `split_out`kwarg in `dask.dataframe.groupby`) We first apply the groupby-reduction blockwise, then split and reindex blocks to create a new array with which we complete the reduction -using `map-reduce`. Because the split or shuffle step occurs after the blockwise reduction, we *sometimes* communicate a significantly smaller +using `map-reduce`. Because the split or shuffle step occurs after the blockwise reduction, we _sometimes_ communicate a significantly smaller amount of data than if we split or shuffled the input array. ```{image} /../diagrams/new-cohorts-annotated.svg diff --git a/docs/source/index.md b/docs/source/index.md index 9fdd470c2..353dddfbe 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -31,8 +31,8 @@ See a presentation ([video](https://discourse.pangeo.io/t/november-17-2021-flox- 1. {py:func}`flox.xarray.xarray_reduce` [extends](xarray.md) Xarray's GroupBy operations allowing lazy grouping by dask arrays, grouping by multiple arrays, as well as combining categorical grouping and histogram-style binning operations using multiple variables. 1. `flox` also provides utility functions for rechunking both dask arrays and Xarray objects along a single dimension using the group labels as a guide: - 1. To rechunk for blockwise operations: {py:func}`flox.rechunk_for_blockwise`, {py:func}`flox.xarray.rechunk_for_blockwise`. - 1. To rechunk so that "cohorts", or groups of labels, tend to occur in the same chunks: {py:func}`flox.rechunk_for_cohorts`, {py:func}`flox.xarray.rechunk_for_cohorts`. + 1. To rechunk for blockwise operations: {py:func}`flox.rechunk_for_blockwise`, {py:func}`flox.xarray.rechunk_for_blockwise`. + 1. To rechunk so that "cohorts", or groups of labels, tend to occur in the same chunks: {py:func}`flox.rechunk_for_cohorts`, {py:func}`flox.xarray.rechunk_for_cohorts`. ## Installing diff --git a/docs/source/user-stories/hourly-climatology.html b/docs/source/user-stories/hourly-climatology.html index fc78db18d..f626ee551 100644 --- a/docs/source/user-stories/hourly-climatology.html +++ b/docs/source/user-stories/hourly-climatology.html @@ -1,88 +1,14961 @@ - - - - - + - + + Dask Performance Report - - Dask Performance Report - - - - - - - - - - - - - - - + + + + - +
- - - - - - - + - + } + }, + 10, + root, + ); + } + })(window); + }); + }; + if (document.readyState != "loading") fn(); + else document.addEventListener("DOMContentLoaded", fn); + })(); + - diff --git a/flox/aggregations.py b/flox/aggregations.py index ec696da53..b2129c197 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -225,7 +225,7 @@ def __init__( "final": final_dtype, "intermediate": self._normalize_dtype_fill_value(dtypes, "dtype"), } - self.dtype: AggDtype = None # type: ignore + self.dtype: AggDtype = None # type: ignore[assignment] # The following are set by _initialize_aggregation self.finalize_kwargs: dict[Any, Any] = {} diff --git a/flox/cache.py b/flox/cache.py index dd6416aea..e43c540d6 100644 --- a/flox/cache.py +++ b/flox/cache.py @@ -9,4 +9,4 @@ memoize = partial(cache.memoize, key=dask.base.tokenize) except ImportError: cache = {} - memoize = lambda x: x # type: ignore + memoize = lambda x: x # type: ignore[assignment] diff --git a/flox/core.py b/flox/core.py index e2784ae99..482c8fac1 100644 --- a/flox/core.py +++ b/flox/core.py @@ -7,7 +7,7 @@ import sys import warnings from collections import namedtuple -from collections.abc import Mapping, Sequence +from collections.abc import Sequence from functools import partial, reduce from numbers import Integral from typing import ( @@ -42,9 +42,10 @@ else: from typing import Unpack except (ModuleNotFoundError, ImportError): - Unpack: Any # type: ignore + Unpack: Any # type: ignore[no-redef] import dask.array.Array as DaskArray + from dask.typing import Graph T_DuckArray = Union[np.ndarray, DaskArray] # Any ? T_By = T_DuckArray @@ -576,7 +577,7 @@ def factorize_( idx = sorter[(idx,)] idx[mask] = -1 else: - idx, groups = pd.factorize(flat, sort=sort) # type: ignore # pandas issue? + idx, groups = pd.factorize(flat, sort=sort) # type: ignore[arg-type] found_groups.append(np.array(groups)) factorized.append(idx.reshape(groupvar.shape)) @@ -1156,11 +1157,7 @@ def _reduce_blockwise( agg.finalize = None assert agg.finalize_kwargs is not None - if isinstance(agg.finalize_kwargs, Mapping): - finalize_kwargs_: tuple[dict[Any, Any], ...] = (agg.finalize_kwargs,) - else: - finalize_kwargs_ = agg.finalize_kwargs - finalize_kwargs_ += ({},) + ({},) + finalize_kwargs_: tuple[dict[Any, Any], ...] 
= (agg.finalize_kwargs,) + ({},) + ({},) results = chunk_reduce( array, @@ -1266,7 +1263,7 @@ def subset_to_blocks( chunks = tuple(tuple(np.array(c)[i].tolist()) for c, i in zip(array.chunks, squeezed)) keys = itertools.product(*(range(len(c)) for c in chunks)) - layer = {(name,) + key: tuple(new_keys[key].tolist()) for key in keys} + layer: Graph = {(name,) + key: tuple(new_keys[key].tolist()) for key in keys} graph = HighLevelGraph.from_collections(name, layer, dependencies=[array]) return dask.array.Array(graph, name, chunks, meta=array) @@ -1276,14 +1273,11 @@ def _extract_unknown_groups(reduced, dtype) -> tuple[DaskArray]: import dask.array from dask.highlevelgraph import HighLevelGraph - layer: dict[tuple, tuple] = {} groups_token = f"group-{reduced.name}" first_block = reduced.ndim * (0,) - layer[(groups_token, *first_block)] = ( - operator.getitem, - (reduced.name, *first_block), - "groups", - ) + layer: Graph = { + (groups_token, *first_block): (operator.getitem, (reduced.name, *first_block), "groups") + } groups: tuple[DaskArray] = ( dask.array.Array( HighLevelGraph.from_collections(groups_token, layer, dependencies=[reduced]), @@ -1537,6 +1531,7 @@ def _collapse_blocks_along_axes(reduced: DaskArray, axis: T_Axes, group_chunks) inchunk = ochunk[: -len(axis)] + np.unravel_index(ochunk[-1], nblocks) layer2[(name, *ochunk)] = (reduced.name, *inchunk) + layer2: Graph return dask.array.Array( HighLevelGraph.from_collections(name, layer2, dependencies=[reduced]), name, @@ -1814,7 +1809,7 @@ def groupby_reduce( fewer than min_count non-NA values are present the result will be NA. Only used if skipna is set to True or defaults to True for the array's dtype. - method : {"map-reduce", "blockwise", "cohorts", "split-reduce"}, optional + method : {"map-reduce", "blockwise", "cohorts"}, optional Strategy for reduction of dask arrays only: * ``"map-reduce"``: First apply the reduction blockwise on ``array``, then @@ -1838,8 +1833,6 @@ def groupby_reduce( 'month', dayofyear' etc. Optimize chunking ``array`` for this method by first rechunking using ``rechunk_for_cohorts`` (for 1D ``by`` only). - * ``"split-reduce"``: - Same as "cohorts" and will be removed soon. engine : {"flox", "numpy", "numba", "numbagg"}, optional Algorithm to compute the groupby reduction on non-dask arrays and on each dask chunk: * ``"numpy"``: @@ -1915,12 +1908,9 @@ def groupby_reduce( "Try engine='numpy' or engine='numba' instead." ) - if method in ["split-reduce", "cohorts"] and any_by_dask: + if method == "cohorts" and any_by_dask: raise ValueError(f"method={method!r} can only be used when grouping by numpy arrays.") - if method == "split-reduce": - method = "cohorts" - reindex = _validate_reindex( reindex, func, method, expected_groups, any_by_dask, is_duck_dask_array(array) ) @@ -1976,7 +1966,7 @@ def groupby_reduce( axis_ = tuple(array.ndim + np.arange(-by_.ndim, 0)) else: # TODO: How come this function doesn't exist according to mypy? - axis_ = np.core.numeric.normalize_axis_tuple(axis, array.ndim) # type: ignore + axis_ = np.core.numeric.normalize_axis_tuple(axis, array.ndim) # type: ignore[attr-defined] nax = len(axis_) has_dask = is_duck_dask_array(array) or is_duck_dask_array(by_) @@ -2053,7 +2043,8 @@ def groupby_reduce( # TODO: How else to narrow that array.chunks is there? 
assert isinstance(array, DaskArray) - if agg.chunk[0] is None and method != "blockwise": + # TODO: fix typing of FuncTuple in Aggregation + if agg.chunk[0] is None and method != "blockwise": # type: ignore[unreachable] raise NotImplementedError( f"Aggregation {agg.name!r} is only implemented for dask arrays when method='blockwise'." f"Received method={method!r}" diff --git a/flox/xarray.py b/flox/xarray.py index 7e8c4f2b1..0f5d68fde 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -110,7 +110,7 @@ def xarray_reduce( in ``expected_groups`` is not actually present in ``by``. dtype : data-type, optional DType for the output. Can be anything accepted by ``np.dtype``. - method : {"map-reduce", "blockwise", "cohorts", "split-reduce"}, optional + method : {"map-reduce", "blockwise", "cohorts"}, optional Strategy for reduction of dask arrays only: * ``"map-reduce"``: First apply the reduction blockwise on ``array``, then @@ -134,8 +134,6 @@ def xarray_reduce( 'month', dayofyear' etc. Optimize chunking ``array`` for this method by first rechunking using ``rechunk_for_cohorts`` (for 1D ``by`` only). - * ``"split-reduce"``: - Same as "cohorts" and will be removed soon. engine : {"flox", "numpy", "numba"}, optional Algorithm to compute the groupby reduction on non-dask arrays and on each dask chunk: * ``"numpy"``: @@ -253,7 +251,7 @@ def xarray_reduce( try: from xarray.indexes import PandasMultiIndex except ImportError: - PandasMultiIndex = tuple() # type: ignore + PandasMultiIndex = tuple() # type: ignore[assignment, misc] more_drop = set() for var in maybe_drop: diff --git a/flox/xrutils.py b/flox/xrutils.py index 4f80ec8c8..b1be05fe9 100644 --- a/flox/xrutils.py +++ b/flox/xrutils.py @@ -20,7 +20,7 @@ dask_array_type = dask.array.Array except ImportError: - dask_array_type = () # type: ignore + dask_array_type = () # type: ignore[assignment, misc] def asarray(data, xp=np): diff --git a/pyproject.toml b/pyproject.toml index 721661c91..0a236de62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ classifiers = [ ] dependencies = [ "pandas", - "numpy>=1.20", + "numpy>=1.21", "numpy_groupies>=0.9.19", "toolz", ] @@ -107,6 +107,8 @@ allow_redefinition = true files = "**/*.py" show_error_codes = true warn_unused_ignores = true +warn_unreachable = true +enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"] [[tool.mypy.overrides]] module=[ diff --git a/readthedocs.yml b/readthedocs.yml index ebe5245c9..25699dd03 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,11 +1,11 @@ version: 2 build: - os: 'ubuntu-20.04' + os: "ubuntu-20.04" tools: - python: 'mambaforge-4.10' + python: "mambaforge-4.10" conda: - environment: ci/docs.yml + environment: ci/docs.yml formats: [] diff --git a/tests/__init__.py b/tests/__init__.py index f46319172..4af6bc87e 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -14,7 +14,7 @@ dask_array_type = da.Array except ImportError: - dask_array_type = () # type: ignore + dask_array_type = () # type: ignore[assignment, misc] try: @@ -22,7 +22,7 @@ xr_types = (xr.DataArray, xr.Dataset) except ImportError: - xr_types = () # type: ignore + xr_types = () # type: ignore[assignment] def _importorskip(modname, minversion=None): diff --git a/tests/test_core.py b/tests/test_core.py index 09ee50902..d170c95e5 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1252,7 +1252,7 @@ def grouped_median(group_idx, array, *, axis=-1, size=None, fill_value=None, dty expected = np.median(array, axis=-1, keepdims=True) assert_equal(expected, 
actual) - for method in ["map-reduce", "cohorts", "split-reduce"]: + for method in ["map-reduce", "cohorts"]: with pytest.raises(NotImplementedError): groupby_reduce( dask.array.from_array(array, chunks=(1, -1)), From 84d4a1c823a7acd1d1ce7cf68257396c6ee635c8 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 14 Oct 2023 16:34:30 -0600 Subject: [PATCH 21/24] Bump numpy to >=1.22 (#278) --- ci/benchmark.yml | 2 +- ci/docs.yml | 2 +- ci/environment.yml | 2 +- ci/minimal-requirements.yml | 2 +- ci/no-dask.yml | 2 +- ci/no-numba.yml | 2 +- ci/no-xarray.yml | 2 +- pyproject.toml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ci/benchmark.yml b/ci/benchmark.yml index 80667a4f6..d2b52fc1c 100644 --- a/ci/benchmark.yml +++ b/ci/benchmark.yml @@ -5,7 +5,7 @@ dependencies: - asv - cachey - dask-core - - numpy>=1.21 + - numpy>=1.22 - mamba - pip - python=3.10 diff --git a/ci/docs.yml b/ci/docs.yml index ad8feee08..b71ce8955 100644 --- a/ci/docs.yml +++ b/ci/docs.yml @@ -5,7 +5,7 @@ dependencies: - dask-core - pip - xarray - - numpy>=1.21 + - numpy>=1.22 - numpydoc - numpy_groupies>=0.9.19 - toolz diff --git a/ci/environment.yml b/ci/environment.yml index ca9111105..d364f6fbe 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -8,7 +8,7 @@ dependencies: - dask-core - netcdf4 - pandas - - numpy>=1.21 + - numpy>=1.22 - lxml # for mypy coverage report - matplotlib - pip diff --git a/ci/minimal-requirements.yml b/ci/minimal-requirements.yml index 40b287f64..466c26066 100644 --- a/ci/minimal-requirements.yml +++ b/ci/minimal-requirements.yml @@ -9,7 +9,7 @@ dependencies: - pytest-cov - pytest-pretty - pytest-xdist - - numpy==1.21 + - numpy==1.22 - numpy_groupies==0.9.19 - pandas - pooch diff --git a/ci/no-dask.yml b/ci/no-dask.yml index f74d522b6..4cce0a380 100644 --- a/ci/no-dask.yml +++ b/ci/no-dask.yml @@ -5,7 +5,7 @@ dependencies: - codecov - netcdf4 - pandas - - numpy>=1.21 + - numpy>=1.22 - pip - pytest - pytest-cov diff --git a/ci/no-numba.yml b/ci/no-numba.yml index 1d846d117..6288bc6f4 100644 --- a/ci/no-numba.yml +++ b/ci/no-numba.yml @@ -8,7 +8,7 @@ dependencies: - dask-core - netcdf4 - pandas - - numpy>=1.21 + - numpy>=1.22 - lxml # for mypy coverage report - matplotlib - pip diff --git a/ci/no-xarray.yml b/ci/no-xarray.yml index 9eb75cf96..c4141a331 100644 --- a/ci/no-xarray.yml +++ b/ci/no-xarray.yml @@ -5,7 +5,7 @@ dependencies: - codecov - netcdf4 - pandas - - numpy>=1.21 + - numpy>=1.22 - pip - pytest - pytest-cov diff --git a/pyproject.toml b/pyproject.toml index 0a236de62..b014d7af2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ classifiers = [ ] dependencies = [ "pandas", - "numpy>=1.21", + "numpy>=1.22", "numpy_groupies>=0.9.19", "toolz", ] From 789cf73a32a42c6de774b171deaae7f7cbbb9e4c Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sat, 14 Oct 2023 16:38:43 -0600 Subject: [PATCH 22/24] Update pyproject.toml: py3.12 --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b014d7af2..c7ed830f5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] dependencies = [ "pandas", @@ -37,7 +38,7 @@ test = ["netCDF4"] [build-system] requires = [ "pandas", - "numpy>=1.20", + "numpy>=1.22", "numpy_groupies>=0.9.19", "toolz", "setuptools>=61.0.0", From 
fecd9a692c10551df11f727d4341a80f0aaf72d9 Mon Sep 17 00:00:00 2001 From: Mathias Hauser Date: Sun, 15 Oct 2023 05:48:42 +0200 Subject: [PATCH 23/24] use engine flox for ordered groups (#266) * use engine flox for ordered groups * Add issorted helper func * Some fixes * In xarray too * formatting * simplify * retry * flox * minversion numabgg * cleanup * fix type * update gitignore * add types * Fix env? * fix * fix merge * cleanup * [skip-ci] bench * temporarily disable numbagg * don't cache env * Finally! * bugfix * Fix doctest * more fixes * Fix CI * readd numbagg * Fix. --------- Co-authored-by: Deepak Cherian Co-authored-by: Deepak Cherian --- .github/workflows/ci-additional.yaml | 5 ++- .gitignore | 1 + asv_bench/benchmarks/reduce.py | 2 +- ci/environment.yml | 1 + flox/aggregations.py | 2 ++ flox/core.py | 45 ++++++++++++++++++++++--- flox/xarray.py | 4 +-- flox/xrutils.py | 27 ++++++++++++++- tests/test_core.py | 49 +++++++++++++++++++++------- 9 files changed, 116 insertions(+), 20 deletions(-) diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index c71fc8913..ed78d328c 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -72,7 +72,10 @@ jobs: conda list - name: Run doctests run: | - python -m pytest --doctest-modules flox --ignore flox/tests --cov=./ --cov-report=xml + python -m pytest --doctest-modules \ + flox/aggregations.py flox/core.py flox/xarray.py \ + --ignore flox/tests \ + --cov=./ --cov-report=xml - name: Upload code coverage to Codecov uses: codecov/codecov-action@v3.1.4 with: diff --git a/.gitignore b/.gitignore index 04f8d580d..67105856e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +asv_bench/pkgs/ docs/source/generated/ html/ .asv/ diff --git a/asv_bench/benchmarks/reduce.py b/asv_bench/benchmarks/reduce.py index 326b73566..59db1c580 100644 --- a/asv_bench/benchmarks/reduce.py +++ b/asv_bench/benchmarks/reduce.py @@ -7,7 +7,7 @@ N = 3000 funcs = ["sum", "nansum", "mean", "nanmean", "max", "nanmax", "count"] -engines = ["flox", "numpy", "numbagg"] +engines = [None, "flox", "numpy", "numbagg"] expected_groups = { "None": None, "bins": pd.IntervalIndex.from_breaks([1, 2, 4]), diff --git a/ci/environment.yml b/ci/environment.yml index d364f6fbe..f28917dc9 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -23,4 +23,5 @@ dependencies: - pooch - toolz - numba + - numbagg>=0.3 - scipy diff --git a/flox/aggregations.py b/flox/aggregations.py index b2129c197..3a65f9396 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -29,6 +29,7 @@ class AggDtypeInit(TypedDict): class AggDtype(TypedDict): + user: DTypeLike | None final: np.dtype numpy: tuple[np.dtype | type[np.intp], ...] intermediate: tuple[np.dtype | type[np.intp], ...] 
@@ -569,6 +570,7 @@ def _initialize_aggregation( final_dtype = _normalize_dtype(dtype_ or agg.dtype_init["final"], array_dtype, fill_value) agg.dtype = { + "user": dtype, # Save to automatically choose an engine "final": final_dtype, "numpy": (final_dtype,), "intermediate": tuple( diff --git a/flox/core.py b/flox/core.py index 482c8fac1..66e39b6f1 100644 --- a/flox/core.py +++ b/flox/core.py @@ -33,7 +33,9 @@ generic_aggregate, ) from .cache import memoize -from .xrutils import is_duck_array, is_duck_dask_array, isnull +from .xrutils import is_duck_array, is_duck_dask_array, isnull, module_available + +HAS_NUMBAGG = module_available("numbagg", minversion="0.3.0") if TYPE_CHECKING: try: @@ -69,6 +71,7 @@ T_Dtypes = Union[np.typing.DTypeLike, Sequence[np.typing.DTypeLike], None] T_FillValues = Union[np.typing.ArrayLike, Sequence[np.typing.ArrayLike], None] T_Engine = Literal["flox", "numpy", "numba", "numbagg"] + T_EngineOpt = None | T_Engine T_Method = Literal["map-reduce", "blockwise", "cohorts"] T_IsBins = Union[bool | Sequence[bool]] @@ -83,6 +86,10 @@ DUMMY_AXIS = -2 +def _issorted(arr: np.ndarray) -> bool: + return bool((arr[:-1] <= arr[1:]).all()) + + def _is_arg_reduction(func: T_Agg) -> bool: if isinstance(func, str) and func in ["argmin", "argmax", "nanargmax", "nanargmin"]: return True @@ -632,6 +639,7 @@ def chunk_argreduce( reindex: bool = False, engine: T_Engine = "numpy", sort: bool = True, + user_dtype=None, ) -> IntermediateDict: """ Per-chunk arg reduction. @@ -652,6 +660,7 @@ def chunk_argreduce( dtype=dtype, engine=engine, sort=sort, + user_dtype=user_dtype, ) if not isnull(results["groups"]).all(): idx = np.broadcast_to(idx, array.shape) @@ -685,6 +694,7 @@ def chunk_reduce( engine: T_Engine = "numpy", kwargs: Sequence[dict] | None = None, sort: bool = True, + user_dtype=None, ) -> IntermediateDict: """ Wrapper for numpy_groupies aggregate that supports nD ``array`` and @@ -785,6 +795,7 @@ def chunk_reduce( group_idx = group_idx.reshape(-1) assert group_idx.ndim == 1 + empty = np.all(props.nanmask) results: IntermediateDict = {"groups": [], "intermediates": []} @@ -1100,6 +1111,7 @@ def _grouped_combine( dtype=(np.intp,), engine=engine, sort=sort, + user_dtype=agg.dtype["user"], )["intermediates"][0] ) @@ -1129,6 +1141,7 @@ def _grouped_combine( dtype=(dtype,), engine=engine, sort=sort, + user_dtype=agg.dtype["user"], ) results["intermediates"].append(*_results["intermediates"]) results["groups"] = _results["groups"] @@ -1174,6 +1187,7 @@ def _reduce_blockwise( engine=engine, sort=sort, reindex=reindex, + user_dtype=agg.dtype["user"], ) if _is_arg_reduction(agg): @@ -1366,6 +1380,7 @@ def dask_groupby_agg( fill_value=agg.fill_value["intermediate"], dtype=agg.dtype["intermediate"], reindex=reindex, + user_dtype=agg.dtype["user"], ) if do_simple_combine: # Add a dummy dimension that then gets reduced over @@ -1757,6 +1772,23 @@ def _validate_expected_groups(nby: int, expected_groups: T_ExpectedGroupsOpt) -> return expected_groups +def _choose_engine(by, agg: Aggregation): + dtype = agg.dtype["user"] + + not_arg_reduce = not _is_arg_reduction(agg) + + # numbagg only supports nan-skipping reductions + # without dtype specified + if HAS_NUMBAGG and "nan" in agg.name: + if not_arg_reduce and dtype is None: + return "numbagg" + + if not_arg_reduce and (not is_duck_dask_array(by) and _issorted(by)): + return "flox" + else: + return "numpy" + + def groupby_reduce( array: np.ndarray | DaskArray, *by: T_By, @@ -1769,7 +1801,7 @@ def groupby_reduce( dtype: np.typing.DTypeLike = 
None, min_count: int | None = None, method: T_Method = "map-reduce", - engine: T_Engine = "numpy", + engine: T_EngineOpt = None, reindex: bool | None = None, finalize_kwargs: dict[Any, Any] | None = None, ) -> tuple[DaskArray, Unpack[tuple[np.ndarray | DaskArray, ...]]]: # type: ignore[misc] # Unpack not in mypy yet @@ -2027,9 +2059,14 @@ def groupby_reduce( # overwrite than when min_count is set fill_value = np.nan - kwargs = dict(axis=axis_, fill_value=fill_value, engine=engine) + kwargs = dict(axis=axis_, fill_value=fill_value) agg = _initialize_aggregation(func, dtype, array.dtype, fill_value, min_count_, finalize_kwargs) + # Need to set this early using `agg` + # It cannot be done in the core loop of chunk_reduce + # since we "prepare" the data for flox. + kwargs["engine"] = _choose_engine(by_, agg) if engine is None else engine + groups: tuple[np.ndarray | DaskArray, ...] if not has_dask: results = _reduce_blockwise( @@ -2080,7 +2117,7 @@ def groupby_reduce( assert len(groups) == 1 sorted_idx = np.argsort(groups[0]) # This optimization helps specifically with resampling - if not (sorted_idx[:-1] <= sorted_idx[1:]).all(): + if not _issorted(sorted_idx): result = result[..., sorted_idx] groups = (groups[0][sorted_idx],) diff --git a/flox/xarray.py b/flox/xarray.py index 0f5d68fde..d20ee903d 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -73,7 +73,7 @@ def xarray_reduce( fill_value=None, dtype: np.typing.DTypeLike = None, method: str = "map-reduce", - engine: str = "numpy", + engine: str | None = None, keep_attrs: bool | None = True, skipna: bool | None = None, min_count: int | None = None, @@ -369,7 +369,7 @@ def wrapper(array, *by, func, skipna, core_dims, **kwargs): # Flox's count works with non-numeric and its faster than converting. requires_numeric = func not in ["count", "any", "all"] or ( - func == "count" and engine != "flox" + func == "count" and kwargs["engine"] != "flox" ) if requires_numeric: is_npdatetime = array.dtype.kind in "Mm" diff --git a/flox/xrutils.py b/flox/xrutils.py index b1be05fe9..497cd7b24 100644 --- a/flox/xrutils.py +++ b/flox/xrutils.py @@ -2,12 +2,14 @@ # defined in xarray import datetime +import importlib from collections.abc import Iterable -from typing import Any +from typing import Any, Optional import numpy as np import pandas as pd from numpy.core.multiarray import normalize_axis_index # type: ignore[attr-defined] +from packaging.version import Version try: import cftime @@ -317,3 +319,26 @@ def nanlast(values, axis, keepdims=False): return np.expand_dims(result, axis=axis) else: return result + + +def module_available(module: str, minversion: Optional[str] = None) -> bool: + """Checks whether a module is installed without importing it. + + Use this for a lightweight check and lazy imports. + + Parameters + ---------- + module : str + Name of the module. + + Returns + ------- + available : bool + Whether the module is installed. 
+ """ + has = importlib.util.find_spec(module) is not None + if has: + mod = importlib.import_module(module) + return Version(mod.__version__) < Version(minversion) if minversion is not None else True + else: + return False diff --git a/tests/test_core.py b/tests/test_core.py index d170c95e5..99f181255 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -11,8 +11,10 @@ from numpy_groupies.aggregate_numpy import aggregate from flox import xrutils -from flox.aggregations import Aggregation +from flox.aggregations import Aggregation, _initialize_aggregation from flox.core import ( + HAS_NUMBAGG, + _choose_engine, _convert_expected_groups_to_index, _get_optimal_chunks_for_groups, _normalize_indexes, @@ -600,12 +602,9 @@ def test_groupby_reduce_axis_subset_against_numpy(func, axis, engine): by = np.broadcast_to(labels2d, (3, *labels2d.shape)) rng = np.random.default_rng(12345) array = rng.random(by.shape) - kwargs = dict( - func=func, axis=axis, expected_groups=[0, 2], fill_value=fill_value, engine=engine - ) - expected, _ = groupby_reduce(array, by, **kwargs) + kwargs = dict(func=func, axis=axis, expected_groups=[0, 2], fill_value=fill_value) + expected, _ = groupby_reduce(array, by, engine=engine, **kwargs) if engine == "flox": - kwargs.pop("engine") expected_npg, _ = groupby_reduce(array, by, **kwargs, engine="numpy") assert_equal(expected_npg, expected) @@ -622,12 +621,9 @@ def test_groupby_reduce_axis_subset_against_numpy(func, axis, engine): by = np.broadcast_to(labels2d, (3, *labels2d.shape)) rng = np.random.default_rng(12345) array = rng.random(by.shape) - kwargs = dict( - func=func, axis=axis, expected_groups=[0, 2], fill_value=fill_value, engine=engine - ) - expected, _ = groupby_reduce(array, by, **kwargs) + kwargs = dict(func=func, axis=axis, expected_groups=[0, 2], fill_value=fill_value) + expected, _ = groupby_reduce(array, by, engine=engine, **kwargs) if engine == "flox": - kwargs.pop("engine") expected_npg, _ = groupby_reduce(array, by, **kwargs, engine="numpy") assert_equal(expected_npg, expected) @@ -640,6 +636,7 @@ def test_groupby_reduce_axis_subset_against_numpy(func, axis, engine): actual, _ = groupby_reduce( da.from_array(array, chunks=(-1, 2, 3)), da.from_array(by, chunks=(-1, 2, 2)), + engine=engine, **kwargs, ) assert_equal(actual, expected, tolerance) @@ -1546,3 +1543,33 @@ def test_method_check_numpy(): ] ) assert_equal(actual, expected) + + +@pytest.mark.parametrize("dtype", [None, np.float64]) +def test_choose_engine(dtype): + numbagg_possible = HAS_NUMBAGG and dtype is None + default = "numbagg" if numbagg_possible else "numpy" + mean = _initialize_aggregation( + "mean", + dtype=dtype, + array_dtype=np.dtype("int64"), + fill_value=0, + min_count=0, + finalize_kwargs=None, + ) + argmax = _initialize_aggregation( + "argmax", + dtype=dtype, + array_dtype=np.dtype("int64"), + fill_value=0, + min_count=0, + finalize_kwargs=None, + ) + + # sorted by -> flox + sorted_engine = _choose_engine(np.array([1, 1, 2, 2]), agg=mean) + assert sorted_engine == ("numbagg" if numbagg_possible else "flox") + # unsorted by -> numpy + assert _choose_engine(np.array([3, 1, 1]), agg=mean) == default + # argmax does not give engine="flox" + assert _choose_engine(np.array([1, 1, 2, 2]), agg=argmax) == "numpy" From c15572ea416236d6fdf4c9b4caeb91962c2c6a82 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sun, 15 Oct 2023 09:33:03 -0600 Subject: [PATCH 24/24] Add `packaging` as dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml 
index c7ed830f5..9cf0b4ca4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,6 +18,7 @@ classifiers = [
 ]
 dependencies = [
     "pandas",
+    "packaging>=21.3",
     "numpy>=1.22",
     "numpy_groupies>=0.9.19",
     "toolz",
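
The upshot of [PATCH 23/24] above is that `engine` becomes optional in `groupby_reduce` and `xarray_reduce`: when left as `None`, `_choose_engine` prefers `"numbagg"` for nan-skipping reductions (when numbagg >= 0.3 is installed and no output dtype is forced), `"flox"` when the group labels are in-memory and already sorted, and `"numpy"` otherwise. A minimal sketch of how this looks to a caller, assuming flox with these patches plus NumPy are installed (numbagg is optional):

    import numpy as np
    import flox

    array = np.arange(12, dtype=np.float64)
    by = np.repeat([10, 20, 30], 4)  # sorted, in-memory group labels

    # engine=None (the new default) lets flox pick the engine itself:
    # "numbagg" for nan-reductions when numbagg>=0.3 is importable and dtype is None,
    # "flox" for sorted non-dask labels, "numpy" otherwise.
    result, groups = flox.groupby_reduce(array, by, func="nanmean")
    print(groups)  # [10 20 30]
    print(result)  # [1.5 5.5 9.5]

    # Passing engine explicitly still bypasses the auto-selection.
    result_np, _ = flox.groupby_reduce(array, by, func="nanmean", engine="numpy")
    np.testing.assert_allclose(result, result_np)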
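
[PATCH 24/24] follows directly from the new `module_available` helper in flox/xrutils.py, which imports `packaging.version.Version` to enforce `minversion` (the numbagg >= 0.3.0 gate behind `HAS_NUMBAGG`), so `packaging` must be declared as a runtime dependency. A standalone illustration, not flox code, of why real version parsing is used instead of comparing version strings:

    from packaging.version import Version

    # Plain string comparison mishandles multi-digit components:
    assert not ("0.10.0" >= "0.3.0")  # lexicographic, because "1" < "3"

    # packaging's Version compares release segments numerically, which is what
    # module_available("numbagg", minversion="0.3.0") relies on.
    assert Version("0.10.0") >= Version("0.3.0")
    assert not (Version("0.2.1") >= Version("0.3.0"))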