Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rank_histogram tied ranks #364

Merged
merged 32 commits into from Nov 26, 2021
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Expand Up @@ -6,6 +6,12 @@ Changelog History
xskillscore v0.0.25 (2021-xx-xx)
--------------------------------

Bug Fixes
~~~~~~~~~
- :py:func:`~xskillscore.rank_histogram` ``random_for_tied=True`` handles tied ranks
correctly by default. ``random_for_tied=False`` ignores this and retains previous
behaviour. (:issue:`335`, :pr:`364`) `Aaron Spring`_.

Internal Changes
~~~~~~~~~~~~~~~~
- Reduce dependencies (:issue:`359`, :pr:`363`) `Aaron Spring`_.
Expand Down
42 changes: 39 additions & 3 deletions xskillscore/core/probabilistic.py
@@ -1,5 +1,6 @@
import numpy as np
import properscoring
import scipy.stats
import xarray as xr

from .contingency import Contingency, _get_category_bounds
Expand Down Expand Up @@ -830,7 +831,14 @@ def rps(
return res


def rank_histogram(observations, forecasts, dim=None, member_dim="member"):
def rank_histogram(
observations,
forecasts,
dim=None,
member_dim="member",
random_for_tied=True,
keep_attrs=True,
):
"""Returns the rank histogram (Talagrand diagram) along the specified dimensions.

Parameters
Expand All @@ -844,12 +852,22 @@ def rank_histogram(observations, forecasts, dim=None, member_dim="member"):
Defaults to None meaning compute over all dimensions
member_dim : str, optional
Name of ensemble member dimension. By default, 'member'.
random_for_tied : bool
Whether assign tied ranks random rank, see Hamill 2001
aaronspring marked this conversation as resolved.
Show resolved Hide resolved
aaronspring marked this conversation as resolved.
Show resolved Hide resolved
keep_attrs : bool, optional
Whether to copy attributes from the first argument to the output.

Returns
-------
rank_histogram : xarray.Dataset or xarray.DataArray
New object containing the histogram of ranks

Reference
---------
* Hamill, T. M. (2001). Interpretation of Rank Histograms for Verifying
Ensemble Forecasts. Monthly Weather Review, 129(3), 550–560.
doi: 10/dkkvh3

Examples
--------
>>> observations = xr.DataArray(np.random.normal(size=(3, 3)),
Expand Down Expand Up @@ -877,7 +895,16 @@ def _rank_first(x, y):
"""Concatenates x and y and returns the rank of the
first element along the last axes"""
xy = np.concatenate((x[..., np.newaxis], y), axis=-1)
return rankdata(xy, axis=-1)[..., 0]
if random_for_tied:
ranks_min = scipy.stats.rankdata(xy, axis=-1, method="min")
ranks_max = scipy.stats.rankdata(xy, axis=-1, method="max")
ranks = ranks_min + np.int32(
(ranks_max - ranks_min + 1) * np.random.rand(*xy.shape)
)
else: # no special handling of ties
ranks = rankdata(xy, axis=-1)
ranks = ranks[..., 0] # take obs rank
return ranks

if dim is not None:
if len(dim) == 0:
Expand All @@ -894,12 +921,21 @@ def _rank_first(x, y):
input_core_dims=[[], [member_dim]],
dask="parallelized",
output_dtypes=[int],
keep_attrs=keep_attrs,
)

bin_edges = np.arange(0.5, len(forecasts[member_dim]) + 2)
return histogram(
hist = histogram(
ranks, bins=[bin_edges], bin_names=["rank"], dim=dim, bin_dim_suffix=""
)
if keep_attrs: # attach by hand
hist.attrs.update(observations.attrs)
hist.attrs.update(forecasts.attrs)
if isinstance(hist, xr.Dataset):
for v in hist.data_vars:
hist[v].attrs.update(observations[v].attrs)
hist[v].attrs.update(forecasts[v].attrs)
return hist


def discrimination(
Expand Down
22 changes: 16 additions & 6 deletions xskillscore/tests/test_probabilistic.py
Expand Up @@ -343,32 +343,33 @@ def test_brier_score_vs_fair_brier_score(o, f_prob, dim):
@pytest.mark.parametrize("chunk_bool", [True, False])
@pytest.mark.parametrize("input_type", ["DataArray", "Dataset", "multidim Dataset"])
@pytest.mark.parametrize("dim", DIMS)
def test_rank_histogram_sum(o, f_prob, dim, chunk_bool, input_type):
@pytest.mark.parametrize("keep_attrs", [True, False])
def test_rank_histogram_sum(o, f_prob, dim, chunk_bool, input_type, keep_attrs):
"""Test that the number of samples in the rank histogram is correct"""
o, f_prob = modify_inputs(o, f_prob, input_type, chunk_bool)
if dim == []:
with pytest.raises(ValueError):
rank_histogram(o, f_prob, dim=dim)
else:
rank_hist = rank_histogram(o, f_prob, dim=dim)
rank_hist = rank_histogram(o, f_prob, dim=dim, keep_attrs=keep_attrs)
if "Dataset" in input_type:
rank_hist = rank_hist[list(o.data_vars)[0]]
o = o[list(o.data_vars)[0]]
assert_allclose(rank_hist.sum(), o.count())
assert_allclose(rank_hist.sum(), o.count())
# test that returns chunks
assert_chunk(rank_hist, chunk_bool)
# test that attributes are kept # TODO: add
# assert_keep_attrs(rank_hist, o, keep_attrs)
# test that attributes are kept
assert_keep_attrs(rank_hist, o, keep_attrs)
# test that input types equal output types
assign_type_input_output(rank_hist, o)


def test_rank_histogram_values(o, f_prob):
"""Test values in extreme cases that observations \
all smaller/larger than forecasts"""
assert rank_histogram((f_prob.min() - 1) + 0 * o, f_prob)[0] == o.size
assert rank_histogram((f_prob.max() + 1) + 0 * o, f_prob)[-1] == o.size
assert rank_histogram(o - 10, f_prob)[0] == o.size
assert rank_histogram(o + 10, f_prob)[-1] == o.size


@pytest.mark.parametrize("dim", DIMS)
Expand Down Expand Up @@ -1086,3 +1087,12 @@ def test_brier_score_float_forecast_or_observations(o, f_prob):
brier_score(o, f_prob)
brier_score(o, 0.5)
brier_score(1, f_prob)


def test_rank_hist_tied():
"""Test that rank_histogram handles tied ranks."""
a = xr.DataArray(np.zeros(100), dims="a")
b = xr.DataArray(np.zeros((100, 10)), dims=["a", "member"])
rh = rank_histogram(a, b)
assert rh.min() > 3
assert rh.max() < 30