xarray-contrib · aaronspring · Nov 26, 2021 · Nov 23, 2021 · Nov 23, 2021 · Nov 23, 2021
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -6,6 +6,12 @@ Changelog History
 xskillscore v0.0.25 (2021-xx-xx)
 --------------------------------
 
+Bug Fixes
+~~~~~~~~~
+- :py:func:`~xskillscore.rank_histogram` ``random_for_tied=True`` handles tied ranks
+  correctly by default. ``random_for_tied=False`` ignores this and retains previous
+  behaviour. (:issue:`335`, :pr:`364`) `Aaron Spring`_.
+
 Internal Changes
 ~~~~~~~~~~~~~~~~
 - Reduce dependencies (:issue:`359`, :pr:`363`) `Aaron Spring`_.

diff --git a/xskillscore/core/probabilistic.py b/xskillscore/core/probabilistic.py
@@ -1,5 +1,6 @@
 import numpy as np
 import properscoring
+import scipy.stats
 import xarray as xr
 
 from .contingency import Contingency, _get_category_bounds
@@ -830,7 +831,14 @@ def rps(
     return res
 
 
-def rank_histogram(observations, forecasts, dim=None, member_dim="member"):
+def rank_histogram(
+    observations,
+    forecasts,
+    dim=None,
+    member_dim="member",
+    random_for_tied=True,
+    keep_attrs=True,
+):
     """Returns the rank histogram (Talagrand diagram) along the specified dimensions.
 
     Parameters
@@ -844,12 +852,22 @@ def rank_histogram(observations, forecasts, dim=None, member_dim="member"):
         Defaults to None meaning compute over all dimensions
     member_dim : str, optional
         Name of ensemble member dimension. By default, 'member'.
+    random_for_tied : bool
+        Whether assign tied ranks random rank, see Hamill 2001
+    keep_attrs : bool, optional
+        Whether to copy attributes from the first argument to the output.
 
     Returns
     -------
     rank_histogram : xarray.Dataset or xarray.DataArray
         New object containing the histogram of ranks
 
+    Reference
+    ---------
+    * Hamill, T. M. (2001). Interpretation of Rank Histograms for Verifying
+        Ensemble Forecasts. Monthly Weather Review, 129(3), 550–560.
+        doi: 10/dkkvh3
+
     Examples
     --------
     >>> observations = xr.DataArray(np.random.normal(size=(3, 3)),
@@ -877,7 +895,16 @@ def _rank_first(x, y):
         """Concatenates x and y and returns the rank of the
         first element along the last axes"""
         xy = np.concatenate((x[..., np.newaxis], y), axis=-1)
-        return rankdata(xy, axis=-1)[..., 0]
+        if random_for_tied:
+            ranks_min = scipy.stats.rankdata(xy, axis=-1, method="min")
+            ranks_max = scipy.stats.rankdata(xy, axis=-1, method="max")
+            ranks = ranks_min + np.int32(
+                (ranks_max - ranks_min + 1) * np.random.rand(*xy.shape)
+            )
+        else:  # no special handling of ties
+            ranks = rankdata(xy, axis=-1)
+        ranks = ranks[..., 0]  # take obs rank
+        return ranks
 
     if dim is not None:
         if len(dim) == 0:
@@ -894,12 +921,21 @@ def _rank_first(x, y):
         input_core_dims=[[], [member_dim]],
         dask="parallelized",
         output_dtypes=[int],
+        keep_attrs=keep_attrs,
     )
 
     bin_edges = np.arange(0.5, len(forecasts[member_dim]) + 2)
-    return histogram(
+    hist = histogram(
         ranks, bins=[bin_edges], bin_names=["rank"], dim=dim, bin_dim_suffix=""
     )
+    if keep_attrs:  # attach by hand
+        hist.attrs.update(observations.attrs)
+        hist.attrs.update(forecasts.attrs)
+        if isinstance(hist, xr.Dataset):
+            for v in hist.data_vars:
+                hist[v].attrs.update(observations[v].attrs)
+                hist[v].attrs.update(forecasts[v].attrs)
+    return hist
 
 
 def discrimination(

diff --git a/xskillscore/tests/test_probabilistic.py b/xskillscore/tests/test_probabilistic.py
@@ -343,32 +343,33 @@ def test_brier_score_vs_fair_brier_score(o, f_prob, dim):
 @pytest.mark.parametrize("chunk_bool", [True, False])
 @pytest.mark.parametrize("input_type", ["DataArray", "Dataset", "multidim Dataset"])
 @pytest.mark.parametrize("dim", DIMS)
-def test_rank_histogram_sum(o, f_prob, dim, chunk_bool, input_type):
+@pytest.mark.parametrize("keep_attrs", [True, False])
+def test_rank_histogram_sum(o, f_prob, dim, chunk_bool, input_type, keep_attrs):
     """Test that the number of samples in the rank histogram is correct"""
     o, f_prob = modify_inputs(o, f_prob, input_type, chunk_bool)
     if dim == []:
         with pytest.raises(ValueError):
             rank_histogram(o, f_prob, dim=dim)
     else:
-        rank_hist = rank_histogram(o, f_prob, dim=dim)
+        rank_hist = rank_histogram(o, f_prob, dim=dim, keep_attrs=keep_attrs)
         if "Dataset" in input_type:
             rank_hist = rank_hist[list(o.data_vars)[0]]
             o = o[list(o.data_vars)[0]]
             assert_allclose(rank_hist.sum(), o.count())
         assert_allclose(rank_hist.sum(), o.count())
         # test that returns chunks
         assert_chunk(rank_hist, chunk_bool)
-        # test that attributes are kept # TODO: add
-        # assert_keep_attrs(rank_hist, o, keep_attrs)
+        # test that attributes are kept
+        assert_keep_attrs(rank_hist, o, keep_attrs)
         # test that input types equal output types
         assign_type_input_output(rank_hist, o)
 
 
 def test_rank_histogram_values(o, f_prob):
     """Test values in extreme cases that observations \
         all smaller/larger than forecasts"""
-    assert rank_histogram((f_prob.min() - 1) + 0 * o, f_prob)[0] == o.size
-    assert rank_histogram((f_prob.max() + 1) + 0 * o, f_prob)[-1] == o.size
+    assert rank_histogram(o - 10, f_prob)[0] == o.size
+    assert rank_histogram(o + 10, f_prob)[-1] == o.size
 
 
 @pytest.mark.parametrize("dim", DIMS)
@@ -1086,3 +1087,12 @@ def test_brier_score_float_forecast_or_observations(o, f_prob):
     brier_score(o, f_prob)
     brier_score(o, 0.5)
     brier_score(1, f_prob)
+
+
+def test_rank_hist_tied():
+    """Test that rank_histogram handles tied ranks."""
+    a = xr.DataArray(np.zeros(100), dims="a")
+    b = xr.DataArray(np.zeros((100, 10)), dims=["a", "member"])
+    rh = rank_histogram(a, b)
+    assert rh.min() > 3
+    assert rh.max() < 30