MAE test (#209)
* more robust nomenclature: significance and alpha

* diff mean MAE instead of mean diff MAE

* two returns

* adapt notebook

* change outputs of mae_test

* align sign_test

* fix nb and docstrings

* sign_test quickstart print(significantly_different)

* add mae_test to docs API
aaronspring committed Oct 23, 2020
1 parent 08a0816 commit 0176116
Showing 7 changed files with 836 additions and 72 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.rst
@@ -10,6 +10,16 @@ Features
- Added mean error
:py:func:`~xskillscore.me`. (:issue:`202`, :pr:`200`)
`Andrew Huang`_
- Added MAE significance test :py:func:`~xskillscore.mae_test` from Jolliffe and Ebert
https://www.cawcr.gov.au/projects/verification/CIdiff/FAQ-CIdiff.html
(:issue:`192`, :pr:`209`) `Aaron Spring`_

Breaking changes
~~~~~~~~~~~~~~~~
- Aligned output of :py:func:`~xskillscore.sign_test` with
:py:func:`~xskillscore.mae_test`. Now tests from comparative.py return more than
one object, including a boolean indicating ``significance`` based on ``alpha``.
(:pr:`209`) `Aaron Spring`_

Bug Fixes
~~~~~~~~~
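For orientation, a minimal sketch (not part of the diff) of the new three-object return convention described in the changelog entries above, reusing the synthetic data from the docstring examples further down:

import numpy as np
import xarray as xr
import xskillscore as xs

np.random.seed(42)
f1 = xr.DataArray(np.random.normal(size=30), coords=[("time", np.arange(30))])
f2 = xr.DataArray(np.random.normal(size=30), coords=[("time", np.arange(30))]) + 2.0
o = xr.DataArray(np.random.normal(size=30), coords=[("time", np.arange(30))])

# Both comparative tests now return three objects instead of a single
# DataArray with annotation coordinates.
significantly_different, walk, confidence = xs.sign_test(
    f1, f2, o, time_dim="time", metric="mae", orientation="negative")
significantly_different, diff, hwci = xs.mae_test(
    f1, f2, o, time_dim="time", dim=[], alpha=0.05)
print(bool(significantly_different))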
1 change: 1 addition & 0 deletions docs/source/api.rst
@@ -119,3 +119,4 @@ Tests to compare whether one forecast is significantly better than another one.
:toctree: api/

sign_test
mae_test
6 changes: 6 additions & 0 deletions docs/source/api/xskillscore.mae_test.rst
@@ -0,0 +1,6 @@
xskillscore.mae\_test
=====================

.. currentmodule:: xskillscore

.. autofunction:: mae_test
512 changes: 486 additions & 26 deletions docs/source/quick-start.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion xskillscore/__init__.py
@@ -2,7 +2,7 @@
from pkg_resources import DistributionNotFound, get_distribution

from .core.accessor import XSkillScoreAccessor
from .core.comparative import mae_test, sign_test
from .core.contingency import Contingency
from .core.deterministic import (
effective_sample_size,
171 changes: 159 additions & 12 deletions xskillscore/core/comparative.py
@@ -1,9 +1,10 @@
import warnings

import numpy as np
import scipy.stats as st
import xarray as xr

from .deterministic import mae, pearson_r


def sign_test(
@@ -65,24 +66,44 @@ def sign_test(
Returns
-------
xarray.DataArray or xarray.Dataset : boolean indicating whether ``forecast1`` is
    significantly different from ``forecast2``
xarray.DataArray or xarray.Dataset : positive (negative) walk values show how often
    ``forecast1`` is better (worse) than ``forecast2`` according to ``metric``
    computed over ``dim``
xarray.DataArray or xarray.Dataset : confidence gives the positive boundary for a
    random walk at significance level ``alpha``
Examples
--------
>>> np.random.seed(42)
>>> f1 = xr.DataArray(np.random.normal(size=(30)),
... coords=[('time', np.arange(30))])
>>> f2 = xr.DataArray(np.random.normal(size=(30)),
... coords=[('time', np.arange(30))]) + 2.
>>> o = xr.DataArray(np.random.normal(size=(30)),
... coords=[('time', np.arange(30))])
>>> significantly_different, walk, confidence = sign_test(f1, f2, o,
...     time_dim='time', metric='mae', orientation='negative')
>>> walk.plot()
>>> confidence.plot(color='gray')
>>> (-1*confidence).plot(color='gray')
>>> walk
<xarray.DataArray (time: 30)>
array([ 1, 2, 3, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 4, 5, 4, 5,
6, 7, 8, 9, 10, 9, 10, 11, 12, 11, 12, 13, 14])
Coordinates:
* time (time) int64 0 1 2 3 4 5 6 7 8 9 ... 20 21 22 23 24 25 26 27 28 29
>>> significantly_different
<xarray.DataArray (time: 30)>
array([False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False,
False, False, True, True, False, True, True, True, True,
True, True, True])
Coordinates:
* time (time) int64 0 1 2 3 4 5 6 7 8 9 ... 20 21 22 23 24 25 26 27 28 29
alpha float64 0.05
References
----------
@@ -164,6 +185,132 @@ def _categorical_metric(observations, forecasts, dim):
# z_alpha is the value at which the standardized cumulative Gaussian distribution
# exceeds alpha
confidence = st.norm.ppf(1 - alpha / 2) * xr.ufuncs.sqrt(N)
confidence.coords["alpha"] = alpha
significantly_different = np.abs(walk) > confidence
return significantly_different, walk, confidence


def mae_test(
forecasts1,
forecasts2,
observations=None,
dim=[],
time_dim="time",
alpha=0.05,
):
"""
Mean absolute error (MAE) significance test from Jolliffe and Ebert.
Tests whether ``forecasts1`` and ``forecasts2`` have different MAE at
significance level ``alpha``.
https://www.cawcr.gov.au/projects/verification/CIdiff/FAQ-CIdiff.html
.. note::
``alpha`` is the desired significance level and the maximum acceptable risk of
falsely rejecting the null hypothesis. The smaller the value of α, the greater
the strength of the test. The confidence level of the test is defined as
1 - alpha and is often expressed as a percentage. So, for example, a
significance level of 0.05 is equivalent to a 95% confidence level.
Source: NIST/SEMATECH e-Handbook of Statistical Methods.
https://www.itl.nist.gov/div898/handbook/prc/section1/prc14.htm
Parameters
----------
forecasts1 : xarray.Dataset or xarray.DataArray
first forecast to be compared to the observations
forecasts2 : xarray.Dataset or xarray.DataArray
second forecast to be compared to the observations
observations : xarray.Dataset or xarray.DataArray or None
observations to be compared to both forecasts. if None, assumes that arguments
forecasts1 and forecasts2 are already MAEs. Defaults to None.
time_dim : str
time dimension; the dimension over which to compute the temporal correlation.
Defaults to ``'time'``.
dim : str or list of str
dimensions to apply MAE to. Cannot contain ``time_dim``. Defaults to [].
alpha : float
significance level ``alpha`` for the test that forecasts1 is different from
forecasts2. Defaults to 0.05.
Returns
-------
xarray.DataArray or xarray.Dataset :
    boolean indicating whether the difference in MAE is significant
xarray.DataArray or xarray.Dataset :
    difference in ``xs.mae``, reduced by ``dim`` and ``time_dim``
xarray.DataArray or xarray.Dataset :
    half-width of the confidence interval at the significance level ``alpha``.
Examples
--------
>>> np.random.seed(42)
>>> f1 = xr.DataArray(np.random.normal(size=(30)),
... coords=[('time', np.arange(30))])
>>> f2 = xr.DataArray(np.random.normal(size=(30)),
... coords=[('time', np.arange(30))])
>>> o = xr.DataArray(np.random.normal(size=(30)),
... coords=[('time', np.arange(30))])
>>> significantly_different, diff, hwci = mae_test(f1, f2, o, time_dim='time',
...     dim=[], alpha=0.05)
>>> significantly_different
<xarray.DataArray ()>
array(False)
>>> diff
<xarray.DataArray ()>
array(-0.01919449)
>>> hwci
<xarray.DataArray ()>
array(0.38729387)
>>> # absolute magnitude of difference is smaller than half-width of
>>> # confidence interval, therefore not significant at level alpha=0.05
>>> # now comparing against an offset f2, the difference in MAE is significant
>>> significantly_different, diff, hwci = mae_test(f1, f2 + 2., o, time_dim='time',
...     dim=[], alpha=0.05)
>>> significantly_different
<xarray.DataArray ()>
array(True)
References
----------
* https://www.cawcr.gov.au/projects/verification/CIdiff/FAQ-CIdiff.html
"""
if isinstance(dim, str):
dim = [dim]
if time_dim in dim:
raise ValueError("`dim` cannot contain `time_dim`")

msg = f"`alpha` must be between 0 and 1 or `return_p`, found {alpha}"
if isinstance(alpha, str):
if alpha != "return_p":
raise ValueError(msg)
elif isinstance(alpha, float):
if not 0 < alpha < 1:
raise ValueError(msg)
else:
raise ValueError(msg)

if observations is not None:
# Compare the forecasts and observations using metric
mae_f1o = mae(observations, forecasts1, dim=dim)
mae_f2o = mae(observations, forecasts2, dim=dim)
elif observations is None:
mae_f1o = forecasts1
mae_f2o = forecasts2

pearson_r_f1f2 = pearson_r(mae_f1o, mae_f2o, dim=time_dim)

# diff mae
diff = mae_f2o.mean(time_dim) - mae_f1o.mean(time_dim)

notnan = 1 * (mae_f1o.notnull() & mae_f2o.notnull())
N = notnan.sum(time_dim)
# average variances and take square root instead of averaging standard deviations
std = ((mae_f1o.var(time_dim) + mae_f2o.var(time_dim)) / 2) ** 0.5

confidence = st.norm.ppf(1 - alpha / 2)
# half width of the confidence interval
hwci = (2 * (1 - pearson_r_f1f2)) ** 0.5 * confidence * std / N ** 0.5
significantly_different = np.abs(diff) > hwci # MAE difference significant?
return significantly_different, diff, hwci
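As a sanity check on the half-width computation above, here is a small NumPy-only sketch (illustrative, not part of the commit) that recomputes ``hwci`` for the ``dim=[]`` case, where the MAE series reduce to the plain absolute-error series:

import numpy as np
import scipy.stats as st

rng = np.random.default_rng(42)
o = rng.normal(size=30)
ae1 = np.abs(rng.normal(size=30) - o)  # absolute errors of forecast 1
ae2 = np.abs(rng.normal(size=30) - o)  # absolute errors of forecast 2

diff = ae2.mean() - ae1.mean()              # difference in MAE
r = np.corrcoef(ae1, ae2)[0, 1]             # temporal correlation of the error series
N = ae1.size
std = np.sqrt((ae1.var() + ae2.var()) / 2)  # average variances, then take the root

z = st.norm.ppf(1 - 0.05 / 2)               # two-sided z value for alpha = 0.05
hwci = np.sqrt(2 * (1 - r)) * z * std / np.sqrt(N)
print(abs(diff) > hwci)                     # significantly different at alpha = 0.05?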