Mask fill values in output dataset (#148)

* use mask_and_scale=True when loading output store * properly raise zarr dataset creation errors * add warning about fill values in docs * black * fix doc examples * update release notes * fix release notes title hierarchy * doc tweaks * allow passing kwargs to open_zarr ...via the new ``decoding`` argument. * black
xarray-contrib · Nov 16, 2020 · b3e7e93 · b3e7e93
1 parent 8e45cf5
commit b3e7e93
Show file tree

Hide file tree

Showing 10 changed files with 104 additions and 61 deletions.
diff --git a/README.rst b/README.rst
@@ -44,7 +44,9 @@ by Jake VanderPlas.
 
     @xs.process
     class GameOfLife:
-        world = xs.variable(dims=('x', 'y'), intent='inout')
+        world = xs.variable(
+            dims=('x', 'y'), intent='inout', encoding={'fill_value': None}
+        )
 
         def run_step(self):
             nbrs_count = sum(

diff --git a/doc/about.rst b/doc/about.rst
@@ -35,7 +35,9 @@ by Jake VanderPlas.
        ...:
        ...: @xs.process
        ...: class GameOfLife:
-       ...:     world = xs.variable(dims=('x', 'y'), intent='inout')
+       ...:     world = xs.variable(
+       ...:         dims=('x', 'y'), intent='inout', encoding={'fill_value': None}
+       ...:     )
        ...:
        ...:     def run_step(self):
        ...:         nbrs_count = sum(

diff --git a/doc/io_storage.rst b/doc/io_storage.rst
@@ -214,43 +214,17 @@ data.
 
 Those options can be set for variables declared in process classes. See the
 ``encoding`` parameter of :func:`~xsimlab.variable` for all available options.
-In the example below we specify a custom fill value for the ``position``
-variable, which will be used to replace missing values:
-
-.. ipython::
-
-   In [4]: @xs.process
-      ...: class Particles:
-      ...:     position = xs.variable(dims='pt', intent='out',
-      ...:                            encoding={'fill_value': -1.0})
-      ...:
-      ...:     def initialize(self):
-      ...:         self._rng = np.random.default_rng(123)
-      ...:
-      ...:     def run_step(self):
-      ...:         nparticles = self._rng.integers(1, 4)
-      ...:         self.position = self._rng.uniform(0, 10, size=nparticles)
-      ...:
-
-   In [5]: model = xs.Model({'pt': Particles})
-
-   In [6]: with model:
-      ...:     in_ds = xs.create_setup(clocks={'steps': range(4)},
-      ...:                             output_vars={'pt__position': 'steps'})
-      ...:     out_ds = in_ds.xsimlab.run()
-      ...:
-
-   In [7]: out_ds.pt__position
 
 Encoding options may also be set or overridden when calling
-:func:`~xarray.Dataset.xsimlab.run`, e.g.,
-
-.. ipython::
-
-   In [8]: out_ds = in_ds.xsimlab.run(
-      ...:     model=model,
-      ...:     encoding={'pt__position': {'fill_value': -10.0}}
-      ...: )
-      ...:
-
-   In [9]: out_ds.pt__position
+:func:`~xarray.Dataset.xsimlab.run`.
+
+.. warning::
+
+   Zarr uses ``0`` as the default fill value for numeric value types. This may
+   badly affect the results, as array elements with the fill value are replaced
+   by NA in the output xarray Dataset. For variables which accept ``0`` as a
+   possible (non-missing) value, it is highly recommended to explicitly provide
+   another ``fill_value``. Alternatively, it is possible to deactivate this
+   value masking behavior by setting the ``mask_and_scale=False`` option and
+   pass it via the ``decoding`` parameter of
+   :func:`~xarray.Dataset.xsimlab.run`.
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -6,10 +6,23 @@ Release Notes
 v0.5.0 (Unreleased)
 -------------------
 
+Breaking changes
+~~~~~~~~~~~~~~~~
+
+- Fill values are now masked (NA) when when loading the simulation output store
+  as a xarray Dataset (:issue:`148`). Note that Zarr sets the fill value to
+  ``0`` by default for numeric data types, so it is highly recommended to
+  explicitly define another fill value in model variable encodings if ``0`` is
+  expected to be a valid (non-missing) data value, or alternatively use
+  ``mask_and_scale=False`` in the ``decoding`` options passed to
+  :func:`xarray.Dataset.simlab.run`.
+
 Bug fixes
 ~~~~~~~~~
 
 - Fix saving output variables with dtype=object (:issue:`145`).
+- Fix issues when saving output datasets to disk that were caused by the
+  ``_FillValue`` attribute (:issue:`148`).
 
 v0.4.1 (17 April 2020)
 ----------------------

diff --git a/xsimlab/drivers.py b/xsimlab/drivers.py
@@ -376,6 +376,7 @@ def __init__(
         batch_dim=None,
         store=None,
         encoding=None,
+        decoding=None,
         check_dims=CheckDimsOption.STRICT,
         validate=ValidateOption.INPUTS,
         hooks=None,
@@ -419,6 +420,7 @@ def __init__(
             model,
             zobject=store,
             encoding=encoding,
+            decoding=decoding,
             batch_dim=batch_dim,
             lock=lock,
         )

diff --git a/xsimlab/stores.py b/xsimlab/stores.py
@@ -106,6 +106,7 @@ def __init__(
         model: Model,
         zobject: Optional[Union[zarr.Group, MutableMapping, str]] = None,
         encoding: Optional[EncodingDict] = None,
+        decoding: Optional[Dict] = None,
         batch_dim: Optional[str] = None,
         lock: Optional[Any] = None,
     ):
@@ -128,6 +129,10 @@ def __init__(
 
         if encoding is None:
             encoding = {}
+        if decoding is None:
+            decoding = {}
+
+        self.decoding = decoding
 
         self.var_info = _get_var_info(dataset, model, encoding)
 
@@ -213,10 +218,14 @@ def _create_zarr_dataset(
         zkwargs.update(var_info["encoding"])
 
         try:
+            # TODO: race condition? use lock?
             zdataset = self.zgroup.create_dataset(name, **zkwargs)
-        except ValueError:
+        except ValueError as e:
             # return early if already existing dataset (batches of simulations)
-            return
+            if name in self.zgroup.keys():
+                return
+            else:
+                raise e
 
         # add dimension labels and variable attributes as metadata
         dim_labels = None
@@ -342,15 +351,18 @@ def open_as_xr_dataset(self) -> xr.Dataset:
         else:
             chunks = "auto"
 
-        ds = xr.open_zarr(
-            self.zgroup.store,
-            group=self.zgroup.path,
-            chunks=chunks,
-            consolidated=self.consolidated,
-            # disable mask (not nice with zarr default fill_value=0)
-            mask_and_scale=False,
+        # overwrite decoding options
+        open_kwargs = self.decoding.copy()
+        open_kwargs.update(
+            {
+                "chunks": chunks,
+                "group": self.zgroup.path,
+                "consolidated": self.consolidated,
+            }
         )
 
+        ds = xr.open_zarr(self.zgroup.store, **open_kwargs)
+
         if self.in_memory:
             # lazy loading may be confusing for the default, in-memory option
             ds.load()

diff --git a/xsimlab/tests/fixture_model.py b/xsimlab/tests/fixture_model.py
@@ -12,7 +12,12 @@
 
 @xs.process
 class Profile:
-    u = xs.variable(dims="x", description="quantity u", intent="inout")
+    u = xs.variable(
+        dims="x",
+        description="quantity u",
+        intent="inout",
+        encoding={"fill_value": np.nan},
+    )
     u_diffs = xs.group("diff")
     u_opp = xs.on_demand(dims="x")
 
@@ -43,7 +48,7 @@ class InitProfile:
     u = xs.foreign(Profile, "u", intent="out")
 
     def initialize(self):
-        self.x = np.arange(self.n_points)
+        self.x = np.arange(self.n_points).astype("double")
 
         self.u = np.zeros(self.n_points)
         self.u[0] = 1.0
@@ -58,7 +63,9 @@ class Roll:
         attrs={"units": "unitless"},
     )
     u = xs.foreign(Profile, "u")
-    u_diff = xs.variable(dims="x", groups="diff", intent="out")
+    u_diff = xs.variable(
+        dims="x", groups="diff", intent="out", encoding={"fill_value": np.nan}
+    )
 
     def run_step(self):
         self.u_diff = np.roll(self.u, self.shift) - self.u
@@ -69,21 +76,23 @@ class Add:
     offset = xs.variable(
         description=("offset * dt added every time step " "to profile u")
     )
-    u_diff = xs.variable(groups="diff", intent="out")
+    u_diff = xs.variable(groups="diff", intent="out", encoding={"fill_value": np.nan})
 
     @xs.runtime(args="step_delta")
     def run_step(self, dt):
-        self.u_diff = self.offset * dt
+        self.u_diff = self.offset * dt * 1.0
 
 
 @xs.process
 class AddOnDemand:
     offset = xs.variable(dims=[(), "x"], description="offset added to profile u")
-    u_diff = xs.on_demand(dims=[(), "x"], groups="diff")
+    u_diff = xs.on_demand(
+        dims=[(), "x"], groups="diff", encoding={"fill_value": np.nan}
+    )
 
     @u_diff.compute
     def _compute_u_diff(self):
-        return self.offset
+        return self.offset * 1.0
 
 
 @pytest.fixture

diff --git a/xsimlab/tests/test_drivers.py b/xsimlab/tests/test_drivers.py
@@ -97,10 +97,6 @@ def test_run_model_get_results(self, in_dataset, out_dataset, xarray_driver):
         xarray_driver.run_model()
         out_ds_actual = xarray_driver.get_results()
 
-        # skip attributes added by xr.open_zarr from check
-        for xr_var in out_ds_actual.variables.values():
-            xr_var.attrs.pop("_FillValue", None)
-
         assert out_ds_actual is not out_dataset
         xr.testing.assert_identical(out_ds_actual.load(), out_dataset)
 

diff --git a/xsimlab/tests/test_xr_accessor.py b/xsimlab/tests/test_xr_accessor.py
@@ -1,3 +1,4 @@
+from math import nan
 import pytest
 from dask.distributed import Client
 import xarray as xr
@@ -545,6 +546,33 @@ def run_step(self):
         expected = xr.DataArray(data, dims=dims, coords=coords) * 2
         xr.testing.assert_equal(out_ds["p__out_var"], expected)
 
+    @pytest.mark.parametrize(
+        "decoding,expected",
+        [
+            (None, [nan, nan]),  # mask_and_scale=True by default
+            ({"mask_and_scale": False}, [-1, -1]),
+        ],
+    )
+    def test_run_decoding(self, decoding, expected):
+        @xs.process
+        class P:
+            var = xs.variable(dims="x", intent="out", encoding={"fill_value": -1})
+
+            def initialize(self):
+                self.var = [-1, -1]
+
+        m = xs.Model({"p": P})
+
+        in_ds = xs.create_setup(
+            model=m,
+            clocks={"clock": [0, 1]},
+            output_vars={"p__var": None},
+        )
+
+        out_ds = in_ds.xsimlab.run(model=m, decoding=decoding)
+
+        np.testing.assert_array_equal(out_ds.p__var, expected)
+
 
 def test_create_setup(model, in_dataset):
     expected = xr.Dataset()

diff --git a/xsimlab/xr_accessor.py b/xsimlab/xr_accessor.py
@@ -677,6 +677,7 @@ def run(
         validate="inputs",
         store=None,
         encoding=None,
+        decoding=None,
         hooks=None,
         parallel=False,
         scheduler=None,
@@ -730,6 +731,9 @@ def run(
             model variables (see :func:`~xsimlab.variable` for a full list of
             of options available). Additionally, 'chunks' and 'synchronizer'
             options are supported here.
+        decoding : dict, optional
+            Options passed as keyword arguments to :func:`xarray.open_zarr` to load
+            the simulation outputs from the zarr store as a new xarray dataset.
         hooks : list, optional
             One or more runtime hooks, i.e., functions decorated with
             :func:`~xsimlab.runtime_hook` or instances of
@@ -754,7 +758,7 @@ def run(
         Returns
         -------
         output : Dataset
-            Another Dataset with both model inputs and outputs. The data is lazily
+            Another Dataset with both model inputs and outputs. The data is (lazily)
             loaded from the zarr store used to save inputs and outputs.
 
         Notes
@@ -806,6 +810,7 @@ def run(
             batch_dim=batch_dim,
             store=store,
             encoding=encoding,
+            decoding=decoding,
             check_dims=check_dims,
             validate=validate,
             hooks=hooks,