Skip to content

Commit

Permalink
Mask fill values in output dataset (#148)
Browse files Browse the repository at this point in the history
* use mask_and_scale=True when loading output store

* properly raise zarr dataset creation errors

* add warning about fill values in docs

* black

* fix doc examples

* update release notes

* fix release notes title hierarchy

* doc tweaks

* allow passing kwargs to open_zarr

...via the new ``decoding`` argument.

* black
  • Loading branch information
benbovy committed Nov 16, 2020
1 parent 8e45cf5 commit b3e7e93
Show file tree
Hide file tree
Showing 10 changed files with 104 additions and 61 deletions.
4 changes: 3 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ by Jake VanderPlas.
@xs.process
class GameOfLife:
world = xs.variable(dims=('x', 'y'), intent='inout')
world = xs.variable(
dims=('x', 'y'), intent='inout', encoding={'fill_value': None}
)
def run_step(self):
nbrs_count = sum(
Expand Down
4 changes: 3 additions & 1 deletion doc/about.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ by Jake VanderPlas.
...:
...: @xs.process
...: class GameOfLife:
...: world = xs.variable(dims=('x', 'y'), intent='inout')
...: world = xs.variable(
...: dims=('x', 'y'), intent='inout', encoding={'fill_value': None}
...: )
...:
...: def run_step(self):
...: nbrs_count = sum(
Expand Down
50 changes: 12 additions & 38 deletions doc/io_storage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -214,43 +214,17 @@ data.

Those options can be set for variables declared in process classes. See the
``encoding`` parameter of :func:`~xsimlab.variable` for all available options.
In the example below we specify a custom fill value for the ``position``
variable, which will be used to replace missing values:

.. ipython::

In [4]: @xs.process
...: class Particles:
...: position = xs.variable(dims='pt', intent='out',
...: encoding={'fill_value': -1.0})
...:
...: def initialize(self):
...: self._rng = np.random.default_rng(123)
...:
...: def run_step(self):
...: nparticles = self._rng.integers(1, 4)
...: self.position = self._rng.uniform(0, 10, size=nparticles)
...:

In [5]: model = xs.Model({'pt': Particles})

In [6]: with model:
...: in_ds = xs.create_setup(clocks={'steps': range(4)},
...: output_vars={'pt__position': 'steps'})
...: out_ds = in_ds.xsimlab.run()
...:

In [7]: out_ds.pt__position

Encoding options may also be set or overridden when calling
:func:`~xarray.Dataset.xsimlab.run`, e.g.,

.. ipython::

In [8]: out_ds = in_ds.xsimlab.run(
...: model=model,
...: encoding={'pt__position': {'fill_value': -10.0}}
...: )
...:

In [9]: out_ds.pt__position
:func:`~xarray.Dataset.xsimlab.run`.

.. warning::

Zarr uses ``0`` as the default fill value for numeric value types. This may
badly affect the results, as array elements with the fill value are replaced
by NA in the output xarray Dataset. For variables which accept ``0`` as a
possible (non-missing) value, it is highly recommended to explicitly provide
another ``fill_value``. Alternatively, it is possible to deactivate this
value masking behavior by setting the ``mask_and_scale=False`` option and
pass it via the ``decoding`` parameter of
:func:`~xarray.Dataset.xsimlab.run`.
13 changes: 13 additions & 0 deletions doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,23 @@ Release Notes
v0.5.0 (Unreleased)
-------------------

Breaking changes
~~~~~~~~~~~~~~~~

- Fill values are now masked (NA) when when loading the simulation output store
as a xarray Dataset (:issue:`148`). Note that Zarr sets the fill value to
``0`` by default for numeric data types, so it is highly recommended to
explicitly define another fill value in model variable encodings if ``0`` is
expected to be a valid (non-missing) data value, or alternatively use
``mask_and_scale=False`` in the ``decoding`` options passed to
:func:`xarray.Dataset.simlab.run`.

Bug fixes
~~~~~~~~~

- Fix saving output variables with dtype=object (:issue:`145`).
- Fix issues when saving output datasets to disk that were caused by the
``_FillValue`` attribute (:issue:`148`).

v0.4.1 (17 April 2020)
----------------------
Expand Down
2 changes: 2 additions & 0 deletions xsimlab/drivers.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ def __init__(
batch_dim=None,
store=None,
encoding=None,
decoding=None,
check_dims=CheckDimsOption.STRICT,
validate=ValidateOption.INPUTS,
hooks=None,
Expand Down Expand Up @@ -419,6 +420,7 @@ def __init__(
model,
zobject=store,
encoding=encoding,
decoding=decoding,
batch_dim=batch_dim,
lock=lock,
)
Expand Down
30 changes: 21 additions & 9 deletions xsimlab/stores.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ def __init__(
model: Model,
zobject: Optional[Union[zarr.Group, MutableMapping, str]] = None,
encoding: Optional[EncodingDict] = None,
decoding: Optional[Dict] = None,
batch_dim: Optional[str] = None,
lock: Optional[Any] = None,
):
Expand All @@ -128,6 +129,10 @@ def __init__(

if encoding is None:
encoding = {}
if decoding is None:
decoding = {}

self.decoding = decoding

self.var_info = _get_var_info(dataset, model, encoding)

Expand Down Expand Up @@ -213,10 +218,14 @@ def _create_zarr_dataset(
zkwargs.update(var_info["encoding"])

try:
# TODO: race condition? use lock?
zdataset = self.zgroup.create_dataset(name, **zkwargs)
except ValueError:
except ValueError as e:
# return early if already existing dataset (batches of simulations)
return
if name in self.zgroup.keys():
return
else:
raise e

# add dimension labels and variable attributes as metadata
dim_labels = None
Expand Down Expand Up @@ -342,15 +351,18 @@ def open_as_xr_dataset(self) -> xr.Dataset:
else:
chunks = "auto"

ds = xr.open_zarr(
self.zgroup.store,
group=self.zgroup.path,
chunks=chunks,
consolidated=self.consolidated,
# disable mask (not nice with zarr default fill_value=0)
mask_and_scale=False,
# overwrite decoding options
open_kwargs = self.decoding.copy()
open_kwargs.update(
{
"chunks": chunks,
"group": self.zgroup.path,
"consolidated": self.consolidated,
}
)

ds = xr.open_zarr(self.zgroup.store, **open_kwargs)

if self.in_memory:
# lazy loading may be confusing for the default, in-memory option
ds.load()
Expand Down
23 changes: 16 additions & 7 deletions xsimlab/tests/fixture_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@

@xs.process
class Profile:
u = xs.variable(dims="x", description="quantity u", intent="inout")
u = xs.variable(
dims="x",
description="quantity u",
intent="inout",
encoding={"fill_value": np.nan},
)
u_diffs = xs.group("diff")
u_opp = xs.on_demand(dims="x")

Expand Down Expand Up @@ -43,7 +48,7 @@ class InitProfile:
u = xs.foreign(Profile, "u", intent="out")

def initialize(self):
self.x = np.arange(self.n_points)
self.x = np.arange(self.n_points).astype("double")

self.u = np.zeros(self.n_points)
self.u[0] = 1.0
Expand All @@ -58,7 +63,9 @@ class Roll:
attrs={"units": "unitless"},
)
u = xs.foreign(Profile, "u")
u_diff = xs.variable(dims="x", groups="diff", intent="out")
u_diff = xs.variable(
dims="x", groups="diff", intent="out", encoding={"fill_value": np.nan}
)

def run_step(self):
self.u_diff = np.roll(self.u, self.shift) - self.u
Expand All @@ -69,21 +76,23 @@ class Add:
offset = xs.variable(
description=("offset * dt added every time step " "to profile u")
)
u_diff = xs.variable(groups="diff", intent="out")
u_diff = xs.variable(groups="diff", intent="out", encoding={"fill_value": np.nan})

@xs.runtime(args="step_delta")
def run_step(self, dt):
self.u_diff = self.offset * dt
self.u_diff = self.offset * dt * 1.0


@xs.process
class AddOnDemand:
offset = xs.variable(dims=[(), "x"], description="offset added to profile u")
u_diff = xs.on_demand(dims=[(), "x"], groups="diff")
u_diff = xs.on_demand(
dims=[(), "x"], groups="diff", encoding={"fill_value": np.nan}
)

@u_diff.compute
def _compute_u_diff(self):
return self.offset
return self.offset * 1.0


@pytest.fixture
Expand Down
4 changes: 0 additions & 4 deletions xsimlab/tests/test_drivers.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,6 @@ def test_run_model_get_results(self, in_dataset, out_dataset, xarray_driver):
xarray_driver.run_model()
out_ds_actual = xarray_driver.get_results()

# skip attributes added by xr.open_zarr from check
for xr_var in out_ds_actual.variables.values():
xr_var.attrs.pop("_FillValue", None)

assert out_ds_actual is not out_dataset
xr.testing.assert_identical(out_ds_actual.load(), out_dataset)

Expand Down
28 changes: 28 additions & 0 deletions xsimlab/tests/test_xr_accessor.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from math import nan
import pytest
from dask.distributed import Client
import xarray as xr
Expand Down Expand Up @@ -545,6 +546,33 @@ def run_step(self):
expected = xr.DataArray(data, dims=dims, coords=coords) * 2
xr.testing.assert_equal(out_ds["p__out_var"], expected)

@pytest.mark.parametrize(
"decoding,expected",
[
(None, [nan, nan]), # mask_and_scale=True by default
({"mask_and_scale": False}, [-1, -1]),
],
)
def test_run_decoding(self, decoding, expected):
@xs.process
class P:
var = xs.variable(dims="x", intent="out", encoding={"fill_value": -1})

def initialize(self):
self.var = [-1, -1]

m = xs.Model({"p": P})

in_ds = xs.create_setup(
model=m,
clocks={"clock": [0, 1]},
output_vars={"p__var": None},
)

out_ds = in_ds.xsimlab.run(model=m, decoding=decoding)

np.testing.assert_array_equal(out_ds.p__var, expected)


def test_create_setup(model, in_dataset):
expected = xr.Dataset()
Expand Down
7 changes: 6 additions & 1 deletion xsimlab/xr_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,7 @@ def run(
validate="inputs",
store=None,
encoding=None,
decoding=None,
hooks=None,
parallel=False,
scheduler=None,
Expand Down Expand Up @@ -730,6 +731,9 @@ def run(
model variables (see :func:`~xsimlab.variable` for a full list of
of options available). Additionally, 'chunks' and 'synchronizer'
options are supported here.
decoding : dict, optional
Options passed as keyword arguments to :func:`xarray.open_zarr` to load
the simulation outputs from the zarr store as a new xarray dataset.
hooks : list, optional
One or more runtime hooks, i.e., functions decorated with
:func:`~xsimlab.runtime_hook` or instances of
Expand All @@ -754,7 +758,7 @@ def run(
Returns
-------
output : Dataset
Another Dataset with both model inputs and outputs. The data is lazily
Another Dataset with both model inputs and outputs. The data is (lazily)
loaded from the zarr store used to save inputs and outputs.
Notes
Expand Down Expand Up @@ -806,6 +810,7 @@ def run(
batch_dim=batch_dim,
store=store,
encoding=encoding,
decoding=decoding,
check_dims=check_dims,
validate=validate,
hooks=hooks,
Expand Down

0 comments on commit b3e7e93

Please sign in to comment.