Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

better handling of non-cf-compliant time data #263

Merged
merged 3 commits into from
Jun 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 38 additions & 4 deletions tests/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,15 @@
"standard_name": "time",
},
)

# Time coordinate fixture whose units ("year A.D.") are not CF-compliant and
# are also not decodable by xcdat — used to exercise the "leave time encoded"
# fallback path (see test_non_cf_compliant_and_unsupported_time_is_not_decoded).
# NOTE(review): the values appear to be monthly midpoints expressed as year
# fractions (1850 + 1/24, stepping by 1/12) — confirm against the matching
# bounds fixture below.
time_non_cf_unsupported = xr.DataArray(
    data=np.arange(1850 + 1 / 24.0, 1851 + 3 / 12.0, 1 / 12.0),
    dims=["time"],
    attrs={
        "units": "year A.D.",
        "long_name": "time",
        "standard_name": "time",
    },
)
time_bnds = xr.DataArray(
name="time_bnds",
data=np.array(
Expand Down Expand Up @@ -104,6 +112,16 @@
dims=["time", "bnds"],
attrs={"xcdat_bounds": "True"},
)
# Bounds for the unsupported time axis: each cell spans half a step (1/24 of a
# year) on either side of its coordinate value. A comprehension replaces the
# original for/append loop (same values, idiomatic construction).
tb = [[t - 1 / 24.0, t + 1 / 24.0] for t in time_non_cf_unsupported]
time_bnds_non_cf_unsupported = xr.DataArray(
    name="time_bnds",
    data=tb,
    coords={"time": time_non_cf_unsupported},
    dims=["time", "bnds"],
    attrs={"is_generated": "True"},
)
Comment on lines +115 to +124
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I created test bounds for a dataset that is non-cf-compliant (but not handled by xcdat), but I didn't end up using these in the unit test I created (see below).


# LATITUDE
# ========
Expand Down Expand Up @@ -159,7 +177,9 @@
)


def generate_dataset(cf_compliant: bool, has_bounds: bool) -> xr.Dataset:
def generate_dataset(
cf_compliant: bool, has_bounds: bool, unsupported: bool = False
) -> xr.Dataset:
"""Generates a dataset using coordinate and data variable fixtures.

Parameters
Expand All @@ -169,12 +189,22 @@ def generate_dataset(cf_compliant: bool, has_bounds: bool) -> xr.Dataset:
has_bounds : bool, optional
Include bounds for coordinates. This also adds the "bounds" attribute
to existing coordinates to link them to their respective bounds.
unsupported : bool, optional
Create time units that are unsupported and cannot be decoded.
Note that cf_compliant must be set to False.

Returns
-------
xr.Dataset
Test dataset.
"""

if unsupported & cf_compliant:
raise ValueError(
"Cannot set cf_compliant=True and unsupported=True. \n"
"Set cf_compliant=False."
)

if has_bounds:
ds = xr.Dataset(
data_vars={
Expand All @@ -189,8 +219,12 @@ def generate_dataset(cf_compliant: bool, has_bounds: bool) -> xr.Dataset:
ds.coords["time"] = time_cf.copy()
ds["time_bnds"] = time_bnds.copy()
elif not cf_compliant:
ds.coords["time"] = time_non_cf.copy()
ds["time_bnds"] = time_bnds_non_cf.copy()
if unsupported:
ds.coords["time"] = time_non_cf_unsupported.copy()
ds["time_bnds"] = time_bnds_non_cf_unsupported.copy()
else:
ds.coords["time"] = time_non_cf.copy()
ds["time_bnds"] = time_bnds_non_cf.copy()

# If the "bounds" attribute is included in an existing DataArray and
# added to a new Dataset, it will get dropped. Therefore, it needs to be
Expand Down
10 changes: 10 additions & 0 deletions tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@ def test_non_cf_compliant_time_is_not_decoded(self):
expected = generate_dataset(cf_compliant=False, has_bounds=True)
assert result.identical(expected)

def test_non_cf_compliant_and_unsupported_time_is_not_decoded(self):
    # A dataset whose time units xcdat cannot decode should round-trip
    # unchanged even when decoding is requested.
    expected = generate_dataset(cf_compliant=False, has_bounds=True, unsupported=True)
    expected.to_netcdf(self.file_path)

    # decode_times=True, but the unsupported time axis must stay encoded.
    result = open_dataset(self.file_path, decode_times=True)

    assert result.identical(expected)

def test_non_cf_compliant_time_is_decoded(self):
ds = generate_dataset(cf_compliant=False, has_bounds=False)
ds.to_netcdf(self.file_path)
Expand Down
49 changes: 39 additions & 10 deletions xcdat/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,10 @@ def open_dataset(
the Dataset, by default True. Bounds are required for many xCDAT
features.
decode_times: bool, optional
If True, decode times encoded in the standard NetCDF datetime format
into datetime objects. Otherwise, leave them encoded as numbers.
This keyword may not be supported by all the backends, by default True.
If True, attempt to decode times encoded in the standard NetCDF
datetime format into datetime objects. Otherwise, leave them encoded
as numbers. This keyword may not be supported by all the backends,
by default True.
center_times: bool, optional
If True, center time coordinates using the midpoint between its upper
and lower bounds. Otherwise, use the provided time coordinates, by
Expand Down Expand Up @@ -84,6 +85,7 @@ def open_dataset(
if cf_compliant_time is False:
# XCDAT handles decoding time values with non-CF units.
ds = xr.open_dataset(path, decode_times=False, **kwargs)
# attempt to decode non-cf-compliant time axis
ds = decode_non_cf_time(ds)
else:
ds = xr.open_dataset(path, decode_times=True, **kwargs)
Expand Down Expand Up @@ -225,13 +227,14 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset:
numerically encoded time values (representing the offset from the reference
date) to pandas DateOffset objects. These offset values are added to the
reference date, forming DataArrays of datetime objects that replace the time
coordinate and time bounds (if they exist) values in the Dataset.
coordinate and time bounds (if they exist) in the Dataset.

Parameters
----------
dataset : xr.Dataset
Dataset with numerically encoded time coordinates and time bounds (if
they exist).
they exist). If the time coordinates cannot be decoded then the original
dataset is returned.

Returns
-------
Expand Down Expand Up @@ -304,7 +307,14 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset:
time = ds.cf["T"]
time_bounds = ds.get(time.attrs.get("bounds"), None)
units_attr = time.attrs.get("units")
units, ref_date = _split_time_units_attr(units_attr)

# If the time units cannot be split into a unit and reference date, it
# cannot be decoded so the original dateset is returned.
try:
units, ref_date = _split_time_units_attr(units_attr)
except ValueError:
return ds

ref_date = pd.to_datetime(ref_date)

data = [ref_date + pd.DateOffset(**{units: offset}) for offset in time.data]
Expand Down Expand Up @@ -403,7 +413,13 @@ def _has_cf_compliant_time(
return None

time = ds.cf["T"]
units = _split_time_units_attr(time.attrs.get("units"))[0]

# If the time units attr cannot be split, it is not cf_compliant.
try:
units = _split_time_units_attr(time.attrs.get("units"))[0]
except ValueError:
return False

cf_compliant = units not in NON_CF_TIME_UNITS

return cf_compliant
Expand Down Expand Up @@ -589,6 +605,7 @@ def _preprocess_non_cf_dataset(
if callable:
ds_new = callable(ds)

# Attempt to decode non-cf-compliant time axis.
ds_new = decode_non_cf_time(ds_new)

return ds_new
Expand All @@ -606,11 +623,23 @@ def _split_time_units_attr(units_attr: str) -> Tuple[str, str]:
-------
Tuple[str, str]
The units (e.g, "months") and the reference date (e.g., "1800-01-01").
If the units attribute doesn't exist for the time coordinates.

Raises
------
KeyError
If the time units attribute was not found.

ValueError
If the time units attribute is not of the form `X since Y`.
"""
if units_attr is None:
raise KeyError("No 'units' attribute found for the dataset's time coordinates.")
raise KeyError("The dataset's time coordinates does not have a 'units' attr.")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated a preexisting KeyError message


units, reference_date = units_attr.split(" since ")
if "since" in units_attr:
units, reference_date = units_attr.split(" since ")
else:
raise ValueError(
"This dataset does not have time coordinates of the form 'X since Y'."
)

return units, reference_date