From 3050a8dbad61dd30e964f035c0768923988babc9 Mon Sep 17 00:00:00 2001 From: Stephen Po-Chedley Date: Tue, 26 Jul 2022 10:34:33 -0700 Subject: [PATCH 01/12] initial solution for #278 add unit test add comments for test, cleanup extraneous code initial work on #282 Bugfix/278 cannot generate bounds (#281) * initial solution for #278 * add unit test * add comments for test, cleanup extraneous code --- conda-env/dev.yml | 3 ++- xcdat/dataset.py | 35 ++++++++++++++++++++++++++--------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/conda-env/dev.yml b/conda-env/dev.yml index 970c4a24..6a1136be 100644 --- a/conda-env/dev.yml +++ b/conda-env/dev.yml @@ -17,7 +17,8 @@ dependencies: - numpy=1.22.4 - pandas=1.4.3 - xarray=2022.3.0 - - xesmf=0.6.3 + - python-dateutil=2.8.2 + - types-python-dateutil=2.8.19 # Quality Assurance # ================== # If versions are updated, also update 'rev' in `.pre-commit.config.yaml` diff --git a/xcdat/dataset.py b/xcdat/dataset.py index d4806498..205ba9e5 100644 --- a/xcdat/dataset.py +++ b/xcdat/dataset.py @@ -4,8 +4,10 @@ from glob import glob from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union +import cftime import pandas as pd import xarray as xr +from dateutil import relativedelta as rd from xcdat import bounds # noqa: F401 from xcdat.axis import center_times as center_times_func @@ -315,9 +317,15 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: except ValueError: return ds - ref_date = pd.to_datetime(ref_date) + ref_date = pd.to_datetime(ref_date).to_pydatetime() + + if units == "months": + data = [ref_date + rd.relativedelta(months=offset) for offset in time.data] + data = [cftime.datetime(t.year, t.month, t.day) for t in data] + elif units == "years": + data = [ref_date + rd.relativedelta(years=offset) for offset in time.data] + data = [cftime.datetime(t.year, t.month, t.day) for t in data] - data = [ref_date + pd.DateOffset(**{units: offset}) for offset in time.data] decoded_time = xr.DataArray( name=time.name, data=data, @@ -335,13 +343,22 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: ds = ds.assign_coords({time.name: decoded_time}) if time_bounds is not None: - data_bounds = [ - [ - ref_date + pd.DateOffset(**{units: lower}), - ref_date + pd.DateOffset(**{units: upper}), + if units == "months": + data_bounds = [ + [ + ref_date + rd.relativedelta(months=lower), + ref_date + rd.relativedelta(months=upper), + ] + for [lower, upper] in time_bounds.data + ] + elif units == "years": + data_bounds = [ + [ + ref_date + rd.relativedelta(years=lower), + ref_date + rd.relativedelta(years=upper), + ] + for [lower, upper] in time_bounds.data ] - for [lower, upper] in time_bounds.data - ] decoded_time_bnds = xr.DataArray( name=time_bounds.name, data=data_bounds, @@ -429,7 +446,7 @@ def _postprocess_dataset( dataset: xr.Dataset, data_var: Optional[str] = None, center_times: bool = False, - add_bounds: bool = False, + add_bounds: bool = True, lon_orient: Optional[Tuple[float, float]] = None, ) -> xr.Dataset: """Post-processes a Dataset object. From a681787e366cdfa8464497946efdc5cbccb0e101 Mon Sep 17 00:00:00 2001 From: tomvothecoder Date: Thu, 28 Jul 2022 14:52:26 -0500 Subject: [PATCH 02/12] PR review refactor - Add `types-python-dateutil` to `mypy` dependencies - Update `ref_date` var to `ref_dt_obj` to avoid mypy error `error: Unsupported operand types for + ("str" and "relativedelta")` - Use dictionary unpacking for units variable - Use `datetime.strptime` instead of `pd.datetime()` which runs into the `pd.Timestamp` limitation - Add logger.warning when non-CF compliant time coords cannot be decoded --- .pre-commit-config.yaml | 8 +++- xcdat/dataset.py | 89 ++++++++++++++++++++--------------------- 2 files changed, 50 insertions(+), 47 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 380d33dd..f72a4fc0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,4 +35,10 @@ repos: - id: mypy args: ["--config=setup.cfg"] additional_dependencies: - [dask==2022.6.1, numpy==1.23.0, pandas==1.4.3, xarray==2022.3.0] + [ + dask==2022.6.1, + numpy==1.22.4, + pandas==1.4.3, + xarray==2022.3.0, + types-python-dateutil==2.8.2, + ] diff --git a/xcdat/dataset.py b/xcdat/dataset.py index 205ba9e5..75916c75 100644 --- a/xcdat/dataset.py +++ b/xcdat/dataset.py @@ -1,11 +1,11 @@ """Dataset module for functions related to an xarray.Dataset.""" import pathlib +from datetime import datetime from functools import partial from glob import glob from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union import cftime -import pandas as pd import xarray as xr from dateutil import relativedelta as rd @@ -222,14 +222,25 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: """Decodes time coordinates and time bounds with non-CF compliant units. By default, ``xarray`` uses the ``cftime`` module, which only supports - decoding time with [3]_ CF compliant units. This function fills the gap in + decoding time with CF compliant units [3]_. This function fills the gap in xarray by being able to decode time with non-CF compliant units such as - "months since ..." and "years since ...". It extracts the units and - reference date from the "units" attribute, which are used to convert the - numerically encoded time values (representing the offset from the reference - date) to pandas DateOffset objects. These offset values are added to the - reference date, forming DataArrays of datetime objects that replace the time - coordinate and time bounds (if they exist) in the Dataset. + "months since ..." and "years since ...". + + The steps include: + + 1. Extract ``units`` and ``reference_date`` strings from the "units" + attribute. + + * For example with "months since 1800-01-01", ``units="months"`` and + ``reference_date="1800-01-01"`` + + 2. Using the ``reference_date``, create a datetime object (``ref_dt_obj``). + 3. Starting from ``ref_dt_obj``, use the numerically encoded time coordinate + values (each representing an offset) to create an array of + ``cftime.datetime`` objects. + 4. Using the array of ``cftime.datetime`` objects, create a new xr.DataArray + of time coordinates to replace the numerically encoded ones. + 5. If it exists, create a time bounds DataArray using steps 3 and 4. Parameters ---------- @@ -246,23 +257,15 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: Notes ----- - The [4]_ pandas ``DateOffset`` object is a time duration relative to a - reference date that respects calendar arithmetic. This means it considers - CF calendar types with or without leap years when adding the offsets to the - reference date. - - DateOffset is used instead of timedelta64 because timedelta64 does not - respect calendar arithmetic. One downside of DateOffset (unlike timedelta64) - is that there is currently no simple way of vectorizing the addition of - DateOffset objects to Timestamp/datetime64 objects. However, the performance - of element-wise iteration should be sufficient for datasets that have - "months" and "years" time units since the number of time coordinates should - be small compared to "days" or "hours". + ``cftime.datetime`` objects are used to represent time coordinates because + it is not restricted by the ``pandas.Timestamp`` range (1678 and 2262). + Refer to [4]_ and [5]_ for more information on this limitation. References ----- .. [3] https://cfconventions.org/cf-conventions/cf-conventions.html#time-coordinate - .. [4] https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects + .. [4] https://docs.xarray.dev/en/stable/user-guide/weather-climate.html#non-standard-calendars-and-dates-outside-the-timestamp-valid-range + .. [5] https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations Examples -------- @@ -310,21 +313,23 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: time_bounds = ds.get(time.attrs.get("bounds"), None) units_attr = time.attrs.get("units") - # If the time units cannot be split into a unit and reference date, it - # cannot be decoded so the original dateset is returned. try: units, ref_date = _split_time_units_attr(units_attr) except ValueError: + logger.warning( + "Attempted to decode non-CF compliant time coordinates, but the units " + f"({units_attr}) is not in a supported format ('months since...' or " + "'months since...'). Returning dataset with existing time coordinates." + ) return ds - ref_date = pd.to_datetime(ref_date).to_pydatetime() - - if units == "months": - data = [ref_date + rd.relativedelta(months=offset) for offset in time.data] - data = [cftime.datetime(t.year, t.month, t.day) for t in data] - elif units == "years": - data = [ref_date + rd.relativedelta(years=offset) for offset in time.data] - data = [cftime.datetime(t.year, t.month, t.day) for t in data] + # Create a reference date object and use the relative delta offsets + # to create `cftime.datetime` objects. Note, the reference date object type + # must start as `datetime` because `cftime` does not support arithmetic with + # `rd.relativedelta`. + ref_dt_obj = datetime.strptime(ref_date, "%Y-%m-%d") + data = [ref_dt_obj + rd.relativedelta(**{units: offset}) for offset in time.data] + data = [cftime.datetime(t.year, t.month, t.day) for t in data] decoded_time = xr.DataArray( name=time.name, @@ -343,22 +348,14 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: ds = ds.assign_coords({time.name: decoded_time}) if time_bounds is not None: - if units == "months": - data_bounds = [ - [ - ref_date + rd.relativedelta(months=lower), - ref_date + rd.relativedelta(months=upper), - ] - for [lower, upper] in time_bounds.data - ] - elif units == "years": - data_bounds = [ - [ - ref_date + rd.relativedelta(years=lower), - ref_date + rd.relativedelta(years=upper), - ] - for [lower, upper] in time_bounds.data + data_bounds = [ + [ + ref_dt_obj + rd.relativedelta(**{units: lower}), + ref_dt_obj + rd.relativedelta(**{units: upper}), ] + for [lower, upper] in time_bounds.data + ] + decoded_time_bnds = xr.DataArray( name=time_bounds.name, data=data_bounds, From 4c33bf912ab3c0bc8b378da8a2cf8a31e59f5750 Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Thu, 28 Jul 2022 15:31:51 -0700 Subject: [PATCH 03/12] Consider calendar type when decoding non-Cf time - Fix tests --- tests/test_dataset.py | 442 +++++++++++++++++++++++++++--------------- xcdat/dataset.py | 28 ++- 2 files changed, 315 insertions(+), 155 deletions(-) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index ed838095..35d91121 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1,6 +1,7 @@ import pathlib import warnings +import cftime import numpy as np import pytest import xarray as xr @@ -55,59 +56,86 @@ def test_non_cf_compliant_time_is_decoded(self): # Generate an expected dataset with decoded non-CF compliant time units. expected = generate_dataset(cf_compliant=True, has_bounds=True) - expected_time_data = np.array( - [ - "2000-01-01T00:00:00.000000000", - "2000-02-01T00:00:00.000000000", - "2000-03-01T00:00:00.000000000", - "2000-04-01T00:00:00.000000000", - "2000-05-01T00:00:00.000000000", - "2000-06-01T00:00:00.000000000", - "2000-07-01T00:00:00.000000000", - "2000-08-01T00:00:00.000000000", - "2000-09-01T00:00:00.000000000", - "2000-10-01T00:00:00.000000000", - "2000-11-01T00:00:00.000000000", - "2000-12-01T00:00:00.000000000", - "2001-01-01T00:00:00.000000000", - "2001-02-01T00:00:00.000000000", - "2001-03-01T00:00:00.000000000", - ], - dtype="datetime64[ns]", - ) expected["time"] = xr.DataArray( name="time", - data=expected_time_data, + data=np.array( + [ + cftime.datetime(2000, 1, 1), + cftime.datetime(2000, 2, 1), + cftime.datetime(2000, 3, 1), + cftime.datetime(2000, 4, 1), + cftime.datetime(2000, 5, 1), + cftime.datetime(2000, 6, 1), + cftime.datetime(2000, 7, 1), + cftime.datetime(2000, 8, 1), + cftime.datetime(2000, 9, 1), + cftime.datetime(2000, 10, 1), + cftime.datetime(2000, 11, 1), + cftime.datetime(2000, 12, 1), + cftime.datetime(2001, 1, 1), + cftime.datetime(2001, 2, 1), + cftime.datetime(2001, 3, 1), + ], + ), dims="time", - attrs={ - "units": "months since 2000-01-01", - "calendar": "standard", - "axis": "T", - "long_name": "time", - "standard_name": "time", - "bounds": "time_bnds", - }, ) - expected.time_bnds.data[:] = np.array( - [ - ["1999-12-16T12:00:00.000000000", "2000-01-16T12:00:00.000000000"], - ["2000-01-16T12:00:00.000000000", "2000-02-15T12:00:00.000000000"], - ["2000-02-15T12:00:00.000000000", "2000-03-16T12:00:00.000000000"], - ["2000-03-16T12:00:00.000000000", "2000-04-16T00:00:00.000000000"], - ["2000-04-16T00:00:00.000000000", "2000-05-16T12:00:00.000000000"], - ["2000-05-16T12:00:00.000000000", "2000-06-16T00:00:00.000000000"], - ["2000-06-16T00:00:00.000000000", "2000-07-16T12:00:00.000000000"], - ["2000-07-16T12:00:00.000000000", "2000-08-16T12:00:00.000000000"], - ["2000-08-16T12:00:00.000000000", "2000-09-16T00:00:00.000000000"], - ["2000-09-16T00:00:00.000000000", "2000-10-16T12:00:00.000000000"], - ["2000-10-16T12:00:00.000000000", "2000-11-16T00:00:00.000000000"], - ["2000-11-16T00:00:00.000000000", "2000-12-16T12:00:00.000000000"], - ["2000-12-16T12:00:00.000000000", "2001-01-16T12:00:00.000000000"], - ["2001-01-16T12:00:00.000000000", "2001-02-15T00:00:00.000000000"], - ["2001-02-15T00:00:00.000000000", "2001-03-15T00:00:00.000000000"], - ], - dtype="datetime64[ns]", + expected["time_bnds"] = xr.DataArray( + name="time_bnds", + data=np.array( + [ + [ + cftime.datetime(1999, 12, 16, 12), + cftime.datetime(2000, 1, 16, 12), + ], + [ + cftime.datetime(2000, 1, 16, 12), + cftime.datetime(2000, 2, 15, 12), + ], + [ + cftime.datetime(2000, 2, 15, 12), + cftime.datetime(2000, 3, 16, 12), + ], + [cftime.datetime(2000, 3, 16, 12), cftime.datetime(2000, 4, 16, 0)], + [cftime.datetime(2000, 4, 16, 0), cftime.datetime(2000, 5, 16, 12)], + [cftime.datetime(2000, 5, 16, 12), cftime.datetime(2000, 6, 16, 0)], + [cftime.datetime(2000, 6, 16, 0), cftime.datetime(2000, 7, 16, 12)], + [ + cftime.datetime(2000, 7, 16, 12), + cftime.datetime(2000, 8, 16, 12), + ], + [cftime.datetime(2000, 8, 16, 12), cftime.datetime(2000, 9, 16, 0)], + [ + cftime.datetime(2000, 9, 16, 0), + cftime.datetime(2000, 10, 16, 12), + ], + [ + cftime.datetime(2000, 10, 16, 12), + cftime.datetime(2000, 11, 16, 0), + ], + [ + cftime.datetime(2000, 11, 16, 0), + cftime.datetime(2000, 12, 16, 12), + ], + [ + cftime.datetime(2000, 12, 16, 12), + cftime.datetime(2001, 1, 16, 12), + ], + [cftime.datetime(2001, 1, 16, 12), cftime.datetime(2001, 2, 15, 0)], + [cftime.datetime(2001, 2, 15, 0), cftime.datetime(2001, 3, 15, 0)], + ] + ), + dims=["time", "bnds"], + attrs={"xcdat_bounds": "True"}, ) + + expected.time.attrs = { + "units": "months since 2000-01-01", + "calendar": "standard", + "axis": "T", + "long_name": "time", + "standard_name": "time", + "bounds": "time_bnds", + } expected.time.encoding = { # Set source as result source because it changes every test run. "source": result.time.encoding["source"], @@ -182,67 +210,91 @@ def test_non_cf_compliant_time_is_decoded(self): ds1.to_netcdf(self.file_path1) ds2.to_netcdf(self.file_path2) - result = open_mfdataset( - [self.file_path1, self.file_path2], - data_var="ts", - ) + result = open_mfdataset([self.file_path1, self.file_path2], data_var="ts") - # Generate an expected dataset, which is a combination of both datasets - # with decoded time units and coordinate bounds. + # Generate an expected dataset with decoded non-CF compliant time units. expected = generate_dataset(cf_compliant=True, has_bounds=True) - expected_time_data = np.array( - [ - "2000-01-01T00:00:00.000000000", - "2000-02-01T00:00:00.000000000", - "2000-03-01T00:00:00.000000000", - "2000-04-01T00:00:00.000000000", - "2000-05-01T00:00:00.000000000", - "2000-06-01T00:00:00.000000000", - "2000-07-01T00:00:00.000000000", - "2000-08-01T00:00:00.000000000", - "2000-09-01T00:00:00.000000000", - "2000-10-01T00:00:00.000000000", - "2000-11-01T00:00:00.000000000", - "2000-12-01T00:00:00.000000000", - "2001-01-01T00:00:00.000000000", - "2001-02-01T00:00:00.000000000", - "2001-03-01T00:00:00.000000000", - ], - dtype="datetime64[ns]", - ) + expected["time"] = xr.DataArray( name="time", - data=expected_time_data, + data=np.array( + [ + cftime.datetime(2000, 1, 1), + cftime.datetime(2000, 2, 1), + cftime.datetime(2000, 3, 1), + cftime.datetime(2000, 4, 1), + cftime.datetime(2000, 5, 1), + cftime.datetime(2000, 6, 1), + cftime.datetime(2000, 7, 1), + cftime.datetime(2000, 8, 1), + cftime.datetime(2000, 9, 1), + cftime.datetime(2000, 10, 1), + cftime.datetime(2000, 11, 1), + cftime.datetime(2000, 12, 1), + cftime.datetime(2001, 1, 1), + cftime.datetime(2001, 2, 1), + cftime.datetime(2001, 3, 1), + ], + ), dims="time", - attrs={ - "units": "months since 2000-01-01", - "calendar": "standard", - "axis": "T", - "long_name": "time", - "standard_name": "time", - "bounds": "time_bnds", - }, ) - expected.time_bnds.data[:] = np.array( - [ - ["1999-12-16T12:00:00.000000000", "2000-01-16T12:00:00.000000000"], - ["2000-01-16T12:00:00.000000000", "2000-02-15T12:00:00.000000000"], - ["2000-02-15T12:00:00.000000000", "2000-03-16T12:00:00.000000000"], - ["2000-03-16T12:00:00.000000000", "2000-04-16T00:00:00.000000000"], - ["2000-04-16T00:00:00.000000000", "2000-05-16T12:00:00.000000000"], - ["2000-05-16T12:00:00.000000000", "2000-06-16T00:00:00.000000000"], - ["2000-06-16T00:00:00.000000000", "2000-07-16T12:00:00.000000000"], - ["2000-07-16T12:00:00.000000000", "2000-08-16T12:00:00.000000000"], - ["2000-08-16T12:00:00.000000000", "2000-09-16T00:00:00.000000000"], - ["2000-09-16T00:00:00.000000000", "2000-10-16T12:00:00.000000000"], - ["2000-10-16T12:00:00.000000000", "2000-11-16T00:00:00.000000000"], - ["2000-11-16T00:00:00.000000000", "2000-12-16T12:00:00.000000000"], - ["2000-12-16T12:00:00.000000000", "2001-01-16T12:00:00.000000000"], - ["2001-01-16T12:00:00.000000000", "2001-02-15T00:00:00.000000000"], - ["2001-02-15T00:00:00.000000000", "2001-03-15T00:00:00.000000000"], - ], - dtype="datetime64[ns]", + expected["time_bnds"] = xr.DataArray( + name="time_bnds", + data=np.array( + [ + [ + cftime.datetime(1999, 12, 16, 12), + cftime.datetime(2000, 1, 16, 12), + ], + [ + cftime.datetime(2000, 1, 16, 12), + cftime.datetime(2000, 2, 15, 12), + ], + [ + cftime.datetime(2000, 2, 15, 12), + cftime.datetime(2000, 3, 16, 12), + ], + [cftime.datetime(2000, 3, 16, 12), cftime.datetime(2000, 4, 16, 0)], + [cftime.datetime(2000, 4, 16, 0), cftime.datetime(2000, 5, 16, 12)], + [cftime.datetime(2000, 5, 16, 12), cftime.datetime(2000, 6, 16, 0)], + [cftime.datetime(2000, 6, 16, 0), cftime.datetime(2000, 7, 16, 12)], + [ + cftime.datetime(2000, 7, 16, 12), + cftime.datetime(2000, 8, 16, 12), + ], + [cftime.datetime(2000, 8, 16, 12), cftime.datetime(2000, 9, 16, 0)], + [ + cftime.datetime(2000, 9, 16, 0), + cftime.datetime(2000, 10, 16, 12), + ], + [ + cftime.datetime(2000, 10, 16, 12), + cftime.datetime(2000, 11, 16, 0), + ], + [ + cftime.datetime(2000, 11, 16, 0), + cftime.datetime(2000, 12, 16, 12), + ], + [ + cftime.datetime(2000, 12, 16, 12), + cftime.datetime(2001, 1, 16, 12), + ], + [cftime.datetime(2001, 1, 16, 12), cftime.datetime(2001, 2, 15, 0)], + [cftime.datetime(2001, 2, 15, 0), cftime.datetime(2001, 3, 15, 0)], + ] + ), + dims=["time", "bnds"], + attrs={"xcdat_bounds": "True"}, ) + + expected.time.attrs = { + "units": "months since 2000-01-01", + "calendar": "standard", + "axis": "T", + "long_name": "time", + "standard_name": "time", + "bounds": "time_bnds", + } expected.time.encoding = { # Set source as result source because it changes every test run. "source": result.time.encoding["source"], @@ -383,7 +435,7 @@ def setup(self): "axis": "T", "long_name": "time", "standard_name": "time", - "calendar": "noleap", + # calendar attr is specified by test. }, ) time_bnds = xr.DataArray( @@ -414,6 +466,8 @@ def test_raises_error_if_function_is_called_on_already_decoded_cf_compliant_data def test_decodes_months_with_a_reference_date_at_the_start_of_the_month(self): ds = self.ds.copy() + calendar = "standard" + ds.time.attrs["calendar"] = calendar ds.time.attrs["units"] = "months since 2000-01-01" result = decode_non_cf_time(ds) @@ -422,8 +476,11 @@ def test_decodes_months_with_a_reference_date_at_the_start_of_the_month(self): "time": xr.DataArray( name="time", data=np.array( - ["2000-02-01", "2000-03-01", "2000-04-01"], - dtype="datetime64", + [ + cftime.datetime(2000, 2, 1, calendar=calendar), + cftime.datetime(2000, 3, 1, calendar=calendar), + cftime.datetime(2000, 4, 1, calendar=calendar), + ], ), dims=["time"], attrs=ds.time.attrs, @@ -432,11 +489,19 @@ def test_decodes_months_with_a_reference_date_at_the_start_of_the_month(self): name="time_bnds", data=np.array( [ - ["2000-01-01", "2000-02-01"], - ["2000-02-01", "2000-03-01"], - ["2000-03-01", "2000-04-01"], + [ + cftime.datetime(2000, 1, 1, calendar=calendar), + cftime.datetime(2000, 2, 1, calendar=calendar), + ], + [ + cftime.datetime(2000, 2, 1, calendar=calendar), + cftime.datetime(2000, 3, 1, calendar=calendar), + ], + [ + cftime.datetime(2000, 3, 1, calendar=calendar), + cftime.datetime(2000, 4, 1, calendar=calendar), + ], ], - dtype="datetime64", ), dims=["time", "bnds"], attrs=ds.time_bnds.attrs, @@ -450,7 +515,7 @@ def test_decodes_months_with_a_reference_date_at_the_start_of_the_month(self): "dtype": np.dtype(np.int64), "original_shape": expected.time.data.shape, "units": ds.time.attrs["units"], - "calendar": ds.time.attrs["calendar"], + "calendar": calendar, } expected.time_bnds.encoding = ds.time_bnds.encoding assert result.time.encoding == expected.time.encoding @@ -458,6 +523,8 @@ def test_decodes_months_with_a_reference_date_at_the_start_of_the_month(self): def test_decodes_months_with_a_reference_date_at_the_middle_of_the_month(self): ds = self.ds.copy() + calendar = "standard" + ds.time.attrs["calendar"] = calendar ds.time.attrs["units"] = "months since 2000-01-15" result = decode_non_cf_time(ds) @@ -466,8 +533,11 @@ def test_decodes_months_with_a_reference_date_at_the_middle_of_the_month(self): "time": xr.DataArray( name="time", data=np.array( - ["2000-02-15", "2000-03-15", "2000-04-15"], - dtype="datetime64", + [ + cftime.datetime(2000, 2, 15, calendar=calendar), + cftime.datetime(2000, 3, 15, calendar=calendar), + cftime.datetime(2000, 4, 15, calendar=calendar), + ], ), dims=["time"], attrs=ds.time.attrs, @@ -476,11 +546,19 @@ def test_decodes_months_with_a_reference_date_at_the_middle_of_the_month(self): name="time_bnds", data=np.array( [ - ["2000-01-15", "2000-02-15"], - ["2000-02-15", "2000-03-15"], - ["2000-03-15", "2000-04-15"], + [ + cftime.datetime(2000, 1, 15, calendar=calendar), + cftime.datetime(2000, 2, 15, calendar=calendar), + ], + [ + cftime.datetime(2000, 2, 15, calendar=calendar), + cftime.datetime(2000, 3, 15, calendar=calendar), + ], + [ + cftime.datetime(2000, 3, 15, calendar=calendar), + cftime.datetime(2000, 4, 15, calendar=calendar), + ], ], - dtype="datetime64", ), dims=["time", "bnds"], attrs=ds.time_bnds.attrs, @@ -502,6 +580,8 @@ def test_decodes_months_with_a_reference_date_at_the_middle_of_the_month(self): def test_decodes_months_with_a_reference_date_at_the_end_of_the_month(self): ds = self.ds.copy() + calendar = "standard" + ds.time.attrs["calendar"] = calendar ds.time.attrs["units"] = "months since 1999-12-31" result = decode_non_cf_time(ds) @@ -510,8 +590,11 @@ def test_decodes_months_with_a_reference_date_at_the_end_of_the_month(self): "time": xr.DataArray( name="time", data=np.array( - ["2000-01-31", "2000-02-29", "2000-03-31"], - dtype="datetime64", + [ + cftime.datetime(2000, 1, 31, calendar=calendar), + cftime.datetime(2000, 2, 29, calendar=calendar), + cftime.datetime(2000, 3, 31, calendar=calendar), + ], ), dims=["time"], attrs=ds.time.attrs, @@ -520,11 +603,19 @@ def test_decodes_months_with_a_reference_date_at_the_end_of_the_month(self): name="time_bnds", data=np.array( [ - ["1999-12-31", "2000-01-31"], - ["2000-01-31", "2000-02-29"], - ["2000-02-29", "2000-03-31"], + [ + cftime.datetime(1999, 12, 31, calendar=calendar), + cftime.datetime(2000, 1, 31, calendar=calendar), + ], + [ + cftime.datetime(2000, 1, 31, calendar=calendar), + cftime.datetime(2000, 2, 29, calendar=calendar), + ], + [ + cftime.datetime(2000, 2, 29, calendar=calendar), + cftime.datetime(2000, 3, 31, calendar=calendar), + ], ], - dtype="datetime64", ), dims=["time", "bnds"], attrs=ds.time_bnds.attrs, @@ -546,16 +637,22 @@ def test_decodes_months_with_a_reference_date_at_the_end_of_the_month(self): def test_decodes_months_with_a_reference_date_on_a_leap_year(self): ds = self.ds.copy() + calendar = "standard" + ds.time.attrs["calendar"] = calendar ds.time.attrs["units"] = "months since 2000-02-29" result = decode_non_cf_time(ds) + expected = xr.Dataset( { "time": xr.DataArray( name="time", data=np.array( - ["2000-03-29", "2000-04-29", "2000-05-29"], - dtype="datetime64", + [ + cftime.datetime(2000, 3, 29, calendar=calendar), + cftime.datetime(2000, 4, 29, calendar=calendar), + cftime.datetime(2000, 5, 29, calendar=calendar), + ], ), dims=["time"], attrs=ds.time.attrs, @@ -564,11 +661,19 @@ def test_decodes_months_with_a_reference_date_on_a_leap_year(self): name="time_bnds", data=np.array( [ - ["2000-02-29", "2000-03-29"], - ["2000-03-29", "2000-04-29"], - ["2000-04-29", "2000-05-29"], + [ + cftime.datetime(2000, 2, 29, calendar=calendar), + cftime.datetime(2000, 3, 29, calendar=calendar), + ], + [ + cftime.datetime(2000, 3, 29, calendar=calendar), + cftime.datetime(2000, 4, 29, calendar=calendar), + ], + [ + cftime.datetime(2000, 4, 29, calendar=calendar), + cftime.datetime(2000, 5, 29, calendar=calendar), + ], ], - dtype="datetime64", ), dims=["time", "bnds"], attrs=ds.time_bnds.attrs, @@ -590,16 +695,23 @@ def test_decodes_months_with_a_reference_date_on_a_leap_year(self): def test_decodes_years_with_a_reference_date_at_the_middle_of_the_year(self): ds = self.ds.copy() + + calendar = "standard" + ds.time.attrs["calendar"] = calendar ds.time.attrs["units"] = "years since 2000-06-01" result = decode_non_cf_time(ds) + expected = xr.Dataset( { "time": xr.DataArray( name="time", data=np.array( - ["2001-06-01", "2002-06-01", "2003-06-01"], - dtype="datetime64", + [ + cftime.datetime(2001, 6, 1, calendar=calendar), + cftime.datetime(2002, 6, 1, calendar=calendar), + cftime.datetime(2003, 6, 1, calendar=calendar), + ], ), dims=["time"], attrs=ds.time.attrs, @@ -608,11 +720,19 @@ def test_decodes_years_with_a_reference_date_at_the_middle_of_the_year(self): name="time_bnds", data=np.array( [ - ["2000-06-01", "2001-06-01"], - ["2001-06-01", "2002-06-01"], - ["2002-06-01", "2003-06-01"], + [ + cftime.datetime(2000, 6, 1, calendar=calendar), + cftime.datetime(2001, 6, 1, calendar=calendar), + ], + [ + cftime.datetime(2001, 6, 1, calendar=calendar), + cftime.datetime(2002, 6, 1, calendar=calendar), + ], + [ + cftime.datetime(2002, 6, 1, calendar=calendar), + cftime.datetime(2003, 6, 1, calendar=calendar), + ], ], - dtype="datetime64", ), dims=["time", "bnds"], attrs=ds.time_bnds.attrs, @@ -634,36 +754,50 @@ def test_decodes_years_with_a_reference_date_at_the_middle_of_the_year(self): def test_decodes_years_with_a_reference_date_on_a_leap_year(self): ds = self.ds.copy() + + calendar = "standard" + ds.time.attrs["calendar"] = calendar ds.time.attrs["units"] = "years since 2000-02-29" result = decode_non_cf_time(ds) + expected = xr.Dataset( { "time": xr.DataArray( name="time", - data=[ - np.datetime64("2001-02-28"), - np.datetime64("2002-02-28"), - np.datetime64("2003-02-28"), - ], + data=np.array( + [ + cftime.datetime(2001, 2, 28, calendar=calendar), + cftime.datetime(2002, 2, 28, calendar=calendar), + cftime.datetime(2003, 2, 28, calendar=calendar), + ], + ), dims=["time"], + attrs=ds.time.attrs, ), "time_bnds": xr.DataArray( name="time_bnds", data=np.array( [ - ["2000-02-29", "2001-02-28"], - ["2001-02-28", "2002-02-28"], - ["2002-02-28", "2003-02-28"], + [ + cftime.datetime(2000, 2, 29, calendar=calendar), + cftime.datetime(2001, 2, 28, calendar=calendar), + ], + [ + cftime.datetime(2001, 2, 28, calendar=calendar), + cftime.datetime(2002, 2, 28, calendar=calendar), + ], + [ + cftime.datetime(2002, 2, 28, calendar=calendar), + cftime.datetime(2003, 2, 28, calendar=calendar), + ], ], - dtype="datetime64", ), dims=["time", "bnds"], attrs=ds.time_bnds.attrs, ), } ) - expected.time.attrs = ds.time.attrs assert result.identical(expected) expected.time.encoding = { @@ -934,23 +1068,27 @@ def callable(ds): expected = ds.copy().isel(time=slice(0, 1)) expected["time"] = xr.DataArray( name="time", - data=np.array( - ["2000-01-01"], - dtype="datetime64", - ), + data=np.array([cftime.datetime(2000, 1, 1)]), dims=["time"], ) expected["time_bnds"] = xr.DataArray( name="time_bnds", data=np.array( - [["1999-12-01", "2000-01-01"]], - dtype="datetime64", + [[cftime.datetime(1999, 12, 1), cftime.datetime(2000, 1, 1)]], ), dims=["time", "bnds"], ) + expected.time.attrs = { + "units": "months since 2000-01-01", + "calendar": "standard", + "axis": "T", + "long_name": "time", + "standard_name": "time", + "bounds": "time_bnds", + } + + expected.time_bnds.attrs = {"xcdat_bounds": "True"} - expected.time.attrs = ds.time.attrs - expected.time_bnds.attrs = ds.time_bnds.attrs assert result.identical(expected) diff --git a/xcdat/dataset.py b/xcdat/dataset.py index 75916c75..96f94435 100644 --- a/xcdat/dataset.py +++ b/xcdat/dataset.py @@ -313,13 +313,21 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: time_bounds = ds.get(time.attrs.get("bounds"), None) units_attr = time.attrs.get("units") + # Default to standard if no calendar is set because "if the calendar kwarg + # is set to a blank string (‘’) or None (the default is ‘standard’) the + # instance will not be calendar-aware and some methods will not work". + # Source: https://unidata.github.io/cftime/api.html#cftime.datetime + # TODO: Maybe a logger warning should be added if a calendar attribute + # doesn't exist? + calendar_attr = time.attrs.get("calendar", "standard") + try: units, ref_date = _split_time_units_attr(units_attr) except ValueError: logger.warning( "Attempted to decode non-CF compliant time coordinates, but the units " - f"({units_attr}) is not in a supported format ('months since...' or " - "'months since...'). Returning dataset with existing time coordinates." + f"attr ('{units_attr}') is not in a supported format ('months since...' or " + "'years since...'). Returning dataset with the original time coordinates." ) return ds @@ -329,7 +337,9 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: # `rd.relativedelta`. ref_dt_obj = datetime.strptime(ref_date, "%Y-%m-%d") data = [ref_dt_obj + rd.relativedelta(**{units: offset}) for offset in time.data] - data = [cftime.datetime(t.year, t.month, t.day) for t in data] + data = [ + cftime.datetime(t.year, t.month, t.day, calendar=calendar_attr) for t in data + ] decoded_time = xr.DataArray( name=time.name, @@ -356,6 +366,18 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: for [lower, upper] in time_bounds.data ] + data_bounds = [ + [ + cftime.datetime( + lower.year, lower.month, lower.day, calendar=calendar_attr + ), + cftime.datetime( + upper.year, upper.month, upper.day, calendar=calendar_attr + ), + ] + for [lower, upper] in data_bounds + ] + decoded_time_bnds = xr.DataArray( name=time_bounds.name, data=data_bounds, From ba66aeffa938789f260e2392ac573b460436628d Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Thu, 28 Jul 2022 15:51:09 -0700 Subject: [PATCH 04/12] Update xcdat/dataset.py Co-authored-by: pochedls --- xcdat/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xcdat/dataset.py b/xcdat/dataset.py index 96f94435..5acaf56d 100644 --- a/xcdat/dataset.py +++ b/xcdat/dataset.py @@ -8,6 +8,7 @@ import cftime import xarray as xr from dateutil import relativedelta as rd +from dateutil import parser from xcdat import bounds # noqa: F401 from xcdat.axis import center_times as center_times_func From 4d01344f24858979cfed8235a18fa6b3ae807578 Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Thu, 28 Jul 2022 15:51:19 -0700 Subject: [PATCH 05/12] Update xcdat/dataset.py Co-authored-by: pochedls --- xcdat/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xcdat/dataset.py b/xcdat/dataset.py index 5acaf56d..97e81ec8 100644 --- a/xcdat/dataset.py +++ b/xcdat/dataset.py @@ -336,7 +336,7 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: # to create `cftime.datetime` objects. Note, the reference date object type # must start as `datetime` because `cftime` does not support arithmetic with # `rd.relativedelta`. - ref_dt_obj = datetime.strptime(ref_date, "%Y-%m-%d") + ref_dt_obj = parser.parse(ref_date, default=datetime.datetime(2000, 1, 1)) data = [ref_dt_obj + rd.relativedelta(**{units: offset}) for offset in time.data] data = [ cftime.datetime(t.year, t.month, t.day, calendar=calendar_attr) for t in data From 4fb0656ee92f50e339a8307b667d34e8bdd6b6ef Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Thu, 28 Jul 2022 15:54:32 -0700 Subject: [PATCH 06/12] Fix datetime reference --- xcdat/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xcdat/dataset.py b/xcdat/dataset.py index 97e81ec8..681cfb30 100644 --- a/xcdat/dataset.py +++ b/xcdat/dataset.py @@ -7,8 +7,8 @@ import cftime import xarray as xr -from dateutil import relativedelta as rd from dateutil import parser +from dateutil import relativedelta as rd from xcdat import bounds # noqa: F401 from xcdat.axis import center_times as center_times_func @@ -336,7 +336,7 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: # to create `cftime.datetime` objects. Note, the reference date object type # must start as `datetime` because `cftime` does not support arithmetic with # `rd.relativedelta`. - ref_dt_obj = parser.parse(ref_date, default=datetime.datetime(2000, 1, 1)) + ref_dt_obj = parser.parse(ref_date, default=datetime(2000, 1, 1)) data = [ref_dt_obj + rd.relativedelta(**{units: offset}) for offset in time.data] data = [ cftime.datetime(t.year, t.month, t.day, calendar=calendar_attr) for t in data From e0760bd923c5889c774c9cdb9f91c665f18f39dd Mon Sep 17 00:00:00 2001 From: Stephen Po-Chedley Date: Thu, 28 Jul 2022 18:24:08 -0700 Subject: [PATCH 07/12] return more specific cftime datetime types --- xcdat/dataset.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/xcdat/dataset.py b/xcdat/dataset.py index 681cfb30..1148d6e7 100644 --- a/xcdat/dataset.py +++ b/xcdat/dataset.py @@ -321,6 +321,7 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: # TODO: Maybe a logger warning should be added if a calendar attribute # doesn't exist? calendar_attr = time.attrs.get("calendar", "standard") + cf_calendar_type = _get_date_type(calendar_attr) try: units, ref_date = _split_time_units_attr(units_attr) @@ -339,7 +340,7 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: ref_dt_obj = parser.parse(ref_date, default=datetime(2000, 1, 1)) data = [ref_dt_obj + rd.relativedelta(**{units: offset}) for offset in time.data] data = [ - cftime.datetime(t.year, t.month, t.day, calendar=calendar_attr) for t in data + cf_calendar_type(t.year, t.month, t.day, calendar=calendar_attr) for t in data ] decoded_time = xr.DataArray( @@ -369,10 +370,10 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: data_bounds = [ [ - cftime.datetime( + cf_calendar_type( lower.year, lower.month, lower.day, calendar=calendar_attr ), - cftime.datetime( + cf_calendar_type( upper.year, upper.month, upper.day, calendar=calendar_attr ), ] @@ -680,3 +681,21 @@ def _split_time_units_attr(units_attr: str) -> Tuple[str, str]: ) return units, reference_date + + +def _get_date_type(calendar): + """Return the cftime date type for a given calendar name. + https://github.com/pydata/xarray/blob/main/xarray/coding/cftime_offsets.py#L68 + """ + calendars = { + "noleap": cftime.DatetimeNoLeap, + "360_day": cftime.Datetime360Day, + "365_day": cftime.DatetimeNoLeap, + "366_day": cftime.DatetimeAllLeap, + "gregorian": cftime.DatetimeGregorian, + "proleptic_gregorian": cftime.DatetimeProlepticGregorian, + "julian": cftime.DatetimeJulian, + "all_leap": cftime.DatetimeAllLeap, + "standard": cftime.DatetimeGregorian, + } + return calendars[calendar] From 597ae3086473587990de1df55d701647221c3f93 Mon Sep 17 00:00:00 2001 From: Stephen Po-Chedley Date: Thu, 28 Jul 2022 18:29:11 -0700 Subject: [PATCH 08/12] removing extraneous calendar specification --- xcdat/dataset.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/xcdat/dataset.py b/xcdat/dataset.py index 1148d6e7..7721db21 100644 --- a/xcdat/dataset.py +++ b/xcdat/dataset.py @@ -339,9 +339,7 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: # `rd.relativedelta`. ref_dt_obj = parser.parse(ref_date, default=datetime(2000, 1, 1)) data = [ref_dt_obj + rd.relativedelta(**{units: offset}) for offset in time.data] - data = [ - cf_calendar_type(t.year, t.month, t.day, calendar=calendar_attr) for t in data - ] + data = [cf_calendar_type(t.year, t.month, t.day) for t in data] decoded_time = xr.DataArray( name=time.name, @@ -370,12 +368,8 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: data_bounds = [ [ - cf_calendar_type( - lower.year, lower.month, lower.day, calendar=calendar_attr - ), - cf_calendar_type( - upper.year, upper.month, upper.day, calendar=calendar_attr - ), + cf_calendar_type(lower.year, lower.month, lower.day), + cf_calendar_type(upper.year, upper.month, upper.day), ] for [lower, upper] in data_bounds ] From 3e9d09c3b8151f32c60c12cd998300f3fb8f141a Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Mon, 1 Aug 2022 11:48:38 -0700 Subject: [PATCH 09/12] Refactor using xarray methods - Move try and except statements into `decode_non_cf_time()` - Extract function `_get_cftime_coords()` to encapsulate related logic from `decode_non_cf_time()` --- tests/test_dataset.py | 32 ++++++---- xcdat/dataset.py | 133 +++++++++++++++++++++++------------------- 2 files changed, 96 insertions(+), 69 deletions(-) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 35d91121..7dbb63cb 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -456,13 +456,29 @@ def setup(self): } self.ds = xr.Dataset({"time": time, "time_bnds": time_bnds}) - def test_raises_error_if_function_is_called_on_already_decoded_cf_compliant_dataset( - self, - ): - ds = generate_dataset(cf_compliant=True, has_bounds=True) + def test_returns_original_dataset_if_calendar_attr_is_not_set(self): + ds = generate_dataset(cf_compliant=False, has_bounds=True) + + del ds.time.attrs["calendar"] + + result = decode_non_cf_time(ds) + assert ds.identical(result) + + def test_returns_original_dataset_if_units_attr_is_not_set(self): + ds = generate_dataset(cf_compliant=False, has_bounds=True) - with pytest.raises(KeyError): - decode_non_cf_time(ds) + del ds.time.attrs["units"] + + result = decode_non_cf_time(ds) + assert ds.identical(result) + + def test_returns_original_dataset_if_units_attr_is_in_an_unsupported_format(self): + ds = generate_dataset(cf_compliant=False, has_bounds=True) + + ds.time.attrs["units"] = "year AD" + + result = decode_non_cf_time(ds) + assert ds.identical(result) def test_decodes_months_with_a_reference_date_at_the_start_of_the_month(self): ds = self.ds.copy() @@ -1093,10 +1109,6 @@ def callable(ds): class Test_SplitTimeUnitsAttr: - def test_raises_error_if_units_attr_is_none(self): - with pytest.raises(KeyError): - _split_time_units_attr(None) # type: ignore - def test_splits_units_attr_to_unit_and_reference_date(self): assert _split_time_units_attr("months since 1800") == ("months", "1800") assert _split_time_units_attr("months since 1800-01-01") == ( diff --git a/xcdat/dataset.py b/xcdat/dataset.py index 7721db21..cb475b86 100644 --- a/xcdat/dataset.py +++ b/xcdat/dataset.py @@ -5,14 +5,16 @@ from glob import glob from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union -import cftime +import numpy as np import xarray as xr from dateutil import parser from dateutil import relativedelta as rd +from xarray.coding.cftime_offsets import get_date_type +from xarray.coding.times import convert_times from xcdat import bounds # noqa: F401 from xcdat.axis import center_times as center_times_func -from xcdat.axis import swap_lon_axis +from xcdat.axis import get_axis_coord, swap_lon_axis from xcdat.logger import setup_custom_logger logger = setup_custom_logger(__name__) @@ -309,38 +311,39 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: 'years since 2000-01-01', 'calendar': 'noleap'} """ ds = dataset.copy() + time = get_axis_coord(ds, "T") - time = ds.cf["T"] - time_bounds = ds.get(time.attrs.get("bounds"), None) - units_attr = time.attrs.get("units") - - # Default to standard if no calendar is set because "if the calendar kwarg - # is set to a blank string (‘’) or None (the default is ‘standard’) the - # instance will not be calendar-aware and some methods will not work". - # Source: https://unidata.github.io/cftime/api.html#cftime.datetime - # TODO: Maybe a logger warning should be added if a calendar attribute - # doesn't exist? - calendar_attr = time.attrs.get("calendar", "standard") - cf_calendar_type = _get_date_type(calendar_attr) + try: + calendar = time.attrs["calendar"] + except KeyError: + logger.warning( + "This dataset's time coordinates do not have a 'calendar' attribute set, " + "so time coordinates could not be decoded. Set the 'calendar' attribute " + "and try decoding the time coordinates again." + ) + return ds + + try: + units_attr = time.attrs["units"] + except KeyError: + logger.warning( + "This dataset's time coordinates do not have a 'units' attribute set, " + "so the time coordinates could not be decoded. Set the 'units' attribute" + "and try decoding the time coordinates again." + ) + return ds try: units, ref_date = _split_time_units_attr(units_attr) except ValueError: logger.warning( - "Attempted to decode non-CF compliant time coordinates, but the units " - f"attr ('{units_attr}') is not in a supported format ('months since...' or " - "'years since...'). Returning dataset with the original time coordinates." + f"This dataset's 'units' attributes is ('{units_attr}') is not in a " + "supported format ('months since...' or 'years since...'), so the time " + "coordinates could not be decoded." ) return ds - # Create a reference date object and use the relative delta offsets - # to create `cftime.datetime` objects. Note, the reference date object type - # must start as `datetime` because `cftime` does not support arithmetic with - # `rd.relativedelta`. - ref_dt_obj = parser.parse(ref_date, default=datetime(2000, 1, 1)) - data = [ref_dt_obj + rd.relativedelta(**{units: offset}) for offset in time.data] - data = [cf_calendar_type(t.year, t.month, t.day) for t in data] - + data = _get_cftime_coords(ref_date, time.values, calendar, units) decoded_time = xr.DataArray( name=time.name, data=data, @@ -353,26 +356,19 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: "dtype": time.dtype, "original_shape": time.shape, "units": units_attr, - "calendar": time.attrs.get("calendar", "none"), + "calendar": calendar, } ds = ds.assign_coords({time.name: decoded_time}) + try: + time_bounds = ds.bounds.get_bounds("T") + except KeyError: + time_bounds = None + if time_bounds is not None: - data_bounds = [ - [ - ref_dt_obj + rd.relativedelta(**{units: lower}), - ref_dt_obj + rd.relativedelta(**{units: upper}), - ] - for [lower, upper] in time_bounds.data - ] - - data_bounds = [ - [ - cf_calendar_type(lower.year, lower.month, lower.day), - cf_calendar_type(upper.year, upper.month, upper.day), - ] - for [lower, upper] in data_bounds - ] + lowers = _get_cftime_coords(ref_date, time_bounds.values[:, 0], calendar, units) + uppers = _get_cftime_coords(ref_date, time_bounds.values[:, 1], calendar, units) + data_bounds = np.vstack((lowers, uppers)).T decoded_time_bnds = xr.DataArray( name=time_bounds.name, @@ -664,9 +660,6 @@ def _split_time_units_attr(units_attr: str) -> Tuple[str, str]: ValueError If the time units attribute is not of the form `X since Y`. """ - if units_attr is None: - raise KeyError("The dataset's time coordinates does not have a 'units' attr.") - if "since" in units_attr: units, reference_date = units_attr.split(" since ") else: @@ -677,19 +670,41 @@ def _split_time_units_attr(units_attr: str) -> Tuple[str, str]: return units, reference_date -def _get_date_type(calendar): - """Return the cftime date type for a given calendar name. - https://github.com/pydata/xarray/blob/main/xarray/coding/cftime_offsets.py#L68 +def _get_cftime_coords( + ref_date: str, offsets: np.ndarray, calendar: str, units: str +) -> np.ndarray: + """Get an array of `cftime` coordinates starting from a reference date. + + Parameters + ---------- + ref_date : str + The starting reference date. + offsets : np.ndarray + An array of numerically encoded time offsets from the reference date. + calendar : str + The CF calendar type. + units : str + The time units. + + Returns + ------- + np.ndarray + An array of `cftime` coordinates. """ - calendars = { - "noleap": cftime.DatetimeNoLeap, - "360_day": cftime.Datetime360Day, - "365_day": cftime.DatetimeNoLeap, - "366_day": cftime.DatetimeAllLeap, - "gregorian": cftime.DatetimeGregorian, - "proleptic_gregorian": cftime.DatetimeProlepticGregorian, - "julian": cftime.DatetimeJulian, - "all_leap": cftime.DatetimeAllLeap, - "standard": cftime.DatetimeGregorian, - } - return calendars[calendar] + # Starting from the reference date, create an array of `datetime` objects + # by adding each offset (a numerically encoded value) to the reference date. + # The `parse.parse` default is set to datetime(2000, 1, 1), with each + # component being a placeholder if the value does not exist. For example, 1 + # and 1 are placeholders for month and day if those values don't exist. + ref_datetime: datetime = parser.parse(ref_date, default=datetime(2000, 1, 1)) + offsets = np.array( + [ref_datetime + rd.relativedelta(**{units: offset}) for offset in offsets], + dtype="datetime64", + ) + + # Convert the array of `datetime`` objects into `cftime` objects based on + # the calendar type. + date_type = get_date_type(calendar) + coords = convert_times(offsets, date_type=date_type) + + return coords From beb2a074e67d2b46c81b8d869187b0b5e6bfd491 Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Mon, 1 Aug 2022 14:17:30 -0700 Subject: [PATCH 10/12] Update docstrings and silence logger warnings in tests --- tests/test_dataset.py | 23 +++++++++++++--- xcdat/dataset.py | 63 +++++++++++++++++++++++++------------------ 2 files changed, 56 insertions(+), 30 deletions(-) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 7dbb63cb..81d85826 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1,3 +1,4 @@ +import logging import pathlib import warnings @@ -38,7 +39,10 @@ def test_non_cf_compliant_time_is_not_decoded(self): expected = generate_dataset(cf_compliant=False, has_bounds=True) assert result.identical(expected) - def test_non_cf_compliant_and_unsupported_time_is_not_decoded(self): + def test_non_cf_compliant_and_unsupported_time_is_not_decoded(self, caplog): + # Update logger level to silence the logger warning during test runs. + caplog.set_level(logging.ERROR) + ds = generate_dataset(cf_compliant=False, has_bounds=True, unsupported=True) ds.to_netcdf(self.file_path) @@ -456,7 +460,10 @@ def setup(self): } self.ds = xr.Dataset({"time": time, "time_bnds": time_bnds}) - def test_returns_original_dataset_if_calendar_attr_is_not_set(self): + def test_returns_original_dataset_if_calendar_attr_is_not_set(self, caplog): + # Update logger level to silence the logger warning during test runs. + caplog.set_level(logging.ERROR) + ds = generate_dataset(cf_compliant=False, has_bounds=True) del ds.time.attrs["calendar"] @@ -464,7 +471,10 @@ def test_returns_original_dataset_if_calendar_attr_is_not_set(self): result = decode_non_cf_time(ds) assert ds.identical(result) - def test_returns_original_dataset_if_units_attr_is_not_set(self): + def test_returns_original_dataset_if_units_attr_is_not_set(self, caplog): + # Update logger level to silence the logger warning during test runs. + caplog.set_level(logging.ERROR) + ds = generate_dataset(cf_compliant=False, has_bounds=True) del ds.time.attrs["units"] @@ -472,7 +482,12 @@ def test_returns_original_dataset_if_units_attr_is_not_set(self): result = decode_non_cf_time(ds) assert ds.identical(result) - def test_returns_original_dataset_if_units_attr_is_in_an_unsupported_format(self): + def test_returns_original_dataset_if_units_attr_is_in_an_unsupported_format( + self, caplog + ): + # Update logger level to silence the logger warning during test runs. + caplog.set_level(logging.ERROR) + ds = generate_dataset(cf_compliant=False, has_bounds=True) ds.time.attrs["units"] = "year AD" diff --git a/xcdat/dataset.py b/xcdat/dataset.py index cb475b86..7ede3f76 100644 --- a/xcdat/dataset.py +++ b/xcdat/dataset.py @@ -224,24 +224,27 @@ def open_mfdataset( def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: """Decodes time coordinates and time bounds with non-CF compliant units. - By default, ``xarray`` uses the ``cftime`` module, which only supports - decoding time with CF compliant units [3]_. This function fills the gap in - xarray by being able to decode time with non-CF compliant units such as - "months since ..." and "years since ...". + By default, ``xarray`` only supports decoding time with CF compliant units + [3]_. This function enables decoding time with non-CF compliant units. - The steps include: + The time coordinates must have a "calendar" attribute set to a CF calendar + type supported by ``cftime`` ("noleap", "360_day", "365_day", "366_day", + "gregorian", "proleptic_gregorian", "julian", "all_leap", or "standard") + and a "units" attribute set to a supported format ("months since ..." or + "years since ..."). - 1. Extract ``units`` and ``reference_date`` strings from the "units" - attribute. + The logic for this function: - * For example with "months since 1800-01-01", ``units="months"`` and - ``reference_date="1800-01-01"`` + 1. Extract units and reference date strings from the "units" attribute. - 2. Using the ``reference_date``, create a datetime object (``ref_dt_obj``). - 3. Starting from ``ref_dt_obj``, use the numerically encoded time coordinate - values (each representing an offset) to create an array of - ``cftime.datetime`` objects. - 4. Using the array of ``cftime.datetime`` objects, create a new xr.DataArray + * For example with "months since 1800-01-01", the units are "months" and + reference date is "1800-01-01". + + 2. Using the reference date, create a reference ``datetime`` object. + 3. Starting from the reference ``datetime`` object, use the numerically + encoded time coordinate values (each representing an offset) to create an + array of ``cftime`` objects based on the calendar type. + 4. Using the array of ``cftime`` objects, create a new xr.DataArray of time coordinates to replace the numerically encoded ones. 5. If it exists, create a time bounds DataArray using steps 3 and 4. @@ -256,13 +259,13 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: ------- xr.Dataset Dataset with decoded time coordinates and time bounds (if they exist) as - datetime objects. + ``cftime`` objects. Notes ----- - ``cftime.datetime`` objects are used to represent time coordinates because - it is not restricted by the ``pandas.Timestamp`` range (1678 and 2262). - Refer to [4]_ and [5]_ for more information on this limitation. + Time coordinates are represented by ``cftime.datetime`` objects because + it is not restricted by the ``pandas.Timestamp`` range (years 1678 through + 2262). Refer to [4]_ and [5]_ for more information on this limitation. References ----- @@ -275,7 +278,8 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: Decode the time coordinates with non-CF units in a Dataset: - >>> from xcdat.dataset import decode_time_units + >>> from xcdat.dataset import decode_non_cf_time + >>> >>> ds.time array([0, 1, 2]) @@ -289,11 +293,13 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: standard_name: time calendar: noleap >>> - >>> ds_decoded = decode_time_units(ds) + >>> ds_decoded = decode_non_cf_time(ds) >>> ds_decoded.time - array(['2000-01-01T00:00:00.000000000', '2001-01-01T00:00:00.000000000', - '2002-01-01T00:00:00.000000000'], dtype='datetime64[ns]') + array([cftime.DatetimeNoLeap(1850, 1, 1, 0, 0, 0, 0, has_year_zero=True), + cftime.DatetimeNoLeap(1850, 1, 1, 0, 0, 0, 0, has_year_zero=True), + cftime.DatetimeNoLeap(1850, 1, 1, 0, 0, 0, 0, has_year_zero=True)], + dtype='object') Coordinates: * time (time) datetime64[ns] 2000-01-01 2001-01-01 2002-01-01 Attributes: @@ -307,8 +313,11 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: View time encoding information: >>> ds_decoded.time.encoding - {'source': None, 'dtype': dtype('int64'), 'original_shape': (3,), 'units': - 'years since 2000-01-01', 'calendar': 'noleap'} + {'source': None, + 'dtype': dtype('int64'), + 'original_shape': (3,), + 'units': 'years since 2000-01-01', + 'calendar': 'noleap'} """ ds = dataset.copy() time = get_axis_coord(ds, "T") @@ -682,7 +691,9 @@ def _get_cftime_coords( offsets : np.ndarray An array of numerically encoded time offsets from the reference date. calendar : str - The CF calendar type. + The CF calendar type supported by ``cftime``. This includes "noleap", + "360_day", "365_day", "366_day", "gregorian", "proleptic_gregorian", + "julian", "all_leap", and "standard". units : str The time units. @@ -702,7 +713,7 @@ def _get_cftime_coords( dtype="datetime64", ) - # Convert the array of `datetime`` objects into `cftime` objects based on + # Convert the array of `datetime` objects into `cftime` objects based on # the calendar type. date_type = get_date_type(calendar) coords = convert_times(offsets, date_type=date_type) From 37939d21f0b77ebf9322bc17515d97cd4b104754 Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Tue, 2 Aug 2022 13:54:57 -0700 Subject: [PATCH 11/12] Fix Timestamp limitation from dtype - Update logger warning --- xcdat/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xcdat/dataset.py b/xcdat/dataset.py index 7ede3f76..aca4abf4 100644 --- a/xcdat/dataset.py +++ b/xcdat/dataset.py @@ -346,7 +346,7 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: units, ref_date = _split_time_units_attr(units_attr) except ValueError: logger.warning( - f"This dataset's 'units' attributes is ('{units_attr}') is not in a " + f"This dataset's 'units' attribute ('{units_attr}') is not in a " "supported format ('months since...' or 'years since...'), so the time " "coordinates could not be decoded." ) @@ -710,7 +710,7 @@ def _get_cftime_coords( ref_datetime: datetime = parser.parse(ref_date, default=datetime(2000, 1, 1)) offsets = np.array( [ref_datetime + rd.relativedelta(**{units: offset}) for offset in offsets], - dtype="datetime64", + dtype="object", ) # Convert the array of `datetime` objects into `cftime` objects based on From 9dfaa798108a98e5f961ab5ebe4693c55baff349 Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Tue, 2 Aug 2022 13:58:52 -0700 Subject: [PATCH 12/12] Add space in logger warning --- xcdat/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xcdat/dataset.py b/xcdat/dataset.py index aca4abf4..141dec61 100644 --- a/xcdat/dataset.py +++ b/xcdat/dataset.py @@ -337,7 +337,7 @@ def decode_non_cf_time(dataset: xr.Dataset) -> xr.Dataset: except KeyError: logger.warning( "This dataset's time coordinates do not have a 'units' attribute set, " - "so the time coordinates could not be decoded. Set the 'units' attribute" + "so the time coordinates could not be decoded. Set the 'units' attribute " "and try decoding the time coordinates again." ) return ds