Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement literal np.timedelta64 coding #10101

Draft
wants to merge 30 commits into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
063437b
Proof of concept literal timedelta64 coding
spencerkclark Mar 6, 2025
03f2988
Ensure test_roundtrip_timedelta_data test uses old encoding pathway
spencerkclark Mar 6, 2025
bdb53d7
Remove no longer relevant test
spencerkclark Mar 7, 2025
05c3ce6
Merge branch 'main' into timedelta64-encoding
spencerkclark Mar 7, 2025
00d9eaa
Include units attribute
spencerkclark Mar 8, 2025
b043b45
Move coder to times.py
spencerkclark Mar 8, 2025
6f4e6e4
Merge branch 'main' into timedelta64-encoding
spencerkclark Mar 8, 2025
7f73753
Add what's new entry
spencerkclark Mar 8, 2025
4a8e111
Merge branch 'timedelta64-encoding' of https://github.com/spencerkcla…
spencerkclark Mar 8, 2025
9ce2a24
Restore test and reduce diff
spencerkclark Mar 8, 2025
eb6e19a
Fix typing
spencerkclark Mar 8, 2025
436e588
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 8, 2025
a305238
Fix doctests
spencerkclark Mar 8, 2025
b406c64
Restore original order of encoders
spencerkclark Mar 8, 2025
a21b137
Add return types to tests
spencerkclark Mar 8, 2025
5108b02
Move everything to CFTimedeltaCoder; reuse code where possible
spencerkclark Mar 8, 2025
452968c
Fix mypy
spencerkclark Mar 9, 2025
503db4a
Use Kai's offset and scale_factor logic for all encoding
spencerkclark Mar 9, 2025
9aee097
Merge branch 'main' into timedelta64-encoding
spencerkclark Mar 22, 2025
56f55e2
Fix bad merge
spencerkclark Mar 22, 2025
c5e7de9
Forbid mixing other encoding with literal timedelta64 encoding
spencerkclark Mar 22, 2025
d1744af
Expose fine-grained control over decoding pathways
spencerkclark Mar 22, 2025
7c7b071
Rename test
spencerkclark Mar 22, 2025
da1edc4
Use consistent dtype spelling
spencerkclark Mar 22, 2025
2bb4b99
Continue supporting non-timedelta dtype-only encoding
spencerkclark Mar 22, 2025
0220ed5
Fix example attribute in docstring
spencerkclark Mar 22, 2025
c83fcb3
Update what's new
spencerkclark Mar 22, 2025
d1e8a5e
Fix typo
spencerkclark Mar 22, 2025
7b94d35
Complete test
spencerkclark Mar 22, 2025
f269e68
Fix docstring
spencerkclark Mar 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
@@ -21,7 +21,15 @@ v2025.03.1 (unreleased)

New Features
~~~~~~~~~~~~

- By default xarray now encodes :py:class:`numpy.timedelta64` values by
converting to :py:class:`numpy.int64` values and storing ``"dtype"`` and
``"units"`` attributes consistent with the dtype of the in-memory
:py:class:`numpy.timedelta64` values, e.g. ``"timedelta64[s]"`` and
``"seconds"`` for second-resolution timedeltas. These values will always be
decoded to timedeltas without a warning moving forward. Timedeltas encoded
via the previous approach can still be roundtripped exactly, but in the
future will not be decoded by default (:issue:`1621`, :issue:`10099`,
:pull:`10101`). By `Spencer Clark <https://github.com/spencerkclark>`_.

Breaking changes
~~~~~~~~~~~~~~~~
137 changes: 112 additions & 25 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import re
import typing
import warnings
from collections.abc import Callable, Hashable
from datetime import datetime, timedelta
@@ -92,6 +93,14 @@
)


_INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS = [
"_FillValue",
"missing_value",
"add_offset",
"scale_factor",
]


def _is_standard_calendar(calendar: str) -> bool:
return calendar.lower() in _STANDARD_CALENDARS

@@ -1394,62 +1403,140 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
return variable


def has_timedelta64_encoding_dtype(attrs_or_encoding: dict) -> bool:
dtype = attrs_or_encoding.get("dtype", None)
return isinstance(dtype, str) and dtype.startswith("timedelta64")


class CFTimedeltaCoder(VariableCoder):
"""Coder for CF Timedelta coding.

Parameters
----------
time_unit : PDDatetimeUnitOptions
Target resolution when decoding timedeltas. Defaults to "ns".
Target resolution when decoding timedeltas via units. Defaults to "ns".
When decoding via dtype, the resolution is specified in the dtype
attribute, so this parameter is ignored.
decode_via_units : bool
Whether to decode timedeltas based on the presence of a timedelta-like
units attribute, e.g. "seconds". Defaults to True, but in the future
will default to False.
decode_via_dtype : bool
Whether to decode timedeltas based on the presence of a np.timedelta64
dtype attribute, e.g. "timedelta64[s]". Defaults to True.
"""

def __init__(
self,
time_unit: PDDatetimeUnitOptions = "ns",
decode_via_units: bool = True,
decode_via_dtype: bool = True,
) -> None:
self.time_unit = time_unit
self.decode_via_units = decode_via_units
self.decode_via_dtype = decode_via_dtype
self._emit_decode_timedelta_future_warning = False

def encode(self, variable: Variable, name: T_Name = None) -> Variable:
if np.issubdtype(variable.data.dtype, np.timedelta64):
dims, data, attrs, encoding = unpack_for_encoding(variable)
has_timedelta_dtype = has_timedelta64_encoding_dtype(encoding)
if ("units" in encoding or "dtype" in encoding) and not has_timedelta_dtype:
dtype = encoding.get("dtype", None)
units = encoding.pop("units", None)

dtype = encoding.get("dtype", None)
# in the case of packed data we need to encode into
# float first, the correct dtype will be established
# via CFScaleOffsetCoder/CFMaskCoder
if "add_offset" in encoding or "scale_factor" in encoding:
dtype = data.dtype if data.dtype.kind == "f" else "float64"

# in the case of packed data we need to encode into
# float first, the correct dtype will be established
# via CFScaleOffsetCoder/CFMaskCoder
if "add_offset" in encoding or "scale_factor" in encoding:
dtype = data.dtype if data.dtype.kind == "f" else "float64"

data, units = encode_cf_timedelta(data, encoding.pop("units", None), dtype)
else:
resolution, _ = np.datetime_data(variable.dtype)
dtype = np.int64
attrs_dtype = f"timedelta64[{resolution}]"
units = _numpy_dtype_to_netcdf_timeunit(variable.dtype)
safe_setitem(attrs, "dtype", attrs_dtype, name=name)
# Remove dtype encoding if it exists to prevent it from
# interfering downstream in NonStringCoder.
encoding.pop("dtype", None)

if any(
k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
):
raise ValueError(
f"Specifying '_FillValue', 'missing_value', "
f"'add_offset', or 'scale_factor' is not supported "
f"when literally encoding the np.timedelta64 values "
f"of variable {name!r}. To encode {name!r} with such "
f"encoding parameters, additionally set "
f"encoding['units'] to a unit of time, e.g. "
f"'seconds'. To proceed with literal np.timedelta64 "
f"encoding of {name!r}, remove any encoding entries "
f"for '_FillValue', 'missing_value', 'add_offset', "
f"or 'scale_factor'."
)

data, units = encode_cf_timedelta(data, units, dtype)
safe_setitem(attrs, "units", units, name=name)

return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable

def decode(self, variable: Variable, name: T_Name = None) -> Variable:
units = variable.attrs.get("units", None)
if isinstance(units, str) and units in TIME_UNITS:
if self._emit_decode_timedelta_future_warning:
emit_user_level_warning(
"In a future version of xarray decode_timedelta will "
"default to False rather than None. To silence this "
"warning, set decode_timedelta to True, False, or a "
"'CFTimedeltaCoder' instance.",
FutureWarning,
)
has_timedelta_units = isinstance(units, str) and units in TIME_UNITS
has_timedelta_dtype = has_timedelta64_encoding_dtype(variable.attrs)
is_dtype_decodable = has_timedelta_units and has_timedelta_dtype
is_units_decodable = has_timedelta_units
if (is_dtype_decodable and self.decode_via_dtype) or (
is_units_decodable and self.decode_via_units
):
dims, data, attrs, encoding = unpack_for_decoding(variable)

units = pop_to(attrs, encoding, "units")
dtype = np.dtype(f"timedelta64[{self.time_unit}]")
transform = partial(
decode_cf_timedelta, units=units, time_unit=self.time_unit
)
if is_dtype_decodable and self.decode_via_dtype:
if any(
k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
):
raise ValueError(
"Decoding np.timedelta64 values via dtype is not "
"supported when '_FillValue', 'missing_value', "
"'add_offset', or 'scale_factor' are present in "
"encoding."
)
dtype = pop_to(attrs, encoding, "dtype", name=name)
dtype = np.dtype(dtype)
resolution, _ = np.datetime_data(dtype)
if resolution not in typing.get_args(PDDatetimeUnitOptions):
raise ValueError(
f"Following pandas, xarray only supports decoding to "
f"timedelta64 values with a resolution of 's', 'ms', "
f"'us', or 'ns'. Encoded values have a resolution of "
f"{resolution!r}."
)
time_unit = cast(PDDatetimeUnitOptions, resolution)
elif self.decode_via_units:
if self._emit_decode_timedelta_future_warning:
emit_user_level_warning(
"In a future version, xarray will not decode "
"timedelta values based on the presence of a "
"timedelta-like units attribute by default. Instead "
"it will rely on the presence of a np.timedelta64 "
"dtype attribute, which is now xarray's default way "
"of encoding np.timedelta64 values. To continue "
"decoding timedeltas based on the presence of a "
"timedelta-like units attribute, users will need to "
"explicitly opt-in by passing True or "
"CFTimedeltaCoder(decode_via_units=True) to "
"decode_timedelta. To silence this warning, set "
"decode_timedelta to True, False, or a "
"'CFTimedeltaCoder' instance.",
FutureWarning,
)
dtype = np.dtype(f"timedelta64[{self.time_unit}]")
time_unit = self.time_unit
transform = partial(decode_cf_timedelta, units=units, time_unit=time_unit)
data = lazy_elemwise_func(data, transform, dtype=dtype)

return Variable(dims, data, attrs, encoding, fastpath=True)
else:
return variable
6 changes: 4 additions & 2 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
@@ -204,8 +204,10 @@ def decode_cf_variable(
var = coder.decode(var, name=name)

if decode_timedelta:
if not isinstance(decode_timedelta, CFTimedeltaCoder):
decode_timedelta = CFTimedeltaCoder()
if isinstance(decode_timedelta, bool):
decode_timedelta = CFTimedeltaCoder(
decode_via_units=decode_timedelta, decode_via_dtype=decode_timedelta
)
decode_timedelta._emit_decode_timedelta_future_warning = (
decode_timedelta_was_none
)
3 changes: 3 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
@@ -635,7 +635,10 @@ def test_roundtrip_timedelta_data(self) -> None:
# though we cannot test that until we fix the timedelta decoding
# to support large ranges
time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit("s") # type: ignore[arg-type, unused-ignore]
encoding = {"units": "seconds"}
expected = Dataset({"td": ("td", time_deltas), "td0": time_deltas[0]})
expected["td"].encoding = encoding
expected["td0"].encoding = encoding
with self.roundtrip(
expected, open_kwargs={"decode_timedelta": CFTimedeltaCoder(time_unit="ns")}
) as actual:
128 changes: 126 additions & 2 deletions xarray/tests/test_coding_times.py
Original file line number Diff line number Diff line change
@@ -20,6 +20,7 @@
)
from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder
from xarray.coding.times import (
_INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS,
_encode_datetime_with_cftime,
_netcdf_to_numpy_timeunit,
_numpy_to_netcdf_timeunit,
@@ -1515,7 +1516,7 @@ def test_roundtrip_timedelta64_nanosecond_precision(
timedelta_values[2] = nat
timedelta_values[4] = nat

encoding = dict(dtype=dtype, _FillValue=fill_value)
encoding = dict(dtype=dtype, _FillValue=fill_value, units="nanoseconds")
var = Variable(["time"], timedelta_values, encoding=encoding)

encoded_var = conventions.encode_cf_variable(var)
@@ -1867,7 +1868,8 @@ def test_decode_timedelta(
decode_times, decode_timedelta, expected_dtype, warns
) -> None:
timedeltas = pd.timedelta_range(0, freq="D", periods=3)
var = Variable(["time"], timedeltas)
encoding = {"units": "days"}
var = Variable(["time"], timedeltas, encoding=encoding)
encoded = conventions.encode_cf_variable(var)
if warns:
with pytest.warns(FutureWarning, match="decode_timedelta"):
@@ -1959,3 +1961,125 @@ def test_decode_floating_point_timedelta_no_serialization_warning() -> None:
decoded = conventions.decode_cf_variable("foo", encoded, decode_timedelta=True)
with assert_no_warnings():
decoded.load()


def test_literal_timedelta64_coding(time_unit: PDDatetimeUnitOptions) -> None:
timedeltas = pd.timedelta_range(0, freq="D", periods=3, unit=time_unit) # type: ignore[call-arg]
variable = Variable(["time"], timedeltas)
expected_dtype = f"timedelta64[{time_unit}]"
expected_units = _numpy_to_netcdf_timeunit(time_unit)

encoded = conventions.encode_cf_variable(variable)
assert encoded.attrs["dtype"] == expected_dtype
assert encoded.attrs["units"] == expected_units

decoded = conventions.decode_cf_variable("timedeltas", encoded)
assert decoded.encoding["dtype"] == expected_dtype
assert decoded.encoding["units"] == expected_units

assert_identical(decoded, variable)
assert decoded.dtype == variable.dtype

reencoded = conventions.encode_cf_variable(decoded)
assert_identical(reencoded, encoded)
assert reencoded.dtype == encoded.dtype


def test_literal_timedelta_coding_resolution_error() -> None:
attrs = {"dtype": "timedelta64[D]", "units": "days"}
encoded = Variable(["time"], [0, 1, 2], attrs=attrs)
with pytest.raises(ValueError, match="xarray only supports"):
conventions.decode_cf_variable("timedeltas", encoded)


@pytest.mark.parametrize("attribute", ["dtype", "units"])
def test_literal_timedelta_decode_invalid_encoding(attribute) -> None:
attrs = {"dtype": "timedelta64[s]", "units": "seconds"}
encoding = {attribute: "foo"}
encoded = Variable(["time"], [0, 1, 2], attrs=attrs, encoding=encoding)
with pytest.raises(ValueError, match="failed to prevent"):
conventions.decode_cf_variable("timedeltas", encoded)


@pytest.mark.parametrize("attribute", ["dtype", "units"])
def test_literal_timedelta_encode_invalid_attribute(attribute) -> None:
timedeltas = pd.timedelta_range(0, freq="D", periods=3)
attrs = {attribute: "foo"}
variable = Variable(["time"], timedeltas, attrs=attrs)
with pytest.raises(ValueError, match="failed to prevent"):
conventions.encode_cf_variable(variable)


@pytest.mark.parametrize("invalid_key", _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS)
def test_literal_timedelta_encoding_mask_and_scale_error(invalid_key) -> None:
encoding = {invalid_key: 1.0}
timedeltas = pd.timedelta_range(0, freq="D", periods=3)
variable = Variable(["time"], timedeltas, encoding=encoding)
with pytest.raises(ValueError, match=invalid_key):
conventions.encode_cf_variable(variable)


@pytest.mark.parametrize("invalid_key", _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS)
def test_literal_timedelta_decoding_mask_and_scale_error(invalid_key) -> None:
attrs = {invalid_key: 1.0, "dtype": "timedelta64[s]", "units": "seconds"}
variable = Variable(["time"], [0, 1, 2], attrs=attrs)
with pytest.raises(ValueError, match=invalid_key):
conventions.decode_cf_variable("foo", variable)


@pytest.mark.parametrize(
("decode_via_units", "decode_via_dtype", "attrs", "expect_timedelta64"),
[
(True, True, {"units": "seconds"}, True),
(True, False, {"units": "seconds"}, True),
(False, True, {"units": "seconds"}, False),
(False, False, {"units": "seconds"}, False),
(True, True, {"dtype": "timedelta64[s]", "units": "seconds"}, True),
(True, False, {"dtype": "timedelta64[s]", "units": "seconds"}, True),
(False, True, {"dtype": "timedelta64[s]", "units": "seconds"}, True),
(False, False, {"dtype": "timedelta64[s]", "units": "seconds"}, False),
],
ids=lambda x: f"{x!r}",
)
def test_timedelta_decoding_options(
decode_via_units, decode_via_dtype, attrs, expect_timedelta64
) -> None:
array = np.array([0, 1, 2], dtype=np.dtype("int64"))
encoded = Variable(["time"], array, attrs=attrs)

# Confirm we decode to the expected dtype.
decode_timedelta = CFTimedeltaCoder(
time_unit="s",
decode_via_units=decode_via_units,
decode_via_dtype=decode_via_dtype,
)
decoded = conventions.decode_cf_variable(
"foo", encoded, decode_timedelta=decode_timedelta
)
if expect_timedelta64:
assert decoded.dtype == np.dtype("timedelta64[s]")
else:
assert decoded.dtype == np.dtype("int64")

# Confirm we exactly roundtrip.
reencoded = conventions.encode_cf_variable(decoded)
assert_identical(reencoded, encoded)


def test_timedelta_encoding_explicit_non_timedelta64_dtype() -> None:
encoding = {"dtype": np.dtype("int32")}
timedeltas = pd.timedelta_range(0, freq="D", periods=3)
variable = Variable(["time"], timedeltas, encoding=encoding)

encoded = conventions.encode_cf_variable(variable)
assert encoded.attrs["units"] == "days"
assert encoded.dtype == np.dtype("int32")

with pytest.warns(FutureWarning, match="timedelta"):
decoded = conventions.decode_cf_variable("foo", encoded)
assert_identical(decoded, variable)

reencoded = conventions.encode_cf_variable(decoded)
assert_identical(reencoded, encoded)
assert encoded.attrs["units"] == "days"
assert encoded.dtype == np.dtype("int32")
Loading
Oops, something went wrong.