From ce128bc9fb070a20f2c761f2e9cf119902afb577 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Thu, 14 May 2026 09:48:02 -0700 Subject: [PATCH 1/2] geotiff: default _vrt.read_vrt missing_sources to 'raise' (#1843) Flip the internal ``read_vrt`` default in ``_vrt.py`` from ``'warn'`` to ``'raise'`` so an unreadable VRT source halts the call instead of leaving a silent zero-fill hole. On an integer raster without a nodata sentinel the old lenient default produced data indistinguishable from real zero pixels unless the caller checked ``attrs['vrt_holes']``. The change matches the up-front rejection pattern the rest of the module uses (#1697, #1737, #1784, band validation). Callers that want the previous behaviour pass ``missing_sources='warn'`` explicitly, which still emits ``GeoTIFFFallbackWarning`` and records the skipped source on ``vrt.holes``. ``XRSPATIAL_GEOTIFF_STRICT=1`` keeps its module-wide force-raise semantics. One existing test (``test_missing_source_still_takes_lenient_warning_path`` from #1784) relied on the lenient default; it now passes ``missing_sources='warn'`` explicitly to exercise the opt-in branch. The public ``read_vrt`` / ``open_geotiff`` wrappers in ``__init__.py`` still default to ``'warn'`` and forward the kwarg explicitly, so this commit changes the internal contract only; aligning the public default is a separate follow-up. --- CHANGELOG.md | 1 + xrspatial/geotiff/_vrt.py | 21 +++-- ...est_geotiff_vrt_srcrect_validation_1784.py | 9 +- ..._vrt_missing_sources_default_raise_1843.py | 83 +++++++++++++++++++ 4 files changed, 107 insertions(+), 7 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_vrt_missing_sources_default_raise_1843.py diff --git a/CHANGELOG.md b/CHANGELOG.md index a5b6115c..9ae2b872 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ #### Bug fixes and improvements - Refresh the geotiff mmap cache when a file is replaced under the same path so re-reads after an atomic-rename overwrite no longer return stale bytes - Decode TIFF predictor=3 un-transpose by file byte order so big-endian floating-point TIFFs read back exactly +- Default internal `_vrt.read_vrt` `missing_sources` to `'raise'` so an unreadable VRT source no longer produces a silent zero-fill hole on integer rasters; pass `missing_sources='warn'` to opt back into the previous lenient behaviour (#1843) ### Version 0.9.9 - 2026-05-05 diff --git a/xrspatial/geotiff/_vrt.py b/xrspatial/geotiff/_vrt.py index 812b5bf4..f151d903 100644 --- a/xrspatial/geotiff/_vrt.py +++ b/xrspatial/geotiff/_vrt.py @@ -852,7 +852,7 @@ def _apply_integer_sentinel_mask(arr, vrt, band): def read_vrt(vrt_path: str, *, window=None, band: int | None = None, max_pixels: int | None = None, - missing_sources: str = 'warn', + missing_sources: str = 'raise', parsed: VRTDataset | None = None, ) -> tuple[np.ndarray, VRTDataset]: """Read a VRT file by assembling pixel data from its source files. @@ -868,11 +868,22 @@ def read_vrt(vrt_path: str, *, window=None, max_pixels : int or None Maximum allowed pixel count (width * height * samples) for the assembled VRT region. None uses the reader default. - missing_sources : {'warn', 'raise'} + missing_sources : {'raise', 'warn'}, default 'raise' Policy for unreadable source files referenced by the VRT. - ``'warn'`` emits ``GeoTIFFFallbackWarning`` and records - ``vrt.holes`` unless ``XRSPATIAL_GEOTIFF_STRICT=1`` is set. - ``'raise'`` fails immediately. + ``'raise'`` (the default) fails immediately on an unreadable + source so a partial mosaic never surfaces silently. This matches + the rest of the geotiff module's up-front rejection of malformed + input and avoids the ambiguity of a zero-fill hole on an integer + raster without a nodata sentinel (see issue #1843). Prior to + #1843 the default was ``'warn'``; callers that relied on the + lenient behaviour should pass ``missing_sources='warn'`` + explicitly. + ``'warn'`` is the opt-in escape hatch for partial mosaics: it + emits ``GeoTIFFFallbackWarning`` and records the skipped source + on ``vrt.holes`` (surfaced as ``attrs['vrt_holes']`` on the + public DataArray). + ``XRSPATIAL_GEOTIFF_STRICT=1`` forces a raise across the whole + module regardless of this kwarg (see issue #1662). parsed : VRTDataset or None Pre-parsed VRT structure. When supplied, ``vrt_path`` is not re-read or re-parsed and the source-path containment check is diff --git a/xrspatial/geotiff/tests/test_geotiff_vrt_srcrect_validation_1784.py b/xrspatial/geotiff/tests/test_geotiff_vrt_srcrect_validation_1784.py index 5edb8ca4..07aa487a 100644 --- a/xrspatial/geotiff/tests/test_geotiff_vrt_srcrect_validation_1784.py +++ b/xrspatial/geotiff/tests/test_geotiff_vrt_srcrect_validation_1784.py @@ -117,14 +117,19 @@ def test_negative_srcrect_message_names_bad_values(): def test_missing_source_still_takes_lenient_warning_path(): """A *valid* SrcRect with a missing source file must still hit the lenient warning path -- the new SrcRect check must not swallow the - missing-file case that PR #1675 narrowed.""" + missing-file case that PR #1675 narrowed. + + Issue #1843 flipped the default to ``missing_sources='raise'`` so + this test now passes ``'warn'`` explicitly to exercise the opt-in + lenient branch. + """ from xrspatial.geotiff import GeoTIFFFallbackWarning with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: # No source file written; SrcRect itself is well-formed. vrt_path = _write_vrt(td, src_filename='does_not_exist.tif') with warnings.catch_warnings(record=True) as caught: warnings.simplefilter('always') - arr, _ = read_vrt(vrt_path) + arr, _ = read_vrt(vrt_path, missing_sources='warn') # The lenient path must produce a fallback warning and a result # array (zero-filled at the hole), not raise. fallback = [w for w in caught diff --git a/xrspatial/geotiff/tests/test_vrt_missing_sources_default_raise_1843.py b/xrspatial/geotiff/tests/test_vrt_missing_sources_default_raise_1843.py new file mode 100644 index 00000000..2f7ea9c1 --- /dev/null +++ b/xrspatial/geotiff/tests/test_vrt_missing_sources_default_raise_1843.py @@ -0,0 +1,83 @@ +"""Regression test for #1843: the internal ``read_vrt`` in ``_vrt.py`` +defaults to ``missing_sources='raise'`` so an unreadable source halts +the call instead of leaving a silent zero-fill hole on integer rasters. + +Callers wanting the historical lenient behaviour pass +``missing_sources='warn'`` explicitly. The strict-mode env var +``XRSPATIAL_GEOTIFF_STRICT=1`` continues to force-raise across the +whole module (orthogonal axis, not affected by this change). +""" +from __future__ import annotations + +import pytest + +from xrspatial.geotiff._vrt import read_vrt as _read_vrt_internal +from xrspatial.geotiff import GeoTIFFFallbackWarning + + +def _write_missing_source_vrt(path): + path.write_text( + '\n' + ' \n' + ' \n' + ' missing_1843.tif' + '\n' + ' 1\n' + ' \n' + ' \n' + ' \n' + ' \n' + '\n' + ) + + +def test_read_vrt_default_raises_on_unreadable_source(tmp_path): + """Without an explicit ``missing_sources`` kwarg, an unreadable + backing source must raise rather than silently zero-fill. + + This is the behaviour change from #1843. Before this commit the + default was ``'warn'`` and a missing ``Byte`` tile produced a hole + of zero pixels that was indistinguishable from real data unless + the caller checked ``attrs['vrt_holes']``. + """ + vrt = tmp_path / "tmp_1843_default_raise.vrt" + _write_missing_source_vrt(vrt) + + with pytest.raises((OSError, ValueError)): + _read_vrt_internal(str(vrt)) + + +def test_read_vrt_explicit_warn_preserves_lenient_behaviour(tmp_path): + """``missing_sources='warn'`` is still the escape hatch for callers + that want partial mosaics with ``vrt.holes`` populated. + + Pinning the lenient path here keeps the historical contract + available to callers who opt in. The warning and the hole record + must both still surface. + """ + vrt = tmp_path / "tmp_1843_explicit_warn.vrt" + _write_missing_source_vrt(vrt) + + with pytest.warns(GeoTIFFFallbackWarning, match="could not be read"): + arr, parsed = _read_vrt_internal(str(vrt), missing_sources='warn') + + assert arr.shape == (2, 2) + assert len(parsed.holes) == 1 + assert parsed.holes[0]['source'].endswith('missing_1843.tif') + + +def test_read_vrt_strict_env_still_raises_under_warn(monkeypatch, tmp_path): + """``XRSPATIAL_GEOTIFF_STRICT=1`` continues to force-raise even + when the caller explicitly asks for the lenient ``'warn'`` policy. + + The strict env var is a module-wide override (see #1662); it must + still win over per-call ``missing_sources='warn'`` so CI runs with + strict mode catch partial mosaics regardless of caller settings. + """ + vrt = tmp_path / "tmp_1843_strict_env.vrt" + _write_missing_source_vrt(vrt) + + monkeypatch.setenv("XRSPATIAL_GEOTIFF_STRICT", "1") + + with pytest.raises((OSError, ValueError)): + _read_vrt_internal(str(vrt), missing_sources='warn') From b3970e56d34b35a79f66105f422bb41624a09127 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Thu, 14 May 2026 10:13:56 -0700 Subject: [PATCH 2/2] geotiff: refresh _vrt.read_vrt fallback comments for raise-by-default Address copilot review on #1850: the inline notes around the source-read fallback still described "Default mode warns" and the vrt_holes inspection path, which no longer match the new raise-by-default behavior. Reword them to describe the current default and the explicit missing_sources='warn' opt-in. --- xrspatial/geotiff/_vrt.py | 44 ++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/xrspatial/geotiff/_vrt.py b/xrspatial/geotiff/_vrt.py index f151d903..a070016a 100644 --- a/xrspatial/geotiff/_vrt.py +++ b/xrspatial/geotiff/_vrt.py @@ -252,13 +252,14 @@ class VRTDataset: # coords must be emitted without the shift. Parsed from # ``Point``. raster_type: str = 'area' # 'area' or 'point' - # Per-load record of sources skipped by ``read_vrt`` under the - # lenient default (not strict mode). Each entry is a dict with - # ``source``, ``band`` (1-based), ``dst_rect`` (xoff, yoff, - # xsize, ysize), and ``error`` keys. Empty when no sources failed - # to read. Populated by :func:`read_vrt` so callers can detect - # holes by attribute lookup instead of parsing a UserWarning - # message (issue #1734). + # Per-load record of sources skipped by ``read_vrt`` when called + # with ``missing_sources='warn'`` (the lenient opt-in; the default + # since #1843 is ``'raise'``, and strict mode always raises). Each + # entry is a dict with ``source``, ``band`` (1-based), ``dst_rect`` + # (xoff, yoff, xsize, ysize), and ``error`` keys. Empty when no + # sources failed to read. Populated by :func:`read_vrt` so callers + # on the warn path can detect holes by attribute lookup instead of + # parsing the ``GeoTIFFFallbackWarning`` message (issue #1734). holes: list[dict] = field(default_factory=list) @@ -1171,21 +1172,22 @@ def read_vrt(vrt_path: str, *, window=None, ) + _CODEC_DECODE_EXCEPTIONS as e: if isinstance(e, PixelSafetyLimitError): raise - # Under XRSPATIAL_GEOTIFF_STRICT=1, surface the read failure - # so partial mosaics are caught in CI. Default mode warns - # once per missing source then continues, preserving the - # historical behaviour. See issue #1662. + # Default (``missing_sources='raise'``) surfaces the read + # failure immediately so a partial mosaic never ships + # silently. ``XRSPATIAL_GEOTIFF_STRICT=1`` forces the same + # raise module-wide regardless of the kwarg (#1662). # - # The lenient default leaves zero-filled holes in - # integer VRTs that downstream code cannot tell from - # real data unless the band has a nodata sentinel set. - # Recording the skipped source on ``vrt.holes`` lets the - # public ``read_vrt`` surface a machine-readable - # ``attrs['vrt_holes']`` entry on the returned - # DataArray, alongside the warning. Callers that need - # to fail loudly can either inspect that attr or set - # ``XRSPATIAL_GEOTIFF_STRICT=1`` to raise here. - # See issue #1734. + # The ``missing_sources='warn'`` opt-in keeps the legacy + # lenient path: emit ``GeoTIFFFallbackWarning`` once per + # unreadable source and record the skip on ``vrt.holes``, + # which the public ``read_vrt`` surfaces as + # ``attrs['vrt_holes']`` on the returned DataArray. That + # path is for callers that explicitly want a holey mosaic + # plus a machine-readable list of which sources failed -- + # zero-filled holes in an integer VRT are otherwise + # indistinguishable from real data without a nodata + # sentinel, which is why the default was flipped to raise + # in #1843. See also issues #1734 and #1843. import warnings from . import _geotiff_strict_mode, GeoTIFFFallbackWarning if missing_sources == 'raise' or _geotiff_strict_mode():