From 58eb65f1c6d13e3ad06435e29699b2974f01ebb3 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 22 May 2026 19:42:33 -0700 Subject: [PATCH 1/4] Route HTTP(S) scheme detection through a case-insensitive helper (#2332) Several call sites in xrspatial/geotiff/ classified HTTP(S) sources with startswith(('http://', 'https://')). That comparison is case-sensitive, so URLs like HTTP://169.254.169.254/... slipped past _HTTPSource and the SSRF / DNS-pinning checks in _validate_http_url and fell through to fsspec. RFC 3986 section 3.1 defines URI schemes as case-insensitive. Adds _is_http_source(s) in _sources.py, backed by urlparse(s).scheme.lower() in ('http', 'https'), and routes every dispatch site through it: _sources.py (_is_fsspec_uri + _open_source), _reader.py, _writer.py (_is_fsspec_uri), _backends/dask.py, _backends/gpu.py (eager-via-CPU branch + GDS opt-in gate), and the _sidecar.py _is_http_url helper. New regression tests in test_http_scheme_case_2321.py cover the helper, _open_source dispatch for HTTP / HTTPS / Http / hTTpS, the partner _is_fsspec_uri classifiers in both _sources and _writer, the sidecar helper, and the end-to-end SSRF rejection of uppercase URLs that resolve to 127.0.0.1, 169.254.169.254, 10.0.0.1, 192.168.1.1, 0.0.0.0, and localhost. socket.getaddrinfo is monkeypatched so the tests stay offline. Parent: #2321 sub-task 5. --- xrspatial/geotiff/_backends/dask.py | 6 +- xrspatial/geotiff/_backends/gpu.py | 6 +- xrspatial/geotiff/_reader.py | 4 +- xrspatial/geotiff/_sidecar.py | 9 +- xrspatial/geotiff/_sources.py | 28 +- xrspatial/geotiff/_writer.py | 13 +- .../tests/test_http_scheme_case_2321.py | 270 ++++++++++++++++++ 7 files changed, 323 insertions(+), 13 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_http_scheme_case_2321.py diff --git a/xrspatial/geotiff/_backends/dask.py b/xrspatial/geotiff/_backends/dask.py index 95f7b09a..548b7f1e 100644 --- a/xrspatial/geotiff/_backends/dask.py +++ b/xrspatial/geotiff/_backends/dask.py @@ -194,11 +194,9 @@ def read_geotiff_dask(source: str, *, # and ``_CloudSource`` satisfies that contract. Going through it # bounds metadata reads to ``MAX_HTTP_HEADER_BYTES`` instead of # fetching the whole remote object up front. See PR #1755 review. - is_http = ( - isinstance(source, str) - and source.startswith(('http://', 'https://')) - ) + from .._sources import _is_http_source from .._reader import _is_fsspec_uri + is_http = _is_http_source(source) is_fsspec = isinstance(source, str) and _is_fsspec_uri(source) http_meta = None http_meta_key = None diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py index 92ba91f0..65ff6ed3 100644 --- a/xrspatial/geotiff/_backends/gpu.py +++ b/xrspatial/geotiff/_backends/gpu.py @@ -316,6 +316,7 @@ def read_geotiff_gpu(source: str, *, from .._header import parse_all_ifds, parse_header, select_overview_ifd, validate_tile_layout from .._reader import (MAX_PIXELS_DEFAULT, _check_dimensions, _FileSource, _is_fsspec_uri, _max_tile_bytes_from_env, _resolve_masked_fill) + from .._sources import _is_http_source # ``source`` is already coerced above (before the dispatch # validator); no need to re-coerce here. @@ -347,7 +348,7 @@ def read_geotiff_gpu(source: str, *, # decode instead of nvCOMP-on-GPU. Callers who want bounded GPU # memory should pass ``chunks=...``. if isinstance(source, str) and ( - source.startswith(('http://', 'https://')) + _is_http_source(source) or _is_fsspec_uri(source)): return _read_geotiff_gpu_eager_via_cpu( source, dtype=dtype, window=window, @@ -1053,7 +1054,8 @@ def _gds_chunk_path_available(source, ifd, has_sparse_tile, orientation): """ if not isinstance(source, str): return False - if source.startswith(('http://', 'https://')): + from .._sources import _is_http_source + if _is_http_source(source): return False try: from .._reader import _is_fsspec_uri diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index f9a3003e..f6ebd604 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -93,7 +93,7 @@ _BytesIOSource, _CloudSource, _coerce_path, _FileSource, _get_http_pool, _get_pinned_conn_classes, _http_allow_private_hosts, _http_connect_timeout, _http_read_timeout, _http_timeout_from_env, _HTTPSource, _ip_is_private, - _is_file_like, _is_fsspec_uri, _make_pinned_pool, + _is_file_like, _is_fsspec_uri, _is_http_source, _make_pinned_pool, _max_coalesced_range_bytes_from_env, _max_tile_bytes_from_env, _mmap_cache, _mmap_cache_size_from_env, _MmapCache, _open_source, _resolve_max_cloud_bytes, _validate_http_url, coalesce_ranges, @@ -142,7 +142,7 @@ def _read_to_array(source, *, window=None, overview_level: int | None = None, (np.ndarray, GeoInfo) tuple """ source = _coerce_path(source) - if isinstance(source, str) and source.startswith(('http://', 'https://')): + if _is_http_source(source): return _read_cog_http(source, overview_level=overview_level, band=band, max_pixels=max_pixels, window=window, allow_rotated=allow_rotated) diff --git a/xrspatial/geotiff/_sidecar.py b/xrspatial/geotiff/_sidecar.py index 050900f0..76970f2c 100644 --- a/xrspatial/geotiff/_sidecar.py +++ b/xrspatial/geotiff/_sidecar.py @@ -43,7 +43,14 @@ class SidecarOverviews(NamedTuple): def _is_http_url(source: str) -> bool: - return source.startswith(("http://", "https://")) + """Case-insensitive HTTP(S) scheme test for sidecar routing. + + Delegates to :func:`xrspatial.geotiff._sources._is_http_source` so + the SSRF-relevant routing decision matches the rest of the package. + Issue #2332. + """ + from ._sources import _is_http_source + return _is_http_source(source) def find_sidecar(source) -> str | None: diff --git a/xrspatial/geotiff/_sources.py b/xrspatial/geotiff/_sources.py index c774f465..8484a836 100644 --- a/xrspatial/geotiff/_sources.py +++ b/xrspatial/geotiff/_sources.py @@ -1451,11 +1451,35 @@ def close(self): _CLOUD_SCHEMES = ('s3://', 'gs://', 'az://', 'abfs://') +def _is_http_source(source) -> bool: + """Return True if ``source`` is an HTTP(S) URL, case-insensitively. + + Centralized so every routing call site in ``xrspatial/geotiff/`` + classifies the scheme the same way. Before this helper existed, + each call site did ``source.startswith(('http://', 'https://'))``, + which is case-sensitive and let ``HTTP://example.internal/...`` + (uppercase) slip past :class:`_HTTPSource` and the SSRF allow-list + in :func:`_validate_http_url`. Per RFC 3986 section 3.1 URI schemes + are case-insensitive, so any uppercase / mixed-case variant has to + route through the same validator as ``http`` / ``https``. + + Non-string inputs (``None``, ``bytes``, ``os.PathLike``, file-like + objects) return ``False`` so callers can drop the surrounding + ``isinstance(_, str)`` check where they want to. Issue #2332. + """ + if not isinstance(source, str) or not source: + return False + # ``urlparse`` strips off the scheme cleanly even for unusual inputs + # (e.g. ``HTTP:`` with no ``//``) and avoids the prefix-tuple trap. + from urllib.parse import urlparse + return urlparse(source).scheme.lower() in ('http', 'https') + + def _is_fsspec_uri(path: str) -> bool: """Check if a path is a fsspec-compatible URI (not http/https/local).""" if not isinstance(path, str): return False - if path.startswith(('http://', 'https://')): + if _is_http_source(path): return False return '://' in path @@ -1636,7 +1660,7 @@ def _open_source(source): raise TypeError( f"source must be a str path/URL or a binary file-like object " f"with read+seek methods, got {type(source).__name__}") - if source.startswith(('http://', 'https://')): + if _is_http_source(source): return _HTTPSource(source) if _is_fsspec_uri(source): return _CloudSource(source) diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py index 5230430b..2a2eb797 100644 --- a/xrspatial/geotiff/_writer.py +++ b/xrspatial/geotiff/_writer.py @@ -1238,10 +1238,19 @@ def _write_streaming(dask_data, path: str, *, def _is_fsspec_uri(path) -> bool: - """Check if a path is a fsspec-compatible URI (string only).""" + """Check if a path is a fsspec-compatible URI (string only). + + HTTP(S) URLs are deliberately excluded here so the writer can raise + a typed "writes not supported over HTTP" error instead of handing + the URL to fsspec. Uses :func:`_sources._is_http_source` so the + HTTP detection is case-insensitive (RFC 3986); without that, an + uppercase ``HTTP://...`` slipped past this check and into fsspec. + Issue #2332. + """ + from ._sources import _is_http_source if not isinstance(path, str): return False - if path.startswith(('http://', 'https://')): + if _is_http_source(path): return False return '://' in path diff --git a/xrspatial/geotiff/tests/test_http_scheme_case_2321.py b/xrspatial/geotiff/tests/test_http_scheme_case_2321.py new file mode 100644 index 00000000..a811e7e4 --- /dev/null +++ b/xrspatial/geotiff/tests/test_http_scheme_case_2321.py @@ -0,0 +1,270 @@ +"""Case-insensitive HTTP(S) scheme routing for SSRF protection (#2332). + +Issue #2321 sub-task 5. + +Background +---------- +Several routing call sites in ``xrspatial/geotiff/`` historically used +``startswith(('http://', 'https://'))`` to decide whether a string source +should be opened by ``_HTTPSource`` (which runs SSRF + DNS-pinning checks +via ``_validate_http_url``) or handed off to fsspec. That comparison is +case-sensitive, so a URL like ``HTTP://169.254.169.254/latest/meta-data`` +slipped past ``_HTTPSource`` entirely and fell through to fsspec, which +has no SSRF allow-list. Uppercase schemes are valid per RFC 3986 sect. 3.1 +(``scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )``, case-insensitive). + +The fix centralizes scheme detection on a single helper, ``_is_http_source``, +that does ``urlparse(url).scheme.lower() in ('http', 'https')``, and routes +every call site through it. + +These tests exercise: + +* The helper itself across mixed-case schemes. +* ``_open_source`` returning ``_HTTPSource`` for uppercase URLs. +* The dispatch boolean in every other call site (reader, writer, sidecar, + dask backend, gpu backend, fsspec classifier). +* The SSRF allow-list still rejecting uppercase URLs that resolve to + private / loopback / link-local addresses. + +All tests are offline: ``socket.getaddrinfo`` is monkeypatched so the +validator never opens a real connection. +""" +from __future__ import annotations + +import socket + +import pytest + +from xrspatial.geotiff import UnsafeURLError +from xrspatial.geotiff import _reader as _reader_mod +from xrspatial.geotiff import _sources as _sources_mod + + +# --------------------------------------------------------------------------- +# Helpers (mirrors test_ssrf_hardening_1664.py) +# --------------------------------------------------------------------------- + + +def _fake_getaddrinfo(ip: str): + def _resolver(host, port, *args, **kwargs): + if ':' in ip: + return [(socket.AF_INET6, socket.SOCK_STREAM, 0, '', + (ip, port or 0, 0, 0))] + return [(socket.AF_INET, socket.SOCK_STREAM, 0, '', + (ip, port or 0))] + return _resolver + + +# --------------------------------------------------------------------------- +# Helper unit tests +# --------------------------------------------------------------------------- + + +class TestIsHttpSourceHelper: + """``_is_http_source`` is the single source of truth for HTTP routing.""" + + @pytest.mark.parametrize("url", [ + 'http://example.com/x.tif', + 'https://example.com/x.tif', + 'HTTP://example.com/x.tif', + 'HTTPS://example.com/x.tif', + 'Http://example.com/x.tif', + 'hTTpS://example.com/x.tif', + 'http://EXAMPLE.COM/x.tif', # host case must not matter either + ]) + def test_http_schemes_match(self, url): + assert _sources_mod._is_http_source(url) is True + + @pytest.mark.parametrize("url", [ + 's3://bucket/key.tif', + 'gs://bucket/key.tif', + 'az://container/blob.tif', + 'abfs://container/blob.tif', + 'file:///etc/passwd', + 'ftp://example.com/x.tif', + 'gopher://example.com/', + 'memory://x.tif', + '/local/path/file.tif', + 'relative/path.tif', + 'C:\\windows\\file.tif', + ]) + def test_non_http_schemes_do_not_match(self, url): + assert _sources_mod._is_http_source(url) is False + + @pytest.mark.parametrize("value", [None, 42, b'http://x', object()]) + def test_non_string_does_not_match(self, value): + # Be defensive: routing call sites also gate on isinstance(_, str) + # in some places, but the helper itself must not raise on junk. + assert _sources_mod._is_http_source(value) is False + + def test_empty_string_does_not_match(self): + assert _sources_mod._is_http_source('') is False + + def test_scheme_only_prefix_does_not_match(self): + # ``urlparse('http')`` returns scheme=''; only ``http:`` or + # ``http://`` should classify as HTTP. + assert _sources_mod._is_http_source('http') is False + + +# --------------------------------------------------------------------------- +# Dispatch: ``_open_source`` must route uppercase URLs through ``_HTTPSource`` +# --------------------------------------------------------------------------- + + +class TestOpenSourceRoutesUppercase: + """``_open_source('HTTP://...')`` must build an ``_HTTPSource``. + + We intercept ``_HTTPSource.__init__`` so the test never opens a real + HTTP connection; getting the call at all is what we are verifying. + """ + + def test_uppercase_http_routes_to_http_source(self, monkeypatch): + calls = [] + + def _fake_init(self, url, *args, **kwargs): + calls.append(url) + # Skip the real validator / urllib3 pool setup. + self._url = url + + monkeypatch.setattr( + _sources_mod._HTTPSource, '__init__', _fake_init) + src = _sources_mod._open_source('HTTP://example.com/x.tif') + assert isinstance(src, _sources_mod._HTTPSource) + assert calls == ['HTTP://example.com/x.tif'] + + def test_uppercase_https_routes_to_http_source(self, monkeypatch): + calls = [] + + def _fake_init(self, url, *args, **kwargs): + calls.append(url) + self._url = url + + monkeypatch.setattr( + _sources_mod._HTTPSource, '__init__', _fake_init) + src = _sources_mod._open_source('HTTPS://example.com/x.tif') + assert isinstance(src, _sources_mod._HTTPSource) + assert calls == ['HTTPS://example.com/x.tif'] + + def test_mixed_case_routes_to_http_source(self, monkeypatch): + calls = [] + + def _fake_init(self, url, *args, **kwargs): + calls.append(url) + self._url = url + + monkeypatch.setattr( + _sources_mod._HTTPSource, '__init__', _fake_init) + src = _sources_mod._open_source('hTTpS://example.com/x.tif') + assert isinstance(src, _sources_mod._HTTPSource) + assert calls == ['hTTpS://example.com/x.tif'] + + +# --------------------------------------------------------------------------- +# Dispatch booleans elsewhere in the code base +# --------------------------------------------------------------------------- + + +class TestDispatchBooleansAreCaseInsensitive: + """Every routing site must use the centralized helper, not startswith. + + Each call site below historically read:: + + source.startswith(('http://', 'https://')) + + which is the bug. We assert ``_is_http_source`` returns True for the + uppercase forms; the implementation modules import and call the same + helper at the dispatch site. + """ + + @pytest.mark.parametrize("url", [ + 'HTTP://example.com/x.tif', + 'HTTPS://example.com/x.tif', + 'Http://example.com/x.tif', + ]) + def test_helper_recognizes_uppercase(self, url): + assert _sources_mod._is_http_source(url) is True + + def test_is_fsspec_uri_excludes_uppercase_http(self): + # ``_is_fsspec_uri`` is the partner classifier in both + # ``_sources.py`` and ``_writer.py``. If it returned True for + # ``HTTP://...`` the writer would hand the URL to fsspec instead + # of raising the typed "writes not supported over HTTP" error. + assert _sources_mod._is_fsspec_uri('HTTP://example.com/x.tif') is False + assert _sources_mod._is_fsspec_uri('HTTPS://example.com/x.tif') is False + # sanity: real fsspec URIs still classify as fsspec + assert _sources_mod._is_fsspec_uri('s3://b/k.tif') is True + + def test_writer_is_fsspec_uri_excludes_uppercase_http(self): + from xrspatial.geotiff import _writer as _writer_mod + assert _writer_mod._is_fsspec_uri('HTTP://example.com/x.tif') is False + assert _writer_mod._is_fsspec_uri('HTTPS://example.com/x.tif') is False + assert _writer_mod._is_fsspec_uri('s3://b/k.tif') is True + + def test_sidecar_helper_is_case_insensitive(self): + from xrspatial.geotiff import _sidecar as _sidecar_mod + assert _sidecar_mod._is_http_url('HTTP://example.com/x.tif') is True + assert _sidecar_mod._is_http_url('HTTPS://example.com/x.tif') is True + assert _sidecar_mod._is_http_url('http://example.com/x.tif') is True + assert _sidecar_mod._is_http_url('s3://b/k.tif') is False + + +# --------------------------------------------------------------------------- +# End-to-end: uppercase scheme + private host must still be rejected +# --------------------------------------------------------------------------- + + +class TestUppercaseSchemeStillRejectsPrivateHosts: + """The whole point of the fix: uppercase URLs go through the SSRF gate. + + Before the fix, ``HTTP://169.254.169.254/...`` would skip the validator + and try to open via fsspec. After the fix, it routes through + ``_HTTPSource``, which calls ``_validate_http_url``, which raises + ``UnsafeURLError``. + """ + + @pytest.mark.parametrize("scheme", ['HTTP', 'HTTPS', 'Http', 'hTTpS']) + @pytest.mark.parametrize("ip", [ + '127.0.0.1', + '169.254.169.254', + '10.0.0.1', + '192.168.1.1', + '0.0.0.0', + ]) + def test_private_host_rejected_regardless_of_scheme_case( + self, monkeypatch, scheme, ip): + monkeypatch.setattr(socket, 'getaddrinfo', _fake_getaddrinfo(ip)) + url = f'{scheme}://attacker.test/x.tif' + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url(url) + + @pytest.mark.parametrize("scheme", ['HTTP', 'HTTPS', 'Http']) + def test_localhost_rejected_regardless_of_scheme_case( + self, monkeypatch, scheme): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url(f'{scheme}://localhost:8080/x.tif') + + @pytest.mark.parametrize("scheme", ['HTTP', 'HTTPS', 'Http']) + def test_uppercase_scheme_to_127_literal_rejected( + self, monkeypatch, scheme): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url(f'{scheme}://127.0.0.1/x.tif') + + def test_open_source_uppercase_private_host_raises(self, monkeypatch): + """End-to-end: ``_open_source`` -> ``_HTTPSource`` -> validator. + + Confirms the dispatch wiring actually drives the URL through the + validator (not just that the validator works in isolation). + """ + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('169.254.169.254')) + # Make sure the env override is not set; the validator skips + # resolution when ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS`` is on. + monkeypatch.delenv( + 'XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', raising=False) + with pytest.raises(UnsafeURLError): + _sources_mod._open_source( + 'HTTP://metadata.google.internal/computeMetadata/v1/') From 532d5bcd86109cc0bc757c5dee91031b60d5c5b6 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 22 May 2026 19:50:06 -0700 Subject: [PATCH 2/4] Address review: hoist imports, typed HTTP write error, lock broadening (#2332) Follow-ups on the /review-pr pass for PR #2337: * Hoist ``from urllib.parse import urlparse`` to ``_sources.py`` module top so ``_is_http_source`` no longer pays the import cost per call. * Hoist ``from ._sources import _is_http_source`` to module top in ``_writer.py`` and ``_sidecar.py`` (no cycle: neither is imported by ``_sources``). * Keep the function-scope imports in ``_backends/dask.py`` and ``_backends/gpu.py``: the backend modules deliberately defer the reader / sources import so the package can be imported without urllib3 in dask-only environments. Add a one-line comment so the next reader does not assume it is an oversight. * Reject HTTP(S) write targets in ``_write_bytes`` with a typed ``NotImplementedError`` before the local file write path interprets the URL as a filename. The error message lists the supported destinations. Covers both lowercase and uppercase scheme casing. * Lock the behavioural broadening: ``_is_http_source('http:foo')`` now returns True (urlparse-correct, broader than the old ``startswith``). Added a test asserting the classifier matches and that ``_open_source('http:foo')`` raises ``UnsafeURLError`` so the validator still rejects it as "no hostname". * Added 7 new test cases (helper + open_source + 5 writer-rejection parametrizations); full geotiff suite: 5324 passed, 68 skipped. --- xrspatial/geotiff/_backends/dask.py | 5 +- xrspatial/geotiff/_backends/gpu.py | 2 + xrspatial/geotiff/_sidecar.py | 2 +- xrspatial/geotiff/_sources.py | 2 +- xrspatial/geotiff/_writer.py | 12 ++++- .../tests/test_http_scheme_case_2321.py | 48 +++++++++++++++++++ 6 files changed, 67 insertions(+), 4 deletions(-) diff --git a/xrspatial/geotiff/_backends/dask.py b/xrspatial/geotiff/_backends/dask.py index 548b7f1e..26228d0b 100644 --- a/xrspatial/geotiff/_backends/dask.py +++ b/xrspatial/geotiff/_backends/dask.py @@ -194,8 +194,11 @@ def read_geotiff_dask(source: str, *, # and ``_CloudSource`` satisfies that contract. Going through it # bounds metadata reads to ``MAX_HTTP_HEADER_BYTES`` instead of # fetching the whole remote object up front. See PR #1755 review. - from .._sources import _is_http_source + # Local imports: backend modules avoid eager-importing the reader / + # sources layer at module load so the package can be imported without + # urllib3 in environments that only consume the dask path. Issue #2332. from .._reader import _is_fsspec_uri + from .._sources import _is_http_source is_http = _is_http_source(source) is_fsspec = isinstance(source, str) and _is_fsspec_uri(source) http_meta = None diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py index 65ff6ed3..14d3abcf 100644 --- a/xrspatial/geotiff/_backends/gpu.py +++ b/xrspatial/geotiff/_backends/gpu.py @@ -1054,6 +1054,8 @@ def _gds_chunk_path_available(source, ifd, has_sparse_tile, orientation): """ if not isinstance(source, str): return False + # Local import: backend gate is called once per source and the GDS + # path is optional, so avoid eager top-level coupling. Issue #2332. from .._sources import _is_http_source if _is_http_source(source): return False diff --git a/xrspatial/geotiff/_sidecar.py b/xrspatial/geotiff/_sidecar.py index 76970f2c..e2f6bd1f 100644 --- a/xrspatial/geotiff/_sidecar.py +++ b/xrspatial/geotiff/_sidecar.py @@ -26,6 +26,7 @@ # ``_reader`` imports ``_sidecar`` lazily (inside functions), so this # top-level import does not form a cycle at module load time. from ._reader import _is_fsspec_uri +from ._sources import _is_http_source #: Type of the bytes-like buffer a sidecar carries: an mmap for local #: files, bytes for HTTP / fsspec downloads. Narrowed from ``object`` @@ -49,7 +50,6 @@ def _is_http_url(source: str) -> bool: the SSRF-relevant routing decision matches the rest of the package. Issue #2332. """ - from ._sources import _is_http_source return _is_http_source(source) diff --git a/xrspatial/geotiff/_sources.py b/xrspatial/geotiff/_sources.py index 8484a836..637172a6 100644 --- a/xrspatial/geotiff/_sources.py +++ b/xrspatial/geotiff/_sources.py @@ -31,6 +31,7 @@ import threading from collections import OrderedDict from concurrent.futures import ThreadPoolExecutor +from urllib.parse import urlparse import urllib3 @@ -1471,7 +1472,6 @@ def _is_http_source(source) -> bool: return False # ``urlparse`` strips off the scheme cleanly even for unusual inputs # (e.g. ``HTTP:`` with no ``//``) and avoids the prefix-tuple trap. - from urllib.parse import urlparse return urlparse(source).scheme.lower() in ('http', 'https') diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py index 2a2eb797..df040d57 100644 --- a/xrspatial/geotiff/_writer.py +++ b/xrspatial/geotiff/_writer.py @@ -61,6 +61,7 @@ # (``_write``, ``_write_streaming``) and external importers (the # ``_writers`` subpackage, tests, ``_gpu_decode``) keep using the # ``xrspatial.geotiff._writer`` import path. +from ._sources import _is_http_source from ._write_layout import (BO, _assemble_cog_layout, _assemble_standard_layout, # noqa: F401 _assemble_tiff, _build_ifd, _compute_classic_ifd_overhead, _float_to_rational, _pack_tag_value, _promote_offsets_to_long8, @@ -1247,7 +1248,6 @@ def _is_fsspec_uri(path) -> bool: uppercase ``HTTP://...`` slipped past this check and into fsspec. Issue #2332. """ - from ._sources import _is_http_source if not isinstance(path, str): return False if _is_http_source(path): @@ -1282,6 +1282,16 @@ def _write_bytes(file_bytes: bytes | bytearray, path) -> None: path.write(file_bytes) return + # Reject HTTP(S) write targets with a typed error before the local + # file path tries to treat the URL as a filename. ``_is_http_source`` + # is case-insensitive so ``HTTP://...`` reports the same friendly + # error as ``http://...``. Issue #2332. + if isinstance(path, str) and _is_http_source(path): + raise NotImplementedError( + f"Writes are not supported over HTTP(S). Got {path!r}. " + "Write to a local path or an fsspec-supported cloud URL " + "(s3://, gs://, az://, ...) instead.") + if _is_fsspec_uri(path): try: import fsspec diff --git a/xrspatial/geotiff/tests/test_http_scheme_case_2321.py b/xrspatial/geotiff/tests/test_http_scheme_case_2321.py index a811e7e4..e2ed2c0f 100644 --- a/xrspatial/geotiff/tests/test_http_scheme_case_2321.py +++ b/xrspatial/geotiff/tests/test_http_scheme_case_2321.py @@ -105,6 +105,23 @@ def test_scheme_only_prefix_does_not_match(self): # ``http://`` should classify as HTTP. assert _sources_mod._is_http_source('http') is False + def test_scheme_colon_no_slashes_classifies_as_http(self): + # ``urlparse('http:foo').scheme == 'http'``: this is broader than + # the old ``startswith('http://')`` gate but is RFC-correct. The + # validator rejects these downstream as "no hostname", so the + # security posture is unchanged. Locking the broader classifier + # in here keeps any future tightening explicit. Issue #2332. + assert _sources_mod._is_http_source('http:foo') is True + assert _sources_mod._is_http_source('HTTP:foo') is True + + def test_open_source_http_colon_no_hostname_raises(self): + # End-to-end follow-up: ``_open_source('http:foo')`` now routes + # into ``_HTTPSource``, which calls ``_validate_http_url`` and + # raises ``UnsafeURLError('... has no hostname')``. The previous + # case-sensitive gate would have sent this to fsspec instead. + with pytest.raises(UnsafeURLError): + _sources_mod._open_source('http:foo') + # --------------------------------------------------------------------------- # Dispatch: ``_open_source`` must route uppercase URLs through ``_HTTPSource`` @@ -268,3 +285,34 @@ def test_open_source_uppercase_private_host_raises(self, monkeypatch): with pytest.raises(UnsafeURLError): _sources_mod._open_source( 'HTTP://metadata.google.internal/computeMetadata/v1/') + + +# --------------------------------------------------------------------------- +# Writer: HTTP(S) destinations must raise a typed error, not a raw OSError +# --------------------------------------------------------------------------- + + +class TestWriterRejectsHttpTargets: + """``_write_bytes(_, 'HTTP://...')`` must raise ``NotImplementedError``. + + Without the early gate the uppercase URL fell through ``_is_fsspec_uri`` + (correctly returns False) and into the local file write path, which + surfaced an OS-specific ``OSError`` for the colon-in-filename. The + typed error matches the lowercase-HTTP behaviour and points users at + the supported destinations. Follow-up to issue #2332 review. + """ + + @pytest.mark.parametrize("url", [ + 'http://example.com/x.tif', + 'https://example.com/x.tif', + 'HTTP://example.com/x.tif', + 'HTTPS://example.com/x.tif', + 'Http://example.com/x.tif', + ]) + def test_write_bytes_rejects_http(self, url): + from xrspatial.geotiff import _writer as _writer_mod + with pytest.raises(NotImplementedError) as excinfo: + _writer_mod._write_bytes(b'IIxxxx', url) + msg = str(excinfo.value) + assert 'HTTP' in msg + assert url in msg From 96c94f6ffaf19ab77e7f3874c08719bc29524d20 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sun, 24 May 2026 17:30:04 -0700 Subject: [PATCH 3/4] Add positive VRT mosaic tests over GeoTIFF sources (#2369) Sub-task of #2342. Covers the supported simple-VRT subset on both the eager numpy and dask read paths: - 2x1 horizontal mosaic of two compatible tiles - 2x2 mosaic of four compatible tiles - Windowed read aligned with the source seam - Dask reads of both mosaics with multiple chunks - Multi-band 2x1 mosaic Each test asserts values, coords, and key attrs (crs, transform, nodata) so the contract is checked at the pixel level rather than on shape alone. --- .../tests/test_vrt_simple_mosaic_2369.py | 431 ++++++++++++++++++ 1 file changed, 431 insertions(+) create mode 100644 xrspatial/geotiff/tests/test_vrt_simple_mosaic_2369.py diff --git a/xrspatial/geotiff/tests/test_vrt_simple_mosaic_2369.py b/xrspatial/geotiff/tests/test_vrt_simple_mosaic_2369.py new file mode 100644 index 00000000..9e694544 --- /dev/null +++ b/xrspatial/geotiff/tests/test_vrt_simple_mosaic_2369.py @@ -0,0 +1,431 @@ +"""Positive coverage for simple VRT mosaics over GeoTIFF sources (#2369). + +Part of epic #2342. The release contract lists "simple GDAL VRT mosaics +backed by GeoTIFF sources" as a supported subset on both the eager +numpy and dask read paths. Other VRT tests in this directory hit that +subset indirectly while exercising codecs, dtypes, or chunk policies, +but nothing pins the contract down on its own. + +This module covers the supported mosaic shape explicitly: + +* 2x1 horizontal mosaic of two compatible tiles +* 2x2 mosaic of four compatible tiles +* Windowed read where the request window lines up with source pixels +* Dask read of both mosaics with multiple chunks +* One multi-band 2x1 mosaic + +Each test asserts values, coords, and key attrs (``crs``, ``transform``, +``nodata``). Shape-only assertions are not enough -- the point of these +tests is that the contract holds at the pixel level. + +Implementation uses ``to_geotiff`` to build the source tiles and the +internal ``write_vrt`` to wire up the mosaic XML, mirroring the pattern +already used in ``test_vrt_lazy_chunks_1814.py``. +""" +from __future__ import annotations + +import os +import tempfile + +import dask.array as da +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import read_vrt, to_geotiff +from xrspatial.geotiff._vrt import write_vrt as _write_vrt_internal + + +# --------------------------------------------------------------------------- +# Tile builders +# --------------------------------------------------------------------------- + +# Per-tile pixel size and CRS shared by every mosaic in this module. +# Picking constants once keeps the assertion blocks below short and makes +# transform comparisons unambiguous. +_PIXEL_W = 0.001 +_PIXEL_H = -0.001 +_CRS = 4326 +_NODATA = -9999.0 + + +def _make_tile( + tmp_dir, + name: str, + data: np.ndarray, + origin_x: float, + origin_y: float, + *, + nodata: float | None = _NODATA, +) -> str: + """Write ``data`` as a single-band GeoTIFF anchored at the given origin. + + Returns the on-disk path. ``data`` shape is ``(H, W)``. + """ + height, width = data.shape + y = np.array([origin_y + _PIXEL_H * (i + 0.5) for i in range(height)]) + x = np.array([origin_x + _PIXEL_W * (j + 0.5) for j in range(width)]) + attrs = {'crs': _CRS} + if nodata is not None: + attrs['nodata'] = nodata + raster = xr.DataArray( + data, dims=['y', 'x'], + coords={'y': y, 'x': x}, + attrs=attrs, + ) + path = os.path.join(tmp_dir, name) + to_geotiff(raster, path, nodata=nodata) + return path + + +def _make_multiband_tile( + tmp_dir, + name: str, + data: np.ndarray, + origin_x: float, + origin_y: float, +) -> str: + """Write a multi-band GeoTIFF anchored at the given origin. + + ``data`` shape is ``(H, W, B)``. + """ + height, width, nbands = data.shape + y = np.array([origin_y + _PIXEL_H * (i + 0.5) for i in range(height)]) + x = np.array([origin_x + _PIXEL_W * (j + 0.5) for j in range(width)]) + raster = xr.DataArray( + data, + dims=['y', 'x', 'band'], + coords={'y': y, 'x': x, 'band': np.arange(nbands)}, + attrs={'crs': _CRS}, + ) + path = os.path.join(tmp_dir, name) + to_geotiff(raster, path) + return path + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +@pytest.fixture +def mosaic_2x1(): + """Two 32x32 float32 tiles side-by-side, west and east. + + Yields ``(vrt_path, expected_array, origin_x, origin_y)``. The + expected array is the horizontal concatenation of the two source + arrays. + """ + td = tempfile.mkdtemp(prefix='tmp_2369_2x1_') + height, width = 32, 32 + left_data = np.arange(height * width, dtype=np.float32).reshape(height, width) + right_data = ( + left_data + (height * width) + ).astype(np.float32) + + origin_x, origin_y = -120.0, 45.0 + left_path = _make_tile(td, 'left.tif', left_data, origin_x, origin_y) + right_path = _make_tile( + td, 'right.tif', right_data, + origin_x + _PIXEL_W * width, origin_y, + ) + + vrt_path = os.path.join(td, 'mosaic_2x1.vrt') + _write_vrt_internal(vrt_path, [left_path, right_path]) + + expected = np.concatenate([left_data, right_data], axis=1) + yield vrt_path, expected, origin_x, origin_y + + +@pytest.fixture +def mosaic_2x2(): + """Four 32x32 float32 tiles arranged 2 rows by 2 cols. + + Yields ``(vrt_path, expected_array, origin_x, origin_y)`` with the + expected array stitched in (row, col) order. + """ + td = tempfile.mkdtemp(prefix='tmp_2369_2x2_') + h, w = 32, 32 + # Use distinct constant values per tile so any swap shows up loudly + # in the assertion diff. + tile_nw = np.full((h, w), 1.0, dtype=np.float32) + tile_ne = np.full((h, w), 2.0, dtype=np.float32) + tile_sw = np.full((h, w), 3.0, dtype=np.float32) + tile_se = np.full((h, w), 4.0, dtype=np.float32) + + origin_x, origin_y = -120.0, 45.0 + nw_path = _make_tile(td, 'nw.tif', tile_nw, origin_x, origin_y) + ne_path = _make_tile( + td, 'ne.tif', tile_ne, + origin_x + _PIXEL_W * w, origin_y, + ) + sw_path = _make_tile( + td, 'sw.tif', tile_sw, + origin_x, origin_y + _PIXEL_H * h, + ) + se_path = _make_tile( + td, 'se.tif', tile_se, + origin_x + _PIXEL_W * w, origin_y + _PIXEL_H * h, + ) + + vrt_path = os.path.join(td, 'mosaic_2x2.vrt') + _write_vrt_internal( + vrt_path, [nw_path, ne_path, sw_path, se_path], + ) + + top = np.concatenate([tile_nw, tile_ne], axis=1) + bottom = np.concatenate([tile_sw, tile_se], axis=1) + expected = np.concatenate([top, bottom], axis=0) + yield vrt_path, expected, origin_x, origin_y + + +@pytest.fixture +def mosaic_multiband_2x1(): + """Two 3-band 32x32 float32 tiles side-by-side.""" + td = tempfile.mkdtemp(prefix='tmp_2369_mb_2x1_') + h, w, b = 32, 32, 3 + rng = np.random.default_rng(2369) + left_data = rng.random((h, w, b), dtype=np.float32) + right_data = rng.random((h, w, b), dtype=np.float32) + + origin_x, origin_y = -120.0, 45.0 + left_path = _make_multiband_tile( + td, 'left_mb.tif', left_data, origin_x, origin_y, + ) + right_path = _make_multiband_tile( + td, 'right_mb.tif', right_data, + origin_x + _PIXEL_W * w, origin_y, + ) + + vrt_path = os.path.join(td, 'mosaic_mb.vrt') + _write_vrt_internal(vrt_path, [left_path, right_path]) + + # read_vrt mirrors the on-disk layout: per-band plane stacked on + # the trailing axis to match what ``to_geotiff`` wrote, so the + # expected mosaic is (H, W, B). + expected = np.stack( + [ + np.concatenate([left_data[..., k], right_data[..., k]], axis=1) + for k in range(b) + ], + axis=-1, + ) + yield vrt_path, expected, origin_x, origin_y + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _assert_attrs_ok(result, *, expected_nodata=None): + """Common attr assertions for VRT reads in this module. + + Checks that ``crs`` and ``transform`` are present and consistent + with the fixture constants, and optionally that ``nodata`` matches. + """ + assert 'crs' in result.attrs, ( + f"crs missing from attrs; have {sorted(result.attrs)}" + ) + # write_vrt may normalise the CRS to a WKT string; accept either + # form as long as it is non-empty and references EPSG:4326. + crs_val = result.attrs['crs'] + if isinstance(crs_val, int): + assert crs_val == _CRS + else: + assert crs_val, "crs attr is present but empty" + # Best-effort string check: WKT for 4326 mentions WGS 84 + assert 'WGS' in str(crs_val) or '4326' in str(crs_val), ( + f"crs attr does not look like EPSG:4326: {crs_val!r}" + ) + + assert 'transform' in result.attrs, ( + f"transform missing from attrs; have {sorted(result.attrs)}" + ) + transform = result.attrs['transform'] + # transform format is (pixel_w, 0, origin_x, 0, pixel_h, origin_y) + # in the affine 6-tuple convention this module uses elsewhere. + assert len(transform) == 6, ( + f"transform should be a 6-tuple, got {transform!r}" + ) + assert transform[0] == pytest.approx(_PIXEL_W), ( + f"transform pixel width = {transform[0]}, expected {_PIXEL_W}" + ) + assert transform[4] == pytest.approx(_PIXEL_H), ( + f"transform pixel height = {transform[4]}, expected {_PIXEL_H}" + ) + + if expected_nodata is not None: + assert 'nodata' in result.attrs, ( + f"nodata missing from attrs; have {sorted(result.attrs)}" + ) + assert result.attrs['nodata'] == pytest.approx(expected_nodata) + + +def _assert_coords_monotonic(result, *, expected_origin_x, expected_origin_y): + """Check that x/y coords are monotonic and start at the expected origin + (within half a pixel: TIFF coords are pixel centers, not corners). + """ + x = np.asarray(result['x'].values) + y = np.asarray(result['y'].values) + + # x increases west to east, y decreases north to south. + assert np.all(np.diff(x) > 0), "x coord is not strictly increasing" + assert np.all(np.diff(y) < 0), "y coord is not strictly decreasing" + + # First pixel center: origin + half pixel. + assert x[0] == pytest.approx(expected_origin_x + _PIXEL_W * 0.5) + assert y[0] == pytest.approx(expected_origin_y + _PIXEL_H * 0.5) + + +# --------------------------------------------------------------------------- +# 1. Eager numpy reads of horizontal and 2x2 mosaics +# --------------------------------------------------------------------------- + +def test_eager_2x1_mosaic_values_coords_attrs(mosaic_2x1): + """Eager read of a 2x1 horizontal mosaic returns the concatenated + pixel block, with monotonic coords and the fixture's crs / transform + / nodata on attrs. + """ + vrt_path, expected, ox, oy = mosaic_2x1 + + result = read_vrt(vrt_path) + + assert result.shape == expected.shape, ( + f"eager 2x1 shape {result.shape}, expected {expected.shape}" + ) + np.testing.assert_array_equal(result.values, expected) + _assert_coords_monotonic(result, expected_origin_x=ox, expected_origin_y=oy) + _assert_attrs_ok(result, expected_nodata=_NODATA) + + +def test_eager_2x2_mosaic_values_coords_attrs(mosaic_2x2): + """Eager read of a 2x2 mosaic stitches tiles in the right order. + + Each tile has a distinct constant value, so a misordered placement + surfaces immediately in the value assertion rather than appearing + only as a numeric diff. + """ + vrt_path, expected, ox, oy = mosaic_2x2 + + result = read_vrt(vrt_path) + + assert result.shape == expected.shape, ( + f"eager 2x2 shape {result.shape}, expected {expected.shape}" + ) + np.testing.assert_array_equal(result.values, expected) + _assert_coords_monotonic(result, expected_origin_x=ox, expected_origin_y=oy) + _assert_attrs_ok(result, expected_nodata=_NODATA) + + +# --------------------------------------------------------------------------- +# 2. Windowed read that maps cleanly into source windows +# --------------------------------------------------------------------------- + +def test_windowed_read_aligned_with_source_boundary(mosaic_2x1): + """A window crossing the seam between the two source tiles returns + the same pixels as slicing the full mosaic. + + The window picked here covers the right half of the left tile and + the left half of the right tile: both halves land on whole-pixel + boundaries inside their respective sources, so this is the + "request lines up with source pixels" case from the issue. + """ + vrt_path, expected, ox, oy = mosaic_2x1 + h, w_total = expected.shape # (32, 64) + + # Window: full height, last 16 cols of left tile through first + # 16 cols of right tile. 32 / 2 = 16 keeps each side aligned to + # the source's own pixel grid. + r0, c0, r1, c1 = 0, 16, h, 48 + + result = read_vrt(vrt_path, window=(r0, c0, r1, c1)) + + np.testing.assert_array_equal(result.values, expected[r0:r1, c0:c1]) + + # Coords on the windowed result should correspond to the windowed + # slice of the full mosaic's coords. + full = read_vrt(vrt_path) + np.testing.assert_array_equal( + np.asarray(result['x'].values), + np.asarray(full['x'].values)[c0:c1], + ) + np.testing.assert_array_equal( + np.asarray(result['y'].values), + np.asarray(full['y'].values)[r0:r1], + ) + _assert_attrs_ok(result, expected_nodata=_NODATA) + + +# --------------------------------------------------------------------------- +# 3. Dask reads with multiple chunks +# --------------------------------------------------------------------------- + +def test_dask_2x1_mosaic_multi_chunk_matches_eager(mosaic_2x1): + """Dask read with chunks smaller than the mosaic returns the same + pixels as the eager read, and uses a real multi-block dask graph. + """ + vrt_path, expected, ox, oy = mosaic_2x1 + + chunked = read_vrt(vrt_path, chunks=(16, 16)) + + # Real dask graph with the expected number of blocks: 32/16 = 2 + # rows, 64/16 = 4 cols. + assert isinstance(chunked.data, da.Array), ( + f"expected dask Array, got {type(chunked.data).__name__}" + ) + assert chunked.data.numblocks == (2, 4), ( + f"expected 2x4 blocks, got {chunked.data.numblocks}" + ) + + computed = chunked.compute() + np.testing.assert_array_equal(computed.values, expected) + _assert_coords_monotonic(computed, expected_origin_x=ox, expected_origin_y=oy) + _assert_attrs_ok(computed, expected_nodata=_NODATA) + + +def test_dask_2x2_mosaic_multi_chunk_matches_eager(mosaic_2x2): + """Dask read of the 2x2 mosaic with chunk size below tile size. + + Chunks of 16 split each 32x32 tile into 2x2 blocks. The full + mosaic is 64x64 so the resulting dask array is 4x4 blocks. + """ + vrt_path, expected, ox, oy = mosaic_2x2 + + chunked = read_vrt(vrt_path, chunks=(16, 16)) + + assert isinstance(chunked.data, da.Array) + assert chunked.data.numblocks == (4, 4), ( + f"expected 4x4 blocks, got {chunked.data.numblocks}" + ) + + computed = chunked.compute() + np.testing.assert_array_equal(computed.values, expected) + _assert_coords_monotonic(computed, expected_origin_x=ox, expected_origin_y=oy) + _assert_attrs_ok(computed, expected_nodata=_NODATA) + + +# --------------------------------------------------------------------------- +# 4. Multi-band mosaic +# --------------------------------------------------------------------------- + +def test_eager_multiband_2x1_mosaic(mosaic_multiband_2x1): + """Eager read of a multi-band 2x1 mosaic returns one stitched plane + per band. + + Multi-band VRT reads return shape ``(H, W, B)`` to match the + on-disk layout; assert per-band values against the stack built in + the fixture. + """ + vrt_path, expected, ox, oy = mosaic_multiband_2x1 + + result = read_vrt(vrt_path) + + # Multi-band path: trailing axis is band. + assert result.shape == expected.shape, ( + f"multiband 2x1 shape {result.shape}, expected {expected.shape}" + ) + np.testing.assert_array_equal(result.values, expected) + + # Coords use the same x/y dims as the single-band case; only the + # number of bands changed. + _assert_coords_monotonic(result, expected_origin_x=ox, expected_origin_y=oy) + _assert_attrs_ok(result) From 85e0caf42e88b9665e634481940e31b65facb1ff Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sun, 24 May 2026 17:33:45 -0700 Subject: [PATCH 4/4] Address review: tmp_path, transform origin, multi-band dask (#2369) - Switch the three fixtures from tempfile.mkdtemp to tmp_path subdirectories so pytest cleans them up between runs while keeping per-issue prefix protection. - _assert_attrs_ok now optionally checks transform[2] (origin_x) and transform[5] (origin_y); every call site passes the fixture origin so a translation bug would not slip through. - Drop the unused w_total in the windowed test. - Add test_dask_multiband_2x1_mosaic_matches_eager so the multi-band contract is covered on the dask path too. --- .../tests/test_vrt_simple_mosaic_2369.py | 117 +++++++++++++++--- 1 file changed, 102 insertions(+), 15 deletions(-) diff --git a/xrspatial/geotiff/tests/test_vrt_simple_mosaic_2369.py b/xrspatial/geotiff/tests/test_vrt_simple_mosaic_2369.py index 9e694544..b0b8f6f8 100644 --- a/xrspatial/geotiff/tests/test_vrt_simple_mosaic_2369.py +++ b/xrspatial/geotiff/tests/test_vrt_simple_mosaic_2369.py @@ -25,7 +25,6 @@ from __future__ import annotations import os -import tempfile import dask.array as da import numpy as np @@ -108,14 +107,16 @@ def _make_multiband_tile( # --------------------------------------------------------------------------- @pytest.fixture -def mosaic_2x1(): +def mosaic_2x1(tmp_path): """Two 32x32 float32 tiles side-by-side, west and east. Yields ``(vrt_path, expected_array, origin_x, origin_y)``. The expected array is the horizontal concatenation of the two source arrays. """ - td = tempfile.mkdtemp(prefix='tmp_2369_2x1_') + td = tmp_path / 'tmp_2369_2x1' + td.mkdir() + td = str(td) height, width = 32, 32 left_data = np.arange(height * width, dtype=np.float32).reshape(height, width) right_data = ( @@ -137,13 +138,15 @@ def mosaic_2x1(): @pytest.fixture -def mosaic_2x2(): +def mosaic_2x2(tmp_path): """Four 32x32 float32 tiles arranged 2 rows by 2 cols. Yields ``(vrt_path, expected_array, origin_x, origin_y)`` with the expected array stitched in (row, col) order. """ - td = tempfile.mkdtemp(prefix='tmp_2369_2x2_') + td = tmp_path / 'tmp_2369_2x2' + td.mkdir() + td = str(td) h, w = 32, 32 # Use distinct constant values per tile so any swap shows up loudly # in the assertion diff. @@ -179,9 +182,11 @@ def mosaic_2x2(): @pytest.fixture -def mosaic_multiband_2x1(): +def mosaic_multiband_2x1(tmp_path): """Two 3-band 32x32 float32 tiles side-by-side.""" - td = tempfile.mkdtemp(prefix='tmp_2369_mb_2x1_') + td = tmp_path / 'tmp_2369_mb_2x1' + td.mkdir() + td = str(td) h, w, b = 32, 32, 3 rng = np.random.default_rng(2369) left_data = rng.random((h, w, b), dtype=np.float32) @@ -216,11 +221,20 @@ def mosaic_multiband_2x1(): # Helpers # --------------------------------------------------------------------------- -def _assert_attrs_ok(result, *, expected_nodata=None): +def _assert_attrs_ok( + result, + *, + expected_nodata=None, + expected_origin_x=None, + expected_origin_y=None, +): """Common attr assertions for VRT reads in this module. Checks that ``crs`` and ``transform`` are present and consistent with the fixture constants, and optionally that ``nodata`` matches. + When ``expected_origin_x`` / ``expected_origin_y`` are passed, the + transform's origin entries are checked too -- pixel size alone is + not enough to catch a translation bug. """ assert 'crs' in result.attrs, ( f"crs missing from attrs; have {sorted(result.attrs)}" @@ -252,6 +266,16 @@ def _assert_attrs_ok(result, *, expected_nodata=None): assert transform[4] == pytest.approx(_PIXEL_H), ( f"transform pixel height = {transform[4]}, expected {_PIXEL_H}" ) + if expected_origin_x is not None: + assert transform[2] == pytest.approx(expected_origin_x), ( + f"transform origin_x = {transform[2]}, " + f"expected {expected_origin_x}" + ) + if expected_origin_y is not None: + assert transform[5] == pytest.approx(expected_origin_y), ( + f"transform origin_y = {transform[5]}, " + f"expected {expected_origin_y}" + ) if expected_nodata is not None: assert 'nodata' in result.attrs, ( @@ -294,7 +318,12 @@ def test_eager_2x1_mosaic_values_coords_attrs(mosaic_2x1): ) np.testing.assert_array_equal(result.values, expected) _assert_coords_monotonic(result, expected_origin_x=ox, expected_origin_y=oy) - _assert_attrs_ok(result, expected_nodata=_NODATA) + _assert_attrs_ok( + result, + expected_nodata=_NODATA, + expected_origin_x=ox, + expected_origin_y=oy, + ) def test_eager_2x2_mosaic_values_coords_attrs(mosaic_2x2): @@ -313,7 +342,12 @@ def test_eager_2x2_mosaic_values_coords_attrs(mosaic_2x2): ) np.testing.assert_array_equal(result.values, expected) _assert_coords_monotonic(result, expected_origin_x=ox, expected_origin_y=oy) - _assert_attrs_ok(result, expected_nodata=_NODATA) + _assert_attrs_ok( + result, + expected_nodata=_NODATA, + expected_origin_x=ox, + expected_origin_y=oy, + ) # --------------------------------------------------------------------------- @@ -330,7 +364,7 @@ def test_windowed_read_aligned_with_source_boundary(mosaic_2x1): "request lines up with source pixels" case from the issue. """ vrt_path, expected, ox, oy = mosaic_2x1 - h, w_total = expected.shape # (32, 64) + h = expected.shape[0] # 32 # Window: full height, last 16 cols of left tile through first # 16 cols of right tile. 32 / 2 = 16 keeps each side aligned to @@ -352,7 +386,16 @@ def test_windowed_read_aligned_with_source_boundary(mosaic_2x1): np.asarray(result['y'].values), np.asarray(full['y'].values)[r0:r1], ) - _assert_attrs_ok(result, expected_nodata=_NODATA) + # Windowed transform origin shifts by the row / col offset times + # pixel size. (r0=0 keeps origin_y at the fixture's oy.) + expected_window_ox = ox + _PIXEL_W * c0 + expected_window_oy = oy + _PIXEL_H * r0 + _assert_attrs_ok( + result, + expected_nodata=_NODATA, + expected_origin_x=expected_window_ox, + expected_origin_y=expected_window_oy, + ) # --------------------------------------------------------------------------- @@ -379,7 +422,12 @@ def test_dask_2x1_mosaic_multi_chunk_matches_eager(mosaic_2x1): computed = chunked.compute() np.testing.assert_array_equal(computed.values, expected) _assert_coords_monotonic(computed, expected_origin_x=ox, expected_origin_y=oy) - _assert_attrs_ok(computed, expected_nodata=_NODATA) + _assert_attrs_ok( + computed, + expected_nodata=_NODATA, + expected_origin_x=ox, + expected_origin_y=oy, + ) def test_dask_2x2_mosaic_multi_chunk_matches_eager(mosaic_2x2): @@ -400,7 +448,12 @@ def test_dask_2x2_mosaic_multi_chunk_matches_eager(mosaic_2x2): computed = chunked.compute() np.testing.assert_array_equal(computed.values, expected) _assert_coords_monotonic(computed, expected_origin_x=ox, expected_origin_y=oy) - _assert_attrs_ok(computed, expected_nodata=_NODATA) + _assert_attrs_ok( + computed, + expected_nodata=_NODATA, + expected_origin_x=ox, + expected_origin_y=oy, + ) # --------------------------------------------------------------------------- @@ -428,4 +481,38 @@ def test_eager_multiband_2x1_mosaic(mosaic_multiband_2x1): # Coords use the same x/y dims as the single-band case; only the # number of bands changed. _assert_coords_monotonic(result, expected_origin_x=ox, expected_origin_y=oy) - _assert_attrs_ok(result) + _assert_attrs_ok( + result, expected_origin_x=ox, expected_origin_y=oy, + ) + + +def test_dask_multiband_2x1_mosaic_matches_eager(mosaic_multiband_2x1): + """Dask read of the multi-band 2x1 mosaic with sub-tile chunks must + match the eager read pixel-for-pixel across every band. + + Chunking exercises per-block band handling: a bug that loses a + band on one chunk but not another would not appear in the eager + test above. + """ + vrt_path, expected, ox, oy = mosaic_multiband_2x1 + + eager = read_vrt(vrt_path) + chunked = read_vrt(vrt_path, chunks=(16, 16)) + + # The chunked array should be dask-backed; the band axis may or + # may not be split depending on the implementation, but the + # spatial axes must be. + assert isinstance(chunked.data, da.Array), ( + f"expected dask Array, got {type(chunked.data).__name__}" + ) + + computed = chunked.compute() + assert computed.shape == eager.shape + np.testing.assert_array_equal(computed.values, eager.values) + np.testing.assert_array_equal(computed.values, expected) + _assert_coords_monotonic( + computed, expected_origin_x=ox, expected_origin_y=oy, + ) + _assert_attrs_ok( + computed, expected_origin_x=ox, expected_origin_y=oy, + )