Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 134 additions & 44 deletions xrspatial/geotiff/_vrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,72 @@ def _resample_nearest(src_arr: np.ndarray,
return src_arr[y_idx[:, None], x_idx[None, :]]


def _nn_src_index(out_idx: int, src_size: int, out_size: int) -> int:
"""Return the source pixel index a single output pixel samples.

Implements the same ``floor((out_idx + 0.5) * src_size / out_size)``
rule as :func:`_resample_nearest`, with the same clamp into
``[0, src_size - 1]``. Used to compute the inverse mapping from a
clipped destination sub-window back to the minimal source rect that
feeds it. See issue #1704.
"""
idx = int(math.floor((out_idx + 0.5) * src_size / out_size))
if idx < 0:
return 0
if idx > src_size - 1:
return src_size - 1
return idx


def _resample_nearest_window(src_sub: np.ndarray,
src_origin_y: int, src_origin_x: int,
full_src_h: int, full_src_w: int,
full_dst_h: int, full_dst_w: int,
dst_r0: int, dst_c0: int,
dst_r1: int, dst_c1: int) -> np.ndarray:
"""Nearest-neighbour resample a source sub-rect into a destination
sub-window, using the same per-pixel mapping as
:func:`_resample_nearest` applied to the full source rect.

``src_sub`` is the slice of the full source rect starting at
``(src_origin_y, src_origin_x)`` in full-source coordinates. The
output covers destination rows ``[dst_r0, dst_r1)`` and columns
``[dst_c0, dst_c1)`` in full-destination coordinates. Mapping each
output pixel back through ``floor((out + 0.5) * src_size /
out_size)`` reproduces the index ``_resample_nearest`` would compute
on the full source array; subtracting the origin then indexes into
``src_sub``. This yields the same numbers as resampling the full
source rect and slicing ``[dst_r0:dst_r1, dst_c0:dst_c1]`` afterward.
See issue #1704.
"""
out_h = dst_r1 - dst_r0
out_w = dst_c1 - dst_c0
# ``floor((dst_* + 0.5) * src_size / dst_size)`` with the same clamp
# to ``src_size - 1`` as ``_resample_nearest`` uses on the full rect.
y_full = np.minimum(
np.floor((np.arange(dst_r0, dst_r1) + 0.5)
* full_src_h / full_dst_h).astype(np.intp),
full_src_h - 1,
)
x_full = np.minimum(
np.floor((np.arange(dst_c0, dst_c1) + 0.5)
* full_src_w / full_dst_w).astype(np.intp),
full_src_w - 1,
)
y_idx = y_full - src_origin_y
x_idx = x_full - src_origin_x
sub_h, sub_w = src_sub.shape[:2]
# Defensive clamp: callers compute the sub-window via the same
# mapping so y_idx / x_idx already lie in ``[0, sub_h)`` /
# ``[0, sub_w)``, but a rounding edge with very large sizes could
# nudge an index by one. Clip to keep the gather inside the buffer.
np.clip(y_idx, 0, sub_h - 1, out=y_idx)
np.clip(x_idx, 0, sub_w - 1, out=x_idx)
if out_h == 0 or out_w == 0:
return src_sub[:0, :0]
return src_sub[y_idx[:, None], x_idx[None, :]]


def read_vrt(vrt_path: str, *, window=None,
band: int | None = None,
max_pixels: int | None = None,
Expand Down Expand Up @@ -824,38 +890,61 @@ def read_vrt(vrt_path: str, *, window=None,
needs_resample = (sr.y_size != dr.y_size
or sr.x_size != dr.x_size)
if needs_resample:
# TODO(#1704): when the caller passes a small ``window=``
# this still reads the full SrcRect. Computing the
# inverse of the nearest-neighbour mapping to read only
# the SrcRect subset that feeds ``window`` would cut the
# decode/memory cost for large SrcRects.
read_r0 = sr.y_off
read_c0 = sr.x_off
read_r1 = sr.y_off + sr.y_size
read_c1 = sr.x_off + sr.x_size
# Clip-relative destination sub-window, in full-DstRect
# coordinates. These are the destination rows / cols
# the caller actually wants out of this source.
sub_r0 = clip_r0 - dst_r0
sub_c0 = clip_c0 - dst_c0
sub_r1 = clip_r1 - dst_r0
sub_c1 = clip_c1 - dst_c0

# Invert the nearest-neighbour mapping: find the smallest
# SrcRect-relative range of rows / cols that
# ``_resample_nearest`` would gather from when producing
# destination rows ``[sub_r0, sub_r1)`` and columns
# ``[sub_c0, sub_c1)``. Each destination pixel ``d``
# samples source pixel ``floor((d + 0.5) * src / dst)``
# clamped to ``src - 1``; taking that mapping at the two
# endpoints bounds the source range we have to read.
# ``+ 1`` on the stop makes the range half-open, matching
# the read_to_array window convention. See issue #1704.
src_row_min = _nn_src_index(
sub_r0, sr.y_size, dr.y_size)
src_row_max = _nn_src_index(
sub_r1 - 1, sr.y_size, dr.y_size)
src_col_min = _nn_src_index(
sub_c0, sr.x_size, dr.x_size)
src_col_max = _nn_src_index(
sub_c1 - 1, sr.x_size, dr.x_size)
read_r0 = sr.y_off + src_row_min
read_c0 = sr.x_off + src_col_min
read_r1 = sr.y_off + src_row_max + 1
read_c1 = sr.x_off + src_col_max + 1

# Cap the resample intermediate before the source read.
# ``_resample_nearest(src_arr, dr.y_size, dr.x_size)`` below
# allocates a ``(dr.y_size, dr.x_size)`` array irrespective
# of how much of it overlaps the window. The output buffer
# is already bounded by ``_check_dimensions`` above, but a
# crafted VRT can declare a SimpleSource ``DstRect`` whose
# ``xSize`` / ``ySize`` are orders of magnitude larger than
# the VRT extent. Without this guard, a single source can
# demand multi-gigabyte intermediates on a tiny output
# raster. Negative sizes are already rejected above; here
# we just enforce the pixel-budget. See issue #1737.
if (dr.x_size > max_pixels
or dr.y_size > max_pixels
or dr.x_size * dr.y_size > max_pixels):
# ``_resample_nearest_window`` below allocates an output
# of ``(sub_r1 - sub_r0, sub_c1 - sub_c0)`` pixels, which
# is bounded by the user's window. The cap previously
# protected against a crafted VRT declaring a huge
# DstRect even on a tiny output; with the windowed read
# the intermediate is now bounded by the window, so the
# pixel-budget check applies to the clipped sub-window
# rather than the full DstRect. See issues #1737 and
# #1704.
sub_dst_h = sub_r1 - sub_r0
sub_dst_w = sub_c1 - sub_c0
if (sub_dst_w > max_pixels
or sub_dst_h > max_pixels
or sub_dst_w * sub_dst_h > max_pixels):
raise ValueError(
f"VRT SimpleSource DstRect "
f"(xSize={dr.x_size}, ySize={dr.y_size}) requires "
f"a resample intermediate of "
f"{dr.x_size * dr.y_size:,} pixels, which exceeds "
f"the safety limit of {max_pixels:,} pixels. "
f"Pass a larger max_pixels= to read_vrt() if this "
f"file is legitimate."
f"{sub_dst_w * sub_dst_h:,} pixels for the "
f"requested window, which exceeds the safety "
f"limit of {max_pixels:,} pixels. Pass a larger "
f"max_pixels= to read_vrt() if this file is "
f"legitimate."
)
else:
read_r0 = sr.y_off + (clip_r0 - dst_r0)
Expand Down Expand Up @@ -1003,28 +1092,29 @@ def read_vrt(vrt_path: str, *, window=None,

if needs_resample:
# Refuse VRTs that request a non-nearest resample
# algorithm. ``_resample_nearest`` below is the only
# implemented kernel; silently substituting it for
# algorithm. ``_resample_nearest_window`` below is the
# only implemented kernel; silently substituting it for
# ``Bilinear`` / ``Cubic`` / ``Average`` / ``Mode``
# would return wrong numbers under the user's
# configured algorithm name. See issue #1751.
_check_resample_alg_supported(src.resample_alg, src.filename)
# Resample the full source rect to the destination rect
# shape, then clip to the window subregion. Doing the
# resample on the full rect keeps the nearest-neighbour
# index math in one place; the post-resample slice is a
# plain numpy view.
src_arr = _resample_nearest(src_arr, dr.y_size, dr.x_size)
# Take the clip subwindow out of the resampled array.
# ``dst_r0`` / ``dst_c0`` anchor the resampled array in
# destination coordinates; the clip rect is in
# destination coordinates too, so the subtract gives the
# right offsets.
sub_r0 = clip_r0 - dst_r0
sub_c0 = clip_c0 - dst_c0
sub_r1 = clip_r1 - dst_r0
sub_c1 = clip_c1 - dst_c0
src_arr = src_arr[sub_r0:sub_r1, sub_c0:sub_c1]
# Resample only the destination sub-window the caller
# asked for. ``_resample_nearest_window`` walks each
# destination pixel in ``[sub_*0, sub_*1)`` through the
# same ``floor((d + 0.5) * src / dst)`` mapping as
# ``_resample_nearest`` would use on the full rect, then
# subtracts the sub-source origin to index into
# ``src_arr`` (which is the minimal sub-rect read above).
# The output is byte-identical to resampling the full
# rect and slicing ``[sub_r0:sub_r1, sub_c0:sub_c1]``
# afterwards. See issue #1704.
src_arr = _resample_nearest_window(
src_arr,
src_row_min, src_col_min,
sr.y_size, sr.x_size,
dr.y_size, dr.x_size,
sub_r0, sub_c0, sub_r1, sub_c1,
)

# Place into output
out_r0 = clip_r0 - r0
Expand Down
104 changes: 55 additions & 49 deletions xrspatial/geotiff/tests/test_vrt_dstrect_resample_cap_1737.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@

A crafted VRT can declare a ``<SimpleSource><DstRect>`` whose ``xSize`` and
``ySize`` are orders of magnitude larger than the VRT's own
``rasterXSize`` / ``rasterYSize``. The output buffer is already bounded by
``_check_dimensions`` against ``max_pixels``, but ``_resample_nearest`` is
called with ``(dr.y_size, dr.x_size)`` *before* the clip is taken, so it
allocates the full DstRect-sized intermediate before discarding most of it.

Regression test for issue #1737: ``read_vrt`` should refuse the read with a
``ValueError`` that names the offending size, rather than try to allocate
gigabytes of intermediate memory on a tiny output.
``rasterXSize`` / ``rasterYSize``. Originally (issue #1737) ``read_vrt``
called ``_resample_nearest(src_arr, dr.y_size, dr.x_size)`` *before* clipping,
allocating the full DstRect-sized intermediate before discarding most of it,
so the read was refused with a ``ValueError`` naming the offending size.

After issue #1704 the resample path reads only the source subset that feeds
the clipped destination sub-window, so the intermediate is bounded by the
caller's window (and by the VRT extent) rather than the raw DstRect. The
huge-DstRect attack vector is therefore neutralised by the read path itself,
not by the per-source pixel-budget guard. The per-source guard still applies
to the clipped sub-window, which is now what the cap is measured against.
"""
from __future__ import annotations

Expand All @@ -24,10 +27,15 @@


def _write_source(td: str) -> str:
"""Write a 10x10 uint8 source GeoTIFF and return its path."""
"""Write a 10x10 uint8 source GeoTIFF and return its path.

Stripped (non-tiled) so the source read does not allocate a 256x256
tile that trips ``_check_dimensions`` under the small ``max_pixels``
values these tests pass.
"""
src_path = os.path.join(td, 'src.tif')
to_geotiff(np.zeros((10, 10), dtype=np.uint8), src_path,
compression='none')
compression='none', tiled=False)
return src_path


Expand All @@ -53,27 +61,30 @@ def _write_vrt(td: str, *, dst_x_size: int, dst_y_size: int,
return vrt_path


def test_huge_dstrect_rejected_before_intermediate_allocation():
"""A DstRect that would force a multi-billion-pixel resample intermediate
must raise ``ValueError`` before ``_resample_nearest`` allocates."""
def test_huge_dstrect_no_longer_allocates_full_intermediate():
"""After #1704 the windowed read clips a 50000x50000 DstRect down to
the 100x100 VRT extent, so the resample intermediate is 100x100 and
no longer hits the pixel-budget cap. The earlier behaviour rejected
the read up front; the new behaviour just returns the assembled
100x100 mosaic.
"""
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
_write_source(td)
# 50000 x 50000 = 2.5 billion pixels of intermediate; the output
# buffer is only 100 x 100. With the cap in place this should
# raise before _resample_nearest runs.
vrt_path = _write_vrt(td, dst_x_size=50000, dst_y_size=50000)
with pytest.raises(ValueError, match="resample intermediate"):
read_vrt(vrt_path)
arr, _ = read_vrt(vrt_path)
assert arr.shape == (100, 100)


def test_huge_dstrect_y_axis_rejected():
"""Asymmetric blow-up: only one axis is huge. Still rejected."""
def test_huge_dstrect_y_axis_clipped_to_extent():
"""Asymmetric blow-up: ``ySize`` declared as 10 billion but the VRT
extent caps the clipped sub-window at 100 rows. Read succeeds with
the bounded intermediate."""
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
_write_source(td)
vrt_path = _write_vrt(
td, dst_x_size=10, dst_y_size=10_000_000_000)
with pytest.raises(ValueError, match="resample intermediate"):
read_vrt(vrt_path)
arr, _ = read_vrt(vrt_path)
assert arr.shape == (100, 100)


def test_legitimate_upsample_still_works():
Expand All @@ -86,48 +97,43 @@ def test_legitimate_upsample_still_works():
assert arr.shape == (100, 100)


def test_max_pixels_kwarg_raises_cap():
"""A DstRect rejected under a tiny ``max_pixels`` must be accepted
when the caller bumps the cap. This exercises the override contract
(the same VRT goes from rejected to accepted across the two calls).
def test_per_source_cap_bites_when_sub_window_exceeds_budget():
"""The per-source pixel-budget guard applies to the clipped
sub-window, not the raw DstRect. Pick a VRT and ``max_pixels`` where
the sub-window itself exceeds the cap so the per-source check fires
even after the windowed-read change.

The output buffer is kept tiny (VRT raster 10x10) so the cap only
bites on the resample intermediate, not on the ``_check_dimensions``
pre-allocation guard.
The output buffer dimension check (``_check_dimensions``) is also
bounded by ``max_pixels``, so to isolate the per-source branch we
request a window whose sub-window product crosses the cap. Both
guards use the same threshold; the per-source one provides defence
in depth.
"""
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
_write_source(td)
# 2000x2000 = 4e6 intermediate pixels. Output buffer is 10x10=100
# pixels, far below both caps below. Upsample 10x10 -> 2000x2000
# so ``needs_resample`` is true and the cap branch runs.
vrt_path = _write_vrt(td, dst_x_size=2000, dst_y_size=2000,
raster_x=10, raster_y=10)
# First, the cap is too small for the intermediate: rejected.
with pytest.raises(ValueError, match="resample intermediate"):
raster_x=2000, raster_y=2000)
# Sub-window is 2000x2000 = 4e6 pixels. Cap of 1e6 rejects.
with pytest.raises(ValueError, match="resample intermediate|safety limit"):
read_vrt(vrt_path, max_pixels=1_000_000)
# Bump the cap above 4e6: accepted.
arr, _ = read_vrt(vrt_path, max_pixels=4_000_000)
assert arr.shape == (10, 10)
assert arr.shape == (2000, 2000)


def test_dstrect_at_cap_succeeds():
"""Exactly at ``max_pixels`` is accepted; the cap is inclusive.
Together with :func:`test_max_pixels_kwarg_raises_cap`, this pins
down both sides of the override boundary."""
def test_per_source_cap_inclusive_boundary():
"""The per-source cap is inclusive: exactly ``max_pixels`` succeeds,
one below rejects. Mirrors the boundary the original #1737 test
pinned down, on the new sub-window semantics."""
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
_write_source(td)
# Upsample 10x10 -> 100x100 so ``needs_resample`` is true. The
# VRT raster is 10x10 so the output buffer stays at 100 pixels,
# well below the ``_check_dimensions`` guard; the cap below
# therefore exercises the resample-intermediate branch only.
vrt_path = _write_vrt(td, dst_x_size=100, dst_y_size=100,
raster_x=10, raster_y=10)
# Just below the cap rejects.
with pytest.raises(ValueError, match="resample intermediate"):
raster_x=100, raster_y=100)
# Sub-window is 100x100 = 10_000 pixels.
with pytest.raises(ValueError, match="resample intermediate|safety limit"):
read_vrt(vrt_path, max_pixels=9_999)
# At the cap succeeds.
arr, _ = read_vrt(vrt_path, max_pixels=10000)
assert arr.shape == (10, 10)
assert arr.shape == (100, 100)


def test_negative_dstrect_rejected():
Expand Down
Loading
Loading