Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .claude/sweep-accuracy-state.csv

Large diffs are not rendered by default.

99 changes: 84 additions & 15 deletions xrspatial/geotiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,11 @@ def _read_geo_info(source, *, overview_level: int | None = None):
n_bands = (
_ifd.samples_per_pixel if _ifd.samples_per_pixel > 1 else 0
)
# Stash photometric + samples_per_pixel so the dask graph builder
# can detect MinIsWhite and invert ``geo_info.nodata`` before
# binding it into the chunk closure (#1809).
geo_info._ifd_photometric = _ifd.photometric
geo_info._ifd_samples_per_pixel = _ifd.samples_per_pixel
return geo_info, _ifd.height, _ifd.width, file_dtype, n_bands
if _is_file_like(source):
# File-like: read its full bytes; we don't try to mmap arbitrary
Expand Down Expand Up @@ -540,6 +545,11 @@ def _read_geo_info(source, *, overview_level: int | None = None):
bps = resolve_bits_per_sample(ifd.bits_per_sample)
file_dtype = tiff_dtype_to_numpy(bps, ifd.sample_format)
n_bands = ifd.samples_per_pixel if ifd.samples_per_pixel > 1 else 0
# Stash photometric + samples_per_pixel so the dask graph builder
# can detect MinIsWhite and invert ``geo_info.nodata`` before
# binding it into the chunk closure (#1809).
geo_info._ifd_photometric = ifd.photometric
geo_info._ifd_samples_per_pixel = ifd.samples_per_pixel
return geo_info, ifd.height, ifd.width, file_dtype, n_bands
finally:
if close_data:
Expand Down Expand Up @@ -906,9 +916,15 @@ def open_geotiff(source: str | BinaryIO, *,
nodata = geo_info.nodata
if nodata is not None:
attrs['nodata'] = nodata
# When the reader applied MinIsWhite, the sentinel-equality mask
# must compare against the inverted sentinel value (issue #1809).
# ``read_to_array`` / ``_read_cog_http`` stash that value on
# ``geo_info._mask_nodata``; fall back to the original sentinel
# on non-MinIsWhite files.
mask_nodata = getattr(geo_info, '_mask_nodata', nodata)
if arr.dtype.kind == 'f':
if not np.isnan(nodata):
arr[arr == arr.dtype.type(nodata)] = np.nan
if mask_nodata is not None and not np.isnan(mask_nodata):
arr[arr == arr.dtype.type(mask_nodata)] = np.nan
elif arr.dtype.kind in ('u', 'i'):
# Integer arrays: convert to float to represent NaN.
# An out-of-range sentinel (e.g. uint16 file with
Expand All @@ -927,8 +943,10 @@ def open_geotiff(source: str | BinaryIO, *,
# ``_writer.py`` / ``_vrt.py`` pattern used for #1564 / #1616).
# attrs['nodata'] still carries the raw sentinel so a write
# round-trip preserves the tag.
if np.isfinite(nodata) and float(nodata).is_integer():
nodata_int = int(nodata)
if (mask_nodata is not None
and np.isfinite(mask_nodata)
and float(mask_nodata).is_integer()):
nodata_int = int(mask_nodata)
info = np.iinfo(arr.dtype)
if info.min <= nodata_int <= info.max:
mask = arr == arr.dtype.type(nodata_int)
Expand Down Expand Up @@ -2213,11 +2231,31 @@ def read_geotiff_dask(source: str, *,
http_ifd.samples_per_pixel
if http_ifd.samples_per_pixel > 1 else 0
)
# Stash IFD photometric for the MinIsWhite nodata-inversion check below.
geo_info._ifd_photometric = http_ifd.photometric
geo_info._ifd_samples_per_pixel = http_ifd.samples_per_pixel
else:
# Metadata-only read: O(1) memory via mmap, no pixel decompression
geo_info, full_h, full_w, file_dtype, n_bands = _read_geo_info(
source, overview_level=overview_level)
nodata = geo_info.nodata
nodata_attr = nodata # original sentinel preserved for attrs['nodata']
# When the source is MinIsWhite (photometric == 0, samples_per_pixel == 1),
# the per-chunk reader inverts pixel values before this closure's nodata
# mask runs. Track the inverted sentinel so the mask compares against the
# post-inversion value, not the original (#1809).
if nodata is not None:
_phm = getattr(geo_info, '_ifd_photometric', None)
_spp = getattr(geo_info, '_ifd_samples_per_pixel', None)
if _phm == 0 and _spp == 1:
if file_dtype.kind == 'u' and np.isfinite(nodata) and \
float(nodata).is_integer():
vi = int(nodata)
info = np.iinfo(file_dtype)
if info.min <= vi <= info.max:
nodata = info.max - vi
elif file_dtype.kind == 'f' and not np.isnan(nodata):
nodata = -float(nodata)

# Nodata masking promotes integer arrays to float64 (for NaN).
# Validate against the effective dtype, not the raw file dtype.
Expand Down Expand Up @@ -2328,8 +2366,8 @@ def read_geotiff_dask(source: str, *,

attrs = {}
_populate_attrs_from_geo_info(attrs, geo_info, window=window)
if nodata is not None:
attrs['nodata'] = nodata
if nodata_attr is not None:
attrs['nodata'] = nodata_attr

if isinstance(chunks, int):
ch_h = ch_w = chunks
Expand Down Expand Up @@ -3029,7 +3067,7 @@ def read_geotiff_gpu(source: str, *,
# so ``window`` is None whenever ``geo_info`` will be remapped
# below.
src.close()
arr_cpu, _ = _read_to_array(
arr_cpu, _stripped_geo = _read_to_array(
source, overview_level=overview_level,
window=window, band=band, max_pixels=max_pixels)
arr_gpu = cupy.asarray(arr_cpu)
Expand All @@ -3046,10 +3084,14 @@ def read_geotiff_gpu(source: str, *,
# with the CPU eager path. Without this, integer rasters keep
# the literal sentinel value and float rasters keep the
# sentinel rather than NaN -- a silent backend divergence.
# ``read_to_array`` stashes the post-MinIsWhite sentinel on
# ``_mask_nodata`` when applicable; fall back to the original
# sentinel otherwise (#1809).
nodata = geo_info.nodata
if nodata is not None:
attrs['nodata'] = nodata
arr_gpu = _apply_nodata_mask_gpu(arr_gpu, nodata)
mask_value = getattr(_stripped_geo, '_mask_nodata', nodata)
arr_gpu = _apply_nodata_mask_gpu(arr_gpu, mask_value)
if dtype is not None:
target = np.dtype(dtype)
_validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target)
Expand Down Expand Up @@ -3161,6 +3203,12 @@ def read_geotiff_gpu(source: str, *,
# the orientation remap from PR #1521 + #1537 for free; the pure GPU
# paths still need the explicit remap added in #1540.
arr_was_cpu_decoded = False
# When a CPU fallback runs, ``read_to_array`` has already applied the
# MinIsWhite inversion and stashed the post-inversion sentinel on
# ``_mask_nodata``. Keep that geo_info alongside the pre-extracted one
# so the downstream nodata mask compares against the correct value
# (Copilot review of #1817).
_cpu_fallback_geo = None

# PlanarConfiguration=2 (separate bands): each band has its own list
# of tiles back-to-back in TileOffsets / TileByteCounts. The GPU
Expand Down Expand Up @@ -3224,10 +3272,12 @@ def _read_once():
break
band_arrays.append(band_arr)
if cpu_fallback_needed:
# Drop read_to_array's geo_info: orientation transform handling
# below operates on our pre-extracted geo_info so the 2/3/4 case
# is covered regardless of #1539's merge state.
arr_cpu, _ = _read_to_array(
# Drop read_to_array's geo_info for orientation transform
# handling (below operates on our pre-extracted geo_info so the
# 2/3/4 case is covered regardless of #1539's merge state), but
# keep it on ``_cpu_fallback_geo`` so the MinIsWhite-aware nodata
# mask below sees ``_mask_nodata``.
arr_cpu, _cpu_fallback_geo = _read_to_array(
source, overview_level=overview_level)
arr_gpu = cupy.asarray(arr_cpu)
arr_was_cpu_decoded = True
Expand All @@ -3240,7 +3290,7 @@ def _read_once():
f"({height}, {width}, {samples})"
)
elif has_sparse_tile:
arr_cpu, _ = _read_to_array(
arr_cpu, _cpu_fallback_geo = _read_to_array(
source, overview_level=overview_level)
arr_gpu = cupy.asarray(arr_cpu)
arr_was_cpu_decoded = True
Expand Down Expand Up @@ -3297,7 +3347,7 @@ def _read_once():
RuntimeWarning,
stacklevel=2,
)
arr_cpu, _ = _read_to_array(
arr_cpu, _cpu_fallback_geo = _read_to_array(
source, overview_level=overview_level)
arr_gpu = cupy.asarray(arr_cpu)
arr_was_cpu_decoded = True
Expand Down Expand Up @@ -3328,8 +3378,14 @@ def _read_once():
geo_info = _apply_orientation_geo_info(
geo_info, orientation, file_h=height, file_w=width)

_mw_mask_nodata = None
if (ifd.photometric == 0 and samples == 1 and not arr_was_cpu_decoded):
from ._reader import _miniswhite_inverted_nodata as _miw_inv_nd
gpu_dtype = np.dtype(str(arr_gpu.dtype))
# Compute the post-MinIsWhite sentinel BEFORE inverting the array,
# so the downstream ``_apply_nodata_mask_gpu`` call compares
# against the right value (#1809).
_mw_mask_nodata = _miw_inv_nd(geo_info.nodata, ifd, gpu_dtype)
if gpu_dtype.kind == 'u':
arr_gpu = np.iinfo(gpu_dtype).max - arr_gpu
elif gpu_dtype.kind == 'f':
Expand All @@ -3343,7 +3399,20 @@ def _read_once():
# surprise a user-supplied dtype.
nodata = geo_info.nodata
if nodata is not None:
arr_gpu = _apply_nodata_mask_gpu(arr_gpu, nodata)
# When MinIsWhite was applied, the mask must use the inverted
# sentinel; otherwise the original sentinel. The pure GPU path
# records the inverted sentinel in ``_mw_mask_nodata`` above; the
# CPU-fallback paths (sparse-tile, planar=2 auto-fallback, and
# post-decode CPU fallback) get it from ``read_to_array`` via
# ``_cpu_fallback_geo._mask_nodata`` (Copilot review of #1817).
if _mw_mask_nodata is not None:
_gpu_mask_value = _mw_mask_nodata
elif _cpu_fallback_geo is not None:
_gpu_mask_value = getattr(
_cpu_fallback_geo, '_mask_nodata', nodata)
else:
_gpu_mask_value = nodata
arr_gpu = _apply_nodata_mask_gpu(arr_gpu, _gpu_mask_value)

if dtype is not None:
target = np.dtype(dtype)
Expand Down
91 changes: 79 additions & 12 deletions xrspatial/geotiff/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1560,9 +1560,14 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader,
raise ValueError(
f"Invalid tile dimensions: TileWidth={tw}, TileLength={th}")

# Reject crafted tile dims that would force huge per-tile allocations.
# A single tile's decoded bytes must also fit under the pixel budget.
_check_dimensions(tw, th, samples, max_pixels)
# Reject crafted tile dims (e.g. TileWidth = 2**31). This guards the
# TIFF header against malformed values; it is not the caller's output
# budget. The output-window check below uses ``max_pixels`` and is
# what enforces the user's per-call memory cap. The source-read path
# under ``read_vrt`` (#1796) relies on that output check to honour a
# small caller ``max_pixels`` against a normal-tile source; see
# #1823.
_check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT)

# Per-tile compressed-byte cap (issue #1664). Same env var as the
# HTTP path. mmap slicing is bounded by the file size, but the slice
Expand Down Expand Up @@ -1954,7 +1959,15 @@ def _read_cog_http(url: str, overview_level: int | None = None,
arr, geo_info = _apply_orientation_with_geo(
arr, geo_info, ifd.orientation)

arr = _apply_photometric_miniswhite(arr, ifd)
if ifd.photometric == 0 and ifd.samples_per_pixel == 1:
# Stash the inverted sentinel on geo_info so the caller's
# sentinel-to-NaN mask runs against the post-MinIsWhite value
# while ``attrs['nodata']`` keeps the original sentinel for
# round-trip on write (issue #1809).
inverted_nodata = _miniswhite_inverted_nodata(
geo_info.nodata, ifd, arr.dtype)
arr = _apply_photometric_miniswhite(arr, ifd)
geo_info._mask_nodata = inverted_nodata

return arr, geo_info

Expand Down Expand Up @@ -2016,10 +2029,14 @@ def _fetch_decode_cog_http_tiles(
# A windowed HTTP read of a multi-billion-pixel COG only allocates
# the window, so capping the full image would reject legitimate
# tiled reads. The full-image cap still applies for whole-file
# reads (window is None). The single-tile budget always applies.
# reads (window is None). The per-tile dim check below guards the
# TIFF header against absurd ``TileWidth`` / ``TileLength`` values
# (e.g. 2**31) and uses ``MAX_PIXELS_DEFAULT`` so a caller's small
# ``max_pixels`` -- intended as an output-window budget -- does not
# reject normal 256x256 tiles. See #1823.
if window is None:
_check_dimensions(width, height, samples, max_pixels)
_check_dimensions(tw, th, samples, max_pixels)
_check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT)

# Reject malformed TIFFs whose declared tile grid exceeds the supplied
# TileOffsets length. See issue #1219.
Expand Down Expand Up @@ -2307,14 +2324,53 @@ def _apply_orientation_with_geo(

def _apply_photometric_miniswhite(arr: np.ndarray, ifd: IFD) -> np.ndarray:
"""Apply TIFF MinIsWhite inversion for single-band grayscale images."""
if ifd.photometric == 0 and ifd.samples_per_pixel == 1:
if arr.dtype.kind == 'u':
return np.iinfo(arr.dtype).max - arr
if arr.dtype.kind == 'f':
return -arr
if ifd.photometric != 0 or ifd.samples_per_pixel != 1:
return arr
if arr.dtype.kind == 'u':
return np.iinfo(arr.dtype).max - arr
if arr.dtype.kind == 'f':
return -arr
return arr


def _miniswhite_inverted_nodata(nodata, ifd: IFD, dtype: np.dtype):
"""Return the nodata sentinel value after MinIsWhite inversion.

When the reader applied MinIsWhite (``photometric == 0``,
``samples_per_pixel == 1``), the original integer sentinel ``s`` is
rewritten to ``iinfo(dtype).max - s`` and the float sentinel ``s`` to
``-s``. Downstream nodata-to-NaN masks must compare against the
inverted sentinel rather than the original, otherwise they flag the
wrong pixels: inverted real data colliding with the original
sentinel value is incorrectly masked while the real nodata cells
keep their inverted-sentinel value (issue #1809).

Returns the inverted nodata sentinel, or the original ``nodata``
when MinIsWhite was not applied / not applicable. Non-finite or
out-of-range nodata is returned unchanged so callers' downstream
skip-the-mask logic stays unchanged.
"""
if nodata is None:
return nodata
if ifd.photometric != 0 or ifd.samples_per_pixel != 1:
return nodata
if dtype.kind == 'u':
if not np.isfinite(nodata):
return nodata
if not float(nodata).is_integer():
return nodata
vi = int(nodata)
info = np.iinfo(dtype)
if not (info.min <= vi <= info.max):
return nodata
return info.max - vi
if dtype.kind == 'f':
if np.isnan(nodata):
return nodata
return -float(nodata)
return nodata


def read_to_array(source, *, window=None, overview_level: int | None = None,
band: int | None = None,
max_pixels: int = MAX_PIXELS_DEFAULT,
Expand Down Expand Up @@ -2456,7 +2512,18 @@ def read_to_array(source, *, window=None, overview_level: int | None = None,
arr, geo_info = _apply_orientation_with_geo(
arr, geo_info, orientation)

arr = _apply_photometric_miniswhite(arr, ifd)
if ifd.photometric == 0 and ifd.samples_per_pixel == 1:
# The MinIsWhite inversion rewrites the original sentinel
# value, so any downstream nodata-to-NaN mask must compare
# against the inverted sentinel instead. Stash the inverted
# sentinel on geo_info as a private attribute so callers can
# apply the mask post-inversion while keeping the original
# sentinel on ``geo_info.nodata`` for the attrs round-trip
# (issue #1809).
inverted_nodata = _miniswhite_inverted_nodata(
geo_info.nodata, ifd, arr.dtype)
arr = _apply_photometric_miniswhite(arr, ifd)
geo_info._mask_nodata = inverted_nodata
finally:
src.close()

Expand Down
Loading
Loading