diff --git a/xrspatial/geotiff/tests/golden_corpus/_oracle.py b/xrspatial/geotiff/tests/golden_corpus/_oracle.py index ccfe907a7..8ce11f7a5 100644 --- a/xrspatial/geotiff/tests/golden_corpus/_oracle.py +++ b/xrspatial/geotiff/tests/golden_corpus/_oracle.py @@ -245,6 +245,76 @@ def _pixels_equal(ref: np.ndarray, cand: np.ndarray) -> bool: return np.array_equal(ref, cand) +# --------------------------------------------------------------------------- +# Masked-nodata normalisation (issue #1988) +# --------------------------------------------------------------------------- + +def _has_masked_nodata(candidate_da: xr.DataArray) -> bool: + """True when the candidate reports xrspatial's masked-nodata contract. + + The contract (issue #1988): xrspatial reads an integer GeoTIFF whose + nodata tag carries an integer sentinel, masks the sentinel-equal + pixels to NaN, and upcasts the array to float so NaN can live in + it. The reader stamps ``attrs['masked_nodata'] = True`` to record + that the masking happened; ``attrs['nodata']`` still carries the + original integer sentinel so a write round-trip can put the tag + back. + """ + return bool(candidate_da.attrs.get('masked_nodata', False)) + + +def _normalise_for_masked_nodata( + ref_pixels: np.ndarray, + ref_dtype: np.dtype, + ref_nodata, + candidate_da: xr.DataArray, +) -> tuple[np.ndarray, np.dtype]: + """Apply xrspatial's masked-nodata contract to the rasterio reference. + + When the candidate reports ``attrs['masked_nodata']=True``, the + reference is rewritten so it matches what the candidate produced: + cast to the candidate's float dtype, then any pixel equal to the + integer sentinel ``ref_nodata`` becomes ``NaN``. The ``ref_dtype`` + returned alongside is the candidate's dtype, so the downstream + ``_assert_dtype`` check passes by design. + + If the candidate does not report masked nodata, or if ``ref_nodata`` + is not a finite integer sentinel, the inputs pass through + unchanged. That keeps every existing fixture's behaviour intact. + """ + if not _has_masked_nodata(candidate_da): + return ref_pixels, ref_dtype + cand_dtype = np.dtype(candidate_da.dtype) + if cand_dtype.kind != 'f': + # The masked_nodata contract requires a float dtype on the + # candidate so NaN can live in the array. If it is anything + # else, fall through to the normal strict comparison and let + # it fail with a clear message. + return ref_pixels, ref_dtype + # The sentinel must be a finite, integer-valued real number that + # fits in the source integer dtype. The upstream xrspatial reader + # at xrspatial/geotiff/__init__.py only sets attrs['masked_nodata'] + # under the same three guards; the oracle mirrors them so a + # candidate that drifts (sets the flag with an out-of-range or + # fractional sentinel) cannot trick the oracle into masking the + # wrong pixels via integer truncation or unsigned wraparound. + try: + nd_float = float(ref_nodata) + except (TypeError, ValueError): + return ref_pixels, ref_dtype + if not np.isfinite(nd_float): + return ref_pixels, ref_dtype + if not float(nd_float).is_integer(): + return ref_pixels, ref_dtype + if ref_pixels.dtype.kind in ('i', 'u'): + info = np.iinfo(ref_pixels.dtype) + if not (info.min <= int(nd_float) <= info.max): + return ref_pixels, ref_dtype + new_ref = ref_pixels.astype(cand_dtype, copy=True) + new_ref[ref_pixels == ref_pixels.dtype.type(nd_float)] = np.nan + return new_ref, cand_dtype + + # --------------------------------------------------------------------------- # Canonical-attrs hook (issue #1984) # --------------------------------------------------------------------------- @@ -459,6 +529,14 @@ def compare_to_oracle( 'dtype': ref_dtype, } + # When the candidate reports the masked-nodata contract (#1988), + # rewrite the rasterio reference to match: cast to the candidate's + # float dtype and replace sentinel-equal pixels with NaN. Then the + # dtype + pixel assertions run on directly comparable arrays. + ref_pixels, ref_dtype = _normalise_for_masked_nodata( + ref_pixels, ref_dtype, ref_nodata, candidate_da + ) + _assert_dtype(ref_dtype, candidate_da) _assert_transform(ref_transform, candidate_da, ref_has_georef=ref_has_georef) _assert_crs(ref_crs, candidate_da) diff --git a/xrspatial/geotiff/tests/golden_corpus/test_oracle.py b/xrspatial/geotiff/tests/golden_corpus/test_oracle.py index 634a463b8..91495058a 100644 --- a/xrspatial/geotiff/tests/golden_corpus/test_oracle.py +++ b/xrspatial/geotiff/tests/golden_corpus/test_oracle.py @@ -502,3 +502,214 @@ def test_crs_equal_rejects_empty_proj_dict() -> None: assert b.to_epsg() is None assert a.to_dict() == {} == b.to_dict() assert _crs_equal(a, b) is False + + +# --------------------------------------------------------------------------- +# Masked-nodata contract (issue #1988) +# --------------------------------------------------------------------------- + +def _masked_nodata_pair( + tmp_path: Path, sentinel: int = 0 +) -> tuple[Path, xr.DataArray, np.ndarray]: + """Build a uint16 fixture with an integer nodata sentinel and a + float candidate that has masked the sentinel pixels to NaN. + + Returns ``(fixture_path, candidate, candidate_float_array)``. + """ + raw = np.array( + [ + [1, 2, sentinel, 4], + [5, sentinel, 7, 8], + [9, 10, 11, 12], + [13, 14, 15, 16], + ], + dtype=np.uint16, + ) + transform = from_origin(0.0, 4.0, 1.0, 1.0) + fixture = _write_tiff( + tmp_path / 'masked_nodata.tif', + raw, + transform=transform, + nodata=sentinel, + ) + masked = raw.astype(np.float64) + masked[raw == sentinel] = np.nan + cand = _build_candidate( + masked, transform=transform, nodata=sentinel, + ) + cand.attrs['masked_nodata'] = True + return fixture, cand, masked + + +def test_masked_nodata_success(tmp_path: Path) -> None: + """Candidate that masks the integer sentinel to NaN passes the oracle. + + The oracle rewrites the rasterio reference to match the candidate's + float-plus-NaN view before comparing, so dtype shift + pixel mask + no longer registers as a mismatch. + """ + fixture, cand, _ = _masked_nodata_pair(tmp_path) + compare_to_oracle(fixture, cand) + + +def test_masked_nodata_off_keeps_strict_dtype_check(tmp_path: Path) -> None: + """Without ``masked_nodata=True``, the oracle keeps strict dtype parity. + + Drop the flag and the candidate's float dtype no longer matches the + rasterio uint16 reference; the dtype assertion fires. + """ + fixture, cand, _ = _masked_nodata_pair(tmp_path) + del cand.attrs['masked_nodata'] + with pytest.raises(AssertionError, match='dtype mismatch'): + compare_to_oracle(fixture, cand) + + +def test_masked_nodata_candidate_left_sentinel_pixel_fails( + tmp_path: Path, +) -> None: + """If the candidate forgot to mask a sentinel pixel, the pixel + comparison fires. + + The reference is rewritten to have NaN at the sentinel positions; + a candidate that still carries the raw integer value (here 0, + cast to float) at one of those positions cannot pass NaN-aware + equality. + """ + fixture, cand, masked = _masked_nodata_pair(tmp_path) + # Restore the first sentinel pixel to the raw value, simulating + # a reader that mis-masked. + bad = masked.copy() + bad[0, 2] = 0.0 + cand2 = _build_candidate( + bad, transform=from_origin(0.0, 4.0, 1.0, 1.0), nodata=0, + ) + cand2.attrs['masked_nodata'] = True + with pytest.raises(AssertionError, match='pixel arrays differ'): + compare_to_oracle(fixture, cand2) + + +def test_masked_nodata_wrong_nodata_attr_fails(tmp_path: Path) -> None: + """``attrs['nodata']`` must still match the fixture even with masking. + + The oracle masks the reference using the *reference's* nodata, so + a wrong sentinel in the candidate's attrs is caught by the + independent ``_assert_nodata`` check before pixels are compared. + """ + fixture, cand, _ = _masked_nodata_pair(tmp_path) + cand.attrs['nodata'] = 999 # not the real sentinel + with pytest.raises(AssertionError, match='nodata mismatch'): + compare_to_oracle(fixture, cand) + + +def test_masked_nodata_does_not_engage_for_float_nan_fixture( + tmp_path: Path, +) -> None: + """Float-NaN-nodata fixtures take the plain pixel path unchanged. + + The masked-nodata normaliser only fires when the candidate reports + ``masked_nodata=True``. A float fixture whose nodata is NaN never + needs the dtype shift, and the new path must not muck with it. + """ + data = np.array([[1.0, np.nan], [np.nan, 4.0]], dtype=np.float32) + transform = from_origin(0.0, 2.0, 1.0, 1.0) + fixture = _write_tiff( + tmp_path / 'float_nan.tif', data, transform=transform, + nodata=float('nan'), + ) + cand = _build_candidate( + data.copy(), transform=transform, nodata=float('nan'), + ) + # No masked_nodata flag -- the candidate is float to begin with. + compare_to_oracle(fixture, cand) + + +def test_masked_nodata_with_non_integer_sentinel_passes_through( + tmp_path: Path, +) -> None: + """A NaN sentinel in the rasterio reference does not trip the integer + masking path even when ``masked_nodata`` is set. + + The normaliser is gated on a finite sentinel via ``np.isfinite``. + A NaN-nodata fixture should compare like any other float-NaN + fixture: candidate keeps its own NaN pixels, reference keeps + its own NaN pixels, and NaN-aware equality does the rest. + """ + data = np.array([[1.0, np.nan], [np.nan, 4.0]], dtype=np.float32) + transform = from_origin(0.0, 2.0, 1.0, 1.0) + fixture = _write_tiff( + tmp_path / 'masked_nan.tif', data, transform=transform, + nodata=float('nan'), + ) + cand = _build_candidate( + data.copy(), transform=transform, nodata=float('nan'), + ) + # Set the flag to confirm the normaliser early-exits cleanly on + # NaN sentinels rather than trying to do something silly. + cand.attrs['masked_nodata'] = True + compare_to_oracle(fixture, cand) + + +def test_masked_nodata_fractional_sentinel_does_not_mask( + tmp_path: Path, +) -> None: + """A non-integer sentinel cannot match an integer pixel; the + normaliser declines to mask. + + Without this guard the oracle would cast ``3.5`` to ``3`` and + silently mask every 3-valued pixel. The upstream xrspatial reader + never sets ``attrs['masked_nodata']`` for a fractional sentinel, + so the oracle's stricter check mirrors that contract. Outcome: + the oracle stays on the raw-pixel path and the dtype mismatch + surfaces normally. + """ + raw = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint16) + transform = from_origin(0.0, 2.0, 1.0, 1.0) + fixture = _write_tiff( + tmp_path / 'frac_sentinel.tif', raw, transform=transform, + nodata=3.5, + ) + masked = raw.astype(np.float64) + cand = _build_candidate( + masked, transform=transform, nodata=3.5, + ) + cand.attrs['masked_nodata'] = True + with pytest.raises(AssertionError, match='dtype mismatch'): + compare_to_oracle(fixture, cand) + + +def test_masked_nodata_out_of_range_sentinel_does_not_mask() -> None: + """A sentinel outside the source integer range cannot match any pixel. + + Without the range guard, ``np.uint16(-1.0)`` wraps to ``65535`` and + masks the dtype-max pixel. The reader rejects such sentinels via + ``info.min <= nodata_int <= info.max``; the oracle mirrors that + check. + + Rasterio refuses to write an out-of-range nodata at the writer + level, so we cannot reach this code path through a real fixture. + The test calls the helper directly with synthesised inputs to + confirm the guard fires. + """ + from xrspatial.geotiff.tests.golden_corpus._oracle import ( + _normalise_for_masked_nodata, + ) + + ref_pixels = np.array( + [[1, 2, 65535], [4, 5, 6]], dtype=np.uint16, + ) + ref_dtype = ref_pixels.dtype + cand = _build_candidate( + ref_pixels.astype(np.float64), + transform=from_origin(0.0, 2.0, 1.0, 1.0), + nodata=-1, + ) + cand.attrs['masked_nodata'] = True + + out_pixels, out_dtype = _normalise_for_masked_nodata( + ref_pixels, ref_dtype, -1, cand, + ) + # Out-of-range sentinel must abort the rewrite. The helper returns + # the inputs unchanged; the unsigned wraparound that would otherwise + # mask the dtype-max pixel does not happen. + assert out_dtype == ref_dtype + assert out_pixels is ref_pixels diff --git a/xrspatial/geotiff/tests/test_golden_corpus_dask_numpy_1930.py b/xrspatial/geotiff/tests/test_golden_corpus_dask_numpy_1930.py index ed3e7e27e..78c105e01 100644 --- a/xrspatial/geotiff/tests/test_golden_corpus_dask_numpy_1930.py +++ b/xrspatial/geotiff/tests/test_golden_corpus_dask_numpy_1930.py @@ -46,16 +46,11 @@ CHUNK_SIZE = 32 -_NODATA_MASKING_REASON = ( - "integer nodata masking: xrspatial masks sentinel pixels to NaN and " - "upcasts to float64 per #1988 (attrs['masked_nodata']=True). The oracle " - "compares raw integer pixels; needs an oracle extension that consults " - "attrs['masked_nodata']." -) - # Shared parity gaps. These live in the codec / attrs layer, not in the # dask plumbing, so the dask backend inherits them verbatim from the -# eager backend. +# eager backend. Integer nodata masking used to live here too; the +# oracle's _normalise_for_masked_nodata helper closes that gap so it +# is no longer xfailed on any backend. _PARITY_GAPS: dict[str, str] = { "compression_jpeg_uint8_ycbcr": ( "RGB band axis order divergence: rasterio reads (bands, y, x) while " @@ -67,11 +62,6 @@ "attrs['geog_citation'] but does not emit a canonical attrs['crs'] " "or attrs['crs_wkt']. Real parity gap; needs a fix in _crs.py." ), - "nodata_int_sentinel_uint16": _NODATA_MASKING_REASON, - "stripped_le_uint16": _NODATA_MASKING_REASON, - "stripped_be_uint16": _NODATA_MASKING_REASON, - "tiled_le_uint16": _NODATA_MASKING_REASON, - "tiled_be_uint16": _NODATA_MASKING_REASON, } # Dask-only gaps go here. Empty in the first pass; add an entry only when diff --git a/xrspatial/geotiff/tests/test_golden_corpus_eager_numpy_1930.py b/xrspatial/geotiff/tests/test_golden_corpus_eager_numpy_1930.py index a48c3c61b..81a51da22 100644 --- a/xrspatial/geotiff/tests/test_golden_corpus_eager_numpy_1930.py +++ b/xrspatial/geotiff/tests/test_golden_corpus_eager_numpy_1930.py @@ -32,13 +32,16 @@ deprecated ``attrs['geog_citation']`` but does not emit a canonical ``attrs['crs']`` or ``attrs['crs_wkt']``. Real parity gap; needs a fix in ``_crs.py`` to round-trip citation WKT. -* ``nodata_int_sentinel_uint16``, ``stripped_le_uint16``, - ``stripped_be_uint16``, ``tiled_le_uint16``, ``tiled_be_uint16`` -- - integer nodata masking. xrspatial masks sentinel pixels to NaN and - upcasts to float64 per #1988 (``attrs['masked_nodata']=True``); the - oracle compares the raw integer pixel array. Needs a small oracle - extension that consults ``attrs['masked_nodata']`` and applies the - equivalent mask to the rasterio reference before comparing. + +Resolved gaps (no longer xfail): + +* Integer nodata masking. The oracle now consults + ``attrs['masked_nodata']`` and rewrites the rasterio reference to + match the candidate's float-plus-NaN view (see + ``_normalise_for_masked_nodata`` in ``_oracle.py``), so the five + fixtures ``nodata_int_sentinel_uint16``, ``stripped_le_uint16``, + ``stripped_be_uint16``, ``tiled_le_uint16``, ``tiled_be_uint16`` + pass directly. Intentional skip (``skip``): @@ -71,13 +74,6 @@ ) -_NODATA_MASKING_REASON = ( - "integer nodata masking: xrspatial masks sentinel pixels to NaN and " - "upcasts to float64 per #1988 (attrs['masked_nodata']=True). The oracle " - "compares raw integer pixels; needs an oracle extension that consults " - "attrs['masked_nodata']." -) - _PARITY_GAPS: dict[str, str] = { "compression_jpeg_uint8_ycbcr": ( "RGB band axis order divergence: rasterio reads (bands, y, x) while " @@ -89,11 +85,6 @@ "attrs['geog_citation'] but does not emit a canonical attrs['crs'] " "or attrs['crs_wkt']. Real parity gap; needs a fix in _crs.py." ), - "nodata_int_sentinel_uint16": _NODATA_MASKING_REASON, - "stripped_le_uint16": _NODATA_MASKING_REASON, - "stripped_be_uint16": _NODATA_MASKING_REASON, - "tiled_le_uint16": _NODATA_MASKING_REASON, - "tiled_be_uint16": _NODATA_MASKING_REASON, } _INTENTIONAL_SKIPS: dict[str, str] = { diff --git a/xrspatial/geotiff/tests/test_golden_corpus_gpu_1930.py b/xrspatial/geotiff/tests/test_golden_corpus_gpu_1930.py index 1c061b8aa..60b0ef48a 100644 --- a/xrspatial/geotiff/tests/test_golden_corpus_gpu_1930.py +++ b/xrspatial/geotiff/tests/test_golden_corpus_gpu_1930.py @@ -56,24 +56,15 @@ ) -_NODATA_MASKING_REASON = ( - "integer nodata masking: xrspatial masks sentinel pixels to NaN and " - "upcasts to float64 per #1988 (attrs['masked_nodata']=True). The oracle " - "compares raw integer pixels; needs an oracle extension that consults " - "attrs['masked_nodata']." -) - +# Integer-nodata masking used to live here too; the oracle's +# _normalise_for_masked_nodata helper now closes that gap so it is no +# longer xfailed on any backend. _PARITY_GAPS: dict[str, str] = { "crs_citation_only": ( "citation-only CRS: xrspatial decodes the citation into deprecated " "attrs['geog_citation'] but does not emit a canonical attrs['crs'] " "or attrs['crs_wkt']. Real parity gap; needs a fix in _crs.py." ), - "nodata_int_sentinel_uint16": _NODATA_MASKING_REASON, - "stripped_le_uint16": _NODATA_MASKING_REASON, - "stripped_be_uint16": _NODATA_MASKING_REASON, - "tiled_le_uint16": _NODATA_MASKING_REASON, - "tiled_be_uint16": _NODATA_MASKING_REASON, } # GPU-only gaps. Failures here are GPU-specific (the eager and dask