diff --git a/.claude/sweep-metadata-state.csv b/.claude/sweep-metadata-state.csv index 0381aaff..677a4ad0 100644 --- a/.claude/sweep-metadata-state.csv +++ b/.claude/sweep-metadata-state.csv @@ -1,3 +1,3 @@ module,last_inspected,issue,severity_max,categories_found,notes -geotiff,2026-05-11,1616,HIGH,4,_vrt.read_vrt leaked integer source nodata sentinel through int->float cast when VRT declared a float dataType; downstream NaN-aware code saw sentinel as valid data (#1616). +geotiff,2026-05-11,1632,HIGH,1;5,extract_geo_info dropped user-defined CRS WKT: GeoKey *CSTypeGeoKey=32767 files emitted attrs[crs_name] only (no crs/crs_wkt) so to_geotiff silently lost the projection on read->write. Fixed by promoting WKT-looking citation to crs_wkt across all four read backends (#1632). reproject,2026-05-10,1572;1573,HIGH,1;3;4,geoid_height_raster dropped input attrs and used dims[-2:] for 3D inputs (#1572). reproject/merge ignored nodatavals (rasterio convention) when rioxarray absent (#1573). Fixed in same branch. diff --git a/xrspatial/geotiff/_geotags.py b/xrspatial/geotiff/_geotags.py index 3895dbd1..46189cd7 100644 --- a/xrspatial/geotiff/_geotags.py +++ b/xrspatial/geotiff/_geotags.py @@ -243,6 +243,46 @@ def _epsg_to_wkt(epsg: int) -> str | None: return None +# Top-level WKT 1 / WKT 2 keywords. GDAL stores user-defined CRSes +# (GeoKey *CSTypeGeoKey == 32767) as WKT in the citation, so callers +# of ``extract_geo_info`` need a cheap test to decide whether the +# citation string round-trips as a CRS or is just a free-form name. +# Listed in rough order of how often they appear in real-world files. +_WKT_PREFIXES = ( + 'PROJCS[', + 'GEOGCS[', + 'PROJCRS[', + 'GEOGCRS[', + 'COMPD_CS[', + 'COMPOUNDCRS[', + 'BOUNDCRS[', + 'VERT_CS[', + 'VERTCRS[', + 'LOCAL_CS[', + 'ENGCRS[', + 'PARAMETRICCRS[', + 'TIMECRS[', + 'DERIVEDPROJCRS[', +) + + +def _looks_like_wkt(text) -> bool: + """Return True iff ``text`` opens with a known WKT root keyword. + + The check is intentionally surface-level: only the first + non-whitespace stretch is inspected, no parser is invoked. The + callers only need to distinguish "this citation is actually a WKT + string GDAL stuffed in here" from "this is a human-readable CRS + name" -- a deeper parse would be both slower and a different bug + surface. False positives on names that happen to start with a WKT + keyword followed by '[' are vanishingly rare in practice. + """ + if not isinstance(text, str): + return False + head = text.lstrip().upper() + return any(head.startswith(p) for p in _WKT_PREFIXES) + + def _parse_geokeys(ifd: IFD, data: bytes | memoryview, byte_order: str) -> dict[int, int | float | str]: """Parse the GeoKeyDirectory and resolve values from param tags. @@ -586,6 +626,15 @@ def extract_geo_info(ifd: IFD, data: bytes | memoryview, crs_wkt = None if epsg is not None: crs_wkt = _epsg_to_wkt(epsg) + elif _looks_like_wkt(crs_name): + # User-defined CRS: GeoKey GEOKEY_*_CS_TYPE == 32767 and the WKT + # lives in the citation. Expose it as crs_wkt so the writer can + # round-trip it. The citation itself stays in crs_name for callers + # that expect that key. Without this branch a read -> write cycle + # silently drops the projection on user-defined CRS files because + # to_geotiff only consults attrs['crs'] / attrs['crs_wkt']. See + # issue #1632. + crs_wkt = crs_name return GeoInfo( transform=transform, diff --git a/xrspatial/geotiff/tests/test_user_defined_crs_wkt_1632.py b/xrspatial/geotiff/tests/test_user_defined_crs_wkt_1632.py new file mode 100644 index 00000000..c93c4817 --- /dev/null +++ b/xrspatial/geotiff/tests/test_user_defined_crs_wkt_1632.py @@ -0,0 +1,241 @@ +"""Regression tests for issue #1632. + +Files with a user-defined CRS (no EPSG, WKT stored in the GeoTIFF +citation under ``GEOKEY_*_CS_TYPE == 32767``) used to round-trip with +``attrs['crs_name']`` set but ``attrs['crs_wkt']`` and ``attrs['crs']`` +unset. ``to_geotiff`` only consults the latter two, so a read -> write +cycle silently dropped the projection. + +The fix promotes the citation to ``attrs['crs_wkt']`` whenever no EPSG +is resolved and the citation parses as WKT (starts with one of the +known WKT 1 / WKT 2 root keywords). ``crs_name`` stays populated for +back-compat. Tests pin the contract across all four read backends and +across the read -> write -> read round trip. +""" +from __future__ import annotations + +import importlib.util + +import numpy as np +import pytest +import xarray as xr + +tifffile = pytest.importorskip("tifffile") + +from xrspatial.geotiff import open_geotiff, to_geotiff +from xrspatial.geotiff._geotags import _looks_like_wkt + + +# A user-defined Lambert Conformal Conic that pyproj cannot identify +# as a registered EPSG. Trimmed to keep test fixtures readable. +_USER_DEFINED_WKT = ( + 'PROJCS["User defined LCC",' + 'GEOGCS["NAD83",' + 'DATUM["North American Datum 1983",' + 'SPHEROID["GRS 1980",6378137,298.257222101]],' + 'PRIMEM["Greenwich",0],' + 'UNIT["degree",0.0174532925199433]],' + 'PROJECTION["Lambert Conformal Conic 2SP"],' + 'PARAMETER["central_meridian",-100],' + 'PARAMETER["latitude_of_origin",40],' + 'PARAMETER["standard_parallel_1",30],' + 'PARAMETER["standard_parallel_2",50],' + 'UNIT["metre",1]]' +) + + +def _gpu_available() -> bool: + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +_HAS_GPU = _gpu_available() +_gpu_only = pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required") + + +def _write_user_defined_crs_tif(path): + """Write a tiny GeoTIFF with WKT-only CRS and return the DataArray written.""" + arr = np.ones((4, 4), dtype=np.float32) + da = xr.DataArray( + arr, dims=['y', 'x'], + coords={'y': np.linspace(50.0, 47.0, 4), + 'x': np.linspace(10.0, 13.0, 4)}, + attrs={'crs_wkt': _USER_DEFINED_WKT}, + ) + to_geotiff(da, path, compression='none') + return da + + +# --------------------------------------------------------------------------- +# _looks_like_wkt unit tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("text", [ + 'PROJCS["foo"]', + 'GEOGCS["foo"]', + 'PROJCRS["foo"]', + 'GEOGCRS["foo"]', + 'COMPD_CS["foo"]', + 'COMPOUNDCRS["foo"]', + 'BOUNDCRS["foo"]', + 'VERT_CS["foo"]', + 'VERTCRS["foo"]', + 'LOCAL_CS["foo"]', + ' PROJCS["leading whitespace"]', + 'projcs["lowercase"]', +]) +def test_looks_like_wkt_positive(text): + """Top-level WKT 1 / WKT 2 keywords parse as WKT.""" + assert _looks_like_wkt(text) + + +def test_looks_like_wkt_requires_bracket(): + """A keyword without the opening bracket is not WKT.""" + # "PROJCS" alone is a token, not a complete WKT element. The check + # demands the bracket so plain-text references to WKT keywords in + # human-readable names do not collide with the WKT path. + assert not _looks_like_wkt("PROJCS") + assert not _looks_like_wkt("PROJCS no bracket here") + + +@pytest.mark.parametrize("text", [ + None, + '', + 'NAD83 / UTM Zone 12N', # human-readable name, not WKT + 'epsg:4326', # urn-like + 'WGS 84', + 'Some random string', + 'PROJ string +proj=longlat +datum=WGS84', + 42, # non-string input + b'PROJCS["bytes input"]', # bytes, not str +]) +def test_looks_like_wkt_negative(text): + """Non-WKT inputs return False (including non-string types).""" + assert not _looks_like_wkt(text) + + +# --------------------------------------------------------------------------- +# Read-side: backends emit crs_wkt for user-defined CRS files +# --------------------------------------------------------------------------- + + +def test_eager_emits_crs_wkt_for_user_defined_crs(tmp_path): + """The eager numpy read populates attrs['crs_wkt'] when the file's + citation carries WKT, even without an EPSG.""" + p = str(tmp_path / "user_defined_crs.tif") + _write_user_defined_crs_tif(p) + + rd = open_geotiff(p) + assert rd.attrs.get("crs") is None # no EPSG + assert rd.attrs.get("crs_wkt") is not None + assert rd.attrs["crs_wkt"].startswith("PROJCS[") + # crs_name is kept for back-compat + assert rd.attrs.get("crs_name") == rd.attrs["crs_wkt"] + + +def test_dask_emits_crs_wkt_for_user_defined_crs(tmp_path): + """The dask read path emits the same crs_wkt as numpy.""" + p = str(tmp_path / "user_defined_crs_dask.tif") + _write_user_defined_crs_tif(p) + + rd = open_geotiff(p, chunks=4) + assert rd.attrs.get("crs_wkt") is not None + assert rd.attrs["crs_wkt"].startswith("PROJCS[") + + +@_gpu_only +def test_cupy_emits_crs_wkt_for_user_defined_crs(tmp_path): + """The cupy / GPU read path emits the same crs_wkt as numpy.""" + p = str(tmp_path / "user_defined_crs_gpu.tif") + _write_user_defined_crs_tif(p) + + rd = open_geotiff(p, gpu=True) + assert rd.attrs.get("crs_wkt") is not None + assert rd.attrs["crs_wkt"].startswith("PROJCS[") + + +@_gpu_only +def test_dask_cupy_emits_crs_wkt_for_user_defined_crs(tmp_path): + """The dask+cupy read path emits the same crs_wkt as numpy.""" + p = str(tmp_path / "user_defined_crs_dask_gpu.tif") + _write_user_defined_crs_tif(p) + + rd = open_geotiff(p, gpu=True, chunks=4) + assert rd.attrs.get("crs_wkt") is not None + assert rd.attrs["crs_wkt"].startswith("PROJCS[") + + +# --------------------------------------------------------------------------- +# Read -> write -> read round trip: WKT survives the second write +# --------------------------------------------------------------------------- + + +def test_user_defined_crs_round_trips_through_to_geotiff(tmp_path): + """A read -> write of a user-defined CRS file keeps the projection. + + Pre-fix, ``to_geotiff(open_geotiff(src), dst)`` produced ``dst`` with + no GeoKey CRS entries and no GeoAsciiParams tag because the read path + only set ``attrs['crs_name']`` and the writer never consults that. + """ + src = str(tmp_path / "round_trip_src.tif") + _write_user_defined_crs_tif(src) + + rd = open_geotiff(src) + dst = str(tmp_path / "round_trip_dst.tif") + to_geotiff(rd, dst, compression='none') + + # The second file should carry the same WKT in its citation. + rd2 = open_geotiff(dst) + assert rd2.attrs.get("crs_wkt") == rd.attrs.get("crs_wkt") + + # And the raw GeoKey + ASCII tags must be present. + with tifffile.TiffFile(dst) as tif: + keys = tif.pages[0].tags.get(34735) # GeoKeyDirectory + ascii_tag = tif.pages[0].tags.get(34737) # GeoAsciiParams + assert keys is not None + # GeoKeyDirectory header is 4 entries; a real CRS adds 3+ key + # entries (model type, raster type, GTCitation -> ascii ref). + assert len(keys.value) > 4 + assert ascii_tag is not None + assert "PROJCS[" in ascii_tag.value + + +def test_epsg_crs_unchanged_by_fix(tmp_path): + """The fix must not regress the EPSG path: files with attrs['crs'] = + should still emit both crs and crs_wkt on read.""" + arr = np.ones((4, 4), dtype=np.float32) + da = xr.DataArray( + arr, dims=['y', 'x'], + coords={'y': np.linspace(50.0, 47.0, 4), + 'x': np.linspace(10.0, 13.0, 4)}, + attrs={'crs': 4326}, + ) + p = str(tmp_path / "epsg.tif") + to_geotiff(da, p, compression='none') + + rd = open_geotiff(p) + assert rd.attrs.get("crs") == 4326 + assert rd.attrs.get("crs_wkt") is not None + # The WKT here is pyproj's canonical 4326 WKT; the citation is the + # short EPSG-style name "WGS 84", not WKT, so crs_name should not + # be promoted to crs_wkt. + assert rd.attrs.get("crs_name") != rd.attrs.get("crs_wkt") + + +def test_human_readable_crs_name_not_promoted_to_crs_wkt(tmp_path): + """A citation that is a human-readable name (not WKT) must stay in + crs_name only. The _looks_like_wkt gate prevents accidental promotion.""" + # tifffile-built file with citation 'NAD83 / UTM Zone 12N' as the + # citation, no EPSG. We can't easily build the GeoKey table from + # scratch here without recapitulating extract_geo_info; instead we + # exercise the path via the helper directly. + assert not _looks_like_wkt("NAD83 / UTM Zone 12N") + assert not _looks_like_wkt("WGS 84") + assert not _looks_like_wkt("") + assert not _looks_like_wkt(None)