Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .claude/sweep-metadata-state.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module,last_inspected,issue,severity_max,categories_found,notes
geotiff,2026-05-11,1616,HIGH,4,_vrt.read_vrt leaked integer source nodata sentinel through int->float cast when VRT declared a float dataType; downstream NaN-aware code saw sentinel as valid data (#1616).
geotiff,2026-05-11,1632,HIGH,1;5,extract_geo_info dropped user-defined CRS WKT: GeoKey *CSTypeGeoKey=32767 files emitted attrs[crs_name] only (no crs/crs_wkt) so to_geotiff silently lost the projection on read->write. Fixed by promoting WKT-looking citation to crs_wkt across all four read backends (#1632).
reproject,2026-05-10,1572;1573,HIGH,1;3;4,geoid_height_raster dropped input attrs and used dims[-2:] for 3D inputs (#1572). reproject/merge ignored nodatavals (rasterio convention) when rioxarray absent (#1573). Fixed in same branch.
49 changes: 49 additions & 0 deletions xrspatial/geotiff/_geotags.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,46 @@ def _epsg_to_wkt(epsg: int) -> str | None:
return None


# Top-level WKT 1 / WKT 2 keywords. GDAL stores user-defined CRSes
# (GeoKey *CSTypeGeoKey == 32767) as WKT in the citation, so callers
# of ``extract_geo_info`` need a cheap test to decide whether the
# citation string round-trips as a CRS or is just a free-form name.
# Listed in rough order of how often they appear in real-world files.
_WKT_PREFIXES = (
'PROJCS[',
'GEOGCS[',
'PROJCRS[',
'GEOGCRS[',
'COMPD_CS[',
'COMPOUNDCRS[',
'BOUNDCRS[',
'VERT_CS[',
'VERTCRS[',
'LOCAL_CS[',
'ENGCRS[',
'PARAMETRICCRS[',
'TIMECRS[',
'DERIVEDPROJCRS[',
)


def _looks_like_wkt(text) -> bool:
"""Return True iff ``text`` opens with a known WKT root keyword.

The check is intentionally surface-level: only the first
non-whitespace stretch is inspected, no parser is invoked. The
callers only need to distinguish "this citation is actually a WKT
string GDAL stuffed in here" from "this is a human-readable CRS
name" -- a deeper parse would be both slower and a different bug
surface. False positives on names that happen to start with a WKT
keyword followed by '[' are vanishingly rare in practice.
"""
if not isinstance(text, str):
return False
head = text.lstrip().upper()
return any(head.startswith(p) for p in _WKT_PREFIXES)


def _parse_geokeys(ifd: IFD, data: bytes | memoryview,
byte_order: str) -> dict[int, int | float | str]:
"""Parse the GeoKeyDirectory and resolve values from param tags.
Expand Down Expand Up @@ -586,6 +626,15 @@ def extract_geo_info(ifd: IFD, data: bytes | memoryview,
crs_wkt = None
if epsg is not None:
crs_wkt = _epsg_to_wkt(epsg)
elif _looks_like_wkt(crs_name):
# User-defined CRS: GeoKey GEOKEY_*_CS_TYPE == 32767 and the WKT
# lives in the citation. Expose it as crs_wkt so the writer can
# round-trip it. The citation itself stays in crs_name for callers
# that expect that key. Without this branch a read -> write cycle
# silently drops the projection on user-defined CRS files because
# to_geotiff only consults attrs['crs'] / attrs['crs_wkt']. See
# issue #1632.
crs_wkt = crs_name

return GeoInfo(
transform=transform,
Expand Down
241 changes: 241 additions & 0 deletions xrspatial/geotiff/tests/test_user_defined_crs_wkt_1632.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
"""Regression tests for issue #1632.

Files with a user-defined CRS (no EPSG, WKT stored in the GeoTIFF
citation under ``GEOKEY_*_CS_TYPE == 32767``) used to round-trip with
``attrs['crs_name']`` set but ``attrs['crs_wkt']`` and ``attrs['crs']``
unset. ``to_geotiff`` only consults the latter two, so a read -> write
cycle silently dropped the projection.

The fix promotes the citation to ``attrs['crs_wkt']`` whenever no EPSG
is resolved and the citation parses as WKT (starts with one of the
known WKT 1 / WKT 2 root keywords). ``crs_name`` stays populated for
back-compat. Tests pin the contract across all four read backends and
across the read -> write -> read round trip.
"""
from __future__ import annotations

import importlib.util

import numpy as np
import pytest
import xarray as xr

tifffile = pytest.importorskip("tifffile")

from xrspatial.geotiff import open_geotiff, to_geotiff
from xrspatial.geotiff._geotags import _looks_like_wkt


# A user-defined Lambert Conformal Conic that pyproj cannot identify
# as a registered EPSG. Trimmed to keep test fixtures readable.
_USER_DEFINED_WKT = (
'PROJCS["User defined LCC",'
'GEOGCS["NAD83",'
'DATUM["North American Datum 1983",'
'SPHEROID["GRS 1980",6378137,298.257222101]],'
'PRIMEM["Greenwich",0],'
'UNIT["degree",0.0174532925199433]],'
'PROJECTION["Lambert Conformal Conic 2SP"],'
'PARAMETER["central_meridian",-100],'
'PARAMETER["latitude_of_origin",40],'
'PARAMETER["standard_parallel_1",30],'
'PARAMETER["standard_parallel_2",50],'
'UNIT["metre",1]]'
)


def _gpu_available() -> bool:
if importlib.util.find_spec("cupy") is None:
return False
try:
import cupy
return bool(cupy.cuda.is_available())
except Exception:
return False


_HAS_GPU = _gpu_available()
_gpu_only = pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required")


def _write_user_defined_crs_tif(path):
"""Write a tiny GeoTIFF with WKT-only CRS and return the DataArray written."""
arr = np.ones((4, 4), dtype=np.float32)
da = xr.DataArray(
arr, dims=['y', 'x'],
coords={'y': np.linspace(50.0, 47.0, 4),
'x': np.linspace(10.0, 13.0, 4)},
attrs={'crs_wkt': _USER_DEFINED_WKT},
)
to_geotiff(da, path, compression='none')
return da


# ---------------------------------------------------------------------------
# _looks_like_wkt unit tests
# ---------------------------------------------------------------------------


@pytest.mark.parametrize("text", [
'PROJCS["foo"]',
'GEOGCS["foo"]',
'PROJCRS["foo"]',
'GEOGCRS["foo"]',
'COMPD_CS["foo"]',
'COMPOUNDCRS["foo"]',
'BOUNDCRS["foo"]',
'VERT_CS["foo"]',
'VERTCRS["foo"]',
'LOCAL_CS["foo"]',
' PROJCS["leading whitespace"]',
'projcs["lowercase"]',
])
def test_looks_like_wkt_positive(text):
"""Top-level WKT 1 / WKT 2 keywords parse as WKT."""
assert _looks_like_wkt(text)


def test_looks_like_wkt_requires_bracket():
"""A keyword without the opening bracket is not WKT."""
# "PROJCS" alone is a token, not a complete WKT element. The check
# demands the bracket so plain-text references to WKT keywords in
# human-readable names do not collide with the WKT path.
assert not _looks_like_wkt("PROJCS")
assert not _looks_like_wkt("PROJCS no bracket here")


@pytest.mark.parametrize("text", [
None,
'',
'NAD83 / UTM Zone 12N', # human-readable name, not WKT
'epsg:4326', # urn-like
'WGS 84',
'Some random string',
'PROJ string +proj=longlat +datum=WGS84',
42, # non-string input
b'PROJCS["bytes input"]', # bytes, not str
])
def test_looks_like_wkt_negative(text):
"""Non-WKT inputs return False (including non-string types)."""
assert not _looks_like_wkt(text)


# ---------------------------------------------------------------------------
# Read-side: backends emit crs_wkt for user-defined CRS files
# ---------------------------------------------------------------------------


def test_eager_emits_crs_wkt_for_user_defined_crs(tmp_path):
"""The eager numpy read populates attrs['crs_wkt'] when the file's
citation carries WKT, even without an EPSG."""
p = str(tmp_path / "user_defined_crs.tif")
_write_user_defined_crs_tif(p)

rd = open_geotiff(p)
assert rd.attrs.get("crs") is None # no EPSG
assert rd.attrs.get("crs_wkt") is not None
assert rd.attrs["crs_wkt"].startswith("PROJCS[")
# crs_name is kept for back-compat
assert rd.attrs.get("crs_name") == rd.attrs["crs_wkt"]


def test_dask_emits_crs_wkt_for_user_defined_crs(tmp_path):
"""The dask read path emits the same crs_wkt as numpy."""
p = str(tmp_path / "user_defined_crs_dask.tif")
_write_user_defined_crs_tif(p)

rd = open_geotiff(p, chunks=4)
assert rd.attrs.get("crs_wkt") is not None
assert rd.attrs["crs_wkt"].startswith("PROJCS[")


@_gpu_only
def test_cupy_emits_crs_wkt_for_user_defined_crs(tmp_path):
"""The cupy / GPU read path emits the same crs_wkt as numpy."""
p = str(tmp_path / "user_defined_crs_gpu.tif")
_write_user_defined_crs_tif(p)

rd = open_geotiff(p, gpu=True)
assert rd.attrs.get("crs_wkt") is not None
assert rd.attrs["crs_wkt"].startswith("PROJCS[")


@_gpu_only
def test_dask_cupy_emits_crs_wkt_for_user_defined_crs(tmp_path):
"""The dask+cupy read path emits the same crs_wkt as numpy."""
p = str(tmp_path / "user_defined_crs_dask_gpu.tif")
_write_user_defined_crs_tif(p)

rd = open_geotiff(p, gpu=True, chunks=4)
assert rd.attrs.get("crs_wkt") is not None
assert rd.attrs["crs_wkt"].startswith("PROJCS[")


# ---------------------------------------------------------------------------
# Read -> write -> read round trip: WKT survives the second write
# ---------------------------------------------------------------------------


def test_user_defined_crs_round_trips_through_to_geotiff(tmp_path):
"""A read -> write of a user-defined CRS file keeps the projection.

Pre-fix, ``to_geotiff(open_geotiff(src), dst)`` produced ``dst`` with
no GeoKey CRS entries and no GeoAsciiParams tag because the read path
only set ``attrs['crs_name']`` and the writer never consults that.
"""
src = str(tmp_path / "round_trip_src.tif")
_write_user_defined_crs_tif(src)

rd = open_geotiff(src)
dst = str(tmp_path / "round_trip_dst.tif")
to_geotiff(rd, dst, compression='none')

# The second file should carry the same WKT in its citation.
rd2 = open_geotiff(dst)
assert rd2.attrs.get("crs_wkt") == rd.attrs.get("crs_wkt")

# And the raw GeoKey + ASCII tags must be present.
with tifffile.TiffFile(dst) as tif:
keys = tif.pages[0].tags.get(34735) # GeoKeyDirectory
ascii_tag = tif.pages[0].tags.get(34737) # GeoAsciiParams
assert keys is not None
# GeoKeyDirectory header is 4 entries; a real CRS adds 3+ key
# entries (model type, raster type, GTCitation -> ascii ref).
assert len(keys.value) > 4
assert ascii_tag is not None
assert "PROJCS[" in ascii_tag.value


def test_epsg_crs_unchanged_by_fix(tmp_path):
"""The fix must not regress the EPSG path: files with attrs['crs'] = <int>
should still emit both crs and crs_wkt on read."""
arr = np.ones((4, 4), dtype=np.float32)
da = xr.DataArray(
arr, dims=['y', 'x'],
coords={'y': np.linspace(50.0, 47.0, 4),
'x': np.linspace(10.0, 13.0, 4)},
attrs={'crs': 4326},
)
p = str(tmp_path / "epsg.tif")
to_geotiff(da, p, compression='none')

rd = open_geotiff(p)
assert rd.attrs.get("crs") == 4326
assert rd.attrs.get("crs_wkt") is not None
# The WKT here is pyproj's canonical 4326 WKT; the citation is the
# short EPSG-style name "WGS 84", not WKT, so crs_name should not
# be promoted to crs_wkt.
assert rd.attrs.get("crs_name") != rd.attrs.get("crs_wkt")


def test_human_readable_crs_name_not_promoted_to_crs_wkt(tmp_path):
"""A citation that is a human-readable name (not WKT) must stay in
crs_name only. The _looks_like_wkt gate prevents accidental promotion."""
# tifffile-built file with citation 'NAD83 / UTM Zone 12N' as the
# citation, no EPSG. We can't easily build the GeoKey table from
# scratch here without recapitulating extract_geo_info; instead we
# exercise the path via the helper directly.
assert not _looks_like_wkt("NAD83 / UTM Zone 12N")
assert not _looks_like_wkt("WGS 84")
assert not _looks_like_wkt("")
assert not _looks_like_wkt(None)
Loading