diff --git a/docs/source/reference/release_gate_geotiff.rst b/docs/source/reference/release_gate_geotiff.rst index 8c4433b32..3d81ddfd2 100644 --- a/docs/source/reference/release_gate_geotiff.rst +++ b/docs/source/reference/release_gate_geotiff.rst @@ -683,7 +683,7 @@ Internal-only surfaces (not promised) ``allow_internal_only_jpeg=True``; not covered by ``allow_experimental_codecs``. - ``xrspatial/geotiff/tests/unit/test_photometric.py``, - ``xrspatial/geotiff/tests/test_gpu_jpeg_interop_reject_issue_D_1845.py`` + ``xrspatial/geotiff/tests/gpu/test_codec.py`` - `#2340`_ Cross-cutting CI gates diff --git a/xrspatial/geotiff/tests/CLUSTER_AUDIT_GPU_C.md b/xrspatial/geotiff/tests/CLUSTER_AUDIT_GPU_C.md new file mode 100644 index 000000000..58d7a446f --- /dev/null +++ b/xrspatial/geotiff/tests/CLUSTER_AUDIT_GPU_C.md @@ -0,0 +1,153 @@ +# Cluster 14 (Sub-PR C) audit: GPU codec test consolidation + +Folds 11 GPU codec test files into `xrspatial/geotiff/tests/gpu/test_codec.py`. +Baseline collection: 71 tests across the 11 source files. Consolidated +collection: 71 tests in the new file. Run-time outcome on this checkout +(no nvCOMP / nvJPEG / lerc beyond what the host ships): 68 passed, 3 +skipped. 
+ +Source -> destination mapping (every old test landed under the same +issue-suffixed name unless noted): + +## test_nvcomp_batch_compress_batched_1712.py + +| old `file::test` | new `test_codec.py::test_id` | +| --------------------------------------------------- | ------------------------------------------------------------- | +| `test_no_per_tile_cupy_empty_in_compressed_pool` | `test_no_per_tile_cupy_empty_in_compressed_pool_1712` | +| `test_no_per_tile_get_in_result_loop` | `test_no_per_tile_get_in_result_loop_1712` | +| `test_gpu_write_roundtrip_after_batched_compress[deflate]` | `test_gpu_write_roundtrip_after_batched_compress_1712[deflate]` | +| `test_gpu_write_roundtrip_after_batched_compress[zstd]` | `test_gpu_write_roundtrip_after_batched_compress_1712[zstd]` | +| `test_gpu_write_zero_tile_edge_case` | `test_gpu_write_zero_tile_edge_case_1712` | + +## test_nvcomp_batch_upload_p3.py + +| old `file::test` | new `test_codec.py::test_id` | +| ------------------------------------------------------ | ------------------------------------------------------- | +| `test_nvcomp_batch_upload_correctness[256-tile0]` | `test_nvcomp_batch_upload_correctness_p3[256-tile0]` | +| `test_nvcomp_batch_upload_correctness[1024-tile1]` | `test_nvcomp_batch_upload_correctness_p3[1024-tile1]` | +| `test_nvcomp_batch_upload_correctness[2048-tile2]` | `test_nvcomp_batch_upload_correctness_p3[2048-tile2]` | +| `test_nvcomp_kvikio_fallback_skips_zstd` | `test_nvcomp_kvikio_fallback_skips_zstd_p3` | +| `test_nvcomp_batch_upload_perf_regression_guard` | `test_nvcomp_batch_upload_perf_regression_guard_p3` | + +## test_nvcomp_decompress_cumsum_offsets_1950.py + +| old `file::test` | new `test_codec.py::test_id` | +| --------------------------------------------------------- | ------------------------------------------------------- | +| `test_nvcomp_decompress_uses_cumsum_for_offsets_1950` | `test_nvcomp_decompress_uses_cumsum_for_offsets_1950` | +| 
`test_cumsum_matches_loop_prefix_sum_1950` | `test_cumsum_matches_loop_prefix_sum_1950` | +| `test_nvcomp_batch_decompress_roundtrip_1950` | `test_nvcomp_batch_decompress_roundtrip_1950` | + +## test_nvcomp_from_device_bufs_single_alloc_1659.py + +| old `file::test` | new `test_codec.py::test_id` | +| --------------------------------------------------------------- | ------------------------------------------------------------------ | +| `test_unsupported_codec_short_circuits_before_allocation` | `test_unsupported_codec_short_circuits_before_allocation_1659` | +| `test_no_nvcomp_lib_returns_none` | `test_no_nvcomp_lib_returns_none_1659` | +| `test_memory_guard_runs_with_full_decomp_size` | `test_memory_guard_runs_with_full_decomp_size_1659` | +| `test_zstd_decompress_roundtrip_returns_single_contiguous_buffer` | `test_zstd_decompress_roundtrip_returns_single_contiguous_buffer_1659` | +| `test_no_orphan_decomp_buffers_after_call` | `test_no_orphan_decomp_buffers_after_call_1659` | + +## test_nvjpeg_encode_stream_sync_2212.py + +| old `file::test` | new `test_codec.py::test_id` | +| ------------------------------------------------------------------------- | ----------------------------------------------------------------------------- | +| `TestNvjpegEncodeStreamSync::test_no_device_synchronize_inside_encode_loop` | `TestNvjpegEncodeStreamSync_2212::test_no_device_synchronize_inside_encode_loop` | +| `TestNvjpegEncodeStreamSync::test_stream_null_synchronize_present` | `TestNvjpegEncodeStreamSync_2212::test_stream_null_synchronize_present` | +| `TestNvjpeg2kEncodeStreamSync::test_no_device_synchronize_inside_encode_loop` | `TestNvjpeg2kEncodeStreamSync_2212::test_no_device_synchronize_inside_encode_loop` | +| `TestNvjpeg2kEncodeStreamSync::test_stream_null_synchronize_present` | `TestNvjpeg2kEncodeStreamSync_2212::test_stream_null_synchronize_present` | +| `TestDecodeReferencePattern::test_decoder_uses_stream_null_sync_in_loop` | 
`TestDecodeReferencePattern_2212::test_decoder_uses_stream_null_sync_in_loop` | + +## test_nvjpeg2k_single_alloc_2107.py + +| old `file::test` | new `test_codec.py::test_id` | +| --------------------------------------------------------------------------- | ---------------------------------------------------------------------------- | +| `TestNvjpeg2kSingleAllocStructural::test_no_cupy_empty_inside_decode_loop` | `TestNvjpeg2kSingleAllocStructural_2107::test_no_cupy_empty_inside_decode_loop` | +| `TestNvjpeg2kSingleAllocStructural::test_no_device_synchronize_inside_decode_loop` | `TestNvjpeg2kSingleAllocStructural_2107::test_no_device_synchronize_inside_decode_loop` | +| `TestNvjpeg2kSingleAllocStructural::test_pool_allocation_present` | `TestNvjpeg2kSingleAllocStructural_2107::test_pool_allocation_present` | +| `TestNvjpeg2kSingleAllocStructural::test_check_gpu_memory_guard_present` | `TestNvjpeg2kSingleAllocStructural_2107::test_check_gpu_memory_guard_present` | +| `TestNvjpeg2kLibAbsentShortCircuit::test_returns_none_when_lib_missing` | `TestNvjpeg2kLibAbsentShortCircuit_2107::test_returns_none_when_lib_missing` | +| `TestNvjpeg2kLibAbsentShortCircuit::test_returns_none_for_unsupported_dtype` | `TestNvjpeg2kLibAbsentShortCircuit_2107::test_returns_none_for_unsupported_dtype` | +| `TestNvjpeg2kPoolWithCupy::test_pool_slabs_are_non_overlapping` | `TestNvjpeg2kPoolWithCupy_2107::test_pool_slabs_are_non_overlapping` | + +Note: the source had `@pytest.mark.gpu` on `TestNvjpeg2kPoolWithCupy`, +which raised an `UnknownMark` warning because the project does not +register a `gpu` mark. The new section uses `@requires_gpu` from +`_helpers/markers.py` -- same skip behaviour, no warning. 
+ +## test_jpeg_gpu_1549.py + +| old `file::test` | new `test_codec.py::test_id` | +| ----------------------------------------------- | ----------------------------------------------------- | +| `test_rgb_jpeg_gpu_no_crash` | `test_rgb_jpeg_gpu_no_crash_1549` | +| `test_rgb_jpeg_gpu_matches_cpu` | `test_rgb_jpeg_gpu_matches_cpu_1549` | +| `test_grayscale_jpeg_gpu_matches_cpu` | `test_grayscale_jpeg_gpu_matches_cpu_1549` | +| `test_cuda_context_survives_after_jpeg_gpu_read` | `test_cuda_context_survives_after_jpeg_gpu_read_1549` | + +## test_lerc_valid_mask_gpu.py + +| old `file::test` | new `test_codec.py::test_id` | +| ------------------------------------------------------ | ------------------------------------------------------- | +| `TestGpuLercValidMask::test_float32_nan_nodata` | `TestGpuLercValidMask::test_float32_nan_nodata` | +| `TestGpuLercValidMask::test_float32_sentinel_nodata` | `TestGpuLercValidMask::test_float32_sentinel_nodata` | +| `TestGpuLercValidMask::test_uint16_sentinel_nodata` | `TestGpuLercValidMask::test_uint16_sentinel_nodata` | +| `TestGpuLercValidMask::test_no_mask_roundtrip_bitexact` | `TestGpuLercValidMask::test_no_mask_roundtrip_bitexact` | + +## test_predictor2_big_endian_gpu_1517.py + +| old `file::test` | new `test_codec.py::test_id` | +| ----------------------------------------------------------- | ------------------------------------------------------------ | +| `test_gpu_predictor2_big_endian_int32_tiled_reproducer` | `test_gpu_predictor2_big_endian_int32_tiled_reproducer_1517` | +| `test_gpu_predictor2_big_endian_dtypes_tiled[uint16]` | `test_gpu_predictor2_big_endian_dtypes_tiled_1517[uint16]` | +| `test_gpu_predictor2_big_endian_dtypes_tiled[int16]` | `test_gpu_predictor2_big_endian_dtypes_tiled_1517[int16]` | +| `test_gpu_predictor2_big_endian_dtypes_tiled[uint32]` | `test_gpu_predictor2_big_endian_dtypes_tiled_1517[uint32]` | +| `test_gpu_predictor2_big_endian_dtypes_tiled[int32]` | 
`test_gpu_predictor2_big_endian_dtypes_tiled_1517[int32]` | +| `test_gpu_predictor2_big_endian_stripped_uint16` | `test_gpu_predictor2_big_endian_stripped_uint16_1517` | +| `test_gpu_predictor2_little_endian_still_works` | `test_gpu_predictor2_little_endian_still_works_1517` | +| `test_gpu_predictor3_big_endian_still_works` | `test_gpu_predictor3_big_endian_still_works_1517` | +| `test_swap_byte_lanes_numpy_bps2` | `test_swap_byte_lanes_numpy_bps2_1517` | +| `test_swap_byte_lanes_numpy_bps4` | `test_swap_byte_lanes_numpy_bps4_1517` | +| `test_swap_byte_lanes_numpy_bps8` | `test_swap_byte_lanes_numpy_bps8_1517` | +| `test_swap_byte_lanes_uint8_noop` | `test_swap_byte_lanes_uint8_noop_1517` | +| `test_swap_byte_lanes_rejects_unsupported_bps` | `test_swap_byte_lanes_rejects_unsupported_bps_1517` | +| `test_swap_byte_lanes_rejects_misaligned_size` | `test_swap_byte_lanes_rejects_misaligned_size_1517` | +| `test_swap_byte_lanes_numpy_is_zero_temp` | `test_swap_byte_lanes_numpy_is_zero_temp_1517` | +| `test_swap_byte_lanes_cupy_kernel[2-uint16]` | `test_swap_byte_lanes_cupy_kernel_1517[2-uint16]` | +| `test_swap_byte_lanes_cupy_kernel[4-uint32]` | `test_swap_byte_lanes_cupy_kernel_1517[4-uint32]` | +| `test_swap_byte_lanes_cupy_kernel[8-uint64]` | `test_swap_byte_lanes_cupy_kernel_1517[8-uint64]` | +| `test_swap_byte_lanes_cupy_uint8_noop` | `test_swap_byte_lanes_cupy_uint8_noop_1517` | + +## test_predictor3_int_dtype_gpu_1933.py + +| old `file::test` | new `test_codec.py::test_id` | +| ----------------------------------------------------------------------------- | ------------------------------------------------------------------------------ | +| `TestGPUEagerRejectsMalformedFile::test_gpu_eager_stripped_raises` | `TestGPUEagerRejectsMalformedFile_1933::test_gpu_eager_stripped_raises` | +| `TestGPUEagerRejectsMalformedFile::test_gpu_eager_tiled_raises` | `TestGPUEagerRejectsMalformedFile_1933::test_gpu_eager_tiled_raises` | +| 
`TestGPUEagerRejectsMalformedFile::test_gpu_dispatcher_eager_raises` | `TestGPUEagerRejectsMalformedFile_1933::test_gpu_dispatcher_eager_raises` | +| `TestGPUChunkedRejectsMalformedFile::test_read_geotiff_gpu_chunked_stripped_raises` | `TestGPUChunkedRejectsMalformedFile_1933::test_read_geotiff_gpu_chunked_stripped_raises` | +| `TestGPUChunkedRejectsMalformedFile::test_read_geotiff_gpu_chunked_tiled_raises` | `TestGPUChunkedRejectsMalformedFile_1933::test_read_geotiff_gpu_chunked_tiled_raises` | +| `TestGPUChunkedRejectsMalformedFile::test_open_geotiff_chunks_gpu_dispatcher_raises` | `TestGPUChunkedRejectsMalformedFile_1933::test_open_geotiff_chunks_gpu_dispatcher_raises` | +| `TestValidPredictor3StillWorksOnGPU::test_predictor3_float32_gpu_round_trip` | `TestValidPredictor3StillWorksOnGPU_1933::test_predictor3_float32_gpu_round_trip` | +| `TestValidPredictor3StillWorksOnGPU::test_predictor3_float32_dask_gpu_round_trip` | `TestValidPredictor3StillWorksOnGPU_1933::test_predictor3_float32_dask_gpu_round_trip` | +| `TestErrorMessageStable::test_gpu_error_message_matches_eager` | `TestErrorMessageStable_1933::test_gpu_error_message_matches_eager` | + +## test_gpu_jpeg_interop_reject_issue_D_1845.py + +| old `file::test` | new `test_codec.py::test_id` | +| ----------------------------------------------------------------- | ------------------------------------------------------------------ | +| `test_write_geotiff_gpu_rejects_jpeg_without_opt_in` | `test_write_geotiff_gpu_rejects_jpeg_without_opt_in_1845` | +| `test_write_geotiff_gpu_rejects_jpeg_message_mentions_alternatives` | `test_write_geotiff_gpu_rejects_jpeg_message_mentions_alternatives_1845` | +| `test_write_geotiff_gpu_rejects_jpeg_case_insensitive` | `test_write_geotiff_gpu_rejects_jpeg_case_insensitive_1845` | +| `test_write_geotiff_gpu_jpeg_opt_in_emits_warning` | `test_write_geotiff_gpu_jpeg_opt_in_emits_warning_1845` | +| `test_write_geotiff_gpu_non_jpeg_unaffected_by_flag` | 
`test_write_geotiff_gpu_non_jpeg_unaffected_by_flag_1845` | + +## Cross-references updated + +* `docs/source/reference/release_gate_geotiff.rst` -- codec ``jpeg`` + row now cites `gpu/test_codec.py` instead of the deleted + `test_gpu_jpeg_interop_reject_issue_D_1845.py`. +* `xrspatial/geotiff/tests/unit/test_predictor.py` -- the GPU + predictor file pointers in the module docstring now point at + `gpu/test_codec.py`. + +This audit file is deleted in a final pre-merge commit on this branch +(epic #2424 hard gate). diff --git a/xrspatial/geotiff/tests/gpu/__init__.py b/xrspatial/geotiff/tests/gpu/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/xrspatial/geotiff/tests/gpu/test_codec.py b/xrspatial/geotiff/tests/gpu/test_codec.py new file mode 100644 index 000000000..c33e80f63 --- /dev/null +++ b/xrspatial/geotiff/tests/gpu/test_codec.py @@ -0,0 +1,1904 @@ +"""GPU codec coverage: nvCOMP, nvJPEG / nvJPEG2000, JPEG, LERC, predictor. + +Cluster 14 of long-tail epic #2424 (Sub-PR C) folds the GPU codec test +files into one home. Sections in source-order below: + +* ``test_nvcomp_batch_compress_batched_1712.py`` -- batched nvCOMP + compress: single contiguous output alloc + single batched D2H concat. +* ``test_nvcomp_batch_upload_p3.py`` -- batched H2D upload on the + nvCOMP decompress side; cumulative-sum offset pattern. +* ``test_nvcomp_decompress_cumsum_offsets_1950.py`` -- decompress-side + prefix-sum offsets via ``np.cumsum`` rather than a Python loop. +* ``test_nvcomp_from_device_bufs_single_alloc_1659.py`` -- single + contiguous output buffer for the device-buf nvCOMP path. +* ``test_nvjpeg_encode_stream_sync_2212.py`` -- the per-tile encode- + loop sync uses ``Stream.null.synchronize()`` (not + ``Device().synchronize()``). +* ``test_nvjpeg2k_single_alloc_2107.py`` -- pool the per-tile alloc + + per-tile sync in ``_try_nvjpeg2k_batch_decode``. 
+* ``test_jpeg_gpu_1549.py`` -- nvJPEG output-format constants match + the SDK; cross-backend pixel parity + context survival. +* ``test_lerc_valid_mask_gpu.py`` -- the GPU LERC tile-decode path + honours the file's valid-mask, matching the CPU reader. +* ``test_predictor2_big_endian_gpu_1517.py`` -- byte-swap helper + + predictor=2 BE files match CPU baseline. +* ``test_predictor3_int_dtype_gpu_1933.py`` -- predictor=3 + integer + SampleFormat is rejected at every GPU entry point. +* ``test_gpu_jpeg_interop_reject_issue_D_1845.py`` -- the GPU writer + rejects ``compression='jpeg'`` by default and emits a + ``GeoTIFFFallbackWarning`` on the opt-in. + +Every test in this module is gated through the shared ``requires_gpu`` +marker from ``_helpers/markers.py``. Module-level helpers carry the +source issue number suffix (e.g. ``_write_jpeg_rgb_tiff_1549``) so +sibling sections stay collision-free. +""" +from __future__ import annotations + +import ast +import importlib.util +import inspect +import os +import pathlib +import re +import tempfile +import time +import uuid +import warnings as _warnings + +import numpy as np +import pytest +import xarray as xr + +from .._helpers.markers import gpu_available, requires_gpu + +# Aliased so the per-file ``_gpu_only`` decorators read the same as +# before the consolidation; the underlying check is the shared +# ``requires_gpu`` marker. +_gpu_only = requires_gpu +needs_cupy = requires_gpu + +# A handful of sections additionally gate on optional libraries (tifffile, +# imagecodecs, nvJPEG, etc.). Those gates layer on top of ``requires_gpu`` +# below; they need separate skipif decorators because the missing-library +# reason text is informative. 
# Availability flags computed once at import time; the per-section skipif
# markers below are built from them.
_HAS_GPU = gpu_available()
_HAS_TIFFFILE = importlib.util.find_spec("tifffile") is not None
_HAS_PIL = importlib.util.find_spec("PIL") is not None
_HAS_IMAGECODECS = importlib.util.find_spec("imagecodecs") is not None


# ============================================================
# Section: nvCOMP batched compress (#1712)
# ============================================================
# Source: test_nvcomp_batch_compress_batched_1712.py
#
# The pre-fix function allocated compressed-output device buffers one
# ``cupy.empty`` per tile and then read each tile back to host with one
# ``.get()`` per tile. Both patterns serialised on the default CUDA
# stream and were dominant in large-N writes. The fix folds both into
# a single contiguous device allocation + a single batched D2H concat-
# and-``.get()``. These tests pin the new shape and confirm the deflate
# / zstd GPU write paths still round-trip end-to-end.

# nvCOMP is the entry point that exercises this code path.
from xrspatial.geotiff import _gpu_decode  # noqa: E402


def test_no_per_tile_cupy_empty_in_compressed_pool_1712():
    """The per-tile cupy.empty list comprehension is gone (#1712).

    Source-level guard: fails when the text of ``_nvcomp_batch_compress``
    contains the per-tile allocation fragment again.
    """
    source = inspect.getsource(_gpu_decode._nvcomp_batch_compress)
    assert "cupy.empty(max_cs, dtype=cupy.uint8) for _ in range" not in source, (
        "_nvcomp_batch_compress regressed to per-tile cupy.empty "
        "allocations for the compressed output pool. See #1712."
    )


def test_no_per_tile_get_in_result_loop_1712():
    """The per-tile ``d_comp_bufs[i][:cs].get().tobytes()`` is gone (#1712).

    Source-level guard: fails when the text of ``_nvcomp_batch_compress``
    contains the per-tile readback fragment again.
    """
    source = inspect.getsource(_gpu_decode._nvcomp_batch_compress)
    bad_fragment = "d_comp_bufs[i][:cs].get().tobytes()"
    assert bad_fragment not in source, (
        "_nvcomp_batch_compress regressed to per-tile .get().tobytes() "
        "D2H readback. See #1712."
    )


@requires_gpu
@pytest.mark.parametrize("compression", ["deflate", "zstd"])
def test_gpu_write_roundtrip_after_batched_compress_1712(compression):
    """GPU compress path round-trips uncorrupted for deflate + zstd.

    Writes a 512x512 float32 raster in 64-pixel tiles through
    ``write_geotiff_gpu`` and reads it back with ``open_geotiff``,
    requiring an exact match (``rtol=0, atol=0``).
    """
    import cupy

    from xrspatial.geotiff import open_geotiff, write_geotiff_gpu

    rng = np.random.default_rng(seed=1712)
    arr_cpu = rng.random((512, 512), dtype=np.float32)
    arr_gpu = cupy.asarray(arr_cpu)
    darr = xr.DataArray(arr_gpu, dims=["y", "x"])

    with tempfile.TemporaryDirectory(prefix="nvcomp_batch_1712_") as td:
        path = os.path.join(td, f"roundtrip_{compression}.tif")
        try:
            write_geotiff_gpu(
                darr, path,
                compression=compression,
                tiled=True,
                tile_size=64,
            )
        except RuntimeError as e:
            # A RuntimeError from the writer is treated as "codec not
            # usable on this host" and skips rather than fails.
            pytest.skip(f"nvCOMP unavailable for {compression}: {e}")

        back = open_geotiff(path)
        np.testing.assert_allclose(back.values, arr_cpu, rtol=0, atol=0)


@requires_gpu
def test_gpu_write_zero_tile_edge_case_1712():
    """A 0-tile compress returns an empty list without indexing into None.

    NOTE(review): the raster written here is 32x32 with ``tile_size=32``,
    which looks like a single-tile write rather than a zero-tile one --
    confirm which edge case this test is meant to pin.
    """
    import cupy

    from xrspatial.geotiff import open_geotiff, write_geotiff_gpu

    arr_gpu = cupy.zeros((32, 32), dtype=cupy.float32)
    darr = xr.DataArray(arr_gpu, dims=["y", "x"])
    with tempfile.TemporaryDirectory(prefix="nvcomp_batch_1712_") as td:
        path = os.path.join(td, "tiny.tif")
        try:
            write_geotiff_gpu(darr, path, compression="zstd",
                              tiled=True, tile_size=32)
        except RuntimeError as e:
            pytest.skip(f"nvCOMP unavailable: {e}")
        back = open_geotiff(path)
        assert back.shape == (32, 32)


# ============================================================
# Section: nvCOMP batched H2D upload (P3 perf audit)
# ============================================================
# Source: test_nvcomp_batch_upload_p3.py
#
# The decompress-side fast path used to do one ``cupy.asarray`` per
# compressed tile. The fix concatenates all tiles into a single host
# buffer, performs one H2D transfer, and derives per-tile device
# pointers via ``base_ptr + offsets``.


def _kvikio_nvcomp_importable_p3() -> bool:
    """True iff ``import kvikio.nvcomp`` actually succeeds."""
    try:
        import kvikio.nvcomp  # noqa: F401
    except Exception:
        # Any failure while importing counts as "not importable".
        return False
    return True


def _nvcomp_path_available_p3() -> bool:
    """True when at least one nvCOMP backend is loadable on this host.

    Checks ``_get_nvcomp()`` first and falls back to probing
    ``kvikio.nvcomp``; returns False outright when no GPU is available
    or ``_get_nvcomp`` cannot be imported.
    """
    if not _HAS_GPU:
        return False
    try:
        from xrspatial.geotiff._gpu_decode import _get_nvcomp
    except Exception:
        return False
    if _get_nvcomp() is not None:
        return True
    return _kvikio_nvcomp_importable_p3()


_HAS_NVCOMP_P3 = _nvcomp_path_available_p3()
_nvcomp_only_p3 = pytest.mark.skipif(
    not (_HAS_GPU and _HAS_TIFFFILE and _HAS_NVCOMP_P3),
    reason="cupy + CUDA + tifffile + (libnvcomp or kvikio.nvcomp) required",
)


def _write_deflate_tiled_p3(path, arr, tile=(256, 256)):
    """Write ``arr`` to ``path`` as a deflate-compressed tiled TIFF via tifffile."""
    import tifffile
    tifffile.imwrite(
        str(path), arr, compression="deflate", tile=tile,
    )


def _wrap_nvcomp_with_call_recorder_p3(monkeypatch):
    """Replace ``_try_nvcomp_batch_decompress`` with a recording wrapper.

    Returns the list the wrapper appends to. Each entry is
    ``(compression, succeeded)`` where ``succeeded`` is True when the
    wrapped call returned something other than ``None``. The original
    function is still called and its result passed through unchanged.
    """
    from xrspatial.geotiff import _gpu_decode

    records: list[tuple[int, bool]] = []
    original = _gpu_decode._try_nvcomp_batch_decompress

    def _recording(compressed_tiles, tile_bytes, compression):
        result = original(compressed_tiles, tile_bytes, compression)
        records.append((compression, result is not None))
        return result

    monkeypatch.setattr(
        _gpu_decode,
        '_try_nvcomp_batch_decompress',
        _recording,
        raising=True,
    )
    return records


@_nvcomp_only_p3
@pytest.mark.parametrize("size,tile", [
    (256, (128, 128)),     # 4 tiles
    (1024, (256, 256)),    # 16 tiles
    (2048, (128, 128)),    # 256 tiles -- matches the audit measurement
])
def test_nvcomp_batch_upload_correctness_p3(tmp_path, monkeypatch, size, tile):
    """GPU decode of Deflate-tiled TIFFs is bit-exact vs CPU.

    Also requires that at least one recorded nvCOMP batch-decompress call
    succeeded, so the comparison cannot pass via a fallback path alone.
    """
    from xrspatial.geotiff import read_geotiff_gpu
    from xrspatial.geotiff._reader import read_to_array

    rng = np.random.RandomState(20260508)
    arr = rng.randint(0, 4096, size=(size, size), dtype=np.uint16)

    name = f"deflate_{size}_{tile[0]}_{uuid.uuid4().hex[:8]}.tif"
    path = tmp_path / name
    _write_deflate_tiled_p3(path, arr, tile=tile)

    # CPU reader is the baseline; it must itself reproduce the input.
    cpu, _ = read_to_array(str(path))
    np.testing.assert_array_equal(cpu, arr)

    records = _wrap_nvcomp_with_call_recorder_p3(monkeypatch)
    gpu_da = read_geotiff_gpu(str(path))
    np.testing.assert_array_equal(gpu_da.data.get(), cpu)

    assert any(success for _, success in records), (
        "_try_nvcomp_batch_decompress was never invoked or always returned "
        f"None; records={records}. The optimised path was not exercised, so "
        "this test would pass even if the rewrite were broken."
    )


@_nvcomp_only_p3
def test_nvcomp_kvikio_fallback_skips_zstd_p3(monkeypatch):
    """ZSTD-compressed input must NOT take the kvikio DeflateManager path.

    Forces ``_get_nvcomp`` to return ``None`` and then calls
    ``_try_nvcomp_batch_decompress`` with the ZSTD compression code,
    expecting ``None`` back.
    """
    import xrspatial.geotiff._gpu_decode as _gpu_decode

    if not _kvikio_nvcomp_importable_p3():
        pytest.skip("kvikio.nvcomp not importable; the kvikio branch "
                    "is never entered on this host")
    monkeypatch.setattr(_gpu_decode, '_get_nvcomp', lambda: None)

    result = _gpu_decode._try_nvcomp_batch_decompress(
        compressed_tiles=[b'\x28\xb5\x2f\xfd' + b'\x00' * 16],
        tile_bytes=1024,
        compression=50000,  # ZSTD
    )
    assert result is None, (
        "_try_nvcomp_batch_decompress returned non-None for ZSTD via the "
        "kvikio fallback; this would feed ZSTD bytes through DeflateManager "
        "and produce garbage."
    )


@_nvcomp_only_p3
def test_nvcomp_batch_upload_perf_regression_guard_p3(tmp_path, monkeypatch):
    """Sanity guard: 2048x2048 Deflate-tiled GPU decode finishes quickly.

    The decode is timed three times and the fastest run is compared with
    the 200 ms threshold, because a single wall-clock sample is easily
    inflated by unrelated load on the host. Only runs in which the
    recorder saw a successful nvCOMP batch-decompress call are counted, so
    the threshold is always measured against the fast path.
    """
    from xrspatial.geotiff import read_geotiff_gpu

    rng = np.random.RandomState(20260508)
    arr = rng.randint(0, 4096, size=(2048, 2048), dtype=np.uint16)
    path = tmp_path / f"deflate_2048_perf_{uuid.uuid4().hex[:8]}.tif"
    _write_deflate_tiled_p3(path, arr, tile=(128, 128))

    # Warm up.
    _ = read_geotiff_gpu(str(path))

    records = _wrap_nvcomp_with_call_recorder_p3(monkeypatch)
    fast_path_timings = []
    out = None
    for _run in range(3):
        seen = len(records)
        t0 = time.perf_counter()
        out = read_geotiff_gpu(str(path))
        run_elapsed = time.perf_counter() - t0
        # Only keep the sample when this particular run hit nvCOMP.
        if any(success for _, success in records[seen:]):
            fast_path_timings.append(run_elapsed)

    assert fast_path_timings, (
        "nvCOMP fast-path did not run during the timed call; the threshold "
        f"is meaningless without it. Records: {records}"
    )

    elapsed = min(fast_path_timings)
    assert elapsed < 0.2, (
        "read_geotiff_gpu on 2048x2048 deflate-tiled TIFF took "
        f"{elapsed * 1000:.1f} ms (threshold 200 ms) -- possible "
        "regression in the nvCOMP batched H2D upload path"
    )
    assert out.shape == (2048, 2048)


# ============================================================
# Section: nvCOMP decompress cumsum offsets (#1950)
# ============================================================
# Source: test_nvcomp_decompress_cumsum_offsets_1950.py
#
# ``_try_nvcomp_batch_decompress`` used to compute its per-tile host
# prefix-sum offsets via a Python ``for`` loop. The fix swaps in
# ``np.cumsum(sizes, out=offsets[1:])`` to align with the sibling
# batched-D2H helper and the compress-side prefix sum.
def test_nvcomp_decompress_uses_cumsum_for_offsets_1950():
    """Source-level guard against reintroducing the Python for loop.

    Reads ``_gpu_decode.py`` (three directories above this file) as text
    and checks that the ``np.cumsum`` prefix-sum call is present and the
    legacy ``for i in range(1, n_tiles)`` offset loop is absent.
    """
    src_path = pathlib.Path(__file__).parent.parent.parent / "_gpu_decode.py"
    # Decode explicitly as UTF-8 so the guard does not depend on the
    # host's locale encoding.
    src = src_path.read_text(encoding="utf-8")

    cumsum_call = re.compile(
        r"np\.cumsum\(\s*comp_sizes_arr\[:-1\]\s*,\s*"
        r"out\s*=\s*comp_offsets_h\[1:\]\s*\)"
    )
    assert cumsum_call.search(src), (
        "decompress upload block should use "
        "``np.cumsum(comp_sizes_arr[:-1], out=comp_offsets_h[1:])`` for "
        "prefix-sum offsets, aligning with _batched_d2h_to_bytes "
        "(issue #1950)."
    )
    legacy_loop = re.compile(
        r"for\s+i\s+in\s+range\(\s*1\s*,\s*n_tiles\s*\)\s*:\s*\n"
        r"\s*comp_offsets_h\[i\]"
    )
    assert not legacy_loop.search(src), (
        "decompress upload block should no longer compute prefix-sum "
        "offsets with a Python for loop (issue #1950)."
    )


def test_cumsum_matches_loop_prefix_sum_1950():
    """Equivalence between the vectorised cumsum and the prior loop.

    Builds 1024 random int64 sizes and checks that the exclusive prefix
    sum from ``np.cumsum(sizes[:-1], out=offsets[1:])`` equals the one
    produced by an explicit Python loop.
    """
    rng = np.random.RandomState(1950)
    n = 1024
    sizes = rng.randint(100, 100_000, size=n).astype(np.int64)

    offsets_cumsum = np.zeros(n, dtype=np.int64)
    if n > 1:
        np.cumsum(sizes[:-1], out=offsets_cumsum[1:])

    offsets_loop = np.zeros(n, dtype=np.int64)
    for i in range(1, n):
        offsets_loop[i] = offsets_loop[i - 1] + sizes[i - 1]

    np.testing.assert_array_equal(offsets_cumsum, offsets_loop)


@pytest.mark.skipif(
    importlib.util.find_spec("cupy") is None,
    reason="cupy required for nvCOMP path",
)
def test_nvcomp_batch_decompress_roundtrip_1950():
    """End-to-end check: a deflate-tiled raster still decodes correctly.

    Runs only when ``XRSPATIAL_GEOTIFF_STRICT_GPU=1`` is set and a CUDA
    device is available; otherwise the test skips. Writes a 1024x1024
    float32 raster with ``to_geotiff`` and reads it back with
    ``open_geotiff(..., gpu=True)``, requiring an exact match.
    """
    if os.environ.get("XRSPATIAL_GEOTIFF_STRICT_GPU") != "1":
        pytest.skip(
            "set XRSPATIAL_GEOTIFF_STRICT_GPU=1 to exercise the nvCOMP "
            "prefix-sum site; without it the GPU path may fall back to "
            "a CPU codec and bypass this regression."
        )
    try:
        import cupy
    except ImportError:
        pytest.skip("cupy not importable")
    if not cupy.cuda.is_available():
        pytest.skip("CUDA device not available")

    from xrspatial.geotiff import open_geotiff, to_geotiff

    rng = np.random.RandomState(1950)
    height, width = 1024, 1024
    arr = rng.rand(height, width).astype(np.float32)
    da = xr.DataArray(
        arr, dims=["y", "x"],
        coords={"y": np.arange(height), "x": np.arange(width)},
        attrs={"crs": 4326},
    )

    with tempfile.TemporaryDirectory() as td:
        path = os.path.join(td, "tmp_1950_deflate.tif")
        to_geotiff(da, path, compression="deflate", tile_size=256)

        result = open_geotiff(path, gpu=True)
        assert result.shape == (height, width)
        # Copy to host when the backing array exposes ``.get``; otherwise
        # treat it as already host-resident.
        decoded = cupy.asnumpy(result.data) if hasattr(
            result.data, "get") else np.asarray(result.data)

    np.testing.assert_allclose(decoded, arr, atol=0, rtol=0)


# ============================================================
# Section: nvCOMP from-device-bufs single alloc (#1659)
# ============================================================
# Source: test_nvcomp_from_device_bufs_single_alloc_1659.py
#
# ``_try_nvcomp_from_device_bufs`` used to allocate N separate
# ``cupy.empty(tile_bytes)`` output buffers and run ``cupy.concatenate``
# after the nvCOMP decompress kernel returned. The fix matches the
# single-contiguous-buffer + pointer-offset pattern.
from xrspatial.geotiff._gpu_decode import _try_nvcomp_from_device_bufs  # noqa: E402


def _nvcomp_available_1659() -> bool:
    """True when ``_get_nvcomp()`` returns a non-None handle."""
    from xrspatial.geotiff._gpu_decode import _get_nvcomp
    return _get_nvcomp() is not None


@requires_gpu
def test_unsupported_codec_short_circuits_before_allocation_1659():
    """Non-ZSTD codecs must return None without allocating output buffers.

    Passes compression code ``8`` and only asserts the ``None`` return.
    """
    import cupy

    d_tiles = [cupy.zeros(1024, dtype=cupy.uint8) for _ in range(4)]
    assert _try_nvcomp_from_device_bufs(d_tiles, 1024, 8) is None


@requires_gpu
def test_no_nvcomp_lib_returns_none_1659(monkeypatch):
    """When the nvCOMP library is missing, the function must return None."""
    import cupy

    from xrspatial.geotiff import _gpu_decode

    # Simulate a host without the nvCOMP shared library.
    monkeypatch.setattr(_gpu_decode, "_get_nvcomp", lambda: None)

    d_tiles = [cupy.zeros(1024, dtype=cupy.uint8)]
    assert _try_nvcomp_from_device_bufs(d_tiles, 1024, 50000) is None


@requires_gpu
def test_memory_guard_runs_with_full_decomp_size_1659(monkeypatch):
    """The single-buffer allocation must be size-checked before cupy.empty.

    ``_check_gpu_memory`` is replaced by a stub that records its arguments
    and raises ``MemoryError``. The test expects that error to propagate
    and the recorded byte count to equal ``n_tiles * tile_bytes``.
    """
    import cupy

    from xrspatial.geotiff import _gpu_decode

    seen = {"total_bytes": None, "what": None, "called": False}

    def fake_check(required_bytes, what="tile buffer"):
        seen["total_bytes"] = int(required_bytes)
        seen["what"] = what
        seen["called"] = True
        raise MemoryError("simulated OOM")

    # A bare object stands in for the library handle; the stubbed guard
    # raises before it would be used.
    monkeypatch.setattr(_gpu_decode, "_get_nvcomp", lambda: object())
    monkeypatch.setattr(_gpu_decode, "_check_gpu_memory", fake_check)

    n_tiles = 8
    tile_bytes = 65536
    d_tiles = [cupy.zeros(128, dtype=cupy.uint8) for _ in range(n_tiles)]

    with pytest.raises(MemoryError):
        _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000)

    assert seen["called"], "_check_gpu_memory was not called"
    expected_bytes = n_tiles * tile_bytes
    assert seen["total_bytes"] == expected_bytes, (
        f"expected total {expected_bytes}, got {seen['total_bytes']}"
    )
    assert "decompressed" in seen["what"] or "nvCOMP" in seen["what"], (
        f"unhelpful 'what' label: {seen['what']!r}"
    )


@pytest.mark.skipif(
    not _HAS_GPU or not _nvcomp_available_1659(),
    reason="cupy + CUDA + nvCOMP shared lib required",
)
def test_zstd_decompress_roundtrip_returns_single_contiguous_buffer_1659():
    """End-to-end: feed real ZSTD-compressed device buffers in.

    Compresses eight random 4096-byte tiles on the host with
    ``zstandard``, uploads them, and checks that the result is one
    C-contiguous ``uint8`` array holding every tile's payload in order.
    Skips when ``zstandard`` is not installed or nvCOMP returns ``None``.
    """
    import cupy

    # zstandard is an optional dependency that the skipif above does not
    # cover; skip instead of erroring when it is missing.
    zstd = pytest.importorskip("zstandard")

    rng = np.random.default_rng(seed=1659)
    tile_bytes = 4096
    n_tiles = 8

    cctx = zstd.ZstdCompressor()
    host_tiles = [rng.integers(0, 256, size=tile_bytes, dtype=np.uint8)
                  for _ in range(n_tiles)]
    compressed = [cctx.compress(t.tobytes()) for t in host_tiles]
    d_tiles = [cupy.asarray(np.frombuffer(c, dtype=np.uint8))
               for c in compressed]

    result = _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000)

    if result is None:
        pytest.skip("nvCOMP returned None; library may be unusable on this host")

    assert isinstance(result, cupy.ndarray)
    assert result.dtype == cupy.uint8
    assert result.shape == (n_tiles * tile_bytes,)
    assert result.flags.c_contiguous

    host_out = result.get()
    for i, expected in enumerate(host_tiles):
        decoded = host_out[i * tile_bytes:(i + 1) * tile_bytes]
        assert np.array_equal(decoded, expected), (
            f"tile {i} decoded payload differs from input"
        )


@requires_gpu
def test_no_orphan_decomp_buffers_after_call_1659(monkeypatch):
    """A successful call returns a single contiguous buffer.

    Uses ``_FakeNvcompLib_1659`` (defined below) in place of the real
    library, so only the shape, dtype and contiguity of the returned
    buffer are checked, not its contents.
    """
    import cupy

    from xrspatial.geotiff import _gpu_decode

    monkeypatch.setattr(_gpu_decode, "_get_nvcomp",
                        lambda: _FakeNvcompLib_1659())

    n_tiles = 4
    tile_bytes = 2048
    d_tiles = [cupy.zeros(64, dtype=cupy.uint8) for _ in range(n_tiles)]
    result = _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000)

    assert result is not None
    assert isinstance(result, cupy.ndarray)
    assert result.size == n_tiles * tile_bytes
    assert result.flags.c_contiguous
    assert result.dtype == cupy.uint8


class _FakeNvcompLib_1659:
    """Stand-in for the nvCOMP CDLL handle used in tests.

    Exposes only the two ZSTD decompress entry points as attributes; any
    other attribute lookup raises ``AttributeError``.
    """

    def __getattr__(self, name):
        if name == 'nvcompBatchedZstdDecompressGetTempSizeAsync':
            return _fake_temp_size_fn_1659
        if name == 'nvcompBatchedZstdDecompressAsync':
            return _fake_decompress_fn_1659
        raise AttributeError(name)


def _fake_temp_size_fn_1659(n, tile_bytes, opts, p_temp_size, total):
    """Stub for nvcompBatchedZstdDecompressGetTempSizeAsync.

    Writes a temp size of 1 through ``p_temp_size._obj`` and returns 0.
    """
    p_temp_size._obj.value = 1
    return 0


def _fake_decompress_fn_1659(*args):
    """Stub for nvcompBatchedZstdDecompressAsync (success)."""
    return 0


# ============================================================
# Section: nvJPEG encode stream-null sync (#2212)
# ============================================================
# Source: test_nvjpeg_encode_stream_sync_2212.py
#
# Replace ``Device().synchronize()`` inside the per-tile encode loops
# in ``_nvjpeg_batch_encode`` and ``_nvjpeg2k_batch_encode`` with
# ``Stream.null.synchronize()`` so the per-tile sync is scoped to the
# default stream rather than the whole device.
def _function_source_2212(func):
    """Return ``(source_text, first_line_number)`` for *func*."""
    return inspect.getsource(func), func.__code__.co_firstlineno


def _parent_map_2212(tree: ast.AST) -> dict:
    """Map ``id(child)`` to its parent node for every node under *tree*."""
    return {
        id(child): parent
        for parent in ast.walk(tree)
        for child in ast.iter_child_nodes(parent)
    }


def _inside_for_loop_2212(node: ast.AST, parents: dict) -> bool:
    """Return True when some ancestor of *node* is an ``ast.For``."""
    ancestor = parents.get(id(node))
    while ancestor is not None and not isinstance(ancestor, ast.For):
        ancestor = parents.get(id(ancestor))
    return ancestor is not None


def _device_synchronize_lines_2212(tree: ast.AST, start_line: int,
                                   parents: dict, *, only_in_loop: bool):
    """List file lines of ``<x>.Device().synchronize()`` calls.

    With ``only_in_loop=True`` only calls nested in a ``for`` loop are
    reported; with ``False`` only calls outside every ``for`` loop are.
    Line numbers are offset by *start_line* so they refer to the file the
    function came from rather than to the extracted snippet.
    """

    def _is_device_sync(node):
        if not isinstance(node, ast.Call):
            return False
        target = node.func
        if not isinstance(target, ast.Attribute):
            return False
        if target.attr != 'synchronize':
            return False
        receiver = target.value
        # The receiver must itself be a call spelled ``<something>.Device()``.
        return (
            isinstance(receiver, ast.Call)
            and isinstance(receiver.func, ast.Attribute)
            and receiver.func.attr == 'Device'
        )

    want_in_loop = bool(only_in_loop)
    return [
        start_line + node.lineno - 1
        for node in ast.walk(tree)
        if _is_device_sync(node)
        and _inside_for_loop_2212(node, parents) == want_in_loop
    ]


def _stream_null_synchronize_lines_2212(tree: ast.AST, start_line: int,
                                        parents: dict, *, only_in_loop: bool):
    """List file lines of ``<x>.Stream.null.synchronize()`` calls.

    ``only_in_loop`` filters exactly as in
    ``_device_synchronize_lines_2212``.
    """

    def _is_stream_null_sync(node):
        if not isinstance(node, ast.Call):
            return False
        target = node.func
        if not isinstance(target, ast.Attribute):
            return False
        if target.attr != 'synchronize':
            return False
        receiver = target.value
        # ``.null`` must be an attribute access, not the result of a call.
        if not isinstance(receiver, ast.Attribute) or receiver.attr != 'null':
            return False
        owner = receiver.value
        return isinstance(owner, ast.Attribute) and owner.attr == 'Stream'

    want_in_loop = bool(only_in_loop)
    return [
        start_line + node.lineno - 1
        for node in ast.walk(tree)
        if _is_stream_null_sync(node)
        and _inside_for_loop_2212(node, parents) == want_in_loop
    ]


class TestNvjpegEncodeStreamSync_2212:
    """Source-level checks on ``_nvjpeg_batch_encode`` (no GPU required)."""

    def setup_method(self):
        from xrspatial.geotiff import _gpu_decode

        self._fn = _gpu_decode._nvjpeg_batch_encode
        self._src, self._start_line = _function_source_2212(self._fn)
        self._tree = ast.parse(self._src)
        self._parents = _parent_map_2212(self._tree)

    def test_no_device_synchronize_inside_encode_loop(self):
        """No ``Device().synchronize()`` call may sit inside a ``for`` loop."""
        offending = _device_synchronize_lines_2212(
            self._tree,
            self._start_line,
            self._parents,
            only_in_loop=True,
        )
        assert offending == [], (
            "_nvjpeg_batch_encode contains cupy.cuda.Device().synchronize() "
            f"calls inside a for-loop at file lines {offending}. The fix "
            "in #2212 scopes the per-tile sync to the default stream via "
            "cupy.cuda.Stream.null.synchronize()."
        )

    def test_stream_null_synchronize_present(self):
        """At least one ``Stream.null.synchronize()`` must be in a loop."""
        found = _stream_null_synchronize_lines_2212(
            self._tree,
            self._start_line,
            self._parents,
            only_in_loop=True,
        )
        assert found, (
            "_nvjpeg_batch_encode no longer calls "
            "cupy.cuda.Stream.null.synchronize() inside the encode loop."
        )


class TestNvjpeg2kEncodeStreamSync_2212:
    """Source-level checks on ``_nvjpeg2k_batch_encode``."""

    def setup_method(self):
        from xrspatial.geotiff import _gpu_decode

        self._fn = _gpu_decode._nvjpeg2k_batch_encode
        self._src, self._start_line = _function_source_2212(self._fn)
        self._tree = ast.parse(self._src)
        self._parents = _parent_map_2212(self._tree)

    def test_no_device_synchronize_inside_encode_loop(self):
        """No ``Device().synchronize()`` call may sit inside a ``for`` loop."""
        offending = _device_synchronize_lines_2212(
            self._tree,
            self._start_line,
            self._parents,
            only_in_loop=True,
        )
        assert offending == [], (
            "_nvjpeg2k_batch_encode contains Device().synchronize() inside "
            f"a for-loop at file lines {offending}. The fix in #2212 "
            "requires Stream.null.synchronize()."
        )

    def test_stream_null_synchronize_present(self):
        """At least one ``Stream.null.synchronize()`` must be in a loop."""
        found = _stream_null_synchronize_lines_2212(
            self._tree,
            self._start_line,
            self._parents,
            only_in_loop=True,
        )
        assert found, (
            "_nvjpeg2k_batch_encode no longer calls "
            "Stream.null.synchronize() inside the encode loop."
        )


class TestDecodeReferencePattern_2212:
    """Pin the decoder's sync pattern, which the encoders are meant to mirror."""

    def setup_method(self):
        from xrspatial.geotiff import _gpu_decode

        self._fn = _gpu_decode._try_nvjpeg_batch_decode
        self._src, self._start_line = _function_source_2212(self._fn)
        self._tree = ast.parse(self._src)
        self._parents = _parent_map_2212(self._tree)

    def test_decoder_uses_stream_null_sync_in_loop(self):
        """The decoder must call ``Stream.null.synchronize()`` in a loop."""
        found = _stream_null_synchronize_lines_2212(
            self._tree,
            self._start_line,
            self._parents,
            only_in_loop=True,
        )
        assert found, (
            "_try_nvjpeg_batch_decode no longer uses "
            "Stream.null.synchronize() inside the decode loop."
        )


# ============================================================
# Section: nvJPEG2000 single-alloc pool (#2107)
# ============================================================
# Source: test_nvjpeg2k_single_alloc_2107.py
#
# Replace per-tile / per-component ``cupy.empty`` allocations and per-
# tile ``Device().synchronize()`` inside the decode loop with a single
# contiguous device pool and a single batch-end sync.


def _function_source_2107(func):
    """Return ``(source_text, first_line_number)`` for *func*."""
    return inspect.getsource(func), func.__code__.co_firstlineno


def _inside_for_loop_2107(node: ast.AST, parents: dict) -> bool:
    """Return True when some ancestor of *node* is an ``ast.For``."""
    ancestor = parents.get(id(node))
    while ancestor is not None and not isinstance(ancestor, ast.For):
        ancestor = parents.get(id(ancestor))
    return ancestor is not None


def _parent_map_2107(tree: ast.AST) -> dict:
    """Map ``id(child)`` to its parent node for every node under *tree*."""
    return {
        id(child): parent
        for parent in ast.walk(tree)
        for child in ast.iter_child_nodes(parent)
    }


class TestNvjpeg2kSingleAllocStructural_2107:
    """Source-level checks on ``_try_nvjpeg2k_batch_decode`` (no GPU)."""

    def setup_method(self):
        from xrspatial.geotiff import _gpu_decode

        self._fn = _gpu_decode._try_nvjpeg2k_batch_decode
        self._src, self._start_line = _function_source_2107(self._fn)
        self._tree = ast.parse(self._src)
        self._parents = _parent_map_2107(self._tree)

    def test_no_cupy_empty_inside_decode_loop(self):
        """``cupy.empty`` / ``cp.empty`` must not be called in a ``for`` loop."""
        offending = [
            self._start_line + call.lineno - 1
            for call in ast.walk(self._tree)
            if isinstance(call, ast.Call)
            and isinstance(call.func, ast.Attribute)
            and call.func.attr == 'empty'
            and isinstance(call.func.value, ast.Name)
            and call.func.value.id in ('cupy', 'cp')
            and _inside_for_loop_2107(call, self._parents)
        ]
        assert offending == [], (
            f"_try_nvjpeg2k_batch_decode contains cupy.empty(...) calls "
            f"inside a for-loop at file lines {offending}. The refactor "
            f"in #2107 moved every output allocation outside the per-tile "
            f"loop."
        )

    def test_no_device_synchronize_inside_decode_loop(self):
        """``<x>.Device().synchronize()`` must not be called in a ``for`` loop."""

        def _is_device_sync(call):
            if not isinstance(call, ast.Call):
                return False
            target = call.func
            if not isinstance(target, ast.Attribute):
                return False
            if target.attr != 'synchronize':
                return False
            receiver = target.value
            return (
                isinstance(receiver, ast.Call)
                and isinstance(receiver.func, ast.Attribute)
                and receiver.func.attr == 'Device'
            )

        offending = [
            self._start_line + call.lineno - 1
            for call in ast.walk(self._tree)
            if _is_device_sync(call)
            and _inside_for_loop_2107(call, self._parents)
        ]
        assert offending == [], (
            f"_try_nvjpeg2k_batch_decode contains Device().synchronize() "
            f"calls inside a for-loop at file lines {offending}. The "
            f"refactor in #2107 keeps exactly one batch-end sync outside "
            f"the loop."
        )

    def test_pool_allocation_present(self):
        """The source text must mention the pool buffer and its slab size."""
        source = self._src
        assert 'd_comp_pool' in source, (
            "_try_nvjpeg2k_batch_decode no longer references the shared "
            "d_comp_pool buffer; the refactor in #2107 is missing or "
            "reverted."
        )
        assert 'per_tile_comp_bytes' in source, (
            "_try_nvjpeg2k_batch_decode no longer references "
            "per_tile_comp_bytes."
        )

    def test_check_gpu_memory_guard_present(self):
        """The source text must contain a ``_check_gpu_memory(`` call."""
        guard_call = '_check_gpu_memory('
        assert guard_call in self._src, (
            "_try_nvjpeg2k_batch_decode no longer calls _check_gpu_memory."
        )


class TestNvjpeg2kLibAbsentShortCircuit_2107:
    """Early-exit paths of ``_try_nvjpeg2k_batch_decode``."""

    def test_returns_none_when_lib_missing(self, monkeypatch):
        """With ``_get_nvjpeg2k`` returning None the decoder returns None."""
        from xrspatial.geotiff import _gpu_decode

        monkeypatch.setattr(_gpu_decode, '_get_nvjpeg2k', lambda: None)

        decode_kwargs = dict(
            compressed_tiles=[b''],
            tile_width=8,
            tile_height=8,
            dtype=np.dtype('uint8'),
            samples=1,
        )
        assert _gpu_decode._try_nvjpeg2k_batch_decode(**decode_kwargs) is None

    def test_returns_none_for_unsupported_dtype(self, monkeypatch):
        """float32 input returns None and tears the handles down in order."""
        from xrspatial.geotiff import _gpu_decode

        class _FakeLib:
            """Fake nvJPEG2000 handle: creators return 0, destroyers log."""

            def __init__(self):
                self.calls = []

            def nvjpeg2kCreateSimple(self, *_ignored):
                return 0

            def nvjpeg2kDecodeStateCreate(self, *_ignored):
                return 0

            def nvjpeg2kStreamCreate(self, *_ignored):
                return 0

            def nvjpeg2kDecodeParamsCreate(self, *_ignored):
                return 0

            def nvjpeg2kDecodeParamsDestroy(self, *_ignored):
                self.calls.append('params_destroy')

            def nvjpeg2kStreamDestroy(self, *_ignored):
                self.calls.append('stream_destroy')

            def nvjpeg2kDecodeStateDestroy(self, *_ignored):
                self.calls.append('state_destroy')

            def nvjpeg2kDestroy(self, *_ignored):
                self.calls.append('handle_destroy')

        fake = _FakeLib()
        monkeypatch.setattr(_gpu_decode, '_get_nvjpeg2k', lambda: fake)

        decode_kwargs = dict(
            compressed_tiles=[b''],
            tile_width=8,
            tile_height=8,
            dtype=np.dtype('float32'),
            samples=1,
        )
        assert _gpu_decode._try_nvjpeg2k_batch_decode(**decode_kwargs) is None

        # The expected order is params, stream, state, then handle.
        expected_teardown = [
            'params_destroy',
            'stream_destroy',
            'state_destroy',
            'handle_destroy',
        ]
        assert fake.calls == expected_teardown


@requires_gpu
class TestNvjpeg2kPoolWithCupy_2107:
    """cupy-only smoke test for the pool's slab arithmetic."""

    def test_pool_slabs_are_non_overlapping(self):
        """Every pool byte must belong to exactly one tile/component slab."""
        cupy = pytest.importorskip('cupy')

        n_tiles, tile_width, tile_height, samples = 4, 32, 32, 3
        dtype = np.dtype('uint16')
        pitch = tile_width * dtype.itemsize
        slab_bytes = tile_height * pitch
        per_tile_comp_bytes = samples * slab_bytes
        pool = cupy.empty(n_tiles * per_tile_comp_bytes, dtype=cupy.uint8)

        seen = set()
        for i in range(n_tiles):
            for c in range(samples):
                start = i * per_tile_comp_bytes + c * slab_bytes
                for byte in range(start, start + slab_bytes):
                    assert byte not in seen, (
                        f"pool byte {byte} appears in two slabs "
                        f"(tile={i}, comp={c}); per-tile slab math is "
                        f"wrong."
                    )
                    seen.add(byte)

        # No overlaps and full coverage together mean an exact tiling.
        assert len(seen) == int(pool.nbytes)


# ============================================================
# Section: nvJPEG output-format constants (#1549)
# ============================================================
# Source: test_jpeg_gpu_1549.py
#
# Off-by-two on the ``nvjpegOutputFormat_t`` constants in
# ``_gpu_decode.py`` caused ``cudaErrorIllegalAddress`` on 3-band JPEG
# TIFFs and silently-wrong pixels on single-band JPEG TIFFs.
def _nvjpeg_available_1549() -> bool:
    """Return True when a GPU is present and ``_get_nvjpeg`` yields a handle.

    Any exception raised while probing is treated as "not available".
    """
    available = False
    if _HAS_GPU:
        try:
            from xrspatial.geotiff._gpu_decode import _get_nvjpeg
            available = _get_nvjpeg() is not None
        except Exception:
            available = False
    return available


_HAS_NVJPEG_1549 = _nvjpeg_available_1549()

_gpu_only_1549 = pytest.mark.skipif(
    not all((
        _HAS_GPU,
        _HAS_TIFFFILE,
        _HAS_PIL,
        _HAS_IMAGECODECS,
        _HAS_NVJPEG_1549,
    )),
    reason="cupy + CUDA + tifffile + Pillow + imagecodecs + nvJPEG required",
)


def _write_jpeg_rgb_tiff_1549(path: str, seed: int = 0,
                              noise: bool = True) -> np.ndarray:
    """Write a 256x256x3 uint8 JPEG TIFF with 128x128 tiles; return the pixels.

    ``noise=True`` fills the image with seeded random bytes. ``noise=False``
    builds a smooth gradient instead (R = mean of row and column index,
    G = row index, B = column index).
    """
    import tifffile

    if noise:
        pixels = np.random.default_rng(seed).integers(
            0, 256, size=(256, 256, 3), dtype=np.uint8,
        )
    else:
        ys, xs = np.mgrid[0:256, 0:256].astype(np.int32)
        bands = [(ys + xs) // 2, ys, xs]
        pixels = np.stack(bands, axis=2).clip(0, 255).astype(np.uint8)

    tifffile.imwrite(
        path,
        pixels,
        photometric='rgb',
        tile=(128, 128),
        compression='jpeg',
    )
    return pixels


def _write_jpeg_gray_tiff_1549(path: str, seed: int = 42) -> np.ndarray:
    """Write a 256x256 uint8 JPEG TIFF with 128x128 tiles; return the pixels."""
    import tifffile

    pixels = np.random.default_rng(seed).integers(
        0, 256, size=(256, 256), dtype=np.uint8,
    )
    tifffile.imwrite(
        path,
        pixels,
        photometric='minisblack',
        tile=(128, 128),
        compression='jpeg',
    )
    return pixels


@_gpu_only_1549
def test_rgb_jpeg_gpu_no_crash_1549(tmp_path, monkeypatch):
    """A 3-band JPEG reads on the GPU, and the nvJPEG branch really ran.

    ``_try_nvjpeg_batch_decode`` is wrapped with a counting spy so the test
    fails if the read succeeded without nvJPEG producing a result.
    """
    import cupy

    from xrspatial.geotiff import _gpu_decode, read_geotiff_gpu

    attempts = []
    hits = []
    real_decode = _gpu_decode._try_nvjpeg_batch_decode

    def spying_decode(*args, **kwargs):
        attempts.append(1)
        decoded_batch = real_decode(*args, **kwargs)
        if decoded_batch is not None:
            hits.append(1)
        return decoded_batch

    monkeypatch.setattr(
        _gpu_decode, "_try_nvjpeg_batch_decode", spying_decode,
    )

    path = str(tmp_path / "rgb_jpeg_1549.tif")
    _write_jpeg_rgb_tiff_1549(path)

    arr = read_geotiff_gpu(path, gpu='strict', allow_internal_only_jpeg=True)
    assert isinstance(arr.data, cupy.ndarray)

    host_pixels = arr.data.get()
    assert host_pixels.shape == (256, 256, 3)
    assert host_pixels.dtype == np.uint8

    assert len(attempts) >= 1, (
        "nvJPEG branch was never called -- test did not exercise the "
        "code path the #1549 fix lives on"
    )
    assert len(hits) >= 1, (
        "nvJPEG returned None -- CPU Pillow fallback ran and the fix was "
        "not exercised"
    )


@_gpu_only_1549
def test_rgb_jpeg_gpu_matches_cpu_1549(tmp_path):
    """GPU and CPU reads of a gradient RGB JPEG agree within tolerance."""
    from xrspatial.geotiff import open_geotiff

    path = str(tmp_path / "rgb_jpeg_match_1549.tif")
    _write_jpeg_rgb_tiff_1549(path, noise=False)

    cpu = open_geotiff(path, allow_internal_only_jpeg=True)
    gpu = open_geotiff(path, gpu=True, allow_internal_only_jpeg=True)
    assert cpu.shape == gpu.shape == (256, 256, 3)

    # Widen to int before subtracting so uint8 differences cannot wrap.
    cpu_pixels = np.asarray(cpu.data).astype(int)
    gpu_pixels = np.asarray(gpu.data.get()).astype(int)
    diff = np.abs(cpu_pixels - gpu_pixels)

    assert diff.mean() < 1.0, f"mean diff {diff.mean():.3f} too large"
    assert diff.max() < 8, f"max diff {diff.max()} too large"


@_gpu_only_1549
def test_grayscale_jpeg_gpu_matches_cpu_1549(tmp_path):
    """GPU and CPU reads of a single-band JPEG differ by at most 2 levels."""
    from xrspatial.geotiff import open_geotiff

    path = str(tmp_path / "gray_jpeg_1549.tif")
    _write_jpeg_gray_tiff_1549(path)

    cpu = open_geotiff(path, allow_internal_only_jpeg=True)
    gpu = open_geotiff(path, gpu=True, allow_internal_only_jpeg=True)
    assert cpu.shape == gpu.shape == (256, 256)

    cpu_pixels = np.asarray(cpu.data).astype(int)
    gpu_pixels = np.asarray(gpu.data.get()).astype(int)
    diff = np.abs(cpu_pixels - gpu_pixels)

    assert diff.max() <= 2, (
        f"grayscale max diff {diff.max()} indicates corruption, "
        f"not just rounding"
    )

@_gpu_only_1549
def test_cuda_context_survives_after_jpeg_gpu_read_1549(tmp_path):
    """Verify the CUDA context is healthy after a GPU JPEG read.

    After one RGB JPEG read, a plain cupy reduction and a second
    (grayscale) GPU read must both still work.
    """
    import cupy

    from xrspatial.geotiff import open_geotiff

    path = str(tmp_path / "rgb_ctx_1549.tif")
    _write_jpeg_rgb_tiff_1549(path)

    arr = open_geotiff(path, gpu=True, allow_internal_only_jpeg=True)
    _ = arr.data.get()

    # The sum of 0..1023 is a small integer, exactly representable in
    # float32, so an exact equality check is safe here.
    x = cupy.arange(1024, dtype=cupy.float32)
    s = float(cupy.sum(x).item())
    assert s == 1023 * 1024 / 2

    other_path = str(tmp_path / "other_1549.tif")
    _write_jpeg_gray_tiff_1549(other_path, seed=7)
    other = open_geotiff(other_path, gpu=True, allow_internal_only_jpeg=True)
    assert other.shape == (256, 256)
    assert other.dtype == np.uint8


# ============================================================
# Section: LERC valid-mask GPU (PR #1529 follow-up)
# ============================================================
# Source: test_lerc_valid_mask_gpu.py
#
# The CPU LERC reader honours the LERC valid-mask. The GPU LERC tile-
# decode path used to discard the mask. These tests confirm the GPU
# path now matches the CPU path for representative mask combinations.

# ``lerc`` is an optional dependency. It is imported defensively rather
# than through a module-scope ``pytest.importorskip``: a skip raised while
# this module is being imported skips the WHOLE module, which in this
# consolidated file would silently drop every other codec section on hosts
# without ``lerc``. The LERC tests are gated by ``_gpu_only_lerc`` instead.
try:
    import lerc as lerc_lerc
except ImportError:
    lerc_lerc = None

from xrspatial.geotiff._compression import LERC_AVAILABLE  # noqa: E402

_gpu_only_lerc = pytest.mark.skipif(
    not (_HAS_GPU and LERC_AVAILABLE and lerc_lerc is not None),
    reason="cupy + CUDA + lerc required",
)


@pytest.fixture
def lerc_writer_with_mask_gpu(monkeypatch):
    """Patch ``lerc_compress`` to embed a valid-mask the writer can't pass.

    Returns a ``holder`` dict. Set ``holder["invalid"]`` to a callable that
    takes the decoded tile array and returns a boolean array marking the
    invalid pixels; leave it as ``None`` to encode without a mask.
    """
    holder = {"invalid": None}

    def _patched(data, width, height, samples=1,
                 dtype=np.dtype('float32'), max_z_error=0.0):
        # Rebuild the tile array from the raw bytes the writer hands over.
        if samples == 1:
            arr = np.frombuffer(data, dtype=dtype).reshape(height, width)
        else:
            arr = np.frombuffer(data, dtype=dtype).reshape(
                height, width, samples)

        invalid_pred = holder["invalid"]
        if invalid_pred is None:
            mask = None
            has_mask = False
        else:
            # The mask passed to the encoder uses 1 = valid, 0 = invalid.
            invalid = invalid_pred(arr)
            mask = np.where(invalid, np.uint8(0), np.uint8(1))
            has_mask = True

        result = lerc_lerc.encode(
            arr, samples, has_mask, mask, max_z_error, 1,
        )
        # result[0] is treated as the status code, result[2] as the payload.
        if result[0] != 0:
            raise RuntimeError(
                f"LERC encode failed with error code {result[0]}")
        return bytes(result[2])

    monkeypatch.setattr(
        "xrspatial.geotiff._compression.lerc_compress", _patched,
    )
    return holder


def _read_cpu_gpu_lerc(path):
    """Read *path* with both readers and return ``(cpu_array, gpu_host_array)``."""
    from xrspatial.geotiff import read_geotiff_gpu
    from xrspatial.geotiff._reader import read_to_array

    cpu, _geo = read_to_array(path, allow_experimental_codecs=True)
    gpu_da = read_geotiff_gpu(
        path, gpu='strict', allow_experimental_codecs=True,
    )
    gpu_host = gpu_da.data.get()
    return cpu, gpu_host


def _restore_sentinel_lerc(arr, nodata):
    """Replace NaN positions in *arr* with *nodata* for bit-exact compare.

    *arr* is returned unchanged when *nodata* is ``None`` or NaN, or when
    *arr* is not a float array. Otherwise a modified copy is returned.
    """
    if nodata is None or arr.dtype.kind != 'f' or np.isnan(nodata):
        return arr
    out = arr.copy()
    out[np.isnan(out)] = arr.dtype.type(nodata)
    return out


def _invalid_pred_at_lerc(positions):
    """Build an ``invalid`` predicate that flags exactly *positions*.

    The returned callable takes a tile array and returns a boolean array of
    its first two dimensions, True at each ``(row, col)`` in *positions*.
    """
    def invalid_pred(a):
        m = np.zeros(a.shape[:2], dtype=bool)
        for r, c in positions:
            m[r, c] = True
        return m

    return invalid_pred


@_gpu_only_lerc
class TestGpuLercValidMask:
    """End-to-end TIFF round-trips comparing GPU vs CPU output."""

    def test_float32_nan_nodata(self, tmp_path, lerc_writer_with_mask_gpu):
        """Float32 LERC + NaN nodata: GPU output matches CPU output."""
        from xrspatial.geotiff._writer import write

        arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8)
        invalid_positions = {(0, 1), (5, 4)}
        lerc_writer_with_mask_gpu["invalid"] = _invalid_pred_at_lerc(
            invalid_positions)

        path = str(tmp_path / "lerc_mask_nan_gpu.tif")
        write(arr, path, compression="lerc", tiled=True, tile_size=8,
              nodata=float("nan"))

        cpu, gpu = _read_cpu_gpu_lerc(path)
        for (r, c) in invalid_positions:
            assert np.isnan(cpu[r, c])
            assert np.isnan(gpu[r, c])
        # NaN != NaN, so zero out the NaNs before the exact comparison.
        cpu_valid = np.where(np.isnan(cpu), 0.0, cpu)
        gpu_valid = np.where(np.isnan(gpu), 0.0, gpu)
        np.testing.assert_array_equal(cpu_valid, gpu_valid)

    def test_float32_sentinel_nodata(self, tmp_path, lerc_writer_with_mask_gpu):
        """Float32 LERC + sentinel nodata (-9999): GPU matches CPU."""
        from xrspatial.geotiff._writer import write

        arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8)
        invalid_positions = {(0, 1), (3, 3), (7, 7)}
        lerc_writer_with_mask_gpu["invalid"] = _invalid_pred_at_lerc(
            invalid_positions)

        path = str(tmp_path / "lerc_mask_sentinel_f32_gpu.tif")
        write(arr, path, compression="lerc", tiled=True, tile_size=8,
              nodata=-9999.0)

        cpu, gpu = _read_cpu_gpu_lerc(path)
        # Swap the GPU array's NaNs back to the sentinel before comparing
        # it with the CPU array.
        gpu_with_sentinel = _restore_sentinel_lerc(gpu, -9999.0)
        np.testing.assert_array_equal(cpu, gpu_with_sentinel)
        for (r, c) in invalid_positions:
            assert np.isnan(gpu[r, c])
            assert gpu_with_sentinel[r, c] == np.float32(-9999.0)

    def test_uint16_sentinel_nodata(self, tmp_path, lerc_writer_with_mask_gpu):
        """Uint16 LERC + sentinel nodata (65535): GPU matches CPU."""
        from xrspatial.geotiff._writer import write

        arr = (np.arange(1, 65, dtype=np.uint16) * 100).reshape(8, 8)
        invalid_positions = {(0, 1), (4, 4)}
        lerc_writer_with_mask_gpu["invalid"] = _invalid_pred_at_lerc(
            invalid_positions)

        path = str(tmp_path / "lerc_mask_uint16_gpu.tif")
        write(arr, path, compression="lerc", tiled=True, tile_size=8,
              nodata=65535)

        cpu, gpu = _read_cpu_gpu_lerc(path)
        # The GPU result is expected as float64 with NaN at masked pixels.
        assert gpu.dtype == np.float64
        gpu_no_nan = np.where(np.isnan(gpu), 65535.0, gpu)
        gpu_u16 = gpu_no_nan.astype(np.uint16)
        np.testing.assert_array_equal(cpu, gpu_u16)
        for (r, c) in invalid_positions:
            assert np.isnan(gpu[r, c])
            assert gpu_u16[r, c] == np.uint16(65535)

    def test_no_mask_roundtrip_bitexact(self, tmp_path):
        """All-valid LERC (no encoded mask): GPU and CPU agree bit-exact."""
        from xrspatial.geotiff._writer import write

        arr = np.arange(64, dtype=np.float32).reshape(8, 8)
        path = str(tmp_path / "lerc_no_mask_gpu.tif")
        write(arr, path, compression="lerc", tiled=True, tile_size=8)

        cpu, gpu = _read_cpu_gpu_lerc(path)
        np.testing.assert_array_equal(cpu, arr)
        np.testing.assert_array_equal(gpu, arr)


# ============================================================
# Section: predictor=2 big-endian GPU (#1517)
# ============================================================
# Source: test_predictor2_big_endian_gpu_1517.py
#
# Predictor=2 BE files used to come back with wrong values on the GPU
# tiled path. The per-dtype predictor kernels now byte-swap the buffer
# before running the prefix-sum.
_gpu_only_1517 = pytest.mark.skipif(
    not _HAS_GPU or not _HAS_TIFFFILE,
    reason="cupy + CUDA + tifffile required",
)


def _block_cpu_fallback_1517(monkeypatch):
    """Patch the GPU backend's ``_read_to_array`` to raise ``AssertionError``.

    After this call, any CPU fallback taken through that name fails the
    test instead of quietly producing a correct result.
    """
    from xrspatial.geotiff._backends import gpu as gpu_backend

    def _no_fallback(*args, **kwargs):
        raise AssertionError(
            "read_geotiff_gpu fell back to read_to_array; "
            "the GPU decode path was not exercised."
        )

    monkeypatch.setattr(
        gpu_backend, '_read_to_array', _no_fallback, raising=True,
    )


def _assert_gpu_equals_cpu_1517(tiff_path, pixels, *, monkeypatch=None,
                                require_native=True, **imwrite_kwargs):
    """Write *pixels* as a deflate TIFF and compare the GPU and CPU reads.

    Steps, in order: write the file with ``tifffile`` (extra keyword
    arguments are forwarded), read it on the CPU and check it equals
    *pixels*, optionally block the CPU fallback (when *monkeypatch* is
    given), then read on the GPU and check the container type, the dtype
    and the values. ``require_native`` adds a native-byte-order check on
    the GPU dtype.
    """
    import cupy
    import tifffile

    from xrspatial.geotiff import read_geotiff_gpu
    from xrspatial.geotiff._reader import read_to_array

    target = str(tiff_path)
    tifffile.imwrite(target, pixels, compression="deflate", **imwrite_kwargs)

    cpu_pixels, _ = read_to_array(target)
    np.testing.assert_array_equal(cpu_pixels, pixels)

    # Block the fallback only after the CPU baseline has been read.
    if monkeypatch is not None:
        _block_cpu_fallback_1517(monkeypatch)

    gpu_result = read_geotiff_gpu(target)
    assert isinstance(gpu_result.data, cupy.ndarray)
    assert gpu_result.data.dtype == np.dtype(pixels.dtype)
    if require_native:
        assert gpu_result.data.dtype.isnative
    np.testing.assert_array_equal(gpu_result.data.get(), cpu_pixels)


@_gpu_only_1517
def test_gpu_predictor2_big_endian_int32_tiled_reproducer_1517(tmp_path, monkeypatch):
    """Issue #1517 reproducer: big-endian int32, tiled, deflate, predictor=2."""
    rng = np.random.RandomState(20260507)
    pixels = rng.randint(
        -1_000_000, 1_000_000, size=(32, 48), dtype=np.int64
    ).astype(np.int32)

    _assert_gpu_equals_cpu_1517(
        tmp_path / "be_pred2_int32.tif",
        pixels,
        monkeypatch=monkeypatch,
        byteorder=">",
        predictor=2,
        tile=(16, 16),
    )


@_gpu_only_1517
@pytest.mark.parametrize(
    "dtype",
    [np.uint16, np.int16, np.uint32, np.int32],
)
def test_gpu_predictor2_big_endian_dtypes_tiled_1517(tmp_path, monkeypatch, dtype):
    """Big-endian predictor=2 tiled files match the CPU read for each dtype."""
    rng = np.random.RandomState(20260508)
    info = np.iinfo(dtype)
    # Clamp the sampling range so it fits the narrower integer types.
    low = max(info.min, -1_000_000)
    high = min(info.max, 1_000_000)
    pixels = rng.randint(low, high, size=(32, 48), dtype=np.int64).astype(dtype)

    _assert_gpu_equals_cpu_1517(
        tmp_path / f"be_pred2_{np.dtype(dtype).name}.tif",
        pixels,
        monkeypatch=monkeypatch,
        byteorder=">",
        predictor=2,
        tile=(16, 16),
    )


@_gpu_only_1517
def test_gpu_predictor2_big_endian_stripped_uint16_1517(tmp_path):
    """Stripped big-endian predictor=2 file reads correctly.

    The CPU fallback is deliberately not blocked for this layout.
    """
    rng = np.random.RandomState(20260509)
    pixels = rng.randint(0, 60000, size=(32, 48), dtype=np.uint16)

    _assert_gpu_equals_cpu_1517(
        tmp_path / "be_pred2_uint16_strip.tif",
        pixels,
        byteorder=">",
        predictor=2,
    )


@_gpu_only_1517
def test_gpu_predictor2_little_endian_still_works_1517(tmp_path, monkeypatch):
    """Little-endian predictor=2 still round-trips on the GPU path."""
    rng = np.random.RandomState(20260510)
    pixels = rng.randint(
        -1_000_000, 1_000_000, size=(32, 48), dtype=np.int64
    ).astype(np.int32)

    _assert_gpu_equals_cpu_1517(
        tmp_path / "le_pred2_int32.tif",
        pixels,
        monkeypatch=monkeypatch,
        require_native=False,
        byteorder="<",
        predictor=2,
        tile=(16, 16),
    )


@_gpu_only_1517
def test_gpu_predictor3_big_endian_still_works_1517(tmp_path, monkeypatch):
    """Big-endian floating-point predictor (3) still matches the CPU read."""
    rng = np.random.RandomState(20260511)
    pixels = rng.standard_normal((32, 48)).astype(np.float32)

    _assert_gpu_equals_cpu_1517(
        tmp_path / "be_pred3_float32.tif",
        pixels,
        monkeypatch=monkeypatch,
        require_native=False,
        byteorder=">",
        predictor=3,
        tile=(16, 16),
    )


def test_swap_byte_lanes_numpy_bps2_1517():
    """bps=2: each adjacent byte pair is swapped on a numpy buffer."""
    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes

    raw = np.array([1, 2, 3, 4], dtype=np.uint8)
    _swap_byte_lanes(raw, 2)

    swapped = np.array([2, 1, 4, 3], dtype=np.uint8)
    np.testing.assert_array_equal(raw, swapped)


def test_swap_byte_lanes_numpy_bps4_1517():
    """bps=4: bytes are reversed inside each 4-byte sample."""
    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes

    raw = np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=np.uint8)
    _swap_byte_lanes(raw, 4)

    swapped = np.array([4, 3, 2, 1, 8, 7, 6, 5], dtype=np.uint8)
    np.testing.assert_array_equal(raw, swapped)


def test_swap_byte_lanes_numpy_bps8_1517():
    """bps=8: bytes are reversed inside each 8-byte sample."""
    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes

    one_sample = np.arange(1, 9, dtype=np.uint8)
    raw = np.tile(one_sample, 2).copy()
    _swap_byte_lanes(raw, 8)

    swapped = np.tile(one_sample[::-1], 2)
    np.testing.assert_array_equal(raw, swapped)


def test_swap_byte_lanes_uint8_noop_1517():
    """bps=1 leaves a numpy buffer unchanged."""
    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes

    raw = np.array([1, 2, 3], dtype=np.uint8)
    _swap_byte_lanes(raw, 1)

    untouched = np.array([1, 2, 3], dtype=np.uint8)
    np.testing.assert_array_equal(raw, untouched)


def test_swap_byte_lanes_rejects_unsupported_bps_1517():
    """bps=3 raises ``ValueError`` mentioning "unsupported bps"."""
    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes

    six_bytes = np.zeros(6, dtype=np.uint8)
    with pytest.raises(ValueError, match="unsupported bps"):
        _swap_byte_lanes(six_bytes, 3)


def test_swap_byte_lanes_rejects_misaligned_size_1517():
    """A 5-byte buffer with bps=2 raises ``ValueError`` ("not a multiple")."""
    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes

    five_bytes = np.zeros(5, dtype=np.uint8)
    with pytest.raises(ValueError, match="not a multiple"):
        _swap_byte_lanes(five_bytes, 2)


def test_swap_byte_lanes_numpy_is_zero_temp_1517():
    """The numpy path swaps in place: the data pointer does not move."""
    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes

    raw = np.array([1, 2, 3, 4], dtype=np.uint8)
    pointer_before = raw.ctypes.data
    _swap_byte_lanes(raw, 2)

    assert raw.ctypes.data == pointer_before
    swapped = np.array([2, 1, 4, 3], dtype=np.uint8)
    np.testing.assert_array_equal(raw, swapped)


@_gpu_only_1517
@pytest.mark.parametrize("bps,dtype", [
    (2, np.uint16),
    (4, np.uint32),
    (8, np.uint64),
])
def test_swap_byte_lanes_cupy_kernel_1517(bps, dtype):
    """On a cupy buffer the swap is in place and matches ``ndarray.byteswap``."""
    import cupy

    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes

    rng = np.random.RandomState(20260512 + bps)
    n_samples = 1024
    samples = rng.randint(
        0, np.iinfo(dtype).max, size=n_samples, dtype=np.uint64,
    ).astype(dtype)
    expected = samples.byteswap()

    # Upload the raw bytes; the helper works on a uint8 view of the data.
    d_buf = cupy.asarray(samples.view(np.uint8))
    pointer_before = int(d_buf.data.ptr)
    _swap_byte_lanes(d_buf, bps)
    pointer_after = int(d_buf.data.ptr)

    assert pointer_after == pointer_before, "kernel must operate in place"
    np.testing.assert_array_equal(d_buf.get().view(dtype), expected)


@_gpu_only_1517
def test_swap_byte_lanes_cupy_uint8_noop_1517():
    """bps=1 leaves a cupy buffer unchanged."""
    import cupy

    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes

    host_bytes = np.arange(16, dtype=np.uint8)
    d_buf = cupy.asarray(host_bytes)
    _swap_byte_lanes(d_buf, 1)

    np.testing.assert_array_equal(d_buf.get(), host_bytes)


# ============================================================
# Section: predictor=3 + integer SampleFormat rejection on GPU (#1933)
# ============================================================
# Source: test_predictor3_int_dtype_gpu_1933.py
#
# ``_validate_predictor_sample_format`` is wired into every IFD-read
# site. This section closes the GPU coverage gap for the two GPU
# validator call sites (tiled eager + GDS chunked).
+ +from xrspatial.geotiff._compression import COMPRESSION_NONE # noqa: E402 +from xrspatial.geotiff._dtypes import LONG, SHORT, numpy_to_tiff_dtype # noqa: E402 +from xrspatial.geotiff._header import ( # noqa: E402 + TAG_BITS_PER_SAMPLE, + TAG_COMPRESSION, + TAG_IMAGE_LENGTH, + TAG_IMAGE_WIDTH, + TAG_PHOTOMETRIC, + TAG_PREDICTOR, + TAG_ROWS_PER_STRIP, + TAG_SAMPLE_FORMAT, + TAG_SAMPLES_PER_PIXEL, + TAG_STRIP_BYTE_COUNTS, + TAG_STRIP_OFFSETS, + TAG_TILE_BYTE_COUNTS, + TAG_TILE_LENGTH, + TAG_TILE_OFFSETS, + TAG_TILE_WIDTH, +) +from xrspatial.geotiff._writer import ( # noqa: E402 + _assemble_standard_layout, + _write_stripped, +) + + +def _build_predictor3_uint32_stripped_tiff_1933(arr: np.ndarray) -> bytes: + """Build a stripped TIFF: predictor=3 + uint32 SampleFormat=1.""" + rel_off, bc, chunks = _write_stripped(arr, COMPRESSION_NONE, False) + bits_per_sample, _ = numpy_to_tiff_dtype(arr.dtype) + tags = [ + (TAG_IMAGE_WIDTH, LONG, 1, arr.shape[1]), + (TAG_IMAGE_LENGTH, LONG, 1, arr.shape[0]), + (TAG_BITS_PER_SAMPLE, SHORT, 1, bits_per_sample), + (TAG_COMPRESSION, SHORT, 1, COMPRESSION_NONE), + (TAG_PHOTOMETRIC, SHORT, 1, 1), + (TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1), + (TAG_SAMPLE_FORMAT, SHORT, 1, 1), + (TAG_PREDICTOR, SHORT, 1, 3), + (TAG_ROWS_PER_STRIP, SHORT, 1, arr.shape[0]), + (TAG_STRIP_OFFSETS, LONG, len(rel_off), rel_off), + (TAG_STRIP_BYTE_COUNTS, LONG, len(bc), bc), + ] + parts = [(arr, arr.shape[1], arr.shape[0], rel_off, bc, chunks)] + return _assemble_standard_layout(8, [tags], parts, bigtiff=False) + + +def _build_predictor3_uint32_tiled_tiff_1933( + arr: np.ndarray, tile_w: int = 16, tile_h: int = 16, +) -> bytes: + """Build a tiled malformed TIFF: predictor=3 + uint32 SampleFormat=1.""" + bits_per_sample, _ = numpy_to_tiff_dtype(arr.dtype) + h, w = arr.shape + + tiles_across = (w + tile_w - 1) // tile_w + tiles_down = (h + tile_h - 1) // tile_h + tiles: list[bytes] = [] + rel_off: list[int] = [] + bc: list[int] = [] + offset = 0 + for tr in 
range(tiles_down): + for tc in range(tiles_across): + r0 = tr * tile_h + c0 = tc * tile_w + r1 = min(r0 + tile_h, h) + c1 = min(c0 + tile_w, w) + tile_slice = arr[r0:r1, c0:c1] + if tile_slice.shape != (tile_h, tile_w): + padded = np.zeros((tile_h, tile_w), dtype=arr.dtype) + padded[: tile_slice.shape[0], : tile_slice.shape[1]] = ( + tile_slice) + tile_arr = padded + else: + tile_arr = np.ascontiguousarray(tile_slice) + chunk = tile_arr.tobytes() + rel_off.append(offset) + bc.append(len(chunk)) + tiles.append(chunk) + offset += len(chunk) + + tags = [ + (TAG_IMAGE_WIDTH, LONG, 1, w), + (TAG_IMAGE_LENGTH, LONG, 1, h), + (TAG_BITS_PER_SAMPLE, SHORT, 1, bits_per_sample), + (TAG_COMPRESSION, SHORT, 1, COMPRESSION_NONE), + (TAG_PHOTOMETRIC, SHORT, 1, 1), + (TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1), + (TAG_SAMPLE_FORMAT, SHORT, 1, 1), + (TAG_PREDICTOR, SHORT, 1, 3), + (TAG_TILE_WIDTH, LONG, 1, tile_w), + (TAG_TILE_LENGTH, LONG, 1, tile_h), + (TAG_TILE_OFFSETS, LONG, len(rel_off), rel_off), + (TAG_TILE_BYTE_COUNTS, LONG, len(bc), bc), + ] + parts = [(arr, w, h, rel_off, bc, tiles)] + return _assemble_standard_layout(8, [tags], parts, bigtiff=False) + + +@requires_gpu +class TestGPUEagerRejectsMalformedFile_1933: + """``read_geotiff_gpu`` rejects predictor=3 + integer SampleFormat.""" + + def test_gpu_eager_stripped_raises(self, tmp_path): + from xrspatial.geotiff import read_geotiff_gpu + + arr = np.array( + [[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.uint32) + path = tmp_path / "pred3_uint32_stripped.tif" + path.write_bytes(_build_predictor3_uint32_stripped_tiff_1933(arr)) + with pytest.raises(ValueError, match="Predictor=3"): + read_geotiff_gpu(str(path)) + + def test_gpu_eager_tiled_raises(self, tmp_path): + """Tiled layout hits the tiled GPU validator at gpu.py:443.""" + from xrspatial.geotiff import read_geotiff_gpu + + arr = np.arange(256, dtype=np.uint32).reshape(16, 16) + path = tmp_path / "pred3_uint32_tiled.tif" + 
path.write_bytes(_build_predictor3_uint32_tiled_tiff_1933(arr)) + with pytest.raises(ValueError, match="Predictor=3"): + read_geotiff_gpu(str(path)) + + def test_gpu_dispatcher_eager_raises(self, tmp_path): + """``open_geotiff(gpu=True)`` dispatcher rejects the file.""" + from xrspatial.geotiff import open_geotiff + + arr = np.arange(64, dtype=np.uint32).reshape(8, 8) + path = tmp_path / "pred3_uint32_dispatch.tif" + path.write_bytes(_build_predictor3_uint32_stripped_tiff_1933(arr)) + with pytest.raises(ValueError, match="Predictor=3"): + open_geotiff(str(path), gpu=True) + + +@requires_gpu +class TestGPUChunkedRejectsMalformedFile_1933: + """The dask+GPU paths also reject predictor=3 + integer.""" + + def test_read_geotiff_gpu_chunked_stripped_raises(self, tmp_path): + from xrspatial.geotiff import read_geotiff_gpu + + arr = np.arange(64, dtype=np.uint32).reshape(8, 8) + path = tmp_path / "pred3_uint32_chunked_str.tif" + path.write_bytes(_build_predictor3_uint32_stripped_tiff_1933(arr)) + with pytest.raises(ValueError, match="Predictor=3"): + read_geotiff_gpu(str(path), chunks=4) + + def test_read_geotiff_gpu_chunked_tiled_raises(self, tmp_path): + """Tiled chunked path with KvikIO available exercises gpu.py:999.""" + pytest.importorskip("kvikio") + + from xrspatial.geotiff import read_geotiff_gpu + + arr = np.arange(256, dtype=np.uint32).reshape(16, 16) + path = tmp_path / "pred3_uint32_chunked_tiled.tif" + path.write_bytes(_build_predictor3_uint32_tiled_tiff_1933(arr)) + with pytest.raises(ValueError, match="Predictor=3"): + read_geotiff_gpu(str(path), chunks=16) + + def test_open_geotiff_chunks_gpu_dispatcher_raises(self, tmp_path): + """``open_geotiff(chunks=, gpu=True)`` dispatcher rejects the file.""" + from xrspatial.geotiff import open_geotiff + + arr = np.arange(256, dtype=np.uint32).reshape(16, 16) + path = tmp_path / "pred3_uint32_chunked_dispatch.tif" + path.write_bytes(_build_predictor3_uint32_tiled_tiff_1933(arr)) + with pytest.raises(ValueError, 
match="Predictor=3"): + open_geotiff(str(path), chunks=8, gpu=True) + + +@requires_gpu +class TestValidPredictor3StillWorksOnGPU_1933: + """A legitimate predictor=3 + float32 tiled file still decodes on GPU.""" + + def test_predictor3_float32_gpu_round_trip(self, tmp_path): + from xrspatial.geotiff import read_geotiff_gpu, to_geotiff + + arr = np.linspace(-1.0, 1.0, 256, dtype=np.float32).reshape(16, 16) + path = tmp_path / "pred3_float32_tiled.tif" + to_geotiff( + arr, str(path), compression="deflate", predictor=3, + tiled=True, tile_size=16, + ) + + result = read_geotiff_gpu(str(path)) + assert result.dtype == np.float32 + np.testing.assert_array_equal(result.data.get(), arr) + + def test_predictor3_float32_dask_gpu_round_trip(self, tmp_path): + from xrspatial.geotiff import read_geotiff_gpu, to_geotiff + + arr = np.linspace(-1.0, 1.0, 256, dtype=np.float32).reshape(16, 16) + path = tmp_path / "pred3_float32_dask.tif" + to_geotiff( + arr, str(path), compression="deflate", predictor=3, + tiled=True, tile_size=16, + ) + + result = read_geotiff_gpu(str(path), chunks=8) + assert result.dtype == np.float32 + np.testing.assert_array_equal(result.compute().data.get(), arr) + + +@requires_gpu +class TestErrorMessageStable_1933: + """The GPU error wording matches the eager/dask wording.""" + + def test_gpu_error_message_matches_eager(self, tmp_path): + from xrspatial.geotiff import open_geotiff, read_geotiff_gpu + + arr = np.arange(64, dtype=np.uint32).reshape(8, 8) + path = tmp_path / "pred3_uint32_msg.tif" + path.write_bytes(_build_predictor3_uint32_stripped_tiff_1933(arr)) + + with pytest.raises(ValueError) as exc_eager: + open_geotiff(str(path)) + with pytest.raises(ValueError) as exc_gpu: + read_geotiff_gpu(str(path)) + + assert str(exc_eager.value) == str(exc_gpu.value), ( + "GPU and eager paths must surface the same Predictor=3 " + "error message so callers can use a single except branch." 
+ ) + + +# ============================================================ +# Section: GPU writer rejects JPEG without opt-in (#1845) +# ============================================================ +# Source: test_gpu_jpeg_interop_reject_issue_D_1845.py +# +# ``write_geotiff_gpu`` mirrors ``to_geotiff`` and rejects +# ``compression='jpeg'`` by default. ``allow_internal_only_jpeg=True`` +# opts in and emits ``GeoTIFFFallbackWarning``. + +from xrspatial.geotiff import GeoTIFFFallbackWarning, write_geotiff_gpu # noqa: E402 + + +def _make_rgb_uint8_da_1845() -> xr.DataArray: + """64x64x3 uint8 RGB raster suitable for the JPEG encode path.""" + rng = np.random.RandomState(0) + arr = rng.randint(0, 256, size=(64, 64, 3), dtype=np.uint8) + return xr.DataArray( + arr, + dims=("y", "x", "band"), + coords={ + "y": np.arange(64, dtype=np.float64), + "x": np.arange(64, dtype=np.float64), + "band": np.array([1, 2, 3], dtype=np.int32), + }, + ) + + +def test_write_geotiff_gpu_rejects_jpeg_without_opt_in_1845(tmp_path): + """``compression='jpeg'`` without the opt-in raises ``ValueError``.""" + da = _make_rgb_uint8_da_1845() + path = str(tmp_path / "rejected_issue_D_1845.tif") + + with pytest.raises(ValueError, match="JPEGTables"): + write_geotiff_gpu(da, path, compression='jpeg') + + +def test_write_geotiff_gpu_rejects_jpeg_message_mentions_alternatives_1845(tmp_path): + """The rejection error mentions the same alternative codecs.""" + da = _make_rgb_uint8_da_1845() + path = str(tmp_path / "rejected_msg_issue_D_1845.tif") + + with pytest.raises(ValueError) as exc: + write_geotiff_gpu(da, path, compression='jpeg') + + msg = str(exc.value) + assert "deflate" in msg + assert "zstd" in msg + + +def test_write_geotiff_gpu_rejects_jpeg_case_insensitive_1845(tmp_path): + """Upper-case ``compression='JPEG'`` is rejected too.""" + da = _make_rgb_uint8_da_1845() + path = str(tmp_path / "rejected_upper_issue_D_1845.tif") + + with pytest.raises(ValueError, match="JPEGTables"): + 
write_geotiff_gpu(da, path, compression='JPEG') + + +@requires_gpu +def test_write_geotiff_gpu_jpeg_opt_in_emits_warning_1845(tmp_path): + """``allow_internal_only_jpeg=True`` emits ``GeoTIFFFallbackWarning``.""" + da = _make_rgb_uint8_da_1845() + path = str(tmp_path / "opt_in_issue_D_1845.tif") + + with pytest.warns(GeoTIFFFallbackWarning, match="JPEGTables"): + write_geotiff_gpu( + da, path, + compression='jpeg', + allow_internal_only_jpeg=True, + ) + + assert os.path.exists(path) + assert os.path.getsize(path) > 0 + + +@requires_gpu +def test_write_geotiff_gpu_non_jpeg_unaffected_by_flag_1845(tmp_path): + """Setting ``allow_internal_only_jpeg=True`` on a non-JPEG codec is a no-op.""" + da = _make_rgb_uint8_da_1845() + path = str(tmp_path / "non_jpeg_flag_issue_D_1845.tif") + + with _warnings.catch_warnings(): + _warnings.simplefilter("error", GeoTIFFFallbackWarning) + write_geotiff_gpu( + da, path, + compression='zstd', + allow_internal_only_jpeg=True, + ) diff --git a/xrspatial/geotiff/tests/test_gpu_jpeg_interop_reject_issue_D_1845.py b/xrspatial/geotiff/tests/test_gpu_jpeg_interop_reject_issue_D_1845.py deleted file mode 100644 index 2f88b05b8..000000000 --- a/xrspatial/geotiff/tests/test_gpu_jpeg_interop_reject_issue_D_1845.py +++ /dev/null @@ -1,151 +0,0 @@ -"""Issue #1845: ``write_geotiff_gpu`` must reject ``compression='jpeg'`` -by default. - -Background ----------- -``to_geotiff`` raises a ``ValueError`` for ``compression='jpeg'`` because -the encoder writes self-contained JFIF tiles without the TIFF JPEGTables -tag (347); the resulting files are unreadable by libtiff, GDAL, and -rasterio. The GPU writer sat in the same module and silently accepted -the same kwarg, producing the same broken format. 
The fix introduces an -``allow_internal_only_jpeg`` opt-in: callers who want the experimental -internal-reader-only path must ask for it explicitly, and they get a -``GeoTIFFFallbackWarning`` reminding them the file will not round-trip -through external readers. - -These tests pin the rejection (CPU-only, no CUDA required) and the -opt-in warning behaviour. The internal-only encode path itself is -covered by the updated tests in -``test_gpu_writer_compression_modes_2026_05_11.py`` which run only when -CUDA is present. -""" -from __future__ import annotations - -import importlib.util - -import numpy as np -import pytest -import xarray as xr - -from xrspatial.geotiff import GeoTIFFFallbackWarning, write_geotiff_gpu - - -def _gpu_available() -> bool: - if importlib.util.find_spec("cupy") is None: - return False - try: - import cupy - return bool(cupy.cuda.is_available()) - except Exception: - return False - - -_HAS_GPU = _gpu_available() - - -def _make_rgb_uint8_da() -> xr.DataArray: - """64x64x3 uint8 RGB raster suitable for the JPEG encode path.""" - rng = np.random.RandomState(0) - arr = rng.randint(0, 256, size=(64, 64, 3), dtype=np.uint8) - return xr.DataArray( - arr, - dims=("y", "x", "band"), - coords={ - "y": np.arange(64, dtype=np.float64), - "x": np.arange(64, dtype=np.float64), - "band": np.array([1, 2, 3], dtype=np.int32), - }, - ) - - -def test_write_geotiff_gpu_rejects_jpeg_without_opt_in(tmp_path): - """``compression='jpeg'`` without the opt-in raises ``ValueError``. - - Mirrors the ``to_geotiff`` rejection so both writers in the same - module agree about JPEG-in-TIFF interop. The check runs before any - GPU work, so the test does not need CUDA. 
- """ - da = _make_rgb_uint8_da() - path = str(tmp_path / "rejected_issue_D_1845.tif") - - with pytest.raises(ValueError, match="JPEGTables"): - write_geotiff_gpu(da, path, compression='jpeg') - - -def test_write_geotiff_gpu_rejects_jpeg_message_mentions_alternatives(tmp_path): - """The rejection error mentions the same alternative codecs that - ``to_geotiff`` recommends, so callers landing on either entry point - learn what to switch to.""" - da = _make_rgb_uint8_da() - path = str(tmp_path / "rejected_msg_issue_D_1845.tif") - - with pytest.raises(ValueError) as exc: - write_geotiff_gpu(da, path, compression='jpeg') - - # The shared message wording from to_geotiff. If the two writers - # drift apart, callers reading the error get inconsistent advice. - msg = str(exc.value) - assert "deflate" in msg - assert "zstd" in msg - - -def test_write_geotiff_gpu_rejects_jpeg_case_insensitive(tmp_path): - """Upper-case ``compression='JPEG'`` is rejected too. - - ``to_geotiff`` lower-cases the compression string before comparing, - so the GPU writer must follow the same rule -- otherwise a caller - who types ``'JPEG'`` slips past the gate.""" - da = _make_rgb_uint8_da() - path = str(tmp_path / "rejected_upper_issue_D_1845.tif") - - with pytest.raises(ValueError, match="JPEGTables"): - write_geotiff_gpu(da, path, compression='JPEG') - - -@pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required") -def test_write_geotiff_gpu_jpeg_opt_in_emits_warning(tmp_path): - """Setting ``allow_internal_only_jpeg=True`` proceeds with the JPEG - encode and emits ``GeoTIFFFallbackWarning``. - - The warning is the only signal the caller gets that their file may - not round-trip through GDAL or rasterio, so a missing warning here - would let the footgun back in. 
- """ - da = _make_rgb_uint8_da() - path = str(tmp_path / "opt_in_issue_D_1845.tif") - - with pytest.warns(GeoTIFFFallbackWarning, match="JPEGTables"): - write_geotiff_gpu( - da, path, - compression='jpeg', - allow_internal_only_jpeg=True, - ) - - # File was actually written, not just warned-about. - import os - assert os.path.exists(path) - assert os.path.getsize(path) > 0 - - -def test_write_geotiff_gpu_non_jpeg_unaffected_by_flag(tmp_path): - """Setting ``allow_internal_only_jpeg=True`` on a non-JPEG codec is a - no-op (no warning, no error). - - The flag is JPEG-specific; other codecs must not pay any cost for - it being present in the signature.""" - pytest.importorskip("cupy") - if not _HAS_GPU: - pytest.skip("cupy + CUDA required") - - da = _make_rgb_uint8_da() - path = str(tmp_path / "non_jpeg_flag_issue_D_1845.tif") - - import warnings as _warnings - with _warnings.catch_warnings(): - _warnings.simplefilter("error", GeoTIFFFallbackWarning) - # Should not raise (no JPEG-related warning fires). - write_geotiff_gpu( - da, path, - compression='zstd', - allow_internal_only_jpeg=True, - ) diff --git a/xrspatial/geotiff/tests/test_jpeg_gpu_1549.py b/xrspatial/geotiff/tests/test_jpeg_gpu_1549.py deleted file mode 100644 index 1b33fe33d..000000000 --- a/xrspatial/geotiff/tests/test_jpeg_gpu_1549.py +++ /dev/null @@ -1,266 +0,0 @@ -"""GPU regression test for issue #1549. - -Reading a 3-band tiled JPEG GeoTIFF with ``open_geotiff(..., gpu=True)`` -crashed inside the nvJPEG decode kernel with:: - - cupy.cuda.runtime.CUDARuntimeError: cudaErrorIllegalAddress - -The crash was sticky -- it poisoned the CUDA context so every later GPU -call in the same process also failed. Root cause: the -``nvjpegOutputFormat_t`` constants in ``_gpu_decode.py`` were defined -two values lower than the SDK's enum. The wrapper sent ``3`` thinking -it was ``NVJPEG_OUTPUT_RGBI`` (interleaved RGB), but ``3`` is -``NVJPEG_OUTPUT_RGB`` (planar) in the real SDK. 
nvJPEG then wrote the -G and B planes through ``nvjpegImage.channel[1]`` and -``channel[2]``, which the wrapper had set to NULL for an interleaved -output buffer, producing an out-of-bounds GPU write inside -``ycbcr_to_format_kernel_roi``. - -The same off-by-two affected the single-band path: it sent ``5`` -(thinking ``NVJPEG_OUTPUT_UNCHANGED``) which is actually -``NVJPEG_OUTPUT_RGBI`` -- nvJPEG produced 3-byte-per-pixel output into a -1-byte-per-pixel buffer, returning visibly wrong pixels rather than -crashing. - -These tests build the exact reproducer from the issue, decode it on GPU, -and verify (a) the decode does not crash, (b) the GPU pixels match the -CPU pixels within the typical libjpeg/nvjpeg rounding tolerance, and -(c) the CUDA context survives a follow-up GPU read of an unrelated -file. -""" -from __future__ import annotations - -import importlib.util - -import numpy as np -import pytest - - -def _gpu_available() -> bool: - """True when cupy is importable and CUDA is initialised.""" - if importlib.util.find_spec("cupy") is None: - return False - try: - import cupy - return bool(cupy.cuda.is_available()) - except Exception: - return False - - -def _nvjpeg_available() -> bool: - """True when libnvjpeg.so loads on this host. - - Without nvJPEG the GPU pipeline silently falls back to CPU Pillow - decode, so the regression for issue #1549 (an out-of-bounds write - inside the nvJPEG kernel) would never be exercised. Skip rather - than test a path the bug never lived on. 
- """ - if not _gpu_available(): - return False - try: - from xrspatial.geotiff._gpu_decode import _get_nvjpeg - return _get_nvjpeg() is not None - except Exception: - return False - - -_HAS_GPU = _gpu_available() -_HAS_TIFFFILE = importlib.util.find_spec("tifffile") is not None -_HAS_PIL = importlib.util.find_spec("PIL") is not None -# tifffile.imwrite(compression='jpeg') delegates the codec to imagecodecs -# (or libjpeg via Pillow on some installs); when neither is wired up the -# write raises and the suite would error instead of skipping cleanly. -_HAS_IMAGECODECS = importlib.util.find_spec("imagecodecs") is not None -_HAS_NVJPEG = _nvjpeg_available() - -_gpu_only = pytest.mark.skipif( - not (_HAS_GPU and _HAS_TIFFFILE and _HAS_PIL - and _HAS_IMAGECODECS and _HAS_NVJPEG), - reason="cupy + CUDA + tifffile + Pillow + imagecodecs + nvJPEG required", -) - - -def _write_jpeg_rgb_tiff(path: str, seed: int = 0, - noise: bool = True) -> np.ndarray: - """Write a 3-band 256x256 tiled JPEG TIFF using tifffile. - - tifffile emits a complete JFIF stream (SOI + APP0 + DQT + DHT + SOF0 - + ...) per tile rather than a JPEGTables-style abbreviated stream, - so the GPU decode path runs without any extra splice step. - - With ``noise=True`` the payload is uniform random bytes -- the - worst-case input for JPEG compression and useful for the - crash-reproducer test which only cares about safe completion. - With ``noise=False`` the payload is a smooth gradient where - libjpeg and nvjpeg agree to within a few LSBs per pixel; that lets - the cross-backend match assertion run with a tight tolerance. - """ - import tifffile - if noise: - rng = np.random.default_rng(seed) - arr = rng.integers(0, 256, size=(256, 256, 3), dtype=np.uint8) - else: - # Smooth gradient: per-channel ramp + cross terms. 
- ys, xs = np.mgrid[0:256, 0:256].astype(np.int32) - r = (ys + xs) // 2 - g = ys - b = xs - arr = np.stack([r, g, b], axis=2).clip(0, 255).astype(np.uint8) - tifffile.imwrite(path, arr, photometric='rgb', tile=(128, 128), - compression='jpeg') - return arr - - -def _write_jpeg_gray_tiff(path: str, seed: int = 42) -> np.ndarray: - """Write a 1-band 256x256 tiled JPEG TIFF using tifffile.""" - import tifffile - rng = np.random.default_rng(seed) - arr = rng.integers(0, 256, size=(256, 256), dtype=np.uint8) - tifffile.imwrite(path, arr, photometric='minisblack', tile=(128, 128), - compression='jpeg') - return arr - - -@_gpu_only -def test_rgb_jpeg_gpu_no_crash(tmp_path, monkeypatch): - """3-band JPEG must not raise CUDARuntimeError on GPU read. - - Uses ``gpu='strict'`` so the original - ``cudaErrorIllegalAddress`` would propagate up the stack instead of - being swallowed by the auto-mode CPU fallback. Without strict the - bug presented as a ``RuntimeWarning: GPU decode failed (...); - falling back to CPU`` and the returned array was the CPU array, so - the assertions below would still pass even with the bug present. - - Also spies on ``_try_nvjpeg_batch_decode`` to fail loudly if the - decode took a CPU fallback path instead of nvJPEG. Without this - guard the test would pass on a system whose nvJPEG returned None for - any reason, defeating the point of the regression test. 
- """ - import cupy - - from xrspatial.geotiff import _gpu_decode, read_geotiff_gpu - - spy = {"calls": 0, "successes": 0} - original = _gpu_decode._try_nvjpeg_batch_decode - - def wrapped(*args, **kwargs): - spy["calls"] += 1 - result = original(*args, **kwargs) - if result is not None: - spy["successes"] += 1 - return result - - monkeypatch.setattr(_gpu_decode, "_try_nvjpeg_batch_decode", wrapped) - - path = str(tmp_path / "rgb_jpeg_1549.tif") - _write_jpeg_rgb_tiff(path) - - arr = read_geotiff_gpu(path, gpu='strict', allow_internal_only_jpeg=True) - # Materialise the GPU buffer so any deferred kernel actually runs - # and surface any sticky error from the decode pipeline. - assert isinstance(arr.data, cupy.ndarray) - decoded = arr.data.get() - assert decoded.shape == (256, 256, 3) - assert decoded.dtype == np.uint8 - - assert spy["calls"] >= 1, ( - "nvJPEG branch was never called — test did not exercise the " - "code path the #1549 fix lives on" - ) - assert spy["successes"] >= 1, ( - "nvJPEG returned None — CPU Pillow fallback ran and the fix was " - "not exercised" - ) - - -@_gpu_only -def test_rgb_jpeg_gpu_matches_cpu(tmp_path): - """GPU pixels must be within JPEG decoder tolerance of CPU pixels. - - With a smooth gradient input the libjpeg (CPU) and nvjpeg (GPU) - decoders agree to a couple of LSBs per pixel. The off-by-two - constant bug scrambled channels enough to push the mean diff above - 60 and the max diff above 200, so a tight bound here pins both the - constant fix and the per-tile sync that keeps the multi-tile - decode deterministic. 
- """ - from xrspatial.geotiff import open_geotiff - - path = str(tmp_path / "rgb_jpeg_match_1549.tif") - _write_jpeg_rgb_tiff(path, noise=False) - - cpu = open_geotiff(path, allow_internal_only_jpeg=True) - gpu = open_geotiff(path, gpu=True, allow_internal_only_jpeg=True) - assert cpu.shape == gpu.shape == (256, 256, 3) - - cpu_arr = np.asarray(cpu.data) - gpu_arr = np.asarray(gpu.data.get()) - - diff = np.abs(cpu_arr.astype(int) - gpu_arr.astype(int)) - assert diff.mean() < 1.0, f"mean diff {diff.mean():.3f} too large" - assert diff.max() < 8, f"max diff {diff.max()} too large" - - -@_gpu_only -def test_grayscale_jpeg_gpu_matches_cpu(tmp_path): - """Single-band JPEG GPU read must also produce correct pixels. - - With the off-by-two constants the single-band path silently produced - wrong output (each Y value duplicated three times then wrapped) -- a - quieter failure mode than the 3-band crash but a corruption - nonetheless. - """ - from xrspatial.geotiff import open_geotiff - - path = str(tmp_path / "gray_jpeg_1549.tif") - _write_jpeg_gray_tiff(path) - - cpu = open_geotiff(path, allow_internal_only_jpeg=True) - gpu = open_geotiff(path, gpu=True, allow_internal_only_jpeg=True) - assert cpu.shape == gpu.shape == (256, 256) - - cpu_arr = np.asarray(cpu.data) - gpu_arr = np.asarray(gpu.data.get()) - diff = np.abs(cpu_arr.astype(int) - gpu_arr.astype(int)) - # For grayscale there is no chroma involved, so libjpeg and nvjpeg - # only diverge by IDCT rounding (typically <= 1 LSB). - assert diff.max() <= 2, ( - f"grayscale max diff {diff.max()} indicates corruption, " - f"not just rounding" - ) - - -@_gpu_only -def test_cuda_context_survives_after_jpeg_gpu_read(tmp_path): - """Verify the CUDA context is healthy after a GPU JPEG read. - - Before the fix, the failing nvJPEG kernel left the context in an - error state so every later GPU call in the same process raised - ``cudaErrorIllegalAddress`` -- even unrelated allocations. 
This - test reads the JPEG on GPU, then performs a small follow-up GPU - operation and an unrelated GPU read and asserts both succeed. - """ - import cupy - - from xrspatial.geotiff import open_geotiff - - path = str(tmp_path / "rgb_ctx_1549.tif") - _write_jpeg_rgb_tiff(path) - - arr = open_geotiff(path, gpu=True, allow_internal_only_jpeg=True) - _ = arr.data.get() - - # Plain CuPy op -- this is the call that used to surface the sticky - # cudaErrorIllegalAddress on its first allocation. - x = cupy.arange(1024, dtype=cupy.float32) - s = float(cupy.sum(x).item()) - assert s == 1023 * 1024 / 2 - - # Unrelated GPU TIFF read -- closes the loop on the issue's - # "every later GPU call fails" symptom. - other_path = str(tmp_path / "other_1549.tif") - _write_jpeg_gray_tiff(other_path, seed=7) - other = open_geotiff(other_path, gpu=True, allow_internal_only_jpeg=True) - assert other.shape == (256, 256) - assert other.dtype == np.uint8 diff --git a/xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py b/xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py deleted file mode 100644 index 62ff1ad1f..000000000 --- a/xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py +++ /dev/null @@ -1,222 +0,0 @@ -"""GPU follow-up to PR #1529 (LERC valid-mask on decode). - -The CPU LERC reader honours the LERC valid-mask and writes the file's -nodata sentinel into masked pixels. The GPU LERC tile-decode path used -to discard the mask, so masked pixels read back as LERC's zero fill -(real-looking measurements at z == 0) on GPU but as NaN/sentinel on -CPU. These tests confirm the GPU path now matches the CPU path for -representative LERC mask combinations. - -Mirrors the structure of ``test_lerc_valid_mask.py`` but compares -``read_geotiff_gpu`` output to ``read_to_array`` output for each case. 
-""" -from __future__ import annotations - -import importlib.util - -import numpy as np -import pytest - -lerc = pytest.importorskip("lerc") - -from xrspatial.geotiff._compression import LERC_AVAILABLE # noqa: E402 - - -def _gpu_available() -> bool: - """True if cupy is importable and CUDA is initialised.""" - if importlib.util.find_spec("cupy") is None: - return False - try: - import cupy - return bool(cupy.cuda.is_available()) - except Exception: - return False - - -_HAS_GPU = _gpu_available() -_gpu_only = pytest.mark.skipif( - not (_HAS_GPU and LERC_AVAILABLE), - reason="cupy + CUDA + lerc required", -) - - -@pytest.fixture -def lerc_writer_with_mask(monkeypatch): - """Patch ``lerc_compress`` to embed a valid-mask the writer can't pass. - - The xrspatial writer hard-codes ``hasMask=False`` in its call to - ``lerc.encode``. Tests inject a per-tile mask through this holder's - ``invalid`` predicate so the masked pixels survive the encode and - show up at decode time. Same pattern as the CPU test fixture in - ``test_lerc_valid_mask.py``. - """ - holder = {"invalid": None} - - def _patched(data, width, height, samples=1, - dtype=np.dtype('float32'), max_z_error=0.0): - if samples == 1: - arr = np.frombuffer(data, dtype=dtype).reshape(height, width) - else: - arr = np.frombuffer(data, dtype=dtype).reshape( - height, width, samples) - invalid_pred = holder["invalid"] - if invalid_pred is None: - mask = None - has_mask = False - else: - invalid = invalid_pred(arr) - mask = np.where(invalid, np.uint8(0), np.uint8(1)) - has_mask = True - result = lerc.encode(arr, samples, has_mask, mask, max_z_error, 1) - if result[0] != 0: - raise RuntimeError( - f"LERC encode failed with error code {result[0]}") - return bytes(result[2]) - - monkeypatch.setattr( - "xrspatial.geotiff._compression.lerc_compress", _patched, - ) - return holder - - -def _read_cpu_gpu(path): - """Read *path* with both readers and return ``(cpu_array, gpu_host_array)``. 
- - Uses the low-level ``read_to_array`` for CPU so that nodata sentinels - stay as the literal value (this module checks LERC mask preservation, - not the higher-level NaN promotion that ``open_geotiff`` performs). - - The GPU reader (``read_geotiff_gpu``) applies the same nodata masking - that ``open_geotiff`` does (PR #1542), so its output uses NaN where - the sentinel was. Callers that want a bit-for-bit comparison against - the low-level CPU read should run the GPU result through - ``_restore_sentinel`` below to put the sentinel back. - """ - from xrspatial.geotiff import read_geotiff_gpu - from xrspatial.geotiff._reader import read_to_array - - cpu, _geo = read_to_array(path, allow_experimental_codecs=True) - gpu_da = read_geotiff_gpu( - path, gpu='strict', allow_experimental_codecs=True, - ) - gpu_host = gpu_da.data.get() - return cpu, gpu_host - - -def _restore_sentinel(arr, nodata): - """Replace NaN positions in *arr* with *nodata* so high-level GPU - reads compare bit-exactly against low-level CPU reads (which keep - the sentinel value verbatim).""" - if nodata is None or arr.dtype.kind != 'f' or np.isnan(nodata): - return arr - out = arr.copy() - out[np.isnan(out)] = arr.dtype.type(nodata) - return out - - -@_gpu_only -class TestGpuLercValidMask: - """End-to-end TIFF round-trips comparing GPU vs CPU output.""" - - def test_float32_nan_nodata(self, tmp_path, lerc_writer_with_mask): - """Float32 LERC + NaN nodata: GPU output matches CPU output.""" - from xrspatial.geotiff._writer import write - - arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8) - invalid_positions = {(0, 1), (5, 4)} - - def invalid_pred(a): - m = np.zeros(a.shape[:2], dtype=bool) - for r, c in invalid_positions: - m[r, c] = True - return m - lerc_writer_with_mask["invalid"] = invalid_pred - - path = str(tmp_path / "lerc_mask_nan_gpu.tif") - write(arr, path, compression="lerc", tiled=True, tile_size=8, - nodata=float("nan")) - - cpu, gpu = _read_cpu_gpu(path) - # NaN positions - for 
(r, c) in invalid_positions: - assert np.isnan(cpu[r, c]) - assert np.isnan(gpu[r, c]) - # Valid positions agree exactly - cpu_valid = np.where(np.isnan(cpu), 0.0, cpu) - gpu_valid = np.where(np.isnan(gpu), 0.0, gpu) - np.testing.assert_array_equal(cpu_valid, gpu_valid) - - def test_float32_sentinel_nodata(self, tmp_path, lerc_writer_with_mask): - """Float32 LERC + sentinel nodata (-9999): GPU matches CPU.""" - from xrspatial.geotiff._writer import write - - arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8) - invalid_positions = {(0, 1), (3, 3), (7, 7)} - - def invalid_pred(a): - m = np.zeros(a.shape[:2], dtype=bool) - for r, c in invalid_positions: - m[r, c] = True - return m - lerc_writer_with_mask["invalid"] = invalid_pred - - path = str(tmp_path / "lerc_mask_sentinel_f32_gpu.tif") - write(arr, path, compression="lerc", tiled=True, tile_size=8, - nodata=-9999.0) - - cpu, gpu = _read_cpu_gpu(path) - # ``read_geotiff_gpu`` applies the high-level nodata mask (#1542), - # so masked pixels come back as NaN. ``read_to_array`` keeps the - # sentinel verbatim. Restore the sentinel on the GPU side so the - # bit-for-bit comparison still pins LERC mask preservation. 
- gpu_with_sentinel = _restore_sentinel(gpu, -9999.0) - np.testing.assert_array_equal(cpu, gpu_with_sentinel) - for (r, c) in invalid_positions: - assert np.isnan(gpu[r, c]) - assert gpu_with_sentinel[r, c] == np.float32(-9999.0) - - def test_uint16_sentinel_nodata(self, tmp_path, lerc_writer_with_mask): - """Uint16 LERC + sentinel nodata (65535): GPU matches CPU.""" - from xrspatial.geotiff._writer import write - - arr = (np.arange(1, 65, dtype=np.uint16) * 100).reshape(8, 8) - invalid_positions = {(0, 1), (4, 4)} - - def invalid_pred(a): - m = np.zeros(a.shape[:2], dtype=bool) - for r, c in invalid_positions: - m[r, c] = True - return m - lerc_writer_with_mask["invalid"] = invalid_pred - - path = str(tmp_path / "lerc_mask_uint16_gpu.tif") - write(arr, path, compression="lerc", tiled=True, tile_size=8, - nodata=65535) - - cpu, gpu = _read_cpu_gpu(path) - # ``read_geotiff_gpu`` applies the high-level nodata mask on - # integer rasters (#1542): the array is promoted to float64 with - # NaN where the sentinel was. ``read_to_array`` keeps uint16 with - # the sentinel literal. Restore the sentinel + dtype on the GPU - # side so the bit-for-bit comparison still pins LERC mask - # preservation. Replace NaN before the uint16 cast to avoid - # numpy's "invalid value encountered in cast" warning. 
- assert gpu.dtype == np.float64 - gpu_no_nan = np.where(np.isnan(gpu), 65535.0, gpu) - gpu_u16 = gpu_no_nan.astype(np.uint16) - np.testing.assert_array_equal(cpu, gpu_u16) - for (r, c) in invalid_positions: - assert np.isnan(gpu[r, c]) - assert gpu_u16[r, c] == np.uint16(65535) - - def test_no_mask_roundtrip_bitexact(self, tmp_path): - """All-valid LERC (no encoded mask): GPU and CPU agree bit-exact.""" - from xrspatial.geotiff._writer import write - - arr = np.arange(64, dtype=np.float32).reshape(8, 8) - path = str(tmp_path / "lerc_no_mask_gpu.tif") - write(arr, path, compression="lerc", tiled=True, tile_size=8) - - cpu, gpu = _read_cpu_gpu(path) - np.testing.assert_array_equal(cpu, arr) - np.testing.assert_array_equal(gpu, arr) diff --git a/xrspatial/geotiff/tests/test_nvcomp_batch_compress_batched_1712.py b/xrspatial/geotiff/tests/test_nvcomp_batch_compress_batched_1712.py deleted file mode 100644 index df1902f74..000000000 --- a/xrspatial/geotiff/tests/test_nvcomp_batch_compress_batched_1712.py +++ /dev/null @@ -1,157 +0,0 @@ -"""Coverage for the batched ``_nvcomp_batch_compress`` (#1712). - -The pre-fix function allocated compressed-output device buffers one -``cupy.empty`` per tile and then read each tile back to host with one -``.get()`` per tile. Both patterns serialised on the default CUDA -stream and were dominant in large-N writes. The fix folds both into a -single contiguous device allocation + a single batched D2H concat-and- -``.get()``, matching the patterns already in use on the decode side -(#1552, #1659). - -These tests pin the new shape and confirm the deflate / zstd GPU write -paths still round-trip end-to-end. 
-""" -from __future__ import annotations - -import importlib.util -import inspect -import os -import tempfile - -import numpy as np -import pytest -import xarray as xr - -try: - import cupy - _HAS_CUPY = True -except Exception: - _HAS_CUPY = False - - -def _gpu_available() -> bool: - """Match the geotiff-test convention: cupy import AND working CUDA. - - A host can have cupy installed without a usable CUDA runtime (no - driver, no device visible, container misconfig), and in that case - every test that calls into the GPU writer would fail rather than - skip. ``cupy.cuda.is_available()`` is the cheap probe. - """ - if importlib.util.find_spec("cupy") is None: - return False - try: - import cupy - return bool(cupy.cuda.is_available()) - except Exception: - return False - - -_HAS_GPU = _gpu_available() - -# nvCOMP is the entry point that exercises this code path. -from xrspatial.geotiff import _gpu_decode # noqa: E402 - -needs_cupy = pytest.mark.skipif( - not _HAS_GPU, reason="cupy + CUDA required" -) - - -# ---------------------------------------------------------------------- -# Source-level structural assertions -- run on any host with the source -# available, no GPU required. -# ---------------------------------------------------------------------- - -def test_no_per_tile_cupy_empty_in_compressed_pool(): - """The per-tile cupy.empty list comprehension is gone (#1712). - - The fix replaced it with a single contiguous allocation. Catch any - regression that brings the loop back. - """ - source = inspect.getsource(_gpu_decode._nvcomp_batch_compress) - assert "cupy.empty(max_cs, dtype=cupy.uint8) for _ in range" not in source, ( - "_nvcomp_batch_compress regressed to per-tile cupy.empty " - "allocations for the compressed output pool. See #1712." - ) - - -def test_no_per_tile_get_in_result_loop(): - """The per-tile ``d_comp_bufs[i][:cs].get().tobytes()`` is gone (#1712). - - The fix replaced it with one concat + one ``.get()``. 
Catch any - regression that brings the per-tile pattern back. - """ - source = inspect.getsource(_gpu_decode._nvcomp_batch_compress) - # The exact string the prior loop used: - bad_fragment = "d_comp_bufs[i][:cs].get().tobytes()" - assert bad_fragment not in source, ( - "_nvcomp_batch_compress regressed to per-tile .get().tobytes() " - "D2H readback. See #1712." - ) - - -# ---------------------------------------------------------------------- -# End-to-end behaviour: GPU write + read round-trip stays correct -# ---------------------------------------------------------------------- - -@needs_cupy -@pytest.mark.parametrize("compression", ["deflate", "zstd"]) -def test_gpu_write_roundtrip_after_batched_compress(compression): - """GPU compress path round-trips uncorrupted for deflate + zstd. - - Catches the most likely regression mode: any off-by-one in the - cumulative-sum offsets used to slice the host-side concatenated - buffer would scramble tile order, which a round-trip equality - check picks up immediately. - """ - from xrspatial.geotiff import open_geotiff, write_geotiff_gpu - - rng = np.random.default_rng(seed=1712) - arr_cpu = rng.random((512, 512), dtype=np.float32) - arr_gpu = cupy.asarray(arr_cpu) - darr = xr.DataArray(arr_gpu, dims=["y", "x"]) - - with tempfile.TemporaryDirectory(prefix="nvcomp_batch_1712_") as td: - path = os.path.join(td, f"roundtrip_{compression}.tif") - try: - write_geotiff_gpu( - darr, path, - compression=compression, - tiled=True, - tile_size=64, - ) - except RuntimeError as e: - # nvCOMP may be unavailable in this environment; the writer - # falls back to CPU and that path doesn't exercise the - # change. Skip rather than fail. - pytest.skip(f"nvCOMP unavailable for {compression}: {e}") - - back = open_geotiff(path) - np.testing.assert_allclose(back.values, arr_cpu, rtol=0, atol=0) - - -@needs_cupy -def test_gpu_write_zero_tile_edge_case(): - """A 0-tile compress returns an empty list without indexing into None. 
- - The cumulative-sum / concat path must short-circuit before - ``cupy.concatenate`` (which would raise on an empty list). The - pre-fix loop simply iterated zero times, so the contract is the - same empty-list output. - """ - # Direct call into the internal function with n_tiles=0 is - # awkward because it needs a libnvCOMP handle and matching opts. - # Instead, exercise the public writer with a tiny single-tile - # input and confirm the fast path does not crash. Real n_tiles==0 - # never occurs via the writer (every image has at least one tile). - from xrspatial.geotiff import open_geotiff, write_geotiff_gpu - arr_gpu = cupy.zeros((32, 32), dtype=cupy.float32) - darr = xr.DataArray(arr_gpu, dims=["y", "x"]) - with tempfile.TemporaryDirectory(prefix="nvcomp_batch_1712_") as td: - path = os.path.join(td, "tiny.tif") - try: - write_geotiff_gpu(darr, path, compression="zstd", - tiled=True, tile_size=32) - except RuntimeError as e: - pytest.skip(f"nvCOMP unavailable: {e}") - back = open_geotiff(path) - assert back.shape == (32, 32) diff --git a/xrspatial/geotiff/tests/test_nvcomp_batch_upload_p3.py b/xrspatial/geotiff/tests/test_nvcomp_batch_upload_p3.py deleted file mode 100644 index 3b6009f60..000000000 --- a/xrspatial/geotiff/tests/test_nvcomp_batch_upload_p3.py +++ /dev/null @@ -1,206 +0,0 @@ -"""Regression tests for batched host->device upload in the nvCOMP path. - -Performance audit P3: ``_try_nvcomp_batch_decompress`` previously did one -``cupy.asarray`` per compressed tile, costing ~6.07 ms for 256 x 64 KB -tiles. The fix concatenates all tiles into a single host buffer, performs -one H2D transfer, and derives per-tile device pointers via -``base_ptr + offsets`` -- mirroring the pattern at -``_gpu_decode.py`` L1714-1722 in the LZW/Deflate path. Measured ~1.66x -speedup that scales worse with more tiles. 
- -The tests skip cleanly when neither libnvcomp nor kvikio.nvcomp is -available on the host -- without one of those, the GPU decoder falls -back to the per-tile numba kernel and the changed code path is not -exercised, so a passing test would be misleading. When the path *is* -available, the correctness test additionally asserts that -``_try_nvcomp_batch_decompress`` returned a non-None CuPy array, so a -silent fall-through to the slow path counts as a test failure. -""" -from __future__ import annotations - -import importlib.util -import time -import uuid - -import numpy as np -import pytest - - -def _gpu_available() -> bool: - if importlib.util.find_spec("cupy") is None: - return False - try: - import cupy - return bool(cupy.cuda.is_available()) - except Exception: - return False - - -def _kvikio_nvcomp_importable() -> bool: - """True iff ``import kvikio.nvcomp`` actually succeeds. - - ``importlib.util.find_spec`` may report kvikio as installed even when - the underlying ``libkvikio.so`` is missing, so we attempt the real - import here. - """ - try: - import kvikio.nvcomp # noqa: F401 - except Exception: - return False - return True - - -def _nvcomp_path_available() -> bool: - """True when at least one nvCOMP backend is loadable on this host. - - The optimised code path runs only when either the C nvCOMP library - (``libnvcomp.so``) or ``kvikio.nvcomp`` is importable. Without one - of those, ``_try_nvcomp_batch_decompress`` always returns None and - timing/correctness tests would silently exercise the slower fallback - decoder. 
- """ - if not _gpu_available(): - return False - try: - from xrspatial.geotiff._gpu_decode import _get_nvcomp - except Exception: - return False - if _get_nvcomp() is not None: - return True - return _kvikio_nvcomp_importable() - - -_HAS_GPU = _gpu_available() -_HAS_TIFFFILE = importlib.util.find_spec("tifffile") is not None -_HAS_NVCOMP = _nvcomp_path_available() -_nvcomp_only = pytest.mark.skipif( - not (_HAS_GPU and _HAS_TIFFFILE and _HAS_NVCOMP), - reason="cupy + CUDA + tifffile + (libnvcomp or kvikio.nvcomp) required", -) - - -def _write_deflate_tiled(path, arr, tile=(256, 256)): - import tifffile - tifffile.imwrite( - str(path), arr, compression="deflate", tile=tile, - ) - - -def _wrap_nvcomp_with_call_recorder(monkeypatch): - """Replace ``_try_nvcomp_batch_decompress`` with a wrapper that records - each (compression, returned_non_none) call. Returns the records list.""" - from xrspatial.geotiff import _gpu_decode - - records: list[tuple[int, bool]] = [] - original = _gpu_decode._try_nvcomp_batch_decompress - - def _recording(compressed_tiles, tile_bytes, compression): - result = original(compressed_tiles, tile_bytes, compression) - records.append((compression, result is not None)) - return result - - monkeypatch.setattr( - _gpu_decode, - '_try_nvcomp_batch_decompress', - _recording, - raising=True, - ) - return records - - -@_nvcomp_only -@pytest.mark.parametrize("size,tile", [ - (256, (128, 128)), # 4 tiles - (1024, (256, 256)), # 16 tiles - (2048, (128, 128)), # 256 tiles -- matches the audit measurement -]) -def test_nvcomp_batch_upload_correctness(tmp_path, monkeypatch, size, tile): - """GPU decode of Deflate-tiled TIFFs is bit-exact vs CPU after the - batched H2D upload rewrite, AND the nvCOMP fast-path actually ran.""" - from xrspatial.geotiff import read_geotiff_gpu - from xrspatial.geotiff._reader import read_to_array - - rng = np.random.RandomState(20260508) - arr = rng.randint(0, 4096, size=(size, size), dtype=np.uint16) - - name = 
f"deflate_{size}_{tile[0]}_{uuid.uuid4().hex[:8]}.tif" - path = tmp_path / name - _write_deflate_tiled(path, arr, tile=tile) - - cpu, _ = read_to_array(str(path)) - np.testing.assert_array_equal(cpu, arr) - - records = _wrap_nvcomp_with_call_recorder(monkeypatch) - gpu_da = read_geotiff_gpu(str(path)) - np.testing.assert_array_equal(gpu_da.data.get(), cpu) - - assert any(success for _, success in records), ( - "_try_nvcomp_batch_decompress was never invoked or always returned " - f"None; records={records}. The optimised path was not exercised, so " - f"this test would pass even if the rewrite were broken." - ) - - -@_nvcomp_only -def test_nvcomp_kvikio_fallback_skips_zstd(monkeypatch): - """When the C nvCOMP lib is missing and kvikio is the only backend, - ZSTD-compressed input must NOT take the kvikio DeflateManager path - (which would strip a fake zlib header and try to decode ZSTD frames - as Deflate). It must return None so the caller can fall through.""" - import xrspatial.geotiff._gpu_decode as _gpu_decode - - # Force the libnvcomp branch off so the kvikio fallback is the only - # path. If kvikio.nvcomp isn't importable, the fallback returns - # None for an unrelated reason -- skip in that case. - if not _kvikio_nvcomp_importable(): - pytest.skip("kvikio.nvcomp not importable; the kvikio branch " - "is never entered on this host") - monkeypatch.setattr(_gpu_decode, '_get_nvcomp', lambda: None) - - # Pass any bytes; the gate must reject ZSTD before any decode work. - result = _gpu_decode._try_nvcomp_batch_decompress( - compressed_tiles=[b'\x28\xb5\x2f\xfd' + b'\x00' * 16], - tile_bytes=1024, - compression=50000, # ZSTD - ) - assert result is None, ( - "_try_nvcomp_batch_decompress returned non-None for ZSTD via the " - "kvikio fallback; this would feed ZSTD bytes through DeflateManager " - "and produce garbage." 
- ) - - -@_nvcomp_only -def test_nvcomp_batch_upload_perf_regression_guard(tmp_path, monkeypatch): - """Sanity guard: 2048x2048 Deflate-tiled GPU decode finishes under a - generous threshold WHILE going through the nvCOMP fast-path. Skipping - when nvCOMP isn't available is handled by ``_nvcomp_only``.""" - from xrspatial.geotiff import read_geotiff_gpu - - rng = np.random.RandomState(20260508) - arr = rng.randint(0, 4096, size=(2048, 2048), dtype=np.uint16) - path = tmp_path / f"deflate_2048_perf_{uuid.uuid4().hex[:8]}.tif" - _write_deflate_tiled(path, arr, tile=(128, 128)) - - # Warm up: first call may JIT-compile kernels and load CUDA libs. - _ = read_geotiff_gpu(str(path)) - - records = _wrap_nvcomp_with_call_recorder(monkeypatch) - t0 = time.perf_counter() - out = read_geotiff_gpu(str(path)) - elapsed = time.perf_counter() - t0 - - assert any(success for _, success in records), ( - "nvCOMP fast-path did not run during the timed call; the threshold " - f"is meaningless without it. Records: {records}" - ) - - # Generous regression threshold; the per-tile upload version was - # ~6 ms just for H2D so anything well above 200 ms is a real - # regression somewhere in the decode pipeline. - assert elapsed < 0.2, ( - f"read_geotiff_gpu on 2048x2048 deflate-tiled TIFF took " - f"{elapsed * 1000:.1f} ms (threshold 200 ms) -- possible " - f"regression in the nvCOMP batched H2D upload path" - ) - assert out.shape == (2048, 2048) diff --git a/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py b/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py deleted file mode 100644 index 4e0524490..000000000 --- a/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py +++ /dev/null @@ -1,155 +0,0 @@ -"""Regression tests for issue #1950. 
- -``_try_nvcomp_batch_decompress`` used to compute its per-tile host -prefix-sum offsets via a Python ``for`` loop: - -``` -comp_sizes_list = [len(t) for t in raw_tiles] -comp_offsets_h = np.zeros(n_tiles, dtype=np.int64) -for i in range(1, n_tiles): - comp_offsets_h[i] = comp_offsets_h[i - 1] + comp_sizes_list[i - 1] -``` - -The sibling batched-D2H helper ``_batched_d2h_to_bytes`` at ~L924 and -the compress-side prefix sum in ``_nvcomp_batch_compress`` at ~L2572 -both use ``np.cumsum(sizes, out=offsets[1:])``. Aligning the -decompress side keeps the codebase consistent and trims interpreter -overhead. - -Two guards here: - -1. Correctness -- a tiny synthetic nvCOMP round-trip (when the lib is - available) still decodes every tile correctly. Without nvCOMP the - test exercises the same prefix-sum reshape via direct comparison - against ``np.cumsum``. -2. Structural -- the source uses ``np.cumsum`` (not a Python - ``range(1, n_tiles)`` loop) for the prefix sum. -""" -from __future__ import annotations - -import importlib.util -import os -import re -import tempfile - -import numpy as np -import pytest - - -def test_nvcomp_decompress_uses_cumsum_for_offsets_1950(): - """Source-level guard against reintroducing the Python for loop. - - The fix swaps the per-tile prefix-sum loop for ``np.cumsum``. - This test fires if anyone reverts to the loop or otherwise breaks - the alignment with ``_batched_d2h_to_bytes`` / ``_nvcomp_batch_compress``. - """ - import pathlib - - src_path = pathlib.Path(__file__).parent.parent / "_gpu_decode.py" - src = src_path.read_text() - - # Anchor on the exact decompress-side prefix-sum call. Regex is - # tighter than a fixed character window and survives surrounding - # code edits. 
- cumsum_call = re.compile( - r"np\.cumsum\(\s*comp_sizes_arr\[:-1\]\s*,\s*" - r"out\s*=\s*comp_offsets_h\[1:\]\s*\)" - ) - assert cumsum_call.search(src), ( - "decompress upload block should use " - "``np.cumsum(comp_sizes_arr[:-1], out=comp_offsets_h[1:])`` for " - "prefix-sum offsets, aligning with _batched_d2h_to_bytes " - "(issue #1950)." - ) - # The legacy Python loop would have written - # ``comp_offsets_h[i] = comp_offsets_h[i - 1] + ...`` inside a - # ``for i in range(1, n_tiles):`` block. - legacy_loop = re.compile( - r"for\s+i\s+in\s+range\(\s*1\s*,\s*n_tiles\s*\)\s*:\s*\n" - r"\s*comp_offsets_h\[i\]" - ) - assert not legacy_loop.search(src), ( - "decompress upload block should no longer compute prefix-sum " - "offsets with a Python for loop (issue #1950)." - ) - - -def test_cumsum_matches_loop_prefix_sum_1950(): - """Equivalence between the vectorised cumsum and the prior loop. - - Numeric guard. Even though the two forms produce the same output - by construction, a runtime check confirms the cumsum form does not - drift away from the previous semantics across numpy versions. - """ - rng = np.random.RandomState(1950) - n = 1024 - sizes = rng.randint(100, 100_000, size=n).astype(np.int64) - - # Vectorised form (matches the fix). - offsets_cumsum = np.zeros(n, dtype=np.int64) - if n > 1: - np.cumsum(sizes[:-1], out=offsets_cumsum[1:]) - - # Reference: explicit Python prefix sum. - offsets_loop = np.zeros(n, dtype=np.int64) - for i in range(1, n): - offsets_loop[i] = offsets_loop[i - 1] + sizes[i - 1] - - np.testing.assert_array_equal(offsets_cumsum, offsets_loop) - - -@pytest.mark.skipif( - importlib.util.find_spec("cupy") is None, - reason="cupy required for nvCOMP path", -) -def test_nvcomp_batch_decompress_roundtrip_1950(): - """End-to-end check: a deflate-tiled raster still decodes correctly. - - Exercises ``_try_nvcomp_batch_decompress`` on a real file via the - public ``read_geotiff_gpu`` entry point. 
If the prefix-sum - refactor mis-stages a tile, the decoded buffer would not match - the source, surfacing as a numerical regression here. - - Gated on ``XRSPATIAL_GEOTIFF_STRICT_GPU=1`` so the run-time check - only fires in environments that actually carry nvCOMP. Without the - flag the GPU read path silently falls back to a CPU codec when - nvCOMP is missing, which bypasses the prefix-sum site entirely; a - pass under that fallback would be misleading, so we skip instead. - """ - if os.environ.get("XRSPATIAL_GEOTIFF_STRICT_GPU") != "1": - pytest.skip( - "set XRSPATIAL_GEOTIFF_STRICT_GPU=1 to exercise the nvCOMP " - "prefix-sum site; without it the GPU path may fall back to " - "a CPU codec and bypass this regression." - ) - try: - import cupy - except ImportError: - pytest.skip("cupy not importable") - if not cupy.cuda.is_available(): - pytest.skip("CUDA device not available") - - import xarray as xr - - from xrspatial.geotiff import open_geotiff, to_geotiff - - rng = np.random.RandomState(1950) - height, width = 1024, 1024 - arr = rng.rand(height, width).astype(np.float32) - da = xr.DataArray( - arr, dims=["y", "x"], - coords={"y": np.arange(height), "x": np.arange(width)}, - attrs={"crs": 4326}, - ) - - with tempfile.TemporaryDirectory() as td: - path = os.path.join(td, "tmp_1950_deflate.tif") - to_geotiff(da, path, compression="deflate", tile_size=256) - - # Read back through the GPU pipeline. 
- result = open_geotiff(path, gpu=True) - assert result.shape == (height, width) - decoded = cupy.asnumpy(result.data) if hasattr( - result.data, "get") else np.asarray(result.data) - - np.testing.assert_allclose(decoded, arr, atol=0, rtol=0) diff --git a/xrspatial/geotiff/tests/test_nvcomp_from_device_bufs_single_alloc_1659.py b/xrspatial/geotiff/tests/test_nvcomp_from_device_bufs_single_alloc_1659.py deleted file mode 100644 index 1144f58c1..000000000 --- a/xrspatial/geotiff/tests/test_nvcomp_from_device_bufs_single_alloc_1659.py +++ /dev/null @@ -1,234 +0,0 @@ -"""Regression tests for the single-buffer pattern in _try_nvcomp_from_device_bufs. - -Issue #1659: ``_try_nvcomp_from_device_bufs`` used to allocate N separate -``cupy.empty(tile_bytes)`` output buffers and run ``cupy.concatenate`` after -the nvCOMP decompress kernel returned. That kept two copies of the -decompressed data alive at once and ran a serial concat the other nvCOMP -paths in this module already avoid. The fix matches the single-contiguous- -buffer + pointer-offset pattern used by the deflate / LZW / host-buffer -paths nearby. - -These tests skip when CuPy + CUDA are not available. They also skip the -end-to-end nvCOMP integration check when ``kvikio`` or the nvCOMP shared -library are not installed, which is the common case on developer hosts; -the unit-level checks (contract + memory guard) run regardless. 
-""" -from __future__ import annotations - -import importlib.util - -import numpy as np -import pytest - -from xrspatial.geotiff._gpu_decode import _try_nvcomp_from_device_bufs - - -def _gpu_available() -> bool: - if importlib.util.find_spec("cupy") is None: - return False - try: - import cupy - return bool(cupy.cuda.is_available()) - except Exception: - return False - - -def _nvcomp_available() -> bool: - from xrspatial.geotiff._gpu_decode import _get_nvcomp - return _get_nvcomp() is not None - - -@pytest.mark.skipif(not _gpu_available(), reason="cupy + CUDA required") -def test_unsupported_codec_short_circuits_before_allocation(): - """Non-ZSTD codecs must return None without allocating output buffers. - - Pins the early-return contract that lets the caller pick a different - decoder when nvCOMP cannot handle this codec. - """ - import cupy - - # Use Deflate (8), which is unsupported by this function (ZSTD only). - d_tiles = [cupy.zeros(1024, dtype=cupy.uint8) for _ in range(4)] - assert _try_nvcomp_from_device_bufs(d_tiles, 1024, 8) is None - - -@pytest.mark.skipif(not _gpu_available(), reason="cupy + CUDA required") -def test_no_nvcomp_lib_returns_none(monkeypatch): - """When the nvCOMP library is missing, the function must return None. - - The caller relies on this signal to fall back to the bytes-based decode - path. Without it, callers would hit a ctypes ``getattr`` AttributeError - deeper in the function. - """ - import cupy - - from xrspatial.geotiff import _gpu_decode - - monkeypatch.setattr(_gpu_decode, "_get_nvcomp", lambda: None) - - d_tiles = [cupy.zeros(1024, dtype=cupy.uint8)] - assert _try_nvcomp_from_device_bufs(d_tiles, 1024, 50000) is None - - -@pytest.mark.skipif(not _gpu_available(), reason="cupy + CUDA required") -def test_memory_guard_runs_with_full_decomp_size(monkeypatch): - """The single-buffer allocation must be size-checked before cupy.empty. 
- - The new pattern allocates one contiguous ``n * tile_bytes`` buffer - instead of N small buffers. The OOM guard is what tells the caller - early that the decode will not fit on the device; a regression that - removed the guard would surface as an opaque CUDA OOM instead. - """ - import cupy - - from xrspatial.geotiff import _gpu_decode - - seen = {"total_bytes": None, "what": None, "called": False} - - def fake_check(required_bytes, what="tile buffer"): - seen["total_bytes"] = int(required_bytes) - seen["what"] = what - seen["called"] = True - raise MemoryError("simulated OOM") - - # Pin _get_nvcomp to something truthy so the function does not bail - # before reaching the allocation step. The fake check raises before - # any nvCOMP call would happen, so the lib value never gets used. - monkeypatch.setattr(_gpu_decode, "_get_nvcomp", lambda: object()) - monkeypatch.setattr(_gpu_decode, "_check_gpu_memory", fake_check) - - n_tiles = 8 - tile_bytes = 65536 - d_tiles = [cupy.zeros(128, dtype=cupy.uint8) for _ in range(n_tiles)] - - with pytest.raises(MemoryError): - _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000) - - assert seen["called"], "_check_gpu_memory was not called" - expected_bytes = n_tiles * tile_bytes - assert seen["total_bytes"] == expected_bytes, ( - f"expected total {expected_bytes}, got {seen['total_bytes']}" - ) - assert "decompressed" in seen["what"] or "nvCOMP" in seen["what"], ( - f"unhelpful 'what' label: {seen['what']!r}" - ) - - -@pytest.mark.skipif( - not _gpu_available() or not _nvcomp_available(), - reason="cupy + CUDA + nvCOMP shared lib required", -) -def test_zstd_decompress_roundtrip_returns_single_contiguous_buffer(): - """End-to-end: feed real ZSTD-compressed device buffers in, check the - output is a single flat ``cupy.uint8`` array of length n*tile_bytes. 
- - This test confirms the return contract that ``_apply_predictor_and_assemble`` - depends on: ``out`` is the contiguous concatenation of the N decompressed - tiles, not a list. The previous implementation returned the same shape but - via ``cupy.concatenate``; the new one allocates the contig buffer up front - and writes through per-tile pointers, so a regression that dropped the - return value would surface here. - """ - import cupy - import zstandard as zstd - - rng = np.random.default_rng(seed=1659) - tile_bytes = 4096 - n_tiles = 8 - - cctx = zstd.ZstdCompressor() - host_tiles = [rng.integers(0, 256, size=tile_bytes, dtype=np.uint8) - for _ in range(n_tiles)] - compressed = [cctx.compress(t.tobytes()) for t in host_tiles] - d_tiles = [cupy.asarray(np.frombuffer(c, dtype=np.uint8)) - for c in compressed] - - result = _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000) - - # nvCOMP may be present but mis-configured on the host (e.g. driver - # version mismatch); skip rather than fail in that case so the test is - # informative when run on a real GDS rig. - if result is None: - pytest.skip("nvCOMP returned None; library may be unusable on this host") - - assert isinstance(result, cupy.ndarray) - assert result.dtype == cupy.uint8 - assert result.shape == (n_tiles * tile_bytes,) - assert result.flags.c_contiguous - - # Decoded payload must match the original host tiles. The buffer is a - # single flat array; tile i lives at offset i*tile_bytes. - host_out = result.get() - for i, expected in enumerate(host_tiles): - decoded = host_out[i * tile_bytes:(i + 1) * tile_bytes] - assert np.array_equal(decoded, expected), ( - f"tile {i} decoded payload differs from input" - ) - - -@pytest.mark.skipif(not _gpu_available(), reason="cupy + CUDA required") -def test_no_orphan_decomp_buffers_after_call(monkeypatch): - """Earlier code held a Python list of N device buffers in scope - alongside the concatenated result. 
The replacement allocates once - and returns that one buffer. - - The check here is structural rather than numerical: after a successful - call the only cupy ndarray the caller receives is ``result`` itself, - and inspecting it confirms ``result.size == n_tiles * tile_bytes``. - """ - import cupy - - from xrspatial.geotiff import _gpu_decode - - # Stub the nvCOMP entry points so the decompress "succeeds" without an - # actual library. Force the function down the success branch, capture - # the returned buffer, then verify shape + ownership. - monkeypatch.setattr(_gpu_decode, "_get_nvcomp", - lambda: _FakeNvcompLib()) - - n_tiles = 4 - tile_bytes = 2048 - d_tiles = [cupy.zeros(64, dtype=cupy.uint8) for _ in range(n_tiles)] - result = _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000) - - # The fake lib reports success and zero-fills the output buffer; the - # function returns the contiguous buffer as-is. - assert result is not None - assert isinstance(result, cupy.ndarray) - assert result.size == n_tiles * tile_bytes - assert result.flags.c_contiguous - # The contract requires uint8 -- not a uint8 view of something else. - assert result.dtype == cupy.uint8 - - -class _FakeNvcompLib: - """Stand-in for the nvCOMP CDLL handle used in tests. - - The real function calls ``getattr(lib, fn_name)`` for two entry points - and invokes each as a ctypes function. We expose those entry-point - names as Python callables that pretend the work succeeded. - """ - - def __getattr__(self, name): - if name == 'nvcompBatchedZstdDecompressGetTempSizeAsync': - return _fake_temp_size_fn - if name == 'nvcompBatchedZstdDecompressAsync': - return _fake_decompress_fn - raise AttributeError(name) - - -def _fake_temp_size_fn(n, tile_bytes, opts, p_temp_size, total): - """Stub for nvcompBatchedZstdDecompressGetTempSizeAsync.""" - # Write a tiny temp-size value into the caller's c_size_t. 
- p_temp_size._obj.value = 1 - return 0 - - -def _fake_decompress_fn(*args): - """Stub for nvcompBatchedZstdDecompressAsync. - - The function's return-value test is ``s != 0``. We return 0 (success). - The caller's d_statuses array is already zero from ``cupy.zeros``, so - the post-decode any-nonzero check passes. - """ - return 0 diff --git a/xrspatial/geotiff/tests/test_nvjpeg2k_single_alloc_2107.py b/xrspatial/geotiff/tests/test_nvjpeg2k_single_alloc_2107.py deleted file mode 100644 index b542039ba..000000000 --- a/xrspatial/geotiff/tests/test_nvjpeg2k_single_alloc_2107.py +++ /dev/null @@ -1,343 +0,0 @@ -"""Tests for the nvJPEG2000 batch-decode allocation refactor (#2107). - -The fix replaces per-tile / per-component ``cupy.empty`` allocations and -per-tile ``cupy.cuda.Device().synchronize()`` inside the decode loop with -a single contiguous device pool and a single batch-end sync. The pattern -matches the prior fixes for ``_try_nvcomp_from_device_bufs`` (#1659), -``_try_kvikio_read_tiles`` (#1688), and ``_nvcomp_batch_compress`` (#1712). - -Since nvJPEG2000 is rarely available on test hosts (libnvjpeg2k.so is -not part of cuda-toolkit's default install), these tests focus on -structural properties of the implementation rather than running a real -decode: - -* ``_try_nvjpeg2k_batch_decode`` early-returns ``None`` when the shared - library is missing -- no allocations or syncs happen on the common - test host. -* Source inspection asserts the new contract: - - Exactly two ``cupy.empty`` calls in the decode loop region - (``d_comp_pool`` + ``d_all_tiles``), zero inside the per-tile loop. - - Exactly one ``Device().synchronize()`` after the loop, zero inside. - - The slab indexing math hits the per-tile-component slab. 
- -Structural tests like these mirror the approach taken by -``test_nvcomp_from_device_bufs_single_alloc_1659.py`` and -``test_kvikio_batched_pread_1688.py`` so a regression that re-introduces -the per-tile alloc pattern fails CI without needing a GPU. -""" -from __future__ import annotations - -import ast -import inspect - -import numpy as np -import pytest - - -def _function_source(func): - """Return the function's source plus its line range in the source file.""" - src = inspect.getsource(func) - start_line = func.__code__.co_firstlineno - return src, start_line - - -def _count_cupy_empty_calls(tree): - """Count ``cupy.empty(...)`` Call nodes inside the AST.""" - n = 0 - for node in ast.walk(tree): - if not isinstance(node, ast.Call): - continue - func = node.func - if not isinstance(func, ast.Attribute): - continue - if func.attr != 'empty': - continue - # ``cupy.empty`` matches both bare ``cupy`` and aliased ``cp``; - # we only care about the ``empty`` method name. - if not isinstance(func.value, ast.Name): - continue - if func.value.id not in ('cupy', 'cp'): - continue - n += 1 - return n - - -def _count_device_synchronize_calls(tree): - """Count ``cupy.cuda.Device(...).synchronize()`` Call nodes.""" - n = 0 - for node in ast.walk(tree): - if not isinstance(node, ast.Call): - continue - func = node.func - if not isinstance(func, ast.Attribute): - continue - if func.attr != 'synchronize': - continue - # We walk back through the chain: synchronize -> Device(...) -> - # cuda -> cupy. Allow any chain that ends in a ``Device`` call. 
- parent_call = func.value - if not isinstance(parent_call, ast.Call): - continue - if not isinstance(parent_call.func, ast.Attribute): - continue - if parent_call.func.attr != 'Device': - continue - n += 1 - return n - - -def _inside_for_loop(node: ast.AST, parents: dict) -> bool: - """Return True when ``node`` sits anywhere under a ``for`` statement.""" - cur = parents.get(id(node)) - while cur is not None: - if isinstance(cur, ast.For): - return True - cur = parents.get(id(cur)) - return False - - -def _parent_map(tree: ast.AST) -> dict: - """Build a ``{id(child): parent}`` lookup map for ``_inside_for_loop``.""" - mapping: dict = {} - for parent in ast.walk(tree): - for child in ast.iter_child_nodes(parent): - mapping[id(child)] = parent - return mapping - - -class TestNvjpeg2kSingleAllocStructural: - """Structural assertions on the refactored helper (no GPU required).""" - - def setup_method(self): - from xrspatial.geotiff import _gpu_decode - - self._fn = _gpu_decode._try_nvjpeg2k_batch_decode - src, start = _function_source(self._fn) - self._src = src - self._start_line = start - self._tree = ast.parse(src) - self._parents = _parent_map(self._tree) - - def test_no_cupy_empty_inside_decode_loop(self): - """``cupy.empty`` must NOT appear inside the per-tile ``for`` loop. - - The refactor moves the pool allocation outside the loop. A - regression that re-introduces a per-tile ``cupy.empty`` would - bring back the memory-pool round-trip the fix removed. - """ - offending = [] - for node in ast.walk(self._tree): - if not isinstance(node, ast.Call): - continue - func = node.func - if not isinstance(func, ast.Attribute): - continue - if func.attr != 'empty': - continue - if (not isinstance(func.value, ast.Name) - or func.value.id not in ('cupy', 'cp')): - continue - if _inside_for_loop(node, self._parents): - offending.append(self._start_line + node.lineno - 1) - assert offending == [], ( - f"_try_nvjpeg2k_batch_decode contains cupy.empty(...) 
calls " - f"inside a for-loop at file lines {offending}. The refactor " - f"in #2107 moved every output allocation outside the per-tile " - f"loop; reverting that defeats the pooling optimisation." - ) - - def test_no_device_synchronize_inside_decode_loop(self): - """``Device().synchronize()`` must NOT live inside the decode loop. - - The previous implementation called it once per tile, forcing - default-stream serialisation. The refactor leaves exactly one - synchronize call after the loop body. - """ - offending = [] - for node in ast.walk(self._tree): - if not isinstance(node, ast.Call): - continue - func = node.func - if not isinstance(func, ast.Attribute): - continue - if func.attr != 'synchronize': - continue - parent_call = func.value - if (not isinstance(parent_call, ast.Call) - or not isinstance(parent_call.func, ast.Attribute) - or parent_call.func.attr != 'Device'): - continue - if _inside_for_loop(node, self._parents): - offending.append(self._start_line + node.lineno - 1) - assert offending == [], ( - f"_try_nvjpeg2k_batch_decode contains Device().synchronize() " - f"calls inside a for-loop at file lines {offending}. The " - f"refactor in #2107 keeps exactly one batch-end sync outside " - f"the loop so successive tiles can pipeline through " - f"nvJPEG2000." - ) - - def test_pool_allocation_present(self): - """Source contains the expected pool buffer name and slab math. - - The refactor introduces ``d_comp_pool`` and - ``per_tile_comp_bytes``; if either disappears, the test fails so - the reviewer notices the layout drift. - """ - assert 'd_comp_pool' in self._src, ( - "_try_nvjpeg2k_batch_decode no longer references the shared " - "d_comp_pool buffer; the refactor in #2107 is missing or " - "reverted." - ) - assert 'per_tile_comp_bytes' in self._src, ( - "_try_nvjpeg2k_batch_decode no longer references " - "per_tile_comp_bytes; the per-tile slab math from #2107 " - "is missing or renamed without an audit." 
- ) - - def test_check_gpu_memory_guard_present(self): - """The pool allocation must be guarded by ``_check_gpu_memory``. - - Sibling helpers (``_try_nvcomp_from_device_bufs``, - ``_try_kvikio_read_tiles``, ``_nvcomp_batch_compress``) all guard - their pool allocations the same way; refusing the allocation - before cupy raises an opaque CUDA OOM keeps the failure mode - consistent (#2107 follows that pattern). - """ - assert '_check_gpu_memory(' in self._src, ( - "_try_nvjpeg2k_batch_decode no longer calls _check_gpu_memory " - "before allocating the per-tile component pool. The fail-fast " - "OOM contract from #2107 is missing." - ) - - -class TestNvjpeg2kLibAbsentShortCircuit: - """When the shared library is missing, the function returns None - without touching cupy / allocating any device memory.""" - - def test_returns_none_when_lib_missing(self, monkeypatch): - """The early-return is the path most test hosts take. Verify - nothing reaches the refactored allocation code on that path so - the refactor does not regress the lib-missing host behaviour. - """ - from xrspatial.geotiff import _gpu_decode - - monkeypatch.setattr(_gpu_decode, '_get_nvjpeg2k', lambda: None) - - result = _gpu_decode._try_nvjpeg2k_batch_decode( - compressed_tiles=[b''], - tile_width=8, - tile_height=8, - dtype=np.dtype('uint8'), - samples=1, - ) - assert result is None - - def test_returns_none_for_unsupported_dtype(self, monkeypatch): - """Unsupported dtypes (e.g. float32) short-circuit before any - device allocation. The refactor moved the pool alloc below the - dtype guard, so this exercises the dtype branch that must still - clean up the handles without touching the pool. - """ - from xrspatial.geotiff import _gpu_decode - - # Fake a lib that succeeds for handle/state/stream/params and - # exposes the destroy entry points so the dtype-guard cleanup - # path runs without crashing. We do not pretend to support the - # actual nvjpeg2k C API beyond what the early-return code uses. 
- class _FakeLib: - def __init__(self): - self.calls = [] - - def nvjpeg2kCreateSimple(self, *_args): - return 0 - - def nvjpeg2kDecodeStateCreate(self, *_args): - return 0 - - def nvjpeg2kStreamCreate(self, *_args): - return 0 - - def nvjpeg2kDecodeParamsCreate(self, *_args): - return 0 - - def nvjpeg2kDecodeParamsDestroy(self, *_args): - self.calls.append('params_destroy') - - def nvjpeg2kStreamDestroy(self, *_args): - self.calls.append('stream_destroy') - - def nvjpeg2kDecodeStateDestroy(self, *_args): - self.calls.append('state_destroy') - - def nvjpeg2kDestroy(self, *_args): - self.calls.append('handle_destroy') - - fake = _FakeLib() - monkeypatch.setattr(_gpu_decode, '_get_nvjpeg2k', lambda: fake) - - # float32 is not in {uint8, uint16, int16} so the helper exits - # before any pool allocation -- and the host needs no cupy - # for this branch to run. - result = _gpu_decode._try_nvjpeg2k_batch_decode( - compressed_tiles=[b''], - tile_width=8, - tile_height=8, - dtype=np.dtype('float32'), - samples=1, - ) - assert result is None - # The dtype-guard branch should have called all four destroy - # functions, mirroring the success-path cleanup. - assert fake.calls == [ - 'params_destroy', - 'stream_destroy', - 'state_destroy', - 'handle_destroy', - ] - - -@pytest.mark.gpu -class TestNvjpeg2kPoolWithCupy: - """Lightweight cupy-only smoke tests for the pool layout. - - These tests do not exercise the real nvJPEG2000 decode (the host - typically has no libnvjpeg2k.so) but they confirm the pool sizing - math and the per-tile slab indexing produce non-overlapping views - into the shared buffer for representative tile sizes. - """ - - def test_pool_slabs_are_non_overlapping(self): - """Tile-component slabs into the pool must not overlap. - - We recompute the slab boundaries the helper uses and verify the - sequence covers exactly the pool size with no gaps and no - double-coverage. 
A regression that miscomputes - ``per_tile_comp_bytes`` would either OOB the pool or fold tile - N onto tile N-1's bytes; this test catches both. - """ - cupy = pytest.importorskip('cupy') - - n_tiles = 4 - tile_width = 32 - tile_height = 32 - samples = 3 - dtype = np.dtype('uint16') - pitch = tile_width * dtype.itemsize - per_tile_comp_bytes = samples * tile_height * pitch - pool = cupy.empty(n_tiles * per_tile_comp_bytes, dtype=cupy.uint8) - - seen = set() - for i in range(n_tiles): - tile_pool_start = i * per_tile_comp_bytes - for c in range(samples): - start = tile_pool_start + c * tile_height * pitch - end = start + tile_height * pitch - for byte in range(start, end): - assert byte not in seen, ( - f"pool byte {byte} appears in two slabs " - f"(tile={i}, comp={c}); per-tile slab math is " - f"wrong." - ) - seen.add(byte) - assert len(seen) == int(pool.nbytes) diff --git a/xrspatial/geotiff/tests/test_nvjpeg_encode_stream_sync_2212.py b/xrspatial/geotiff/tests/test_nvjpeg_encode_stream_sync_2212.py deleted file mode 100644 index 0fc44f190..000000000 --- a/xrspatial/geotiff/tests/test_nvjpeg_encode_stream_sync_2212.py +++ /dev/null @@ -1,215 +0,0 @@ -"""Tests for the nvJPEG / nvJPEG2000 encoder default-stream-sync fix (#2212). - -The fix replaces ``cupy.cuda.Device().synchronize()`` inside the per-tile -encode loops in ``_nvjpeg_batch_encode`` and ``_nvjpeg2k_batch_encode`` -with ``cupy.cuda.Stream.null.synchronize()``. ``Device().synchronize()`` -is a whole-device fence that blocks every CUDA stream; the encode / -retrieve sequence only depends on the default stream the calls were -issued on, so scoping the sync to the null stream lets concurrent work -on other streams continue. - -The matching decoder ``_try_nvjpeg_batch_decode`` already uses -``cupy.cuda.Stream.null.synchronize()`` (the pattern was inconsistent -before this fix); the nvJPEG2000 decode-side regression was already -caught by #2107. 
- -These tests skip the end-to-end encode path because nvJPEG / nvJPEG2000 -shared libraries are rarely installed on developer hosts; the -structural AST checks run regardless and catch a future regression that -re-introduces the device-wide sync. -""" -from __future__ import annotations - -import ast -import inspect - - -def _function_source(func): - src = inspect.getsource(func) - start_line = func.__code__.co_firstlineno - return src, start_line - - -def _parent_map(tree: ast.AST) -> dict: - mapping: dict = {} - for parent in ast.walk(tree): - for child in ast.iter_child_nodes(parent): - mapping[id(child)] = parent - return mapping - - -def _inside_for_loop(node: ast.AST, parents: dict) -> bool: - cur = parents.get(id(node)) - while cur is not None: - if isinstance(cur, ast.For): - return True - cur = parents.get(id(cur)) - return False - - -def _device_synchronize_lines(tree: ast.AST, start_line: int, - parents: dict, *, only_in_loop: bool): - """Return file line numbers of ``cupy.cuda.Device().synchronize()`` calls.""" - out = [] - for node in ast.walk(tree): - if not isinstance(node, ast.Call): - continue - func = node.func - if not isinstance(func, ast.Attribute): - continue - if func.attr != 'synchronize': - continue - parent_call = func.value - if not isinstance(parent_call, ast.Call): - continue - if not isinstance(parent_call.func, ast.Attribute): - continue - if parent_call.func.attr != 'Device': - continue - if only_in_loop and not _inside_for_loop(node, parents): - continue - if not only_in_loop and _inside_for_loop(node, parents): - continue - out.append(start_line + node.lineno - 1) - return out - - -def _stream_null_synchronize_lines(tree: ast.AST, start_line: int, - parents: dict, *, only_in_loop: bool): - """Return file lines of ``cupy.cuda.Stream.null.synchronize()`` calls.""" - out = [] - for node in ast.walk(tree): - if not isinstance(node, ast.Call): - continue - func = node.func - if not isinstance(func, ast.Attribute): - continue - if 
func.attr != 'synchronize': - continue - # Stream.null is an Attribute chain, not a Call -- the value is - # ``cupy.cuda.Stream.null``. - chain = func.value - if isinstance(chain, ast.Call): - continue - if not isinstance(chain, ast.Attribute): - continue - # Walk back to find ``Stream`` in the chain. - found_stream_null = False - cur = chain - if cur.attr == 'null': - inner = cur.value - if isinstance(inner, ast.Attribute) and inner.attr == 'Stream': - found_stream_null = True - if not found_stream_null: - continue - if only_in_loop and not _inside_for_loop(node, parents): - continue - if not only_in_loop and _inside_for_loop(node, parents): - continue - out.append(start_line + node.lineno - 1) - return out - - -class TestNvjpegEncodeStreamSync: - """Structural assertions on the encoder sync fix (no GPU required).""" - - def setup_method(self): - from xrspatial.geotiff import _gpu_decode - self._fn = _gpu_decode._nvjpeg_batch_encode - src, start = _function_source(self._fn) - self._src = src - self._start_line = start - self._tree = ast.parse(src) - self._parents = _parent_map(self._tree) - - def test_no_device_synchronize_inside_encode_loop(self): - offending = _device_synchronize_lines( - self._tree, self._start_line, self._parents, only_in_loop=True, - ) - assert offending == [], ( - "_nvjpeg_batch_encode contains cupy.cuda.Device().synchronize() " - f"calls inside a for-loop at file lines {offending}. The fix " - "in #2212 scopes the per-tile sync to the default stream via " - "cupy.cuda.Stream.null.synchronize() so concurrent CUDA work " - "on other streams is not serialised behind every tile encode." 
- ) - - def test_stream_null_synchronize_present(self): - """At least one ``Stream.null.synchronize()`` call must be present.""" - found = _stream_null_synchronize_lines( - self._tree, self._start_line, self._parents, only_in_loop=True, - ) - assert len(found) >= 1, ( - "_nvjpeg_batch_encode no longer calls " - "cupy.cuda.Stream.null.synchronize() inside the encode loop. " - "The fix in #2212 requires the per-tile sync to be scoped to " - "the default stream so encode/retrieve ordering is preserved " - "without blocking other CUDA streams." - ) - - -class TestNvjpeg2kEncodeStreamSync: - """Structural assertions on the nvJPEG2000 encoder sync fix.""" - - def setup_method(self): - from xrspatial.geotiff import _gpu_decode - self._fn = _gpu_decode._nvjpeg2k_batch_encode - src, start = _function_source(self._fn) - self._src = src - self._start_line = start - self._tree = ast.parse(src) - self._parents = _parent_map(self._tree) - - def test_no_device_synchronize_inside_encode_loop(self): - offending = _device_synchronize_lines( - self._tree, self._start_line, self._parents, only_in_loop=True, - ) - assert offending == [], ( - "_nvjpeg2k_batch_encode contains " - "cupy.cuda.Device().synchronize() calls inside a for-loop at " - f"file lines {offending}. The fix in #2212 scopes the per-tile " - "sync to the default stream via " - "cupy.cuda.Stream.null.synchronize()." - ) - - def test_stream_null_synchronize_present(self): - found = _stream_null_synchronize_lines( - self._tree, self._start_line, self._parents, only_in_loop=True, - ) - assert len(found) >= 1, ( - "_nvjpeg2k_batch_encode no longer calls " - "cupy.cuda.Stream.null.synchronize() inside the encode loop. " - "The fix in #2212 requires the per-tile sync to be scoped to " - "the default stream so encode/retrieve ordering is preserved " - "without blocking other CUDA streams." - ) - - -class TestDecodeReferencePattern: - """The decoder pattern is the contract we mirror. Pin it as the reference. 
- - If ``_try_nvjpeg_batch_decode`` ever swaps back to - ``Device().synchronize()`` inside its loop, the encoder fix would - drift from the codebase's own established pattern; pin it. - """ - - def setup_method(self): - from xrspatial.geotiff import _gpu_decode - self._fn = _gpu_decode._try_nvjpeg_batch_decode - src, start = _function_source(self._fn) - self._src = src - self._start_line = start - self._tree = ast.parse(src) - self._parents = _parent_map(self._tree) - - def test_decoder_uses_stream_null_sync_in_loop(self): - found = _stream_null_synchronize_lines( - self._tree, self._start_line, self._parents, only_in_loop=True, - ) - assert len(found) >= 1, ( - "_try_nvjpeg_batch_decode no longer uses " - "cupy.cuda.Stream.null.synchronize() inside the decode loop. " - "This is the pattern #2212 mirrors for the encoder side; " - "drifting away from it means both sides will need to be " - "re-aligned." - ) diff --git a/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py b/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py deleted file mode 100644 index f03864a14..000000000 --- a/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py +++ /dev/null @@ -1,344 +0,0 @@ -"""Regression tests for issue #1517. - -PR #1515 fixed the ``AttributeError: 'ndarray' object has no attribute -'byteswap'`` crash on big-endian multi-byte TIFFs read via -``read_geotiff_gpu``. After that fix the GPU path no longer raised, but -predictor=2 BE files came back with wrong values: the per-dtype -predictor kernels view the byte buffer as native unsigned integers, so -on a BE file the prefix-sum runs on the wrong integer interpretation -and the differencing produces garbage. - -These tests confirm the GPU output now matches the CPU -``read_to_array`` baseline for predictor=2 BE files across several -dtypes and tile layouts, and that the LE predictor=2 path still -round-trips. 
-""" -from __future__ import annotations - -import importlib.util - -import numpy as np -import pytest - - -def _gpu_available() -> bool: - """True if cupy is importable and CUDA is initialised.""" - if importlib.util.find_spec("cupy") is None: - return False - try: - import cupy - return bool(cupy.cuda.is_available()) - except Exception: - return False - - -_HAS_GPU = _gpu_available() -_HAS_TIFFFILE = importlib.util.find_spec("tifffile") is not None -_gpu_only = pytest.mark.skipif( - not (_HAS_GPU and _HAS_TIFFFILE), - reason="cupy + CUDA + tifffile required", -) - - -def _block_cpu_fallback(monkeypatch): - """Make any call to ``read_to_array`` from ``read_geotiff_gpu`` fail loudly. - - ``read_geotiff_gpu`` returns a cupy-backed array even when its silent CPU - fallback fires (the fallback wraps the CPU result with ``cupy.asarray``), - so ``isinstance(gpu_da.data, cupy.ndarray)`` cannot distinguish the two - paths. ``read_geotiff_gpu`` lives in ``xrspatial.geotiff._backends.gpu`` - and calls the locally bound ``_read_to_array`` symbol there; patching - that binding to raise turns any silent fallback into a hard test failure, - which is what we want when the point of a test is to exercise the actual - GPU decode kernels. - - Tests that legitimately rely on the CPU fallback (e.g. stripped - layouts) must not call this helper. - """ - from xrspatial.geotiff._backends import gpu as gpu_backend - - def _no_fallback(*args, **kwargs): - raise AssertionError( - "read_geotiff_gpu fell back to read_to_array; " - "the GPU decode path was not exercised." 
- ) - - monkeypatch.setattr( - gpu_backend, '_read_to_array', _no_fallback, raising=True, - ) - - -@_gpu_only -def test_gpu_predictor2_big_endian_int32_tiled_reproducer(tmp_path, monkeypatch): - """Exact reproducer from issue #1517: BE int32 tiled deflate + pred=2.""" - import cupy - import tifffile - - from xrspatial.geotiff import read_geotiff_gpu - from xrspatial.geotiff._reader import read_to_array - - rng = np.random.RandomState(20260507) - arr = rng.randint( - -1_000_000, 1_000_000, size=(32, 48), dtype=np.int64 - ).astype(np.int32) - - path = tmp_path / "be_pred2_int32.tif" - tifffile.imwrite( - str(path), arr, byteorder=">", predictor=2, - compression="deflate", tile=(16, 16), - ) - - cpu, _ = read_to_array(str(path)) - np.testing.assert_array_equal(cpu, arr) - - _block_cpu_fallback(monkeypatch) - gpu_da = read_geotiff_gpu(str(path)) - assert isinstance(gpu_da.data, cupy.ndarray) - assert gpu_da.data.dtype == np.dtype(np.int32) - assert gpu_da.data.dtype.isnative - np.testing.assert_array_equal(gpu_da.data.get(), cpu) - - -@_gpu_only -@pytest.mark.parametrize( - "dtype", - [np.uint16, np.int16, np.uint32, np.int32], -) -def test_gpu_predictor2_big_endian_dtypes_tiled(tmp_path, monkeypatch, dtype): - """BE predictor=2 tiled files match CPU baseline across dtypes.""" - import cupy - import tifffile - - from xrspatial.geotiff import read_geotiff_gpu - from xrspatial.geotiff._reader import read_to_array - - rng = np.random.RandomState(20260508) - info = np.iinfo(dtype) - arr = rng.randint( - max(info.min, -1_000_000), - min(info.max, 1_000_000), - size=(32, 48), - dtype=np.int64, - ).astype(dtype) - - path = tmp_path / f"be_pred2_{np.dtype(dtype).name}.tif" - tifffile.imwrite( - str(path), arr, byteorder=">", predictor=2, - compression="deflate", tile=(16, 16), - ) - - cpu, _ = read_to_array(str(path)) - np.testing.assert_array_equal(cpu, arr) - - _block_cpu_fallback(monkeypatch) - gpu_da = read_geotiff_gpu(str(path)) - assert isinstance(gpu_da.data, 
cupy.ndarray) - assert gpu_da.data.dtype == np.dtype(dtype) - assert gpu_da.data.dtype.isnative - np.testing.assert_array_equal(gpu_da.data.get(), cpu) - - -@_gpu_only -def test_gpu_predictor2_big_endian_stripped_uint16(tmp_path): - """Stripped BE predictor=2 files take the CPU fallback but stay correct. - - ``read_geotiff_gpu`` falls back to the CPU reader for stripped - layouts, then transfers the result to GPU. The fix must not regress - that path. - """ - import cupy - import tifffile - - from xrspatial.geotiff import read_geotiff_gpu - from xrspatial.geotiff._reader import read_to_array - - rng = np.random.RandomState(20260509) - arr = rng.randint(0, 60000, size=(32, 48), dtype=np.uint16) - - path = tmp_path / "be_pred2_uint16_strip.tif" - # Omit ``tile`` to get the strip layout. - tifffile.imwrite( - str(path), arr, byteorder=">", predictor=2, compression="deflate", - ) - - cpu, _ = read_to_array(str(path)) - np.testing.assert_array_equal(cpu, arr) - - gpu_da = read_geotiff_gpu(str(path)) - assert isinstance(gpu_da.data, cupy.ndarray) - assert gpu_da.data.dtype == np.dtype(np.uint16) - assert gpu_da.data.dtype.isnative - np.testing.assert_array_equal(gpu_da.data.get(), cpu) - - -@_gpu_only -def test_gpu_predictor2_little_endian_still_works(tmp_path, monkeypatch): - """LE predictor=2 must still round-trip after the BE fix.""" - import cupy - import tifffile - - from xrspatial.geotiff import read_geotiff_gpu - from xrspatial.geotiff._reader import read_to_array - - rng = np.random.RandomState(20260510) - arr = rng.randint( - -1_000_000, 1_000_000, size=(32, 48), dtype=np.int64 - ).astype(np.int32) - - path = tmp_path / "le_pred2_int32.tif" - tifffile.imwrite( - str(path), arr, byteorder="<", predictor=2, - compression="deflate", tile=(16, 16), - ) - - cpu, _ = read_to_array(str(path)) - np.testing.assert_array_equal(cpu, arr) - - _block_cpu_fallback(monkeypatch) - gpu_da = read_geotiff_gpu(str(path)) - assert isinstance(gpu_da.data, cupy.ndarray) - assert 
gpu_da.data.dtype == np.dtype(np.int32) - np.testing.assert_array_equal(gpu_da.data.get(), cpu) - - -@_gpu_only -def test_gpu_predictor3_big_endian_still_works(tmp_path, monkeypatch): - """Floating-point predictor BE must still match CPU after the fix.""" - import cupy - import tifffile - - from xrspatial.geotiff import read_geotiff_gpu - from xrspatial.geotiff._reader import read_to_array - - rng = np.random.RandomState(20260511) - arr = rng.standard_normal((32, 48)).astype(np.float32) - - path = tmp_path / "be_pred3_float32.tif" - tifffile.imwrite( - str(path), arr, byteorder=">", predictor=3, - compression="deflate", tile=(16, 16), - ) - - cpu, _ = read_to_array(str(path)) - np.testing.assert_array_equal(cpu, arr) - - _block_cpu_fallback(monkeypatch) - gpu_da = read_geotiff_gpu(str(path)) - assert isinstance(gpu_da.data, cupy.ndarray) - assert gpu_da.data.dtype == np.dtype(np.float32) - np.testing.assert_array_equal(gpu_da.data.get(), cpu) - - -def test_swap_byte_lanes_numpy_bps2(): - """The byte-swap helper reverses bytes per sample on a numpy buffer.""" - from xrspatial.geotiff._gpu_decode import _swap_byte_lanes - - # uint16 values 0x0102, 0x0304 in BE bytes: 01 02 03 04 - buf = np.array([0x01, 0x02, 0x03, 0x04], dtype=np.uint8) - _swap_byte_lanes(buf, 2) - np.testing.assert_array_equal(buf, np.array([0x02, 0x01, 0x04, 0x03], - dtype=np.uint8)) - - -def test_swap_byte_lanes_numpy_bps4(): - """bps=4: full byte reversal within each 4-byte sample.""" - from xrspatial.geotiff._gpu_decode import _swap_byte_lanes - - buf = np.array([0x01, 0x02, 0x03, 0x04, - 0x05, 0x06, 0x07, 0x08], dtype=np.uint8) - _swap_byte_lanes(buf, 4) - np.testing.assert_array_equal( - buf, np.array([0x04, 0x03, 0x02, 0x01, - 0x08, 0x07, 0x06, 0x05], dtype=np.uint8)) - - -def test_swap_byte_lanes_numpy_bps8(): - """bps=8: full byte reversal within each 8-byte sample.""" - from xrspatial.geotiff._gpu_decode import _swap_byte_lanes - - sample = np.arange(1, 9, dtype=np.uint8) - buf = 
np.tile(sample, 2).copy() - _swap_byte_lanes(buf, 8) - np.testing.assert_array_equal( - buf, np.tile(sample[::-1], 2)) - - -def test_swap_byte_lanes_uint8_noop(): - """bps=1 must be a no-op.""" - from xrspatial.geotiff._gpu_decode import _swap_byte_lanes - - buf = np.array([1, 2, 3], dtype=np.uint8) - _swap_byte_lanes(buf, 1) - np.testing.assert_array_equal(buf, np.array([1, 2, 3], dtype=np.uint8)) - - -def test_swap_byte_lanes_rejects_unsupported_bps(): - """Unsupported bps values raise ValueError rather than corrupt data.""" - from xrspatial.geotiff._gpu_decode import _swap_byte_lanes - - buf = np.zeros(6, dtype=np.uint8) - with pytest.raises(ValueError, match="unsupported bps"): - _swap_byte_lanes(buf, 3) - - -def test_swap_byte_lanes_rejects_misaligned_size(): - """Buffer size must be a multiple of bps.""" - from xrspatial.geotiff._gpu_decode import _swap_byte_lanes - - buf = np.zeros(5, dtype=np.uint8) - with pytest.raises(ValueError, match="not a multiple"): - _swap_byte_lanes(buf, 2) - - -def test_swap_byte_lanes_numpy_is_zero_temp(): - """The numpy path must mutate the original buffer without realloc.""" - from xrspatial.geotiff._gpu_decode import _swap_byte_lanes - - buf = np.array([0x01, 0x02, 0x03, 0x04], dtype=np.uint8) - addr_before = buf.ctypes.data - _swap_byte_lanes(buf, 2) - assert buf.ctypes.data == addr_before - np.testing.assert_array_equal(buf, np.array([0x02, 0x01, 0x04, 0x03], - dtype=np.uint8)) - - -@_gpu_only -@pytest.mark.parametrize("bps,dtype", [ - (2, np.uint16), - (4, np.uint32), - (8, np.uint64), -]) -def test_swap_byte_lanes_cupy_kernel(bps, dtype): - """The cupy path runs the CUDA kernel and matches numpy.byteswap.""" - import cupy - - from xrspatial.geotiff._gpu_decode import _swap_byte_lanes - - rng = np.random.RandomState(20260512 + bps) - n_samples = 1024 - src = rng.randint(0, np.iinfo(dtype).max, size=n_samples, - dtype=np.uint64).astype(dtype) - expected = src.byteswap() # numpy reference, returns swapped copy - - d_buf = 
cupy.asarray(src.view(np.uint8)) - addr_before = int(d_buf.data.ptr) - _swap_byte_lanes(d_buf, bps) - addr_after = int(d_buf.data.ptr) - - assert addr_after == addr_before, "kernel must operate in place" - np.testing.assert_array_equal( - d_buf.get().view(dtype), expected, - ) - - -@_gpu_only -def test_swap_byte_lanes_cupy_uint8_noop(): - """bps=1 leaves cupy buffers untouched (no kernel launch).""" - import cupy - - from xrspatial.geotiff._gpu_decode import _swap_byte_lanes - - src = np.arange(16, dtype=np.uint8) - d_buf = cupy.asarray(src) - _swap_byte_lanes(d_buf, 1) - np.testing.assert_array_equal(d_buf.get(), src) diff --git a/xrspatial/geotiff/tests/test_predictor3_int_dtype_gpu_1933.py b/xrspatial/geotiff/tests/test_predictor3_int_dtype_gpu_1933.py deleted file mode 100644 index 456ab424a..000000000 --- a/xrspatial/geotiff/tests/test_predictor3_int_dtype_gpu_1933.py +++ /dev/null @@ -1,284 +0,0 @@ -"""GPU + dask+GPU backend coverage for issue #1933. - -#1933 added ``_validate_predictor_sample_format`` and wired it into -every IFD-read site (eager numpy, dask, GPU tiled, GPU stripped). The -eager and dask paths are covered by ``test_predictor3_int_dtype_1933``; -this module closes the GPU coverage gap. - -The validator is invoked at two GPU sites: - -* ``_backends/gpu.py:443`` -- the tiled eager GPU read path. Reached when - the file is tiled and ``bps == file_dtype.itemsize * 8`` (so the - bps_mismatch fallback at line 358 does not take over). -* ``_backends/gpu.py:999`` -- the GDS chunked GPU path - (``_read_geotiff_gpu_chunked_gds``). Reached when the file qualifies - for direct disk->GPU decode. - -The stripped GPU path falls back to CPU via ``_read_to_array`` and the -CPU-side validator there fires; the dask+GPU non-GDS path delegates to -``read_geotiff_dask`` which has its own validator (covered by the -existing dask test). The two NEW call sites have no targeted tests. 
- -A regression dropping either of those two validator calls would let -malformed predictor=3 + integer tiled files decode silently to -garbage bytes on GPU. The eager-test asserts the error path is wired -on CPU; this module asserts the GPU dispatcher path is wired too. -""" -from __future__ import annotations - -import importlib.util - -import numpy as np -import pytest - -from xrspatial.geotiff._compression import COMPRESSION_NONE -from xrspatial.geotiff._dtypes import LONG, SHORT, numpy_to_tiff_dtype -from xrspatial.geotiff._header import (TAG_BITS_PER_SAMPLE, TAG_COMPRESSION, TAG_IMAGE_LENGTH, - TAG_IMAGE_WIDTH, TAG_PHOTOMETRIC, TAG_PREDICTOR, - TAG_ROWS_PER_STRIP, TAG_SAMPLE_FORMAT, TAG_SAMPLES_PER_PIXEL, - TAG_STRIP_BYTE_COUNTS, TAG_STRIP_OFFSETS, - TAG_TILE_BYTE_COUNTS, TAG_TILE_LENGTH, TAG_TILE_OFFSETS, - TAG_TILE_WIDTH) -from xrspatial.geotiff._writer import _assemble_standard_layout, _write_stripped - - -def _gpu_available() -> bool: - if importlib.util.find_spec("cupy") is None: - return False - try: - import cupy - - return bool(cupy.cuda.is_available()) - except Exception: - return False - - -_HAS_GPU = _gpu_available() -pytestmark = pytest.mark.skipif( - not _HAS_GPU, reason="cupy + CUDA required", -) - - -def _build_predictor3_uint32_stripped_tiff(arr: np.ndarray) -> bytes: - """Build a stripped TIFF: predictor=3 + uint32 SampleFormat=1. - - Mirrors the helper in ``test_predictor3_int_dtype_1933`` so the GPU - coverage gap can be exercised against the same shape of malformed - file the eager test uses. Compression is COMPRESSION_NONE so the - strip bytes are exactly the raw integer values. 
- """ - rel_off, bc, chunks = _write_stripped(arr, COMPRESSION_NONE, False) - bits_per_sample, _ = numpy_to_tiff_dtype(arr.dtype) - tags = [ - (TAG_IMAGE_WIDTH, LONG, 1, arr.shape[1]), - (TAG_IMAGE_LENGTH, LONG, 1, arr.shape[0]), - (TAG_BITS_PER_SAMPLE, SHORT, 1, bits_per_sample), - (TAG_COMPRESSION, SHORT, 1, COMPRESSION_NONE), - (TAG_PHOTOMETRIC, SHORT, 1, 1), - (TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1), - (TAG_SAMPLE_FORMAT, SHORT, 1, 1), - (TAG_PREDICTOR, SHORT, 1, 3), - (TAG_ROWS_PER_STRIP, SHORT, 1, arr.shape[0]), - (TAG_STRIP_OFFSETS, LONG, len(rel_off), rel_off), - (TAG_STRIP_BYTE_COUNTS, LONG, len(bc), bc), - ] - parts = [(arr, arr.shape[1], arr.shape[0], rel_off, bc, chunks)] - return _assemble_standard_layout(8, [tags], parts, bigtiff=False) - - -def _build_predictor3_uint32_tiled_tiff( - arr: np.ndarray, tile_w: int = 16, tile_h: int = 16, -) -> bytes: - """Build a tiled malformed TIFF: predictor=3 + uint32 SampleFormat=1. - - The tiled layout is the one that reaches the GPU validator at - ``_backends/gpu.py:443`` (no bps_mismatch fallback). Tile size is - 16x16, the smallest tifffile/standard tile size. 
- """ - bits_per_sample, _ = numpy_to_tiff_dtype(arr.dtype) - h, w = arr.shape - - tiles_across = (w + tile_w - 1) // tile_w - tiles_down = (h + tile_h - 1) // tile_h - tiles: list[bytes] = [] - rel_off: list[int] = [] - bc: list[int] = [] - offset = 0 - for tr in range(tiles_down): - for tc in range(tiles_across): - r0 = tr * tile_h - c0 = tc * tile_w - r1 = min(r0 + tile_h, h) - c1 = min(c0 + tile_w, w) - tile_slice = arr[r0:r1, c0:c1] - if tile_slice.shape != (tile_h, tile_w): - padded = np.zeros((tile_h, tile_w), dtype=arr.dtype) - padded[: tile_slice.shape[0], : tile_slice.shape[1]] = ( - tile_slice) - tile_arr = padded - else: - tile_arr = np.ascontiguousarray(tile_slice) - chunk = tile_arr.tobytes() - rel_off.append(offset) - bc.append(len(chunk)) - tiles.append(chunk) - offset += len(chunk) - - tags = [ - (TAG_IMAGE_WIDTH, LONG, 1, w), - (TAG_IMAGE_LENGTH, LONG, 1, h), - (TAG_BITS_PER_SAMPLE, SHORT, 1, bits_per_sample), - (TAG_COMPRESSION, SHORT, 1, COMPRESSION_NONE), - (TAG_PHOTOMETRIC, SHORT, 1, 1), - (TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1), - (TAG_SAMPLE_FORMAT, SHORT, 1, 1), - (TAG_PREDICTOR, SHORT, 1, 3), - (TAG_TILE_WIDTH, LONG, 1, tile_w), - (TAG_TILE_LENGTH, LONG, 1, tile_h), - (TAG_TILE_OFFSETS, LONG, len(rel_off), rel_off), - (TAG_TILE_BYTE_COUNTS, LONG, len(bc), bc), - ] - parts = [(arr, w, h, rel_off, bc, tiles)] - return _assemble_standard_layout(8, [tags], parts, bigtiff=False) - - -class TestGPUEagerRejectsMalformedFile: - """``read_geotiff_gpu`` rejects predictor=3 + integer SampleFormat.""" - - def test_gpu_eager_stripped_raises(self, tmp_path): - from xrspatial.geotiff import read_geotiff_gpu - - arr = np.array( - [[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.uint32) - path = tmp_path / "pred3_uint32_stripped.tif" - path.write_bytes(_build_predictor3_uint32_stripped_tiff(arr)) - with pytest.raises(ValueError, match="Predictor=3"): - read_geotiff_gpu(str(path)) - - def test_gpu_eager_tiled_raises(self, tmp_path): - """Tiled layout hits the tiled GPU 
validator at gpu.py:443. - - Distinct from the stripped fallback path -- a regression - dropping the line 443 call would leak through this test - because the stripped path's validator lives in - ``_read_to_array`` and would still raise. - """ - from xrspatial.geotiff import read_geotiff_gpu - - arr = np.arange(256, dtype=np.uint32).reshape(16, 16) - path = tmp_path / "pred3_uint32_tiled.tif" - path.write_bytes(_build_predictor3_uint32_tiled_tiff(arr)) - with pytest.raises(ValueError, match="Predictor=3"): - read_geotiff_gpu(str(path)) - - def test_gpu_dispatcher_eager_raises(self, tmp_path): - """``open_geotiff(gpu=True)`` dispatcher rejects the file.""" - from xrspatial.geotiff import open_geotiff - - arr = np.arange(64, dtype=np.uint32).reshape(8, 8) - path = tmp_path / "pred3_uint32_dispatch.tif" - path.write_bytes(_build_predictor3_uint32_stripped_tiff(arr)) - with pytest.raises(ValueError, match="Predictor=3"): - open_geotiff(str(path), gpu=True) - - -class TestGPUChunkedRejectsMalformedFile: - """The dask+GPU paths also reject predictor=3 + integer.""" - - def test_read_geotiff_gpu_chunked_stripped_raises(self, tmp_path): - from xrspatial.geotiff import read_geotiff_gpu - - arr = np.arange(64, dtype=np.uint32).reshape(8, 8) - path = tmp_path / "pred3_uint32_chunked_str.tif" - path.write_bytes(_build_predictor3_uint32_stripped_tiff(arr)) - with pytest.raises(ValueError, match="Predictor=3"): - read_geotiff_gpu(str(path), chunks=4) - - def test_read_geotiff_gpu_chunked_tiled_raises(self, tmp_path): - """Tiled chunked path with KvikIO available exercises gpu.py:999. - - Gated on ``kvikio`` so the GDS qualification path - (``_read_geotiff_gpu_chunked_gds``) is the branch actually - taken. Without KvikIO the dispatcher falls back to the CPU - dask path and the line-999 validator is never reached, which - leaves the targeted call site untested. The CPU fallback - rejection is already covered by the eager/dask tests in - ``test_predictor3_int_dtype_1933``. 
- """ - pytest.importorskip("kvikio") - - from xrspatial.geotiff import read_geotiff_gpu - - arr = np.arange(256, dtype=np.uint32).reshape(16, 16) - path = tmp_path / "pred3_uint32_chunked_tiled.tif" - path.write_bytes(_build_predictor3_uint32_tiled_tiff(arr)) - with pytest.raises(ValueError, match="Predictor=3"): - read_geotiff_gpu(str(path), chunks=16) - - def test_open_geotiff_chunks_gpu_dispatcher_raises(self, tmp_path): - """``open_geotiff(chunks=, gpu=True)`` dispatcher rejects the file.""" - from xrspatial.geotiff import open_geotiff - - arr = np.arange(256, dtype=np.uint32).reshape(16, 16) - path = tmp_path / "pred3_uint32_chunked_dispatch.tif" - path.write_bytes(_build_predictor3_uint32_tiled_tiff(arr)) - with pytest.raises(ValueError, match="Predictor=3"): - open_geotiff(str(path), chunks=8, gpu=True) - - -class TestValidPredictor3StillWorksOnGPU: - """A legitimate predictor=3 + float32 tiled file still decodes on GPU.""" - - def test_predictor3_float32_gpu_round_trip(self, tmp_path): - from xrspatial.geotiff import read_geotiff_gpu, to_geotiff - - arr = np.linspace(-1.0, 1.0, 256, dtype=np.float32).reshape(16, 16) - path = tmp_path / "pred3_float32_tiled.tif" - to_geotiff( - arr, str(path), compression="deflate", predictor=3, - tiled=True, tile_size=16, - ) - - result = read_geotiff_gpu(str(path)) - assert result.dtype == np.float32 - np.testing.assert_array_equal(result.data.get(), arr) - - def test_predictor3_float32_dask_gpu_round_trip(self, tmp_path): - from xrspatial.geotiff import read_geotiff_gpu, to_geotiff - - arr = np.linspace(-1.0, 1.0, 256, dtype=np.float32).reshape(16, 16) - path = tmp_path / "pred3_float32_dask.tif" - to_geotiff( - arr, str(path), compression="deflate", predictor=3, - tiled=True, tile_size=16, - ) - - result = read_geotiff_gpu(str(path), chunks=8) - assert result.dtype == np.float32 - np.testing.assert_array_equal(result.compute().data.get(), arr) - - -class TestErrorMessageStable: - """The GPU error wording matches the 
eager/dask wording. - - Cross-backend error parity is a real concern -- a regression that - fired the validator on GPU but with a different message would force - callers to special-case the backend on ``except ValueError``. - """ - - def test_gpu_error_message_matches_eager(self, tmp_path): - from xrspatial.geotiff import open_geotiff, read_geotiff_gpu - - arr = np.arange(64, dtype=np.uint32).reshape(8, 8) - path = tmp_path / "pred3_uint32_msg.tif" - path.write_bytes(_build_predictor3_uint32_stripped_tiff(arr)) - - with pytest.raises(ValueError) as exc_eager: - open_geotiff(str(path)) - with pytest.raises(ValueError) as exc_gpu: - read_geotiff_gpu(str(path)) - - assert str(exc_eager.value) == str(exc_gpu.value), ( - "GPU and eager paths must surface the same Predictor=3 " - "error message so callers can use a single except branch." - ) diff --git a/xrspatial/geotiff/tests/unit/test_predictor.py b/xrspatial/geotiff/tests/unit/test_predictor.py index 3d1216b87..39ad94f36 100644 --- a/xrspatial/geotiff/tests/unit/test_predictor.py +++ b/xrspatial/geotiff/tests/unit/test_predictor.py @@ -30,10 +30,9 @@ predictor=2 on smooth float data (opt-in via env var). GPU predictor variants are intentionally out of scope here -- the -dedicated GPU predictor files -(``test_predictor2_big_endian_gpu_1517.py``, -``test_predictor3_int_dtype_gpu_1933.py``) are folded into the GPU -cluster #2438. GPU regressions that lived alongside CPU tests in the +dedicated GPU predictor coverage lives in +``xrspatial/geotiff/tests/gpu/test_codec.py`` (folded under epic +#2438). GPU regressions that lived alongside CPU tests in the old files (predictor=2 int8 tiled/stripped, predictor=3 BE GPU, predictor=2/3 multi-sample GPU parity) move with this consolidation so the CPU and GPU coverage stay co-located by behaviour rather than