diff --git a/docs/source/reference/release_gate_geotiff.rst b/docs/source/reference/release_gate_geotiff.rst
index 8c4433b32..3d81ddfd2 100644
--- a/docs/source/reference/release_gate_geotiff.rst
+++ b/docs/source/reference/release_gate_geotiff.rst
@@ -683,7 +683,7 @@ Internal-only surfaces (not promised)
        ``allow_internal_only_jpeg=True``; not covered by
        ``allow_experimental_codecs``.
      - ``xrspatial/geotiff/tests/unit/test_photometric.py``,
-       ``xrspatial/geotiff/tests/test_gpu_jpeg_interop_reject_issue_D_1845.py``
+       ``xrspatial/geotiff/tests/gpu/test_codec.py``
      - `#2340`_
 
 Cross-cutting CI gates
diff --git a/xrspatial/geotiff/tests/CLUSTER_AUDIT_GPU_C.md b/xrspatial/geotiff/tests/CLUSTER_AUDIT_GPU_C.md
new file mode 100644
index 000000000..58d7a446f
--- /dev/null
+++ b/xrspatial/geotiff/tests/CLUSTER_AUDIT_GPU_C.md
@@ -0,0 +1,153 @@
+# Cluster 14 (Sub-PR C) audit: GPU codec test consolidation
+
+Folds 11 GPU codec test files into `xrspatial/geotiff/tests/gpu/test_codec.py`.
+Baseline collection: 71 tests across the 11 source files. Consolidated
+collection: 71 tests in the new file. Run-time outcome on this checkout
+(no nvCOMP / nvJPEG / lerc beyond what the host ships): 68 passed, 3
+skipped.
+
+Source -> destination mapping (old names gain the source issue / `_p3` suffix;
+already-suffixed `_1950` tests and `TestGpuLercValidMask` keep their names):
+
+## test_nvcomp_batch_compress_batched_1712.py
+
+| old `file::test`                                    | new `test_codec.py::test_id`                                  |
+| --------------------------------------------------- | ------------------------------------------------------------- |
+| `test_no_per_tile_cupy_empty_in_compressed_pool`    | `test_no_per_tile_cupy_empty_in_compressed_pool_1712`         |
+| `test_no_per_tile_get_in_result_loop`               | `test_no_per_tile_get_in_result_loop_1712`                    |
+| `test_gpu_write_roundtrip_after_batched_compress[deflate]` | `test_gpu_write_roundtrip_after_batched_compress_1712[deflate]` |
+| `test_gpu_write_roundtrip_after_batched_compress[zstd]`    | `test_gpu_write_roundtrip_after_batched_compress_1712[zstd]`    |
+| `test_gpu_write_zero_tile_edge_case`                | `test_gpu_write_zero_tile_edge_case_1712`                     |
+
+## test_nvcomp_batch_upload_p3.py
+
+| old `file::test`                                       | new `test_codec.py::test_id`                            |
+| ------------------------------------------------------ | ------------------------------------------------------- |
+| `test_nvcomp_batch_upload_correctness[256-tile0]`      | `test_nvcomp_batch_upload_correctness_p3[256-tile0]`    |
+| `test_nvcomp_batch_upload_correctness[1024-tile1]`     | `test_nvcomp_batch_upload_correctness_p3[1024-tile1]`   |
+| `test_nvcomp_batch_upload_correctness[2048-tile2]`     | `test_nvcomp_batch_upload_correctness_p3[2048-tile2]`   |
+| `test_nvcomp_kvikio_fallback_skips_zstd`               | `test_nvcomp_kvikio_fallback_skips_zstd_p3`             |
+| `test_nvcomp_batch_upload_perf_regression_guard`       | `test_nvcomp_batch_upload_perf_regression_guard_p3`     |
+
+## test_nvcomp_decompress_cumsum_offsets_1950.py
+
+| old `file::test`                                          | new `test_codec.py::test_id`                            |
+| --------------------------------------------------------- | ------------------------------------------------------- |
+| `test_nvcomp_decompress_uses_cumsum_for_offsets_1950`     | `test_nvcomp_decompress_uses_cumsum_for_offsets_1950`   |
+| `test_cumsum_matches_loop_prefix_sum_1950`                | `test_cumsum_matches_loop_prefix_sum_1950`              |
+| `test_nvcomp_batch_decompress_roundtrip_1950`             | `test_nvcomp_batch_decompress_roundtrip_1950`           |
+
+## test_nvcomp_from_device_bufs_single_alloc_1659.py
+
+| old `file::test`                                                | new `test_codec.py::test_id`                                       |
+| --------------------------------------------------------------- | ------------------------------------------------------------------ |
+| `test_unsupported_codec_short_circuits_before_allocation`       | `test_unsupported_codec_short_circuits_before_allocation_1659`     |
+| `test_no_nvcomp_lib_returns_none`                               | `test_no_nvcomp_lib_returns_none_1659`                             |
+| `test_memory_guard_runs_with_full_decomp_size`                  | `test_memory_guard_runs_with_full_decomp_size_1659`                |
+| `test_zstd_decompress_roundtrip_returns_single_contiguous_buffer` | `test_zstd_decompress_roundtrip_returns_single_contiguous_buffer_1659` |
+| `test_no_orphan_decomp_buffers_after_call`                      | `test_no_orphan_decomp_buffers_after_call_1659`                    |
+
+## test_nvjpeg_encode_stream_sync_2212.py
+
+| old `file::test`                                                          | new `test_codec.py::test_id`                                                  |
+| ------------------------------------------------------------------------- | ----------------------------------------------------------------------------- |
+| `TestNvjpegEncodeStreamSync::test_no_device_synchronize_inside_encode_loop` | `TestNvjpegEncodeStreamSync_2212::test_no_device_synchronize_inside_encode_loop` |
+| `TestNvjpegEncodeStreamSync::test_stream_null_synchronize_present`        | `TestNvjpegEncodeStreamSync_2212::test_stream_null_synchronize_present`       |
+| `TestNvjpeg2kEncodeStreamSync::test_no_device_synchronize_inside_encode_loop` | `TestNvjpeg2kEncodeStreamSync_2212::test_no_device_synchronize_inside_encode_loop` |
+| `TestNvjpeg2kEncodeStreamSync::test_stream_null_synchronize_present`      | `TestNvjpeg2kEncodeStreamSync_2212::test_stream_null_synchronize_present`     |
+| `TestDecodeReferencePattern::test_decoder_uses_stream_null_sync_in_loop`  | `TestDecodeReferencePattern_2212::test_decoder_uses_stream_null_sync_in_loop` |
+
+## test_nvjpeg2k_single_alloc_2107.py
+
+| old `file::test`                                                            | new `test_codec.py::test_id`                                                 |
+| --------------------------------------------------------------------------- | ---------------------------------------------------------------------------- |
+| `TestNvjpeg2kSingleAllocStructural::test_no_cupy_empty_inside_decode_loop`  | `TestNvjpeg2kSingleAllocStructural_2107::test_no_cupy_empty_inside_decode_loop` |
+| `TestNvjpeg2kSingleAllocStructural::test_no_device_synchronize_inside_decode_loop` | `TestNvjpeg2kSingleAllocStructural_2107::test_no_device_synchronize_inside_decode_loop` |
+| `TestNvjpeg2kSingleAllocStructural::test_pool_allocation_present`           | `TestNvjpeg2kSingleAllocStructural_2107::test_pool_allocation_present`        |
+| `TestNvjpeg2kSingleAllocStructural::test_check_gpu_memory_guard_present`    | `TestNvjpeg2kSingleAllocStructural_2107::test_check_gpu_memory_guard_present` |
+| `TestNvjpeg2kLibAbsentShortCircuit::test_returns_none_when_lib_missing`     | `TestNvjpeg2kLibAbsentShortCircuit_2107::test_returns_none_when_lib_missing`  |
+| `TestNvjpeg2kLibAbsentShortCircuit::test_returns_none_for_unsupported_dtype` | `TestNvjpeg2kLibAbsentShortCircuit_2107::test_returns_none_for_unsupported_dtype` |
+| `TestNvjpeg2kPoolWithCupy::test_pool_slabs_are_non_overlapping`             | `TestNvjpeg2kPoolWithCupy_2107::test_pool_slabs_are_non_overlapping`          |
+
+Note: the source had `@pytest.mark.gpu` on `TestNvjpeg2kPoolWithCupy`,
+which raised an `UnknownMark` warning because the project does not
+register a `gpu` mark. The new section uses `@requires_gpu` from
+`_helpers/markers.py` -- same skip behaviour, no warning.
+
+## test_jpeg_gpu_1549.py
+
+| old `file::test`                                | new `test_codec.py::test_id`                          |
+| ----------------------------------------------- | ----------------------------------------------------- |
+| `test_rgb_jpeg_gpu_no_crash`                    | `test_rgb_jpeg_gpu_no_crash_1549`                     |
+| `test_rgb_jpeg_gpu_matches_cpu`                 | `test_rgb_jpeg_gpu_matches_cpu_1549`                  |
+| `test_grayscale_jpeg_gpu_matches_cpu`           | `test_grayscale_jpeg_gpu_matches_cpu_1549`            |
+| `test_cuda_context_survives_after_jpeg_gpu_read` | `test_cuda_context_survives_after_jpeg_gpu_read_1549` |
+
+## test_lerc_valid_mask_gpu.py
+
+| old `file::test`                                       | new `test_codec.py::test_id`                            |
+| ------------------------------------------------------ | ------------------------------------------------------- |
+| `TestGpuLercValidMask::test_float32_nan_nodata`        | `TestGpuLercValidMask::test_float32_nan_nodata`         |
+| `TestGpuLercValidMask::test_float32_sentinel_nodata`   | `TestGpuLercValidMask::test_float32_sentinel_nodata`    |
+| `TestGpuLercValidMask::test_uint16_sentinel_nodata`    | `TestGpuLercValidMask::test_uint16_sentinel_nodata`     |
+| `TestGpuLercValidMask::test_no_mask_roundtrip_bitexact` | `TestGpuLercValidMask::test_no_mask_roundtrip_bitexact` |
+
+## test_predictor2_big_endian_gpu_1517.py
+
+| old `file::test`                                            | new `test_codec.py::test_id`                                 |
+| ----------------------------------------------------------- | ------------------------------------------------------------ |
+| `test_gpu_predictor2_big_endian_int32_tiled_reproducer`     | `test_gpu_predictor2_big_endian_int32_tiled_reproducer_1517` |
+| `test_gpu_predictor2_big_endian_dtypes_tiled[uint16]`       | `test_gpu_predictor2_big_endian_dtypes_tiled_1517[uint16]`   |
+| `test_gpu_predictor2_big_endian_dtypes_tiled[int16]`        | `test_gpu_predictor2_big_endian_dtypes_tiled_1517[int16]`    |
+| `test_gpu_predictor2_big_endian_dtypes_tiled[uint32]`       | `test_gpu_predictor2_big_endian_dtypes_tiled_1517[uint32]`   |
+| `test_gpu_predictor2_big_endian_dtypes_tiled[int32]`        | `test_gpu_predictor2_big_endian_dtypes_tiled_1517[int32]`    |
+| `test_gpu_predictor2_big_endian_stripped_uint16`            | `test_gpu_predictor2_big_endian_stripped_uint16_1517`        |
+| `test_gpu_predictor2_little_endian_still_works`             | `test_gpu_predictor2_little_endian_still_works_1517`         |
+| `test_gpu_predictor3_big_endian_still_works`                | `test_gpu_predictor3_big_endian_still_works_1517`            |
+| `test_swap_byte_lanes_numpy_bps2`                           | `test_swap_byte_lanes_numpy_bps2_1517`                       |
+| `test_swap_byte_lanes_numpy_bps4`                           | `test_swap_byte_lanes_numpy_bps4_1517`                       |
+| `test_swap_byte_lanes_numpy_bps8`                           | `test_swap_byte_lanes_numpy_bps8_1517`                       |
+| `test_swap_byte_lanes_uint8_noop`                           | `test_swap_byte_lanes_uint8_noop_1517`                       |
+| `test_swap_byte_lanes_rejects_unsupported_bps`              | `test_swap_byte_lanes_rejects_unsupported_bps_1517`          |
+| `test_swap_byte_lanes_rejects_misaligned_size`              | `test_swap_byte_lanes_rejects_misaligned_size_1517`          |
+| `test_swap_byte_lanes_numpy_is_zero_temp`                   | `test_swap_byte_lanes_numpy_is_zero_temp_1517`               |
+| `test_swap_byte_lanes_cupy_kernel[2-uint16]`                | `test_swap_byte_lanes_cupy_kernel_1517[2-uint16]`            |
+| `test_swap_byte_lanes_cupy_kernel[4-uint32]`                | `test_swap_byte_lanes_cupy_kernel_1517[4-uint32]`            |
+| `test_swap_byte_lanes_cupy_kernel[8-uint64]`                | `test_swap_byte_lanes_cupy_kernel_1517[8-uint64]`            |
+| `test_swap_byte_lanes_cupy_uint8_noop`                      | `test_swap_byte_lanes_cupy_uint8_noop_1517`                  |
+
+## test_predictor3_int_dtype_gpu_1933.py
+
+| old `file::test`                                                              | new `test_codec.py::test_id`                                                   |
+| ----------------------------------------------------------------------------- | ------------------------------------------------------------------------------ |
+| `TestGPUEagerRejectsMalformedFile::test_gpu_eager_stripped_raises`            | `TestGPUEagerRejectsMalformedFile_1933::test_gpu_eager_stripped_raises`        |
+| `TestGPUEagerRejectsMalformedFile::test_gpu_eager_tiled_raises`               | `TestGPUEagerRejectsMalformedFile_1933::test_gpu_eager_tiled_raises`           |
+| `TestGPUEagerRejectsMalformedFile::test_gpu_dispatcher_eager_raises`          | `TestGPUEagerRejectsMalformedFile_1933::test_gpu_dispatcher_eager_raises`      |
+| `TestGPUChunkedRejectsMalformedFile::test_read_geotiff_gpu_chunked_stripped_raises` | `TestGPUChunkedRejectsMalformedFile_1933::test_read_geotiff_gpu_chunked_stripped_raises` |
+| `TestGPUChunkedRejectsMalformedFile::test_read_geotiff_gpu_chunked_tiled_raises` | `TestGPUChunkedRejectsMalformedFile_1933::test_read_geotiff_gpu_chunked_tiled_raises` |
+| `TestGPUChunkedRejectsMalformedFile::test_open_geotiff_chunks_gpu_dispatcher_raises` | `TestGPUChunkedRejectsMalformedFile_1933::test_open_geotiff_chunks_gpu_dispatcher_raises` |
+| `TestValidPredictor3StillWorksOnGPU::test_predictor3_float32_gpu_round_trip`  | `TestValidPredictor3StillWorksOnGPU_1933::test_predictor3_float32_gpu_round_trip` |
+| `TestValidPredictor3StillWorksOnGPU::test_predictor3_float32_dask_gpu_round_trip` | `TestValidPredictor3StillWorksOnGPU_1933::test_predictor3_float32_dask_gpu_round_trip` |
+| `TestErrorMessageStable::test_gpu_error_message_matches_eager`                | `TestErrorMessageStable_1933::test_gpu_error_message_matches_eager`            |
+
+## test_gpu_jpeg_interop_reject_issue_D_1845.py
+
+| old `file::test`                                                  | new `test_codec.py::test_id`                                       |
+| ----------------------------------------------------------------- | ------------------------------------------------------------------ |
+| `test_write_geotiff_gpu_rejects_jpeg_without_opt_in`              | `test_write_geotiff_gpu_rejects_jpeg_without_opt_in_1845`          |
+| `test_write_geotiff_gpu_rejects_jpeg_message_mentions_alternatives` | `test_write_geotiff_gpu_rejects_jpeg_message_mentions_alternatives_1845` |
+| `test_write_geotiff_gpu_rejects_jpeg_case_insensitive`            | `test_write_geotiff_gpu_rejects_jpeg_case_insensitive_1845`        |
+| `test_write_geotiff_gpu_jpeg_opt_in_emits_warning`                | `test_write_geotiff_gpu_jpeg_opt_in_emits_warning_1845`            |
+| `test_write_geotiff_gpu_non_jpeg_unaffected_by_flag`              | `test_write_geotiff_gpu_non_jpeg_unaffected_by_flag_1845`          |
+
+## Cross-references updated
+
+* `docs/source/reference/release_gate_geotiff.rst` -- codec ``jpeg``
+  row now cites `gpu/test_codec.py` instead of the deleted
+  `test_gpu_jpeg_interop_reject_issue_D_1845.py`.
+* `xrspatial/geotiff/tests/unit/test_predictor.py` -- the GPU
+  predictor file pointers in the module docstring now point at
+  `gpu/test_codec.py`.
+
+This audit file is deleted in a final pre-merge commit on this branch
+(epic #2424 hard gate).
diff --git a/xrspatial/geotiff/tests/gpu/__init__.py b/xrspatial/geotiff/tests/gpu/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/xrspatial/geotiff/tests/gpu/test_codec.py b/xrspatial/geotiff/tests/gpu/test_codec.py
new file mode 100644
index 000000000..c33e80f63
--- /dev/null
+++ b/xrspatial/geotiff/tests/gpu/test_codec.py
@@ -0,0 +1,1904 @@
+"""GPU codec coverage: nvCOMP, nvJPEG / nvJPEG2000, JPEG, LERC, predictor.
+
+Cluster 14 of long-tail epic #2424 (Sub-PR C) folds the GPU codec test
+files into one home. Sections in source-order below:
+
+* ``test_nvcomp_batch_compress_batched_1712.py`` -- batched nvCOMP
+  compress: single contiguous output alloc + single batched D2H concat.
+* ``test_nvcomp_batch_upload_p3.py`` -- batched H2D upload on the
+  nvCOMP decompress side; cumulative-sum offset pattern.
+* ``test_nvcomp_decompress_cumsum_offsets_1950.py`` -- decompress-side
+  prefix-sum offsets via ``np.cumsum`` rather than a Python loop.
+* ``test_nvcomp_from_device_bufs_single_alloc_1659.py`` -- single
+  contiguous output buffer for the device-buf nvCOMP path.
+* ``test_nvjpeg_encode_stream_sync_2212.py`` -- the per-tile encode-
+  loop sync uses ``Stream.null.synchronize()`` (not
+  ``Device().synchronize()``).
+* ``test_nvjpeg2k_single_alloc_2107.py`` -- pool the per-tile alloc +
+  per-tile sync in ``_try_nvjpeg2k_batch_decode``.
+* ``test_jpeg_gpu_1549.py`` -- nvJPEG output-format constants match
+  the SDK; cross-backend pixel parity + context survival.
+* ``test_lerc_valid_mask_gpu.py`` -- the GPU LERC tile-decode path
+  honours the file's valid-mask, matching the CPU reader.
+* ``test_predictor2_big_endian_gpu_1517.py`` -- byte-swap helper +
+  predictor=2 BE files match CPU baseline.
+* ``test_predictor3_int_dtype_gpu_1933.py`` -- predictor=3 + integer
+  SampleFormat is rejected at every GPU entry point.
+* ``test_gpu_jpeg_interop_reject_issue_D_1845.py`` -- the GPU writer
+  rejects ``compression='jpeg'`` by default and emits a
+  ``GeoTIFFFallbackWarning`` on the opt-in.
+
+GPU-dependent tests are gated through the shared ``requires_gpu`` /
+``gpu_available()`` helpers from ``_helpers/markers.py`` (or an explicit
+skip); source-inspection guards are ungated. Helpers carry the source issue
+suffix (e.g. ``_write_jpeg_rgb_tiff_1549``) so sections stay collision-free.
+"""
+from __future__ import annotations
+
+import ast
+import importlib.util
+import inspect
+import os
+import pathlib
+import re
+import tempfile
+import time
+import uuid
+import warnings as _warnings
+
+import numpy as np
+import pytest
+import xarray as xr
+
+from .._helpers.markers import gpu_available, requires_gpu
+
+# Aliased so the per-file ``_gpu_only`` decorators read the same as
+# before the consolidation; the underlying check is the shared
+# ``requires_gpu`` marker.
+_gpu_only = requires_gpu
+needs_cupy = requires_gpu
+
+# A handful of sections additionally gate on optional libraries (tifffile,
+# imagecodecs, nvJPEG, etc.). Those gates layer on top of ``requires_gpu``
+# below; they need separate skipif decorators because the missing-library
+# reason text is informative.
+_HAS_GPU = gpu_available()
+_HAS_TIFFFILE = importlib.util.find_spec("tifffile") is not None
+_HAS_PIL = importlib.util.find_spec("PIL") is not None
+_HAS_IMAGECODECS = importlib.util.find_spec("imagecodecs") is not None
+
+
+# ============================================================
+# Section: nvCOMP batched compress (#1712)
+# ============================================================
+# Source: test_nvcomp_batch_compress_batched_1712.py
+#
+# The pre-fix function allocated compressed-output device buffers one
+# ``cupy.empty`` per tile and then read each tile back to host with one
+# ``.get()`` per tile. Both patterns serialised on the default CUDA
+# stream and were dominant in large-N writes. The fix folds both into
+# a single contiguous device allocation + a single batched D2H concat-
+# and-``.get()``. These tests pin the new shape and confirm the deflate
+# / zstd GPU write paths still round-trip end-to-end.
+
+# nvCOMP is the entry point that exercises this code path.
+from xrspatial.geotiff import _gpu_decode  # noqa: E402
+
+
+def test_no_per_tile_cupy_empty_in_compressed_pool_1712():
+    """The per-tile cupy.empty list comprehension is gone (#1712)."""
+    source = inspect.getsource(_gpu_decode._nvcomp_batch_compress)
+    assert "cupy.empty(max_cs, dtype=cupy.uint8) for _ in range" not in source, (
+        "_nvcomp_batch_compress regressed to per-tile cupy.empty "
+        "allocations for the compressed output pool. See #1712."
+    )
+
+
+def test_no_per_tile_get_in_result_loop_1712():
+    """The per-tile ``d_comp_bufs[i][:cs].get().tobytes()`` is gone (#1712)."""
+    source = inspect.getsource(_gpu_decode._nvcomp_batch_compress)
+    bad_fragment = "d_comp_bufs[i][:cs].get().tobytes()"
+    assert bad_fragment not in source, (
+        "_nvcomp_batch_compress regressed to per-tile .get().tobytes() "
+        "D2H readback. See #1712."
+    )
+
+
+@requires_gpu
+@pytest.mark.parametrize("compression", ["deflate", "zstd"])
+def test_gpu_write_roundtrip_after_batched_compress_1712(compression):
+    """GPU compress path round-trips uncorrupted for deflate + zstd."""
+    import cupy
+
+    from xrspatial.geotiff import open_geotiff, write_geotiff_gpu
+
+    rng = np.random.default_rng(seed=1712)
+    arr_cpu = rng.random((512, 512), dtype=np.float32)
+    arr_gpu = cupy.asarray(arr_cpu)
+    darr = xr.DataArray(arr_gpu, dims=["y", "x"])
+
+    with tempfile.TemporaryDirectory(prefix="nvcomp_batch_1712_") as td:
+        path = os.path.join(td, f"roundtrip_{compression}.tif")
+        try:
+            write_geotiff_gpu(
+                darr, path,
+                compression=compression,
+                tiled=True,
+                tile_size=64,
+            )
+        except RuntimeError as e:
+            pytest.skip(f"nvCOMP unavailable for {compression}: {e}")
+
+        back = open_geotiff(path)
+        np.testing.assert_allclose(back.values, arr_cpu, rtol=0, atol=0)
+
+
+@requires_gpu
+def test_gpu_write_zero_tile_edge_case_1712():
+    """0-tile edge-case guard: a single 32x32 zstd tile still reads back."""
+    import cupy
+
+    from xrspatial.geotiff import open_geotiff, write_geotiff_gpu
+
+    arr_gpu = cupy.zeros((32, 32), dtype=cupy.float32)
+    darr = xr.DataArray(arr_gpu, dims=["y", "x"])
+    with tempfile.TemporaryDirectory(prefix="nvcomp_batch_1712_") as td:
+        path = os.path.join(td, "tiny.tif")
+        try:
+            write_geotiff_gpu(darr, path, compression="zstd",
+                              tiled=True, tile_size=32)
+        except RuntimeError as e:
+            pytest.skip(f"nvCOMP unavailable: {e}")
+        back = open_geotiff(path)
+        assert back.shape == (32, 32)
+
+
+# ============================================================
+# Section: nvCOMP batched H2D upload (P3 perf audit)
+# ============================================================
+# Source: test_nvcomp_batch_upload_p3.py
+#
+# The decompress-side fast path used to do one ``cupy.asarray`` per
+# compressed tile. The fix concatenates all tiles into a single host
+# buffer, performs one H2D transfer, and derives per-tile device
+# pointers via ``base_ptr + offsets``.
+
+
+def _kvikio_nvcomp_importable_p3() -> bool:
+    """True iff ``import kvikio.nvcomp`` actually succeeds."""
+    try:
+        import kvikio.nvcomp  # noqa: F401
+    except Exception:
+        return False
+    return True
+
+
+def _nvcomp_path_available_p3() -> bool:
+    """True when at least one nvCOMP backend is loadable on this host."""
+    if not _HAS_GPU:
+        return False
+    try:
+        from xrspatial.geotiff._gpu_decode import _get_nvcomp
+    except Exception:
+        return False
+    if _get_nvcomp() is not None:
+        return True
+    return _kvikio_nvcomp_importable_p3()
+
+
+_HAS_NVCOMP_P3 = _nvcomp_path_available_p3()
+_nvcomp_only_p3 = pytest.mark.skipif(
+    not (_HAS_GPU and _HAS_TIFFFILE and _HAS_NVCOMP_P3),
+    reason="cupy + CUDA + tifffile + (libnvcomp or kvikio.nvcomp) required",
+)
+
+
+def _write_deflate_tiled_p3(path, arr, tile=(256, 256)):
+    import tifffile
+    tifffile.imwrite(
+        str(path), arr, compression="deflate", tile=tile,
+    )
+
+
+def _wrap_nvcomp_with_call_recorder_p3(monkeypatch):
+    """Replace ``_try_nvcomp_batch_decompress`` with a recording wrapper."""
+    from xrspatial.geotiff import _gpu_decode
+
+    records: list[tuple[int, bool]] = []
+    original = _gpu_decode._try_nvcomp_batch_decompress
+
+    def _recording(compressed_tiles, tile_bytes, compression):
+        result = original(compressed_tiles, tile_bytes, compression)
+        records.append((compression, result is not None))
+        return result
+
+    monkeypatch.setattr(
+        _gpu_decode,
+        '_try_nvcomp_batch_decompress',
+        _recording,
+        raising=True,
+    )
+    return records
+
+
+@_nvcomp_only_p3
+@pytest.mark.parametrize("size,tile", [
+    (256, (128, 128)),    # 4 tiles
+    (1024, (256, 256)),   # 16 tiles
+    (2048, (128, 128)),   # 256 tiles -- matches the audit measurement
+])
+def test_nvcomp_batch_upload_correctness_p3(tmp_path, monkeypatch, size, tile):
+    """GPU decode of Deflate-tiled TIFFs is bit-exact vs CPU."""
+    from xrspatial.geotiff import read_geotiff_gpu
+    from xrspatial.geotiff._reader import read_to_array
+
+    rng = np.random.RandomState(20260508)
+    arr = rng.randint(0, 4096, size=(size, size), dtype=np.uint16)
+
+    name = f"deflate_{size}_{tile[0]}_{uuid.uuid4().hex[:8]}.tif"
+    path = tmp_path / name
+    _write_deflate_tiled_p3(path, arr, tile=tile)
+
+    cpu, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(cpu, arr)
+
+    records = _wrap_nvcomp_with_call_recorder_p3(monkeypatch)
+    gpu_da = read_geotiff_gpu(str(path))
+    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
+
+    assert any(success for _, success in records), (
+        "_try_nvcomp_batch_decompress was never invoked or always returned "
+        f"None; records={records}. The optimised path was not exercised, so "
+        f"this test would pass even if the rewrite were broken."
+    )
+
+
+@_nvcomp_only_p3
+def test_nvcomp_kvikio_fallback_skips_zstd_p3(monkeypatch):
+    """ZSTD-compressed input must NOT take the kvikio DeflateManager path."""
+    import xrspatial.geotiff._gpu_decode as _gpu_decode
+
+    if not _kvikio_nvcomp_importable_p3():
+        pytest.skip("kvikio.nvcomp not importable; the kvikio branch "
+                    "is never entered on this host")
+    monkeypatch.setattr(_gpu_decode, '_get_nvcomp', lambda: None)
+
+    result = _gpu_decode._try_nvcomp_batch_decompress(
+        compressed_tiles=[b'\x28\xb5\x2f\xfd' + b'\x00' * 16],
+        tile_bytes=1024,
+        compression=50000,  # ZSTD
+    )
+    assert result is None, (
+        "_try_nvcomp_batch_decompress returned non-None for ZSTD via the "
+        "kvikio fallback; this would feed ZSTD bytes through DeflateManager "
+        "and produce garbage."
+    )
+
+
+@_nvcomp_only_p3
+def test_nvcomp_batch_upload_perf_regression_guard_p3(tmp_path, monkeypatch):
+    """Sanity guard: 2048x2048 Deflate-tiled GPU decode finishes quickly."""
+    from xrspatial.geotiff import read_geotiff_gpu
+
+    rng = np.random.RandomState(20260508)
+    arr = rng.randint(0, 4096, size=(2048, 2048), dtype=np.uint16)
+    path = tmp_path / f"deflate_2048_perf_{uuid.uuid4().hex[:8]}.tif"
+    _write_deflate_tiled_p3(path, arr, tile=(128, 128))
+
+    # Warm up.
+    _ = read_geotiff_gpu(str(path))
+
+    records = _wrap_nvcomp_with_call_recorder_p3(monkeypatch)
+    t0 = time.perf_counter()
+    out = read_geotiff_gpu(str(path))
+    elapsed = time.perf_counter() - t0
+
+    assert any(success for _, success in records), (
+        "nvCOMP fast-path did not run during the timed call; the threshold "
+        f"is meaningless without it. Records: {records}"
+    )
+
+    assert elapsed < 0.2, (
+        f"read_geotiff_gpu on 2048x2048 deflate-tiled TIFF took "
+        f"{elapsed * 1000:.1f} ms (threshold 200 ms) -- possible "
+        f"regression in the nvCOMP batched H2D upload path"
+    )
+    assert out.shape == (2048, 2048)
+
+
+# ============================================================
+# Section: nvCOMP decompress cumsum offsets (#1950)
+# ============================================================
+# Source: test_nvcomp_decompress_cumsum_offsets_1950.py
+#
+# ``_try_nvcomp_batch_decompress`` used to compute its per-tile host
+# prefix-sum offsets via a Python ``for`` loop. The fix swaps in
+# ``np.cumsum(sizes, out=offsets[1:])`` to align with the sibling
+# batched-D2H helper and the compress-side prefix sum.
+
+
+def test_nvcomp_decompress_uses_cumsum_for_offsets_1950():
+    """Source-level guard against reintroducing the Python for loop."""
+    src_path = pathlib.Path(__file__).parent.parent.parent / "_gpu_decode.py"
+    src = src_path.read_text(encoding="utf-8")  # not the locale default
+
+    cumsum_call = re.compile(
+        r"np\.cumsum\(\s*comp_sizes_arr\[:-1\]\s*,\s*"
+        r"out\s*=\s*comp_offsets_h\[1:\]\s*\)"
+    )
+    assert cumsum_call.search(src), (
+        "decompress upload block should use "
+        "``np.cumsum(comp_sizes_arr[:-1], out=comp_offsets_h[1:])`` for "
+        "prefix-sum offsets, aligning with _batched_d2h_to_bytes "
+        "(issue #1950)."
+    )
+    legacy_loop = re.compile(
+        r"for\s+i\s+in\s+range\(\s*1\s*,\s*n_tiles\s*\)\s*:\s*\n"
+        r"\s*comp_offsets_h\[i\]"
+    )
+    assert not legacy_loop.search(src), (
+        "decompress upload block should no longer compute prefix-sum "
+        "offsets with a Python for loop (issue #1950)."
+    )
+
+
+def test_cumsum_matches_loop_prefix_sum_1950():
+    """Equivalence between the vectorised cumsum and the prior loop."""
+    rng = np.random.RandomState(1950)
+    n = 1024
+    sizes = rng.randint(100, 100_000, size=n).astype(np.int64)
+
+    offsets_cumsum = np.zeros(n, dtype=np.int64)
+    if n > 1:
+        np.cumsum(sizes[:-1], out=offsets_cumsum[1:])
+
+    offsets_loop = np.zeros(n, dtype=np.int64)
+    for i in range(1, n):
+        offsets_loop[i] = offsets_loop[i - 1] + sizes[i - 1]
+
+    np.testing.assert_array_equal(offsets_cumsum, offsets_loop)
+
+
+@pytest.mark.skipif(
+    importlib.util.find_spec("cupy") is None,
+    reason="cupy required for nvCOMP path",
+)
+def test_nvcomp_batch_decompress_roundtrip_1950():
+    """End-to-end check: a deflate-tiled raster still decodes correctly."""
+    if os.environ.get("XRSPATIAL_GEOTIFF_STRICT_GPU") != "1":
+        pytest.skip(
+            "set XRSPATIAL_GEOTIFF_STRICT_GPU=1 to exercise the nvCOMP "
+            "prefix-sum site; without it the GPU path may fall back to "
+            "a CPU codec and bypass this regression."
+        )
+    try:
+        import cupy
+    except ImportError:
+        pytest.skip("cupy not importable")
+    if not cupy.cuda.is_available():
+        pytest.skip("CUDA device not available")
+
+    from xrspatial.geotiff import open_geotiff, to_geotiff
+
+    rng = np.random.RandomState(1950)
+    height, width = 1024, 1024
+    arr = rng.rand(height, width).astype(np.float32)
+    da = xr.DataArray(
+        arr, dims=["y", "x"],
+        coords={"y": np.arange(height), "x": np.arange(width)},
+        attrs={"crs": 4326},
+    )
+
+    with tempfile.TemporaryDirectory() as td:
+        path = os.path.join(td, "tmp_1950_deflate.tif")
+        to_geotiff(da, path, compression="deflate", tile_size=256)
+
+        result = open_geotiff(path, gpu=True)
+        assert result.shape == (height, width)
+        decoded = cupy.asnumpy(result.data) if hasattr(
+            result.data, "get") else np.asarray(result.data)
+
+    np.testing.assert_allclose(decoded, arr, atol=0, rtol=0)
+
+
+# ============================================================
+# Section: nvCOMP from-device-bufs single alloc (#1659)
+# ============================================================
+# Source: test_nvcomp_from_device_bufs_single_alloc_1659.py
+#
+# ``_try_nvcomp_from_device_bufs`` used to allocate N separate
+# ``cupy.empty(tile_bytes)`` output buffers and run ``cupy.concatenate``
+# after the nvCOMP decompress kernel returned. The fix matches the
+# single-contiguous-buffer + pointer-offset pattern.
+
+from xrspatial.geotiff._gpu_decode import _try_nvcomp_from_device_bufs  # noqa: E402
+
+
+def _nvcomp_available_1659() -> bool:
+    from xrspatial.geotiff._gpu_decode import _get_nvcomp
+    return _get_nvcomp() is not None
+
+
+@requires_gpu
+def test_unsupported_codec_short_circuits_before_allocation_1659():
+    """Non-ZSTD codecs must return None without allocating output buffers."""
+    import cupy
+
+    d_tiles = [cupy.zeros(1024, dtype=cupy.uint8) for _ in range(4)]
+    assert _try_nvcomp_from_device_bufs(d_tiles, 1024, 8) is None
+
+
+@requires_gpu
+def test_no_nvcomp_lib_returns_none_1659(monkeypatch):
+    """When the nvCOMP library is missing, the function must return None."""
+    import cupy
+
+    from xrspatial.geotiff import _gpu_decode
+
+    monkeypatch.setattr(_gpu_decode, "_get_nvcomp", lambda: None)
+
+    d_tiles = [cupy.zeros(1024, dtype=cupy.uint8)]
+    assert _try_nvcomp_from_device_bufs(d_tiles, 1024, 50000) is None
+
+
+@requires_gpu
+def test_memory_guard_runs_with_full_decomp_size_1659(monkeypatch):
+    """The single-buffer allocation must be size-checked before cupy.empty."""
+    import cupy
+
+    from xrspatial.geotiff import _gpu_decode
+
+    seen = {"total_bytes": None, "what": None, "called": False}
+
+    def fake_check(required_bytes, what="tile buffer"):
+        seen["total_bytes"] = int(required_bytes)
+        seen["what"] = what
+        seen["called"] = True
+        raise MemoryError("simulated OOM")
+
+    monkeypatch.setattr(_gpu_decode, "_get_nvcomp", lambda: object())
+    monkeypatch.setattr(_gpu_decode, "_check_gpu_memory", fake_check)
+
+    n_tiles = 8
+    tile_bytes = 65536
+    d_tiles = [cupy.zeros(128, dtype=cupy.uint8) for _ in range(n_tiles)]
+
+    with pytest.raises(MemoryError):
+        _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000)
+
+    assert seen["called"], "_check_gpu_memory was not called"
+    expected_bytes = n_tiles * tile_bytes
+    assert seen["total_bytes"] == expected_bytes, (
+        f"expected total {expected_bytes}, got {seen['total_bytes']}"
+    )
+    assert "decompressed" in seen["what"] or "nvCOMP" in seen["what"], (
+        f"unhelpful 'what' label: {seen['what']!r}"
+    )
+
+
+@pytest.mark.skipif(
+    not _HAS_GPU or not _nvcomp_available_1659(),
+    reason="cupy + CUDA + nvCOMP shared lib required",
+)
+def test_zstd_decompress_roundtrip_returns_single_contiguous_buffer_1659():
+    """End-to-end: feed real ZSTD-compressed device buffers in."""
+    import cupy
+    import zstandard as zstd
+
+    rng = np.random.default_rng(seed=1659)
+    tile_bytes = 4096
+    n_tiles = 8
+
+    cctx = zstd.ZstdCompressor()
+    host_tiles = [rng.integers(0, 256, size=tile_bytes, dtype=np.uint8)
+                  for _ in range(n_tiles)]
+    compressed = [cctx.compress(t.tobytes()) for t in host_tiles]
+    d_tiles = [cupy.asarray(np.frombuffer(c, dtype=np.uint8))
+               for c in compressed]
+
+    result = _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000)
+
+    if result is None:
+        pytest.skip("nvCOMP returned None; library may be unusable on this host")
+
+    assert isinstance(result, cupy.ndarray)
+    assert result.dtype == cupy.uint8
+    assert result.shape == (n_tiles * tile_bytes,)
+    assert result.flags.c_contiguous
+
+    host_out = result.get()
+    for i, expected in enumerate(host_tiles):
+        decoded = host_out[i * tile_bytes:(i + 1) * tile_bytes]
+        assert np.array_equal(decoded, expected), (
+            f"tile {i} decoded payload differs from input"
+        )
+
+
+@requires_gpu
+def test_no_orphan_decomp_buffers_after_call_1659(monkeypatch):
+    """A successful call returns a single contiguous buffer."""
+    import cupy
+
+    from xrspatial.geotiff import _gpu_decode
+
+    monkeypatch.setattr(_gpu_decode, "_get_nvcomp",
+                        lambda: _FakeNvcompLib_1659())
+
+    n_tiles = 4
+    tile_bytes = 2048
+    d_tiles = [cupy.zeros(64, dtype=cupy.uint8) for _ in range(n_tiles)]
+    result = _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000)
+
+    assert result is not None
+    assert isinstance(result, cupy.ndarray)
+    assert result.size == n_tiles * tile_bytes
+    assert result.flags.c_contiguous
+    assert result.dtype == cupy.uint8
+
+
+class _FakeNvcompLib_1659:
+    """Stand-in for the nvCOMP CDLL handle used in tests."""
+
+    def __getattr__(self, name):
+        if name == 'nvcompBatchedZstdDecompressGetTempSizeAsync':
+            return _fake_temp_size_fn_1659
+        if name == 'nvcompBatchedZstdDecompressAsync':
+            return _fake_decompress_fn_1659
+        raise AttributeError(name)
+
+
+def _fake_temp_size_fn_1659(n, tile_bytes, opts, p_temp_size, total):
+    """Stub for nvcompBatchedZstdDecompressGetTempSizeAsync."""
+    p_temp_size._obj.value = 1
+    return 0
+
+
+def _fake_decompress_fn_1659(*args):
+    """Stub for nvcompBatchedZstdDecompressAsync (success)."""
+    return 0
+
+
+# ============================================================
+# Section: nvJPEG encode stream-null sync (#2212)
+# ============================================================
+# Source: test_nvjpeg_encode_stream_sync_2212.py
+#
+# Replace ``Device().synchronize()`` inside the per-tile encode loops
+# in ``_nvjpeg_batch_encode`` and ``_nvjpeg2k_batch_encode`` with
+# ``Stream.null.synchronize()`` so the per-tile sync is scoped to the
+# default stream rather than the whole device.
+
+
+def _function_source_2212(func):
+    src = inspect.getsource(func)
+    start_line = func.__code__.co_firstlineno
+    return src, start_line
+
+
+def _parent_map_2212(tree: ast.AST) -> dict:
+    mapping: dict = {}
+    for parent in ast.walk(tree):
+        for child in ast.iter_child_nodes(parent):
+            mapping[id(child)] = parent
+    return mapping
+
+
+def _inside_for_loop_2212(node: ast.AST, parents: dict) -> bool:
+    cur = parents.get(id(node))
+    while cur is not None:
+        if isinstance(cur, ast.For):
+            return True
+        cur = parents.get(id(cur))
+    return False
+
+
+def _device_synchronize_lines_2212(tree: ast.AST, start_line: int,
+                                   parents: dict, *, only_in_loop: bool):
+    out = []
+    for node in ast.walk(tree):
+        if not isinstance(node, ast.Call):
+            continue
+        func = node.func
+        if not isinstance(func, ast.Attribute):
+            continue
+        if func.attr != 'synchronize':
+            continue
+        parent_call = func.value
+        if not isinstance(parent_call, ast.Call):
+            continue
+        if not isinstance(parent_call.func, ast.Attribute):
+            continue
+        if parent_call.func.attr != 'Device':
+            continue
+        if only_in_loop and not _inside_for_loop_2212(node, parents):
+            continue
+        if not only_in_loop and _inside_for_loop_2212(node, parents):
+            continue
+        out.append(start_line + node.lineno - 1)
+    return out
+
+
+def _stream_null_synchronize_lines_2212(tree: ast.AST, start_line: int,
+                                        parents: dict, *, only_in_loop: bool):
+    out = []
+    for node in ast.walk(tree):
+        if not isinstance(node, ast.Call):
+            continue
+        func = node.func
+        if not isinstance(func, ast.Attribute):
+            continue
+        if func.attr != 'synchronize':
+            continue
+        chain = func.value
+        if isinstance(chain, ast.Call):
+            continue
+        if not isinstance(chain, ast.Attribute):
+            continue
+        found_stream_null = False
+        cur = chain
+        if cur.attr == 'null':
+            inner = cur.value
+            if isinstance(inner, ast.Attribute) and inner.attr == 'Stream':
+                found_stream_null = True
+        if not found_stream_null:
+            continue
+        if only_in_loop and not _inside_for_loop_2212(node, parents):
+            continue
+        if not only_in_loop and _inside_for_loop_2212(node, parents):
+            continue
+        out.append(start_line + node.lineno - 1)
+    return out
+
+
+class TestNvjpegEncodeStreamSync_2212:
+    """Structural assertions on the encoder sync fix (no GPU required)."""
+
+    def setup_method(self):
+        from xrspatial.geotiff import _gpu_decode
+        self._fn = _gpu_decode._nvjpeg_batch_encode
+        src, start = _function_source_2212(self._fn)
+        self._src = src
+        self._start_line = start
+        self._tree = ast.parse(src)
+        self._parents = _parent_map_2212(self._tree)
+
+    def test_no_device_synchronize_inside_encode_loop(self):
+        offending = _device_synchronize_lines_2212(
+            self._tree, self._start_line, self._parents, only_in_loop=True,
+        )
+        assert offending == [], (
+            "_nvjpeg_batch_encode contains cupy.cuda.Device().synchronize() "
+            f"calls inside a for-loop at file lines {offending}. The fix "
+            "in #2212 scopes the per-tile sync to the default stream via "
+            "cupy.cuda.Stream.null.synchronize()."
+        )
+
+    def test_stream_null_synchronize_present(self):
+        found = _stream_null_synchronize_lines_2212(
+            self._tree, self._start_line, self._parents, only_in_loop=True,
+        )
+        assert len(found) >= 1, (
+            "_nvjpeg_batch_encode no longer calls "
+            "cupy.cuda.Stream.null.synchronize() inside the encode loop."
+        )
+
+
+class TestNvjpeg2kEncodeStreamSync_2212:
+    """Structural assertions on the nvJPEG2000 encoder sync fix."""
+
+    def setup_method(self):
+        from xrspatial.geotiff import _gpu_decode
+        self._fn = _gpu_decode._nvjpeg2k_batch_encode
+        src, start = _function_source_2212(self._fn)
+        self._src = src
+        self._start_line = start
+        self._tree = ast.parse(src)
+        self._parents = _parent_map_2212(self._tree)
+
+    def test_no_device_synchronize_inside_encode_loop(self):
+        offending = _device_synchronize_lines_2212(
+            self._tree, self._start_line, self._parents, only_in_loop=True,
+        )
+        assert offending == [], (
+            "_nvjpeg2k_batch_encode contains Device().synchronize() inside "
+            f"a for-loop at file lines {offending}. The fix in #2212 "
+            "requires Stream.null.synchronize()."
+        )
+
+    def test_stream_null_synchronize_present(self):
+        found = _stream_null_synchronize_lines_2212(
+            self._tree, self._start_line, self._parents, only_in_loop=True,
+        )
+        assert len(found) >= 1, (
+            "_nvjpeg2k_batch_encode no longer calls "
+            "Stream.null.synchronize() inside the encode loop."
+        )
+
+
+class TestDecodeReferencePattern_2212:
+    """The decoder pattern is the contract we mirror. Pin it as the reference."""
+
+    def setup_method(self):
+        from xrspatial.geotiff import _gpu_decode
+        self._fn = _gpu_decode._try_nvjpeg_batch_decode
+        src, start = _function_source_2212(self._fn)
+        self._src = src
+        self._start_line = start
+        self._tree = ast.parse(src)
+        self._parents = _parent_map_2212(self._tree)
+
+    def test_decoder_uses_stream_null_sync_in_loop(self):
+        found = _stream_null_synchronize_lines_2212(
+            self._tree, self._start_line, self._parents, only_in_loop=True,
+        )
+        assert len(found) >= 1, (
+            "_try_nvjpeg_batch_decode no longer uses "
+            "Stream.null.synchronize() inside the decode loop."
+        )
+
+
+# ============================================================
+# Section: nvJPEG2000 single-alloc pool (#2107)
+# ============================================================
+# Source: test_nvjpeg2k_single_alloc_2107.py
+#
+# Replace per-tile / per-component ``cupy.empty`` allocations and per-
+# tile ``Device().synchronize()`` inside the decode loop with a single
+# contiguous device pool and a single batch-end sync.
+
+
+def _function_source_2107(func):
+    src = inspect.getsource(func)
+    start_line = func.__code__.co_firstlineno
+    return src, start_line
+
+
+def _inside_for_loop_2107(node: ast.AST, parents: dict) -> bool:
+    cur = parents.get(id(node))
+    while cur is not None:
+        if isinstance(cur, ast.For):
+            return True
+        cur = parents.get(id(cur))
+    return False
+
+
+def _parent_map_2107(tree: ast.AST) -> dict:
+    mapping: dict = {}
+    for parent in ast.walk(tree):
+        for child in ast.iter_child_nodes(parent):
+            mapping[id(child)] = parent
+    return mapping
+
+
+class TestNvjpeg2kSingleAllocStructural_2107:
+    """Structural assertions on the refactored helper (no GPU required)."""
+
+    def setup_method(self):
+        from xrspatial.geotiff import _gpu_decode
+
+        self._fn = _gpu_decode._try_nvjpeg2k_batch_decode
+        src, start = _function_source_2107(self._fn)
+        self._src = src
+        self._start_line = start
+        self._tree = ast.parse(src)
+        self._parents = _parent_map_2107(self._tree)
+
+    def test_no_cupy_empty_inside_decode_loop(self):
+        """``cupy.empty`` must NOT appear inside the per-tile ``for`` loop."""
+        offending = []
+        for node in ast.walk(self._tree):
+            if not isinstance(node, ast.Call):
+                continue
+            func = node.func
+            if not isinstance(func, ast.Attribute):
+                continue
+            if func.attr != 'empty':
+                continue
+            if (not isinstance(func.value, ast.Name)
+                    or func.value.id not in ('cupy', 'cp')):
+                continue
+            if _inside_for_loop_2107(node, self._parents):
+                offending.append(self._start_line + node.lineno - 1)
+        assert offending == [], (
+            f"_try_nvjpeg2k_batch_decode contains cupy.empty(...) calls "
+            f"inside a for-loop at file lines {offending}. The refactor "
+            f"in #2107 moved every output allocation outside the per-tile "
+            f"loop."
+        )
+
+    def test_no_device_synchronize_inside_decode_loop(self):
+        """``Device().synchronize()`` must NOT live inside the decode loop."""
+        offending = []
+        for node in ast.walk(self._tree):
+            if not isinstance(node, ast.Call):
+                continue
+            func = node.func
+            if not isinstance(func, ast.Attribute):
+                continue
+            if func.attr != 'synchronize':
+                continue
+            parent_call = func.value
+            if (not isinstance(parent_call, ast.Call)
+                    or not isinstance(parent_call.func, ast.Attribute)
+                    or parent_call.func.attr != 'Device'):
+                continue
+            if _inside_for_loop_2107(node, self._parents):
+                offending.append(self._start_line + node.lineno - 1)
+        assert offending == [], (
+            f"_try_nvjpeg2k_batch_decode contains Device().synchronize() "
+            f"calls inside a for-loop at file lines {offending}. The "
+            f"refactor in #2107 keeps exactly one batch-end sync outside "
+            f"the loop."
+        )
+
+    def test_pool_allocation_present(self):
+        """Source contains the expected pool buffer name and slab math."""
+        assert 'd_comp_pool' in self._src, (
+            "_try_nvjpeg2k_batch_decode no longer references the shared "
+            "d_comp_pool buffer; the refactor in #2107 is missing or "
+            "reverted."
+        )
+        assert 'per_tile_comp_bytes' in self._src, (
+            "_try_nvjpeg2k_batch_decode no longer references "
+            "per_tile_comp_bytes."
+        )
+
+    def test_check_gpu_memory_guard_present(self):
+        """The pool allocation must be guarded by ``_check_gpu_memory``."""
+        assert '_check_gpu_memory(' in self._src, (
+            "_try_nvjpeg2k_batch_decode no longer calls _check_gpu_memory."
+        )
+
+
+class TestNvjpeg2kLibAbsentShortCircuit_2107:
+    """When the shared library is missing, the function returns None.
+
+    Also covers the unsupported-dtype path, where a fake library records
+    which destroy functions were called and in what order.
+    """
+
+    def test_returns_none_when_lib_missing(self, monkeypatch):
+        """``_get_nvjpeg2k`` returning None must make the decoder return None."""
+        from xrspatial.geotiff import _gpu_decode
+
+        monkeypatch.setattr(_gpu_decode, '_get_nvjpeg2k', lambda: None)
+
+        result = _gpu_decode._try_nvjpeg2k_batch_decode(
+            compressed_tiles=[b''],
+            tile_width=8,
+            tile_height=8,
+            dtype=np.dtype('uint8'),
+            samples=1,
+        )
+        assert result is None
+
+    def test_returns_none_for_unsupported_dtype(self, monkeypatch):
+        """Unsupported dtypes short-circuit before any device allocation."""
+        from xrspatial.geotiff import _gpu_decode
+
+        # Fake library handle: the create functions report status 0, and
+        # each destroy function appends a tag to ``calls`` so the teardown
+        # order can be asserted below.
+        class _FakeLib:
+            def __init__(self):
+                self.calls = []
+
+            def nvjpeg2kCreateSimple(self, *_args):
+                return 0
+
+            def nvjpeg2kDecodeStateCreate(self, *_args):
+                return 0
+
+            def nvjpeg2kStreamCreate(self, *_args):
+                return 0
+
+            def nvjpeg2kDecodeParamsCreate(self, *_args):
+                return 0
+
+            def nvjpeg2kDecodeParamsDestroy(self, *_args):
+                self.calls.append('params_destroy')
+
+            def nvjpeg2kStreamDestroy(self, *_args):
+                self.calls.append('stream_destroy')
+
+            def nvjpeg2kDecodeStateDestroy(self, *_args):
+                self.calls.append('state_destroy')
+
+            def nvjpeg2kDestroy(self, *_args):
+                self.calls.append('handle_destroy')
+
+        fake = _FakeLib()
+        monkeypatch.setattr(_gpu_decode, '_get_nvjpeg2k', lambda: fake)
+
+        # float32 is the dtype this test treats as unsupported.
+        result = _gpu_decode._try_nvjpeg2k_batch_decode(
+            compressed_tiles=[b''],
+            tile_width=8,
+            tile_height=8,
+            dtype=np.dtype('float32'),
+            samples=1,
+        )
+        assert result is None
+        # Every destroy function must run exactly once, in this order.
+        assert fake.calls == [
+            'params_destroy',
+            'stream_destroy',
+            'state_destroy',
+            'handle_destroy',
+        ]
+
+
+@requires_gpu
+class TestNvjpeg2kPoolWithCupy_2107:
+    """Lightweight cupy-only smoke tests for the pool layout."""
+
+    def test_pool_slabs_are_non_overlapping(self):
+        """Tile-component slabs into the pool must not overlap."""
+        cupy = pytest.importorskip('cupy')
+
+        n_tiles = 4
+        tile_width = 32
+        tile_height = 32
+        samples = 3
+        dtype = np.dtype('uint16')
+        pitch = tile_width * dtype.itemsize
+        per_tile_comp_bytes = samples * tile_height * pitch
+        pool = cupy.empty(n_tiles * per_tile_comp_bytes, dtype=cupy.uint8)
+
+        seen = set()
+        for i in range(n_tiles):
+            tile_pool_start = i * per_tile_comp_bytes
+            for c in range(samples):
+                start = tile_pool_start + c * tile_height * pitch
+                end = start + tile_height * pitch
+                for byte in range(start, end):
+                    assert byte not in seen, (
+                        f"pool byte {byte} appears in two slabs "
+                        f"(tile={i}, comp={c}); per-tile slab math is "
+                        f"wrong."
+                    )
+                    seen.add(byte)
+        assert len(seen) == int(pool.nbytes)
+
+
+# ============================================================
+# Section: nvJPEG output-format constants (#1549)
+# ============================================================
+# Source: test_jpeg_gpu_1549.py
+#
+# Off-by-two on the ``nvjpegOutputFormat_t`` constants in
+# ``_gpu_decode.py`` caused ``cudaErrorIllegalAddress`` on 3-band JPEG
+# TIFFs and silently-wrong pixels on single-band JPEG TIFFs.
+
+
+def _nvjpeg_available_1549() -> bool:
+    """True when libnvjpeg.so loads on this host."""
+    if not _HAS_GPU:
+        return False
+    try:
+        from xrspatial.geotiff._gpu_decode import _get_nvjpeg
+        return _get_nvjpeg() is not None
+    except Exception:
+        return False
+
+
+_HAS_NVJPEG_1549 = _nvjpeg_available_1549()
+
+_gpu_only_1549 = pytest.mark.skipif(
+    not (_HAS_GPU and _HAS_TIFFFILE and _HAS_PIL
+         and _HAS_IMAGECODECS and _HAS_NVJPEG_1549),
+    reason="cupy + CUDA + tifffile + Pillow + imagecodecs + nvJPEG required",
+)
+
+
+def _write_jpeg_rgb_tiff_1549(path: str, seed: int = 0,
+                              noise: bool = True) -> np.ndarray:
+    """Write a 3-band 256x256 tiled JPEG TIFF using tifffile."""
+    import tifffile
+    if noise:
+        rng = np.random.default_rng(seed)
+        arr = rng.integers(0, 256, size=(256, 256, 3), dtype=np.uint8)
+    else:
+        ys, xs = np.mgrid[0:256, 0:256].astype(np.int32)
+        r = (ys + xs) // 2
+        g = ys
+        b = xs
+        arr = np.stack([r, g, b], axis=2).clip(0, 255).astype(np.uint8)
+    tifffile.imwrite(path, arr, photometric='rgb', tile=(128, 128),
+                     compression='jpeg')
+    return arr
+
+
+def _write_jpeg_gray_tiff_1549(path: str, seed: int = 42) -> np.ndarray:
+    """Write a 1-band 256x256 tiled JPEG TIFF using tifffile."""
+    import tifffile
+    rng = np.random.default_rng(seed)
+    arr = rng.integers(0, 256, size=(256, 256), dtype=np.uint8)
+    tifffile.imwrite(path, arr, photometric='minisblack', tile=(128, 128),
+                     compression='jpeg')
+    return arr
+
+
+@_gpu_only_1549
+def test_rgb_jpeg_gpu_no_crash_1549(tmp_path, monkeypatch):
+    """3-band JPEG must not raise CUDARuntimeError on GPU read."""
+    import cupy
+
+    from xrspatial.geotiff import _gpu_decode, read_geotiff_gpu
+
+    spy = {"calls": 0, "successes": 0}
+    original = _gpu_decode._try_nvjpeg_batch_decode
+
+    def wrapped(*args, **kwargs):
+        spy["calls"] += 1
+        result = original(*args, **kwargs)
+        if result is not None:
+            spy["successes"] += 1
+        return result
+
+    monkeypatch.setattr(_gpu_decode, "_try_nvjpeg_batch_decode", wrapped)
+
+    path = str(tmp_path / "rgb_jpeg_1549.tif")
+    _write_jpeg_rgb_tiff_1549(path)
+
+    arr = read_geotiff_gpu(path, gpu='strict', allow_internal_only_jpeg=True)
+    assert isinstance(arr.data, cupy.ndarray)
+    decoded = arr.data.get()
+    assert decoded.shape == (256, 256, 3)
+    assert decoded.dtype == np.uint8
+
+    assert spy["calls"] >= 1, (
+        "nvJPEG branch was never called -- test did not exercise the "
+        "code path the #1549 fix lives on"
+    )
+    assert spy["successes"] >= 1, (
+        "nvJPEG returned None -- CPU Pillow fallback ran and the fix was "
+        "not exercised"
+    )
+
+
+@_gpu_only_1549
+def test_rgb_jpeg_gpu_matches_cpu_1549(tmp_path):
+    """GPU pixels must be within JPEG decoder tolerance of CPU pixels."""
+    from xrspatial.geotiff import open_geotiff
+
+    path = str(tmp_path / "rgb_jpeg_match_1549.tif")
+    _write_jpeg_rgb_tiff_1549(path, noise=False)
+
+    cpu = open_geotiff(path, allow_internal_only_jpeg=True)
+    gpu = open_geotiff(path, gpu=True, allow_internal_only_jpeg=True)
+    assert cpu.shape == gpu.shape == (256, 256, 3)
+
+    cpu_arr = np.asarray(cpu.data)
+    gpu_arr = np.asarray(gpu.data.get())
+
+    diff = np.abs(cpu_arr.astype(int) - gpu_arr.astype(int))
+    assert diff.mean() < 1.0, f"mean diff {diff.mean():.3f} too large"
+    assert diff.max() < 8, f"max diff {diff.max()} too large"
+
+
+@_gpu_only_1549
+def test_grayscale_jpeg_gpu_matches_cpu_1549(tmp_path):
+    """Single-band JPEG GPU read must also produce correct pixels."""
+    from xrspatial.geotiff import open_geotiff
+
+    path = str(tmp_path / "gray_jpeg_1549.tif")
+    _write_jpeg_gray_tiff_1549(path)
+
+    cpu = open_geotiff(path, allow_internal_only_jpeg=True)
+    gpu = open_geotiff(path, gpu=True, allow_internal_only_jpeg=True)
+    assert cpu.shape == gpu.shape == (256, 256)
+
+    cpu_arr = np.asarray(cpu.data)
+    gpu_arr = np.asarray(gpu.data.get())
+    diff = np.abs(cpu_arr.astype(int) - gpu_arr.astype(int))
+    assert diff.max() <= 2, (
+        f"grayscale max diff {diff.max()} indicates corruption, "
+        f"not just rounding"
+    )
+
+
+@_gpu_only_1549
+def test_cuda_context_survives_after_jpeg_gpu_read_1549(tmp_path):
+    """Verify the CUDA context is healthy after a GPU JPEG read."""
+    import cupy
+
+    from xrspatial.geotiff import open_geotiff
+
+    path = str(tmp_path / "rgb_ctx_1549.tif")
+    _write_jpeg_rgb_tiff_1549(path)
+
+    arr = open_geotiff(path, gpu=True, allow_internal_only_jpeg=True)
+    _ = arr.data.get()
+
+    x = cupy.arange(1024, dtype=cupy.float32)
+    s = float(cupy.sum(x).item())
+    assert s == 1023 * 1024 / 2
+
+    other_path = str(tmp_path / "other_1549.tif")
+    _write_jpeg_gray_tiff_1549(other_path, seed=7)
+    other = open_geotiff(other_path, gpu=True, allow_internal_only_jpeg=True)
+    assert other.shape == (256, 256)
+    assert other.dtype == np.uint8
+
+
+# ============================================================
+# Section: LERC valid-mask GPU (PR #1529 follow-up)
+# ============================================================
+# Source: test_lerc_valid_mask_gpu.py
+#
+# The CPU LERC reader honours the LERC valid-mask. The GPU LERC tile-
+# decode path used to discard the mask. These tests confirm the GPU
+# path now matches the CPU path for representative mask combinations.
+
+# Module-level skip: this whole section is LERC-only.
+lerc_lerc = pytest.importorskip("lerc", reason="lerc required for LERC GPU tests")
+
+from xrspatial.geotiff._compression import LERC_AVAILABLE  # noqa: E402
+
+_gpu_only_lerc = pytest.mark.skipif(
+    not (_HAS_GPU and LERC_AVAILABLE),
+    reason="cupy + CUDA + lerc required",
+)
+
+
+@pytest.fixture
+def lerc_writer_with_mask_gpu(monkeypatch):
+    """Patch ``lerc_compress`` to embed a valid-mask the writer can't pass."""
+    holder = {"invalid": None}
+
+    def _patched(data, width, height, samples=1,
+                 dtype=np.dtype('float32'), max_z_error=0.0):
+        if samples == 1:
+            arr = np.frombuffer(data, dtype=dtype).reshape(height, width)
+        else:
+            arr = np.frombuffer(data, dtype=dtype).reshape(
+                height, width, samples)
+        invalid_pred = holder["invalid"]
+        if invalid_pred is None:
+            mask = None
+            has_mask = False
+        else:
+            invalid = invalid_pred(arr)
+            mask = np.where(invalid, np.uint8(0), np.uint8(1))
+            has_mask = True
+        result = lerc_lerc.encode(
+            arr, samples, has_mask, mask, max_z_error, 1,
+        )
+        if result[0] != 0:
+            raise RuntimeError(
+                f"LERC encode failed with error code {result[0]}")
+        return bytes(result[2])
+
+    monkeypatch.setattr(
+        "xrspatial.geotiff._compression.lerc_compress", _patched,
+    )
+    return holder
+
+
+def _read_cpu_gpu_lerc(path):
+    """Read *path* with both readers and return ``(cpu_array, gpu_host_array)``."""
+    from xrspatial.geotiff import read_geotiff_gpu
+    from xrspatial.geotiff._reader import read_to_array
+
+    cpu, _geo = read_to_array(path, allow_experimental_codecs=True)
+    gpu_da = read_geotiff_gpu(
+        path, gpu='strict', allow_experimental_codecs=True,
+    )
+    gpu_host = gpu_da.data.get()
+    return cpu, gpu_host
+
+
+def _restore_sentinel_lerc(arr, nodata):
+    """Replace NaN positions in *arr* with *nodata* for bit-exact compare."""
+    if nodata is None or arr.dtype.kind != 'f' or np.isnan(nodata):
+        return arr
+    out = arr.copy()
+    out[np.isnan(out)] = arr.dtype.type(nodata)
+    return out
+
+
+@_gpu_only_lerc
+class TestGpuLercValidMask:
+    """End-to-end TIFF round-trips comparing GPU vs CPU output."""
+
+    def test_float32_nan_nodata(self, tmp_path, lerc_writer_with_mask_gpu):
+        """Float32 LERC + NaN nodata: GPU output matches CPU output."""
+        from xrspatial.geotiff._writer import write
+
+        arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8)
+        invalid_positions = {(0, 1), (5, 4)}
+
+        def invalid_pred(a):
+            m = np.zeros(a.shape[:2], dtype=bool)
+            for r, c in invalid_positions:
+                m[r, c] = True
+            return m
+        lerc_writer_with_mask_gpu["invalid"] = invalid_pred
+
+        path = str(tmp_path / "lerc_mask_nan_gpu.tif")
+        write(arr, path, compression="lerc", tiled=True, tile_size=8,
+              nodata=float("nan"))
+
+        cpu, gpu = _read_cpu_gpu_lerc(path)
+        for (r, c) in invalid_positions:
+            assert np.isnan(cpu[r, c])
+            assert np.isnan(gpu[r, c])
+        cpu_valid = np.where(np.isnan(cpu), 0.0, cpu)
+        gpu_valid = np.where(np.isnan(gpu), 0.0, gpu)
+        np.testing.assert_array_equal(cpu_valid, gpu_valid)
+
+    def test_float32_sentinel_nodata(self, tmp_path, lerc_writer_with_mask_gpu):
+        """Float32 LERC + sentinel nodata (-9999): GPU matches CPU."""
+        from xrspatial.geotiff._writer import write
+
+        arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8)
+        invalid_positions = {(0, 1), (3, 3), (7, 7)}
+
+        def invalid_pred(a):
+            m = np.zeros(a.shape[:2], dtype=bool)
+            for r, c in invalid_positions:
+                m[r, c] = True
+            return m
+        lerc_writer_with_mask_gpu["invalid"] = invalid_pred
+
+        path = str(tmp_path / "lerc_mask_sentinel_f32_gpu.tif")
+        write(arr, path, compression="lerc", tiled=True, tile_size=8,
+              nodata=-9999.0)
+
+        cpu, gpu = _read_cpu_gpu_lerc(path)
+        gpu_with_sentinel = _restore_sentinel_lerc(gpu, -9999.0)
+        np.testing.assert_array_equal(cpu, gpu_with_sentinel)
+        for (r, c) in invalid_positions:
+            assert np.isnan(gpu[r, c])
+            assert gpu_with_sentinel[r, c] == np.float32(-9999.0)
+
+    def test_uint16_sentinel_nodata(self, tmp_path, lerc_writer_with_mask_gpu):
+        """Uint16 LERC + sentinel nodata (65535): GPU matches CPU."""
+        from xrspatial.geotiff._writer import write
+
+        arr = (np.arange(1, 65, dtype=np.uint16) * 100).reshape(8, 8)
+        invalid_positions = {(0, 1), (4, 4)}
+
+        def invalid_pred(a):
+            m = np.zeros(a.shape[:2], dtype=bool)
+            for r, c in invalid_positions:
+                m[r, c] = True
+            return m
+        lerc_writer_with_mask_gpu["invalid"] = invalid_pred
+
+        path = str(tmp_path / "lerc_mask_uint16_gpu.tif")
+        write(arr, path, compression="lerc", tiled=True, tile_size=8,
+              nodata=65535)
+
+        cpu, gpu = _read_cpu_gpu_lerc(path)
+        assert gpu.dtype == np.float64
+        gpu_no_nan = np.where(np.isnan(gpu), 65535.0, gpu)
+        gpu_u16 = gpu_no_nan.astype(np.uint16)
+        np.testing.assert_array_equal(cpu, gpu_u16)
+        for (r, c) in invalid_positions:
+            assert np.isnan(gpu[r, c])
+            assert gpu_u16[r, c] == np.uint16(65535)
+
+    def test_no_mask_roundtrip_bitexact(self, tmp_path):
+        """All-valid LERC (no encoded mask): GPU and CPU agree bit-exact."""
+        from xrspatial.geotiff._writer import write
+
+        arr = np.arange(64, dtype=np.float32).reshape(8, 8)
+        path = str(tmp_path / "lerc_no_mask_gpu.tif")
+        write(arr, path, compression="lerc", tiled=True, tile_size=8)
+
+        cpu, gpu = _read_cpu_gpu_lerc(path)
+        np.testing.assert_array_equal(cpu, arr)
+        np.testing.assert_array_equal(gpu, arr)
+
+
+# ============================================================
+# Section: predictor=2 big-endian GPU (#1517)
+# ============================================================
+# Source: test_predictor2_big_endian_gpu_1517.py
+#
+# Predictor=2 BE files used to come back with wrong values on the GPU
+# tiled path. The per-dtype predictor kernels now byte-swap the buffer
+# before running the prefix-sum.
+
+_gpu_only_1517 = pytest.mark.skipif(
+    not (_HAS_GPU and _HAS_TIFFFILE),
+    reason="cupy + CUDA + tifffile required",
+)
+
+
+def _block_cpu_fallback_1517(monkeypatch):
+    """Replace ``gpu_backend._read_to_array`` so a CPU fallback fails loudly."""
+    from xrspatial.geotiff._backends import gpu as gpu_backend
+
+    def _no_fallback(*args, **kwargs):
+        raise AssertionError(
+            "read_geotiff_gpu fell back to read_to_array; "
+            "the GPU decode path was not exercised."
+        )
+
+    monkeypatch.setattr(
+        gpu_backend, '_read_to_array', _no_fallback, raising=True,
+    )
+
+
+@_gpu_only_1517
+def test_gpu_predictor2_big_endian_int32_tiled_reproducer_1517(tmp_path, monkeypatch):
+    """Exact reproducer from issue #1517: BE int32 tiled deflate + pred=2."""
+    import cupy
+    import tifffile
+
+    from xrspatial.geotiff import read_geotiff_gpu
+    from xrspatial.geotiff._reader import read_to_array
+
+    rng = np.random.RandomState(20260507)
+    arr = rng.randint(
+        -1_000_000, 1_000_000, size=(32, 48), dtype=np.int64
+    ).astype(np.int32)
+
+    path = tmp_path / "be_pred2_int32.tif"
+    tifffile.imwrite(
+        str(path), arr, byteorder=">", predictor=2,
+        compression="deflate", tile=(16, 16),
+    )
+
+    cpu, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(cpu, arr)
+
+    _block_cpu_fallback_1517(monkeypatch)
+    gpu_da = read_geotiff_gpu(str(path))
+    assert isinstance(gpu_da.data, cupy.ndarray)
+    assert gpu_da.data.dtype == np.dtype(np.int32)
+    assert gpu_da.data.dtype.isnative
+    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
+
+
+@_gpu_only_1517
+@pytest.mark.parametrize(
+    "dtype",
+    [np.uint16, np.int16, np.uint32, np.int32],
+)
+def test_gpu_predictor2_big_endian_dtypes_tiled_1517(tmp_path, monkeypatch, dtype):
+    """BE predictor=2 tiled files match CPU baseline across dtypes."""
+    import cupy
+    import tifffile
+
+    from xrspatial.geotiff import read_geotiff_gpu
+    from xrspatial.geotiff._reader import read_to_array
+
+    rng = np.random.RandomState(20260508)
+    info = np.iinfo(dtype)
+    arr = rng.randint(
+        max(info.min, -1_000_000),
+        min(info.max, 1_000_000),
+        size=(32, 48),
+        dtype=np.int64,
+    ).astype(dtype)
+
+    path = tmp_path / f"be_pred2_{np.dtype(dtype).name}.tif"
+    tifffile.imwrite(
+        str(path), arr, byteorder=">", predictor=2,
+        compression="deflate", tile=(16, 16),
+    )
+
+    cpu, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(cpu, arr)
+
+    _block_cpu_fallback_1517(monkeypatch)
+    gpu_da = read_geotiff_gpu(str(path))
+    assert isinstance(gpu_da.data, cupy.ndarray)
+    assert gpu_da.data.dtype == np.dtype(dtype)
+    assert gpu_da.data.dtype.isnative
+    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
+
+
+@_gpu_only_1517
+def test_gpu_predictor2_big_endian_stripped_uint16_1517(tmp_path):
+    """Stripped BE predictor=2 files take the CPU fallback but stay correct."""
+    import cupy
+    import tifffile
+
+    from xrspatial.geotiff import read_geotiff_gpu
+    from xrspatial.geotiff._reader import read_to_array
+
+    rng = np.random.RandomState(20260509)
+    arr = rng.randint(0, 60000, size=(32, 48), dtype=np.uint16)
+
+    path = tmp_path / "be_pred2_uint16_strip.tif"
+    tifffile.imwrite(
+        str(path), arr, byteorder=">", predictor=2, compression="deflate",
+    )
+
+    cpu, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(cpu, arr)
+
+    gpu_da = read_geotiff_gpu(str(path))
+    assert isinstance(gpu_da.data, cupy.ndarray)
+    assert gpu_da.data.dtype == np.dtype(np.uint16)
+    assert gpu_da.data.dtype.isnative
+    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
+
+
+@_gpu_only_1517
+def test_gpu_predictor2_little_endian_still_works_1517(tmp_path, monkeypatch):
+    """LE predictor=2 must still round-trip after the BE fix."""
+    import cupy
+    import tifffile
+
+    from xrspatial.geotiff import read_geotiff_gpu
+    from xrspatial.geotiff._reader import read_to_array
+
+    rng = np.random.RandomState(20260510)
+    arr = rng.randint(
+        -1_000_000, 1_000_000, size=(32, 48), dtype=np.int64
+    ).astype(np.int32)
+
+    path = tmp_path / "le_pred2_int32.tif"
+    tifffile.imwrite(
+        str(path), arr, byteorder="<", predictor=2,
+        compression="deflate", tile=(16, 16),
+    )
+
+    cpu, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(cpu, arr)
+
+    _block_cpu_fallback_1517(monkeypatch)
+    gpu_da = read_geotiff_gpu(str(path))
+    assert isinstance(gpu_da.data, cupy.ndarray)
+    assert gpu_da.data.dtype == np.dtype(np.int32)
+    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
+
+
+@_gpu_only_1517
+def test_gpu_predictor3_big_endian_still_works_1517(tmp_path, monkeypatch):
+    """Floating-point predictor BE must still match CPU after the fix."""
+    import cupy
+    import tifffile
+
+    from xrspatial.geotiff import read_geotiff_gpu
+    from xrspatial.geotiff._reader import read_to_array
+
+    rng = np.random.RandomState(20260511)
+    arr = rng.standard_normal((32, 48)).astype(np.float32)
+
+    path = tmp_path / "be_pred3_float32.tif"
+    tifffile.imwrite(
+        str(path), arr, byteorder=">", predictor=3,
+        compression="deflate", tile=(16, 16),
+    )
+
+    cpu, _ = read_to_array(str(path))
+    np.testing.assert_array_equal(cpu, arr)
+
+    _block_cpu_fallback_1517(monkeypatch)
+    gpu_da = read_geotiff_gpu(str(path))
+    assert isinstance(gpu_da.data, cupy.ndarray)
+    assert gpu_da.data.dtype == np.dtype(np.float32)
+    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
+
+
+def test_swap_byte_lanes_numpy_bps2_1517():
+    """The byte-swap helper reverses bytes per sample on a numpy buffer."""
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    buf = np.array([0x01, 0x02, 0x03, 0x04], dtype=np.uint8)
+    _swap_byte_lanes(buf, 2)
+    np.testing.assert_array_equal(buf, np.array([0x02, 0x01, 0x04, 0x03],
+                                                dtype=np.uint8))
+
+
+def test_swap_byte_lanes_numpy_bps4_1517():
+    """bps=4: full byte reversal within each 4-byte sample."""
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    buf = np.array([0x01, 0x02, 0x03, 0x04,
+                    0x05, 0x06, 0x07, 0x08], dtype=np.uint8)
+    _swap_byte_lanes(buf, 4)
+    np.testing.assert_array_equal(
+        buf, np.array([0x04, 0x03, 0x02, 0x01,
+                       0x08, 0x07, 0x06, 0x05], dtype=np.uint8))
+
+
+def test_swap_byte_lanes_numpy_bps8_1517():
+    """bps=8: full byte reversal within each 8-byte sample."""
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    sample = np.arange(1, 9, dtype=np.uint8)
+    buf = np.tile(sample, 2).copy()
+    _swap_byte_lanes(buf, 8)
+    np.testing.assert_array_equal(
+        buf, np.tile(sample[::-1], 2))
+
+
+def test_swap_byte_lanes_uint8_noop_1517():
+    """bps=1 must be a no-op."""
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    buf = np.array([1, 2, 3], dtype=np.uint8)
+    _swap_byte_lanes(buf, 1)
+    np.testing.assert_array_equal(buf, np.array([1, 2, 3], dtype=np.uint8))
+
+
+def test_swap_byte_lanes_rejects_unsupported_bps_1517():
+    """Unsupported bps values raise ValueError rather than corrupt data."""
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    buf = np.zeros(6, dtype=np.uint8)
+    with pytest.raises(ValueError, match="unsupported bps"):
+        _swap_byte_lanes(buf, 3)
+
+
+def test_swap_byte_lanes_rejects_misaligned_size_1517():
+    """Buffer size must be a multiple of bps."""
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    buf = np.zeros(5, dtype=np.uint8)
+    with pytest.raises(ValueError, match="not a multiple"):
+        _swap_byte_lanes(buf, 2)
+
+
+def test_swap_byte_lanes_numpy_is_zero_temp_1517():
+    """The numpy path must mutate the original buffer without realloc."""
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    buf = np.array([0x01, 0x02, 0x03, 0x04], dtype=np.uint8)
+    addr_before = buf.ctypes.data
+    _swap_byte_lanes(buf, 2)
+    assert buf.ctypes.data == addr_before
+    np.testing.assert_array_equal(buf, np.array([0x02, 0x01, 0x04, 0x03],
+                                                dtype=np.uint8))
+
+
+@_gpu_only_1517
+@pytest.mark.parametrize("bps,dtype", [
+    (2, np.uint16),
+    (4, np.uint32),
+    (8, np.uint64),
+])
+def test_swap_byte_lanes_cupy_kernel_1517(bps, dtype):
+    """The cupy path runs the CUDA kernel and matches numpy.byteswap."""
+    import cupy
+
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    rng = np.random.RandomState(20260512 + bps)
+    n_samples = 1024
+    src = rng.randint(0, np.iinfo(dtype).max, size=n_samples,
+                      dtype=np.uint64).astype(dtype)
+    expected = src.byteswap()
+
+    d_buf = cupy.asarray(src.view(np.uint8))
+    addr_before = int(d_buf.data.ptr)
+    _swap_byte_lanes(d_buf, bps)
+    addr_after = int(d_buf.data.ptr)
+
+    assert addr_after == addr_before, "kernel must operate in place"
+    np.testing.assert_array_equal(
+        d_buf.get().view(dtype), expected,
+    )
+
+
+@_gpu_only_1517
+def test_swap_byte_lanes_cupy_uint8_noop_1517():
+    """bps=1 leaves cupy buffers untouched (no kernel launch)."""
+    import cupy
+
+    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
+
+    src = np.arange(16, dtype=np.uint8)
+    d_buf = cupy.asarray(src)
+    _swap_byte_lanes(d_buf, 1)
+    np.testing.assert_array_equal(d_buf.get(), src)
+
+
+# ============================================================
+# Section: predictor=3 + integer SampleFormat rejection on GPU (#1933)
+# ============================================================
+# Source: test_predictor3_int_dtype_gpu_1933.py
+#
+# ``_validate_predictor_sample_format`` is wired into every IFD-read
+# site. This section closes the GPU coverage gap for the two GPU
+# validator call sites (tiled eager + GDS chunked).
+
+from xrspatial.geotiff._compression import COMPRESSION_NONE  # noqa: E402
+from xrspatial.geotiff._dtypes import LONG, SHORT, numpy_to_tiff_dtype  # noqa: E402
+from xrspatial.geotiff._header import (  # noqa: E402
+    TAG_BITS_PER_SAMPLE,
+    TAG_COMPRESSION,
+    TAG_IMAGE_LENGTH,
+    TAG_IMAGE_WIDTH,
+    TAG_PHOTOMETRIC,
+    TAG_PREDICTOR,
+    TAG_ROWS_PER_STRIP,
+    TAG_SAMPLE_FORMAT,
+    TAG_SAMPLES_PER_PIXEL,
+    TAG_STRIP_BYTE_COUNTS,
+    TAG_STRIP_OFFSETS,
+    TAG_TILE_BYTE_COUNTS,
+    TAG_TILE_LENGTH,
+    TAG_TILE_OFFSETS,
+    TAG_TILE_WIDTH,
+)
+from xrspatial.geotiff._writer import (  # noqa: E402
+    _assemble_standard_layout,
+    _write_stripped,
+)
+
+
+def _build_predictor3_uint32_stripped_tiff_1933(arr: np.ndarray) -> bytes:
+    """Build a stripped TIFF: predictor=3 + uint32 SampleFormat=1."""
+    rel_off, bc, chunks = _write_stripped(arr, COMPRESSION_NONE, False)
+    bits_per_sample, _ = numpy_to_tiff_dtype(arr.dtype)
+    tags = [
+        (TAG_IMAGE_WIDTH, LONG, 1, arr.shape[1]),
+        (TAG_IMAGE_LENGTH, LONG, 1, arr.shape[0]),
+        (TAG_BITS_PER_SAMPLE, SHORT, 1, bits_per_sample),
+        (TAG_COMPRESSION, SHORT, 1, COMPRESSION_NONE),
+        (TAG_PHOTOMETRIC, SHORT, 1, 1),
+        (TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1),
+        (TAG_SAMPLE_FORMAT, SHORT, 1, 1),
+        (TAG_PREDICTOR, SHORT, 1, 3),
+        (TAG_ROWS_PER_STRIP, SHORT, 1, arr.shape[0]),
+        (TAG_STRIP_OFFSETS, LONG, len(rel_off), rel_off),
+        (TAG_STRIP_BYTE_COUNTS, LONG, len(bc), bc),
+    ]
+    parts = [(arr, arr.shape[1], arr.shape[0], rel_off, bc, chunks)]
+    return _assemble_standard_layout(8, [tags], parts, bigtiff=False)
+
+
+def _build_predictor3_uint32_tiled_tiff_1933(
+    arr: np.ndarray, tile_w: int = 16, tile_h: int = 16,
+) -> bytes:
+    """Build a tiled malformed TIFF: predictor=3 + uint32 SampleFormat=1."""
+    bits_per_sample, _ = numpy_to_tiff_dtype(arr.dtype)
+    h, w = arr.shape
+
+    tiles_across = (w + tile_w - 1) // tile_w
+    tiles_down = (h + tile_h - 1) // tile_h
+    tiles: list[bytes] = []
+    rel_off: list[int] = []
+    bc: list[int] = []
+    offset = 0
+    for tr in range(tiles_down):
+        for tc in range(tiles_across):
+            r0 = tr * tile_h
+            c0 = tc * tile_w
+            r1 = min(r0 + tile_h, h)
+            c1 = min(c0 + tile_w, w)
+            tile_slice = arr[r0:r1, c0:c1]
+            if tile_slice.shape != (tile_h, tile_w):
+                padded = np.zeros((tile_h, tile_w), dtype=arr.dtype)
+                padded[: tile_slice.shape[0], : tile_slice.shape[1]] = (
+                    tile_slice)
+                tile_arr = padded
+            else:
+                tile_arr = np.ascontiguousarray(tile_slice)
+            chunk = tile_arr.tobytes()
+            rel_off.append(offset)
+            bc.append(len(chunk))
+            tiles.append(chunk)
+            offset += len(chunk)
+
+    tags = [
+        (TAG_IMAGE_WIDTH, LONG, 1, w),
+        (TAG_IMAGE_LENGTH, LONG, 1, h),
+        (TAG_BITS_PER_SAMPLE, SHORT, 1, bits_per_sample),
+        (TAG_COMPRESSION, SHORT, 1, COMPRESSION_NONE),
+        (TAG_PHOTOMETRIC, SHORT, 1, 1),
+        (TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1),
+        (TAG_SAMPLE_FORMAT, SHORT, 1, 1),
+        (TAG_PREDICTOR, SHORT, 1, 3),
+        (TAG_TILE_WIDTH, LONG, 1, tile_w),
+        (TAG_TILE_LENGTH, LONG, 1, tile_h),
+        (TAG_TILE_OFFSETS, LONG, len(rel_off), rel_off),
+        (TAG_TILE_BYTE_COUNTS, LONG, len(bc), bc),
+    ]
+    parts = [(arr, w, h, rel_off, bc, tiles)]
+    return _assemble_standard_layout(8, [tags], parts, bigtiff=False)
+
+
+@requires_gpu
+class TestGPUEagerRejectsMalformedFile_1933:
+    """``read_geotiff_gpu`` rejects predictor=3 + integer SampleFormat."""
+
+    def test_gpu_eager_stripped_raises(self, tmp_path):
+        from xrspatial.geotiff import read_geotiff_gpu
+
+        arr = np.array(
+            [[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.uint32)
+        path = tmp_path / "pred3_uint32_stripped.tif"
+        path.write_bytes(_build_predictor3_uint32_stripped_tiff_1933(arr))
+        with pytest.raises(ValueError, match="Predictor=3"):
+            read_geotiff_gpu(str(path))
+
+    def test_gpu_eager_tiled_raises(self, tmp_path):
+        """Tiled layout exercises the tiled eager GPU validator call site."""
+        from xrspatial.geotiff import read_geotiff_gpu
+
+        arr = np.arange(256, dtype=np.uint32).reshape(16, 16)
+        path = tmp_path / "pred3_uint32_tiled.tif"
+        path.write_bytes(_build_predictor3_uint32_tiled_tiff_1933(arr))
+        with pytest.raises(ValueError, match="Predictor=3"):
+            read_geotiff_gpu(str(path))
+
+    def test_gpu_dispatcher_eager_raises(self, tmp_path):
+        """``open_geotiff(gpu=True)`` dispatcher rejects the file."""
+        from xrspatial.geotiff import open_geotiff
+
+        arr = np.arange(64, dtype=np.uint32).reshape(8, 8)
+        path = tmp_path / "pred3_uint32_dispatch.tif"
+        path.write_bytes(_build_predictor3_uint32_stripped_tiff_1933(arr))
+        with pytest.raises(ValueError, match="Predictor=3"):
+            open_geotiff(str(path), gpu=True)
+
+
+@requires_gpu
+class TestGPUChunkedRejectsMalformedFile_1933:
+    """The dask+GPU paths also reject predictor=3 + integer."""
+
+    def test_read_geotiff_gpu_chunked_stripped_raises(self, tmp_path):
+        from xrspatial.geotiff import read_geotiff_gpu
+
+        arr = np.arange(64, dtype=np.uint32).reshape(8, 8)
+        path = tmp_path / "pred3_uint32_chunked_str.tif"
+        path.write_bytes(_build_predictor3_uint32_stripped_tiff_1933(arr))
+        with pytest.raises(ValueError, match="Predictor=3"):
+            read_geotiff_gpu(str(path), chunks=4)
+
+    def test_read_geotiff_gpu_chunked_tiled_raises(self, tmp_path):
+        """Tiled chunked path with KvikIO exercises the GDS chunked validator."""
+        pytest.importorskip("kvikio")
+
+        from xrspatial.geotiff import read_geotiff_gpu
+
+        arr = np.arange(256, dtype=np.uint32).reshape(16, 16)
+        path = tmp_path / "pred3_uint32_chunked_tiled.tif"
+        path.write_bytes(_build_predictor3_uint32_tiled_tiff_1933(arr))
+        with pytest.raises(ValueError, match="Predictor=3"):
+            read_geotiff_gpu(str(path), chunks=16)
+
+    def test_open_geotiff_chunks_gpu_dispatcher_raises(self, tmp_path):
+        """``open_geotiff(chunks=, gpu=True)`` dispatcher rejects the file."""
+        from xrspatial.geotiff import open_geotiff
+
+        arr = np.arange(256, dtype=np.uint32).reshape(16, 16)
+        path = tmp_path / "pred3_uint32_chunked_dispatch.tif"
+        path.write_bytes(_build_predictor3_uint32_tiled_tiff_1933(arr))
+        with pytest.raises(ValueError, match="Predictor=3"):
+            open_geotiff(str(path), chunks=8, gpu=True)
+
+
+@requires_gpu
+class TestValidPredictor3StillWorksOnGPU_1933:
+    """A legitimate predictor=3 + float32 tiled file still decodes on GPU."""
+
+    def test_predictor3_float32_gpu_round_trip(self, tmp_path):
+        from xrspatial.geotiff import read_geotiff_gpu, to_geotiff
+
+        arr = np.linspace(-1.0, 1.0, 256, dtype=np.float32).reshape(16, 16)
+        path = tmp_path / "pred3_float32_tiled.tif"
+        to_geotiff(
+            arr, str(path), compression="deflate", predictor=3,
+            tiled=True, tile_size=16,
+        )
+
+        result = read_geotiff_gpu(str(path))
+        assert result.dtype == np.float32
+        np.testing.assert_array_equal(result.data.get(), arr)
+
+    def test_predictor3_float32_dask_gpu_round_trip(self, tmp_path):
+        from xrspatial.geotiff import read_geotiff_gpu, to_geotiff
+
+        arr = np.linspace(-1.0, 1.0, 256, dtype=np.float32).reshape(16, 16)
+        path = tmp_path / "pred3_float32_dask.tif"
+        to_geotiff(
+            arr, str(path), compression="deflate", predictor=3,
+            tiled=True, tile_size=16,
+        )
+
+        result = read_geotiff_gpu(str(path), chunks=8)
+        assert result.dtype == np.float32
+        np.testing.assert_array_equal(result.compute().data.get(), arr)
+
+
+@requires_gpu
+class TestErrorMessageStable_1933:
+    """The GPU error wording matches the eager/dask wording."""
+
+    def test_gpu_error_message_matches_eager(self, tmp_path):
+        from xrspatial.geotiff import open_geotiff, read_geotiff_gpu
+
+        arr = np.arange(64, dtype=np.uint32).reshape(8, 8)
+        path = tmp_path / "pred3_uint32_msg.tif"
+        path.write_bytes(_build_predictor3_uint32_stripped_tiff_1933(arr))
+
+        with pytest.raises(ValueError) as exc_eager:
+            open_geotiff(str(path))
+        with pytest.raises(ValueError) as exc_gpu:
+            read_geotiff_gpu(str(path))
+
+        assert str(exc_eager.value) == str(exc_gpu.value), (
+            "GPU and eager paths must surface the same Predictor=3 "
+            "error message so callers can use a single except branch."
+        )
+
+
+# ============================================================
+# Section: GPU writer rejects JPEG without opt-in (#1845)
+# ============================================================
+# Source: test_gpu_jpeg_interop_reject_issue_D_1845.py
+#
+# ``write_geotiff_gpu`` mirrors ``to_geotiff`` and rejects
+# ``compression='jpeg'`` by default. ``allow_internal_only_jpeg=True``
+# opts in and emits ``GeoTIFFFallbackWarning``.
+
+from xrspatial.geotiff import GeoTIFFFallbackWarning, write_geotiff_gpu  # noqa: E402
+
+
+def _make_rgb_uint8_da_1845() -> xr.DataArray:
+    """64x64x3 uint8 RGB raster suitable for the JPEG encode path."""
+    rng = np.random.RandomState(0)
+    arr = rng.randint(0, 256, size=(64, 64, 3), dtype=np.uint8)
+    return xr.DataArray(
+        arr,
+        dims=("y", "x", "band"),
+        coords={
+            "y": np.arange(64, dtype=np.float64),
+            "x": np.arange(64, dtype=np.float64),
+            "band": np.array([1, 2, 3], dtype=np.int32),
+        },
+    )
+
+
+def test_write_geotiff_gpu_rejects_jpeg_without_opt_in_1845(tmp_path):
+    """``compression='jpeg'`` without the opt-in raises ``ValueError``."""
+    da = _make_rgb_uint8_da_1845()
+    path = str(tmp_path / "rejected_issue_D_1845.tif")
+
+    with pytest.raises(ValueError, match="JPEGTables"):
+        write_geotiff_gpu(da, path, compression='jpeg')
+
+
+def test_write_geotiff_gpu_rejects_jpeg_message_mentions_alternatives_1845(tmp_path):
+    """The rejection error points callers at alternatives (deflate, zstd)."""
+    da = _make_rgb_uint8_da_1845()
+    path = str(tmp_path / "rejected_msg_issue_D_1845.tif")
+
+    with pytest.raises(ValueError) as exc:
+        write_geotiff_gpu(da, path, compression='jpeg')
+
+    msg = str(exc.value)
+    assert "deflate" in msg
+    assert "zstd" in msg
+
+
+def test_write_geotiff_gpu_rejects_jpeg_case_insensitive_1845(tmp_path):
+    """Upper-case ``compression='JPEG'`` is rejected too."""
+    da = _make_rgb_uint8_da_1845()
+    path = str(tmp_path / "rejected_upper_issue_D_1845.tif")
+
+    with pytest.raises(ValueError, match="JPEGTables"):
+        write_geotiff_gpu(da, path, compression='JPEG')
+
+
+@requires_gpu
+def test_write_geotiff_gpu_jpeg_opt_in_emits_warning_1845(tmp_path):
+    """``allow_internal_only_jpeg=True`` emits ``GeoTIFFFallbackWarning``."""
+    da = _make_rgb_uint8_da_1845()
+    path = str(tmp_path / "opt_in_issue_D_1845.tif")
+
+    with pytest.warns(GeoTIFFFallbackWarning, match="JPEGTables"):
+        write_geotiff_gpu(
+            da, path,
+            compression='jpeg',
+            allow_internal_only_jpeg=True,
+        )
+
+    assert os.path.exists(path)
+    assert os.path.getsize(path) > 0
+
+
+@requires_gpu
+def test_write_geotiff_gpu_non_jpeg_unaffected_by_flag_1845(tmp_path):
+    """Setting ``allow_internal_only_jpeg=True`` on a non-JPEG codec is a no-op."""
+    da = _make_rgb_uint8_da_1845()
+    path = str(tmp_path / "non_jpeg_flag_issue_D_1845.tif")
+
+    with _warnings.catch_warnings():
+        _warnings.simplefilter("error", GeoTIFFFallbackWarning)
+        write_geotiff_gpu(
+            da, path,
+            compression='zstd',
+            allow_internal_only_jpeg=True,
+        )
diff --git a/xrspatial/geotiff/tests/test_gpu_jpeg_interop_reject_issue_D_1845.py b/xrspatial/geotiff/tests/test_gpu_jpeg_interop_reject_issue_D_1845.py
deleted file mode 100644
index 2f88b05b8..000000000
--- a/xrspatial/geotiff/tests/test_gpu_jpeg_interop_reject_issue_D_1845.py
+++ /dev/null
@@ -1,151 +0,0 @@
-"""Issue #1845: ``write_geotiff_gpu`` must reject ``compression='jpeg'``
-by default.
-
-Background
-----------
-``to_geotiff`` raises a ``ValueError`` for ``compression='jpeg'`` because
-the encoder writes self-contained JFIF tiles without the TIFF JPEGTables
-tag (347); the resulting files are unreadable by libtiff, GDAL, and
-rasterio. The GPU writer sat in the same module and silently accepted
-the same kwarg, producing the same broken format. The fix introduces an
-``allow_internal_only_jpeg`` opt-in: callers who want the experimental
-internal-reader-only path must ask for it explicitly, and they get a
-``GeoTIFFFallbackWarning`` reminding them the file will not round-trip
-through external readers.
-
-These tests pin the rejection (CPU-only, no CUDA required) and the
-opt-in warning behaviour. The internal-only encode path itself is
-covered by the updated tests in
-``test_gpu_writer_compression_modes_2026_05_11.py`` which run only when
-CUDA is present.
-"""
-from __future__ import annotations
-
-import importlib.util
-
-import numpy as np
-import pytest
-import xarray as xr
-
-from xrspatial.geotiff import GeoTIFFFallbackWarning, write_geotiff_gpu
-
-
-def _gpu_available() -> bool:
-    if importlib.util.find_spec("cupy") is None:
-        return False
-    try:
-        import cupy
-        return bool(cupy.cuda.is_available())
-    except Exception:
-        return False
-
-
-_HAS_GPU = _gpu_available()
-
-
-def _make_rgb_uint8_da() -> xr.DataArray:
-    """64x64x3 uint8 RGB raster suitable for the JPEG encode path."""
-    rng = np.random.RandomState(0)
-    arr = rng.randint(0, 256, size=(64, 64, 3), dtype=np.uint8)
-    return xr.DataArray(
-        arr,
-        dims=("y", "x", "band"),
-        coords={
-            "y": np.arange(64, dtype=np.float64),
-            "x": np.arange(64, dtype=np.float64),
-            "band": np.array([1, 2, 3], dtype=np.int32),
-        },
-    )
-
-
-def test_write_geotiff_gpu_rejects_jpeg_without_opt_in(tmp_path):
-    """``compression='jpeg'`` without the opt-in raises ``ValueError``.
-
-    Mirrors the ``to_geotiff`` rejection so both writers in the same
-    module agree about JPEG-in-TIFF interop. The check runs before any
-    GPU work, so the test does not need CUDA.
-    """
-    da = _make_rgb_uint8_da()
-    path = str(tmp_path / "rejected_issue_D_1845.tif")
-
-    with pytest.raises(ValueError, match="JPEGTables"):
-        write_geotiff_gpu(da, path, compression='jpeg')
-
-
-def test_write_geotiff_gpu_rejects_jpeg_message_mentions_alternatives(tmp_path):
-    """The rejection error mentions the same alternative codecs that
-    ``to_geotiff`` recommends, so callers landing on either entry point
-    learn what to switch to."""
-    da = _make_rgb_uint8_da()
-    path = str(tmp_path / "rejected_msg_issue_D_1845.tif")
-
-    with pytest.raises(ValueError) as exc:
-        write_geotiff_gpu(da, path, compression='jpeg')
-
-    # The shared message wording from to_geotiff. If the two writers
-    # drift apart, callers reading the error get inconsistent advice.
-    msg = str(exc.value)
-    assert "deflate" in msg
-    assert "zstd" in msg
-
-
-def test_write_geotiff_gpu_rejects_jpeg_case_insensitive(tmp_path):
-    """Upper-case ``compression='JPEG'`` is rejected too.
-
-    ``to_geotiff`` lower-cases the compression string before comparing,
-    so the GPU writer must follow the same rule -- otherwise a caller
-    who types ``'JPEG'`` slips past the gate."""
-    da = _make_rgb_uint8_da()
-    path = str(tmp_path / "rejected_upper_issue_D_1845.tif")
-
-    with pytest.raises(ValueError, match="JPEGTables"):
-        write_geotiff_gpu(da, path, compression='JPEG')
-
-
-@pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required")
-def test_write_geotiff_gpu_jpeg_opt_in_emits_warning(tmp_path):
-    """Setting ``allow_internal_only_jpeg=True`` proceeds with the JPEG
-    encode and emits ``GeoTIFFFallbackWarning``.
-
-    The warning is the only signal the caller gets that their file may
-    not round-trip through GDAL or rasterio, so a missing warning here
-    would let the footgun back in.
-    """
-    da = _make_rgb_uint8_da()
-    path = str(tmp_path / "opt_in_issue_D_1845.tif")
-
-    with pytest.warns(GeoTIFFFallbackWarning, match="JPEGTables"):
-        write_geotiff_gpu(
-            da, path,
-            compression='jpeg',
-            allow_internal_only_jpeg=True,
-        )
-
-    # File was actually written, not just warned-about.
-    import os
-    assert os.path.exists(path)
-    assert os.path.getsize(path) > 0
-
-
-def test_write_geotiff_gpu_non_jpeg_unaffected_by_flag(tmp_path):
-    """Setting ``allow_internal_only_jpeg=True`` on a non-JPEG codec is a
-    no-op (no warning, no error).
-
-    The flag is JPEG-specific; other codecs must not pay any cost for
-    it being present in the signature."""
-    pytest.importorskip("cupy")
-    if not _HAS_GPU:
-        pytest.skip("cupy + CUDA required")
-
-    da = _make_rgb_uint8_da()
-    path = str(tmp_path / "non_jpeg_flag_issue_D_1845.tif")
-
-    import warnings as _warnings
-    with _warnings.catch_warnings():
-        _warnings.simplefilter("error", GeoTIFFFallbackWarning)
-        # Should not raise (no JPEG-related warning fires).
-        write_geotiff_gpu(
-            da, path,
-            compression='zstd',
-            allow_internal_only_jpeg=True,
-        )
diff --git a/xrspatial/geotiff/tests/test_jpeg_gpu_1549.py b/xrspatial/geotiff/tests/test_jpeg_gpu_1549.py
deleted file mode 100644
index 1b33fe33d..000000000
--- a/xrspatial/geotiff/tests/test_jpeg_gpu_1549.py
+++ /dev/null
@@ -1,266 +0,0 @@
-"""GPU regression test for issue #1549.
-
-Reading a 3-band tiled JPEG GeoTIFF with ``open_geotiff(..., gpu=True)``
-crashed inside the nvJPEG decode kernel with::
-
-    cupy.cuda.runtime.CUDARuntimeError: cudaErrorIllegalAddress
-
-The crash was sticky -- it poisoned the CUDA context so every later GPU
-call in the same process also failed.  Root cause: the
-``nvjpegOutputFormat_t`` constants in ``_gpu_decode.py`` were defined
-two values lower than the SDK's enum.  The wrapper sent ``3`` thinking
-it was ``NVJPEG_OUTPUT_RGBI`` (interleaved RGB), but ``3`` is
-``NVJPEG_OUTPUT_RGB`` (planar) in the real SDK.  nvJPEG then wrote the
-G and B planes through ``nvjpegImage.channel[1]`` and
-``channel[2]``, which the wrapper had set to NULL for an interleaved
-output buffer, producing an out-of-bounds GPU write inside
-``ycbcr_to_format_kernel_roi``.
-
-The same off-by-two affected the single-band path: it sent ``5``
-(thinking ``NVJPEG_OUTPUT_UNCHANGED``) which is actually
-``NVJPEG_OUTPUT_RGBI`` -- nvJPEG produced 3-byte-per-pixel output into a
-1-byte-per-pixel buffer, returning visibly wrong pixels rather than
-crashing.
-
-These tests build the exact reproducer from the issue, decode it on GPU,
-and verify (a) the decode does not crash, (b) the GPU pixels match the
-CPU pixels within the typical libjpeg/nvjpeg rounding tolerance, and
-(c) the CUDA context survives a follow-up GPU read of an unrelated
-file.
-"""
-from __future__ import annotations
-
-import importlib.util
-
-import numpy as np
-import pytest
-
-
-def _gpu_available() -> bool:
-    """True when cupy is importable and CUDA is initialised."""
-    if importlib.util.find_spec("cupy") is None:
-        return False
-    try:
-        import cupy
-        return bool(cupy.cuda.is_available())
-    except Exception:
-        return False
-
-
-def _nvjpeg_available() -> bool:
-    """True when libnvjpeg.so loads on this host.
-
-    Without nvJPEG the GPU pipeline silently falls back to CPU Pillow
-    decode, so the regression for issue #1549 (an out-of-bounds write
-    inside the nvJPEG kernel) would never be exercised. Skip rather
-    than test a path the bug never lived on.
-    """
-    if not _gpu_available():
-        return False
-    try:
-        from xrspatial.geotiff._gpu_decode import _get_nvjpeg
-        return _get_nvjpeg() is not None
-    except Exception:
-        return False
-
-
-_HAS_GPU = _gpu_available()
-_HAS_TIFFFILE = importlib.util.find_spec("tifffile") is not None
-_HAS_PIL = importlib.util.find_spec("PIL") is not None
-# tifffile.imwrite(compression='jpeg') delegates the codec to imagecodecs
-# (or libjpeg via Pillow on some installs); when neither is wired up the
-# write raises and the suite would error instead of skipping cleanly.
-_HAS_IMAGECODECS = importlib.util.find_spec("imagecodecs") is not None
-_HAS_NVJPEG = _nvjpeg_available()
-
-_gpu_only = pytest.mark.skipif(
-    not (_HAS_GPU and _HAS_TIFFFILE and _HAS_PIL
-         and _HAS_IMAGECODECS and _HAS_NVJPEG),
-    reason="cupy + CUDA + tifffile + Pillow + imagecodecs + nvJPEG required",
-)
-
-
-def _write_jpeg_rgb_tiff(path: str, seed: int = 0,
-                         noise: bool = True) -> np.ndarray:
-    """Write a 3-band 256x256 tiled JPEG TIFF using tifffile.
-
-    tifffile emits a complete JFIF stream (SOI + APP0 + DQT + DHT + SOF0
-    + ...) per tile rather than a JPEGTables-style abbreviated stream,
-    so the GPU decode path runs without any extra splice step.
-
-    With ``noise=True`` the payload is uniform random bytes -- the
-    worst-case input for JPEG compression and useful for the
-    crash-reproducer test which only cares about safe completion.
-    With ``noise=False`` the payload is a smooth gradient where
-    libjpeg and nvjpeg agree to within a few LSBs per pixel; that lets
-    the cross-backend match assertion run with a tight tolerance.
-    """
-    import tifffile
-    if noise:
-        rng = np.random.default_rng(seed)
-        arr = rng.integers(0, 256, size=(256, 256, 3), dtype=np.uint8)
-    else:
-        # Smooth gradient: per-channel ramp + cross terms.
-        ys, xs = np.mgrid[0:256, 0:256].astype(np.int32)
-        r = (ys + xs) // 2
-        g = ys
-        b = xs
-        arr = np.stack([r, g, b], axis=2).clip(0, 255).astype(np.uint8)
-    tifffile.imwrite(path, arr, photometric='rgb', tile=(128, 128),
-                     compression='jpeg')
-    return arr
-
-
-def _write_jpeg_gray_tiff(path: str, seed: int = 42) -> np.ndarray:
-    """Write a 1-band 256x256 tiled JPEG TIFF using tifffile."""
-    import tifffile
-    rng = np.random.default_rng(seed)
-    arr = rng.integers(0, 256, size=(256, 256), dtype=np.uint8)
-    tifffile.imwrite(path, arr, photometric='minisblack', tile=(128, 128),
-                     compression='jpeg')
-    return arr
-
-
-@_gpu_only
-def test_rgb_jpeg_gpu_no_crash(tmp_path, monkeypatch):
-    """3-band JPEG must not raise CUDARuntimeError on GPU read.
-
-    Uses ``gpu='strict'`` so the original
-    ``cudaErrorIllegalAddress`` would propagate up the stack instead of
-    being swallowed by the auto-mode CPU fallback.  Without strict the
-    bug presented as a ``RuntimeWarning: GPU decode failed (...);
-    falling back to CPU`` and the returned array was the CPU array, so
-    the assertions below would still pass even with the bug present.
-
-    Also spies on ``_try_nvjpeg_batch_decode`` to fail loudly if the
-    decode took a CPU fallback path instead of nvJPEG.  Without this
-    guard the test would pass on a system whose nvJPEG returned None for
-    any reason, defeating the point of the regression test.
-    """
-    import cupy
-
-    from xrspatial.geotiff import _gpu_decode, read_geotiff_gpu
-
-    spy = {"calls": 0, "successes": 0}
-    original = _gpu_decode._try_nvjpeg_batch_decode
-
-    def wrapped(*args, **kwargs):
-        spy["calls"] += 1
-        result = original(*args, **kwargs)
-        if result is not None:
-            spy["successes"] += 1
-        return result
-
-    monkeypatch.setattr(_gpu_decode, "_try_nvjpeg_batch_decode", wrapped)
-
-    path = str(tmp_path / "rgb_jpeg_1549.tif")
-    _write_jpeg_rgb_tiff(path)
-
-    arr = read_geotiff_gpu(path, gpu='strict', allow_internal_only_jpeg=True)
-    # Materialise the GPU buffer so any deferred kernel actually runs
-    # and surface any sticky error from the decode pipeline.
-    assert isinstance(arr.data, cupy.ndarray)
-    decoded = arr.data.get()
-    assert decoded.shape == (256, 256, 3)
-    assert decoded.dtype == np.uint8
-
-    assert spy["calls"] >= 1, (
-        "nvJPEG branch was never called — test did not exercise the "
-        "code path the #1549 fix lives on"
-    )
-    assert spy["successes"] >= 1, (
-        "nvJPEG returned None — CPU Pillow fallback ran and the fix was "
-        "not exercised"
-    )
-
-
-@_gpu_only
-def test_rgb_jpeg_gpu_matches_cpu(tmp_path):
-    """GPU pixels must be within JPEG decoder tolerance of CPU pixels.
-
-    With a smooth gradient input the libjpeg (CPU) and nvjpeg (GPU)
-    decoders agree to a couple of LSBs per pixel.  The off-by-two
-    constant bug scrambled channels enough to push the mean diff above
-    60 and the max diff above 200, so a tight bound here pins both the
-    constant fix and the per-tile sync that keeps the multi-tile
-    decode deterministic.
-    """
-    from xrspatial.geotiff import open_geotiff
-
-    path = str(tmp_path / "rgb_jpeg_match_1549.tif")
-    _write_jpeg_rgb_tiff(path, noise=False)
-
-    cpu = open_geotiff(path, allow_internal_only_jpeg=True)
-    gpu = open_geotiff(path, gpu=True, allow_internal_only_jpeg=True)
-    assert cpu.shape == gpu.shape == (256, 256, 3)
-
-    cpu_arr = np.asarray(cpu.data)
-    gpu_arr = np.asarray(gpu.data.get())
-
-    diff = np.abs(cpu_arr.astype(int) - gpu_arr.astype(int))
-    assert diff.mean() < 1.0, f"mean diff {diff.mean():.3f} too large"
-    assert diff.max() < 8, f"max diff {diff.max()} too large"
-
-
-@_gpu_only
-def test_grayscale_jpeg_gpu_matches_cpu(tmp_path):
-    """Single-band JPEG GPU read must also produce correct pixels.
-
-    With the off-by-two constants the single-band path silently produced
-    wrong output (each Y value duplicated three times then wrapped) -- a
-    quieter failure mode than the 3-band crash but a corruption
-    nonetheless.
-    """
-    from xrspatial.geotiff import open_geotiff
-
-    path = str(tmp_path / "gray_jpeg_1549.tif")
-    _write_jpeg_gray_tiff(path)
-
-    cpu = open_geotiff(path, allow_internal_only_jpeg=True)
-    gpu = open_geotiff(path, gpu=True, allow_internal_only_jpeg=True)
-    assert cpu.shape == gpu.shape == (256, 256)
-
-    cpu_arr = np.asarray(cpu.data)
-    gpu_arr = np.asarray(gpu.data.get())
-    diff = np.abs(cpu_arr.astype(int) - gpu_arr.astype(int))
-    # For grayscale there is no chroma involved, so libjpeg and nvjpeg
-    # only diverge by IDCT rounding (typically <= 1 LSB).
-    assert diff.max() <= 2, (
-        f"grayscale max diff {diff.max()} indicates corruption, "
-        f"not just rounding"
-    )
-
-
-@_gpu_only
-def test_cuda_context_survives_after_jpeg_gpu_read(tmp_path):
-    """Verify the CUDA context is healthy after a GPU JPEG read.
-
-    Before the fix, the failing nvJPEG kernel left the context in an
-    error state so every later GPU call in the same process raised
-    ``cudaErrorIllegalAddress`` -- even unrelated allocations.  This
-    test reads the JPEG on GPU, then performs a small follow-up GPU
-    operation and an unrelated GPU read and asserts both succeed.
-    """
-    import cupy
-
-    from xrspatial.geotiff import open_geotiff
-
-    path = str(tmp_path / "rgb_ctx_1549.tif")
-    _write_jpeg_rgb_tiff(path)
-
-    arr = open_geotiff(path, gpu=True, allow_internal_only_jpeg=True)
-    _ = arr.data.get()
-
-    # Plain CuPy op -- this is the call that used to surface the sticky
-    # cudaErrorIllegalAddress on its first allocation.
-    x = cupy.arange(1024, dtype=cupy.float32)
-    s = float(cupy.sum(x).item())
-    assert s == 1023 * 1024 / 2
-
-    # Unrelated GPU TIFF read -- closes the loop on the issue's
-    # "every later GPU call fails" symptom.
-    other_path = str(tmp_path / "other_1549.tif")
-    _write_jpeg_gray_tiff(other_path, seed=7)
-    other = open_geotiff(other_path, gpu=True, allow_internal_only_jpeg=True)
-    assert other.shape == (256, 256)
-    assert other.dtype == np.uint8
diff --git a/xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py b/xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py
deleted file mode 100644
index 62ff1ad1f..000000000
--- a/xrspatial/geotiff/tests/test_lerc_valid_mask_gpu.py
+++ /dev/null
@@ -1,222 +0,0 @@
-"""GPU follow-up to PR #1529 (LERC valid-mask on decode).
-
-The CPU LERC reader honours the LERC valid-mask and writes the file's
-nodata sentinel into masked pixels.  The GPU LERC tile-decode path used
-to discard the mask, so masked pixels read back as LERC's zero fill
-(real-looking measurements at z == 0) on GPU but as NaN/sentinel on
-CPU.  These tests confirm the GPU path now matches the CPU path for
-representative LERC mask combinations.
-
-Mirrors the structure of ``test_lerc_valid_mask.py`` but compares
-``read_geotiff_gpu`` output to ``read_to_array`` output for each case.
-"""
-from __future__ import annotations
-
-import importlib.util
-
-import numpy as np
-import pytest
-
-lerc = pytest.importorskip("lerc")
-
-from xrspatial.geotiff._compression import LERC_AVAILABLE  # noqa: E402
-
-
-def _gpu_available() -> bool:
-    """True if cupy is importable and CUDA is initialised."""
-    if importlib.util.find_spec("cupy") is None:
-        return False
-    try:
-        import cupy
-        return bool(cupy.cuda.is_available())
-    except Exception:
-        return False
-
-
-_HAS_GPU = _gpu_available()
-_gpu_only = pytest.mark.skipif(
-    not (_HAS_GPU and LERC_AVAILABLE),
-    reason="cupy + CUDA + lerc required",
-)
-
-
-@pytest.fixture
-def lerc_writer_with_mask(monkeypatch):
-    """Patch ``lerc_compress`` to embed a valid-mask the writer can't pass.
-
-    The xrspatial writer hard-codes ``hasMask=False`` in its call to
-    ``lerc.encode``.  Tests inject a per-tile mask through this holder's
-    ``invalid`` predicate so the masked pixels survive the encode and
-    show up at decode time.  Same pattern as the CPU test fixture in
-    ``test_lerc_valid_mask.py``.
-    """
-    holder = {"invalid": None}
-
-    def _patched(data, width, height, samples=1,
-                 dtype=np.dtype('float32'), max_z_error=0.0):
-        if samples == 1:
-            arr = np.frombuffer(data, dtype=dtype).reshape(height, width)
-        else:
-            arr = np.frombuffer(data, dtype=dtype).reshape(
-                height, width, samples)
-        invalid_pred = holder["invalid"]
-        if invalid_pred is None:
-            mask = None
-            has_mask = False
-        else:
-            invalid = invalid_pred(arr)
-            mask = np.where(invalid, np.uint8(0), np.uint8(1))
-            has_mask = True
-        result = lerc.encode(arr, samples, has_mask, mask, max_z_error, 1)
-        if result[0] != 0:
-            raise RuntimeError(
-                f"LERC encode failed with error code {result[0]}")
-        return bytes(result[2])
-
-    monkeypatch.setattr(
-        "xrspatial.geotiff._compression.lerc_compress", _patched,
-    )
-    return holder
-
-
-def _read_cpu_gpu(path):
-    """Read *path* with both readers and return ``(cpu_array, gpu_host_array)``.
-
-    Uses the low-level ``read_to_array`` for CPU so that nodata sentinels
-    stay as the literal value (this module checks LERC mask preservation,
-    not the higher-level NaN promotion that ``open_geotiff`` performs).
-
-    The GPU reader (``read_geotiff_gpu``) applies the same nodata masking
-    that ``open_geotiff`` does (PR #1542), so its output uses NaN where
-    the sentinel was. Callers that want a bit-for-bit comparison against
-    the low-level CPU read should run the GPU result through
-    ``_restore_sentinel`` below to put the sentinel back.
-    """
-    from xrspatial.geotiff import read_geotiff_gpu
-    from xrspatial.geotiff._reader import read_to_array
-
-    cpu, _geo = read_to_array(path, allow_experimental_codecs=True)
-    gpu_da = read_geotiff_gpu(
-        path, gpu='strict', allow_experimental_codecs=True,
-    )
-    gpu_host = gpu_da.data.get()
-    return cpu, gpu_host
-
-
-def _restore_sentinel(arr, nodata):
-    """Replace NaN positions in *arr* with *nodata* so high-level GPU
-    reads compare bit-exactly against low-level CPU reads (which keep
-    the sentinel value verbatim)."""
-    if nodata is None or arr.dtype.kind != 'f' or np.isnan(nodata):
-        return arr
-    out = arr.copy()
-    out[np.isnan(out)] = arr.dtype.type(nodata)
-    return out
-
-
-@_gpu_only
-class TestGpuLercValidMask:
-    """End-to-end TIFF round-trips comparing GPU vs CPU output."""
-
-    def test_float32_nan_nodata(self, tmp_path, lerc_writer_with_mask):
-        """Float32 LERC + NaN nodata: GPU output matches CPU output."""
-        from xrspatial.geotiff._writer import write
-
-        arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8)
-        invalid_positions = {(0, 1), (5, 4)}
-
-        def invalid_pred(a):
-            m = np.zeros(a.shape[:2], dtype=bool)
-            for r, c in invalid_positions:
-                m[r, c] = True
-            return m
-        lerc_writer_with_mask["invalid"] = invalid_pred
-
-        path = str(tmp_path / "lerc_mask_nan_gpu.tif")
-        write(arr, path, compression="lerc", tiled=True, tile_size=8,
-              nodata=float("nan"))
-
-        cpu, gpu = _read_cpu_gpu(path)
-        # NaN positions
-        for (r, c) in invalid_positions:
-            assert np.isnan(cpu[r, c])
-            assert np.isnan(gpu[r, c])
-        # Valid positions agree exactly
-        cpu_valid = np.where(np.isnan(cpu), 0.0, cpu)
-        gpu_valid = np.where(np.isnan(gpu), 0.0, gpu)
-        np.testing.assert_array_equal(cpu_valid, gpu_valid)
-
-    def test_float32_sentinel_nodata(self, tmp_path, lerc_writer_with_mask):
-        """Float32 LERC + sentinel nodata (-9999): GPU matches CPU."""
-        from xrspatial.geotiff._writer import write
-
-        arr = np.arange(1, 65, dtype=np.float32).reshape(8, 8)
-        invalid_positions = {(0, 1), (3, 3), (7, 7)}
-
-        def invalid_pred(a):
-            m = np.zeros(a.shape[:2], dtype=bool)
-            for r, c in invalid_positions:
-                m[r, c] = True
-            return m
-        lerc_writer_with_mask["invalid"] = invalid_pred
-
-        path = str(tmp_path / "lerc_mask_sentinel_f32_gpu.tif")
-        write(arr, path, compression="lerc", tiled=True, tile_size=8,
-              nodata=-9999.0)
-
-        cpu, gpu = _read_cpu_gpu(path)
-        # ``read_geotiff_gpu`` applies the high-level nodata mask (#1542),
-        # so masked pixels come back as NaN. ``read_to_array`` keeps the
-        # sentinel verbatim. Restore the sentinel on the GPU side so the
-        # bit-for-bit comparison still pins LERC mask preservation.
-        gpu_with_sentinel = _restore_sentinel(gpu, -9999.0)
-        np.testing.assert_array_equal(cpu, gpu_with_sentinel)
-        for (r, c) in invalid_positions:
-            assert np.isnan(gpu[r, c])
-            assert gpu_with_sentinel[r, c] == np.float32(-9999.0)
-
-    def test_uint16_sentinel_nodata(self, tmp_path, lerc_writer_with_mask):
-        """Uint16 LERC + sentinel nodata (65535): GPU matches CPU."""
-        from xrspatial.geotiff._writer import write
-
-        arr = (np.arange(1, 65, dtype=np.uint16) * 100).reshape(8, 8)
-        invalid_positions = {(0, 1), (4, 4)}
-
-        def invalid_pred(a):
-            m = np.zeros(a.shape[:2], dtype=bool)
-            for r, c in invalid_positions:
-                m[r, c] = True
-            return m
-        lerc_writer_with_mask["invalid"] = invalid_pred
-
-        path = str(tmp_path / "lerc_mask_uint16_gpu.tif")
-        write(arr, path, compression="lerc", tiled=True, tile_size=8,
-              nodata=65535)
-
-        cpu, gpu = _read_cpu_gpu(path)
-        # ``read_geotiff_gpu`` applies the high-level nodata mask on
-        # integer rasters (#1542): the array is promoted to float64 with
-        # NaN where the sentinel was. ``read_to_array`` keeps uint16 with
-        # the sentinel literal. Restore the sentinel + dtype on the GPU
-        # side so the bit-for-bit comparison still pins LERC mask
-        # preservation. Replace NaN before the uint16 cast to avoid
-        # numpy's "invalid value encountered in cast" warning.
-        assert gpu.dtype == np.float64
-        gpu_no_nan = np.where(np.isnan(gpu), 65535.0, gpu)
-        gpu_u16 = gpu_no_nan.astype(np.uint16)
-        np.testing.assert_array_equal(cpu, gpu_u16)
-        for (r, c) in invalid_positions:
-            assert np.isnan(gpu[r, c])
-            assert gpu_u16[r, c] == np.uint16(65535)
-
-    def test_no_mask_roundtrip_bitexact(self, tmp_path):
-        """All-valid LERC (no encoded mask): GPU and CPU agree bit-exact."""
-        from xrspatial.geotiff._writer import write
-
-        arr = np.arange(64, dtype=np.float32).reshape(8, 8)
-        path = str(tmp_path / "lerc_no_mask_gpu.tif")
-        write(arr, path, compression="lerc", tiled=True, tile_size=8)
-
-        cpu, gpu = _read_cpu_gpu(path)
-        np.testing.assert_array_equal(cpu, arr)
-        np.testing.assert_array_equal(gpu, arr)
diff --git a/xrspatial/geotiff/tests/test_nvcomp_batch_compress_batched_1712.py b/xrspatial/geotiff/tests/test_nvcomp_batch_compress_batched_1712.py
deleted file mode 100644
index df1902f74..000000000
--- a/xrspatial/geotiff/tests/test_nvcomp_batch_compress_batched_1712.py
+++ /dev/null
@@ -1,157 +0,0 @@
-"""Coverage for the batched ``_nvcomp_batch_compress`` (#1712).
-
-The pre-fix function allocated compressed-output device buffers one
-``cupy.empty`` per tile and then read each tile back to host with one
-``.get()`` per tile. Both patterns serialised on the default CUDA
-stream and were dominant in large-N writes. The fix folds both into a
-single contiguous device allocation + a single batched D2H concat-and-
-``.get()``, matching the patterns already in use on the decode side
-(#1552, #1659).
-
-These tests pin the new shape and confirm the deflate / zstd GPU write
-paths still round-trip end-to-end.
-"""
-from __future__ import annotations
-
-import importlib.util
-import inspect
-import os
-import tempfile
-
-import numpy as np
-import pytest
-import xarray as xr
-
-try:
-    import cupy
-    _HAS_CUPY = True
-except Exception:
-    _HAS_CUPY = False
-
-
-def _gpu_available() -> bool:
-    """Match the geotiff-test convention: cupy import AND working CUDA.
-
-    A host can have cupy installed without a usable CUDA runtime (no
-    driver, no device visible, container misconfig), and in that case
-    every test that calls into the GPU writer would fail rather than
-    skip. ``cupy.cuda.is_available()`` is the cheap probe.
-    """
-    if importlib.util.find_spec("cupy") is None:
-        return False
-    try:
-        import cupy
-        return bool(cupy.cuda.is_available())
-    except Exception:
-        return False
-
-
-_HAS_GPU = _gpu_available()
-
-# nvCOMP is the entry point that exercises this code path.
-from xrspatial.geotiff import _gpu_decode  # noqa: E402
-
-needs_cupy = pytest.mark.skipif(
-    not _HAS_GPU, reason="cupy + CUDA required"
-)
-
-
-# ----------------------------------------------------------------------
-# Source-level structural assertions -- run on any host with the source
-# available, no GPU required.
-# ----------------------------------------------------------------------
-
-def test_no_per_tile_cupy_empty_in_compressed_pool():
-    """The per-tile cupy.empty list comprehension is gone (#1712).
-
-    The fix replaced it with a single contiguous allocation. Catch any
-    regression that brings the loop back.
-    """
-    source = inspect.getsource(_gpu_decode._nvcomp_batch_compress)
-    assert "cupy.empty(max_cs, dtype=cupy.uint8) for _ in range" not in source, (
-        "_nvcomp_batch_compress regressed to per-tile cupy.empty "
-        "allocations for the compressed output pool. See #1712."
-    )
-
-
-def test_no_per_tile_get_in_result_loop():
-    """The per-tile ``d_comp_bufs[i][:cs].get().tobytes()`` is gone (#1712).
-
-    The fix replaced it with one concat + one ``.get()``. Catch any
-    regression that brings the per-tile pattern back.
-    """
-    source = inspect.getsource(_gpu_decode._nvcomp_batch_compress)
-    # The exact string the prior loop used:
-    bad_fragment = "d_comp_bufs[i][:cs].get().tobytes()"
-    assert bad_fragment not in source, (
-        "_nvcomp_batch_compress regressed to per-tile .get().tobytes() "
-        "D2H readback. See #1712."
-    )
-
-
-# ----------------------------------------------------------------------
-# End-to-end behaviour: GPU write + read round-trip stays correct
-# ----------------------------------------------------------------------
-
-@needs_cupy
-@pytest.mark.parametrize("compression", ["deflate", "zstd"])
-def test_gpu_write_roundtrip_after_batched_compress(compression):
-    """GPU compress path round-trips uncorrupted for deflate + zstd.
-
-    Catches the most likely regression mode: any off-by-one in the
-    cumulative-sum offsets used to slice the host-side concatenated
-    buffer would scramble tile order, which a round-trip equality
-    check picks up immediately.
-    """
-    from xrspatial.geotiff import open_geotiff, write_geotiff_gpu
-
-    rng = np.random.default_rng(seed=1712)
-    arr_cpu = rng.random((512, 512), dtype=np.float32)
-    arr_gpu = cupy.asarray(arr_cpu)
-    darr = xr.DataArray(arr_gpu, dims=["y", "x"])
-
-    with tempfile.TemporaryDirectory(prefix="nvcomp_batch_1712_") as td:
-        path = os.path.join(td, f"roundtrip_{compression}.tif")
-        try:
-            write_geotiff_gpu(
-                darr, path,
-                compression=compression,
-                tiled=True,
-                tile_size=64,
-            )
-        except RuntimeError as e:
-            # nvCOMP may be unavailable in this environment; the writer
-            # falls back to CPU and that path doesn't exercise the
-            # change. Skip rather than fail.
-            pytest.skip(f"nvCOMP unavailable for {compression}: {e}")
-
-        back = open_geotiff(path)
-        np.testing.assert_allclose(back.values, arr_cpu, rtol=0, atol=0)
-
-
-@needs_cupy
-def test_gpu_write_zero_tile_edge_case():
-    """A 0-tile compress returns an empty list without indexing into None.
-
-    The cumulative-sum / concat path must short-circuit before
-    ``cupy.concatenate`` (which would raise on an empty list). The
-    pre-fix loop simply iterated zero times, so the contract is the
-    same empty-list output.
-    """
-    # Direct call into the internal function with n_tiles=0 is
-    # awkward because it needs a libnvCOMP handle and matching opts.
-    # Instead, exercise the public writer with a tiny single-tile
-    # input and confirm the fast path does not crash. Real n_tiles==0
-    # never occurs via the writer (every image has at least one tile).
-    from xrspatial.geotiff import open_geotiff, write_geotiff_gpu
-    arr_gpu = cupy.zeros((32, 32), dtype=cupy.float32)
-    darr = xr.DataArray(arr_gpu, dims=["y", "x"])
-    with tempfile.TemporaryDirectory(prefix="nvcomp_batch_1712_") as td:
-        path = os.path.join(td, "tiny.tif")
-        try:
-            write_geotiff_gpu(darr, path, compression="zstd",
-                              tiled=True, tile_size=32)
-        except RuntimeError as e:
-            pytest.skip(f"nvCOMP unavailable: {e}")
-        back = open_geotiff(path)
-        assert back.shape == (32, 32)
diff --git a/xrspatial/geotiff/tests/test_nvcomp_batch_upload_p3.py b/xrspatial/geotiff/tests/test_nvcomp_batch_upload_p3.py
deleted file mode 100644
index 3b6009f60..000000000
--- a/xrspatial/geotiff/tests/test_nvcomp_batch_upload_p3.py
+++ /dev/null
@@ -1,206 +0,0 @@
-"""Regression tests for batched host->device upload in the nvCOMP path.
-
-Performance audit P3: ``_try_nvcomp_batch_decompress`` previously did one
-``cupy.asarray`` per compressed tile, costing ~6.07 ms for 256 x 64 KB
-tiles. The fix concatenates all tiles into a single host buffer, performs
-one H2D transfer, and derives per-tile device pointers via
-``base_ptr + offsets`` -- mirroring the pattern at
-``_gpu_decode.py`` L1714-1722 in the LZW/Deflate path. Measured ~1.66x
-speedup that scales worse with more tiles.
-
-The tests skip cleanly when neither libnvcomp nor kvikio.nvcomp is
-available on the host -- without one of those, the GPU decoder falls
-back to the per-tile numba kernel and the changed code path is not
-exercised, so a passing test would be misleading. When the path *is*
-available, the correctness test additionally asserts that
-``_try_nvcomp_batch_decompress`` returned a non-None CuPy array, so a
-silent fall-through to the slow path counts as a test failure.
-"""
-from __future__ import annotations
-
-import importlib.util
-import time
-import uuid
-
-import numpy as np
-import pytest
-
-
-def _gpu_available() -> bool:
-    if importlib.util.find_spec("cupy") is None:
-        return False
-    try:
-        import cupy
-        return bool(cupy.cuda.is_available())
-    except Exception:
-        return False
-
-
-def _kvikio_nvcomp_importable() -> bool:
-    """True iff ``import kvikio.nvcomp`` actually succeeds.
-
-    ``importlib.util.find_spec`` may report kvikio as installed even when
-    the underlying ``libkvikio.so`` is missing, so we attempt the real
-    import here.
-    """
-    try:
-        import kvikio.nvcomp  # noqa: F401
-    except Exception:
-        return False
-    return True
-
-
-def _nvcomp_path_available() -> bool:
-    """True when at least one nvCOMP backend is loadable on this host.
-
-    The optimised code path runs only when either the C nvCOMP library
-    (``libnvcomp.so``) or ``kvikio.nvcomp`` is importable. Without one
-    of those, ``_try_nvcomp_batch_decompress`` always returns None and
-    timing/correctness tests would silently exercise the slower fallback
-    decoder.
-    """
-    if not _gpu_available():
-        return False
-    try:
-        from xrspatial.geotiff._gpu_decode import _get_nvcomp
-    except Exception:
-        return False
-    if _get_nvcomp() is not None:
-        return True
-    return _kvikio_nvcomp_importable()
-
-
-_HAS_GPU = _gpu_available()
-_HAS_TIFFFILE = importlib.util.find_spec("tifffile") is not None
-_HAS_NVCOMP = _nvcomp_path_available()
-_nvcomp_only = pytest.mark.skipif(
-    not (_HAS_GPU and _HAS_TIFFFILE and _HAS_NVCOMP),
-    reason="cupy + CUDA + tifffile + (libnvcomp or kvikio.nvcomp) required",
-)
-
-
-def _write_deflate_tiled(path, arr, tile=(256, 256)):
-    import tifffile
-    tifffile.imwrite(
-        str(path), arr, compression="deflate", tile=tile,
-    )
-
-
-def _wrap_nvcomp_with_call_recorder(monkeypatch):
-    """Replace ``_try_nvcomp_batch_decompress`` with a wrapper that records
-    each (compression, returned_non_none) call. Returns the records list."""
-    from xrspatial.geotiff import _gpu_decode
-
-    records: list[tuple[int, bool]] = []
-    original = _gpu_decode._try_nvcomp_batch_decompress
-
-    def _recording(compressed_tiles, tile_bytes, compression):
-        result = original(compressed_tiles, tile_bytes, compression)
-        records.append((compression, result is not None))
-        return result
-
-    monkeypatch.setattr(
-        _gpu_decode,
-        '_try_nvcomp_batch_decompress',
-        _recording,
-        raising=True,
-    )
-    return records
-
-
-@_nvcomp_only
-@pytest.mark.parametrize("size,tile", [
-    (256, (128, 128)),    # 4 tiles
-    (1024, (256, 256)),   # 16 tiles
-    (2048, (128, 128)),   # 256 tiles -- matches the audit measurement
-])
-def test_nvcomp_batch_upload_correctness(tmp_path, monkeypatch, size, tile):
-    """GPU decode of Deflate-tiled TIFFs is bit-exact vs CPU after the
-    batched H2D upload rewrite, AND the nvCOMP fast-path actually ran."""
-    from xrspatial.geotiff import read_geotiff_gpu
-    from xrspatial.geotiff._reader import read_to_array
-
-    rng = np.random.RandomState(20260508)
-    arr = rng.randint(0, 4096, size=(size, size), dtype=np.uint16)
-
-    name = f"deflate_{size}_{tile[0]}_{uuid.uuid4().hex[:8]}.tif"
-    path = tmp_path / name
-    _write_deflate_tiled(path, arr, tile=tile)
-
-    cpu, _ = read_to_array(str(path))
-    np.testing.assert_array_equal(cpu, arr)
-
-    records = _wrap_nvcomp_with_call_recorder(monkeypatch)
-    gpu_da = read_geotiff_gpu(str(path))
-    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
-
-    assert any(success for _, success in records), (
-        "_try_nvcomp_batch_decompress was never invoked or always returned "
-        f"None; records={records}. The optimised path was not exercised, so "
-        f"this test would pass even if the rewrite were broken."
-    )
-
-
-@_nvcomp_only
-def test_nvcomp_kvikio_fallback_skips_zstd(monkeypatch):
-    """When the C nvCOMP lib is missing and kvikio is the only backend,
-    ZSTD-compressed input must NOT take the kvikio DeflateManager path
-    (which would strip a fake zlib header and try to decode ZSTD frames
-    as Deflate). It must return None so the caller can fall through."""
-    import xrspatial.geotiff._gpu_decode as _gpu_decode
-
-    # Force the libnvcomp branch off so the kvikio fallback is the only
-    # path. If kvikio.nvcomp isn't importable, the fallback returns
-    # None for an unrelated reason -- skip in that case.
-    if not _kvikio_nvcomp_importable():
-        pytest.skip("kvikio.nvcomp not importable; the kvikio branch "
-                    "is never entered on this host")
-    monkeypatch.setattr(_gpu_decode, '_get_nvcomp', lambda: None)
-
-    # Pass any bytes; the gate must reject ZSTD before any decode work.
-    result = _gpu_decode._try_nvcomp_batch_decompress(
-        compressed_tiles=[b'\x28\xb5\x2f\xfd' + b'\x00' * 16],
-        tile_bytes=1024,
-        compression=50000,  # ZSTD
-    )
-    assert result is None, (
-        "_try_nvcomp_batch_decompress returned non-None for ZSTD via the "
-        "kvikio fallback; this would feed ZSTD bytes through DeflateManager "
-        "and produce garbage."
-    )
-
-
-@_nvcomp_only
-def test_nvcomp_batch_upload_perf_regression_guard(tmp_path, monkeypatch):
-    """Sanity guard: 2048x2048 Deflate-tiled GPU decode finishes under a
-    generous threshold WHILE going through the nvCOMP fast-path. Skipping
-    when nvCOMP isn't available is handled by ``_nvcomp_only``."""
-    from xrspatial.geotiff import read_geotiff_gpu
-
-    rng = np.random.RandomState(20260508)
-    arr = rng.randint(0, 4096, size=(2048, 2048), dtype=np.uint16)
-    path = tmp_path / f"deflate_2048_perf_{uuid.uuid4().hex[:8]}.tif"
-    _write_deflate_tiled(path, arr, tile=(128, 128))
-
-    # Warm up: first call may JIT-compile kernels and load CUDA libs.
-    _ = read_geotiff_gpu(str(path))
-
-    records = _wrap_nvcomp_with_call_recorder(monkeypatch)
-    t0 = time.perf_counter()
-    out = read_geotiff_gpu(str(path))
-    elapsed = time.perf_counter() - t0
-
-    assert any(success for _, success in records), (
-        "nvCOMP fast-path did not run during the timed call; the threshold "
-        f"is meaningless without it. Records: {records}"
-    )
-
-    # Generous regression threshold; the per-tile upload version was
-    # ~6 ms just for H2D so anything well above 200 ms is a real
-    # regression somewhere in the decode pipeline.
-    assert elapsed < 0.2, (
-        f"read_geotiff_gpu on 2048x2048 deflate-tiled TIFF took "
-        f"{elapsed * 1000:.1f} ms (threshold 200 ms) -- possible "
-        f"regression in the nvCOMP batched H2D upload path"
-    )
-    assert out.shape == (2048, 2048)
diff --git a/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py b/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py
deleted file mode 100644
index 4e0524490..000000000
--- a/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py
+++ /dev/null
@@ -1,155 +0,0 @@
-"""Regression tests for issue #1950.
-
-``_try_nvcomp_batch_decompress`` used to compute its per-tile host
-prefix-sum offsets via a Python ``for`` loop:
-
-```
-comp_sizes_list = [len(t) for t in raw_tiles]
-comp_offsets_h = np.zeros(n_tiles, dtype=np.int64)
-for i in range(1, n_tiles):
-    comp_offsets_h[i] = comp_offsets_h[i - 1] + comp_sizes_list[i - 1]
-```
-
-The sibling batched-D2H helper ``_batched_d2h_to_bytes`` at ~L924 and
-the compress-side prefix sum in ``_nvcomp_batch_compress`` at ~L2572
-both use ``np.cumsum(sizes, out=offsets[1:])``. Aligning the
-decompress side keeps the codebase consistent and trims interpreter
-overhead.
-
-Two guards here:
-
-1. Correctness -- a tiny synthetic nvCOMP round-trip (when the lib is
-   available) still decodes every tile correctly. Without nvCOMP the
-   test exercises the same prefix-sum reshape via direct comparison
-   against ``np.cumsum``.
-2. Structural -- the source uses ``np.cumsum`` (not a Python
-   ``range(1, n_tiles)`` loop) for the prefix sum.
-"""
-from __future__ import annotations
-
-import importlib.util
-import os
-import re
-import tempfile
-
-import numpy as np
-import pytest
-
-
-def test_nvcomp_decompress_uses_cumsum_for_offsets_1950():
-    """Source-level guard against reintroducing the Python for loop.
-
-    The fix swaps the per-tile prefix-sum loop for ``np.cumsum``.
-    This test fires if anyone reverts to the loop or otherwise breaks
-    the alignment with ``_batched_d2h_to_bytes`` / ``_nvcomp_batch_compress``.
-    """
-    import pathlib
-
-    src_path = pathlib.Path(__file__).parent.parent / "_gpu_decode.py"
-    src = src_path.read_text()
-
-    # Anchor on the exact decompress-side prefix-sum call. Regex is
-    # tighter than a fixed character window and survives surrounding
-    # code edits.
-    cumsum_call = re.compile(
-        r"np\.cumsum\(\s*comp_sizes_arr\[:-1\]\s*,\s*"
-        r"out\s*=\s*comp_offsets_h\[1:\]\s*\)"
-    )
-    assert cumsum_call.search(src), (
-        "decompress upload block should use "
-        "``np.cumsum(comp_sizes_arr[:-1], out=comp_offsets_h[1:])`` for "
-        "prefix-sum offsets, aligning with _batched_d2h_to_bytes "
-        "(issue #1950)."
-    )
-    # The legacy Python loop would have written
-    # ``comp_offsets_h[i] = comp_offsets_h[i - 1] + ...`` inside a
-    # ``for i in range(1, n_tiles):`` block.
-    legacy_loop = re.compile(
-        r"for\s+i\s+in\s+range\(\s*1\s*,\s*n_tiles\s*\)\s*:\s*\n"
-        r"\s*comp_offsets_h\[i\]"
-    )
-    assert not legacy_loop.search(src), (
-        "decompress upload block should no longer compute prefix-sum "
-        "offsets with a Python for loop (issue #1950)."
-    )
-
-
-def test_cumsum_matches_loop_prefix_sum_1950():
-    """Equivalence between the vectorised cumsum and the prior loop.
-
-    Numeric guard. Even though the two forms produce the same output
-    by construction, a runtime check confirms the cumsum form does not
-    drift away from the previous semantics across numpy versions.
-    """
-    rng = np.random.RandomState(1950)
-    n = 1024
-    sizes = rng.randint(100, 100_000, size=n).astype(np.int64)
-
-    # Vectorised form (matches the fix).
-    offsets_cumsum = np.zeros(n, dtype=np.int64)
-    if n > 1:
-        np.cumsum(sizes[:-1], out=offsets_cumsum[1:])
-
-    # Reference: explicit Python prefix sum.
-    offsets_loop = np.zeros(n, dtype=np.int64)
-    for i in range(1, n):
-        offsets_loop[i] = offsets_loop[i - 1] + sizes[i - 1]
-
-    np.testing.assert_array_equal(offsets_cumsum, offsets_loop)
-
-
-@pytest.mark.skipif(
-    importlib.util.find_spec("cupy") is None,
-    reason="cupy required for nvCOMP path",
-)
-def test_nvcomp_batch_decompress_roundtrip_1950():
-    """End-to-end check: a deflate-tiled raster still decodes correctly.
-
-    Exercises ``_try_nvcomp_batch_decompress`` on a real file via the
-    public ``read_geotiff_gpu`` entry point. If the prefix-sum
-    refactor mis-stages a tile, the decoded buffer would not match
-    the source, surfacing as a numerical regression here.
-
-    Gated on ``XRSPATIAL_GEOTIFF_STRICT_GPU=1`` so the run-time check
-    only fires in environments that actually carry nvCOMP. Without the
-    flag the GPU read path silently falls back to a CPU codec when
-    nvCOMP is missing, which bypasses the prefix-sum site entirely; a
-    pass under that fallback would be misleading, so we skip instead.
-    """
-    if os.environ.get("XRSPATIAL_GEOTIFF_STRICT_GPU") != "1":
-        pytest.skip(
-            "set XRSPATIAL_GEOTIFF_STRICT_GPU=1 to exercise the nvCOMP "
-            "prefix-sum site; without it the GPU path may fall back to "
-            "a CPU codec and bypass this regression."
-        )
-    try:
-        import cupy
-    except ImportError:
-        pytest.skip("cupy not importable")
-    if not cupy.cuda.is_available():
-        pytest.skip("CUDA device not available")
-
-    import xarray as xr
-
-    from xrspatial.geotiff import open_geotiff, to_geotiff
-
-    rng = np.random.RandomState(1950)
-    height, width = 1024, 1024
-    arr = rng.rand(height, width).astype(np.float32)
-    da = xr.DataArray(
-        arr, dims=["y", "x"],
-        coords={"y": np.arange(height), "x": np.arange(width)},
-        attrs={"crs": 4326},
-    )
-
-    with tempfile.TemporaryDirectory() as td:
-        path = os.path.join(td, "tmp_1950_deflate.tif")
-        to_geotiff(da, path, compression="deflate", tile_size=256)
-
-        # Read back through the GPU pipeline.
-        result = open_geotiff(path, gpu=True)
-        assert result.shape == (height, width)
-        decoded = cupy.asnumpy(result.data) if hasattr(
-            result.data, "get") else np.asarray(result.data)
-
-    np.testing.assert_allclose(decoded, arr, atol=0, rtol=0)
diff --git a/xrspatial/geotiff/tests/test_nvcomp_from_device_bufs_single_alloc_1659.py b/xrspatial/geotiff/tests/test_nvcomp_from_device_bufs_single_alloc_1659.py
deleted file mode 100644
index 1144f58c1..000000000
--- a/xrspatial/geotiff/tests/test_nvcomp_from_device_bufs_single_alloc_1659.py
+++ /dev/null
@@ -1,234 +0,0 @@
-"""Regression tests for the single-buffer pattern in _try_nvcomp_from_device_bufs.
-
-Issue #1659: ``_try_nvcomp_from_device_bufs`` used to allocate N separate
-``cupy.empty(tile_bytes)`` output buffers and run ``cupy.concatenate`` after
-the nvCOMP decompress kernel returned. That kept two copies of the
-decompressed data alive at once and ran a serial concat the other nvCOMP
-paths in this module already avoid. The fix matches the single-contiguous-
-buffer + pointer-offset pattern used by the deflate / LZW / host-buffer
-paths nearby.
-
-These tests skip when CuPy + CUDA are not available. They also skip the
-end-to-end nvCOMP integration check when ``kvikio`` or the nvCOMP shared
-library are not installed, which is the common case on developer hosts;
-the unit-level checks (contract + memory guard) run regardless.
-"""
-from __future__ import annotations
-
-import importlib.util
-
-import numpy as np
-import pytest
-
-from xrspatial.geotiff._gpu_decode import _try_nvcomp_from_device_bufs
-
-
-def _gpu_available() -> bool:
-    if importlib.util.find_spec("cupy") is None:
-        return False
-    try:
-        import cupy
-        return bool(cupy.cuda.is_available())
-    except Exception:
-        return False
-
-
-def _nvcomp_available() -> bool:
-    from xrspatial.geotiff._gpu_decode import _get_nvcomp
-    return _get_nvcomp() is not None
-
-
-@pytest.mark.skipif(not _gpu_available(), reason="cupy + CUDA required")
-def test_unsupported_codec_short_circuits_before_allocation():
-    """Non-ZSTD codecs must return None without allocating output buffers.
-
-    Pins the early-return contract that lets the caller pick a different
-    decoder when nvCOMP cannot handle this codec.
-    """
-    import cupy
-
-    # Use Deflate (8), which is unsupported by this function (ZSTD only).
-    d_tiles = [cupy.zeros(1024, dtype=cupy.uint8) for _ in range(4)]
-    assert _try_nvcomp_from_device_bufs(d_tiles, 1024, 8) is None
-
-
-@pytest.mark.skipif(not _gpu_available(), reason="cupy + CUDA required")
-def test_no_nvcomp_lib_returns_none(monkeypatch):
-    """When the nvCOMP library is missing, the function must return None.
-
-    The caller relies on this signal to fall back to the bytes-based decode
-    path. Without it, callers would hit a ctypes ``getattr`` AttributeError
-    deeper in the function.
-    """
-    import cupy
-
-    from xrspatial.geotiff import _gpu_decode
-
-    monkeypatch.setattr(_gpu_decode, "_get_nvcomp", lambda: None)
-
-    d_tiles = [cupy.zeros(1024, dtype=cupy.uint8)]
-    assert _try_nvcomp_from_device_bufs(d_tiles, 1024, 50000) is None
-
-
-@pytest.mark.skipif(not _gpu_available(), reason="cupy + CUDA required")
-def test_memory_guard_runs_with_full_decomp_size(monkeypatch):
-    """The single-buffer allocation must be size-checked before cupy.empty.
-
-    The new pattern allocates one contiguous ``n * tile_bytes`` buffer
-    instead of N small buffers. The OOM guard is what tells the caller
-    early that the decode will not fit on the device; a regression that
-    removed the guard would surface as an opaque CUDA OOM instead.
-    """
-    import cupy
-
-    from xrspatial.geotiff import _gpu_decode
-
-    seen = {"total_bytes": None, "what": None, "called": False}
-
-    def fake_check(required_bytes, what="tile buffer"):
-        seen["total_bytes"] = int(required_bytes)
-        seen["what"] = what
-        seen["called"] = True
-        raise MemoryError("simulated OOM")
-
-    # Pin _get_nvcomp to something truthy so the function does not bail
-    # before reaching the allocation step. The fake check raises before
-    # any nvCOMP call would happen, so the lib value never gets used.
-    monkeypatch.setattr(_gpu_decode, "_get_nvcomp", lambda: object())
-    monkeypatch.setattr(_gpu_decode, "_check_gpu_memory", fake_check)
-
-    n_tiles = 8
-    tile_bytes = 65536
-    d_tiles = [cupy.zeros(128, dtype=cupy.uint8) for _ in range(n_tiles)]
-
-    with pytest.raises(MemoryError):
-        _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000)
-
-    assert seen["called"], "_check_gpu_memory was not called"
-    expected_bytes = n_tiles * tile_bytes
-    assert seen["total_bytes"] == expected_bytes, (
-        f"expected total {expected_bytes}, got {seen['total_bytes']}"
-    )
-    assert "decompressed" in seen["what"] or "nvCOMP" in seen["what"], (
-        f"unhelpful 'what' label: {seen['what']!r}"
-    )
-
-
-@pytest.mark.skipif(
-    not _gpu_available() or not _nvcomp_available(),
-    reason="cupy + CUDA + nvCOMP shared lib required",
-)
-def test_zstd_decompress_roundtrip_returns_single_contiguous_buffer():
-    """End-to-end: feed real ZSTD-compressed device buffers in, check the
-    output is a single flat ``cupy.uint8`` array of length n*tile_bytes.
-
-    This test confirms the return contract that ``_apply_predictor_and_assemble``
-    depends on: ``out`` is the contiguous concatenation of the N decompressed
-    tiles, not a list. The previous implementation returned the same shape but
-    via ``cupy.concatenate``; the new one allocates the contig buffer up front
-    and writes through per-tile pointers, so a regression that dropped the
-    return value would surface here.
-    """
-    import cupy
-    import zstandard as zstd
-
-    rng = np.random.default_rng(seed=1659)
-    tile_bytes = 4096
-    n_tiles = 8
-
-    cctx = zstd.ZstdCompressor()
-    host_tiles = [rng.integers(0, 256, size=tile_bytes, dtype=np.uint8)
-                  for _ in range(n_tiles)]
-    compressed = [cctx.compress(t.tobytes()) for t in host_tiles]
-    d_tiles = [cupy.asarray(np.frombuffer(c, dtype=np.uint8))
-               for c in compressed]
-
-    result = _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000)
-
-    # nvCOMP may be present but mis-configured on the host (e.g. driver
-    # version mismatch); skip rather than fail in that case so the test is
-    # informative when run on a real GDS rig.
-    if result is None:
-        pytest.skip("nvCOMP returned None; library may be unusable on this host")
-
-    assert isinstance(result, cupy.ndarray)
-    assert result.dtype == cupy.uint8
-    assert result.shape == (n_tiles * tile_bytes,)
-    assert result.flags.c_contiguous
-
-    # Decoded payload must match the original host tiles. The buffer is a
-    # single flat array; tile i lives at offset i*tile_bytes.
-    host_out = result.get()
-    for i, expected in enumerate(host_tiles):
-        decoded = host_out[i * tile_bytes:(i + 1) * tile_bytes]
-        assert np.array_equal(decoded, expected), (
-            f"tile {i} decoded payload differs from input"
-        )
-
-
-@pytest.mark.skipif(not _gpu_available(), reason="cupy + CUDA required")
-def test_no_orphan_decomp_buffers_after_call(monkeypatch):
-    """Earlier code held a Python list of N device buffers in scope
-    alongside the concatenated result. The replacement allocates once
-    and returns that one buffer.
-
-    The check here is structural rather than numerical: after a successful
-    call the only cupy ndarray the caller receives is ``result`` itself,
-    and inspecting it confirms ``result.size == n_tiles * tile_bytes``.
-    """
-    import cupy
-
-    from xrspatial.geotiff import _gpu_decode
-
-    # Stub the nvCOMP entry points so the decompress "succeeds" without an
-    # actual library. Force the function down the success branch, capture
-    # the returned buffer, then verify shape + ownership.
-    monkeypatch.setattr(_gpu_decode, "_get_nvcomp",
-                        lambda: _FakeNvcompLib())
-
-    n_tiles = 4
-    tile_bytes = 2048
-    d_tiles = [cupy.zeros(64, dtype=cupy.uint8) for _ in range(n_tiles)]
-    result = _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000)
-
-    # The fake lib reports success and zero-fills the output buffer; the
-    # function returns the contiguous buffer as-is.
-    assert result is not None
-    assert isinstance(result, cupy.ndarray)
-    assert result.size == n_tiles * tile_bytes
-    assert result.flags.c_contiguous
-    # The contract requires uint8 -- not a uint8 view of something else.
-    assert result.dtype == cupy.uint8
-
-
-class _FakeNvcompLib:
-    """Stand-in for the nvCOMP CDLL handle used in tests.
-
-    The real function calls ``getattr(lib, fn_name)`` for two entry points
-    and invokes each as a ctypes function. We expose those entry-point
-    names as Python callables that pretend the work succeeded.
-    """
-
-    def __getattr__(self, name):
-        if name == 'nvcompBatchedZstdDecompressGetTempSizeAsync':
-            return _fake_temp_size_fn
-        if name == 'nvcompBatchedZstdDecompressAsync':
-            return _fake_decompress_fn
-        raise AttributeError(name)
-
-
-def _fake_temp_size_fn(n, tile_bytes, opts, p_temp_size, total):
-    """Stub for nvcompBatchedZstdDecompressGetTempSizeAsync."""
-    # Write a tiny temp-size value into the caller's c_size_t.
-    p_temp_size._obj.value = 1
-    return 0
-
-
-def _fake_decompress_fn(*args):
-    """Stub for nvcompBatchedZstdDecompressAsync.
-
-    The function's return-value test is ``s != 0``. We return 0 (success).
-    The caller's d_statuses array is already zero from ``cupy.zeros``, so
-    the post-decode any-nonzero check passes.
-    """
-    return 0
diff --git a/xrspatial/geotiff/tests/test_nvjpeg2k_single_alloc_2107.py b/xrspatial/geotiff/tests/test_nvjpeg2k_single_alloc_2107.py
deleted file mode 100644
index b542039ba..000000000
--- a/xrspatial/geotiff/tests/test_nvjpeg2k_single_alloc_2107.py
+++ /dev/null
@@ -1,343 +0,0 @@
-"""Tests for the nvJPEG2000 batch-decode allocation refactor (#2107).
-
-The fix replaces per-tile / per-component ``cupy.empty`` allocations and
-per-tile ``cupy.cuda.Device().synchronize()`` inside the decode loop with
-a single contiguous device pool and a single batch-end sync. The pattern
-matches the prior fixes for ``_try_nvcomp_from_device_bufs`` (#1659),
-``_try_kvikio_read_tiles`` (#1688), and ``_nvcomp_batch_compress`` (#1712).
-
-Since nvJPEG2000 is rarely available on test hosts (libnvjpeg2k.so is
-not part of cuda-toolkit's default install), these tests focus on
-structural properties of the implementation rather than running a real
-decode:
-
-* ``_try_nvjpeg2k_batch_decode`` early-returns ``None`` when the shared
-  library is missing -- no allocations or syncs happen on the common
-  test host.
-* Source inspection asserts the new contract:
-    - Exactly two ``cupy.empty`` calls in the decode loop region
-      (``d_comp_pool`` + ``d_all_tiles``), zero inside the per-tile loop.
-    - Exactly one ``Device().synchronize()`` after the loop, zero inside.
-    - The slab indexing math hits the per-tile-component slab.
-
-Structural tests like these mirror the approach taken by
-``test_nvcomp_from_device_bufs_single_alloc_1659.py`` and
-``test_kvikio_batched_pread_1688.py`` so a regression that re-introduces
-the per-tile alloc pattern fails CI without needing a GPU.
-"""
-from __future__ import annotations
-
-import ast
-import inspect
-
-import numpy as np
-import pytest
-
-
-def _function_source(func):
-    """Return the function's source plus its line range in the source file."""
-    src = inspect.getsource(func)
-    start_line = func.__code__.co_firstlineno
-    return src, start_line
-
-
-def _count_cupy_empty_calls(tree):
-    """Count ``cupy.empty(...)`` Call nodes inside the AST."""
-    n = 0
-    for node in ast.walk(tree):
-        if not isinstance(node, ast.Call):
-            continue
-        func = node.func
-        if not isinstance(func, ast.Attribute):
-            continue
-        if func.attr != 'empty':
-            continue
-        # ``cupy.empty`` matches both bare ``cupy`` and aliased ``cp``;
-        # we only care about the ``empty`` method name.
-        if not isinstance(func.value, ast.Name):
-            continue
-        if func.value.id not in ('cupy', 'cp'):
-            continue
-        n += 1
-    return n
-
-
-def _count_device_synchronize_calls(tree):
-    """Count ``cupy.cuda.Device(...).synchronize()`` Call nodes."""
-    n = 0
-    for node in ast.walk(tree):
-        if not isinstance(node, ast.Call):
-            continue
-        func = node.func
-        if not isinstance(func, ast.Attribute):
-            continue
-        if func.attr != 'synchronize':
-            continue
-        # We walk back through the chain: synchronize -> Device(...) ->
-        # cuda -> cupy. Allow any chain that ends in a ``Device`` call.
-        parent_call = func.value
-        if not isinstance(parent_call, ast.Call):
-            continue
-        if not isinstance(parent_call.func, ast.Attribute):
-            continue
-        if parent_call.func.attr != 'Device':
-            continue
-        n += 1
-    return n
-
-
-def _inside_for_loop(node: ast.AST, parents: dict) -> bool:
-    """Return True when ``node`` sits anywhere under a ``for`` statement."""
-    cur = parents.get(id(node))
-    while cur is not None:
-        if isinstance(cur, ast.For):
-            return True
-        cur = parents.get(id(cur))
-    return False
-
-
-def _parent_map(tree: ast.AST) -> dict:
-    """Build a ``{id(child): parent}`` lookup map for ``_inside_for_loop``."""
-    mapping: dict = {}
-    for parent in ast.walk(tree):
-        for child in ast.iter_child_nodes(parent):
-            mapping[id(child)] = parent
-    return mapping
-
-
-class TestNvjpeg2kSingleAllocStructural:
-    """Structural assertions on the refactored helper (no GPU required)."""
-
-    def setup_method(self):
-        from xrspatial.geotiff import _gpu_decode
-
-        self._fn = _gpu_decode._try_nvjpeg2k_batch_decode
-        src, start = _function_source(self._fn)
-        self._src = src
-        self._start_line = start
-        self._tree = ast.parse(src)
-        self._parents = _parent_map(self._tree)
-
-    def test_no_cupy_empty_inside_decode_loop(self):
-        """``cupy.empty`` must NOT appear inside the per-tile ``for`` loop.
-
-        The refactor moves the pool allocation outside the loop. A
-        regression that re-introduces a per-tile ``cupy.empty`` would
-        bring back the memory-pool round-trip the fix removed.
-        """
-        offending = []
-        for node in ast.walk(self._tree):
-            if not isinstance(node, ast.Call):
-                continue
-            func = node.func
-            if not isinstance(func, ast.Attribute):
-                continue
-            if func.attr != 'empty':
-                continue
-            if (not isinstance(func.value, ast.Name)
-                    or func.value.id not in ('cupy', 'cp')):
-                continue
-            if _inside_for_loop(node, self._parents):
-                offending.append(self._start_line + node.lineno - 1)
-        assert offending == [], (
-            f"_try_nvjpeg2k_batch_decode contains cupy.empty(...) calls "
-            f"inside a for-loop at file lines {offending}. The refactor "
-            f"in #2107 moved every output allocation outside the per-tile "
-            f"loop; reverting that defeats the pooling optimisation."
-        )
-
-    def test_no_device_synchronize_inside_decode_loop(self):
-        """``Device().synchronize()`` must NOT live inside the decode loop.
-
-        The previous implementation called it once per tile, forcing
-        default-stream serialisation. The refactor leaves exactly one
-        synchronize call after the loop body.
-        """
-        offending = []
-        for node in ast.walk(self._tree):
-            if not isinstance(node, ast.Call):
-                continue
-            func = node.func
-            if not isinstance(func, ast.Attribute):
-                continue
-            if func.attr != 'synchronize':
-                continue
-            parent_call = func.value
-            if (not isinstance(parent_call, ast.Call)
-                    or not isinstance(parent_call.func, ast.Attribute)
-                    or parent_call.func.attr != 'Device'):
-                continue
-            if _inside_for_loop(node, self._parents):
-                offending.append(self._start_line + node.lineno - 1)
-        assert offending == [], (
-            f"_try_nvjpeg2k_batch_decode contains Device().synchronize() "
-            f"calls inside a for-loop at file lines {offending}. The "
-            f"refactor in #2107 keeps exactly one batch-end sync outside "
-            f"the loop so successive tiles can pipeline through "
-            f"nvJPEG2000."
-        )
-
-    def test_pool_allocation_present(self):
-        """Source contains the expected pool buffer name and slab math.
-
-        The refactor introduces ``d_comp_pool`` and
-        ``per_tile_comp_bytes``; if either disappears, the test fails so
-        the reviewer notices the layout drift.
-        """
-        assert 'd_comp_pool' in self._src, (
-            "_try_nvjpeg2k_batch_decode no longer references the shared "
-            "d_comp_pool buffer; the refactor in #2107 is missing or "
-            "reverted."
-        )
-        assert 'per_tile_comp_bytes' in self._src, (
-            "_try_nvjpeg2k_batch_decode no longer references "
-            "per_tile_comp_bytes; the per-tile slab math from #2107 "
-            "is missing or renamed without an audit."
-        )
-
-    def test_check_gpu_memory_guard_present(self):
-        """The pool allocation must be guarded by ``_check_gpu_memory``.
-
-        Sibling helpers (``_try_nvcomp_from_device_bufs``,
-        ``_try_kvikio_read_tiles``, ``_nvcomp_batch_compress``) all guard
-        their pool allocations the same way; refusing the allocation
-        before cupy raises an opaque CUDA OOM keeps the failure mode
-        consistent (#2107 follows that pattern).
-        """
-        assert '_check_gpu_memory(' in self._src, (
-            "_try_nvjpeg2k_batch_decode no longer calls _check_gpu_memory "
-            "before allocating the per-tile component pool. The fail-fast "
-            "OOM contract from #2107 is missing."
-        )
-
-
-class TestNvjpeg2kLibAbsentShortCircuit:
-    """When the shared library is missing, the function returns None
-    without touching cupy / allocating any device memory."""
-
-    def test_returns_none_when_lib_missing(self, monkeypatch):
-        """The early-return is the path most test hosts take. Verify
-        nothing reaches the refactored allocation code on that path so
-        the refactor does not regress the lib-missing host behaviour.
-        """
-        from xrspatial.geotiff import _gpu_decode
-
-        monkeypatch.setattr(_gpu_decode, '_get_nvjpeg2k', lambda: None)
-
-        result = _gpu_decode._try_nvjpeg2k_batch_decode(
-            compressed_tiles=[b''],
-            tile_width=8,
-            tile_height=8,
-            dtype=np.dtype('uint8'),
-            samples=1,
-        )
-        assert result is None
-
-    def test_returns_none_for_unsupported_dtype(self, monkeypatch):
-        """Unsupported dtypes (e.g. float32) short-circuit before any
-        device allocation. The refactor moved the pool alloc below the
-        dtype guard, so this exercises the dtype branch that must still
-        clean up the handles without touching the pool.
-        """
-        from xrspatial.geotiff import _gpu_decode
-
-        # Fake a lib that succeeds for handle/state/stream/params and
-        # exposes the destroy entry points so the dtype-guard cleanup
-        # path runs without crashing. We do not pretend to support the
-        # actual nvjpeg2k C API beyond what the early-return code uses.
-        class _FakeLib:
-            def __init__(self):
-                self.calls = []
-
-            def nvjpeg2kCreateSimple(self, *_args):
-                return 0
-
-            def nvjpeg2kDecodeStateCreate(self, *_args):
-                return 0
-
-            def nvjpeg2kStreamCreate(self, *_args):
-                return 0
-
-            def nvjpeg2kDecodeParamsCreate(self, *_args):
-                return 0
-
-            def nvjpeg2kDecodeParamsDestroy(self, *_args):
-                self.calls.append('params_destroy')
-
-            def nvjpeg2kStreamDestroy(self, *_args):
-                self.calls.append('stream_destroy')
-
-            def nvjpeg2kDecodeStateDestroy(self, *_args):
-                self.calls.append('state_destroy')
-
-            def nvjpeg2kDestroy(self, *_args):
-                self.calls.append('handle_destroy')
-
-        fake = _FakeLib()
-        monkeypatch.setattr(_gpu_decode, '_get_nvjpeg2k', lambda: fake)
-
-        # float32 is not in {uint8, uint16, int16} so the helper exits
-        # before any pool allocation -- and the host needs no cupy
-        # for this branch to run.
-        result = _gpu_decode._try_nvjpeg2k_batch_decode(
-            compressed_tiles=[b''],
-            tile_width=8,
-            tile_height=8,
-            dtype=np.dtype('float32'),
-            samples=1,
-        )
-        assert result is None
-        # The dtype-guard branch should have called all four destroy
-        # functions, mirroring the success-path cleanup.
-        assert fake.calls == [
-            'params_destroy',
-            'stream_destroy',
-            'state_destroy',
-            'handle_destroy',
-        ]
-
-
-@pytest.mark.gpu
-class TestNvjpeg2kPoolWithCupy:
-    """Lightweight cupy-only smoke tests for the pool layout.
-
-    These tests do not exercise the real nvJPEG2000 decode (the host
-    typically has no libnvjpeg2k.so) but they confirm the pool sizing
-    math and the per-tile slab indexing produce non-overlapping views
-    into the shared buffer for representative tile sizes.
-    """
-
-    def test_pool_slabs_are_non_overlapping(self):
-        """Tile-component slabs into the pool must not overlap.
-
-        We recompute the slab boundaries the helper uses and verify the
-        sequence covers exactly the pool size with no gaps and no
-        double-coverage. A regression that miscomputes
-        ``per_tile_comp_bytes`` would either OOB the pool or fold tile
-        N onto tile N-1's bytes; this test catches both.
-        """
-        cupy = pytest.importorskip('cupy')
-
-        n_tiles = 4
-        tile_width = 32
-        tile_height = 32
-        samples = 3
-        dtype = np.dtype('uint16')
-        pitch = tile_width * dtype.itemsize
-        per_tile_comp_bytes = samples * tile_height * pitch
-        pool = cupy.empty(n_tiles * per_tile_comp_bytes, dtype=cupy.uint8)
-
-        seen = set()
-        for i in range(n_tiles):
-            tile_pool_start = i * per_tile_comp_bytes
-            for c in range(samples):
-                start = tile_pool_start + c * tile_height * pitch
-                end = start + tile_height * pitch
-                for byte in range(start, end):
-                    assert byte not in seen, (
-                        f"pool byte {byte} appears in two slabs "
-                        f"(tile={i}, comp={c}); per-tile slab math is "
-                        f"wrong."
-                    )
-                    seen.add(byte)
-        assert len(seen) == int(pool.nbytes)
diff --git a/xrspatial/geotiff/tests/test_nvjpeg_encode_stream_sync_2212.py b/xrspatial/geotiff/tests/test_nvjpeg_encode_stream_sync_2212.py
deleted file mode 100644
index 0fc44f190..000000000
--- a/xrspatial/geotiff/tests/test_nvjpeg_encode_stream_sync_2212.py
+++ /dev/null
@@ -1,215 +0,0 @@
-"""Tests for the nvJPEG / nvJPEG2000 encoder default-stream-sync fix (#2212).
-
-The fix replaces ``cupy.cuda.Device().synchronize()`` inside the per-tile
-encode loops in ``_nvjpeg_batch_encode`` and ``_nvjpeg2k_batch_encode``
-with ``cupy.cuda.Stream.null.synchronize()``. ``Device().synchronize()``
-is a whole-device fence that blocks every CUDA stream; the encode /
-retrieve sequence only depends on the default stream the calls were
-issued on, so scoping the sync to the null stream lets concurrent work
-on other streams continue.
-
-The matching decoder ``_try_nvjpeg_batch_decode`` already uses
-``cupy.cuda.Stream.null.synchronize()`` (the pattern was inconsistent
-before this fix); the nvJPEG2000 decode-side regression was already
-caught by #2107.
-
-These tests skip the end-to-end encode path because nvJPEG / nvJPEG2000
-shared libraries are rarely installed on developer hosts; the
-structural AST checks run regardless and catch a future regression that
-re-introduces the device-wide sync.
-"""
-from __future__ import annotations
-
-import ast
-import inspect
-
-
-def _function_source(func):
-    src = inspect.getsource(func)
-    start_line = func.__code__.co_firstlineno
-    return src, start_line
-
-
-def _parent_map(tree: ast.AST) -> dict:
-    mapping: dict = {}
-    for parent in ast.walk(tree):
-        for child in ast.iter_child_nodes(parent):
-            mapping[id(child)] = parent
-    return mapping
-
-
-def _inside_for_loop(node: ast.AST, parents: dict) -> bool:
-    cur = parents.get(id(node))
-    while cur is not None:
-        if isinstance(cur, ast.For):
-            return True
-        cur = parents.get(id(cur))
-    return False
-
-
-def _device_synchronize_lines(tree: ast.AST, start_line: int,
-                              parents: dict, *, only_in_loop: bool):
-    """Return file line numbers of ``cupy.cuda.Device().synchronize()`` calls."""
-    out = []
-    for node in ast.walk(tree):
-        if not isinstance(node, ast.Call):
-            continue
-        func = node.func
-        if not isinstance(func, ast.Attribute):
-            continue
-        if func.attr != 'synchronize':
-            continue
-        parent_call = func.value
-        if not isinstance(parent_call, ast.Call):
-            continue
-        if not isinstance(parent_call.func, ast.Attribute):
-            continue
-        if parent_call.func.attr != 'Device':
-            continue
-        if only_in_loop and not _inside_for_loop(node, parents):
-            continue
-        if not only_in_loop and _inside_for_loop(node, parents):
-            continue
-        out.append(start_line + node.lineno - 1)
-    return out
-
-
-def _stream_null_synchronize_lines(tree: ast.AST, start_line: int,
-                                   parents: dict, *, only_in_loop: bool):
-    """Return file lines of ``cupy.cuda.Stream.null.synchronize()`` calls."""
-    out = []
-    for node in ast.walk(tree):
-        if not isinstance(node, ast.Call):
-            continue
-        func = node.func
-        if not isinstance(func, ast.Attribute):
-            continue
-        if func.attr != 'synchronize':
-            continue
-        # Stream.null is an Attribute chain, not a Call -- the value is
-        # ``cupy.cuda.Stream.null``.
-        chain = func.value
-        if isinstance(chain, ast.Call):
-            continue
-        if not isinstance(chain, ast.Attribute):
-            continue
-        # Walk back to find ``Stream`` in the chain.
-        found_stream_null = False
-        cur = chain
-        if cur.attr == 'null':
-            inner = cur.value
-            if isinstance(inner, ast.Attribute) and inner.attr == 'Stream':
-                found_stream_null = True
-        if not found_stream_null:
-            continue
-        if only_in_loop and not _inside_for_loop(node, parents):
-            continue
-        if not only_in_loop and _inside_for_loop(node, parents):
-            continue
-        out.append(start_line + node.lineno - 1)
-    return out
-
-
-class TestNvjpegEncodeStreamSync:
-    """Structural assertions on the encoder sync fix (no GPU required)."""
-
-    def setup_method(self):
-        from xrspatial.geotiff import _gpu_decode
-        self._fn = _gpu_decode._nvjpeg_batch_encode
-        src, start = _function_source(self._fn)
-        self._src = src
-        self._start_line = start
-        self._tree = ast.parse(src)
-        self._parents = _parent_map(self._tree)
-
-    def test_no_device_synchronize_inside_encode_loop(self):
-        offending = _device_synchronize_lines(
-            self._tree, self._start_line, self._parents, only_in_loop=True,
-        )
-        assert offending == [], (
-            "_nvjpeg_batch_encode contains cupy.cuda.Device().synchronize() "
-            f"calls inside a for-loop at file lines {offending}. The fix "
-            "in #2212 scopes the per-tile sync to the default stream via "
-            "cupy.cuda.Stream.null.synchronize() so concurrent CUDA work "
-            "on other streams is not serialised behind every tile encode."
-        )
-
-    def test_stream_null_synchronize_present(self):
-        """At least one ``Stream.null.synchronize()`` call must be present."""
-        found = _stream_null_synchronize_lines(
-            self._tree, self._start_line, self._parents, only_in_loop=True,
-        )
-        assert len(found) >= 1, (
-            "_nvjpeg_batch_encode no longer calls "
-            "cupy.cuda.Stream.null.synchronize() inside the encode loop. "
-            "The fix in #2212 requires the per-tile sync to be scoped to "
-            "the default stream so encode/retrieve ordering is preserved "
-            "without blocking other CUDA streams."
-        )
-
-
-class TestNvjpeg2kEncodeStreamSync:
-    """Structural assertions on the nvJPEG2000 encoder sync fix."""
-
-    def setup_method(self):
-        from xrspatial.geotiff import _gpu_decode
-        self._fn = _gpu_decode._nvjpeg2k_batch_encode
-        src, start = _function_source(self._fn)
-        self._src = src
-        self._start_line = start
-        self._tree = ast.parse(src)
-        self._parents = _parent_map(self._tree)
-
-    def test_no_device_synchronize_inside_encode_loop(self):
-        offending = _device_synchronize_lines(
-            self._tree, self._start_line, self._parents, only_in_loop=True,
-        )
-        assert offending == [], (
-            "_nvjpeg2k_batch_encode contains "
-            "cupy.cuda.Device().synchronize() calls inside a for-loop at "
-            f"file lines {offending}. The fix in #2212 scopes the per-tile "
-            "sync to the default stream via "
-            "cupy.cuda.Stream.null.synchronize()."
-        )
-
-    def test_stream_null_synchronize_present(self):
-        found = _stream_null_synchronize_lines(
-            self._tree, self._start_line, self._parents, only_in_loop=True,
-        )
-        assert len(found) >= 1, (
-            "_nvjpeg2k_batch_encode no longer calls "
-            "cupy.cuda.Stream.null.synchronize() inside the encode loop. "
-            "The fix in #2212 requires the per-tile sync to be scoped to "
-            "the default stream so encode/retrieve ordering is preserved "
-            "without blocking other CUDA streams."
-        )
-
-
-class TestDecodeReferencePattern:
-    """The decoder pattern is the contract we mirror. Pin it as the reference.
-
-    If ``_try_nvjpeg_batch_decode`` ever swaps back to
-    ``Device().synchronize()`` inside its loop, the encoder fix would
-    drift from the codebase's own established pattern; pin it.
-    """
-
-    def setup_method(self):
-        from xrspatial.geotiff import _gpu_decode
-        self._fn = _gpu_decode._try_nvjpeg_batch_decode
-        src, start = _function_source(self._fn)
-        self._src = src
-        self._start_line = start
-        self._tree = ast.parse(src)
-        self._parents = _parent_map(self._tree)
-
-    def test_decoder_uses_stream_null_sync_in_loop(self):
-        found = _stream_null_synchronize_lines(
-            self._tree, self._start_line, self._parents, only_in_loop=True,
-        )
-        assert len(found) >= 1, (
-            "_try_nvjpeg_batch_decode no longer uses "
-            "cupy.cuda.Stream.null.synchronize() inside the decode loop. "
-            "This is the pattern #2212 mirrors for the encoder side; "
-            "drifting away from it means both sides will need to be "
-            "re-aligned."
-        )
diff --git a/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py b/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py
deleted file mode 100644
index f03864a14..000000000
--- a/xrspatial/geotiff/tests/test_predictor2_big_endian_gpu_1517.py
+++ /dev/null
@@ -1,344 +0,0 @@
-"""Regression tests for issue #1517.
-
-PR #1515 fixed the ``AttributeError: 'ndarray' object has no attribute
-'byteswap'`` crash on big-endian multi-byte TIFFs read via
-``read_geotiff_gpu``. After that fix the GPU path no longer raised, but
-predictor=2 BE files came back with wrong values: the per-dtype
-predictor kernels view the byte buffer as native unsigned integers, so
-on a BE file the prefix-sum runs on the wrong integer interpretation
-and the differencing produces garbage.
-
-These tests confirm the GPU output now matches the CPU
-``read_to_array`` baseline for predictor=2 BE files across several
-dtypes and tile layouts, and that the LE predictor=2 path still
-round-trips.
-"""
-from __future__ import annotations
-
-import importlib.util
-
-import numpy as np
-import pytest
-
-
-def _gpu_available() -> bool:
-    """True if cupy is importable and CUDA is initialised."""
-    if importlib.util.find_spec("cupy") is None:
-        return False
-    try:
-        import cupy
-        return bool(cupy.cuda.is_available())
-    except Exception:
-        return False
-
-
-_HAS_GPU = _gpu_available()
-_HAS_TIFFFILE = importlib.util.find_spec("tifffile") is not None
-_gpu_only = pytest.mark.skipif(
-    not (_HAS_GPU and _HAS_TIFFFILE),
-    reason="cupy + CUDA + tifffile required",
-)
-
-
-def _block_cpu_fallback(monkeypatch):
-    """Make any call to ``read_to_array`` from ``read_geotiff_gpu`` fail loudly.
-
-    ``read_geotiff_gpu`` returns a cupy-backed array even when its silent CPU
-    fallback fires (the fallback wraps the CPU result with ``cupy.asarray``),
-    so ``isinstance(gpu_da.data, cupy.ndarray)`` cannot distinguish the two
-    paths. ``read_geotiff_gpu`` lives in ``xrspatial.geotiff._backends.gpu``
-    and calls the locally bound ``_read_to_array`` symbol there; patching
-    that binding to raise turns any silent fallback into a hard test failure,
-    which is what we want when the point of a test is to exercise the actual
-    GPU decode kernels.
-
-    Tests that legitimately rely on the CPU fallback (e.g. stripped
-    layouts) must not call this helper.
-    """
-    from xrspatial.geotiff._backends import gpu as gpu_backend
-
-    def _no_fallback(*args, **kwargs):
-        raise AssertionError(
-            "read_geotiff_gpu fell back to read_to_array; "
-            "the GPU decode path was not exercised."
-        )
-
-    monkeypatch.setattr(
-        gpu_backend, '_read_to_array', _no_fallback, raising=True,
-    )
-
-
-@_gpu_only
-def test_gpu_predictor2_big_endian_int32_tiled_reproducer(tmp_path, monkeypatch):
-    """Exact reproducer from issue #1517: BE int32 tiled deflate + pred=2."""
-    import cupy
-    import tifffile
-
-    from xrspatial.geotiff import read_geotiff_gpu
-    from xrspatial.geotiff._reader import read_to_array
-
-    rng = np.random.RandomState(20260507)
-    arr = rng.randint(
-        -1_000_000, 1_000_000, size=(32, 48), dtype=np.int64
-    ).astype(np.int32)
-
-    path = tmp_path / "be_pred2_int32.tif"
-    tifffile.imwrite(
-        str(path), arr, byteorder=">", predictor=2,
-        compression="deflate", tile=(16, 16),
-    )
-
-    cpu, _ = read_to_array(str(path))
-    np.testing.assert_array_equal(cpu, arr)
-
-    _block_cpu_fallback(monkeypatch)
-    gpu_da = read_geotiff_gpu(str(path))
-    assert isinstance(gpu_da.data, cupy.ndarray)
-    assert gpu_da.data.dtype == np.dtype(np.int32)
-    assert gpu_da.data.dtype.isnative
-    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
-
-
-@_gpu_only
-@pytest.mark.parametrize(
-    "dtype",
-    [np.uint16, np.int16, np.uint32, np.int32],
-)
-def test_gpu_predictor2_big_endian_dtypes_tiled(tmp_path, monkeypatch, dtype):
-    """BE predictor=2 tiled files match CPU baseline across dtypes."""
-    import cupy
-    import tifffile
-
-    from xrspatial.geotiff import read_geotiff_gpu
-    from xrspatial.geotiff._reader import read_to_array
-
-    rng = np.random.RandomState(20260508)
-    info = np.iinfo(dtype)
-    arr = rng.randint(
-        max(info.min, -1_000_000),
-        min(info.max, 1_000_000),
-        size=(32, 48),
-        dtype=np.int64,
-    ).astype(dtype)
-
-    path = tmp_path / f"be_pred2_{np.dtype(dtype).name}.tif"
-    tifffile.imwrite(
-        str(path), arr, byteorder=">", predictor=2,
-        compression="deflate", tile=(16, 16),
-    )
-
-    cpu, _ = read_to_array(str(path))
-    np.testing.assert_array_equal(cpu, arr)
-
-    _block_cpu_fallback(monkeypatch)
-    gpu_da = read_geotiff_gpu(str(path))
-    assert isinstance(gpu_da.data, cupy.ndarray)
-    assert gpu_da.data.dtype == np.dtype(dtype)
-    assert gpu_da.data.dtype.isnative
-    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
-
-
-@_gpu_only
-def test_gpu_predictor2_big_endian_stripped_uint16(tmp_path):
-    """Stripped BE predictor=2 files take the CPU fallback but stay correct.
-
-    ``read_geotiff_gpu`` falls back to the CPU reader for stripped
-    layouts, then transfers the result to GPU. The fix must not regress
-    that path.
-    """
-    import cupy
-    import tifffile
-
-    from xrspatial.geotiff import read_geotiff_gpu
-    from xrspatial.geotiff._reader import read_to_array
-
-    rng = np.random.RandomState(20260509)
-    arr = rng.randint(0, 60000, size=(32, 48), dtype=np.uint16)
-
-    path = tmp_path / "be_pred2_uint16_strip.tif"
-    # Omit ``tile`` to get the strip layout.
-    tifffile.imwrite(
-        str(path), arr, byteorder=">", predictor=2, compression="deflate",
-    )
-
-    cpu, _ = read_to_array(str(path))
-    np.testing.assert_array_equal(cpu, arr)
-
-    gpu_da = read_geotiff_gpu(str(path))
-    assert isinstance(gpu_da.data, cupy.ndarray)
-    assert gpu_da.data.dtype == np.dtype(np.uint16)
-    assert gpu_da.data.dtype.isnative
-    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
-
-
-@_gpu_only
-def test_gpu_predictor2_little_endian_still_works(tmp_path, monkeypatch):
-    """LE predictor=2 must still round-trip after the BE fix."""
-    import cupy
-    import tifffile
-
-    from xrspatial.geotiff import read_geotiff_gpu
-    from xrspatial.geotiff._reader import read_to_array
-
-    rng = np.random.RandomState(20260510)
-    arr = rng.randint(
-        -1_000_000, 1_000_000, size=(32, 48), dtype=np.int64
-    ).astype(np.int32)
-
-    path = tmp_path / "le_pred2_int32.tif"
-    tifffile.imwrite(
-        str(path), arr, byteorder="<", predictor=2,
-        compression="deflate", tile=(16, 16),
-    )
-
-    cpu, _ = read_to_array(str(path))
-    np.testing.assert_array_equal(cpu, arr)
-
-    _block_cpu_fallback(monkeypatch)
-    gpu_da = read_geotiff_gpu(str(path))
-    assert isinstance(gpu_da.data, cupy.ndarray)
-    assert gpu_da.data.dtype == np.dtype(np.int32)
-    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
-
-
-@_gpu_only
-def test_gpu_predictor3_big_endian_still_works(tmp_path, monkeypatch):
-    """Floating-point predictor BE must still match CPU after the fix."""
-    import cupy
-    import tifffile
-
-    from xrspatial.geotiff import read_geotiff_gpu
-    from xrspatial.geotiff._reader import read_to_array
-
-    rng = np.random.RandomState(20260511)
-    arr = rng.standard_normal((32, 48)).astype(np.float32)
-
-    path = tmp_path / "be_pred3_float32.tif"
-    tifffile.imwrite(
-        str(path), arr, byteorder=">", predictor=3,
-        compression="deflate", tile=(16, 16),
-    )
-
-    cpu, _ = read_to_array(str(path))
-    np.testing.assert_array_equal(cpu, arr)
-
-    _block_cpu_fallback(monkeypatch)
-    gpu_da = read_geotiff_gpu(str(path))
-    assert isinstance(gpu_da.data, cupy.ndarray)
-    assert gpu_da.data.dtype == np.dtype(np.float32)
-    np.testing.assert_array_equal(gpu_da.data.get(), cpu)
-
-
-def test_swap_byte_lanes_numpy_bps2():
-    """The byte-swap helper reverses bytes per sample on a numpy buffer."""
-    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
-
-    # uint16 values 0x0102, 0x0304 in BE bytes: 01 02 03 04
-    buf = np.array([0x01, 0x02, 0x03, 0x04], dtype=np.uint8)
-    _swap_byte_lanes(buf, 2)
-    np.testing.assert_array_equal(buf, np.array([0x02, 0x01, 0x04, 0x03],
-                                                dtype=np.uint8))
-
-
-def test_swap_byte_lanes_numpy_bps4():
-    """bps=4: full byte reversal within each 4-byte sample."""
-    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
-
-    buf = np.array([0x01, 0x02, 0x03, 0x04,
-                    0x05, 0x06, 0x07, 0x08], dtype=np.uint8)
-    _swap_byte_lanes(buf, 4)
-    np.testing.assert_array_equal(
-        buf, np.array([0x04, 0x03, 0x02, 0x01,
-                       0x08, 0x07, 0x06, 0x05], dtype=np.uint8))
-
-
-def test_swap_byte_lanes_numpy_bps8():
-    """bps=8: full byte reversal within each 8-byte sample."""
-    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
-
-    sample = np.arange(1, 9, dtype=np.uint8)
-    buf = np.tile(sample, 2).copy()
-    _swap_byte_lanes(buf, 8)
-    np.testing.assert_array_equal(
-        buf, np.tile(sample[::-1], 2))
-
-
-def test_swap_byte_lanes_uint8_noop():
-    """bps=1 must be a no-op."""
-    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
-
-    buf = np.array([1, 2, 3], dtype=np.uint8)
-    _swap_byte_lanes(buf, 1)
-    np.testing.assert_array_equal(buf, np.array([1, 2, 3], dtype=np.uint8))
-
-
-def test_swap_byte_lanes_rejects_unsupported_bps():
-    """Unsupported bps values raise ValueError rather than corrupt data."""
-    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
-
-    buf = np.zeros(6, dtype=np.uint8)
-    with pytest.raises(ValueError, match="unsupported bps"):
-        _swap_byte_lanes(buf, 3)
-
-
-def test_swap_byte_lanes_rejects_misaligned_size():
-    """Buffer size must be a multiple of bps."""
-    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
-
-    buf = np.zeros(5, dtype=np.uint8)
-    with pytest.raises(ValueError, match="not a multiple"):
-        _swap_byte_lanes(buf, 2)
-
-
-def test_swap_byte_lanes_numpy_is_zero_temp():
-    """The numpy path must mutate the original buffer without realloc."""
-    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
-
-    buf = np.array([0x01, 0x02, 0x03, 0x04], dtype=np.uint8)
-    addr_before = buf.ctypes.data
-    _swap_byte_lanes(buf, 2)
-    assert buf.ctypes.data == addr_before
-    np.testing.assert_array_equal(buf, np.array([0x02, 0x01, 0x04, 0x03],
-                                                dtype=np.uint8))
-
-
-@_gpu_only
-@pytest.mark.parametrize("bps,dtype", [
-    (2, np.uint16),
-    (4, np.uint32),
-    (8, np.uint64),
-])
-def test_swap_byte_lanes_cupy_kernel(bps, dtype):
-    """The cupy path runs the CUDA kernel and matches numpy.byteswap."""
-    import cupy
-
-    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
-
-    rng = np.random.RandomState(20260512 + bps)
-    n_samples = 1024
-    src = rng.randint(0, np.iinfo(dtype).max, size=n_samples,
-                      dtype=np.uint64).astype(dtype)
-    expected = src.byteswap()  # numpy reference, returns swapped copy
-
-    d_buf = cupy.asarray(src.view(np.uint8))
-    addr_before = int(d_buf.data.ptr)
-    _swap_byte_lanes(d_buf, bps)
-    addr_after = int(d_buf.data.ptr)
-
-    assert addr_after == addr_before, "kernel must operate in place"
-    np.testing.assert_array_equal(
-        d_buf.get().view(dtype), expected,
-    )
-
-
-@_gpu_only
-def test_swap_byte_lanes_cupy_uint8_noop():
-    """bps=1 leaves cupy buffers untouched (no kernel launch)."""
-    import cupy
-
-    from xrspatial.geotiff._gpu_decode import _swap_byte_lanes
-
-    src = np.arange(16, dtype=np.uint8)
-    d_buf = cupy.asarray(src)
-    _swap_byte_lanes(d_buf, 1)
-    np.testing.assert_array_equal(d_buf.get(), src)
diff --git a/xrspatial/geotiff/tests/test_predictor3_int_dtype_gpu_1933.py b/xrspatial/geotiff/tests/test_predictor3_int_dtype_gpu_1933.py
deleted file mode 100644
index 456ab424a..000000000
--- a/xrspatial/geotiff/tests/test_predictor3_int_dtype_gpu_1933.py
+++ /dev/null
@@ -1,284 +0,0 @@
-"""GPU + dask+GPU backend coverage for issue #1933.
-
-#1933 added ``_validate_predictor_sample_format`` and wired it into
-every IFD-read site (eager numpy, dask, GPU tiled, GPU stripped). The
-eager and dask paths are covered by ``test_predictor3_int_dtype_1933``;
-this module closes the GPU coverage gap.
-
-The validator is invoked at two GPU sites:
-
-* ``_backends/gpu.py:443`` -- the tiled eager GPU read path. Reached when
-  the file is tiled and ``bps == file_dtype.itemsize * 8`` (so the
-  bps_mismatch fallback at line 358 does not take over).
-* ``_backends/gpu.py:999`` -- the GDS chunked GPU path
-  (``_read_geotiff_gpu_chunked_gds``). Reached when the file qualifies
-  for direct disk->GPU decode.
-
-The stripped GPU path falls back to CPU via ``_read_to_array`` and the
-CPU-side validator there fires; the dask+GPU non-GDS path delegates to
-``read_geotiff_dask`` which has its own validator (covered by the
-existing dask test). The two NEW call sites have no targeted tests.
-
-A regression dropping either of those two validator calls would let
-malformed predictor=3 + integer tiled files decode silently to
-garbage bytes on GPU. The eager-test asserts the error path is wired
-on CPU; this module asserts the GPU dispatcher path is wired too.
-"""
-from __future__ import annotations
-
-import importlib.util
-
-import numpy as np
-import pytest
-
-from xrspatial.geotiff._compression import COMPRESSION_NONE
-from xrspatial.geotiff._dtypes import LONG, SHORT, numpy_to_tiff_dtype
-from xrspatial.geotiff._header import (TAG_BITS_PER_SAMPLE, TAG_COMPRESSION, TAG_IMAGE_LENGTH,
-                                       TAG_IMAGE_WIDTH, TAG_PHOTOMETRIC, TAG_PREDICTOR,
-                                       TAG_ROWS_PER_STRIP, TAG_SAMPLE_FORMAT, TAG_SAMPLES_PER_PIXEL,
-                                       TAG_STRIP_BYTE_COUNTS, TAG_STRIP_OFFSETS,
-                                       TAG_TILE_BYTE_COUNTS, TAG_TILE_LENGTH, TAG_TILE_OFFSETS,
-                                       TAG_TILE_WIDTH)
-from xrspatial.geotiff._writer import _assemble_standard_layout, _write_stripped
-
-
-def _gpu_available() -> bool:
-    if importlib.util.find_spec("cupy") is None:
-        return False
-    try:
-        import cupy
-
-        return bool(cupy.cuda.is_available())
-    except Exception:
-        return False
-
-
-_HAS_GPU = _gpu_available()
-pytestmark = pytest.mark.skipif(
-    not _HAS_GPU, reason="cupy + CUDA required",
-)
-
-
-def _build_predictor3_uint32_stripped_tiff(arr: np.ndarray) -> bytes:
-    """Build a stripped TIFF: predictor=3 + uint32 SampleFormat=1.
-
-    Mirrors the helper in ``test_predictor3_int_dtype_1933`` so the GPU
-    coverage gap can be exercised against the same shape of malformed
-    file the eager test uses. Compression is COMPRESSION_NONE so the
-    strip bytes are exactly the raw integer values.
-    """
-    rel_off, bc, chunks = _write_stripped(arr, COMPRESSION_NONE, False)
-    bits_per_sample, _ = numpy_to_tiff_dtype(arr.dtype)
-    tags = [
-        (TAG_IMAGE_WIDTH, LONG, 1, arr.shape[1]),
-        (TAG_IMAGE_LENGTH, LONG, 1, arr.shape[0]),
-        (TAG_BITS_PER_SAMPLE, SHORT, 1, bits_per_sample),
-        (TAG_COMPRESSION, SHORT, 1, COMPRESSION_NONE),
-        (TAG_PHOTOMETRIC, SHORT, 1, 1),
-        (TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1),
-        (TAG_SAMPLE_FORMAT, SHORT, 1, 1),
-        (TAG_PREDICTOR, SHORT, 1, 3),
-        (TAG_ROWS_PER_STRIP, SHORT, 1, arr.shape[0]),
-        (TAG_STRIP_OFFSETS, LONG, len(rel_off), rel_off),
-        (TAG_STRIP_BYTE_COUNTS, LONG, len(bc), bc),
-    ]
-    parts = [(arr, arr.shape[1], arr.shape[0], rel_off, bc, chunks)]
-    return _assemble_standard_layout(8, [tags], parts, bigtiff=False)
-
-
-def _build_predictor3_uint32_tiled_tiff(
-    arr: np.ndarray, tile_w: int = 16, tile_h: int = 16,
-) -> bytes:
-    """Build a tiled malformed TIFF: predictor=3 + uint32 SampleFormat=1.
-
-    The tiled layout is the one that reaches the GPU validator at
-    ``_backends/gpu.py:443`` (no bps_mismatch fallback). Tile size is
-    16x16, the smallest tifffile/standard tile size.
-    """
-    bits_per_sample, _ = numpy_to_tiff_dtype(arr.dtype)
-    h, w = arr.shape
-
-    tiles_across = (w + tile_w - 1) // tile_w
-    tiles_down = (h + tile_h - 1) // tile_h
-    tiles: list[bytes] = []
-    rel_off: list[int] = []
-    bc: list[int] = []
-    offset = 0
-    for tr in range(tiles_down):
-        for tc in range(tiles_across):
-            r0 = tr * tile_h
-            c0 = tc * tile_w
-            r1 = min(r0 + tile_h, h)
-            c1 = min(c0 + tile_w, w)
-            tile_slice = arr[r0:r1, c0:c1]
-            if tile_slice.shape != (tile_h, tile_w):
-                padded = np.zeros((tile_h, tile_w), dtype=arr.dtype)
-                padded[: tile_slice.shape[0], : tile_slice.shape[1]] = (
-                    tile_slice)
-                tile_arr = padded
-            else:
-                tile_arr = np.ascontiguousarray(tile_slice)
-            chunk = tile_arr.tobytes()
-            rel_off.append(offset)
-            bc.append(len(chunk))
-            tiles.append(chunk)
-            offset += len(chunk)
-
-    tags = [
-        (TAG_IMAGE_WIDTH, LONG, 1, w),
-        (TAG_IMAGE_LENGTH, LONG, 1, h),
-        (TAG_BITS_PER_SAMPLE, SHORT, 1, bits_per_sample),
-        (TAG_COMPRESSION, SHORT, 1, COMPRESSION_NONE),
-        (TAG_PHOTOMETRIC, SHORT, 1, 1),
-        (TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1),
-        (TAG_SAMPLE_FORMAT, SHORT, 1, 1),
-        (TAG_PREDICTOR, SHORT, 1, 3),
-        (TAG_TILE_WIDTH, LONG, 1, tile_w),
-        (TAG_TILE_LENGTH, LONG, 1, tile_h),
-        (TAG_TILE_OFFSETS, LONG, len(rel_off), rel_off),
-        (TAG_TILE_BYTE_COUNTS, LONG, len(bc), bc),
-    ]
-    parts = [(arr, w, h, rel_off, bc, tiles)]
-    return _assemble_standard_layout(8, [tags], parts, bigtiff=False)
-
-
-class TestGPUEagerRejectsMalformedFile:
-    """``read_geotiff_gpu`` rejects predictor=3 + integer SampleFormat."""
-
-    def test_gpu_eager_stripped_raises(self, tmp_path):
-        from xrspatial.geotiff import read_geotiff_gpu
-
-        arr = np.array(
-            [[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.uint32)
-        path = tmp_path / "pred3_uint32_stripped.tif"
-        path.write_bytes(_build_predictor3_uint32_stripped_tiff(arr))
-        with pytest.raises(ValueError, match="Predictor=3"):
-            read_geotiff_gpu(str(path))
-
-    def test_gpu_eager_tiled_raises(self, tmp_path):
-        """Tiled layout hits the tiled GPU validator at gpu.py:443.
-
-        Distinct from the stripped fallback path -- a regression
-        dropping the line 443 call would leak through this test
-        because the stripped path's validator lives in
-        ``_read_to_array`` and would still raise.
-        """
-        from xrspatial.geotiff import read_geotiff_gpu
-
-        arr = np.arange(256, dtype=np.uint32).reshape(16, 16)
-        path = tmp_path / "pred3_uint32_tiled.tif"
-        path.write_bytes(_build_predictor3_uint32_tiled_tiff(arr))
-        with pytest.raises(ValueError, match="Predictor=3"):
-            read_geotiff_gpu(str(path))
-
-    def test_gpu_dispatcher_eager_raises(self, tmp_path):
-        """``open_geotiff(gpu=True)`` dispatcher rejects the file."""
-        from xrspatial.geotiff import open_geotiff
-
-        arr = np.arange(64, dtype=np.uint32).reshape(8, 8)
-        path = tmp_path / "pred3_uint32_dispatch.tif"
-        path.write_bytes(_build_predictor3_uint32_stripped_tiff(arr))
-        with pytest.raises(ValueError, match="Predictor=3"):
-            open_geotiff(str(path), gpu=True)
-
-
-class TestGPUChunkedRejectsMalformedFile:
-    """The dask+GPU paths also reject predictor=3 + integer."""
-
-    def test_read_geotiff_gpu_chunked_stripped_raises(self, tmp_path):
-        from xrspatial.geotiff import read_geotiff_gpu
-
-        arr = np.arange(64, dtype=np.uint32).reshape(8, 8)
-        path = tmp_path / "pred3_uint32_chunked_str.tif"
-        path.write_bytes(_build_predictor3_uint32_stripped_tiff(arr))
-        with pytest.raises(ValueError, match="Predictor=3"):
-            read_geotiff_gpu(str(path), chunks=4)
-
-    def test_read_geotiff_gpu_chunked_tiled_raises(self, tmp_path):
-        """Tiled chunked path with KvikIO available exercises gpu.py:999.
-
-        Gated on ``kvikio`` so the GDS qualification path
-        (``_read_geotiff_gpu_chunked_gds``) is the branch actually
-        taken. Without KvikIO the dispatcher falls back to the CPU
-        dask path and the line-999 validator is never reached, which
-        leaves the targeted call site untested. The CPU fallback
-        rejection is already covered by the eager/dask tests in
-        ``test_predictor3_int_dtype_1933``.
-        """
-        pytest.importorskip("kvikio")
-
-        from xrspatial.geotiff import read_geotiff_gpu
-
-        arr = np.arange(256, dtype=np.uint32).reshape(16, 16)
-        path = tmp_path / "pred3_uint32_chunked_tiled.tif"
-        path.write_bytes(_build_predictor3_uint32_tiled_tiff(arr))
-        with pytest.raises(ValueError, match="Predictor=3"):
-            read_geotiff_gpu(str(path), chunks=16)
-
-    def test_open_geotiff_chunks_gpu_dispatcher_raises(self, tmp_path):
-        """``open_geotiff(chunks=, gpu=True)`` dispatcher rejects the file."""
-        from xrspatial.geotiff import open_geotiff
-
-        arr = np.arange(256, dtype=np.uint32).reshape(16, 16)
-        path = tmp_path / "pred3_uint32_chunked_dispatch.tif"
-        path.write_bytes(_build_predictor3_uint32_tiled_tiff(arr))
-        with pytest.raises(ValueError, match="Predictor=3"):
-            open_geotiff(str(path), chunks=8, gpu=True)
-
-
-class TestValidPredictor3StillWorksOnGPU:
-    """A legitimate predictor=3 + float32 tiled file still decodes on GPU."""
-
-    def test_predictor3_float32_gpu_round_trip(self, tmp_path):
-        from xrspatial.geotiff import read_geotiff_gpu, to_geotiff
-
-        arr = np.linspace(-1.0, 1.0, 256, dtype=np.float32).reshape(16, 16)
-        path = tmp_path / "pred3_float32_tiled.tif"
-        to_geotiff(
-            arr, str(path), compression="deflate", predictor=3,
-            tiled=True, tile_size=16,
-        )
-
-        result = read_geotiff_gpu(str(path))
-        assert result.dtype == np.float32
-        np.testing.assert_array_equal(result.data.get(), arr)
-
-    def test_predictor3_float32_dask_gpu_round_trip(self, tmp_path):
-        from xrspatial.geotiff import read_geotiff_gpu, to_geotiff
-
-        arr = np.linspace(-1.0, 1.0, 256, dtype=np.float32).reshape(16, 16)
-        path = tmp_path / "pred3_float32_dask.tif"
-        to_geotiff(
-            arr, str(path), compression="deflate", predictor=3,
-            tiled=True, tile_size=16,
-        )
-
-        result = read_geotiff_gpu(str(path), chunks=8)
-        assert result.dtype == np.float32
-        np.testing.assert_array_equal(result.compute().data.get(), arr)
-
-
-class TestErrorMessageStable:
-    """The GPU error wording matches the eager/dask wording.
-
-    Cross-backend error parity is a real concern -- a regression that
-    fired the validator on GPU but with a different message would force
-    callers to special-case the backend on ``except ValueError``.
-    """
-
-    def test_gpu_error_message_matches_eager(self, tmp_path):
-        from xrspatial.geotiff import open_geotiff, read_geotiff_gpu
-
-        arr = np.arange(64, dtype=np.uint32).reshape(8, 8)
-        path = tmp_path / "pred3_uint32_msg.tif"
-        path.write_bytes(_build_predictor3_uint32_stripped_tiff(arr))
-
-        with pytest.raises(ValueError) as exc_eager:
-            open_geotiff(str(path))
-        with pytest.raises(ValueError) as exc_gpu:
-            read_geotiff_gpu(str(path))
-
-        assert str(exc_eager.value) == str(exc_gpu.value), (
-            "GPU and eager paths must surface the same Predictor=3 "
-            "error message so callers can use a single except branch."
-        )
diff --git a/xrspatial/geotiff/tests/unit/test_predictor.py b/xrspatial/geotiff/tests/unit/test_predictor.py
index 3d1216b87..39ad94f36 100644
--- a/xrspatial/geotiff/tests/unit/test_predictor.py
+++ b/xrspatial/geotiff/tests/unit/test_predictor.py
@@ -30,10 +30,9 @@
   predictor=2 on smooth float data (opt-in via env var).
 
 GPU predictor variants are intentionally out of scope here -- the
-dedicated GPU predictor files
-(``test_predictor2_big_endian_gpu_1517.py``,
-``test_predictor3_int_dtype_gpu_1933.py``) are folded into the GPU
-cluster #2438. GPU regressions that lived alongside CPU tests in the
+dedicated GPU predictor coverage lives in
+``xrspatial/geotiff/tests/gpu/test_codec.py`` (folded under epic
+#2438). GPU regressions that lived alongside CPU tests in the
 old files (predictor=2 int8 tiled/stripped, predictor=3 BE GPU,
 predictor=2/3 multi-sample GPU parity) move with this consolidation so
 the CPU and GPU coverage stay co-located by behaviour rather than