diff --git a/changes/2863.feature.md b/changes/2863.feature.md
new file mode 100644
index 0000000000..a8347157d9
--- /dev/null
+++ b/changes/2863.feature.md
@@ -0,0 +1,7 @@
+Added GPU-accelerated Zstd Codec
+
+This adds support for decoding with the Zstd Codec on NVIDIA GPUs using the
+nvidia-nvcomp library.
+
+With `zarr.config.enable_gpu()`, buffers will be decoded using the GPU
+and the output will reside in device memory.
diff --git a/docs/user-guide/config.md b/docs/user-guide/config.md
index 21fe9b5def..4ab8e33077 100644
--- a/docs/user-guide/config.md
+++ b/docs/user-guide/config.md
@@ -39,6 +39,25 @@ first register the implementations in the registry and then select them in the c
 For example, an implementation of the bytes codec in a class `'custompackage.NewBytesCodec'`,
 requires the value of `codecs.bytes.name` to be `'custompackage.NewBytesCodec'`.
 
+## Codecs
+
+Zarr and zarr-python split the logical codec definition from the implementation.
+The Zarr metadata serialized in the store specifies just the codec name and
+configuration. To resolve the specific implementation (the Python class used at
+runtime to encode or decode data), zarr-python looks up the codec name in the
+codec registry.
+
+For example, after calling `zarr.config.enable_gpu()`, an nvcomp-based
+codec will be used:
+
+```python
+>>> with zarr.config.enable_gpu():
+...     print(zarr.config.get('codecs.zstd'))
+zarr.codecs.gpu.NvcompZstdCodec
+```
+
+## Default Configuration
+
 This is the current default configuration:
 
 ```python exec="true" session="config" source="above" result="ansi"
diff --git a/docs/user-guide/gpu.md b/docs/user-guide/gpu.md
index 3317bdf065..a54874a85b 100644
--- a/docs/user-guide/gpu.md
+++ b/docs/user-guide/gpu.md
@@ -2,15 +2,6 @@
 
 Zarr can use GPUs to accelerate your workload by running `zarr.Config.enable_gpu`.
 
-!!! note
-    `zarr-python` currently supports reading the ndarray data into device (GPU)
-    memory as the final stage of the codec pipeline. Data will still be read into
-    or copied to host (CPU) memory for encoding and decoding.
-
-    In the future, codecs will be available compressing and decompressing data on
-    the GPU, avoiding the need to move data between the host and device for
-    compression and decompression.
-
 ## Reading data into device memory
 
 [`zarr.config`][] configures Zarr to use GPU memory for the data
@@ -29,3 +20,9 @@ type(z[:10, :10])
 ```
 
 Note that the output type is a `cupy.ndarray` rather than a NumPy array.
+
+For supported codecs, data will be decoded using the GPU via the [nvcomp] library.
+See [runtime-configuration][] for more details. Issues and feature requests for NVIDIA nvCOMP can be reported in the nvcomp [issue tracker].
+
+[nvcomp]: https://docs.nvidia.com/cuda/nvcomp/samples/python_samples.html
+[issue tracker]: https://github.com/NVIDIA/CUDALibrarySamples/issues
\ No newline at end of file
diff --git a/docs/user-guide/gpu.rst b/docs/user-guide/gpu.rst
new file mode 100644
index 0000000000..cd4f6c5eaf
--- /dev/null
+++ b/docs/user-guide/gpu.rst
@@ -0,0 +1,34 @@
+.. _user-guide-gpu:
+
+Using GPUs with Zarr
+====================
+
+Zarr can use GPUs to accelerate your workload by running
+:meth:`zarr.config.enable_gpu`.
+
+Reading data into device memory
+-------------------------------
+
+:meth:`zarr.config.enable_gpu` configures Zarr to use GPU memory for the data
+buffers used internally by Zarr.
+
+.. code-block:: python
+
+   >>> import zarr
+   >>> import cupy as cp  # doctest: +SKIP
+   >>> zarr.config.enable_gpu()  # doctest: +SKIP
+   >>> store = zarr.storage.MemoryStore()  # doctest: +SKIP
+   >>> z = zarr.create_array(  # doctest: +SKIP
+   ...     store=store, shape=(100, 100), chunks=(10, 10), dtype="float32",
+   ... )
+   >>> type(z[:10, :10])  # doctest: +SKIP
+   cupy.ndarray
+
+Note that the output type is a ``cupy.ndarray`` rather than a NumPy array.
+
+For supported codecs, data will be decoded using the GPU via the `nvcomp`_
+library. See :ref:`user-guide-config` for more details. Issues and feature requests
+for NVIDIA nvCOMP can be reported in the `nvcomp issue tracker`_.
+
+.. _nvcomp: https://docs.nvidia.com/cuda/nvcomp/samples/python_samples.html
+.. _nvcomp issue tracker: https://github.com/NVIDIA/CUDALibrarySamples/issues
diff --git a/pyproject.toml b/pyproject.toml
index 6164f69382..27d39a8e1a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -67,6 +67,7 @@ remote = [
 ]
 gpu = [
     "cupy-cuda12x",
+    "nvidia-nvcomp-cu12",
 ]
 cli = ["typer"]
 # Development extras
diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py
index 4c621290e7..512e3252cb 100644
--- a/src/zarr/codecs/__init__.py
+++ b/src/zarr/codecs/__init__.py
@@ -3,6 +3,7 @@
 from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle
 from zarr.codecs.bytes import BytesCodec, Endian
 from zarr.codecs.crc32c_ import Crc32cCodec
+from zarr.codecs.gpu import NvcompZstdCodec
 from zarr.codecs.gzip import GzipCodec
 from zarr.codecs.numcodecs import (
     BZ2,
@@ -41,6 +42,7 @@
     "Crc32cCodec",
     "Endian",
     "GzipCodec",
+    "NvcompZstdCodec",
     "ShardingCodec",
     "ShardingCodecIndexLocation",
     "TransposeCodec",
diff --git a/src/zarr/codecs/gpu.py b/src/zarr/codecs/gpu.py
new file mode 100644
index 0000000000..88df70fff6
--- /dev/null
+++ b/src/zarr/codecs/gpu.py
@@ -0,0 +1,178 @@
+from __future__ import annotations
+
+import asyncio
+from dataclasses import dataclass
+from functools import cached_property
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from zarr.abc.codec import BytesBytesCodec
+from zarr.core.common import JSON, parse_named_configuration
+from zarr.registry import register_codec
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+    from typing import Self
+
+    from zarr.core.array_spec import ArraySpec
+    from zarr.core.buffer import Buffer
+
+try:
+    import cupy as cp
+except ImportError:  # pragma: no cover
+    cp = None
+
+try:
+    from nvidia import nvcomp
+except ImportError:  # pragma: no cover
+    nvcomp = None
+
+
+def _parse_zstd_level(data: JSON) -> int:
+    if isinstance(data, int):
+        if data >= 23:
+            raise ValueError(f"Value must be less than or equal to 22. Got {data} instead.")
+        return data
+    raise TypeError(f"Got value with type {type(data)}, but expected an int.")
+
+
+def _parse_checksum(data: JSON) -> bool:
+    if isinstance(data, bool):
+        return data
+    raise TypeError(f"Expected bool. Got {type(data)}.")
+
+
+@dataclass(frozen=True)
+class NvcompZstdCodec(BytesBytesCodec):
+    is_fixed_size = True
+
+    level: int = 0
+    checksum: bool = False
+
+    def __init__(self, *, level: int = 0, checksum: bool = False) -> None:
+        # TODO: Set CUDA device appropriately here and also set CUDA stream
+
+        level_parsed = _parse_zstd_level(level)
+        checksum_parsed = _parse_checksum(checksum)
+
+        object.__setattr__(self, "level", level_parsed)
+        object.__setattr__(self, "checksum", checksum_parsed)
+
+    @classmethod
+    def from_dict(cls, data: dict[str, JSON]) -> Self:
+        _, configuration_parsed = parse_named_configuration(data, "zstd")
+        return cls(**configuration_parsed)  # type: ignore[arg-type]
+
+    def to_dict(self) -> dict[str, JSON]:
+        return {
+            "name": "zstd",
+            "configuration": {"level": self.level, "checksum": self.checksum},
+        }
+
+    @cached_property
+    def _zstd_codec(self) -> nvcomp.Codec:
+        device = cp.cuda.Device()  # Select the current default device
+        stream = cp.cuda.get_current_stream()  # Use the current default stream
+        return nvcomp.Codec(
+            algorithm="Zstd",
+            bitstream_kind=nvcomp.BitstreamKind.RAW,
+            device_id=device.id,
+            cuda_stream=stream.ptr,
+        )
+
+    def _convert_to_nvcomp_arrays(
+        self,
+        chunks_and_specs: Iterable[tuple[Buffer | None, ArraySpec]],
+    ) -> tuple[list[nvcomp.Array], list[int]]:
+        none_indices = [i for i, (b, _) in enumerate(chunks_and_specs) if b is None]
+        filtered_inputs = [b.as_array_like() for b, _ in chunks_and_specs if b is not None]
+        # TODO: add CUDA stream here
+        return nvcomp.as_arrays(filtered_inputs), none_indices
+
+    def _convert_from_nvcomp_arrays(
+        self,
+        arrays: Iterable[nvcomp.Array],
+        chunks_and_specs: Iterable[tuple[Buffer | None, ArraySpec]],
+    ) -> Iterable[Buffer | None]:
+        return [
+            spec.prototype.buffer.from_array_like(cp.array(a, dtype=np.dtype("B"), copy=False))
+            if a
+            else None
+            for a, (_, spec) in zip(arrays, chunks_and_specs, strict=True)
+        ]
+
+    async def decode(
+        self,
+        chunks_and_specs: Iterable[tuple[Buffer | None, ArraySpec]],
+    ) -> Iterable[Buffer | None]:
+        """Decodes a batch of chunks.
+        Chunks can be None in which case they are ignored by the codec.
+
+        Parameters
+        ----------
+        chunks_and_specs : Iterable[tuple[Buffer | None, ArraySpec]]
+            Ordered set of encoded chunks with their accompanying chunk spec.
+
+        Returns
+        -------
+        Iterable[Buffer | None]
+        """
+        chunks_and_specs = list(chunks_and_specs)
+
+        # Convert to nvcomp arrays
+        filtered_inputs, none_indices = self._convert_to_nvcomp_arrays(chunks_and_specs)
+
+        outputs = self._zstd_codec.decode(filtered_inputs) if len(filtered_inputs) > 0 else []
+
+        # Record an event on the current stream for synchronization
+        event = cp.cuda.Event()
+        event.record()
+        # Wait for the decode to complete in a separate async thread
+        await asyncio.to_thread(event.synchronize)
+
+        for index in none_indices:
+            outputs.insert(index, None)
+
+        return self._convert_from_nvcomp_arrays(outputs, chunks_and_specs)
+
+    async def encode(
+        self,
+        chunks_and_specs: Iterable[tuple[Buffer | None, ArraySpec]],
+    ) -> Iterable[Buffer | None]:
+        """Encodes a batch of chunks.
+        Chunks can be None in which case they are ignored by the codec.
+
+        Parameters
+        ----------
+        chunks_and_specs : Iterable[tuple[Buffer | None, ArraySpec]]
+            Ordered set of to-be-encoded chunks with their accompanying chunk spec.
+
+        Returns
+        -------
+        Iterable[Buffer | None]
+        """
+        # TODO: Make this actually async
+        chunks_and_specs = list(chunks_and_specs)
+
+        # Convert to nvcomp arrays
+        filtered_inputs, none_indices = self._convert_to_nvcomp_arrays(chunks_and_specs)
+
+        outputs = self._zstd_codec.encode(filtered_inputs) if len(filtered_inputs) > 0 else []
+
+        # Record an event on the current stream for synchronization
+        event = cp.cuda.Event()
+        event.record()
+        # Wait for the encode to complete in a separate async thread
+        await asyncio.to_thread(event.synchronize)
+
+        for index in none_indices:
+            outputs.insert(index, None)
+
+        return self._convert_from_nvcomp_arrays(outputs, chunks_and_specs)
+
+    def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int:
+        raise NotImplementedError
+
+
+register_codec("zstd", NvcompZstdCodec)
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index 59ca8f5929..5cda8044c2 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -28,7 +28,6 @@
 from zarr.codecs._v2 import V2Codec
 from zarr.codecs.bytes import BytesCodec
 from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
-from zarr.codecs.zstd import ZstdCodec
 from zarr.core._info import ArrayInfo
 from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config
 from zarr.core.attributes import Attributes
@@ -128,6 +127,7 @@
     _parse_array_array_codec,
     _parse_array_bytes_codec,
     _parse_bytes_bytes_codec,
+    get_codec_class,
     get_pipeline_class,
 )
 from zarr.storage._common import StorePath, ensure_no_existing_node, make_store_path
@@ -5036,9 +5036,9 @@ def default_compressors_v3(dtype: ZDType[Any, Any]) -> tuple[BytesBytesCodec, ..
     """
     Given a data type, return the default compressors for that data type.
 
-    This is just a tuple containing ``ZstdCodec``
+    This is just a tuple containing an instance of the default "zstd" codec class.
     """
-    return (ZstdCodec(),)
+    return (cast(BytesBytesCodec, get_codec_class("zstd")()),)
 
 
 def default_serializer_v3(dtype: ZDType[Any, Any]) -> ArrayBytesCodec:
diff --git a/src/zarr/core/buffer/gpu.py b/src/zarr/core/buffer/gpu.py
index bfe977c50f..f0242ee8b4 100644
--- a/src/zarr/core/buffer/gpu.py
+++ b/src/zarr/core/buffer/gpu.py
@@ -8,9 +8,6 @@
     cast,
 )
 
-import numpy as np
-import numpy.typing as npt
-
 from zarr.core.buffer import core
 from zarr.core.buffer.core import ArrayLike, BufferPrototype, NDArrayLike
 from zarr.errors import ZarrUserWarning
@@ -23,8 +20,9 @@
     from collections.abc import Iterable
     from typing import Self
 
-    from zarr.core.common import BytesLike
+    import numpy.typing as npt
 
+    from zarr.core.common import BytesLike
 try:
     import cupy as cp
 except ImportError:
@@ -54,14 +52,14 @@ class Buffer(core.Buffer):
 
     def __init__(self, array_like: ArrayLike) -> None:
         if cp is None:
-            raise ImportError(
+            raise ImportError(  # pragma: no cover
                 "Cannot use zarr.buffer.gpu.Buffer without cupy. Please install cupy."
             )
 
         if array_like.ndim != 1:
             raise ValueError("array_like: only 1-dim allowed")
-        if array_like.dtype != np.dtype("B"):
-            raise ValueError("array_like: only byte dtype allowed")
+        if array_like.dtype.itemsize != 1:
+            raise ValueError("array_like: only dtypes with itemsize=1 allowed")
 
         if not hasattr(array_like, "__cuda_array_interface__"):
             # Slow copy based path for arrays that don't support the __cuda_array_interface__
@@ -108,13 +106,13 @@ def as_numpy_array(self) -> npt.NDArray[Any]:
         return cast("npt.NDArray[Any]", cp.asnumpy(self._data))
 
     def __add__(self, other: core.Buffer) -> Self:
-        other_array = other.as_array_like()
-        assert other_array.dtype == np.dtype("B")
-        gpu_other = Buffer(other_array)
-        gpu_other_array = gpu_other.as_array_like()
-        return self.__class__(
-            cp.concatenate((cp.asanyarray(self._data), cp.asanyarray(gpu_other_array)))
-        )
+        other_array = cp.asanyarray(other.as_array_like())
+        left = self._data
+        if left.dtype != other_array.dtype:
+            other_array = other_array.view(left.dtype)
+
+        buffer = cp.concatenate([left, other_array])
+        return type(self)(buffer)
 
 
 class NDBuffer(core.NDBuffer):
@@ -144,7 +142,7 @@ class NDBuffer(core.NDBuffer):
 
     def __init__(self, array: NDArrayLike) -> None:
         if cp is None:
-            raise ImportError(
+            raise ImportError(  # pragma: no cover
                 "Cannot use zarr.buffer.gpu.NDBuffer without cupy. Please install cupy."
             )
diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py
index 5d463ec79c..a3cc9bb2eb 100644
--- a/src/zarr/core/config.py
+++ b/src/zarr/core/config.py
@@ -74,7 +74,15 @@ def enable_gpu(self) -> ConfigSet:
         Configure Zarr to use GPUs where possible.
         """
         return self.set(
-            {"buffer": "zarr.buffer.gpu.Buffer", "ndbuffer": "zarr.buffer.gpu.NDBuffer"}
+            {
+                "buffer": "zarr.buffer.gpu.Buffer",
+                "ndbuffer": "zarr.buffer.gpu.NDBuffer",
+                "codecs": {"zstd": "zarr.codecs.gpu.NvcompZstdCodec"},
+                "codec_pipeline": {
+                    "path": "zarr.core.codec_pipeline.BatchedCodecPipeline",
+                    "batch_size": 65536,
+                },
+            }
         )
 
 
diff --git a/tests/test_api.py b/tests/test_api.py
index 30f648a815..e96bd2f4bb 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -14,6 +14,7 @@
     from collections.abc import Callable
     from pathlib import Path
 
+    from zarr.abc.codec import Codec
     from zarr.abc.store import Store
     from zarr.core.common import JSON, MemoryOrder, ZarrFormat
 
@@ -41,6 +42,7 @@
     save_array,
     save_group,
 )
+from zarr.codecs import NvcompZstdCodec
 from zarr.core.buffer import NDArrayLike
 from zarr.errors import (
     ArrayNotFoundError,
@@ -1390,14 +1392,15 @@ def test_api_exports() -> None:
     assert zarr.api.asynchronous.__all__ == zarr.api.synchronous.__all__
 
 
-@gpu_test
+@gpu_test  # type: ignore[misc,unused-ignore]
 @pytest.mark.parametrize(
     "store",
     ["local", "memory", "zip"],
     indirect=True,
 )
 @pytest.mark.parametrize("zarr_format", [None, 2, 3])
-def test_gpu_basic(store: Store, zarr_format: ZarrFormat | None) -> None:
+@pytest.mark.parametrize("codec", ["auto", NvcompZstdCodec()])
+def test_gpu_basic(store: Store, zarr_format: ZarrFormat | None, codec: str | Codec) -> None:
     import cupy as cp
 
     if zarr_format == 2:
@@ -1405,7 +1408,7 @@ def test_gpu_basic(store: Store, zarr_format: ZarrFormat | None) -> None:
         # array to bytes.
         compressors = None
     else:
-        compressors = "auto"
+        compressors = codec
 
     with zarr.config.enable_gpu():
         src = cp.random.uniform(size=(100, 100))  # allocate on the device
diff --git a/tests/test_buffer.py b/tests/test_buffer.py
index b50e5abb67..43321ea2b0 100644
--- a/tests/test_buffer.py
+++ b/tests/test_buffer.py
@@ -193,6 +193,19 @@ def test_numpy_buffer_prototype() -> None:
         ndbuffer.as_scalar()
 
 
+@gpu_test
+def test_gpu_buffer_raises() -> None:
+    import cupy as cp
+
+    arr = cp.empty((10, 10), dtype="B")
+    with pytest.raises(ValueError, match="array_like: only 1-dim allowed"):
+        gpu.Buffer(arr)
+
+    arr = cp.arange(12, dtype="int32")
+    with pytest.raises(ValueError, match="array_like: only dtypes"):
+        gpu.Buffer(arr)
+
+
 @gpu_test
 def test_gpu_buffer_prototype() -> None:
     buffer = gpu.buffer_prototype.buffer.create_zero_length()
diff --git a/tests/test_codecs/test_codecs.py b/tests/test_codecs/test_codecs.py
index 1884d501a5..7d7b4ed8aa 100644
--- a/tests/test_codecs/test_codecs.py
+++ b/tests/test_codecs/test_codecs.py
@@ -16,12 +16,14 @@
     GzipCodec,
     ShardingCodec,
     TransposeCodec,
+    ZstdCodec,
 )
 from zarr.core.buffer import default_buffer_prototype
 from zarr.core.indexing import BasicSelection, morton_order_iter
 from zarr.core.metadata.v3 import ArrayV3Metadata
 from zarr.dtype import UInt8
 from zarr.errors import ZarrUserWarning
+from zarr.registry import register_codec
 from zarr.storage import StorePath
 
 if TYPE_CHECKING:
@@ -362,3 +364,22 @@ async def test_resize(store: Store) -> None:
     assert await store.get(f"{path}/0.1", prototype=default_buffer_prototype()) is not None
     assert await store.get(f"{path}/1.0", prototype=default_buffer_prototype()) is None
     assert await store.get(f"{path}/1.1", prototype=default_buffer_prototype()) is None
+
+
+def test_uses_default_codec() -> None:
+    class MyZstdCodec(ZstdCodec):
+        pass
+
+    register_codec("zstd", MyZstdCodec)
+
+    with zarr.config.set(
+        {"codecs": {"zstd": f"{MyZstdCodec.__module__}.{MyZstdCodec.__qualname__}"}}
+    ):
+        a = zarr.create_array(
+            StorePath(zarr.storage.MemoryStore(), path="mycodec"),
+            shape=(10, 10),
+            chunks=(10, 10),
+            dtype="int32",
+        )
+    assert a.metadata.zarr_format == 3
+    assert isinstance(a.metadata.codecs[-1], MyZstdCodec)
diff --git a/tests/test_codecs/test_nvcomp.py b/tests/test_codecs/test_nvcomp.py
new file mode 100644
index 0000000000..9b5554d9a3
--- /dev/null
+++ b/tests/test_codecs/test_nvcomp.py
@@ -0,0 +1,190 @@
+import contextlib
+import typing
+from collections.abc import Iterator
+
+import numpy as np
+import pytest
+
+import zarr
+from zarr.abc.store import Store
+from zarr.buffer.gpu import buffer_prototype
+from zarr.codecs import NvcompZstdCodec
+from zarr.core.array_spec import ArrayConfig, ArraySpec
+from zarr.storage import StorePath
+from zarr.testing.utils import gpu_test
+
+if typing.TYPE_CHECKING:
+    from zarr.core.common import JSON
+
+
+# The type-ignores below are needed because GPU libraries are not reliably
+# available in the pre-commit mypy environment.
+
+
+@gpu_test  # type: ignore[misc,unused-ignore]
+@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"])
+@pytest.mark.parametrize(
+    "checksum",
+    [
+        False,
+    ],
+)
+@pytest.mark.parametrize(
+    "selection",
+    [
+        (slice(None), slice(None)),  # everything
+        (slice(4, None), slice(4, None)),  # top-left chunk is empty
+    ],
+)
+def test_nvcomp_zstd(store: Store, checksum: bool, selection: tuple[slice, slice]) -> None:
+    import cupy as cp
+
+    with zarr.config.enable_gpu():
+        data = cp.arange(0, 256, dtype="uint16").reshape((16, 16))
+
+        a = zarr.create_array(
+            StorePath(store, path="nvcomp_zstd"),
+            shape=data.shape,
+            chunks=(4, 4),
+            dtype=data.dtype,
+            fill_value=0,
+            compressors=NvcompZstdCodec(level=0, checksum=checksum),
+        )
+
+        a[*selection] = data[*selection]
+
+        if selection == (slice(None), slice(None)):
+            cp.testing.assert_array_equal(data[*selection], a[*selection])
+            cp.testing.assert_array_equal(data[:, :], a[:, :])
+        else:
+            assert a.nchunks_initialized < a.nchunks
+            expected = cp.full(data.shape, a.fill_value)
+            expected[*selection] = data[*selection]
+            cp.testing.assert_array_equal(expected[*selection], a[*selection])
+            cp.testing.assert_array_equal(expected[:, :], a[:, :])
+
+
+@gpu_test  # type: ignore[misc,unused-ignore]
+@pytest.mark.parametrize("host_encode", [True, False])
+def test_gpu_codec_compatibility(host_encode: bool) -> None:
+    # Ensure that we can decode CPU-encoded data with the GPU
+    # and GPU-encoded data with the CPU
+    import cupy as cp
+
+    @contextlib.contextmanager
+    def gpu_context() -> Iterator[None]:
+        with zarr.config.enable_gpu():
+            yield
+
+    if host_encode:
+        # CPU encode, GPU decode
+        write_ctx: contextlib.AbstractContextManager[None] = contextlib.nullcontext()
+        read_ctx: contextlib.AbstractContextManager[None] = gpu_context()
+        write_data = np.arange(16, dtype="int32").reshape(4, 4)
+        read_data = cp.array(write_data)
+        xp = cp
+        # MemoryStore holds Buffers; we write a CPU buffer, but read a GPU buffer,
+        # which emits a warning.
+        expected_warning: pytest.WarningsRecorder | contextlib.AbstractContextManager[None] = (
+            pytest.warns(zarr.errors.ZarrUserWarning)
+        )
+    else:
+        # GPU encode, CPU decode
+        write_ctx = gpu_context()
+        read_ctx = contextlib.nullcontext()
+        write_data = cp.arange(16, dtype="int32").reshape(4, 4)
+        read_data = write_data.get()
+        xp = np
+        expected_warning = contextlib.nullcontext()
+
+    store = zarr.storage.MemoryStore()
+
+    with write_ctx:
+        z = zarr.create_array(
+            store=store,
+            shape=write_data.shape,
+            chunks=(4, 4),
+            dtype=write_data.dtype,
+        )
+        z[:] = write_data
+
+    with read_ctx, expected_warning:
+        # We need to reopen z, because `z.codec_pipeline` is set at creation
+        z = zarr.open_array(store=store, mode="r")
+        result = z[:]
+        assert isinstance(result, type(read_data))
+        xp.testing.assert_array_equal(result, read_data)
+
+
+@gpu_test  # type: ignore[misc,unused-ignore]
+def test_invalid_raises() -> None:
+    with pytest.raises(ValueError):
+        NvcompZstdCodec(level=100, checksum=False)
+
+    with pytest.raises(TypeError):
+        NvcompZstdCodec(level="100", checksum=False)  # type: ignore[arg-type,unused-ignore]
+
+    with pytest.raises(TypeError):
+        NvcompZstdCodec(checksum="False")  # type: ignore[arg-type,unused-ignore]
+
+
+@gpu_test  # type: ignore[misc,unused-ignore]
+def test_uses_default_codec() -> None:
+    with zarr.config.enable_gpu():
+        a = zarr.create_array(
+            StorePath(zarr.storage.MemoryStore(), path="nvcomp_zstd"),
+            shape=(10, 10),
+            chunks=(10, 10),
+            dtype="int32",
+        )
+    assert a.metadata.zarr_format == 3
+    assert isinstance(a.metadata.codecs[-1], NvcompZstdCodec)
+
+
+@gpu_test  # type: ignore[misc,unused-ignore]
+def test_nvcomp_from_dict() -> None:
+    config: dict[str, JSON] = {
+        "name": "zstd",
+        "configuration": {
+            "level": 0,
+            "checksum": False,
+        },
+    }
+    codec = NvcompZstdCodec.from_dict(config)
+    assert codec.level == 0
+    assert codec.checksum is False
+
+
+@gpu_test  # type: ignore[misc,unused-ignore]
+def test_compute_encoded_chunk_size() -> None:
+    codec = NvcompZstdCodec(level=0, checksum=False)
+    with pytest.raises(NotImplementedError):
+        codec.compute_encoded_size(
+            _input_byte_length=0,
+            _chunk_spec=ArraySpec(
+                shape=(10, 10),
+                dtype=zarr.core.dtype.npy.int.Int32(),
+                fill_value=0,
+                config=ArrayConfig(order="C", write_empty_chunks=False),
+                prototype=buffer_prototype,
+            ),
+        )
+
+
+@gpu_test  # type: ignore[misc,unused-ignore]
+async def test_nvcomp_zstd_encode_none() -> None:
+    codec = NvcompZstdCodec(level=0, checksum=False)
+    chunks_and_specs = [
+        (
+            None,
+            ArraySpec(
+                shape=(10, 10),
+                dtype=zarr.core.dtype.npy.int.Int32(),
+                fill_value=0,
+                config=ArrayConfig(order="C", write_empty_chunks=False),
+                prototype=buffer_prototype,
+            ),
+        )
+    ]
+    result = await codec.encode(chunks_and_specs)
+    assert result == [None]
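
---

For anyone trying the feature out, here is a minimal usage sketch assembled from the docs and tests above. It is not part of the diff: it assumes a CUDA-capable GPU with the `gpu` extra installed (`cupy-cuda12x` and `nvidia-nvcomp-cu12`), and it simply combines `zarr.config.enable_gpu()` with `zarr.create_array` the same way `docs/user-guide/gpu.md` and `tests/test_codecs/test_nvcomp.py` do.

```python
# Minimal sketch, not part of the diff: requires a CUDA GPU plus the "gpu"
# extra (cupy-cuda12x, nvidia-nvcomp-cu12). Under enable_gpu() the "zstd"
# entry in the codec registry resolves to NvcompZstdCodec, so the default
# compressor runs through nvCOMP and reads return device-resident arrays.
import cupy as cp

import zarr

with zarr.config.enable_gpu():
    store = zarr.storage.MemoryStore()
    z = zarr.create_array(store=store, shape=(100, 100), chunks=(10, 10), dtype="float32")
    z[:] = cp.random.uniform(size=(100, 100))  # data stays in device memory
    print(type(z[:10, :10]))            # cupy.ndarray
    print(type(z.metadata.codecs[-1]))  # zarr.codecs.gpu.NvcompZstdCodec
```

Reading the same store without `enable_gpu()` falls back to the CPU `ZstdCodec`, which is what `test_gpu_codec_compatibility` exercises in both directions.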