zarr-developers · sehoffmann · Nov 19, 2025 · Nov 20, 2025 · Nov 21, 2025
diff --git a/src/zarr/core/dtype/__init__.py b/src/zarr/core/dtype/__init__.py
@@ -22,6 +22,7 @@
 from zarr.core.dtype.npy.float import Float16, Float32, Float64
 from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64
 from zarr.core.dtype.npy.structured import Structured, StructuredJSON_V2, StructuredJSON_V3
+from zarr.core.dtype.npy.subarray import Subarray, SubarrayJSON_V3
 from zarr.core.dtype.npy.time import (
     DateTime64,
     DateTime64JSON_V2,
@@ -78,6 +79,8 @@
     "Structured",
     "StructuredJSON_V2",
     "StructuredJSON_V3",
+    "Subarray",
+    "SubarrayJSON_V3",
     "TBaseDType",
     "TBaseScalar",
     "TimeDelta64",
@@ -126,6 +129,7 @@
     | StringDType
     | BytesDType
     | Structured
+    | Subarray
     | TimeDType
     | VariableLengthBytes
 )
@@ -139,6 +143,7 @@
     *STRING_DTYPE,
     *BYTES_DTYPE,
     Structured,
+    Subarray,
     *TIME_DTYPE,
     VariableLengthBytes,
 )

diff --git a/src/zarr/core/dtype/common.py b/src/zarr/core/dtype/common.py
@@ -48,7 +48,7 @@
 # classes can perform a very specific type check.
 
 # This is the JSON representation of a structured dtype in zarr v2
-StructuredName_V2 = Sequence["str | StructuredName_V2"]
+StructuredName_V2 = Sequence[Sequence["str | StructuredName_V2 | Sequence[int]"]]
 
 # This models the type of the name a dtype might have in zarr v2 array metadata
 DTypeName_V2 = StructuredName_V2 | str
@@ -70,23 +70,39 @@ def check_structured_dtype_v2_inner(data: object) -> TypeGuard[StructuredName_V2
     A type guard for the inner elements of a structured dtype. This is a recursive check because
     the type is itself recursive.
 
-    This check ensures that all the elements are 2-element sequences beginning with a string
-    and ending with either another string or another 2-element sequence beginning with a string and
-    ending with another instance of that type.
+    This check ensures that all the elements are either 2-element or 3-element sequences that:
+        1. Begin with a string (name)
+        2. Have as their second element either a string (dtype) or another sequence (structured dtype)
+        3. If present, the third element is a sequence of integers giving the field's shape.
     """
     if isinstance(data, (str, Mapping)):
         return False
     if not isinstance(data, Sequence):
         return False
-    if len(data) != 2:
+    if len(data) != 2 and len(data) != 3:
         return False
-    if not (isinstance(data[0], str)):
+
+    name, dtype = data[0], data[1]
+
+    # check name element
+    if not (isinstance(name, str)):
         return False
-    if isinstance(data[-1], str):
+
+    # check shape element
+    if len(data) == 3:
+        shape = data[2]
+        if not isinstance(shape, Sequence):
+            return False
+        if not all(isinstance(dim, int) for dim in shape):
+            return False
+
+    # (recursively) check dtype element
+    if isinstance(dtype, str):
         return True
-    elif isinstance(data[-1], Sequence):
-        return check_structured_dtype_v2_inner(data[-1])
-    return False
+    elif isinstance(dtype, Sequence):
+        return check_structured_dtype_name_v2(dtype)
+    else:
+        return False
 
 
 def check_structured_dtype_name_v2(data: Sequence[object]) -> TypeGuard[StructuredName_V2]:

diff --git a/src/zarr/core/dtype/npy/bytes.py b/src/zarr/core/dtype/npy/bytes.py
@@ -606,7 +606,7 @@ def _check_native_dtype(
         Bool
             True if the dtype is an instance of np.dtypes.VoidDType with no fields, False otherwise.
         """
-        return cls.dtype_cls is type(dtype) and dtype.fields is None
+        return cls.dtype_cls is type(dtype) and dtype.fields is None and dtype.subdtype is None
 
     @classmethod
     def from_native_dtype(cls, dtype: TBaseDType) -> Self:

diff --git a/src/zarr/core/dtype/npy/structured.py b/src/zarr/core/dtype/npy/structured.py
@@ -22,6 +22,7 @@
     bytes_to_json,
     check_json_str,
 )
+from zarr.core.dtype.npy.subarray import Subarray
 from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType
 
 if TYPE_CHECKING:
@@ -34,9 +35,11 @@ class StructuredJSON_V2(DTypeConfig_V2[StructuredName_V2, None]):
     """
     A wrapper around the JSON representation of the ``Structured`` data type in Zarr V2.
 
-    The ``name`` field is a sequence of sequences, where each inner sequence has two values:
-    the field name and the data type name for that field (which could be another sequence).
-    The data type names are strings, and the object codec ID is always None.
+    The ``name`` field is a sequence of sequences, where each inner sequence has 2 or 3 values:
+        - First value: field name
+        - Second value: data type name (which could be another sequence for nested structured dtypes)
+        - Third value (optional): shape of the field (for subarray dtypes)
+    The object codec ID is always None.
 
     References
     ----------
@@ -49,7 +52,7 @@ class StructuredJSON_V2(DTypeConfig_V2[StructuredName_V2, None]):
     {
         "name": [
             ["f0", "<m8[10s]"],
-            ["f1", "<m8[10s]"],
+            ["f1", "<i4", [2, 2]],
         ],
         "object_codec_id": None
     }
@@ -252,17 +255,33 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self:
             # structured dtypes are constructed directly from a list of lists
             # note that we do not handle the object codec here! this will prevent structured
             # dtypes from containing object dtypes.
-            return cls(
-                fields=tuple(  # type: ignore[misc]
-                    (  # type: ignore[misc]
-                        f_name,
-                        get_data_type_from_json(
-                            {"name": f_dtype, "object_codec_id": None}, zarr_format=2
-                        ),
-                    )
-                    for f_name, f_dtype in data["name"]
+            fields = []
+            name = data["name"]
+            for tpl in name:
+                f_name = tpl[0]
+                if not isinstance(f_name, str):
+                    msg = f"Invalid field name. Got {f_name!r}, expected a string."
+                    raise DataTypeValidationError(msg)
+
+                f_dtype = tpl[1]
+                subdtype = get_data_type_from_json(
+                    {"name": f_dtype, "object_codec_id": None}, zarr_format=2
                 )
-            )
+
+                if len(tpl) == 3:
+                    f_shape = cast("tuple[int]", tuple(tpl[2]))
+                    if not all(isinstance(dim, int) for dim in f_shape):
+                        msg = f"Invalid shape for field {f_name!r}. Got {f_shape!r}, expected a sequence of integers."
+                        raise DataTypeValidationError(msg)
+                    subdtype = Subarray(
+                        subdtype=subdtype,
+                        shape=f_shape,
+                    )
+
+                fields.append((f_name, subdtype))
+
+            return cls(fields=tuple(fields))
+
         msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON array of arrays"
         raise DataTypeValidationError(msg)
 
@@ -309,11 +328,23 @@ def to_json(self, zarr_format: ZarrFormat) -> StructuredJSON_V2 | StructuredJSON
             If the zarr_format is not 2 or 3.
         """
         if zarr_format == 2:
-            fields = [
-                [f_name, f_dtype.to_json(zarr_format=zarr_format)["name"]]
-                for f_name, f_dtype in self.fields
-            ]
-            return {"name": fields, "object_codec_id": None}
+            fields = []
+            for f_name, f_dtype in self.fields:
+                if isinstance(f_dtype, Subarray):
+                    fields.append(
+                        [
+                            f_name,
+                            f_dtype.subdtype.to_json(zarr_format=zarr_format)["name"],
+                            list(f_dtype.shape),
+                        ]
+                    )
+                else:
+                    fields.append([f_name, f_dtype.to_json(zarr_format=zarr_format)["name"]])
+            dct = {
+                "name": fields,
+                "object_codec_id": None,
+            }
+            return cast("StructuredJSON_V2", dct)
         elif zarr_format == 3:
             v3_unstable_dtype_warning(self)
             fields = [
@@ -415,7 +446,6 @@ def default_scalar(self) -> np.void:
             The default scalar value, which is the scalar representation of 0
             cast to this structured data type.
         """
-
         return self._cast_scalar_unchecked(0)
 
     def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void: