Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/zarr/core/dtype/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from zarr.core.dtype.npy.float import Float16, Float32, Float64
from zarr.core.dtype.npy.int import Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64
from zarr.core.dtype.npy.structured import Structured, StructuredJSON_V2, StructuredJSON_V3
from zarr.core.dtype.npy.subarray import Subarray, SubarrayJSON_V3
from zarr.core.dtype.npy.time import (
DateTime64,
DateTime64JSON_V2,
Expand Down Expand Up @@ -78,6 +79,8 @@
"Structured",
"StructuredJSON_V2",
"StructuredJSON_V3",
"Subarray",
"SubarrayJSON_V3",
"TBaseDType",
"TBaseScalar",
"TimeDelta64",
Expand Down Expand Up @@ -126,6 +129,7 @@
| StringDType
| BytesDType
| Structured
| Subarray
| TimeDType
| VariableLengthBytes
)
Expand All @@ -139,6 +143,7 @@
*STRING_DTYPE,
*BYTES_DTYPE,
Structured,
Subarray,
*TIME_DTYPE,
VariableLengthBytes,
)
Expand Down
36 changes: 26 additions & 10 deletions src/zarr/core/dtype/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
# classes can perform a very specific type check.

# This is the JSON representation of a structured dtype in zarr v2
StructuredName_V2 = Sequence["str | StructuredName_V2"]
StructuredName_V2 = Sequence[Sequence["str | StructuredName_V2 | Sequence[int]"]]

# This models the type of the name a dtype might have in zarr v2 array metadata
DTypeName_V2 = StructuredName_V2 | str
Expand All @@ -70,23 +70,39 @@ def check_structured_dtype_v2_inner(data: object) -> TypeGuard[StructuredName_V2
A type guard for the inner elements of a structured dtype. This is a recursive check because
the type is itself recursive.

This check ensures that all the elements are 2-element sequences beginning with a string
and ending with either another string or another 2-element sequence beginning with a string and
ending with another instance of that type.
This check ensures that all the elements are either 2-element or 3-element sequences that:
1. Begin with a string (name)
2. Have as their second element either a string (dtype) or another sequence (structured dtype)
3. If they have a third element, it is a sequence representing the shape of the field.
"""
if isinstance(data, (str, Mapping)):
return False
if not isinstance(data, Sequence):
return False
if len(data) != 2:
if len(data) != 2 and len(data) != 3:
return False
if not (isinstance(data[0], str)):

name, dtype = data[0], data[1]

# check name element
if not (isinstance(name, str)):
return False
if isinstance(data[-1], str):

# check shape element
if len(data) == 3:
shape = data[2]
if not isinstance(shape, Sequence):
return False
if not all(isinstance(dim, int) for dim in shape):
return False

# (recursively) check dtype element
if isinstance(dtype, str):
return True
elif isinstance(data[-1], Sequence):
return check_structured_dtype_v2_inner(data[-1])
return False
elif isinstance(dtype, Sequence):
return check_structured_dtype_name_v2(dtype)
else:
return False


def check_structured_dtype_name_v2(data: Sequence[object]) -> TypeGuard[StructuredName_V2]:
Expand Down
2 changes: 1 addition & 1 deletion src/zarr/core/dtype/npy/bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -606,7 +606,7 @@ def _check_native_dtype(
Bool
True if the dtype is an instance of np.dtypes.VoidDType with no fields, False otherwise.
"""
return cls.dtype_cls is type(dtype) and dtype.fields is None
return cls.dtype_cls is type(dtype) and dtype.fields is None and dtype.subdtype is None

@classmethod
def from_native_dtype(cls, dtype: TBaseDType) -> Self:
Expand Down
70 changes: 50 additions & 20 deletions src/zarr/core/dtype/npy/structured.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
bytes_to_json,
check_json_str,
)
from zarr.core.dtype.npy.subarray import Subarray
from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType

if TYPE_CHECKING:
Expand All @@ -34,9 +35,11 @@ class StructuredJSON_V2(DTypeConfig_V2[StructuredName_V2, None]):
"""
A wrapper around the JSON representation of the ``Structured`` data type in Zarr V2.

The ``name`` field is a sequence of sequences, where each inner sequence has two values:
the field name and the data type name for that field (which could be another sequence).
The data type names are strings, and the object codec ID is always None.
The ``name`` field is a sequence of sequences, where each inner sequence has 2 or 3 values:
- First value: field name
- Second value: data type name (which could be another sequence for nested structured dtypes)
- Third value (optional): shape of the field (for subarray dtypes)
The object codec ID is always None.

References
----------
Expand All @@ -49,7 +52,7 @@ class StructuredJSON_V2(DTypeConfig_V2[StructuredName_V2, None]):
{
"name": [
["f0", "<m8[10s]"],
["f1", "<m8[10s]"],
["f1", "int32", [2, 2]],
],
"object_codec_id": None
}
Expand Down Expand Up @@ -252,17 +255,33 @@ def _from_json_v2(cls, data: DTypeJSON) -> Self:
# structured dtypes are constructed directly from a list of lists
# note that we do not handle the object codec here! this will prevent structured
# dtypes from containing object dtypes.
return cls(
fields=tuple( # type: ignore[misc]
( # type: ignore[misc]
f_name,
get_data_type_from_json(
{"name": f_dtype, "object_codec_id": None}, zarr_format=2
),
)
for f_name, f_dtype in data["name"]
fields = []
name = data["name"]
for tpl in name:
f_name = tpl[0]
if not isinstance(f_name, str):
msg = f"Invalid field name. Got {f_name!r}, expected a string."
raise DataTypeValidationError(msg)

f_dtype = tpl[1]
subdtype = get_data_type_from_json(
{"name": f_dtype, "object_codec_id": None}, zarr_format=2
)
)

if len(tpl) == 3:
f_shape = cast("tuple[int]", tuple(tpl[2]))
if not all(isinstance(dim, int) for dim in f_shape):
msg = f"Invalid shape for field {f_name!r}. Got {f_shape!r}, expected a sequence of integers."
raise DataTypeValidationError(msg)
subdtype = Subarray(
subdtype=subdtype,
shape=f_shape,
)

fields.append((f_name, subdtype))

return cls(fields=tuple(fields))

msg = f"Invalid JSON representation of {cls.__name__}. Got {data!r}, expected a JSON array of arrays"
raise DataTypeValidationError(msg)

Expand Down Expand Up @@ -309,11 +328,23 @@ def to_json(self, zarr_format: ZarrFormat) -> StructuredJSON_V2 | StructuredJSON
If the zarr_format is not 2 or 3.
"""
if zarr_format == 2:
fields = [
[f_name, f_dtype.to_json(zarr_format=zarr_format)["name"]]
for f_name, f_dtype in self.fields
]
return {"name": fields, "object_codec_id": None}
fields = []
for f_name, f_dtype in self.fields:
if isinstance(f_dtype, Subarray):
fields.append(
[
f_name,
f_dtype.subdtype.to_json(zarr_format=zarr_format)["name"],
list(f_dtype.shape),
]
)
else:
fields.append([f_name, f_dtype.to_json(zarr_format=zarr_format)["name"]])
dct = {
"name": fields,
"object_codec_id": None,
}
return cast("StructuredJSON_V2", dct)
elif zarr_format == 3:
v3_unstable_dtype_warning(self)
fields = [
Expand Down Expand Up @@ -415,7 +446,6 @@ def default_scalar(self) -> np.void:
The default scalar value, which is the scalar representation of 0
cast to this structured data type.
"""

return self._cast_scalar_unchecked(0)

def from_json_scalar(self, data: JSON, *, zarr_format: ZarrFormat) -> np.void:
Expand Down
Loading