-
-
Notifications
You must be signed in to change notification settings - Fork 364
Description
Zarr version
main
Numcodecs version
0.16.1
Python Version
3.11
Operating System
Mac
Installation
script metadata
Description
On main you can write an array with the `str` dtype. However, you cannot read it back. This differs from the behavior in 3.0.8 and is seemingly an uncaught consequence of the dtype changes. attn: @d-v-b
Steps to reproduce
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "zarr@git+https://github.com/zarr-developers/zarr-python.git@main",
# ]
# ///
import zarr
# each option here breaks in the same way
# string_data = ["abc", "ab", "a"]
# string_data = ["ab", "cd", "ef"]
string_data = ["ab"]
group = zarr.group("store.zarr", overwrite=True)
array = group.create("blah", shape=len(string_data), dtype=str)
array[:] = string_data
open_group = zarr.open("store.zarr")
open_group["blah"][:]
Gives:
traceback
Traceback (most recent call last):
File "/Users/ian/Documents/dev/xarray/tmp/string_test.py", line 19, in <module>
open_group["blah"][:]
~~~~~~~~~~~~~~~~~~^^^
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/core/array.py", line 2454, in __getitem__
return self.get_orthogonal_selection(pure_selection, fields=fields)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/_compat.py", line 43, in inner_f
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/core/array.py", line 2896, in get_orthogonal_selection
return sync(
^^^^^
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/core/sync.py", line 163, in sync
raise return_result
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/core/sync.py", line 119, in _runner
return await coro
^^^^^^^^^^
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/core/array.py", line 1315, in _get_selection
await self.codec_pipeline.read(
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/core/codec_pipeline.py", line 466, in read
await concurrent_map(
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/core/common.py", line 76, in concurrent_map
return await asyncio.gather(*[asyncio.ensure_future(run(item)) for item in items])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/core/common.py", line 74, in run
return await func(*item)
^^^^^^^^^^^^^^^^^
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/core/codec_pipeline.py", line 270, in read_batch
chunk_array_batch = await self.decode_batch(
^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/core/codec_pipeline.py", line 190, in decode_batch
chunk_array_batch = await ab_codec.decode(
^^^^^^^^^^^^^^^^^^^^^^
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/abc/codec.py", line 134, in decode
return await _batching_helper(self._decode_single, chunks_and_specs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/abc/codec.py", line 414, in _batching_helper
return await concurrent_map(
^^^^^^^^^^^^^^^^^^^^^
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/core/common.py", line 76, in concurrent_map
return await asyncio.gather(*[asyncio.ensure_future(run(item)) for item in items])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/core/common.py", line 74, in run
return await func(*item)
^^^^^^^^^^^^^^^^^
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/abc/codec.py", line 427, in wrap
return await func(chunk, chunk_spec)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/ian/.cache/uv/environments-v2/string-test-a602d4454d39a8df/lib/python3.12/site-packages/zarr/codecs/bytes.py", line 87, in _decode_single
as_nd_array_like.view(dtype=dtype)
ValueError: When changing to a smaller dtype, its size must be a divisor of the size of original dtype
This is a super confusing error and ideally it would have errored sooner, but ultimately this is about using an incompatible codec, I think. See below for metadata issues.
Additional output
Based on comparing the zarr.json metadata, it seems that the `str` dtype is no longer being correctly identified as `str` and is instead being turned into a 0-length fixed_length_utf32. (Notably, this is the case for multiple elements with different lengths, for a single string element, and for multiple elements with the same length.)
metadata for main
{
"shape": [
3
],
"data_type": {
"name": "fixed_length_utf32",
"configuration": {
"length_bytes": 0
}
},
"chunk_grid": {
"name": "regular",
"configuration": {
"chunk_shape": [
3
]
}
},
"chunk_key_encoding": {
"name": "default",
"configuration": {
"separator": "/"
}
},
"fill_value": "",
"codecs": [
{
"name": "bytes",
"configuration": {
"endian": "little"
}
},
{
"name": "zstd",
"configuration": {
"level": 0,
"checksum": false
}
}
],
"attributes": {},
"zarr_format": 3,
"node_type": "array",
"storage_transformers": []
}
and on 3.0.8 the metadata looks like this:
3.0.8 metadata
{
"shape": [
3
],
"data_type": "string",
"chunk_grid": {
"name": "regular",
"configuration": {
"chunk_shape": [
3
]
}
},
"chunk_key_encoding": {
"name": "default",
"configuration": {
"separator": "/"
}
},
"fill_value": "0",
"codecs": [
{
"name": "vlen-utf8",
"configuration": {}
},
{
"name": "zstd",
"configuration": {
"level": 0,
"checksum": false
}
}
],
"attributes": {},
"zarr_format": 3,
"node_type": "array",
"storage_transformers": []
}
Metadata Diff
--- a/pypi.json
+++ b/main.json
@@ -2,7 +2,12 @@
"shape": [
3
],
- "data_type": "string",
+ "data_type": {
+ "name": "fixed_length_utf32",
+ "configuration": {
+ "length_bytes": 0
+ }
+ },
"chunk_grid": {
"name": "regular",
"configuration": {
@@ -17,11 +22,13 @@
"separator": "/"
}
},
- "fill_value": "0",
+ "fill_value": "",
"codecs": [
{
- "name": "vlen-utf8",
- "configuration": {}
+ "name": "bytes",
+ "configuration": {
+ "endian": "little"
+ }
},
{
"name": "zstd",
@@ -37,4 +44,3 @@
"storage_transformers": []
}
-