From 605620b563911491b875a65b8e5aa25f4087df6e Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Thu, 28 Jul 2022 15:04:18 +0200 Subject: [PATCH 01/22] add storage_transformers and get/set_partial_values --- zarr/_storage/store.py | 149 ++++++++++++++++++++++++++++++++++ zarr/core.py | 12 +++ zarr/creation.py | 4 +- zarr/meta.py | 45 +++++++++- zarr/storage.py | 9 +- zarr/tests/test_creation.py | 9 ++ zarr/tests/test_storage_v3.py | 77 +++++++++++++++++- 7 files changed, 298 insertions(+), 7 deletions(-) diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py index 6faf4a1250..ebaff601d6 100644 --- a/zarr/_storage/store.py +++ b/zarr/_storage/store.py @@ -1,6 +1,8 @@ import abc import os +from collections import defaultdict from collections.abc import MutableMapping +from copy import copy from string import ascii_letters, digits from typing import Any, List, Mapping, Optional, Union @@ -254,6 +256,44 @@ def __setitem__(self, key, value): def __getitem__(self, key): """Get a value.""" + def get_partial_values(self, key_ranges): + """Get multiple partial values. + key_ranges can be an iterable of key, range pairs, + where a range specifies two integers range_start and range_length + as a tuple, (range_start, range_length). + Length may be None to indicate to read until the end. + A key may occur multiple times with different ranges.""" + results = [None] * len(key_ranges) + indexed_ranges_by_key = defaultdict(list) + for i, (key, range_) in enumerate(key_ranges): + indexed_ranges_by_key[key].append((i, range_)) + for key, indexed_ranges in indexed_ranges_by_key.items(): + value = self[key] + for i, (range_from, range_length) in indexed_ranges: + if range_length is None: + results[i] = value[range_from:] + else: + results[i] = value[range_from:range_from + range_length] + return results + + def set_partial_values(self, key_start_values): + """Set multiple partial values. + key_start_values can be an iterable of key, start and value triplets + as tuples, (key, start, value), where start defines the offset in bytes. + A key may occur multiple times with different starts and non-overlapping values. + Also, start may only be beyond the current value if other values fill the gap.""" + unique_keys = set(next(zip(*key_start_values))) + values = {key: bytearray(self.get(key)) for key in unique_keys} + for key, start, value in key_start_values: + if values[key] is None: + assert start == 0 + values[key] = value + else: + assert start <= len(values[key]) + values[key][start:start + len(value)] = value + for key, value in values.items(): + self[key] = value + def clear(self): """Remove all items from store.""" self.erase_prefix("/") @@ -303,6 +343,115 @@ def _ensure_store(store): ) +class StorageTransformer(MutableMapping, abc.ABC): + def __init__(self, _type) -> None: + assert _type in self.valid_types + self.type = _type + self._inner_store = None + + def _copy_for_array(self, inner_store): + transformer_copy = copy(self) + transformer_copy._inner_store = inner_store + return transformer_copy + + @abc.abstractproperty + def extension_uri(self): + pass + + @abc.abstractproperty + def valid_types(self): + pass + + def get_config(self): + """Return a dictionary holding configuration parameters for this + storage transformer. All values must be compatible with JSON encoding.""" + # Override in sub-class if need special encoding of config values. + # By default, assume all non-private members are configuration + # parameters except for type . + return { + k: v for k, v in self.__dict__.items() + if not k.startswith('_') and k != "type" + } + + @classmethod + def from_config(cls, _type, config): + """Instantiate storage transformer from a configuration object.""" + # override in sub-class if need special decoding of config values + + # by default, assume constructor accepts configuration parameters as + # keyword arguments without any special decoding + return cls(_type, **config) + + def is_readable(self): + return self._inner_store.is_readable() + + def is_writeable(self): + return self._inner_store.is_writeable() + + def is_listable(self): + return self._inner_store.is_listable() + + def is_erasable(self): + return self._inner_store.is_erasable() + + def __enter__(self): + return self._inner_store.__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + return self._inner_store.__exit__(exc_type, exc_value, traceback) + + def close(self) -> None: + return self._inner_store.close() + + def rename(self, src_path: str, dst_path: str) -> None: + return self._inner_store.rename(src_path, dst_path) + + def list_prefix(self, prefix): + return self._inner_store.list_prefix(prefix) + + def erase(self, key): + return self._inner_store.erase(key) + + def erase_prefix(self, prefix): + return self._inner_store.erase_prefix(prefix) + + def list_dir(self, prefix): + return self._inner_store.list_dir(prefix) + + def list(self): + return self._inner_store.list() + + def __contains__(self, key): + return self._inner_store.__contains__(key) + + def __setitem__(self, key, value): + return self._inner_store.__setitem__(key, value) + + def __getitem__(self, key): + return self._inner_store.__getitem__(key) + + def __delitem__(self, key): + return self._inner_store.__delitem__(key) + + def __iter__(self): + return self._inner_store.__iter__() + + def __len__(self): + return self._inner_store.__len__() + + def get_partial_values(self, key_ranges): + return self._inner_store.get_partial_values(key_ranges) + + def set_partial_values(self, key_start_values): + return self._inner_store.set_partial_values(key_start_values) + + def clear(self): + return self._inner_store.clear() + + def __eq__(self, other): + return self._inner_store.__eq__(other) + + # allow MutableMapping for backwards compatibility StoreLike = Union[BaseStore, MutableMapping] diff --git a/zarr/core.py b/zarr/core.py index bd61639ef6..fd60f35f93 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -280,6 +280,18 @@ def _load_metadata_nosync(self): filters = [get_codec(config) for config in filters] self._filters = filters + if self._version == 3: + storage_transformers = meta.get('storage_transformers', []) + transformed_store = self._store + for storage_transformer in storage_transformers: + transformed_store = storage_transformer._copy_for_array(transformed_store) + self._store = transformed_store + if self._chunk_store is not None: + transformed_chunk_store = self._chunk_store + for storage_transformer in storage_transformers: + transformed_chunk_store = storage_transformer._copy_for_array(transformed_chunk_store) + self._chunk_store = transformed_chunk_store + def _refresh_metadata(self): if not self._cache_metadata: self._load_metadata() diff --git a/zarr/creation.py b/zarr/creation.py index e77f26b3e2..537c0b8560 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -21,7 +21,7 @@ def create(shape, chunks=True, dtype=None, compressor='default', overwrite=False, path=None, chunk_store=None, filters=None, cache_metadata=True, cache_attrs=True, read_only=False, object_codec=None, dimension_separator=None, write_empty_chunks=True, - *, zarr_version=None, **kwargs): + storage_transformers=None, *, zarr_version=None, **kwargs): """Create an array. Parameters @@ -161,7 +161,7 @@ def create(shape, chunks=True, dtype=None, compressor='default', init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, overwrite=overwrite, path=path, chunk_store=chunk_store, filters=filters, object_codec=object_codec, - dimension_separator=dimension_separator) + dimension_separator=dimension_separator, storage_transformers=storage_transformers) # instantiate array z = Array(store, path=path, chunk_store=chunk_store, synchronizer=synchronizer, diff --git a/zarr/meta.py b/zarr/meta.py index c290e90163..3008e11a98 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -9,7 +9,11 @@ from zarr.errors import MetadataError from zarr.util import json_dumps, json_loads -from typing import cast, Union, Any, List, Mapping as MappingType, Optional +from typing import cast, Union, Any, List, Mapping as MappingType, Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from zarr._storage.store import StorageTransformer + ZARR_FORMAT = 2 ZARR_FORMAT_v3 = 3 @@ -459,6 +463,31 @@ def _decode_codec_metadata(cls, meta: Optional[Mapping]) -> Optional[Codec]: return codec + @classmethod + def _encode_storage_transformer_metadata(cls, storage_transformer: "StorageTransformer") -> Optional[Mapping]: + return { + "extension": storage_transformer.extension_uri, + "type": storage_transformer.type, + "configuration": storage_transformer.get_config(), + } + + @classmethod + def _decode_storage_transformer_metadata(cls, meta: Mapping) -> "StorageTransformer": + from zarr.tests.test_storage_v3 import DummyStorageTransfomer + KNOWN_STORAGE_TRANSFORMERS = [DummyStorageTransfomer] + + conf = meta.get('configuration', {}) + extension_uri = meta['extension'] + transformer_type = meta['type'] + + for StorageTransformerCls in KNOWN_STORAGE_TRANSFORMERS: + if StorageTransformerCls.extension_uri == extension_uri: + break + else: + raise NotImplementedError + + return StorageTransformerCls.from_config(transformer_type, conf) + @classmethod def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, Any]: meta = cls.parse_metadata(s) @@ -476,6 +505,11 @@ def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, A # TODO: remove dimension_separator? compressor = cls._decode_codec_metadata(meta.get("compressor", None)) + storage_transformers = meta.get("storage_transformers", None) + if storage_transformers: + storage_transformers = [ + cls._decode_storage_transformer_metadata(i) for i in storage_transformers + ] extensions = meta.get("extensions", []) meta = dict( shape=tuple(meta["shape"]), @@ -493,6 +527,8 @@ def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, A # compressor field should be absent when there is no compression if compressor: meta['compressor'] = compressor + if storage_transformers: + meta['storage_transformers'] = storage_transformers except Exception as e: raise MetadataError("error decoding metadata: %s" % e) @@ -514,6 +550,11 @@ def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: object_codec = None compressor = cls._encode_codec_metadata(meta.get("compressor", None)) + storage_transformers = meta.get("storage_transformers", None) + if storage_transformers: + storage_transformers = [ + cls._encode_storage_transformer_metadata(i) for i in storage_transformers + ] extensions = meta.get("extensions", []) meta = dict( shape=meta["shape"] + sdshape, @@ -532,6 +573,8 @@ def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: meta["compressor"] = compressor if dimension_separator: meta["dimension_separator"] = dimension_separator + if storage_transformers: + meta["storage_transformers"] = storage_transformers return json_dumps(meta) diff --git a/zarr/storage.py b/zarr/storage.py index 440b41ea07..e1f7e41d5f 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -294,6 +294,7 @@ def init_array( filters=None, object_codec=None, dimension_separator=None, + storage_transformers=None, ): """Initialize an array store with the given configuration. Note that this is a low-level function and there should be no need to call this directly from user code. @@ -421,7 +422,8 @@ def init_array( order=order, overwrite=overwrite, path=path, chunk_store=chunk_store, filters=filters, object_codec=object_codec, - dimension_separator=dimension_separator) + dimension_separator=dimension_separator, + storage_transformers=storage_transformers) def _init_array_metadata( @@ -438,6 +440,7 @@ def _init_array_metadata( filters=None, object_codec=None, dimension_separator=None, + storage_transformers=None, ): store_version = getattr(store, '_store_version', 2) @@ -559,6 +562,7 @@ def _init_array_metadata( if store_version < 3: meta.update(dict(chunks=chunks, dtype=dtype, order=order, filters=filters_config)) + assert not storage_transformers else: if dimension_separator is None: dimension_separator = "/" @@ -572,7 +576,8 @@ def _init_array_metadata( separator=dimension_separator), chunk_memory_layout=order, data_type=dtype, - attributes=attributes) + attributes=attributes, + storage_transformers=storage_transformers) ) key = _prefix_to_array_key(store, _path_to_prefix(path)) diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index b8ab118329..6b0e0ee3e3 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -20,6 +20,7 @@ from zarr._storage.store import v3_api_available from zarr._storage.v3 import DirectoryStoreV3, KVStoreV3 from zarr.sync import ThreadSynchronizer +from zarr.tests.test_storage_v3 import DummyStorageTransfomer _VERSIONS = ((None, 2, 3) if v3_api_available else (None, 2)) _VERSIONS2 = ((2, 3) if v3_api_available else (2, )) @@ -724,3 +725,11 @@ def test_create_read_only(zarr_version): def test_json_dumps_chunks_numpy_dtype(): z = zeros((10,), chunks=(np.int64(2),)) assert np.all(z[...] == 0) + + +def test_create_with_storage_transformers(): + kwargs = _init_creation_kwargs(zarr_version=3) + transformer = DummyStorageTransfomer("dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT) + z = create(1000000000, chunks=True, storage_transformers=[transformer], **kwargs) + assert isinstance(z._store, DummyStorageTransfomer) + assert z._store.test_value == DummyStorageTransfomer.TEST_CONSTANT diff --git a/zarr/tests/test_storage_v3.py b/zarr/tests/test_storage_v3.py index a33f274621..a76d60f550 100644 --- a/zarr/tests/test_storage_v3.py +++ b/zarr/tests/test_storage_v3.py @@ -1,6 +1,7 @@ import array import atexit import copy +import inspect import os import tempfile @@ -8,7 +9,7 @@ import pytest import zarr -from zarr._storage.store import _get_hierarchy_metadata, v3_api_available +from zarr._storage.store import _get_hierarchy_metadata, v3_api_available, StorageTransformer, StoreV3 from zarr.meta import _default_entry_point_metadata_v3 from zarr.storage import (atexit_rmglob, atexit_rmtree, data_root, default_compressor, getsize, init_array, meta_root, @@ -88,6 +89,18 @@ def keys(self): """keys""" +class DummyStorageTransfomer(StorageTransformer): + TEST_CONSTANT = "test1234" + + extension_uri="https://purl.org/zarr/spec/storage_transformers/dummy/1.0" + valid_types=["dummy_type"] + + def __init__(self, _type, test_value) -> None: + super().__init__(_type) + assert test_value == self.TEST_CONSTANT + self.test_value = test_value + + def test_ensure_store_v3(): class InvalidStore: pass @@ -190,8 +203,9 @@ def test_init_array(self, dimension_separator_fixture_v3): store = self.create_store() path = 'arr1' + transformer = DummyStorageTransfomer("dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT) init_array(store, path=path, shape=1000, chunks=100, - dimension_separator=pass_dim_sep) + dimension_separator=pass_dim_sep, storage_transformers=[transformer]) # check metadata mkey = meta_root + path + '.array.json' @@ -204,6 +218,9 @@ def test_init_array(self, dimension_separator_fixture_v3): assert meta['fill_value'] is None # Missing MUST be assumed to be "/" assert meta['chunk_grid']['separator'] is want_dim_sep + assert len(meta["storage_transformers"]) == 1 + assert isinstance(meta["storage_transformers"][0], DummyStorageTransfomer) + assert meta["storage_transformers"][0].test_value == DummyStorageTransfomer.TEST_CONSTANT store.close() def test_list_prefix(self): @@ -235,6 +252,46 @@ def test_rename_nonexisting(self): with pytest.raises(NotImplementedError): store.rename('a', 'b') + def test_get_partial_values(self): + store = self.create_store() + store[data_root + 'foo'] = b'abcdefg' + store[data_root + 'baz'] = b'z' + assert [b'a'] == store.get_partial_values( + [ + (data_root + 'foo', (0, 1)) + ] + ) + assert [b'd', b'b', b'z', b'abc', b'defg'] == store.get_partial_values( + [ + (data_root + 'foo', (3, 1)), + (data_root + 'foo', (1, 1)), + (data_root + 'baz', (0, 1)), + (data_root + 'foo', (0, 3)), + (data_root + 'foo', (3, 4)), + ] + ) + + def test_set_partial_values(self): + store = self.create_store() + store[data_root + 'foo'] = b'abcdefg' + store[data_root + 'baz'] = b'z' + store.set_partial_values( + [ + (data_root + 'foo', 0, b'hey') + ] + ) + assert store[data_root + 'foo'] == b'heydefg' + store.set_partial_values( + [ + (data_root + 'foo', 1, b'oo'), + (data_root + 'baz', 1, b'zzz'), + (data_root + 'baz', 4, b'aaaa'), + (data_root + 'foo', 6, b'done'), + ] + ) + assert store[data_root + 'foo'] == b'hoodefdone' + assert store[data_root + 'baz'] == b'zzzzaaaa' + class TestMappingStoreV3(StoreV3Tests): @@ -530,3 +587,19 @@ def test_top_level_imports(): assert hasattr(zarr, store_name) # pragma: no cover else: assert not hasattr(zarr, store_name) # pragma: no cover + + +def _get_public_and_dunder_methods(some_class): + return set( + name for name, _ in inspect.getmembers(some_class, predicate=inspect.isfunction) + if not name.startswith("_") or name.startswith("__") + ) + + +def test_storage_transformer_interface(): + store_v3_methods = _get_public_and_dunder_methods(StoreV3) + store_v3_methods.discard("__init__") + storage_transformer_methods = _get_public_and_dunder_methods(StorageTransformer) + storage_transformer_methods.discard("__init__") + storage_transformer_methods.discard("get_config") + assert storage_transformer_methods == store_v3_methods From 566e4b0d325d00861287182d76291e4074554758 Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Thu, 28 Jul 2022 15:25:00 +0200 Subject: [PATCH 02/22] formatting --- zarr/core.py | 4 +++- zarr/meta.py | 5 ++++- zarr/tests/test_creation.py | 5 ++++- zarr/tests/test_storage_v3.py | 10 ++++++---- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index fd60f35f93..4faa19727b 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -289,7 +289,9 @@ def _load_metadata_nosync(self): if self._chunk_store is not None: transformed_chunk_store = self._chunk_store for storage_transformer in storage_transformers: - transformed_chunk_store = storage_transformer._copy_for_array(transformed_chunk_store) + transformed_chunk_store = ( + storage_transformer._copy_for_array(transformed_chunk_store) + ) self._chunk_store = transformed_chunk_store def _refresh_metadata(self): diff --git a/zarr/meta.py b/zarr/meta.py index 3008e11a98..c719e1f22a 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -464,7 +464,10 @@ def _decode_codec_metadata(cls, meta: Optional[Mapping]) -> Optional[Codec]: return codec @classmethod - def _encode_storage_transformer_metadata(cls, storage_transformer: "StorageTransformer") -> Optional[Mapping]: + def _encode_storage_transformer_metadata( + cls, + storage_transformer: "StorageTransformer" + ) -> Optional[Mapping]: return { "extension": storage_transformer.extension_uri, "type": storage_transformer.type, diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index 6b0e0ee3e3..c289fbc639 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -729,7 +729,10 @@ def test_json_dumps_chunks_numpy_dtype(): def test_create_with_storage_transformers(): kwargs = _init_creation_kwargs(zarr_version=3) - transformer = DummyStorageTransfomer("dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT) + transformer = DummyStorageTransfomer( + "dummy_type", + test_value=DummyStorageTransfomer.TEST_CONSTANT + ) z = create(1000000000, chunks=True, storage_transformers=[transformer], **kwargs) assert isinstance(z._store, DummyStorageTransfomer) assert z._store.test_value == DummyStorageTransfomer.TEST_CONSTANT diff --git a/zarr/tests/test_storage_v3.py b/zarr/tests/test_storage_v3.py index a76d60f550..5b9ba8afd7 100644 --- a/zarr/tests/test_storage_v3.py +++ b/zarr/tests/test_storage_v3.py @@ -9,7 +9,7 @@ import pytest import zarr -from zarr._storage.store import _get_hierarchy_metadata, v3_api_available, StorageTransformer, StoreV3 +from zarr._storage.store import _get_hierarchy_metadata, v3_api_available, StorageTransformer from zarr.meta import _default_entry_point_metadata_v3 from zarr.storage import (atexit_rmglob, atexit_rmtree, data_root, default_compressor, getsize, init_array, meta_root, @@ -92,8 +92,8 @@ def keys(self): class DummyStorageTransfomer(StorageTransformer): TEST_CONSTANT = "test1234" - extension_uri="https://purl.org/zarr/spec/storage_transformers/dummy/1.0" - valid_types=["dummy_type"] + extension_uri = "https://purl.org/zarr/spec/storage_transformers/dummy/1.0" + valid_types = ["dummy_type"] def __init__(self, _type, test_value) -> None: super().__init__(_type) @@ -203,7 +203,9 @@ def test_init_array(self, dimension_separator_fixture_v3): store = self.create_store() path = 'arr1' - transformer = DummyStorageTransfomer("dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT) + transformer = DummyStorageTransfomer( + "dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT + ) init_array(store, path=path, shape=1000, chunks=100, dimension_separator=pass_dim_sep, storage_transformers=[transformer]) From 5f85439ff7f0a8643c2b272a97f37df6712ebf8b Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Thu, 28 Jul 2022 17:55:19 +0200 Subject: [PATCH 03/22] add docs and release notes --- docs/release.rst | 10 ++++++++++ zarr/_storage/store.py | 3 +++ zarr/creation.py | 8 ++++++++ zarr/meta.py | 2 ++ 4 files changed, 23 insertions(+) diff --git a/docs/release.rst b/docs/release.rst index b729f20ee0..138929881e 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -11,6 +11,16 @@ Release notes Unreleased ---------- +Enhancements +~~~~~~~~~~~~ + +* **Improve Zarr V3 support, adding partial store read/write and storage transformers.** + Add two features of the [v3 spec](https://zarr-specs.readthedocs.io/en/latest/core/v3.0.html): + * storage transformers + * `get_partial_values` and `set_partial_values` + By :user:`Jonathan Striebel `; :issue:`1096`. + + Documentation ~~~~~~~~~~~~~ diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py index ebaff601d6..76b101ae60 100644 --- a/zarr/_storage/store.py +++ b/zarr/_storage/store.py @@ -344,6 +344,9 @@ def _ensure_store(store): class StorageTransformer(MutableMapping, abc.ABC): + """Base class for storage transformers. The methods simply pass on the data as-is + and should be overwritten by sub-classes.""" + def __init__(self, _type) -> None: assert _type in self.valid_types self.type = _type diff --git a/zarr/creation.py b/zarr/creation.py index 537c0b8560..446c85b3d6 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -84,6 +84,14 @@ def create(shape, chunks=True, dtype=None, compressor='default', .. versionadded:: 2.11 + storage_transformers : sequence of StorageTransformers, optional + May only be set when using zarr_version 3. + Setting storage transformers, changing the storage structure and behaviour + of data coming in the underlying store. The transformers are applied in the + order of the given sequence. + + .. versionadded:: 2.13 + zarr_version : {None, 2, 3}, optional The zarr protocol version of the created array. If None, it will be inferred from ``store`` or ``chunk_store`` if they are provided, diff --git a/zarr/meta.py b/zarr/meta.py index c719e1f22a..a469d83807 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -477,6 +477,8 @@ def _encode_storage_transformer_metadata( @classmethod def _decode_storage_transformer_metadata(cls, meta: Mapping) -> "StorageTransformer": from zarr.tests.test_storage_v3 import DummyStorageTransfomer + + # This might be changed to a proper registry in the future KNOWN_STORAGE_TRANSFORMERS = [DummyStorageTransfomer] conf = meta.get('configuration', {}) From dd7fedb9186870d4e7cd2b418a9385d4ecb4366b Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Fri, 29 Jul 2022 11:26:04 +0200 Subject: [PATCH 04/22] add test_core testcase --- zarr/_storage/store.py | 3 +++ zarr/tests/test_core.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py index 76b101ae60..417ac450a8 100644 --- a/zarr/_storage/store.py +++ b/zarr/_storage/store.py @@ -347,6 +347,9 @@ class StorageTransformer(MutableMapping, abc.ABC): """Base class for storage transformers. The methods simply pass on the data as-is and should be overwritten by sub-classes.""" + _store_version = 3 + _metadata_class = Metadata3 + def __init__(self, _type) -> None: assert _type in self.valid_types self.type = _type diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index f5f043e6e3..bccc192d66 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -55,6 +55,7 @@ SQLiteStoreV3, StoreV3, ) +from zarr.tests.test_storage_v3 import DummyStorageTransfomer from zarr.util import buffer_size from zarr.tests.util import abs_container, skip_test_env_var, have_fsspec @@ -3348,6 +3349,35 @@ def expected(self): ] +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +class TestArrayWithStorageTransformersV3(TestArrayWithPathV3): + + @staticmethod + def create_array(array_path='arr1', read_only=False, **kwargs): + store = KVStoreV3(dict()) + kwargs.setdefault('compressor', Zlib(level=1)) + cache_metadata = kwargs.pop('cache_metadata', True) + cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) + dummy_storage_transformer = DummyStorageTransfomer( + "dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT + ) + init_array(store, path=array_path, storage_transformers=[dummy_storage_transformer], + **kwargs) + return Array(store, path=array_path, read_only=read_only, + cache_metadata=cache_metadata, cache_attrs=cache_attrs, + write_empty_chunks=write_empty_chunks) + + def expected(self): + return [ + "0bc73a90578b908bfe8d5b90aaf79511cc0a5f18", + "ae4ce0caa648d312e9cbe09bc35a3d197945f648", + "c3a018158668c18a615e38f32b1ea3ce248f4d1f", + "aaa1558d072f3d7fc30959992dbd9923458c25ba", + "9587eb0d9662b6b6c1e1fa4a623b5facc1110e5f", + ] + + @pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") def test_array_mismatched_store_versions(): store_v3 = KVStoreV3(dict()) From e33b36522177ca7b8a1dd0aca60c5efb2f0fad14 Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Fri, 29 Jul 2022 16:29:36 +0200 Subject: [PATCH 05/22] Update zarr/creation.py Co-authored-by: Gregory Lee --- zarr/creation.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/zarr/creation.py b/zarr/creation.py index 446c85b3d6..5551cb7f35 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -85,10 +85,9 @@ def create(shape, chunks=True, dtype=None, compressor='default', .. versionadded:: 2.11 storage_transformers : sequence of StorageTransformers, optional - May only be set when using zarr_version 3. - Setting storage transformers, changing the storage structure and behaviour - of data coming in the underlying store. The transformers are applied in the - order of the given sequence. + Setting storage transformers, changes the storage structure and behaviour + of data coming from the underlying store. The transformers are applied in the + order of the given sequence. May only be set when using zarr_version 3. .. versionadded:: 2.13 From 81ebf68d854956887f07fc75cb08fccfc89074b2 Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Fri, 29 Jul 2022 16:36:29 +0200 Subject: [PATCH 06/22] apply PR feedback --- zarr/_storage/store.py | 13 +++++++++++-- zarr/creation.py | 2 +- zarr/tests/test_storage_v3.py | 3 ++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py index 417ac450a8..ff35c4fbd2 100644 --- a/zarr/_storage/store.py +++ b/zarr/_storage/store.py @@ -289,7 +289,12 @@ def set_partial_values(self, key_start_values): assert start == 0 values[key] = value else: - assert start <= len(values[key]) + if start > len(values[key]): + raise ValueError( + f"Cannot set value at start {start}, " + + f"since it is beyond the data at key {key}, " + + f"having length {len(values[key])}." + ) values[key][start:start + len(value)] = value for key, value in values.items(): self[key] = value @@ -351,7 +356,11 @@ class StorageTransformer(MutableMapping, abc.ABC): _metadata_class = Metadata3 def __init__(self, _type) -> None: - assert _type in self.valid_types + if _type not in self.valid_types: + raise ValueError( + f"Storage transformer cannot be initialized with type {_type}, " + + f"must be one of {list(self.valid_types)}." + ) self.type = _type self._inner_store = None diff --git a/zarr/creation.py b/zarr/creation.py index 5551cb7f35..bbafb53b00 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -21,7 +21,7 @@ def create(shape, chunks=True, dtype=None, compressor='default', overwrite=False, path=None, chunk_store=None, filters=None, cache_metadata=True, cache_attrs=True, read_only=False, object_codec=None, dimension_separator=None, write_empty_chunks=True, - storage_transformers=None, *, zarr_version=None, **kwargs): + *, zarr_version=None, storage_transformers=None, **kwargs): """Create an array. Parameters diff --git a/zarr/tests/test_storage_v3.py b/zarr/tests/test_storage_v3.py index 5b9ba8afd7..ad2775ad64 100644 --- a/zarr/tests/test_storage_v3.py +++ b/zarr/tests/test_storage_v3.py @@ -263,13 +263,14 @@ def test_get_partial_values(self): (data_root + 'foo', (0, 1)) ] ) - assert [b'd', b'b', b'z', b'abc', b'defg'] == store.get_partial_values( + assert [b'd', b'b', b'z', b'abc', b'defg', b'defg'] == store.get_partial_values( [ (data_root + 'foo', (3, 1)), (data_root + 'foo', (1, 1)), (data_root + 'baz', (0, 1)), (data_root + 'foo', (0, 3)), (data_root + 'foo', (3, 4)), + (data_root + 'foo', (3, None)), ] ) From ca28471972e0981a96e6162041ae351cece471f6 Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Fri, 29 Jul 2022 16:42:23 +0200 Subject: [PATCH 07/22] add comment that storage_transformers=None is the same as storage_transformers=[] --- zarr/creation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/zarr/creation.py b/zarr/creation.py index bbafb53b00..90f22f33da 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -87,7 +87,9 @@ def create(shape, chunks=True, dtype=None, compressor='default', storage_transformers : sequence of StorageTransformers, optional Setting storage transformers, changes the storage structure and behaviour of data coming from the underlying store. The transformers are applied in the - order of the given sequence. May only be set when using zarr_version 3. + order of the given sequence. Supplying an empty sequence is the same as omitting + the argument or setting it to None. May only be set when using zarr_version 3. + Supplying an empty seq .. versionadded:: 2.13 From 85f3309bf09f90c03b563a10d601dd67cd464870 Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Mon, 1 Aug 2022 09:37:38 +0200 Subject: [PATCH 08/22] use empty tuple as default for storage_transformers --- zarr/creation.py | 2 +- zarr/meta.py | 18 ++++++++---------- zarr/storage.py | 4 ++-- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/zarr/creation.py b/zarr/creation.py index 90f22f33da..0aa71c4137 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -21,7 +21,7 @@ def create(shape, chunks=True, dtype=None, compressor='default', overwrite=False, path=None, chunk_store=None, filters=None, cache_metadata=True, cache_attrs=True, read_only=False, object_codec=None, dimension_separator=None, write_empty_chunks=True, - *, zarr_version=None, storage_transformers=None, **kwargs): + *, zarr_version=None, storage_transformers=(), **kwargs): """Create an array. Parameters diff --git a/zarr/meta.py b/zarr/meta.py index a469d83807..653e74937b 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -510,11 +510,10 @@ def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, A # TODO: remove dimension_separator? compressor = cls._decode_codec_metadata(meta.get("compressor", None)) - storage_transformers = meta.get("storage_transformers", None) - if storage_transformers: - storage_transformers = [ - cls._decode_storage_transformer_metadata(i) for i in storage_transformers - ] + storage_transformers = meta.get("storage_transformers", ()) + storage_transformers = [ + cls._decode_storage_transformer_metadata(i) for i in storage_transformers + ] extensions = meta.get("extensions", []) meta = dict( shape=tuple(meta["shape"]), @@ -555,11 +554,10 @@ def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: object_codec = None compressor = cls._encode_codec_metadata(meta.get("compressor", None)) - storage_transformers = meta.get("storage_transformers", None) - if storage_transformers: - storage_transformers = [ - cls._encode_storage_transformer_metadata(i) for i in storage_transformers - ] + storage_transformers = meta.get("storage_transformers", ()) + storage_transformers = [ + cls._encode_storage_transformer_metadata(i) for i in storage_transformers + ] extensions = meta.get("extensions", []) meta = dict( shape=meta["shape"] + sdshape, diff --git a/zarr/storage.py b/zarr/storage.py index e1f7e41d5f..e0c7407c28 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -294,7 +294,7 @@ def init_array( filters=None, object_codec=None, dimension_separator=None, - storage_transformers=None, + storage_transformers=(), ): """Initialize an array store with the given configuration. Note that this is a low-level function and there should be no need to call this directly from user code. @@ -440,7 +440,7 @@ def _init_array_metadata( filters=None, object_codec=None, dimension_separator=None, - storage_transformers=None, + storage_transformers=(), ): store_version = getattr(store, '_store_version', 2) From 41eaafb3ffe2c6879bc7ed943ea490c61a84fe96 Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Mon, 1 Aug 2022 10:17:17 +0200 Subject: [PATCH 09/22] make mypy happy --- zarr/_storage/store.py | 53 ++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py index ff35c4fbd2..5178e1c34e 100644 --- a/zarr/_storage/store.py +++ b/zarr/_storage/store.py @@ -397,74 +397,81 @@ def from_config(cls, _type, config): # keyword arguments without any special decoding return cls(_type, **config) + @property + def inner_store(self) -> Union["StorageTransformer", StoreV3]: + assert self._inner_store is not None, ( + "inner_store is not initialized, first get a copy via _copy_for_array." + ) + return self._inner_store + def is_readable(self): - return self._inner_store.is_readable() + return self.inner_store.is_readable() def is_writeable(self): - return self._inner_store.is_writeable() + return self.inner_store.is_writeable() def is_listable(self): - return self._inner_store.is_listable() + return self.inner_store.is_listable() def is_erasable(self): - return self._inner_store.is_erasable() + return self.inner_store.is_erasable() def __enter__(self): - return self._inner_store.__enter__() + return self.inner_store.__enter__() def __exit__(self, exc_type, exc_value, traceback): - return self._inner_store.__exit__(exc_type, exc_value, traceback) + return self.inner_store.__exit__(exc_type, exc_value, traceback) def close(self) -> None: - return self._inner_store.close() + return self.inner_store.close() def rename(self, src_path: str, dst_path: str) -> None: - return self._inner_store.rename(src_path, dst_path) + return self.inner_store.rename(src_path, dst_path) def list_prefix(self, prefix): - return self._inner_store.list_prefix(prefix) + return self.inner_store.list_prefix(prefix) def erase(self, key): - return self._inner_store.erase(key) + return self.inner_store.erase(key) def erase_prefix(self, prefix): - return self._inner_store.erase_prefix(prefix) + return self.inner_store.erase_prefix(prefix) def list_dir(self, prefix): - return self._inner_store.list_dir(prefix) + return self.inner_store.list_dir(prefix) def list(self): - return self._inner_store.list() + return self.inner_store.list() def __contains__(self, key): - return self._inner_store.__contains__(key) + return self.inner_store.__contains__(key) def __setitem__(self, key, value): - return self._inner_store.__setitem__(key, value) + return self.inner_store.__setitem__(key, value) def __getitem__(self, key): - return self._inner_store.__getitem__(key) + return self.inner_store.__getitem__(key) def __delitem__(self, key): - return self._inner_store.__delitem__(key) + return self.inner_store.__delitem__(key) def __iter__(self): - return self._inner_store.__iter__() + return self.inner_store.__iter__() def __len__(self): - return self._inner_store.__len__() + return self.inner_store.__len__() def get_partial_values(self, key_ranges): - return self._inner_store.get_partial_values(key_ranges) + return self.inner_store.get_partial_values(key_ranges) def set_partial_values(self, key_start_values): - return self._inner_store.set_partial_values(key_start_values) + return self.inner_store.set_partial_values(key_start_values) def clear(self): - return self._inner_store.clear() + return self.inner_store.clear() def __eq__(self, other): - return self._inner_store.__eq__(other) + return self.inner_store.__eq__(other) # allow MutableMapping for backwards compatibility From 5d7be7671a179b81bb51979b110ce22f546d456e Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Mon, 1 Aug 2022 14:04:09 +0200 Subject: [PATCH 10/22] better coverage, minor fix, adding rmdir --- zarr/_storage/store.py | 26 ++++++++++++++++++++------ zarr/meta.py | 4 ++-- zarr/tests/test_core.py | 2 +- zarr/tests/test_storage_v3.py | 18 +++++++++++++++++- 4 files changed, 40 insertions(+), 10 deletions(-) diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py index 5178e1c34e..8ffd895034 100644 --- a/zarr/_storage/store.py +++ b/zarr/_storage/store.py @@ -256,6 +256,10 @@ def __setitem__(self, key, value): def __getitem__(self, key): """Get a value.""" + @abc.abstractmethod + def rmdir(self, path=None): + pass + def get_partial_values(self, key_ranges): """Get multiple partial values. key_ranges can be an iterable of key, range pairs, @@ -283,13 +287,16 @@ def set_partial_values(self, key_start_values): A key may occur multiple times with different starts and non-overlapping values. Also, start may only be beyond the current value if other values fill the gap.""" unique_keys = set(next(zip(*key_start_values))) - values = {key: bytearray(self.get(key)) for key in unique_keys} + values = {} + for key in unique_keys: + old_value = self.get(key) + values[key] = None if old_value is None else bytearray(old_value) for key, start, value in key_start_values: if values[key] is None: assert start == 0 values[key] = value else: - if start > len(values[key]): + if start > len(values[key]): # pragma: no cover raise ValueError( f"Cannot set value at start {start}, " + f"since it is beyond the data at key {key}, " @@ -356,7 +363,7 @@ class StorageTransformer(MutableMapping, abc.ABC): _metadata_class = Metadata3 def __init__(self, _type) -> None: - if _type not in self.valid_types: + if _type not in self.valid_types: # pragma: no cover raise ValueError( f"Storage transformer cannot be initialized with type {_type}, " + f"must be one of {list(self.valid_types)}." @@ -371,11 +378,11 @@ def _copy_for_array(self, inner_store): @abc.abstractproperty def extension_uri(self): - pass + pass # pragma: no cover @abc.abstractproperty def valid_types(self): - pass + pass # pragma: no cover def get_config(self): """Return a dictionary holding configuration parameters for this @@ -437,6 +444,9 @@ def erase(self, key): def erase_prefix(self, prefix): return self.inner_store.erase_prefix(prefix) + def rmdir(self, path=None): + return self.inner_store.rmdir(path) + def list_dir(self, prefix): return self.inner_store.list_dir(prefix) @@ -471,7 +481,11 @@ def clear(self): return self.inner_store.clear() def __eq__(self, other): - return self.inner_store.__eq__(other) + return ( + type(self) == type(other) and + self._inner_store == other._inner_store and + self.get_config() == other.get_config() + ) # allow MutableMapping for backwards compatibility diff --git a/zarr/meta.py b/zarr/meta.py index 653e74937b..6cd0f6b5ac 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -11,7 +11,7 @@ from typing import cast, Union, Any, List, Mapping as MappingType, Optional, TYPE_CHECKING -if TYPE_CHECKING: +if TYPE_CHECKING: # pragma: no cover from zarr._storage.store import StorageTransformer @@ -488,7 +488,7 @@ def _decode_storage_transformer_metadata(cls, meta: Mapping) -> "StorageTransfor for StorageTransformerCls in KNOWN_STORAGE_TRANSFORMERS: if StorageTransformerCls.extension_uri == extension_uri: break - else: + else: # pragma: no cover raise NotImplementedError return StorageTransformerCls.from_config(transformer_type, conf) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index bccc192d66..3c60ea2b98 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -3363,7 +3363,7 @@ def create_array(array_path='arr1', read_only=False, **kwargs): "dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT ) init_array(store, path=array_path, storage_transformers=[dummy_storage_transformer], - **kwargs) + chunk_store=store, **kwargs) return Array(store, path=array_path, read_only=read_only, cache_metadata=cache_metadata, cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) diff --git a/zarr/tests/test_storage_v3.py b/zarr/tests/test_storage_v3.py index ad2775ad64..c91fc7e523 100644 --- a/zarr/tests/test_storage_v3.py +++ b/zarr/tests/test_storage_v3.py @@ -277,13 +277,19 @@ def test_get_partial_values(self): def test_set_partial_values(self): store = self.create_store() store[data_root + 'foo'] = b'abcdefg' - store[data_root + 'baz'] = b'z' store.set_partial_values( [ (data_root + 'foo', 0, b'hey') ] ) assert store[data_root + 'foo'] == b'heydefg' + + store.set_partial_values( + [ + (data_root + 'baz', 0, b'z') + ] + ) + assert store[data_root + 'baz'] == b'z' store.set_partial_values( [ (data_root + 'foo', 1, b'oo'), @@ -503,6 +509,16 @@ def create_store(self, **kwargs): return store +class TestStorageTransformerV3(TestMappingStoreV3): + + def create_store(self, **kwargs): + inner_store = super().create_store(**kwargs) + storage_transformer = DummyStorageTransfomer( + "dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT + ) + return storage_transformer._copy_for_array(inner_store) + + class TestLRUStoreCacheV3(_TestLRUStoreCache, StoreV3Tests): CountingClass = CountingDictV3 From 46229ad8123e95e70d3e3a7008904d8ccac2606f Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Mon, 1 Aug 2022 14:17:11 +0200 Subject: [PATCH 11/22] add missing rmdir to test --- zarr/tests/test_core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 3c60ea2b98..0ee8a3defb 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -52,6 +52,7 @@ KVStoreV3, LMDBStoreV3, LRUStoreCacheV3, + RmdirV3, SQLiteStoreV3, StoreV3, ) @@ -3088,7 +3089,7 @@ def test_nbytes_stored(self): # Note: this custom mapping doesn't actually have all methods in the # v3 spec (e.g. erase), but they aren't needed here. -class CustomMappingV3(StoreV3): +class CustomMappingV3(RmdirV3, StoreV3): def __init__(self): self.inner = KVStoreV3(dict()) From 3a9f7ccfd08dc68c2f2d148e44d1ccffd4b840ad Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Tue, 2 Aug 2022 23:00:32 +0200 Subject: [PATCH 12/22] increase coverage --- zarr/_storage/store.py | 3 ++- zarr/tests/test_storage_v3.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py index 8ffd895034..6e4076d23c 100644 --- a/zarr/_storage/store.py +++ b/zarr/_storage/store.py @@ -258,7 +258,8 @@ def __getitem__(self, key): @abc.abstractmethod def rmdir(self, path=None): - pass + """Remove a data path and all its subkeys and related metadata. + Expects a path without the data or meta root prefix.""" def get_partial_values(self, key_ranges): """Get multiple partial values. diff --git a/zarr/tests/test_storage_v3.py b/zarr/tests/test_storage_v3.py index c91fc7e523..a6fef788db 100644 --- a/zarr/tests/test_storage_v3.py +++ b/zarr/tests/test_storage_v3.py @@ -518,6 +518,21 @@ def create_store(self, **kwargs): ) return storage_transformer._copy_for_array(inner_store) + def test_method_forwarding(self): + store = self.create_store() + assert store.list() == store.inner_store.list() + assert store.list_dir(data_root) == store.inner_store.list_dir(data_root) + + assert store.is_readable() + assert store.is_writeable() + assert store.is_listable() + store.inner_store._readable = False + store.inner_store._writeable = False + store.inner_store._listable = False + assert not store.is_readable() + assert not store.is_writeable() + assert not store.is_listable() + class TestLRUStoreCacheV3(_TestLRUStoreCache, StoreV3Tests): From efa4e07cff4c2e9ef0bd1e4656ec80739167949e Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Wed, 3 Aug 2022 09:34:59 +0200 Subject: [PATCH 13/22] improve test coverage --- zarr/tests/test_core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 0ee8a3defb..c28a8b467a 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -3356,6 +3356,7 @@ class TestArrayWithStorageTransformersV3(TestArrayWithPathV3): @staticmethod def create_array(array_path='arr1', read_only=False, **kwargs): store = KVStoreV3(dict()) + chunk_store = KVStoreV3(dict()) kwargs.setdefault('compressor', Zlib(level=1)) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) @@ -3364,10 +3365,10 @@ def create_array(array_path='arr1', read_only=False, **kwargs): "dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT ) init_array(store, path=array_path, storage_transformers=[dummy_storage_transformer], - chunk_store=store, **kwargs) + chunk_store=chunk_store, **kwargs) return Array(store, path=array_path, read_only=read_only, cache_metadata=cache_metadata, cache_attrs=cache_attrs, - write_empty_chunks=write_empty_chunks) + write_empty_chunks=write_empty_chunks, chunk_store=chunk_store) def expected(self): return [ From b4668a86d58f6dadfe8899902788e4447148b4d2 Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Wed, 3 Aug 2022 09:49:15 +0200 Subject: [PATCH 14/22] fix TestArrayWithStorageTransformersV3 --- zarr/tests/test_core.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index c28a8b467a..3f6ceb9359 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -3351,32 +3351,32 @@ def expected(self): @pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") -class TestArrayWithStorageTransformersV3(TestArrayWithPathV3): +class TestArrayWithStorageTransformersV3(TestArrayWithChunkStoreV3): @staticmethod def create_array(array_path='arr1', read_only=False, **kwargs): store = KVStoreV3(dict()) + # separate chunk store chunk_store = KVStoreV3(dict()) - kwargs.setdefault('compressor', Zlib(level=1)) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) write_empty_chunks = kwargs.pop('write_empty_chunks', True) dummy_storage_transformer = DummyStorageTransfomer( "dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT ) - init_array(store, path=array_path, storage_transformers=[dummy_storage_transformer], - chunk_store=chunk_store, **kwargs) + init_array(store, path=array_path, chunk_store=chunk_store, + storage_transformers=[dummy_storage_transformer], **kwargs) return Array(store, path=array_path, read_only=read_only, - cache_metadata=cache_metadata, cache_attrs=cache_attrs, - write_empty_chunks=write_empty_chunks, chunk_store=chunk_store) + chunk_store=chunk_store, cache_metadata=cache_metadata, + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) def expected(self): return [ - "0bc73a90578b908bfe8d5b90aaf79511cc0a5f18", - "ae4ce0caa648d312e9cbe09bc35a3d197945f648", - "c3a018158668c18a615e38f32b1ea3ce248f4d1f", - "aaa1558d072f3d7fc30959992dbd9923458c25ba", - "9587eb0d9662b6b6c1e1fa4a623b5facc1110e5f", + "3fb9a4f8233b09ad02067b6b7fc9fd5caa405c7d", + "89c8eb364beb84919fc9153d2c1ed2696274ec18", + "73307055c3aec095dd1232c38d793ef82a06bd97", + "6152c09255a5efa43b1a115546e35affa00c138c", + "2f8802fc391f67f713302e84fad4fd8f1366d6c2", ] From e454046f8b4f72f6f6782488a8ebb55bc8c12ec2 Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Mon, 8 Aug 2022 10:41:08 +0200 Subject: [PATCH 15/22] Update zarr/creation.py Co-authored-by: Gregory Lee --- zarr/creation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/zarr/creation.py b/zarr/creation.py index 0aa71c4137..271512c4cc 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -89,7 +89,6 @@ def create(shape, chunks=True, dtype=None, compressor='default', of data coming from the underlying store. The transformers are applied in the order of the given sequence. Supplying an empty sequence is the same as omitting the argument or setting it to None. May only be set when using zarr_version 3. - Supplying an empty seq .. versionadded:: 2.13 From 696d5ca405c005861fad0142d304204b39b247e3 Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Mon, 22 Aug 2022 15:10:32 +0200 Subject: [PATCH 16/22] pick generic storage transformer changes from #1111 --- zarr/_storage/store.py | 94 +++++++++++++++++++++++++---------- zarr/core.py | 25 +++++----- zarr/tests/test_creation.py | 4 +- zarr/tests/test_storage_v3.py | 16 +++++- 4 files changed, 97 insertions(+), 42 deletions(-) diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py index 6e4076d23c..a55de9c799 100644 --- a/zarr/_storage/store.py +++ b/zarr/_storage/store.py @@ -261,19 +261,28 @@ def rmdir(self, path=None): """Remove a data path and all its subkeys and related metadata. Expects a path without the data or meta root prefix.""" + def supports_efficient_get_partial_values(self): + return False + def get_partial_values(self, key_ranges): """Get multiple partial values. key_ranges can be an iterable of key, range pairs, where a range specifies two integers range_start and range_length as a tuple, (range_start, range_length). - Length may be None to indicate to read until the end. - A key may occur multiple times with different ranges.""" + range_length may be None to indicate to read until the end. + range_start may be negative to start reading range_start bytes + from the end of the file. + A key may occur multiple times with different ranges. + Inserts None for missing keys into the returned list.""" results = [None] * len(key_ranges) indexed_ranges_by_key = defaultdict(list) for i, (key, range_) in enumerate(key_ranges): indexed_ranges_by_key[key].append((i, range_)) for key, indexed_ranges in indexed_ranges_by_key.items(): - value = self[key] + try: + value = self[key] + except KeyError: + continue for i, (range_from, range_length) in indexed_ranges: if range_length is None: results[i] = value[range_from:] @@ -281,12 +290,17 @@ def get_partial_values(self, key_ranges): results[i] = value[range_from:range_from + range_length] return results + def supports_efficient_set_partial_values(self): + return False + def set_partial_values(self, key_start_values): """Set multiple partial values. key_start_values can be an iterable of key, start and value triplets as tuples, (key, start, value), where start defines the offset in bytes. A key may occur multiple times with different starts and non-overlapping values. - Also, start may only be beyond the current value if other values fill the gap.""" + Also, start may only be beyond the current value if other values fill the gap. + start may be negative to start writing start bytes from the current + end of the file, ending the file with the new value.""" unique_keys = set(next(zip(*key_start_values))) values = {} for key in unique_keys: @@ -303,7 +317,10 @@ def set_partial_values(self, key_start_values): + f"since it is beyond the data at key {key}, " + f"having length {len(values[key])}." ) - values[key][start:start + len(value)] = value + if start < 0: + values[key][start:] = value + else: + values[key][start:start + len(value)] = value for key, value in values.items(): self[key] = value @@ -372,7 +389,7 @@ def __init__(self, _type) -> None: self.type = _type self._inner_store = None - def _copy_for_array(self, inner_store): + def _copy_for_array(self, array, inner_store): transformer_copy = copy(self) transformer_copy._inner_store = inner_store return transformer_copy @@ -412,6 +429,40 @@ def inner_store(self) -> Union["StorageTransformer", StoreV3]: ) return self._inner_store + # The following implementations are usually fine to keep as-is: + + def __eq__(self, other): + return ( + type(self) == type(other) and + self._inner_store == other._inner_store and + self.get_config() == other.get_config() + ) + + def erase(self, key): + self.__delitem__(key) + + def list(self): + return list(self.keys()) + + def list_dir(self, prefix): + """ + TODO: carefully test this with trailing/leading slashes + """ + if prefix: # allow prefix = "" ? + assert prefix.endswith("/") + + all_keys = self.list_prefix(prefix) + len_prefix = len(prefix) + keys = [] + prefixes = [] + for k in all_keys: + trail = k[len_prefix:] + if "/" not in trail: + keys.append(prefix + trail) + else: + prefixes.append(prefix + trail.split("/", maxsplit=1)[0] + "/") + return keys, list(set(prefixes)) + def is_readable(self): return self.inner_store.is_readable() @@ -424,6 +475,9 @@ def is_listable(self): def is_erasable(self): return self.inner_store.is_erasable() + def clear(self): + return self.inner_store.clear() + def __enter__(self): return self.inner_store.__enter__() @@ -433,27 +487,21 @@ def __exit__(self, exc_type, exc_value, traceback): def close(self) -> None: return self.inner_store.close() + # The following implementations might need to be re-implemented + # by subclasses implementing storage transformers: + def rename(self, src_path: str, dst_path: str) -> None: return self.inner_store.rename(src_path, dst_path) def list_prefix(self, prefix): return self.inner_store.list_prefix(prefix) - def erase(self, key): - return self.inner_store.erase(key) - def erase_prefix(self, prefix): return self.inner_store.erase_prefix(prefix) def rmdir(self, path=None): return self.inner_store.rmdir(path) - def list_dir(self, prefix): - return self.inner_store.list_dir(prefix) - - def list(self): - return self.inner_store.list() - def __contains__(self, key): return self.inner_store.__contains__(key) @@ -472,22 +520,18 @@ def __iter__(self): def __len__(self): return self.inner_store.__len__() + def supports_efficient_get_partial_values(self): + return self.inner_store.supports_efficient_get_partial_values() + def get_partial_values(self, key_ranges): return self.inner_store.get_partial_values(key_ranges) + def supports_efficient_set_partial_values(self): + return self.inner_store.supports_efficient_set_partial_values() + def set_partial_values(self, key_start_values): return self.inner_store.set_partial_values(key_start_values) - def clear(self): - return self.inner_store.clear() - - def __eq__(self, other): - return ( - type(self) == type(other) and - self._inner_store == other._inner_store and - self.get_config() == other.get_config() - ) - # allow MutableMapping for backwards compatibility StoreLike = Union[BaseStore, MutableMapping] diff --git a/zarr/core.py b/zarr/core.py index 4faa19727b..3acf9c9ea8 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -180,6 +180,7 @@ def __init__( self._store = store self._chunk_store = chunk_store + self._transformed_chunk_store = None self._path = normalize_storage_path(path) if self._path: self._key_prefix = self._path + '/' @@ -282,17 +283,13 @@ def _load_metadata_nosync(self): if self._version == 3: storage_transformers = meta.get('storage_transformers', []) - transformed_store = self._store - for storage_transformer in storage_transformers: - transformed_store = storage_transformer._copy_for_array(transformed_store) - self._store = transformed_store - if self._chunk_store is not None: - transformed_chunk_store = self._chunk_store + if storage_transformers: + transformed_store = self._chunk_store or self._store for storage_transformer in storage_transformers: - transformed_chunk_store = ( - storage_transformer._copy_for_array(transformed_chunk_store) + transformed_store = storage_transformer._copy_for_array( + self, transformed_store ) - self._chunk_store = transformed_chunk_store + self._transformed_chunk_store = transformed_store def _refresh_metadata(self): if not self._cache_metadata: @@ -373,10 +370,12 @@ def read_only(self, value): @property def chunk_store(self): """A MutableMapping providing the underlying storage for array chunks.""" - if self._chunk_store is None: - return self._store - else: + if self._transformed_chunk_store is not None: + return self._transformed_chunk_store + elif self._chunk_store is not None: return self._chunk_store + else: + return self._store @property def shape(self): @@ -1790,7 +1789,7 @@ def _set_selection(self, indexer, value, fields=None): check_array_shape('value', value, sel_shape) # iterate over chunks in range - if not hasattr(self.store, "setitems") or self._synchronizer is not None \ + if not hasattr(self.chunk_store, "setitems") or self._synchronizer is not None \ or any(map(lambda x: x == 0, self.shape)): # iterative approach for chunk_coords, chunk_selection, out_selection in indexer: diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index c289fbc639..fea146d832 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -734,5 +734,5 @@ def test_create_with_storage_transformers(): test_value=DummyStorageTransfomer.TEST_CONSTANT ) z = create(1000000000, chunks=True, storage_transformers=[transformer], **kwargs) - assert isinstance(z._store, DummyStorageTransfomer) - assert z._store.test_value == DummyStorageTransfomer.TEST_CONSTANT + assert isinstance(z.chunk_store, DummyStorageTransfomer) + assert z.chunk_store.test_value == DummyStorageTransfomer.TEST_CONSTANT diff --git a/zarr/tests/test_storage_v3.py b/zarr/tests/test_storage_v3.py index a6fef788db..3666c01594 100644 --- a/zarr/tests/test_storage_v3.py +++ b/zarr/tests/test_storage_v3.py @@ -263,7 +263,9 @@ def test_get_partial_values(self): (data_root + 'foo', (0, 1)) ] ) - assert [b'd', b'b', b'z', b'abc', b'defg', b'defg'] == store.get_partial_values( + assert [ + b'd', b'b', b'z', b'abc', b'defg', b'defg', b'g', b'ef' + ] == store.get_partial_values( [ (data_root + 'foo', (3, 1)), (data_root + 'foo', (1, 1)), @@ -271,6 +273,8 @@ def test_get_partial_values(self): (data_root + 'foo', (0, 3)), (data_root + 'foo', (3, 4)), (data_root + 'foo', (3, None)), + (data_root + 'foo', (-1, None)), + (data_root + 'foo', (-3, 2)), ] ) @@ -300,6 +304,14 @@ def test_set_partial_values(self): ) assert store[data_root + 'foo'] == b'hoodefdone' assert store[data_root + 'baz'] == b'zzzzaaaa' + store.set_partial_values( + [ + (data_root + 'foo', -2, b'NE'), + (data_root + 'baz', -5, b'q'), + ] + ) + assert store[data_root + 'foo'] == b'hoodefdoNE' + assert store[data_root + 'baz'] == b'zzzq' class TestMappingStoreV3(StoreV3Tests): @@ -516,7 +528,7 @@ def create_store(self, **kwargs): storage_transformer = DummyStorageTransfomer( "dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT ) - return storage_transformer._copy_for_array(inner_store) + return storage_transformer._copy_for_array(None, inner_store) def test_method_forwarding(self): store = self.create_store() From c099440fbea496e656d50ecd4c959c867de3274e Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Mon, 22 Aug 2022 17:32:08 +0200 Subject: [PATCH 17/22] increase coverage --- zarr/_storage/store.py | 20 ++------------------ zarr/tests/test_storage_v3.py | 2 ++ 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py index a55de9c799..67e95393ad 100644 --- a/zarr/_storage/store.py +++ b/zarr/_storage/store.py @@ -281,7 +281,7 @@ def get_partial_values(self, key_ranges): for key, indexed_ranges in indexed_ranges_by_key.items(): try: value = self[key] - except KeyError: + except KeyError: # pragma: no cover continue for i, (range_from, range_length) in indexed_ranges: if range_length is None: @@ -445,23 +445,7 @@ def list(self): return list(self.keys()) def list_dir(self, prefix): - """ - TODO: carefully test this with trailing/leading slashes - """ - if prefix: # allow prefix = "" ? - assert prefix.endswith("/") - - all_keys = self.list_prefix(prefix) - len_prefix = len(prefix) - keys = [] - prefixes = [] - for k in all_keys: - trail = k[len_prefix:] - if "/" not in trail: - keys.append(prefix + trail) - else: - prefixes.append(prefix + trail.split("/", maxsplit=1)[0] + "/") - return keys, list(set(prefixes)) + return StoreV3.list_dir(self, prefix) def is_readable(self): return self.inner_store.is_readable() diff --git a/zarr/tests/test_storage_v3.py b/zarr/tests/test_storage_v3.py index 3666c01594..8d077ef129 100644 --- a/zarr/tests/test_storage_v3.py +++ b/zarr/tests/test_storage_v3.py @@ -256,6 +256,7 @@ def test_rename_nonexisting(self): def test_get_partial_values(self): store = self.create_store() + store.supports_efficient_get_partial_values() store[data_root + 'foo'] = b'abcdefg' store[data_root + 'baz'] = b'z' assert [b'a'] == store.get_partial_values( @@ -280,6 +281,7 @@ def test_get_partial_values(self): def test_set_partial_values(self): store = self.create_store() + store.supports_efficient_set_partial_values() store[data_root + 'foo'] = b'abcdefg' store.set_partial_values( [ From be98c0138505e90ff8e5682d837d8772ee0b1186 Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Wed, 24 Aug 2022 15:26:10 +0200 Subject: [PATCH 18/22] fix order of storage transformers --- zarr/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/core.py b/zarr/core.py index 3acf9c9ea8..5781d41317 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -285,7 +285,7 @@ def _load_metadata_nosync(self): storage_transformers = meta.get('storage_transformers', []) if storage_transformers: transformed_store = self._chunk_store or self._store - for storage_transformer in storage_transformers: + for storage_transformer in storage_transformers[::-1]: transformed_store = storage_transformer._copy_for_array( self, transformed_store ) From 7c2767af26b6cb1f46fd611d647cd55ff8e88566 Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Thu, 25 Aug 2022 13:55:49 +0200 Subject: [PATCH 19/22] retrigger CI From 59cca8bb3292aa01140fcc9e6a992359896208ee Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Mon, 12 Dec 2022 13:24:17 +0100 Subject: [PATCH 20/22] minor fixes --- zarr/core.py | 3 ++- zarr/tests/test_creation.py | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 1baf19cd56..64953060fe 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -2242,7 +2242,8 @@ def _encode_chunk(self, chunk): cdata = chunk # ensure in-memory data is immutable and easy to compare - if isinstance(self.chunk_store, KVStore): + if (isinstance(self.chunk_store, KVStore) + or isinstance(self._chunk_store, KVStore)): cdata = ensure_bytes(cdata) return cdata diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index 763925cc33..c012bca461 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -738,8 +738,10 @@ def test_json_dumps_chunks_numpy_dtype(): assert np.all(z[...] == 0) -def test_create_with_storage_transformers(): - kwargs = _init_creation_kwargs(zarr_version=3) +@pytest.mark.skipif(not v3_api_available, reason="V3 is disabled") +@pytest.mark.parametrize('at_root', [False, True]) +def test_create_with_storage_transformers(at_root): + kwargs = _init_creation_kwargs(zarr_version=3, at_root=at_root) transformer = DummyStorageTransfomer( "dummy_type", test_value=DummyStorageTransfomer.TEST_CONSTANT From c2dc0d6967a027b1b7797ad43fe9a806d2c5cc77 Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Mon, 12 Dec 2022 13:27:06 +0100 Subject: [PATCH 21/22] make flake8 happy --- zarr/core.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/zarr/core.py b/zarr/core.py index 64953060fe..5d37570831 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -2242,8 +2242,10 @@ def _encode_chunk(self, chunk): cdata = chunk # ensure in-memory data is immutable and easy to compare - if (isinstance(self.chunk_store, KVStore) - or isinstance(self._chunk_store, KVStore)): + if ( + isinstance(self.chunk_store, KVStore) + or isinstance(self._chunk_store, KVStore) + ): cdata = ensure_bytes(cdata) return cdata From 91f0c2c34c96f7baead79fde373aeec2cc2ce37d Mon Sep 17 00:00:00 2001 From: Jonathan Striebel Date: Thu, 22 Dec 2022 11:52:35 +0100 Subject: [PATCH 22/22] apply PR feedback --- zarr/_storage/store.py | 19 ++++++++++++++----- zarr/tests/test_storage_v3.py | 2 +- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py index d1ae487070..4d813b8e05 100644 --- a/zarr/_storage/store.py +++ b/zarr/_storage/store.py @@ -4,7 +4,7 @@ from collections.abc import MutableMapping from copy import copy from string import ascii_letters, digits -from typing import Any, List, Mapping, Optional, Union +from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Union from zarr.meta import Metadata2, Metadata3 from zarr.util import normalize_storage_path @@ -261,10 +261,14 @@ def rmdir(self, path=None): """Remove a data path and all its subkeys and related metadata. Expects a path without the data or meta root prefix.""" + @property def supports_efficient_get_partial_values(self): return False - def get_partial_values(self, key_ranges): + def get_partial_values( + self, + key_ranges: Sequence[Tuple[str, Tuple[int, Optional[int]]]] + ) -> List[Union[bytes, memoryview, bytearray]]: """Get multiple partial values. key_ranges can be an iterable of key, range pairs, where a range specifies two integers range_start and range_length @@ -274,8 +278,12 @@ def get_partial_values(self, key_ranges): from the end of the file. A key may occur multiple times with different ranges. Inserts None for missing keys into the returned list.""" - results = [None] * len(key_ranges) - indexed_ranges_by_key = defaultdict(list) + results: List[Union[bytes, memoryview, bytearray]] = ( + [None] * len(key_ranges) # type: ignore[list-item] + ) + indexed_ranges_by_key: Dict[str, List[Tuple[int, Tuple[int, Optional[int]]]]] = ( + defaultdict(list) + ) for i, (key, range_) in enumerate(key_ranges): indexed_ranges_by_key[key].append((i, range_)) for key, indexed_ranges in indexed_ranges_by_key.items(): @@ -504,8 +512,9 @@ def __iter__(self): def __len__(self): return self.inner_store.__len__() + @property def supports_efficient_get_partial_values(self): - return self.inner_store.supports_efficient_get_partial_values() + return self.inner_store.supports_efficient_get_partial_values def get_partial_values(self, key_ranges): return self.inner_store.get_partial_values(key_ranges) diff --git a/zarr/tests/test_storage_v3.py b/zarr/tests/test_storage_v3.py index 2790fbd13c..b6e59c9deb 100644 --- a/zarr/tests/test_storage_v3.py +++ b/zarr/tests/test_storage_v3.py @@ -256,7 +256,7 @@ def test_rename_nonexisting(self): def test_get_partial_values(self): store = self.create_store() - store.supports_efficient_get_partial_values() + store.supports_efficient_get_partial_values in [True, False] store[data_root + 'foo'] = b'abcdefg' store[data_root + 'baz'] = b'z' assert [b'a'] == store.get_partial_values(