-
-
Notifications
You must be signed in to change notification settings - Fork 262
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[REVIEW] Support of alternative array classes #934
Changes from 9 commits
590ace6
254bbae
40d084e
947152a
a60b2f4
8da9f17
0bf1cf0
9795f77
bf03fd7
d78ce33
c0402e2
39ffef5
86a1ec6
dc2be53
bb5538f
cb0c02f
745b612
9976f06
be9099d
082f299
3dd64dd
4c921e6
4ce89b2
2127ffa
2abc111
cc85f91
eb7f650
1706cad
a7857d6
b3fc488
2e2b022
dabf502
18c4c6b
2efa2fe
440753c
0dc8f93
fdcf949
d40593b
149d511
3ed7a7e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,9 +7,7 @@ | |
from functools import reduce | ||
|
||
import numpy as np | ||
from numcodecs.compat import ensure_bytes, ensure_ndarray | ||
|
||
from collections.abc import MutableMapping | ||
from numcodecs.compat import ensure_bytes | ||
|
||
from zarr.attrs import Attributes | ||
from zarr.codecs import AsType, get_codec | ||
|
@@ -31,7 +29,7 @@ | |
is_scalar, | ||
pop_fields, | ||
) | ||
from zarr.storage import array_meta_key, attrs_key, getsize, listdir, BaseStore | ||
from zarr.storage import KVStore, array_meta_key, attrs_key, getsize, listdir, BaseStore | ||
from zarr.util import ( | ||
all_equal, | ||
InfoReporter, | ||
|
@@ -44,6 +42,7 @@ | |
normalize_shape, | ||
normalize_storage_path, | ||
PartialReadBuffer, | ||
ensure_ndarray | ||
) | ||
|
||
|
||
|
@@ -91,6 +90,12 @@ class Array: | |
|
||
.. versionadded:: 2.11 | ||
|
||
meta_array : array-like, optional | ||
An array instance to use for determining arrays to create and return | ||
to users. Use `numpy.empty(())` by default. | ||
|
||
.. versionadded:: 2.12 | ||
|
||
|
||
Attributes | ||
---------- | ||
|
@@ -122,6 +127,7 @@ class Array: | |
vindex | ||
oindex | ||
write_empty_chunks | ||
meta_array | ||
|
||
Methods | ||
------- | ||
|
@@ -155,6 +161,7 @@ def __init__( | |
cache_attrs=True, | ||
partial_decompress=False, | ||
write_empty_chunks=True, | ||
meta_array=None, | ||
): | ||
# N.B., expect at this point store is fully initialized with all | ||
# configuration metadata fully specified and normalized | ||
|
@@ -175,6 +182,11 @@ def __init__( | |
self._is_view = False | ||
self._partial_decompress = partial_decompress | ||
self._write_empty_chunks = write_empty_chunks | ||
self._meta_array = meta_array | ||
madsbk marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if meta_array is not None: | ||
self._meta_array = np.empty_like(meta_array) | ||
madsbk marked this conversation as resolved.
Show resolved
Hide resolved
|
||
else: | ||
self._meta_array = np.empty(()) | ||
|
||
# initialize metadata | ||
self._load_metadata() | ||
|
@@ -487,6 +499,13 @@ def write_empty_chunks(self) -> bool: | |
""" | ||
return self._write_empty_chunks | ||
|
||
@property | ||
def meta_array(self): | ||
"""An array-like instance to use for determining arrays to create and return | ||
to users. | ||
""" | ||
return self._meta_array | ||
|
||
def __eq__(self, other): | ||
return ( | ||
isinstance(other, Array) and | ||
|
@@ -861,7 +880,7 @@ def _get_basic_selection_zd(self, selection, out=None, fields=None): | |
|
||
except KeyError: | ||
# chunk not initialized | ||
chunk = np.zeros((), dtype=self._dtype) | ||
chunk = np.zeros_like(self._meta_array, shape=(), dtype=self._dtype) | ||
if self._fill_value is not None: | ||
chunk.fill(self._fill_value) | ||
|
||
|
@@ -1165,7 +1184,8 @@ def _get_selection(self, indexer, out=None, fields=None): | |
|
||
# setup output array | ||
if out is None: | ||
out = np.empty(out_shape, dtype=out_dtype, order=self._order) | ||
out = np.empty_like(self._meta_array, shape=out_shape, | ||
dtype=out_dtype, order=self._order) | ||
else: | ||
check_array_shape('out', out, out_shape) | ||
|
||
|
@@ -1539,9 +1559,13 @@ def set_coordinate_selection(self, selection, value, fields=None): | |
# setup indexer | ||
indexer = CoordinateIndexer(selection, self) | ||
|
||
# handle value - need to flatten | ||
# handle value - need ndarray-like flatten value | ||
if not is_scalar(value, self._dtype): | ||
value = np.asanyarray(value) | ||
try: | ||
value = ensure_ndarray(value) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What happens if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Before, I don't think this make any difference in practice. |
||
except TypeError: | ||
# Handle types like `list` or `tuple` | ||
value = np.array(value, like=self._meta_array) | ||
if hasattr(value, 'shape') and len(value.shape) > 1: | ||
value = value.reshape(-1) | ||
|
||
|
@@ -1644,7 +1668,7 @@ def _set_basic_selection_zd(self, selection, value, fields=None): | |
|
||
except KeyError: | ||
# chunk not initialized | ||
chunk = np.zeros((), dtype=self._dtype) | ||
chunk = np.zeros_like(self._meta_array, shape=(), dtype=self._dtype) | ||
if self._fill_value is not None: | ||
chunk.fill(self._fill_value) | ||
|
||
|
@@ -1704,7 +1728,7 @@ def _set_selection(self, indexer, value, fields=None): | |
pass | ||
else: | ||
if not hasattr(value, 'shape'): | ||
value = np.asanyarray(value) | ||
value = np.array(value, like=self._meta_array) | ||
madsbk marked this conversation as resolved.
Show resolved
Hide resolved
|
||
check_array_shape('value', value, sel_shape) | ||
|
||
# iterate over chunks in range | ||
|
@@ -1772,8 +1796,9 @@ def _process_chunk( | |
self._dtype != object): | ||
|
||
dest = out[out_selection] | ||
dest_is_writable = getattr(dest, "writeable", True) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wouldn't we need to check There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. CuPy array doesn't have a # Assume that array-like objects that doesn't have a
# `writeable` flag is writable.
dest_is_writable = getattr(dest, "writeable", True) |
||
write_direct = ( | ||
dest.flags.writeable and | ||
dest_is_writable and | ||
( | ||
(self._order == 'C' and dest.flags.c_contiguous) or | ||
(self._order == 'F' and dest.flags.f_contiguous) | ||
|
@@ -1800,7 +1825,7 @@ def _process_chunk( | |
if partial_read_decode: | ||
cdata.prepare_chunk() | ||
# size of chunk | ||
tmp = np.empty(self._chunks, dtype=self.dtype) | ||
tmp = np.empty(self._chunks, dtype=self.dtype, like=self._meta_array) | ||
madsbk marked this conversation as resolved.
Show resolved
Hide resolved
|
||
index_selection = PartialChunkIterator(chunk_selection, self.chunks) | ||
for start, nitems, partial_out_selection in index_selection: | ||
expected_shape = [ | ||
|
@@ -2014,7 +2039,9 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): | |
if is_scalar(value, self._dtype): | ||
|
||
# setup array filled with value | ||
chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order) | ||
chunk = np.empty( | ||
self._chunks, dtype=self._dtype, order=self._order, like=self._meta_array | ||
) | ||
madsbk marked this conversation as resolved.
Show resolved
Hide resolved
|
||
chunk.fill(value) | ||
|
||
else: | ||
|
@@ -2034,14 +2061,18 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): | |
|
||
# chunk not initialized | ||
if self._fill_value is not None: | ||
chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order) | ||
chunk = np.empty( | ||
self._chunks, dtype=self._dtype, order=self._order, like=self._meta_array | ||
) | ||
madsbk marked this conversation as resolved.
Show resolved
Hide resolved
|
||
chunk.fill(self._fill_value) | ||
elif self._dtype == object: | ||
chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order) | ||
else: | ||
# N.B., use zeros here so any region beyond the array has consistent | ||
# and compressible data | ||
chunk = np.zeros(self._chunks, dtype=self._dtype, order=self._order) | ||
chunk = np.zeros( | ||
self._chunks, dtype=self._dtype, order=self._order, like=self._meta_array | ||
) | ||
madsbk marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
else: | ||
|
||
|
@@ -2120,7 +2151,7 @@ def _encode_chunk(self, chunk): | |
cdata = chunk | ||
|
||
# ensure in-memory data is immutable and easy to compare | ||
if isinstance(self.chunk_store, MutableMapping): | ||
if isinstance(self.chunk_store, KVStore): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this being changed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since all store classes derives from |
||
cdata = ensure_bytes(cdata) | ||
|
||
return cdata | ||
|
@@ -2371,7 +2402,7 @@ def append(self, data, axis=0): | |
|
||
Parameters | ||
---------- | ||
data : array_like | ||
data : array-like | ||
Data to be appended. | ||
axis : int | ||
Axis along which to append. | ||
|
@@ -2407,7 +2438,7 @@ def _append_nosync(self, data, axis=0): | |
|
||
# ensure data is array-like | ||
if not hasattr(data, 'shape'): | ||
data = np.asanyarray(data) | ||
data = np.array(data, like=self._meta_array) | ||
madsbk marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# ensure shapes are compatible for non-append dimensions | ||
self_shape_preserved = tuple(s for i, s in enumerate(self._shape) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
from numcodecs.abc import Codec | ||
from numcodecs.registry import get_codec, register_codec | ||
|
||
from .util import ensure_contiguous_ndarray | ||
|
||
|
||
class CuPyCPUCompressor(Codec): | ||
"""CPU compressor for CuPy arrays | ||
|
||
This compressor converts CuPy arrays host memory before compressing | ||
the arrays using `compressor`. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point, it would make sense to implement a compressor, However, in order to limit the scope of this PR, I think we should use the existing CPU compressors for now. Then in a follow up PR, we can implement compression directly in the GPU using something like nvCOMP. This will be very important when we introduce stores that can bypass the CPU and access the GPU directly. |
||
|
||
Parameters | ||
---------- | ||
compressor : numcodecs.abc.Codec | ||
The codec to use for compression and decompression. | ||
""" | ||
|
||
codec_id = "cupy_cpu_compressor" | ||
|
||
def __init__(self, compressor: Codec = None): | ||
self.compressor = compressor | ||
|
||
def encode(self, buf): | ||
import cupy | ||
|
||
buf = cupy.asnumpy(ensure_contiguous_ndarray(buf)) | ||
if self.compressor: | ||
buf = self.compressor.encode(buf) | ||
return buf | ||
|
||
def decode(self, chunk, out=None): | ||
import cupy | ||
|
||
if self.compressor: | ||
cpu_out = None if out is None else cupy.asnumpy(out) | ||
chunk = self.compressor.decode(chunk, cpu_out) | ||
|
||
chunk = cupy.asarray(ensure_contiguous_ndarray(chunk)) | ||
if out is not None: | ||
cupy.copyto(out, chunk.view(dtype=out.dtype), casting="no") | ||
chunk = out | ||
return chunk | ||
|
||
def get_config(self): | ||
cc_config = self.compressor.get_config() if self.compressor else None | ||
return { | ||
"id": self.codec_id, | ||
"compressor_config": cc_config, | ||
} | ||
|
||
@classmethod | ||
def from_config(cls, config): | ||
cc_config = config.get("compressor_config", None) | ||
compressor = get_codec(cc_config) if cc_config else None | ||
return cls(compressor=compressor) | ||
|
||
|
||
register_codec(CuPyCPUCompressor) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if this could be picked up from the
Store
itself. That way aStore
could specify its return type (NumPy or otherwise). This would save the user from getting involved as much in the process and hopefully make it easier for them to get started. WDYT?@grlee77 would be good to get your thoughts as well in light of the
BaseStore
work 🙂There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be a good idea to have
Store
define a defaultmeta_array
but in the general case, I think aStore
should be able to handle both NumPy and other types like CuPy arrays simultaneously.