Skip to content

Commit

Permalink
Merge pull request #96 from jakirkham/astype_filter
Browse files Browse the repository at this point in the history
AsType Filter
  • Loading branch information
alimanfoo committed Jan 4, 2017
2 parents 70f7c1d + 328d04c commit eef8940
Show file tree
Hide file tree
Showing 7 changed files with 272 additions and 2 deletions.
1 change: 1 addition & 0 deletions docs/api/codecs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ code of this module for details.
.. autoclass:: BZ2
.. autoclass:: LZMA
.. autoclass:: Delta
.. autoclass:: AsType
.. autoclass:: FixedScaleOffset
.. autoclass:: Quantize
.. autoclass:: PackBits
Expand Down
1 change: 1 addition & 0 deletions docs/api/core.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ The Array class (``zarr.core``)
.. automethod:: resize
.. automethod:: append
.. automethod:: view
.. automethod:: astype
83 changes: 83 additions & 0 deletions zarr/codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,89 @@ def __repr__(self):
codec_registry[Delta.codec_id] = Delta


class AsType(Codec):
    """Filter that casts data between two NumPy data types.

    Parameters
    ----------
    encode_dtype : dtype
        Data type to use for encoded data.
    decode_dtype : dtype
        Data type to use for decoded data.

    Notes
    -----
    If `encode_dtype` is of lower precision than `decode_dtype`, please be
    aware that data loss can occur by writing data to disk using this filter.
    No checks are made to ensure the casting will work in that direction and
    data corruption will occur.

    Examples
    --------
    >>> import zarr
    >>> import numpy as np
    >>> x = np.arange(100, 120, 2, dtype=np.int8)
    >>> x
    array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int8)
    >>> f = zarr.AsType(encode_dtype=x.dtype, decode_dtype=np.int64)
    >>> y = f.decode(x)
    >>> y
    array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118])
    >>> z = f.encode(y)
    >>> z
    array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int8)

    """  # flake8: noqa

    codec_id = 'astype'

    def __init__(self, encode_dtype, decode_dtype):
        # normalise both arguments so strings, types and dtypes are accepted
        self.encode_dtype = np.dtype(encode_dtype)
        self.decode_dtype = np.dtype(decode_dtype)

    def encode(self, buf):
        # interpret the incoming buffer as decoded-type data ...
        decoded = _ndarray_from_buffer(buf, self.decode_dtype)
        # ... and hand back a converted copy in the encoded type
        return decoded.astype(self.encode_dtype)

    def decode(self, buf, out=None):
        # interpret the incoming buffer as encoded-type data
        encoded = _ndarray_from_buffer(buf, self.encode_dtype)
        # convert to the decoded type, then let the helper deal with `out`
        return _buffer_copy(encoded.astype(self.decode_dtype), out)

    def get_config(self):
        # dtypes are stored via their `.str` form
        return {
            'id': self.codec_id,
            'encode_dtype': self.encode_dtype.str,
            'decode_dtype': self.decode_dtype.str,
        }

    def __repr__(self):
        template = '%s(encode_dtype=%s, decode_dtype=%s)'
        fields = (type(self).__name__, self.encode_dtype, self.decode_dtype)
        return template % fields


# register the AsType class in the codec registry under its codec id
codec_registry[AsType.codec_id] = AsType


class FixedScaleOffset(Codec):
"""Simplified version of the scale-offset filter available in HDF5.
Applies the transformation `(x - offset) * scale` to all chunks. Results
Expand Down
63 changes: 62 additions & 1 deletion zarr/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from zarr.attrs import Attributes
from zarr.errors import PermissionError, err_read_only, err_array_not_found
from zarr.compat import reduce
from zarr.codecs import get_codec
from zarr.codecs import AsType, get_codec


class Array(object):
Expand Down Expand Up @@ -73,6 +73,7 @@ class Array(object):
resize
append
view
astype
""" # flake8: noqa

Expand Down Expand Up @@ -1176,3 +1177,63 @@ def view(self, shape=None, chunks=None, dtype=None,
a._filters = filters

return a

def astype(self, dtype):
    """Return a read-only view of this array that converts values to
    `dtype` on the fly.

    Parameters
    ----------
    dtype : string or dtype
        NumPy dtype.

    Notes
    -----
    This method returns a new Array object which is a view on the same
    underlying chunk data. Modifying any data via the view is currently
    not permitted and will result in an error. This is an experimental
    feature and its behavior is subject to change in the future.

    See Also
    --------
    Array.view

    Examples
    --------
    >>> import zarr
    >>> import numpy as np
    >>> data = np.arange(100, dtype=np.uint8)
    >>> a = zarr.array(data, chunks=10)
    >>> a[:]
    array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
           16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
           32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
           48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
           64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
           80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
           96, 97, 98, 99], dtype=uint8)
    >>> v = a.astype(np.float32)
    >>> v.is_view
    True
    >>> v[:]
    array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.,
           10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
           20., 21., 22., 23., 24., 25., 26., 27., 28., 29.,
           30., 31., 32., 33., 34., 35., 36., 37., 38., 39.,
           40., 41., 42., 43., 44., 45., 46., 47., 48., 49.,
           50., 51., 52., 53., 54., 55., 56., 57., 58., 59.,
           60., 61., 62., 63., 64., 65., 66., 67., 68., 69.,
           70., 71., 72., 73., 74., 75., 76., 77., 78., 79.,
           80., 81., 82., 83., 84., 85., 86., 87., 88., 89.,
           90., 91., 92., 93., 94., 95., 96., 97., 98., 99.],
          dtype=float32)

    """  # flake8: noqa

    target = np.dtype(dtype)

    # the conversion filter leads the chain; any filters already configured
    # on this array follow it in their existing order
    chain = [AsType(encode_dtype=self._dtype, decode_dtype=target)]
    if self._filters:
        chain.extend(self._filters)

    return self.view(filters=chain, dtype=target, read_only=True)
58 changes: 58 additions & 0 deletions zarr/tests/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,64 @@ def test_repr(self):
eq(expect, actual)


class TestAsType(CodecTests, unittest.TestCase):

    codec_id = 'astype'

    def test_encode(self):
        # identity conversion over every float / signed / unsigned test array
        numeric = (a for a in test_arrays if a.dtype.kind in {'f', 'i', 'u'})
        for arr in numeric:
            self._test_encode(
                arr, encode_dtype=arr.dtype, decode_dtype=arr.dtype
            )

    def test_decode(self):
        for arr in test_arrays:
            kind = arr.dtype.kind
            same = dict(encode_dtype=arr.dtype, decode_dtype=arr.dtype)
            if kind == 'f':
                # floats go through the lossy comparison helper
                self._test_decode_lossy(arr, decimal=10, **same)
            elif kind in {'i', 'u'}:
                # integers go through the lossless comparison helper
                self._test_decode_lossless(arr, **same)

    def test_encode_output(self):
        enc, dec = 'i4', 'i8'
        codec = self.init_codec(encode_dtype=enc, decode_dtype=dec)

        # encoding takes decoded-type input and must yield the encoded type
        source = np.arange(10, 20, 1, dtype=dec)
        result = codec.encode(source)

        assert_array_equal(source.astype(enc), result)
        eq(np.dtype(enc), result.dtype)

    def test_decode_input(self):
        enc, dec = 'i4', 'i8'
        codec = self.init_codec(encode_dtype=enc, decode_dtype=dec)

        # decoding takes encoded-type input and must yield the decoded type
        source = np.arange(10, 20, 1, dtype=enc)
        result = codec.decode(source)

        assert_array_equal(source.astype(dec), result)
        eq(np.dtype(dec), result.dtype)

    def test_repr(self):
        codec = self.init_codec(encode_dtype='i4', decode_dtype='i8')
        eq('AsType(encode_dtype=int32, decode_dtype=int64)', repr(codec))


class TestFixedScaleOffset(CodecTests, unittest.TestCase):

codec_id = 'fixedscaleoffset'
Expand Down
34 changes: 34 additions & 0 deletions zarr/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,6 +766,40 @@ def test_repr(self):
for l1, l2 in zip(expect.split('\n'), actual.split('\n')):
eq(l1, l2)

def test_astype_no_filters(self):
    src_dtype = np.dtype(np.int8)
    dst_dtype = np.dtype(np.float32)
    shape = (100,)

    # reference data, built independently of the store
    data = np.arange(np.prod(shape), dtype=src_dtype).reshape(shape)

    # array backed by a plain dict store, initialised without a filters argument
    store = dict()
    init_array(store, shape=shape, chunks=10, dtype=src_dtype)
    z1 = Array(store)
    z1[...] = data

    z2 = z1.astype(dst_dtype)

    # converted contents must match NumPy's own cast, and the view is read-only
    assert_array_equal(data.astype(dst_dtype), z2)
    eq(z2.read_only, True)

def test_astype(self):
    src_dtype = np.dtype(np.int8)
    dst_dtype = np.dtype(np.float32)
    shape, chunks = (100,), (10,)

    # reference data to write and later compare against
    data = np.arange(np.prod(shape), dtype=src_dtype).reshape(shape)

    # array built through the test class's own factory
    z1 = self.create_array(shape=shape, chunks=chunks, dtype=src_dtype)
    z1[...] = data

    z2 = z1.astype(dst_dtype)

    # converted contents must match NumPy's own cast
    assert_array_equal(data.astype(dst_dtype), z2)


# custom store, does not support getsize()
class CustomMapping(object):
Expand Down
34 changes: 33 additions & 1 deletion zarr/tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from nose.tools import eq_ as eq


from zarr.codecs import Delta, FixedScaleOffset, \
from zarr.codecs import AsType, Delta, FixedScaleOffset, \
Quantize, PackBits, Categorize, \
Zlib, Blosc, BZ2
from zarr.creation import array
Expand Down Expand Up @@ -55,6 +55,38 @@ def test_array_with_delta_filter():
assert_array_equal(expect, actual)


def test_array_with_astype_filter():
    # Round-trip an int64 array through an AsType filter that stores int8,
    # for every compressor in `compressors`, and check the stored chunk data.

    # setup
    encode_dtype = 'i1'
    decode_dtype = 'i8'
    filters = [AsType(encode_dtype=encode_dtype, decode_dtype=decode_dtype)]
    # keep "how many chunks" and "how long is a chunk" as separate values so
    # the test stays correct if either one is changed
    n_chunks = 10
    chunk_size = 10
    shape = n_chunks * chunk_size
    data = np.arange(shape, dtype=decode_dtype)
    # what the chunks are expected to hold after filtering; computed once
    # because it does not depend on the compressor or the chunk index
    encoded = data.astype(encode_dtype)

    for compressor in compressors:
        print(repr(compressor))

        a = array(data, chunks=chunk_size, compressor=compressor,
                  filters=filters)

        # check round-trip
        assert data.dtype == a.dtype
        assert_array_equal(data, a[:])

        # check chunks
        for i in range(n_chunks):
            cdata = a.store[str(i)]
            if compressor:
                chunk = compressor.decode(cdata)
            else:
                # no compressor: the stored bytes are the filtered data
                chunk = cdata
            actual = np.frombuffer(chunk, dtype=encode_dtype)
            expect = encoded[i*chunk_size:(i+1)*chunk_size]
            assert_array_equal(expect, actual)


def test_array_with_scaleoffset_filter():

# setup
Expand Down

0 comments on commit eef8940

Please sign in to comment.