Merge pull request #96 from jakirkham/astype_filter

AsType Filter
zarr-developers · Jan 4, 2017 · eef8940 · eef8940
2 parents 70f7c1d + 328d04c
commit eef8940
Show file tree

Hide file tree

Showing 7 changed files with 272 additions and 2 deletions.
diff --git a/docs/api/codecs.rst b/docs/api/codecs.rst
@@ -21,6 +21,7 @@ code of this module for details.
 .. autoclass:: BZ2
 .. autoclass:: LZMA
 .. autoclass:: Delta
+.. autoclass:: AsType
 .. autoclass:: FixedScaleOffset
 .. autoclass:: Quantize
 .. autoclass:: PackBits

diff --git a/docs/api/core.rst b/docs/api/core.rst
@@ -9,3 +9,4 @@ The Array class (``zarr.core``)
     .. automethod:: resize
     .. automethod:: append
     .. automethod:: view
+    .. automethod:: astype
diff --git a/zarr/codecs.py b/zarr/codecs.py
@@ -485,6 +485,89 @@ def __repr__(self):
 codec_registry[Delta.codec_id] = Delta
 
 
+class AsType(Codec):
+    """Filter to convert data between different types.
+
+    Parameters
+    ----------
+    encode_dtype : dtype
+        Data type to use for encoded data.
+    decode_dtype : dtype
+        Data type to use for decoded data.
+
+    Notes
+    -----
+    If `encode_dtype` is of lower precision than `decode_dtype`, please be
+    aware that data loss can occur by writing data to disk using this filter.
+    No checks are made to ensure the casting will work in that direction, so
+    data corruption can occur for values that `encode_dtype` cannot represent.
+
+    Examples
+    --------
+    >>> import zarr
+    >>> import numpy as np
+    >>> x = np.arange(100, 120, 2, dtype=np.int8)
+    >>> x
+    array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int8)
+    >>> f = zarr.AsType(encode_dtype=x.dtype, decode_dtype=np.int64)
+    >>> y = f.decode(x)
+    >>> y
+    array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118])
+    >>> z = f.encode(y)
+    >>> z
+    array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int8)
+
+    """  # flake8: noqa
+
+    codec_id = 'astype'
+
+    def __init__(self, encode_dtype, decode_dtype):
+        # normalise both arguments so that strings, type objects and dtype
+        # instances are all accepted; both are required
+        self.encode_dtype = np.dtype(encode_dtype)
+        self.decode_dtype = np.dtype(decode_dtype)
+
+    def encode(self, buf):
+        """Return the data in `buf`, read as `decode_dtype`, converted to
+        `encode_dtype`."""
+
+        # view input data as 1D array
+        arr = _ndarray_from_buffer(buf, self.decode_dtype)
+
+        # convert and copy
+        enc = arr.astype(self.encode_dtype)
+
+        return enc
+
+    def decode(self, buf, out=None):
+        """Return the data in `buf`, read as `encode_dtype`, converted to
+        `decode_dtype`; `out` is passed on to `_buffer_copy`."""
+
+        # view encoded data as 1D array
+        enc = _ndarray_from_buffer(buf, self.encode_dtype)
+
+        # convert and copy
+        dec = enc.astype(self.decode_dtype)
+
+        # handle output
+        out = _buffer_copy(dec, out)
+
+        return out
+
+    def get_config(self):
+        """Return the codec id and both dtypes (as dtype strings) in a
+        dict."""
+        return {
+            'id': self.codec_id,
+            'encode_dtype': self.encode_dtype.str,
+            'decode_dtype': self.decode_dtype.str,
+        }
+
+    def __repr__(self):
+        return (
+            '%s(encode_dtype=%s, decode_dtype=%s)' % (
+                type(self).__name__,
+                self.encode_dtype,
+                self.decode_dtype
+            )
+        )
+
+
+codec_registry[AsType.codec_id] = AsType
+
+
 class FixedScaleOffset(Codec):
     """Simplified version of the scale-offset filter available in HDF5.
     Applies the transformation `(x - offset) * scale` to all chunks. Results

diff --git a/zarr/core.py b/zarr/core.py
@@ -15,7 +15,7 @@
 from zarr.attrs import Attributes
 from zarr.errors import PermissionError, err_read_only, err_array_not_found
 from zarr.compat import reduce
-from zarr.codecs import get_codec
+from zarr.codecs import AsType, get_codec
 
 
 class Array(object):
@@ -73,6 +73,7 @@ class Array(object):
     resize
     append
     view
+    astype
 
     """  # flake8: noqa
 
@@ -1176,3 +1177,63 @@ def view(self, shape=None, chunks=None, dtype=None,
             a._filters = filters
 
         return a
+
+    def astype(self, dtype):
+        """Return a read-only view that converts data to `dtype` on the fly.
+
+        Parameters
+        ----------
+        dtype : string or dtype
+            NumPy dtype of the data as seen through the returned view.
+
+        Notes
+        -----
+        The new Array object returned by this method is a view on the same
+        underlying chunk data. It is created read-only, so modifying data
+        via the view is currently not permitted and will result in an
+        error. This is an experimental feature and its behavior is subject
+        to change in the future.
+
+        See Also
+        --------
+        Array.view
+
+        Examples
+        --------
+
+        >>> import zarr
+        >>> import numpy as np
+        >>> data = np.arange(100, dtype=np.uint8)
+        >>> a = zarr.array(data, chunks=10)
+        >>> a[:]
+        array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+               16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+               32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+               48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+               64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+               80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+               96, 97, 98, 99], dtype=uint8)
+        >>> v = a.astype(np.float32)
+        >>> v.is_view
+        True
+        >>> v[:]
+        array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,
+                10.,  11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,
+                20.,  21.,  22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,
+                30.,  31.,  32.,  33.,  34.,  35.,  36.,  37.,  38.,  39.,
+                40.,  41.,  42.,  43.,  44.,  45.,  46.,  47.,  48.,  49.,
+                50.,  51.,  52.,  53.,  54.,  55.,  56.,  57.,  58.,  59.,
+                60.,  61.,  62.,  63.,  64.,  65.,  66.,  67.,  68.,  69.,
+                70.,  71.,  72.,  73.,  74.,  75.,  76.,  77.,  78.,  79.,
+                80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,  89.,
+                90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99.],
+              dtype=float32)
+        """  # flake8: noqa
+
+        target = np.dtype(dtype)
+
+        # the conversion filter leads the chain; filters already configured
+        # on this array follow it in their existing order
+        chain = [AsType(encode_dtype=self._dtype, decode_dtype=target)]
+        if self._filters:
+            chain.extend(self._filters)
+
+        return self.view(filters=chain, dtype=target, read_only=True)
diff --git a/zarr/tests/test_codecs.py b/zarr/tests/test_codecs.py
@@ -294,6 +294,64 @@ def test_repr(self):
         eq(expect, actual)
 
 
+class TestAsType(CodecTests, unittest.TestCase):
+    # Exercises the AsType codec through the shared CodecTests helpers.
+
+    codec_id = 'astype'
+
+    def test_encode(self):
+        # identity conversion on every float / signed / unsigned test array
+        numeric_kinds = ('f', 'i', 'u')
+        for arr in test_arrays:
+            if arr.dtype.kind not in numeric_kinds:
+                continue
+            self._test_encode(
+                arr, encode_dtype=arr.dtype, decode_dtype=arr.dtype
+            )
+
+    def test_decode(self):
+        # floats go through the lossy helper, integers through the lossless
+        # one; arrays of any other kind are skipped
+        for arr in test_arrays:
+            kind = arr.dtype.kind
+            dtypes = dict(encode_dtype=arr.dtype, decode_dtype=arr.dtype)
+            if kind == 'f':
+                self._test_decode_lossy(arr, decimal=10, **dtypes)
+            elif kind in ('i', 'u'):
+                self._test_decode_lossless(arr, **dtypes)
+
+    def test_encode_output(self):
+        # encoding i8 input must yield i4 output with the same values
+        enc_dt, dec_dt = 'i4', 'i8'
+        codec = self.init_codec(encode_dtype=enc_dt, decode_dtype=dec_dt)
+        decoded = np.arange(10, 20, 1, dtype=dec_dt)
+        expect = decoded.astype(enc_dt)
+        actual = codec.encode(decoded)
+        assert_array_equal(expect, actual)
+        eq(np.dtype(enc_dt), actual.dtype)
+
+    def test_decode_input(self):
+        # decoding i4 input must yield i8 output with the same values
+        enc_dt, dec_dt = 'i4', 'i8'
+        codec = self.init_codec(encode_dtype=enc_dt, decode_dtype=dec_dt)
+        encoded = np.arange(10, 20, 1, dtype=enc_dt)
+        expect = encoded.astype(dec_dt)
+        actual = codec.decode(encoded)
+        assert_array_equal(expect, actual)
+        eq(np.dtype(dec_dt), actual.dtype)
+
+    def test_repr(self):
+        codec = self.init_codec(encode_dtype='i4', decode_dtype='i8')
+        eq('AsType(encode_dtype=int32, decode_dtype=int64)', repr(codec))
+
+
 class TestFixedScaleOffset(CodecTests, unittest.TestCase):
 
     codec_id = 'fixedscaleoffset'

diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py
@@ -766,6 +766,40 @@ def test_repr(self):
             for l1, l2 in zip(expect.split('\n'), actual.split('\n')):
                 eq(l1, l2)
 
+    def test_astype_no_filters(self):
+        # astype() on an array that was initialised without any filters
+        shape = (100,)
+        dtype = np.dtype(np.int8)
+        astype = np.dtype(np.float32)
+
+        store = dict()
+        init_array(store, shape=shape, chunks=10, dtype=dtype)
+
+        data = np.arange(np.prod(shape), dtype=dtype).reshape(shape)
+
+        z1 = Array(store)
+        z1[...] = data
+        z2 = z1.astype(astype)
+
+        expected = data.astype(astype)
+        assert_array_equal(expected, z2)
+        # assert_array_equal compares values only, so the dtype has to be
+        # checked separately to prove that a conversion took place
+        eq(astype, z2.dtype)
+        eq(astype, z2[...].dtype)
+        eq(z2.read_only, True)
+
+    def test_astype(self):
+        # astype() on an array built by this test class's create_array()
+        shape = (100,)
+        chunks = (10,)
+
+        dtype = np.dtype(np.int8)
+        astype = np.dtype(np.float32)
+
+        data = np.arange(np.prod(shape), dtype=dtype).reshape(shape)
+
+        z1 = self.create_array(shape=shape, chunks=chunks, dtype=dtype)
+        z1[...] = data
+        z2 = z1.astype(astype)
+
+        expected = data.astype(astype)
+        assert_array_equal(expected, z2)
+        # assert_array_equal compares values only, so the dtype has to be
+        # checked separately to prove that a conversion took place
+        eq(astype, z2.dtype)
+        eq(astype, z2[...].dtype)
+        # astype() always requests a read-only view
+        eq(z2.read_only, True)
+
 
 # custom store, does not support getsize()
 class CustomMapping(object):

diff --git a/zarr/tests/test_filters.py b/zarr/tests/test_filters.py
@@ -7,7 +7,7 @@
 from nose.tools import eq_ as eq
 
 
-from zarr.codecs import Delta, FixedScaleOffset, \
+from zarr.codecs import AsType, Delta, FixedScaleOffset, \
     Quantize, PackBits, Categorize, \
     Zlib, Blosc, BZ2
 from zarr.creation import array
@@ -55,6 +55,38 @@ def test_array_with_delta_filter():
             assert_array_equal(expect, actual)
 
 
+def test_array_with_astype_filter():
+    # Round-trip data through an AsType filter and check the stored chunks.
+
+    # setup
+    encode_dtype = 'i1'
+    decode_dtype = 'i8'
+    filters = [AsType(encode_dtype=encode_dtype, decode_dtype=decode_dtype)]
+    chunks = 10  # number of chunks
+    chunk_size = 10  # items per chunk
+    shape = chunks * chunk_size
+    data = np.arange(shape, dtype=decode_dtype)
+
+    for compressor in compressors:
+        print(repr(compressor))
+
+        # the `chunks` argument of array() is the chunk length, so pass
+        # chunk_size rather than the chunk count
+        a = array(data, chunks=chunk_size, compressor=compressor,
+                  filters=filters)
+
+        # check round-trip
+        assert data.dtype == a.dtype
+        assert_array_equal(data, a[:])
+
+        # check chunks
+        for i in range(chunks):
+            cdata = a.store[str(i)]
+            if compressor:
+                chunk = compressor.decode(cdata)
+            else:
+                chunk = cdata
+            actual = np.frombuffer(chunk, dtype=encode_dtype)
+            expect = data.astype(encode_dtype)[i*chunk_size:(i+1)*chunk_size]
+            assert_array_equal(expect, actual)
+
+
 def test_array_with_scaleoffset_filter():
 
     # setup