Merge pull request #70 from alimanfoo/issue_55

blosc returns bytes; resolves #55
zarr-developers · Sep 9, 2016 · c8db5b1 · c8db5b1
2 parents 1ddaa66 + cc11c65
commit c8db5b1
Show file tree

Hide file tree

Showing 6 changed files with 999 additions and 609 deletions.
diff --git a/docs/release.rst b/docs/release.rst
@@ -1,6 +1,11 @@
 Release notes
 =============
 
+* The Blosc extension has been modified to return bytes instead of array
+  objects from compress and decompress function calls. This should
+  improve compatibility, and it also provides a small performance increase
+  when compressing data that has a high compression ratio
+  (`#55 <https://github.com/alimanfoo/zarr/issues/55>`_).
 * Added ``overwrite`` keyword argument to array and group creation methods
   on the :class:`zarr.hierarchy.Group` class
   (`#71 <https://github.com/alimanfoo/zarr/issues/71>`_).

diff --git a/notebooks/.ipynb_checkpoints/blosc_microbench-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/blosc_microbench-checkpoint.ipynb
@@ -0,0 +1,200 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'2.0.1'"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "import zarr\n",
+    "zarr.__version__"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "10 loops, best of 3: 110 ms per loop\n",
+      "1 loop, best of 3: 235 ms per loop\n",
+      "Array((100000000,), int64, chunks=(200000,), order=C)\n",
+      "  nbytes: 762.9M; nbytes_stored: 11.2M; ratio: 67.8; initialized: 500/500\n",
+      "  compressor: Blosc(cname='lz4', clevel=5, shuffle=1)\n",
+      "  store: dict\n"
+     ]
+    }
+   ],
+   "source": [
+    "z = zarr.empty(shape=100000000, chunks=200000, dtype='i8')\n",
+    "data = np.arange(100000000, dtype='i8')\n",
+    "%timeit z[:] = data\n",
+    "%timeit z[:]\n",
+    "print(z)\n",
+    "assert np.all(z[:] == data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1 loop, best of 3: 331 ms per loop\n",
+      "1 loop, best of 3: 246 ms per loop\n",
+      "Array((100000000,), float64, chunks=(200000,), order=C)\n",
+      "  nbytes: 762.9M; nbytes_stored: 724.8M; ratio: 1.1; initialized: 500/500\n",
+      "  compressor: Blosc(cname='lz4', clevel=5, shuffle=1)\n",
+      "  store: dict\n"
+     ]
+    }
+   ],
+   "source": [
+    "z = zarr.empty(shape=100000000, chunks=200000, dtype='f8')\n",
+    "data = np.random.normal(size=100000000)\n",
+    "%timeit z[:] = data\n",
+    "%timeit z[:]\n",
+    "print(z)\n",
+    "assert np.all(z[:] == data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'2.0.2.dev0+dirty'"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "import sys\n",
+    "sys.path.insert(0, '..')\n",
+    "import zarr\n",
+    "zarr.__version__"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "10 loops, best of 3: 92.7 ms per loop\n",
+      "1 loop, best of 3: 230 ms per loop\n",
+      "Array((100000000,), int64, chunks=(200000,), order=C)\n",
+      "  nbytes: 762.9M; nbytes_stored: 11.2M; ratio: 67.8; initialized: 500/500\n",
+      "  compressor: Blosc(cname='lz4', clevel=5, shuffle=1)\n",
+      "  store: dict\n"
+     ]
+    }
+   ],
+   "source": [
+    "z = zarr.empty(shape=100000000, chunks=200000, dtype='i8')\n",
+    "data = np.arange(100000000, dtype='i8')\n",
+    "%timeit z[:] = data\n",
+    "%timeit z[:]\n",
+    "print(z)\n",
+    "assert np.all(z[:] == data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1 loop, best of 3: 338 ms per loop\n",
+      "1 loop, best of 3: 253 ms per loop\n",
+      "Array((100000000,), float64, chunks=(200000,), order=C)\n",
+      "  nbytes: 762.9M; nbytes_stored: 724.8M; ratio: 1.1; initialized: 500/500\n",
+      "  compressor: Blosc(cname='lz4', clevel=5, shuffle=1)\n",
+      "  store: dict\n"
+     ]
+    }
+   ],
+   "source": [
+    "z = zarr.empty(shape=100000000, chunks=200000, dtype='f8')\n",
+    "data = np.random.normal(size=100000000)\n",
+    "%timeit z[:] = data\n",
+    "%timeit z[:]\n",
+    "print(z)\n",
+    "assert np.all(z[:] == data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}