Commit

Merge pull request #67 from alimanfoo/issue_66
fix ZipStore performance; resolves #66
alimanfoo committed Sep 9, 2016
2 parents c8db5b1 + 62f4360 commit ebaddd2
Showing 9 changed files with 773 additions and 44 deletions.
3 changes: 3 additions & 0 deletions docs/api/storage.rst
@@ -13,4 +13,7 @@ can be used as a Zarr array store.
.. autoclass:: DirectoryStore
.. autoclass:: ZipStore

    .. automethod:: close
    .. automethod:: flush

.. autofunction:: migrate_1to2
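
A minimal sketch of how the two newly documented ``ZipStore`` methods might be used together (the file name ``example.zip`` and the array parameters are hypothetical, and ``flush()`` is assumed here to push pending records to the underlying zip file without closing the store)::

    import zarr

    # write an array through a ZipStore, then flush and close
    store = zarr.ZipStore('example.zip', mode='w')
    root = zarr.group(store=store)
    a = root.create_dataset('bar', shape=(20, 20), chunks=(10, 10))
    a[:] = 42
    store.flush()  # assumed: write pending records while keeping the store open
    store.close()  # required once writing is finished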
2 changes: 2 additions & 0 deletions docs/release.rst
@@ -1,6 +1,8 @@
Release notes
=============

* Fixed performance issues with ``ZipStore`` class
(`#66 <https://github.com/alimanfoo/zarr/issues/66>`_)
* The Blosc extension has been modified to return bytes instead of array
objects from compress and decompress function calls. This should
improve compatibility and also provides a small performance increase for
1 change: 1 addition & 0 deletions docs/spec/v2.rst
@@ -442,6 +442,7 @@ Here is the same example using a Zip file as storage::
>>> sub_grp = root_grp.create_group('foo')
>>> a = sub_grp.create_dataset('bar', shape=(20, 20), chunks=(10, 10))
>>> a[:] = 42
>>> store.close()

What has been stored::

6 changes: 6 additions & 0 deletions docs/tutorial.rst
@@ -515,6 +515,7 @@ Here is an example storing an array directly into a Zip file::
nbytes: 3.8M; nbytes_stored: 21.8K; ratio: 179.2; initialized: 100/100
compressor: Blosc(cname='lz4', clevel=5, shuffle=1)
store: ZipStore
>>> store.close()
>>> import os
>>> os.path.getsize('example.zip')
30721
@@ -536,12 +537,17 @@ Re-open and check that data have been written::
[42, 42, 42, ..., 42, 42, 42],
[42, 42, 42, ..., 42, 42, 42],
[42, 42, 42, ..., 42, 42, 42]], dtype=int32)
>>> store.close()

Note that there are some restrictions on how Zip files can be used,
because items within a Zip file cannot be updated in place. This means
that data in the array should only be written once, and write
operations should be aligned with chunk boundaries.

Note also that the ``close()`` method must be called after writing any data to
the store, otherwise essential records will not be written to the underlying
zip file.
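
For example, assigning whole chunk-aligned regions exactly once and closing
the store afterwards respects both restrictions. A minimal sketch under those
assumptions (the file name ``data.zip`` is hypothetical)::

    import numpy as np
    import zarr

    store = zarr.ZipStore('data.zip', mode='w')
    z = zarr.zeros((100, 100), chunks=(10, 10), dtype='i4', store=store)
    for i in range(0, 100, 10):
        # each assignment covers a full row of chunks, so every chunk is
        # written exactly once and writes stay aligned with chunk boundaries
        z[i:i+10, :] = np.arange(i, i + 1000).reshape(10, 100)
    store.close()  # essential: finalises the records in the zip file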

The Dask project has implementations of the ``MutableMapping``
interface for distributed storage systems, see the `S3Map
<http://s3fs.readthedocs.io/en/latest/api.html#s3fs.mapping.S3Map>`_
343 changes: 343 additions & 0 deletions notebooks/.ipynb_checkpoints/zip_benchmark-checkpoint.ipynb
@@ -0,0 +1,343 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'2.0.2.dev0+dirty'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import sys\n",
"sys.path.insert(0, '..')\n",
"import zarr\n",
"zarr.__version__"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Array(/3L/calldata/genotype, (7449486, 773, 2), int8, chunks=(13107, 40, 2), order=C)\n",
" nbytes: 10.7G; nbytes_stored: 193.5M; ratio: 56.7; initialized: 11380/11380\n",
" compressor: Blosc(cname='zstd', clevel=1, shuffle=2)\n",
" store: ZipStore"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"store = zarr.ZipStore('/data/coluzzi/ag1000g/data/phase1/release/AR3.1/haplotypes/main/zarr2/zstd/ag1000g.phase1.ar3.1.haplotypes.zip',\n",
" mode='r')\n",
"grp = zarr.Group(store)\n",
"z = grp['3L/calldata/genotype']\n",
"z"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 1832 function calls in 0.024 seconds\n",
"\n",
" Ordered by: cumulative time\n",
"\n",
" ncalls tottime percall cumtime percall filename:lineno(function)\n",
" 1 0.000 0.000 0.024 0.024 {built-in method builtins.exec}\n",
" 1 0.000 0.000 0.024 0.024 <string>:1(<module>)\n",
" 1 0.000 0.000 0.024 0.024 core.py:292(__getitem__)\n",
" 20 0.000 0.000 0.023 0.001 core.py:539(_chunk_getitem)\n",
" 20 0.000 0.000 0.020 0.001 core.py:679(_decode_chunk)\n",
" 20 0.000 0.000 0.020 0.001 codecs.py:355(decode)\n",
" 20 0.020 0.001 0.020 0.001 {zarr.blosc.decompress}\n",
" 20 0.000 0.000 0.002 0.000 storage.py:766(__getitem__)\n",
" 20 0.000 0.000 0.001 0.000 zipfile.py:1235(open)\n",
" 20 0.000 0.000 0.001 0.000 zipfile.py:821(read)\n",
" 20 0.000 0.000 0.001 0.000 zipfile.py:901(_read1)\n",
" 80 0.000 0.000 0.001 0.000 zipfile.py:660(read)\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:854(_update_crc)\n",
" 40 0.000 0.000 0.000 0.000 {built-in method zlib.crc32}\n",
" 80 0.000 0.000 0.000 0.000 {method 'read' of '_io.BufferedReader' objects}\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:937(_read2)\n",
" 80 0.000 0.000 0.000 0.000 core.py:390(<genexpr>)\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:953(close)\n",
" 20 0.000 0.000 0.000 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n",
" 20 0.000 0.000 0.000 0.000 util.py:106(is_total_slice)\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:708(__init__)\n",
" 20 0.000 0.000 0.000 0.000 {method 'decode' of 'bytes' objects}\n",
" 20 0.000 0.000 0.000 0.000 core.py:676(_chunk_key)\n",
" 80 0.000 0.000 0.000 0.000 {method 'seek' of '_io.BufferedReader' objects}\n",
" 20 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.frombuffer}\n",
" 80 0.000 0.000 0.000 0.000 core.py:398(<genexpr>)\n",
" 20 0.000 0.000 0.000 0.000 {method 'join' of 'str' objects}\n",
" 20 0.000 0.000 0.000 0.000 core.py:386(<listcomp>)\n",
" 20 0.000 0.000 0.000 0.000 {built-in method builtins.all}\n",
" 40 0.000 0.000 0.000 0.000 util.py:121(<genexpr>)\n",
" 231 0.000 0.000 0.000 0.000 {built-in method builtins.isinstance}\n",
" 20 0.000 0.000 0.000 0.000 cp437.py:14(decode)\n",
" 80 0.000 0.000 0.000 0.000 {method 'tell' of '_io.BufferedReader' objects}\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:667(close)\n",
" 20 0.000 0.000 0.000 0.000 {built-in method _struct.unpack}\n",
" 140 0.000 0.000 0.000 0.000 {built-in method builtins.max}\n",
" 20 0.000 0.000 0.000 0.000 {function ZipExtFile.close at 0x7f8cd5ca2048}\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:1194(getinfo)\n",
" 140 0.000 0.000 0.000 0.000 {built-in method builtins.min}\n",
" 20 0.000 0.000 0.000 0.000 threading.py:1224(current_thread)\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:654(__init__)\n",
" 1 0.000 0.000 0.000 0.000 util.py:195(get_chunk_range)\n",
" 20 0.000 0.000 0.000 0.000 {built-in method _codecs.charmap_decode}\n",
" 1 0.000 0.000 0.000 0.000 util.py:166(normalize_array_selection)\n",
" 1 0.000 0.000 0.000 0.000 util.py:198(<listcomp>)\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:1715(_fpclose)\n",
" 20 0.000 0.000 0.000 0.000 {method 'get' of 'dict' objects}\n",
" 63 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n",
" 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n",
" 2 0.000 0.000 0.000 0.000 util.py:182(<genexpr>)\n",
" 20 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n",
" 20 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n",
" 1 0.000 0.000 0.000 0.000 util.py:130(normalize_axis_selection)\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:636(_get_decompressor)\n",
" 20 0.000 0.000 0.000 0.000 threading.py:1298(main_thread)\n",
" 4 0.000 0.000 0.000 0.000 core.py:373(<genexpr>)\n",
" 3 0.000 0.000 0.000 0.000 util.py:187(<genexpr>)\n",
" 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n",
"\n",
"\n"
]
}
],
"source": [
"import cProfile\n",
"cProfile.run('z[:10]', sort='cumtime')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'0.11.0'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import dask\n",
"import dask.array as da\n",
"dask.__version__"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"dask.array<array-f..., shape=(7449486, 773, 2), dtype=int8, chunksize=(13107, 40, 2)>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d = da.from_array(z, chunks=z.chunks)\n",
"d"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3min 35s, sys: 4.36 s, total: 3min 40s\n",
"Wall time: 29.5 s\n"
]
},
{
"data": {
"text/plain": [
"array([[3, 0],\n",
" [1, 0],\n",
" [2, 0],\n",
" ..., \n",
" [2, 8],\n",
" [8, 8],\n",
" [0, 1]])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%time d.sum(axis=1).compute()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Array(/3L/calldata/genotype, (7449486, 773, 2), int8, chunks=(13107, 40, 2), order=C)\n",
" nbytes: 10.7G; nbytes_stored: 193.5M; ratio: 56.7; initialized: 11380/11380\n",
" compressor: Blosc(cname='zstd', clevel=1, shuffle=2)\n",
" store: DirectoryStore"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# compare with same data via directory store\n",
"store_dir = zarr.DirectoryStore('/data/coluzzi/ag1000g/data/phase1/release/AR3.1/haplotypes/main/zarr2/zstd/ag1000g.phase1.ar3.1.haplotypes')\n",
"grp_dir = zarr.Group(store_dir)\n",
"z_dir = grp_dir['3L/calldata/genotype']\n",
"z_dir"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"dask.array<array-7..., shape=(7449486, 773, 2), dtype=int8, chunksize=(13107, 40, 2)>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d_dir = da.from_array(z_dir, chunks=z_dir.chunks)\n",
"d_dir"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3min 39s, sys: 4.91 s, total: 3min 44s\n",
"Wall time: 31.1 s\n"
]
},
{
"data": {
"text/plain": [
"array([[3, 0],\n",
" [1, 0],\n",
" [2, 0],\n",
" ..., \n",
" [2, 8],\n",
" [8, 8],\n",
" [0, 1]])"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%time d_dir.sum(axis=1).compute()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
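
The notebook above benchmarks read throughput through a ``ZipStore`` against a
``DirectoryStore`` holding the same data. A condensed sketch of that comparison
(the local paths are hypothetical stand-ins for the dataset used in the
notebook, and ``time.time()`` replaces the ``%time`` magic)::

    import time

    import dask.array as da
    import zarr

    def timed_sum(store):
        # open the group backing the store, wrap the genotype array with
        # dask and time a simple reduction, as in the notebook cells above
        grp = zarr.Group(store)
        z = grp['3L/calldata/genotype']
        d = da.from_array(z, chunks=z.chunks)
        t0 = time.time()
        d.sum(axis=1).compute()
        return time.time() - t0

    zip_store = zarr.ZipStore('haplotypes.zip', mode='r')
    dir_store = zarr.DirectoryStore('haplotypes')
    print('ZipStore:       %.1f s' % timed_sum(zip_store))
    print('DirectoryStore: %.1f s' % timed_sum(dir_store))
    zip_store.close()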
