Commit

Merge pull request #67 from alimanfoo/issue_66
fix ZipStore performance; resolves #66
alimanfoo committed Sep 9, 2016
2 parents c8db5b1 + 62f4360 commit ebaddd2
Showing 9 changed files with 773 additions and 44 deletions.
3 changes: 3 additions & 0 deletions docs/api/storage.rst
@@ -13,4 +13,7 @@ can be used as a Zarr array store.
.. autoclass:: DirectoryStore
.. autoclass:: ZipStore

    .. automethod:: close
    .. automethod:: flush

.. autofunction:: migrate_1to2
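
A minimal sketch of how the two newly documented ``ZipStore`` methods might be used together (the file name ``example.zip`` and the array parameters are hypothetical, and ``flush()`` is assumed here to push pending records to the underlying zip file without closing the store)::

    import zarr

    # write an array through a ZipStore, then flush and close
    store = zarr.ZipStore('example.zip', mode='w')
    root = zarr.group(store=store)
    a = root.create_dataset('bar', shape=(20, 20), chunks=(10, 10))
    a[:] = 42
    store.flush()  # assumed: write pending records while keeping the store open
    store.close()  # required once writing is finished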
2 changes: 2 additions & 0 deletions docs/release.rst
@@ -1,6 +1,8 @@
Release notes
=============

* Fixed performance issues with ``ZipStore`` class
(`#66 <https://github.com/alimanfoo/zarr/issues/66>`_)
* The Blosc extension has been modified to return bytes instead of array
objects from compress and decompress function calls. This should
improve compatibility and also provides a small performance increase for
1 change: 1 addition & 0 deletions docs/spec/v2.rst
@@ -442,6 +442,7 @@ Here is the same example using a Zip file as storage::
>>> sub_grp = root_grp.create_group('foo')
>>> a = sub_grp.create_dataset('bar', shape=(20, 20), chunks=(10, 10))
>>> a[:] = 42
>>> store.close()

What has been stored::

6 changes: 6 additions & 0 deletions docs/tutorial.rst
@@ -515,6 +515,7 @@ Here is an example storing an array directly into a Zip file::
nbytes: 3.8M; nbytes_stored: 21.8K; ratio: 179.2; initialized: 100/100
compressor: Blosc(cname='lz4', clevel=5, shuffle=1)
store: ZipStore
>>> store.close()
>>> import os
>>> os.path.getsize('example.zip')
30721
@@ -536,12 +537,17 @@ Re-open and check that data have been written::
[42, 42, 42, ..., 42, 42, 42],
[42, 42, 42, ..., 42, 42, 42],
[42, 42, 42, ..., 42, 42, 42]], dtype=int32)
>>> store.close()

Note that there are some restrictions on how Zip files can be used,
because items within a Zip file cannot be updated in place. This means
that data in the array should only be written once, and write
operations should be aligned with chunk boundaries.

Note also that the ``close()`` method must be called after writing any data to
the store, otherwise essential records will not be written to the underlying
zip file.
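
For example, assigning whole chunk-aligned regions exactly once and closing
the store afterwards respects both restrictions. A minimal sketch under those
assumptions (the file name ``data.zip`` is hypothetical)::

    import numpy as np
    import zarr

    store = zarr.ZipStore('data.zip', mode='w')
    z = zarr.zeros((100, 100), chunks=(10, 10), dtype='i4', store=store)
    for i in range(0, 100, 10):
        # each assignment covers a full row of chunks, so every chunk is
        # written exactly once and writes stay aligned with chunk boundaries
        z[i:i+10, :] = np.arange(i, i + 1000).reshape(10, 100)
    store.close()  # essential: finalises the records in the zip file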

The Dask project has implementations of the ``MutableMapping``
interface for distributed storage systems, see the `S3Map
<http://s3fs.readthedocs.io/en/latest/api.html#s3fs.mapping.S3Map>`_
343 changes: 343 additions & 0 deletions notebooks/.ipynb_checkpoints/zip_benchmark-checkpoint.ipynb
@@ -0,0 +1,343 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'2.0.2.dev0+dirty'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import sys\n",
"sys.path.insert(0, '..')\n",
"import zarr\n",
"zarr.__version__"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Array(/3L/calldata/genotype, (7449486, 773, 2), int8, chunks=(13107, 40, 2), order=C)\n",
" nbytes: 10.7G; nbytes_stored: 193.5M; ratio: 56.7; initialized: 11380/11380\n",
" compressor: Blosc(cname='zstd', clevel=1, shuffle=2)\n",
" store: ZipStore"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"store = zarr.ZipStore('/data/coluzzi/ag1000g/data/phase1/release/AR3.1/haplotypes/main/zarr2/zstd/ag1000g.phase1.ar3.1.haplotypes.zip',\n",
" mode='r')\n",
"grp = zarr.Group(store)\n",
"z = grp['3L/calldata/genotype']\n",
"z"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 1832 function calls in 0.024 seconds\n",
"\n",
" Ordered by: cumulative time\n",
"\n",
" ncalls tottime percall cumtime percall filename:lineno(function)\n",
" 1 0.000 0.000 0.024 0.024 {built-in method builtins.exec}\n",
" 1 0.000 0.000 0.024 0.024 <string>:1(<module>)\n",
" 1 0.000 0.000 0.024 0.024 core.py:292(__getitem__)\n",
" 20 0.000 0.000 0.023 0.001 core.py:539(_chunk_getitem)\n",
" 20 0.000 0.000 0.020 0.001 core.py:679(_decode_chunk)\n",
" 20 0.000 0.000 0.020 0.001 codecs.py:355(decode)\n",
" 20 0.020 0.001 0.020 0.001 {zarr.blosc.decompress}\n",
" 20 0.000 0.000 0.002 0.000 storage.py:766(__getitem__)\n",
" 20 0.000 0.000 0.001 0.000 zipfile.py:1235(open)\n",
" 20 0.000 0.000 0.001 0.000 zipfile.py:821(read)\n",
" 20 0.000 0.000 0.001 0.000 zipfile.py:901(_read1)\n",
" 80 0.000 0.000 0.001 0.000 zipfile.py:660(read)\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:854(_update_crc)\n",
" 40 0.000 0.000 0.000 0.000 {built-in method zlib.crc32}\n",
" 80 0.000 0.000 0.000 0.000 {method 'read' of '_io.BufferedReader' objects}\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:937(_read2)\n",
" 80 0.000 0.000 0.000 0.000 core.py:390(<genexpr>)\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:953(close)\n",
" 20 0.000 0.000 0.000 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n",
" 20 0.000 0.000 0.000 0.000 util.py:106(is_total_slice)\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:708(__init__)\n",
" 20 0.000 0.000 0.000 0.000 {method 'decode' of 'bytes' objects}\n",
" 20 0.000 0.000 0.000 0.000 core.py:676(_chunk_key)\n",
" 80 0.000 0.000 0.000 0.000 {method 'seek' of '_io.BufferedReader' objects}\n",
" 20 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.frombuffer}\n",
" 80 0.000 0.000 0.000 0.000 core.py:398(<genexpr>)\n",
" 20 0.000 0.000 0.000 0.000 {method 'join' of 'str' objects}\n",
" 20 0.000 0.000 0.000 0.000 core.py:386(<listcomp>)\n",
" 20 0.000 0.000 0.000 0.000 {built-in method builtins.all}\n",
" 40 0.000 0.000 0.000 0.000 util.py:121(<genexpr>)\n",
" 231 0.000 0.000 0.000 0.000 {built-in method builtins.isinstance}\n",
" 20 0.000 0.000 0.000 0.000 cp437.py:14(decode)\n",
" 80 0.000 0.000 0.000 0.000 {method 'tell' of '_io.BufferedReader' objects}\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:667(close)\n",
" 20 0.000 0.000 0.000 0.000 {built-in method _struct.unpack}\n",
" 140 0.000 0.000 0.000 0.000 {built-in method builtins.max}\n",
" 20 0.000 0.000 0.000 0.000 {function ZipExtFile.close at 0x7f8cd5ca2048}\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:1194(getinfo)\n",
" 140 0.000 0.000 0.000 0.000 {built-in method builtins.min}\n",
" 20 0.000 0.000 0.000 0.000 threading.py:1224(current_thread)\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:654(__init__)\n",
" 1 0.000 0.000 0.000 0.000 util.py:195(get_chunk_range)\n",
" 20 0.000 0.000 0.000 0.000 {built-in method _codecs.charmap_decode}\n",
" 1 0.000 0.000 0.000 0.000 util.py:166(normalize_array_selection)\n",
" 1 0.000 0.000 0.000 0.000 util.py:198(<listcomp>)\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:1715(_fpclose)\n",
" 20 0.000 0.000 0.000 0.000 {method 'get' of 'dict' objects}\n",
" 63 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n",
" 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n",
" 2 0.000 0.000 0.000 0.000 util.py:182(<genexpr>)\n",
" 20 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n",
" 20 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n",
" 1 0.000 0.000 0.000 0.000 util.py:130(normalize_axis_selection)\n",
" 20 0.000 0.000 0.000 0.000 zipfile.py:636(_get_decompressor)\n",
" 20 0.000 0.000 0.000 0.000 threading.py:1298(main_thread)\n",
" 4 0.000 0.000 0.000 0.000 core.py:373(<genexpr>)\n",
" 3 0.000 0.000 0.000 0.000 util.py:187(<genexpr>)\n",
" 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n",
"\n",
"\n"
]
}
],
"source": [
"import cProfile\n",
"cProfile.run('z[:10]', sort='cumtime')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'0.11.0'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import dask\n",
"import dask.array as da\n",
"dask.__version__"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"dask.array<array-f..., shape=(7449486, 773, 2), dtype=int8, chunksize=(13107, 40, 2)>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d = da.from_array(z, chunks=z.chunks)\n",
"d"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3min 35s, sys: 4.36 s, total: 3min 40s\n",
"Wall time: 29.5 s\n"
]
},
{
"data": {
"text/plain": [
"array([[3, 0],\n",
" [1, 0],\n",
" [2, 0],\n",
" ..., \n",
" [2, 8],\n",
" [8, 8],\n",
" [0, 1]])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%time d.sum(axis=1).compute()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Array(/3L/calldata/genotype, (7449486, 773, 2), int8, chunks=(13107, 40, 2), order=C)\n",
" nbytes: 10.7G; nbytes_stored: 193.5M; ratio: 56.7; initialized: 11380/11380\n",
" compressor: Blosc(cname='zstd', clevel=1, shuffle=2)\n",
" store: DirectoryStore"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# compare with same data via directory store\n",
"store_dir = zarr.DirectoryStore('/data/coluzzi/ag1000g/data/phase1/release/AR3.1/haplotypes/main/zarr2/zstd/ag1000g.phase1.ar3.1.haplotypes')\n",
"grp_dir = zarr.Group(store_dir)\n",
"z_dir = grp_dir['3L/calldata/genotype']\n",
"z_dir"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"dask.array<array-7..., shape=(7449486, 773, 2), dtype=int8, chunksize=(13107, 40, 2)>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d_dir = da.from_array(z_dir, chunks=z_dir.chunks)\n",
"d_dir"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3min 39s, sys: 4.91 s, total: 3min 44s\n",
"Wall time: 31.1 s\n"
]
},
{
"data": {
"text/plain": [
"array([[3, 0],\n",
" [1, 0],\n",
" [2, 0],\n",
" ..., \n",
" [2, 8],\n",
" [8, 8],\n",
" [0, 1]])"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%time d_dir.sum(axis=1).compute()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
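
The notebook above benchmarks read throughput through a ``ZipStore`` against a
``DirectoryStore`` holding the same data. A condensed sketch of that comparison
(the local paths are hypothetical stand-ins for the dataset used in the
notebook, and ``time.time()`` replaces the ``%time`` magic)::

    import time

    import dask.array as da
    import zarr

    def timed_sum(store):
        # open the group backing the store, wrap the genotype array with
        # dask and time a simple reduction, as in the notebook cells above
        grp = zarr.Group(store)
        z = grp['3L/calldata/genotype']
        d = da.from_array(z, chunks=z.chunks)
        t0 = time.time()
        d.sum(axis=1).compute()
        return time.time() - t0

    zip_store = zarr.ZipStore('haplotypes.zip', mode='r')
    dir_store = zarr.DirectoryStore('haplotypes')
    print('ZipStore:       %.1f s' % timed_sum(zip_store))
    print('DirectoryStore: %.1f s' % timed_sum(dir_store))
    zip_store.close()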
