-
-
Notifications
You must be signed in to change notification settings - Fork 259
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #67 from alimanfoo/issue_66
fix ZipStore performance; resolves #66
- Loading branch information
Showing
9 changed files
with
773 additions
and
44 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
343 changes: 343 additions & 0 deletions
343
notebooks/.ipynb_checkpoints/zip_benchmark-checkpoint.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,343 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'2.0.2.dev0+dirty'" | ||
] | ||
}, | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"import sys\n", | ||
"sys.path.insert(0, '..')\n", | ||
"import zarr\n", | ||
"zarr.__version__" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"Array(/3L/calldata/genotype, (7449486, 773, 2), int8, chunks=(13107, 40, 2), order=C)\n", | ||
" nbytes: 10.7G; nbytes_stored: 193.5M; ratio: 56.7; initialized: 11380/11380\n", | ||
" compressor: Blosc(cname='zstd', clevel=1, shuffle=2)\n", | ||
" store: ZipStore" | ||
] | ||
}, | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"store = zarr.ZipStore('/data/coluzzi/ag1000g/data/phase1/release/AR3.1/haplotypes/main/zarr2/zstd/ag1000g.phase1.ar3.1.haplotypes.zip',\n", | ||
" mode='r')\n", | ||
"grp = zarr.Group(store)\n", | ||
"z = grp['3L/calldata/genotype']\n", | ||
"z" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" 1832 function calls in 0.024 seconds\n", | ||
"\n", | ||
" Ordered by: cumulative time\n", | ||
"\n", | ||
" ncalls tottime percall cumtime percall filename:lineno(function)\n", | ||
" 1 0.000 0.000 0.024 0.024 {built-in method builtins.exec}\n", | ||
" 1 0.000 0.000 0.024 0.024 <string>:1(<module>)\n", | ||
" 1 0.000 0.000 0.024 0.024 core.py:292(__getitem__)\n", | ||
" 20 0.000 0.000 0.023 0.001 core.py:539(_chunk_getitem)\n", | ||
" 20 0.000 0.000 0.020 0.001 core.py:679(_decode_chunk)\n", | ||
" 20 0.000 0.000 0.020 0.001 codecs.py:355(decode)\n", | ||
" 20 0.020 0.001 0.020 0.001 {zarr.blosc.decompress}\n", | ||
" 20 0.000 0.000 0.002 0.000 storage.py:766(__getitem__)\n", | ||
" 20 0.000 0.000 0.001 0.000 zipfile.py:1235(open)\n", | ||
" 20 0.000 0.000 0.001 0.000 zipfile.py:821(read)\n", | ||
" 20 0.000 0.000 0.001 0.000 zipfile.py:901(_read1)\n", | ||
" 80 0.000 0.000 0.001 0.000 zipfile.py:660(read)\n", | ||
" 20 0.000 0.000 0.000 0.000 zipfile.py:854(_update_crc)\n", | ||
" 40 0.000 0.000 0.000 0.000 {built-in method zlib.crc32}\n", | ||
" 80 0.000 0.000 0.000 0.000 {method 'read' of '_io.BufferedReader' objects}\n", | ||
" 20 0.000 0.000 0.000 0.000 zipfile.py:937(_read2)\n", | ||
" 80 0.000 0.000 0.000 0.000 core.py:390(<genexpr>)\n", | ||
" 20 0.000 0.000 0.000 0.000 zipfile.py:953(close)\n", | ||
" 20 0.000 0.000 0.000 0.000 {method 'reshape' of 'numpy.ndarray' objects}\n", | ||
" 20 0.000 0.000 0.000 0.000 util.py:106(is_total_slice)\n", | ||
" 20 0.000 0.000 0.000 0.000 zipfile.py:708(__init__)\n", | ||
" 20 0.000 0.000 0.000 0.000 {method 'decode' of 'bytes' objects}\n", | ||
" 20 0.000 0.000 0.000 0.000 core.py:676(_chunk_key)\n", | ||
" 80 0.000 0.000 0.000 0.000 {method 'seek' of '_io.BufferedReader' objects}\n", | ||
" 20 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.frombuffer}\n", | ||
" 80 0.000 0.000 0.000 0.000 core.py:398(<genexpr>)\n", | ||
" 20 0.000 0.000 0.000 0.000 {method 'join' of 'str' objects}\n", | ||
" 20 0.000 0.000 0.000 0.000 core.py:386(<listcomp>)\n", | ||
" 20 0.000 0.000 0.000 0.000 {built-in method builtins.all}\n", | ||
" 40 0.000 0.000 0.000 0.000 util.py:121(<genexpr>)\n", | ||
" 231 0.000 0.000 0.000 0.000 {built-in method builtins.isinstance}\n", | ||
" 20 0.000 0.000 0.000 0.000 cp437.py:14(decode)\n", | ||
" 80 0.000 0.000 0.000 0.000 {method 'tell' of '_io.BufferedReader' objects}\n", | ||
" 20 0.000 0.000 0.000 0.000 zipfile.py:667(close)\n", | ||
" 20 0.000 0.000 0.000 0.000 {built-in method _struct.unpack}\n", | ||
" 140 0.000 0.000 0.000 0.000 {built-in method builtins.max}\n", | ||
" 20 0.000 0.000 0.000 0.000 {function ZipExtFile.close at 0x7f8cd5ca2048}\n", | ||
" 20 0.000 0.000 0.000 0.000 zipfile.py:1194(getinfo)\n", | ||
" 140 0.000 0.000 0.000 0.000 {built-in method builtins.min}\n", | ||
" 20 0.000 0.000 0.000 0.000 threading.py:1224(current_thread)\n", | ||
" 20 0.000 0.000 0.000 0.000 zipfile.py:654(__init__)\n", | ||
" 1 0.000 0.000 0.000 0.000 util.py:195(get_chunk_range)\n", | ||
" 20 0.000 0.000 0.000 0.000 {built-in method _codecs.charmap_decode}\n", | ||
" 1 0.000 0.000 0.000 0.000 util.py:166(normalize_array_selection)\n", | ||
" 1 0.000 0.000 0.000 0.000 util.py:198(<listcomp>)\n", | ||
" 20 0.000 0.000 0.000 0.000 zipfile.py:1715(_fpclose)\n", | ||
" 20 0.000 0.000 0.000 0.000 {method 'get' of 'dict' objects}\n", | ||
" 63 0.000 0.000 0.000 0.000 {built-in method builtins.len}\n", | ||
" 1 0.000 0.000 0.000 0.000 {built-in method numpy.core.multiarray.empty}\n", | ||
" 2 0.000 0.000 0.000 0.000 util.py:182(<genexpr>)\n", | ||
" 20 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}\n", | ||
" 20 0.000 0.000 0.000 0.000 {built-in method _thread.get_ident}\n", | ||
" 1 0.000 0.000 0.000 0.000 util.py:130(normalize_axis_selection)\n", | ||
" 20 0.000 0.000 0.000 0.000 zipfile.py:636(_get_decompressor)\n", | ||
" 20 0.000 0.000 0.000 0.000 threading.py:1298(main_thread)\n", | ||
" 4 0.000 0.000 0.000 0.000 core.py:373(<genexpr>)\n", | ||
" 3 0.000 0.000 0.000 0.000 util.py:187(<genexpr>)\n", | ||
" 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}\n", | ||
"\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import cProfile\n", | ||
"cProfile.run('z[:10]', sort='cumtime')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'0.11.0'" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"import dask\n", | ||
"import dask.array as da\n", | ||
"dask.__version__" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"dask.array<array-f..., shape=(7449486, 773, 2), dtype=int8, chunksize=(13107, 40, 2)>" | ||
] | ||
}, | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"d = da.from_array(z, chunks=z.chunks)\n", | ||
"d" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"CPU times: user 3min 35s, sys: 4.36 s, total: 3min 40s\n", | ||
"Wall time: 29.5 s\n" | ||
] | ||
}, | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"array([[3, 0],\n", | ||
" [1, 0],\n", | ||
" [2, 0],\n", | ||
" ..., \n", | ||
" [2, 8],\n", | ||
" [8, 8],\n", | ||
" [0, 1]])" | ||
] | ||
}, | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"%time d.sum(axis=1).compute()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"Array(/3L/calldata/genotype, (7449486, 773, 2), int8, chunks=(13107, 40, 2), order=C)\n", | ||
" nbytes: 10.7G; nbytes_stored: 193.5M; ratio: 56.7; initialized: 11380/11380\n", | ||
" compressor: Blosc(cname='zstd', clevel=1, shuffle=2)\n", | ||
" store: DirectoryStore" | ||
] | ||
}, | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# compare with same data via directory store\n", | ||
"store_dir = zarr.DirectoryStore('/data/coluzzi/ag1000g/data/phase1/release/AR3.1/haplotypes/main/zarr2/zstd/ag1000g.phase1.ar3.1.haplotypes')\n", | ||
"grp_dir = zarr.Group(store_dir)\n", | ||
"z_dir = grp_dir['3L/calldata/genotype']\n", | ||
"z_dir" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"dask.array<array-7..., shape=(7449486, 773, 2), dtype=int8, chunksize=(13107, 40, 2)>" | ||
] | ||
}, | ||
"execution_count": 10, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"d_dir = da.from_array(z_dir, chunks=z_dir.chunks)\n", | ||
"d_dir" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"CPU times: user 3min 39s, sys: 4.91 s, total: 3min 44s\n", | ||
"Wall time: 31.1 s\n" | ||
] | ||
}, | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"array([[3, 0],\n", | ||
" [1, 0],\n", | ||
" [2, 0],\n", | ||
" ..., \n", | ||
" [2, 8],\n", | ||
" [8, 8],\n", | ||
" [0, 1]])" | ||
] | ||
}, | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"%time d_dir.sum(axis=1).compute()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.5.1" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 1 | ||
} |
Oops, something went wrong.