In [1]:
# Put the parent directory first on the import path so that `import zarr`
# resolves there before any installed copy (presumably the in-development
# checkout; the version string below reads "dev0+dirty").
import sys
sys.path.insert(0, '..')

import cProfile
import numpy as np
import zarr
import bcolz
import line_profiler

# Record the library and bundled Blosc versions alongside the timings below.
print('numpy', np.__version__)
print('zarr', zarr.__version__, 'blosc', ' '.join(zarr.blosc_version()))
print('bcolz', bcolz.__version__, 'blosc', ' '.join(bcolz.blosc_version()))

# Restrict bcolz's Blosc to one thread. The value echoed as this cell's output
# (4 in the recorded run) is presumably the previous thread count -- confirm
# against the bcolz docs.
# NOTE(review): no equivalent call for zarr's Blosc is visible in this notebook.
bcolz.blosc_set_nthreads(1)

numpy 1.10.2
zarr 0.2.6.dev0+dirty blosc 1.7.0 $Date:: 2015-07-05 #$
bcolz 0.12.2.dev22+dirty blosc 1.7.0 $Date:: 2015-07-05 #$


4

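## 1D array creation

Build the same 100-million-element int32 array as a bcolz `carray` and as a zarr array (same chunk length, LZ4 codec), then time and profile construction.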

In [2]:
# Test data: the integers 0 .. 1e8 - 1 stored as int32 ('i4').
a = np.arange(1e8, dtype='i4')
# Bare expression so the notebook displays the array.
a

array([       0,        1,        2, ..., 99999997, 99999998, 99999999], dtype=int32)

In [3]:
# bcolz baseline: compress `a` into a carray with the LZ4 codec. clevel and
# shuffle are not passed, so bcolz's defaults apply (the repr below shows
# clevel=5, shuffle=True).
c = bcolz.carray(a, cparams=bcolz.cparams(cname='lz4'))
# Display the carray repr (sizes and compression ratio).
c

carray((100000000,), int32)
  nbytes: 381.47 MB; cbytes: 7.68 MB; ratio: 49.67
  cparams := cparams(clevel=5, shuffle=True, cname='lz4')
[       0        1        2 ..., 99999997 99999998 99999999]

In [4]:
# zarr counterpart: reuse the chunk length of the bcolz carray `c` so both
# libraries are given the same chunk size and the same codec.
z = zarr.array(a, chunks=c.chunklen, cname='lz4')
# Display the zarr array repr (sizes and compression ratio).
z

zarr.ext.Array((100000000,), int32, chunks=(262144,), cname='lz4', clevel=5, shuffle=1)
  nbytes: 381.5M; cbytes: 6.6M; ratio: 57.4

In [5]:
# Time construction of the bcolz carray from the 1D array `a`.
%timeit bcolz.carray(a, cparams=bcolz.cparams(cname='lz4'))

10 loops, best of 3: 139 ms per loop


In [6]:
# Time construction of the zarr array from `a` with the same chunk length and codec.
%timeit zarr.array(a, chunks=c.chunklen, cname='lz4')

10 loops, best of 3: 137 ms per loop


In [7]:
# Function-level profile of zarr array creation; sort='time' orders the report
# by each function's internal time (the tottime column).
cProfile.run('zarr.array(a, chunks=c.chunklen, cname="lz4")', sort='time')

         8039 function calls (6890 primitive calls) in 0.146 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      382    0.133    0.000    0.133    0.000 ext.pyx:209(compress)
  762/382    0.003    0.000    0.138    0.000 ext.pyx:287(__setitem__)
    381/0    0.002    0.000    0.000          ext.pyx:468(genexpr)
      382    0.002    0.000    0.138    0.000 ext.pyx:127(chunk_setitem)
    380/0    0.001    0.000    0.000          ext.pyx:482(genexpr)
      763    0.001    0.000    0.002    0.000 numeric.py:1970(isscalar)
      381    0.001    0.000    0.001    0.000 {built-in method array}
      382    0.001    0.000    0.001    0.000 ext.pyx:109(is_total_slice)
      2/1    0.001    0.000    0.001    0.001 ext.pyx:493(__cinit__)
      763    0.000    0.000    0.000    0.000 {built-in method isinstance}
      382    0.000    0.000    0.001    0.000 ext.pyx:182(__cinit__)
      381    0.000    0.000    0.001    0.000 numeric

## 1D array read

Time a full read (`[:]`) of the bcolz and zarr arrays built above, then profile the zarr read.

In [8]:
# Time a full read (`[:]`) of the 1D bcolz carray.
%timeit c[:]

10 loops, best of 3: 169 ms per loop


In [9]:
# Time a full read (`[:]`) of the 1D zarr array.
%timeit z[:]

10 loops, best of 3: 165 ms per loop


In [10]:
# Function-level profile of a full zarr read, ordered by internal time.
cProfile.run('z[:]', sort='time')

         2304 function calls (1157 primitive calls) in 0.173 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      382    0.164    0.000    0.164    0.000 ext.pyx:255(decompress)
  762/382    0.003    0.000    0.001    0.000 ext.pyx:109(is_total_slice)
    381/0    0.002    0.000    0.000          ext.pyx:417(genexpr)
    381/0    0.002    0.000    0.000          ext.pyx:423(genexpr)
      2/1    0.001    0.001    0.002    0.002 <string>:1(<module>)
      2/1    0.000    0.000    0.001    0.001 ext.pyx:387(array_getitem)
      2/1    0.000    0.000    0.002    0.002 {built-in method exec}
        1    0.000    0.000    0.172    0.172 ext.pyx:398(genexpr)
      382    0.000    0.000    0.000    0.000 ext.pyx:203(__get__)
        1    0.000    0.000    0.000    0.000 ext.pyx:352(get_chunk_range)
        1    0.000    0.000    0.000    0.000 ext.pyx:157(chunk_getitem)
      2/1    0.000    0.000    0.001    0.001 ext.pyx:561(_

In [11]:
# Line-by-line profile of zarr.ext.array_getitem while performing a full read.
# Only functions passed to LineProfiler are instrumented.
profile = line_profiler.LineProfiler(zarr.ext.array_getitem)
profile.run('z[:]')
# Print per-line hit counts and timings for the instrumented function.
profile.print_stats()

Timer unit: 1e-06 s

Total time: 0.169746 s
File: /media/aliman/SD1/src/github/alimanfoo/zarr/zarr/ext.pyx
Function: array_getitem at line 387

Line #      Hits         Time  Per Hit   % Time  Line Contents
   387                                           def array_getitem(Array self, item):
   388                                               """Array.__getitem__ broken out as separate function to enable line
   389                                               profiling."""
   390                                           
   391                                               cdef ndarray dest
   392                                               cdef Chunk chunk
   393                                           
   394                                               # normalise selection
   395         1           14     14.0      0.0      selection = normalise_array_selection(item, self.shape)
   396                                           
   397                                      

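## 2D array creation

Repeat the creation benchmark on a 10000 x 10000 int32 array, using two zarr chunk shapes: blocks of full rows, and blocks with 10x the rows and 1/10 the columns.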

In [12]:
# Same 1e8 int32 values, reshaped to 10000 x 10000. This rebinds `a`, so every
# later cell works on the 2D array.
a = np.arange(1e8, dtype='i4').reshape((10000, 10000))
# Bare expression so the notebook displays the array.
a

array([[       0,        1,        2, ...,     9997,     9998,     9999],
       [   10000,    10001,    10002, ...,    19997,    19998,    19999],
       [   20000,    20001,    20002, ...,    29997,    29998,    29999],
       ..., 
       [99970000, 99970001, 99970002, ..., 99979997, 99979998, 99979999],
       [99980000, 99980001, 99980002, ..., 99989997, 99989998, 99989999],
       [99990000, 99990001, 99990002, ..., 99999997, 99999998, 99999999]], dtype=int32)

In [13]:
# bcolz carray of the 2D array with the LZ4 codec. This rebinds `c`, so
# c.chunklen below refers to the 2D carray.
c = bcolz.carray(a, cparams=bcolz.cparams(cname='lz4'))
# Display the carray repr (sizes and compression ratio).
c

carray((10000, 10000), int32)
  nbytes: 381.47 MB; cbytes: 7.75 MB; ratio: 49.25
  cparams := cparams(clevel=5, shuffle=True, cname='lz4')
[[       0        1        2 ...,     9997     9998     9999]
 [   10000    10001    10002 ...,    19997    19998    19999]
 [   20000    20001    20002 ...,    29997    29998    29999]
 ..., 
 [99970000 99970001 99970002 ..., 99979997 99979998 99979999]
 [99980000 99980001 99980002 ..., 99989997 99989998 99989999]
 [99990000 99990001 99990002 ..., 99999997 99999998 99999999]]

In [14]:
# Chunk length of the 2D carray (26 in the recorded run). The zarr cells below
# use it as the number of rows per chunk -- presumably bcolz chunks along the
# first axis; confirm against the bcolz docs.
c.chunklen

26

In [15]:
# zarr array whose chunks are c.chunklen full rows each (26 x 10000 per the
# repr below), with the same LZ4 codec as the bcolz carray.
z = zarr.array(a, chunks=(c.chunklen, a.shape[1]), cname='lz4')
# Display the zarr array repr (sizes and compression ratio).
z

zarr.ext.Array((10000, 10000), int32, chunks=(26, 10000), nbytes=381.5M, cbytes=6.7M, cratio=56.8, cname=lz4, clevel=5, shuffle=1)

In [16]:
# Time construction of the bcolz carray from the 2D array `a`.
%timeit bcolz.carray(a, cparams=bcolz.cparams(cname='lz4'))

10 loops, best of 3: 131 ms per loop


In [17]:
# Time zarr array construction with full-row chunks (c.chunklen rows x all columns).
%timeit zarr.array(a, chunks=(c.chunklen, a.shape[1]), cname='lz4')

10 loops, best of 3: 138 ms per loop


In [18]:
# Function-level profile of zarr creation with full-row chunks, ordered by
# internal time.
cProfile.run('zarr.array(a, chunks=(c.chunklen, a.shape[1]), cname="lz4")', sort='time')

         8103 function calls (6945 primitive calls) in 0.154 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      385    0.138    0.000    0.138    0.000 ext.pyx:209(compress)
  768/385    0.003    0.000    0.144    0.000 ext.pyx:287(__setitem__)
      385    0.002    0.000    0.143    0.000 ext.pyx:127(chunk_setitem)
    384/0    0.002    0.000    0.000          ext.pyx:468(genexpr)
    383/0    0.001    0.000    0.000          ext.pyx:482(genexpr)
      769    0.001    0.000    0.002    0.000 numeric.py:1970(isscalar)
      2/1    0.001    0.001    0.003    0.003 ext.pyx:493(__cinit__)
      384    0.001    0.000    0.001    0.000 {built-in method array}
      385    0.001    0.000    0.001    0.000 ext.pyx:109(is_total_slice)
      385    0.001    0.000    0.001    0.000 ext.pyx:182(__cinit__)
      769    0.000    0.000    0.000    0.000 {built-in method isinstance}
      384    0.000    0.000    0.001    0.000 numeric

In [19]:
# Alternative chunk shape: 10x the rows and 1/10 the columns (260 x 1000 when
# c.chunklen is 26). Each chunk holds the same number of elements as a full-row
# chunk, but covers only part of each row of `a`.
%timeit zarr.array(a, chunks=(c.chunklen*10, a.shape[1]//10), cname='lz4')

1 loops, best of 3: 197 ms per loop


In [20]:
# Function-level profile of zarr creation with the 260 x 1000 chunk shape.
# In the recorded output, np.ascontiguousarray / the built-in array call
# account for about 0.06 s here, which the full-row profile does not show.
cProfile.run('zarr.array(a, chunks=(c.chunklen*10, a.shape[1]//10), cname="lz4")', sort='time')

         8190 function calls (7017 primitive calls) in 0.246 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      390    0.166    0.000    0.167    0.000 ext.pyx:209(compress)
      380    0.059    0.000    0.059    0.000 {built-in method array}
  778/390    0.005    0.000    0.233    0.001 ext.pyx:287(__setitem__)
      390    0.004    0.000    0.232    0.001 ext.pyx:127(chunk_setitem)
    389/0    0.003    0.000    0.000          ext.pyx:468(genexpr)
    388/0    0.002    0.000    0.000          ext.pyx:482(genexpr)
      770    0.002    0.000    0.003    0.000 numeric.py:1970(isscalar)
      390    0.001    0.000    0.001    0.000 ext.pyx:109(is_total_slice)
      380    0.001    0.000    0.060    0.000 numeric.py:527(ascontiguousarray)
      770    0.001    0.000    0.001    0.000 {built-in method isinstance}
      2/1    0.001    0.000    0.001    0.001 ext.pyx:493(__cinit__)
      390    0.000    0.000    0.000    0.

In [21]:
# Line-by-line profile of zarr.ext.array_setitem and zarr.ext.chunk_setitem
# while creating the array with full-row chunks.
profile = line_profiler.LineProfiler(zarr.ext.array_setitem, zarr.ext.chunk_setitem)
profile.run('zarr.array(a, chunks=(c.chunklen, a.shape[1]), cname="lz4")')
# Print per-line hit counts and timings for the instrumented functions.
profile.print_stats()

Timer unit: 1e-06 s

Total time: 0.144544 s
File: /media/aliman/SD1/src/github/alimanfoo/zarr/zarr/ext.pyx
Function: chunk_setitem at line 127

Line #      Hits         Time  Per Hit   % Time  Line Contents
   127                                           def chunk_setitem(Chunk self, key, value):
   128                                               """Chunk.__setitem__ broken out as separate function to enable line
   129                                               profiling."""
   130                                           
   131       385         1611      4.2      1.1      if is_total_slice(key, self.shape):
   132                                                   # completely replace the contents of this chunk
   133                                           
   134       384         1103      2.9      0.8          if np.isscalar(value):
   135                                                       array = np.empty(self.shape, dtype=self.dtype)
   136                         

In [22]:
# Same line-by-line profile as the previous cell, but with the
# (c.chunklen*10) x (columns//10) chunk shape, for comparison.
profile = line_profiler.LineProfiler(zarr.ext.array_setitem, zarr.ext.chunk_setitem)
profile.run('zarr.array(a, chunks=(c.chunklen*10, a.shape[1]//10), cname="lz4")')
# Print per-line hit counts and timings for the instrumented functions.
profile.print_stats()

Timer unit: 1e-06 s

Total time: 0.228859 s
File: /media/aliman/SD1/src/github/alimanfoo/zarr/zarr/ext.pyx
Function: chunk_setitem at line 127

Line #      Hits         Time  Per Hit   % Time  Line Contents
   127                                           def chunk_setitem(Chunk self, key, value):
   128                                               """Chunk.__setitem__ broken out as separate function to enable line
   129                                               profiling."""
   130                                           
   131       390         1815      4.7      0.8      if is_total_slice(key, self.shape):
   132                                                   # completely replace the contents of this chunk
   133                                           
   134       380         1180      3.1      0.5          if np.isscalar(value):
   135                                                       array = np.empty(self.shape, dtype=self.dtype)
   136                         

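## 2D array read

Time a full read (`[:]`) of the 2D bcolz carray and of zarr arrays built with each of the two chunk shapes, line-profiling the zarr reads.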

In [23]:
# Time a full read (`[:]`) of the 2D bcolz carray.
%timeit c[:]

1 loops, best of 3: 284 ms per loop


In [24]:
# Rebind `z` to a zarr array with full-row chunks so this cell does not depend
# on whichever layout `z` held before.
z = zarr.array(a, chunks=(c.chunklen, a.shape[1]), cname='lz4')
# Time a full read of that array.
%timeit z[:]
# Line-by-line profile of zarr.ext.array_getitem and zarr.ext.chunk_getitem
# during one full read.
profile = line_profiler.LineProfiler(zarr.ext.array_getitem, zarr.ext.chunk_getitem)
profile.run('z[:]')
profile.print_stats()

1 loops, best of 3: 291 ms per loop
Timer unit: 1e-06 s

Total time: 0.000414 s
File: /media/aliman/SD1/src/github/alimanfoo/zarr/zarr/ext.pyx
Function: chunk_getitem at line 157

Line #      Hits         Time  Per Hit   % Time  Line Contents
   157                                           def chunk_getitem(Chunk self, item):
   158                                               """Chunk.__getitem__ broken out as separate function to enable line
   159                                               profiling."""
   160                                           
   161                                               cdef:
   162                                                   ndarray array
   163                                           
   164                                               # setup output array
   165         1           13     13.0      3.1      array = np.empty(self.shape, dtype=self.dtype)
   166                                           
   167         1            1

In [25]:
# Rebind `z` to a zarr array with the (c.chunklen*10) x (columns//10) chunk shape.
z = zarr.array(a, chunks=(c.chunklen*10, a.shape[1]//10), cname='lz4')
# Time a full read of that array.
%timeit z[:]
# Line-by-line profile of zarr.ext.array_getitem and zarr.ext.chunk_getitem
# during one full read, for comparison with the full-row layout.
profile = line_profiler.LineProfiler(zarr.ext.array_getitem, zarr.ext.chunk_getitem)
profile.run('z[:]')
profile.print_stats()

1 loops, best of 3: 363 ms per loop
Timer unit: 1e-06 s

Total time: 0.154248 s
File: /media/aliman/SD1/src/github/alimanfoo/zarr/zarr/ext.pyx
Function: chunk_getitem at line 157

Line #      Hits         Time  Per Hit   % Time  Line Contents
   157                                           def chunk_getitem(Chunk self, item):
   158                                               """Chunk.__getitem__ broken out as separate function to enable line
   159                                               profiling."""
   160                                           
   161                                               cdef:
   162                                                   ndarray array
   163                                           
   164                                               # setup output array
   165       390         2242      5.7      1.5      array = np.empty(self.shape, dtype=self.dtype)
   166                                           
   167       390          187

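## 2D array slices

Time a slice of 1000 rows (`[1000:2000]`) and a slice of 1000 columns (`[:, 1000:2000]`) for the bcolz carray and for both zarr chunk shapes. In the recorded run the row slice is fastest with full-row chunks (about 18-19 ms for bcolz and zarr, against 30.1 ms for block chunks), while the column slice is fastest with the 260 x 1000 zarr chunks (17.5 ms, against 148 ms for full-row zarr chunks and 303 ms for bcolz).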

In [29]:
# Time reading rows 1000..1999 (all columns) from the bcolz carray.
%timeit c[1000:2000]

100 loops, best of 3: 18.2 ms per loop


In [30]:
# Rebind `z` to the full-row chunk layout, then time reading rows 1000..1999.
z = zarr.array(a, chunks=(c.chunklen, a.shape[1]), cname='lz4')
%timeit z[1000:2000]

100 loops, best of 3: 19 ms per loop


In [31]:
# Rebind `z` to the (c.chunklen*10) x (columns//10) chunk layout, then time
# reading rows 1000..1999.
z = zarr.array(a, chunks=(c.chunklen*10, a.shape[1]//10), cname='lz4')
%timeit z[1000:2000]

10 loops, best of 3: 30.1 ms per loop


In [32]:
# Time reading columns 1000..1999 (all rows) from the bcolz carray.
%timeit c[:, 1000:2000]

1 loops, best of 3: 303 ms per loop


In [33]:
# Rebind `z` to the full-row chunk layout, then time reading columns
# 1000..1999 across all rows.
z = zarr.array(a, chunks=(c.chunklen, a.shape[1]), cname='lz4')
%timeit z[:, 1000:2000]

10 loops, best of 3: 148 ms per loop


In [34]:
# Rebind `z` to the (c.chunklen*10) x (columns//10) chunk layout, then time
# reading columns 1000..1999 across all rows.
z = zarr.array(a, chunks=(c.chunklen*10, a.shape[1]//10), cname='lz4')
%timeit z[:, 1000:2000]

100 loops, best of 3: 17.5 ms per loop
