This notebook profiles Dask when making a selection along both the first and second axes of a large-ish multidimensional array. The use case is making selections of genotype data, e.g., as required for a web-based browser for genotype data such as www.malariagen.net/apps/ag1000g.

In [1]:
import cProfile

import zarr; print('zarr', zarr.__version__)
import dask; print('dask', dask.__version__)
import dask.array as da
import numpy as np

zarr 2.1.1
dask 0.11.0


## Real data

In [2]:
# Open the real callset as a read-only zarr group and display its summary.
# NOTE: this is an absolute, site-specific path.
callset_path = '/kwiat/2/coluzzi/ag1000g/data/phase1/release/AR3.1/variation/main/zarr2/zstd/ag1000g.phase1.ar3'
callset = zarr.open_group(callset_path, mode='r')
callset

Group(/, 8)
  arrays: 1; samples
  groups: 7; 2L, 2R, 3L, 3R, UNKN, X, Y_unplaced
  store: DirectoryStore

In [3]:
# The 3R genotype array is the one used for all of the real-data profiling.
genotype_key = '3R/calldata/genotype'
g = callset[genotype_key]
g

Array(/3R/calldata/genotype, (22632425, 765, 2), int8, chunks=(13107, 40, 2), order=C)
  nbytes: 32.2G; nbytes_stored: 1.0G; ratio: 31.8; initialized: 34540/34540
  compressor: Blosc(cname='zstd', clevel=1, shuffle=2)
  store: DirectoryStore

In [4]:
# Wrap the zarr array as a dask array with very simple chunking: only the first
# dimension is chunked, reusing the zarr chunk length g.chunks[0]; None is passed
# for the other two dimensions. %time reports how long building the wrapper takes.
%time gd = da.from_array(g, chunks=(g.chunks[0], None, None))
gd

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 5.13 ms


dask.array<array-b..., shape=(22632425, 765, 2), dtype=int8, chunksize=(13107, 765, 2)>

In [5]:
# Read the whole FILTER_PASS array into memory; it is the condition used to
# select along the first axis. Then summarise it: shape, dtype, number of
# non-zero (True) entries.
filter_pass = callset['3R/variants/FILTER_PASS']
dim0_condition = filter_pass[:]
(dim0_condition.shape, dim0_condition.dtype, np.count_nonzero(dim0_condition))

((22632425,), dtype('bool'), 13167162)

In [6]:
# Invent a random selection for the second axis: 100 distinct indices drawn
# from range(765) (765 is the length of the second dimension of `g`), sorted
# ascending. A locally seeded generator is used so that re-running the notebook
# reproduces exactly the same selection (and therefore comparable timings).
rng = np.random.RandomState(42)  # arbitrary fixed seed
dim1_indices = sorted(rng.choice(765, size=100, replace=False))

In [7]:
# Set up the 2D selection: first apply the boolean condition along the first
# axis, then pick the chosen indices along the second axis. This is the slow
# bit -- note that %time here measures only building the selection; .compute()
# is not called in this cell.
%time gd_sel = gd[dim0_condition][:, dim1_indices]
gd_sel

CPU times: user 15.3 s, sys: 256 ms, total: 15.5 s
Wall time: 15.5 s


dask.array<getitem..., shape=(13167162, 100, 2), dtype=int8, chunksize=(8873, 100, 2)>

In [23]:
# Now load a slice of 100,000 rows from this new selection -- quick compared
# with setting the selection up. compute() is called with optimize_graph=False.
%time gd_sel[1000000:1100000].compute(optimize_graph=False)

CPU times: user 1.21 s, sys: 152 ms, total: 1.36 s
Wall time: 316 ms


array([[[0, 0],
        [0, 0],
        [0, 0],
        ..., 
        [0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0],
        ..., 
        [0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0],
        ..., 
        [0, 0],
        [0, 0],
        [0, 0]],

       ..., 
       [[0, 0],
        [0, 0],
        [0, 0],
        ..., 
        [0, 1],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0],
        ..., 
        [0, 0],
        [0, 0],
        [0, 0]],

       [[0, 0],
        [0, 0],
        [0, 0],
        ..., 
        [0, 0],
        [0, 0],
        [0, 0]]], dtype=int8)

In [9]:
# What's taking so long? Profile the construction of the 2D selection and order
# the report by internal time (time spent inside each function itself).
# cProfile is imported in the notebook's import cell rather than here, so that
# all dependencies are declared in one place.
cProfile.run('gd[dim0_condition][:, dim1_indices]', sort='time')

         105406881 function calls (79072145 primitive calls) in 26.182 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
13167268/6    6.807    0.000    9.038    1.506 slicing.py:623(check_index)
        2    4.713    2.356    5.831    2.916 slicing.py:398(partition_by_size)
13167270/2    4.470    0.000    8.763    4.382 slicing.py:540(posify_index)
 52669338    4.118    0.000    4.119    0.000 {built-in method builtins.isinstance}
        2    2.406    1.203    8.763    4.382 slicing.py:563(<listcomp>)
        1    0.875    0.875    0.875    0.875 slicing.py:44(<listcomp>)
 13182474    0.600    0.000    0.600    0.000 {built-in method builtins.len}
        2    0.527    0.264    0.527    0.264 slicing.py:420(issorted)
 13189168    0.520    0.000    0.520    0.000 {method 'append' of 'list' objects}
        2    0.271    0.136    0.271    0.136 slicing.py:479(<listcomp>)
        2    0.220    0.110    0.220    0.110 {built-in

In [10]:
# Profile the same selection again, this time ordering the report by cumulative
# time (each function together with everything it calls).
selection_stmt = 'gd[dim0_condition][:, dim1_indices]'
cProfile.run(selection_stmt, sort='cumtime')

         105406881 function calls (79072145 primitive calls) in 25.630 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   25.630   25.630 {built-in method builtins.exec}
        1    0.107    0.107   25.630   25.630 <string>:1(<module>)
        2    0.102    0.051   25.523   12.761 core.py:1024(__getitem__)
        2    0.001    0.000   25.381   12.691 slicing.py:60(slice_array)
        2    0.049    0.024   24.214   12.107 slicing.py:142(slice_with_newaxes)
        2    0.000    0.000   24.147   12.073 slicing.py:170(slice_wrap_lists)
13167268/6    6.664    0.000    8.855    1.476 slicing.py:623(check_index)
13167270/2    4.354    0.000    8.466    4.233 slicing.py:540(posify_index)
        2    2.277    1.139    8.465    4.233 slicing.py:563(<listcomp>)
        2    0.000    0.000    6.826    3.413 slicing.py:487(take)
        2    0.111    0.056    6.331    3.165 slicing.py:441(take_sorted)
 

## Synthetic data

In [22]:
# Create a synthetic dataset for profiling: int8 values drawn from [-1, 4)
# (upper bound exclusive), stored as a zarr array using the same Blosc/zstd
# compressor settings as the real genotype array. The values come from a
# locally seeded generator so a re-run reproduces the same data.
rng = np.random.RandomState(1)  # arbitrary fixed seed
a = zarr.array(rng.randint(-1, 4, size=(20000000, 200, 2), dtype='i1'),
               chunks=(10000, 100, 2), compressor=zarr.Blosc(cname='zstd', clevel=1, shuffle=2))
a

Array((20000000, 200, 2), int8, chunks=(10000, 100, 2), order=C)
  nbytes: 7.5G; nbytes_stored: 2.7G; ratio: 2.8; initialized: 4000/4000
  compressor: Blosc(cname='zstd', clevel=1, shuffle=2)
  store: dict

In [24]:
# Create a synthetic boolean selection for the first axis: one random
# True/False value per row of `a`. Seeded locally so that the same rows are
# selected on every run.
rng = np.random.RandomState(2)  # arbitrary fixed seed
c = rng.randint(0, 2, size=a.shape[0], dtype=bool)

In [25]:
# Create a synthetic selection for the second axis: 100 distinct indices into
# the second dimension of `a`, sorted ascending. Seeded locally so that the
# same indices are selected on every run.
rng = np.random.RandomState(3)  # arbitrary fixed seed
s = sorted(rng.choice(a.shape[1], size=100, replace=False))

In [26]:
# Wrap the synthetic zarr array as a dask array, chunked along the first
# dimension only (same scheme as used for the real data): the zarr chunk length
# a.chunks[0] for dim 0, None for the other two dimensions.
%time d = da.from_array(a, chunks=(a.chunks[0], None, None))
d

CPU times: user 208 ms, sys: 0 ns, total: 208 ms
Wall time: 206 ms


dask.array<array-5..., shape=(20000000, 200, 2), dtype=int8, chunksize=(10000, 200, 2)>

In [27]:
# Set up the 2D selection on the synthetic data: boolean condition on the first
# axis, then the index list on the second axis. Only the set-up is timed;
# .compute() is not called in this cell.
%time ds = d[c][:, s]

CPU times: user 12 s, sys: 200 ms, total: 12.2 s
Wall time: 12.2 s


In [28]:
# Profile the set-up of the synthetic 2D selection, ordered by internal time.
synthetic_selection_stmt = 'd[c][:, s]'
cProfile.run(synthetic_selection_stmt, sort='time')

         80095589 function calls (60091843 primitive calls) in 19.467 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
10001773/6    4.872    0.000    6.456    1.076 slicing.py:623(check_index)
        2    3.517    1.758    4.357    2.179 slicing.py:398(partition_by_size)
10001775/2    3.354    0.000    6.484    3.242 slicing.py:540(posify_index)
 40007358    2.965    0.000    2.965    0.000 {built-in method builtins.isinstance}
        2    1.749    0.875    6.484    3.242 slicing.py:563(<listcomp>)
        1    0.878    0.878    0.878    0.878 slicing.py:44(<listcomp>)
 10019804    0.451    0.000    0.451    0.000 {built-in method builtins.len}
 10027774    0.392    0.000    0.392    0.000 {method 'append' of 'list' objects}
        2    0.363    0.181    0.363    0.181 slicing.py:420(issorted)
        2    0.270    0.135    4.786    2.393 slicing.py:441(take_sorted)
        1    0.207    0.207    0.207    0.207 {method '

In [29]:
# Load a slice of 100,000 rows from the synthetic selection and time it;
# compute() is called with optimize_graph=False, as for the real data.
%time ds[1000000:1100000].compute(optimize_graph=False)

CPU times: user 452 ms, sys: 8 ms, total: 460 ms
Wall time: 148 ms


array([[[ 2, -1],
        [ 2,  3],
        [ 3,  0],
        ..., 
        [ 1,  3],
        [-1, -1],
        [ 1,  1]],

       [[ 1, -1],
        [ 2,  2],
        [-1,  2],
        ..., 
        [ 2, -1],
        [ 1,  3],
        [-1, -1]],

       [[ 1, -1],
        [ 2,  0],
        [ 0,  3],
        ..., 
        [ 2,  2],
        [ 3,  2],
        [ 0,  2]],

       ..., 
       [[ 1,  2],
        [ 3, -1],
        [ 2,  1],
        ..., 
        [ 1,  2],
        [ 1,  0],
        [ 2,  0]],

       [[ 1,  2],
        [ 1,  0],
        [ 2,  3],
        ..., 
        [-1,  2],
        [ 3,  3],
        [ 1, -1]],

       [[-1,  3],
        [ 2,  2],
        [ 1,  1],
        ..., 
        [ 3,  3],
        [ 0,  0],
        [ 0,  2]]], dtype=int8)

In [30]:
# Profiling the first-axis (boolean) selection on its own shows that the
# problem is in fact just the dim0 selection.
dim0_only_stmt = 'd[c]'
cProfile.run(dim0_only_stmt, sort='time')

         80055494 function calls (60052157 primitive calls) in 19.425 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
10001670/3    5.032    0.000    6.671    2.224 slicing.py:623(check_index)
        1    3.459    3.459    4.272    4.272 slicing.py:398(partition_by_size)
10001671/1    3.287    0.000    6.378    6.378 slicing.py:540(posify_index)
 40006704    2.999    0.000    2.999    0.000 {built-in method builtins.isinstance}
        1    1.731    1.731    6.378    6.378 slicing.py:563(<listcomp>)
        1    0.849    0.849    0.849    0.849 slicing.py:44(<listcomp>)
 10011685    0.433    0.000    0.433    0.000 {built-in method builtins.len}
 10015670    0.381    0.000    0.381    0.000 {method 'append' of 'list' objects}
        1    0.355    0.355    0.355    0.355 slicing.py:420(issorted)
        1    0.196    0.196    0.196    0.196 {method 'tolist' of 'numpy.ndarray' objects}
        1    0.193    0.193    0.193  