### Profiling VCF reading using cyvcf2

In [1]:
import shutil
from pathlib import Path

from bokeh.io import output_notebook
from dask.diagnostics import ResourceProfiler
from sgkit_vcf import partition_into_regions, vcf_to_zarr
# Configure Bokeh for notebook output; the `prof.visualize()` cells below
# presumably rely on this to render their plots inline (TODO confirm).
output_notebook()

In [2]:
# Input VCF (chr22 genotypes, per the file name), read from the local data/ directory.
vcf_path = Path(
    "data/ALL.chr22.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
)
# Output location: a relative path named after the input file with ".zarr" appended.
# Kept as a plain string (not a Path).
output = f"{vcf_path.name}.zarr"

In [21]:
# Delete any output left over from a previous run so the timed conversion below
# starts from a clean location. Done in Python rather than via `!rm -rf` so it
# does not depend on a POSIX shell or on the path being free of spaces and
# shell metacharacters. A missing path is skipped; real removal errors raise.
# Assumes the output is a directory tree (as a Zarr directory store would be).
if Path(output).exists():
    shutil.rmtree(output)

Split file into _N_ roughly equal parts. This uses the tabix linear index to find split points.

In [22]:
# Number of partitions: twice the number of cores on the machine used for this run.
num_parts = 8
regions = partition_into_regions(vcf_path, num_parts=num_parts)
# Display the computed region strings.
regions

['22:1-21168128',
 '22:21168129-25395200',
 '22:25395201-30064640',
 '22:30064641-34832384',
 '22:34832385-39305216',
 '22:39305217-44236800',
 '22:44236801-47890432',
 '22:47890433-']

In [23]:
%%time
# Convert the VCF to Zarr using the regions computed above, timing the whole cell.
# The ResourceProfiler context is kept in `prof` so the next cell can plot it.
with ResourceProfiler() as prof:
    vcf_to_zarr(vcf_path, output, regions=regions)



CPU times: user 17min 10s, sys: 1min 38s, total: 18min 48s
Wall time: 4min 1s


In [24]:
# Plot the profile recorded during the region-based conversion.
prof.visualize()

### Run sequentially
When `regions` is not specified, the whole file is converted in a single sequential pass.

In [17]:
# Remove the output of the previous conversion so the sequential run is timed
# against a clean location. Done in Python rather than via `!rm -rf` so it does
# not depend on a POSIX shell or on the path being free of spaces and shell
# metacharacters. A missing path is skipped; real removal errors raise.
# Assumes the output is a directory tree (as a Zarr directory store would be).
if Path(output).exists():
    shutil.rmtree(output)

In [18]:
%%time
# Same conversion as before but without `regions`, for comparison with the
# region-based run. Note that `prof` is rebound here, replacing the earlier profile.
with ResourceProfiler() as prof:
    vcf_to_zarr(vcf_path, output)

CPU times: user 7min 56s, sys: 11.5 s, total: 8min 8s
Wall time: 8min 5s


In [19]:
# Plot the profile recorded during the conversion without regions.
prof.visualize()

### Scikit-allel
Note that phasing information and masks are not stored by this conversion.

In [10]:
# Remove the output of the previous conversion so the scikit-allel run is timed
# against a clean location. Done in Python rather than via `!rm -rf` so it does
# not depend on a POSIX shell or on the path being free of spaces and shell
# metacharacters. A missing path is skipped; real removal errors raise.
# Assumes the output is a directory tree (as a Zarr directory store would be).
if Path(output).exists():
    shutil.rmtree(output)

In [12]:
import allel

In [13]:
%%time
# Convert the same VCF with scikit-allel for comparison with the runs above.
# Both paths are passed as plain strings; `prof` is rebound again, replacing
# the previous profile.
with ResourceProfiler() as prof:
    allel.vcf_to_zarr(str(vcf_path), str(output))

CPU times: user 5min 25s, sys: 5.3 s, total: 5min 30s
Wall time: 5min 29s


In [14]:
# Plot the profile recorded during the scikit-allel conversion.
prof.visualize()