In [1]:
import sys; print(sys.version)
import os
import glob
import subprocess
import multiprocessing

import numpy as np; print('numpy', np.__version__)
import pandas as pd; print('pandas',pd.__version__)
import allel; print('allel', allel.__version__)
import zarr; print('zarr', zarr.__version__)

from IPython.display import display, HTML

3.6.7 | packaged by conda-forge | (default, Feb 28 2019, 09:07:38) 
[GCC 7.3.0]
numpy 1.16.2
pandas 0.24.1
allel 1.2.0
zarr 2.2.0


# Constants

In [2]:
# Input VCF and the zarr archive that is written alongside it.
INFN = '/home/travc/proj/100Acol/100Acol_pflit.vcf.gz'
OUTFN = INFN+'.zarr'

# Below will probably not need to be changed
# Fields to extract from the VCF, using allel's fully qualified names
# ('variants/<name>' or 'calldata/<name>', plus 'samples').
FIELDS = [
    'samples',
    'variants/CHROM',
    'variants/POS',
    'variants/REF',
    'variants/ALT',
    'variants/QUAL',
    'variants/TYPE',
#     'variants/NUMALT',
    'variants/numalt',
    'variants/AN',
    'variants/AC',
    'variants/AF',
    'variants/DP',
#     'variants/ANN',
    'calldata/DP',
    'calldata/GT',
         ]
EXCLUDE_FIELDS = None

# # ALTERNATIVE:
# # All the fields from the VCF (overkill leads to very big archive)
# FIELDS = '*'
# EXCLUDE_FIELDS = ['variants/NUMALT'] # allel will calculate a numalt (lower case) on the fly

# Name or path of the tabix executable used for region queries.
TABIX_EXEC = 'tabix'

# Report which tabix binary will be used and its version.
# `which` is given TABIX_EXEC itself (not a hard-coded 'tabix') so the path
# shown always corresponds to the executable that is actually invoked.
print("Using tabix executable '{}' {} '{}'\n{}".format(TABIX_EXEC, u"\u2192", 
        subprocess.check_output(['which', TABIX_EXEC]).decode('utf-8').rstrip(),
        subprocess.check_output([TABIX_EXEC, '--version']).decode('utf-8')))

Using tabix executable 'tabix' → '/usr/local/bin/tabix'
tabix (htslib) 1.9
Copyright (C) 2018 Genome Research Ltd.



# Make a zarr archive from a VCF file (using allel 1.1+)

In [3]:
# Get the list of chromosome/contig names from the tabix index of the VCF.
tabix_listing = subprocess.check_output([TABIX_EXEC, '-l', INFN],
                                        universal_newlines=True)
# splitlines() plus blank filtering gives [] for an empty listing, where
# strip().split('\n') would give [''] (an empty, invalid region name).
chroms = [name.strip() for name in tabix_listing.splitlines() if name.strip()]
display(chroms)

['2L', '3L', 'X', '3R', '2R', 'UNKN', 'Y_unplaced', 'Mt']

In [4]:
# Create zarr archive with a group for each chrom

# Leave one core free, but always use at least one worker:
# cpu_count()-1 is 0 on a single-core machine and Pool(0) raises ValueError.
num_procs = max(1, multiprocessing.cpu_count()-1)

# Enable the ANN transformer only when the ANN field is explicitly requested.
# FIELDS entries are fully qualified (e.g. 'variants/ANN'), so compare on the
# last path component; a bare 'ANN' entry is still recognised.
# NOTE(review): FIELDS = '*' does not enable the transformer (same as before);
# set `transformers` by hand in that case if the VCF carries ANN annotations.
if FIELDS is None:
    requested_fields = []
elif isinstance(FIELDS, str):
    requested_fields = [FIELDS]
else:
    requested_fields = list(FIELDS)

transformers = None
if any(field.split('/')[-1] == 'ANN' for field in requested_fields):
    transformers=allel.ANNTransformer()

def vcf_to_zarr_func(ch):
    """Convert one chromosome of INFN into its own group of the OUTFN archive.

    Parameters
    ----------
    ch : str
        Chromosome/contig name; used both as the tabix region to read and as
        the name of the zarr group to write.

    Progress is logged to stderr.  Field selection, the tabix executable and
    the transformers come from the module-level configuration.
    """
    allel.vcf_to_zarr(INFN, OUTFN,
                      region=ch,
                      group=ch,
                      log=sys.stderr,
                      fields=FIELDS,
                      exclude_fields=EXCLUDE_FIELDS,
                      tabix=TABIX_EXEC,
                      transformers=transformers)

# One task per chromosome (chunksize=1) so workers pick up the next
# chromosome as soon as they finish the previous one.
with multiprocessing.Pool(num_procs) as pool:
    pool.map(vcf_to_zarr_func, chroms, chunksize=1)

[vcf_to_zarr] 953 rows in 0.09s; chunk in 0.09s (10117 rows/s)
[vcf_to_zarr] all done (6323 rows/s)
[vcf_to_zarr] 5983 rows in 0.18s; chunk in 0.18s (33430 rows/s)
[vcf_to_zarr] all done (25993 rows/s)
[vcf_to_zarr] 65536 rows in 1.74s; chunk in 1.74s (37738 rows/s); X :581702
[vcf_to_zarr] 65536 rows in 1.94s; chunk in 1.94s (33731 rows/s); 2R :743477
[vcf_to_zarr] 65536 rows in 2.07s; chunk in 2.07s (31735 rows/s); 3L :1776199
[vcf_to_zarr] 65536 rows in 2.05s; chunk in 2.05s (31921 rows/s); UNKN :1308985
[vcf_to_zarr] 65536 rows in 2.18s; chunk in 2.18s (30068 rows/s); 3R :728941
[vcf_to_zarr] 65536 rows in 2.18s; chunk in 2.18s (30040 rows/s); 2L :1758907
[vcf_to_zarr] 131072 rows in 3.26s; chunk in 1.52s (43121 rows/s); X :965276
[vcf_to_zarr] 131072 rows in 3.85s; chunk in 1.91s (34356 rows/s); 2R :1097260
[vcf_to_zarr] 131072 rows in 4.13s; chunk in 1.95s (33545 rows/s); 3R :1100020
[vcf_to_zarr] 131072 rows in 4.28s; chunk in 2.22s (29482 rows/s); UNKN :2805992
[vcf_to_zarr] 13

[vcf_to_zarr] 1048576 rows in 33.05s; chunk in 2.01s (32654 rows/s); UNKN :27530148
[vcf_to_zarr] 1048576 rows in 33.36s; chunk in 2.12s (30848 rows/s); 3L :13370362
[vcf_to_zarr] 1179648 rows in 33.58s; chunk in 1.73s (37988 rows/s); 2R :7139422
[vcf_to_zarr] 1310720 rows in 33.64s; chunk in 1.67s (39332 rows/s); X :7712630
[vcf_to_zarr] 1114112 rows in 34.01s; chunk in 1.90s (34561 rows/s); 2L :11190318
[vcf_to_zarr] 1245184 rows in 34.68s; chunk in 1.87s (35074 rows/s); 3R :6882876
[vcf_to_zarr] 1114112 rows in 34.97s; chunk in 1.93s (34038 rows/s); UNKN :29141107
[vcf_to_zarr] 1114112 rows in 35.00s; chunk in 1.64s (39859 rows/s); 3L :13753569
[vcf_to_zarr] 1376256 rows in 35.43s; chunk in 1.79s (36611 rows/s); X :8163556
[vcf_to_zarr] 1245184 rows in 35.61s; chunk in 2.03s (32306 rows/s); 2R :7577086
[vcf_to_zarr] 1179648 rows in 35.86s; chunk in 1.85s (35425 rows/s); 2L :11638760
[vcf_to_zarr] 1310720 rows in 36.56s; chunk in 1.88s (34927 rows/s); 3R :7218696
[vcf_to_zarr] 117964

[vcf_to_zarr] 2424832 rows in 65.64s; chunk in 1.46s (44890 rows/s); 3R :12791701
[vcf_to_zarr] 2359296 rows in 66.33s; chunk in 1.66s (39582 rows/s); 2R :14252583
[vcf_to_zarr] 2293760 rows in 66.69s; chunk in 1.66s (39408 rows/s); 3L :20392681
[vcf_to_zarr] 2293760 rows in 66.81s; chunk in 1.53s (42849 rows/s); 2L :17971853
[vcf_to_zarr] 2490368 rows in 67.20s; chunk in 1.55s (42180 rows/s); 3R :13046765
[vcf_to_zarr] 2555904 rows in 67.38s; chunk in 1.91s (34348 rows/s); X :15869377
[vcf_to_zarr] 2424832 rows in 68.01s; chunk in 1.68s (38904 rows/s); 2R :14591508
[vcf_to_zarr] 2359296 rows in 68.39s; chunk in 1.70s (38497 rows/s); 3L :20726907
[vcf_to_zarr] 2359296 rows in 68.39s; chunk in 1.57s (41652 rows/s); 2L :18337435
[vcf_to_zarr] 2555904 rows in 68.75s; chunk in 1.55s (42255 rows/s); 3R :13326598
[vcf_to_zarr] 2621440 rows in 69.24s; chunk in 1.86s (35234 rows/s); X :16476305
[vcf_to_zarr] 2490368 rows in 69.72s; chunk in 1.71s (38349 rows/s); 2R :14940805
[vcf_to_zarr] 2424

[vcf_to_zarr] 3932160 rows in 102.71s; chunk in 1.78s (36764 rows/s); 3R :19674124
[vcf_to_zarr] 3801088 rows in 102.73s; chunk in 1.76s (37216 rows/s); 2R :22359191
[vcf_to_zarr] 3801088 rows in 102.93s; chunk in 1.49s (43932 rows/s); 2L :25886855
[vcf_to_zarr] 3997696 rows in 104.18s; chunk in 1.47s (44716 rows/s); 3R :19942647
[vcf_to_zarr] 3801088 rows in 104.21s; chunk in 1.53s (42764 rows/s); 3L :28057690
[vcf_to_zarr] 3866624 rows in 104.33s; chunk in 1.59s (41089 rows/s); 2R :22704985
[vcf_to_zarr] 3866624 rows in 104.45s; chunk in 1.51s (43262 rows/s); 2L :26219878
[vcf_to_zarr] 4063232 rows in 105.56s; chunk in 1.39s (47264 rows/s); 3R :20226133
[vcf_to_zarr] 3866624 rows in 105.82s; chunk in 1.61s (40727 rows/s); 3L :28347608
[vcf_to_zarr] 3932160 rows in 105.91s; chunk in 1.46s (44772 rows/s); 2L :26515944
[vcf_to_zarr] 3932160 rows in 105.95s; chunk in 1.62s (40437 rows/s); 2R :23039759
[vcf_to_zarr] 4128768 rows in 107.24s; chunk in 1.67s (39143 rows/s); 3R :20533000
[vcf

[vcf_to_zarr] 5373952 rows in 142.27s; chunk in 1.68s (39079 rows/s); 2R :31898720
[vcf_to_zarr] 5308416 rows in 142.43s; chunk in 1.88s (34782 rows/s); 3L :35037062
[vcf_to_zarr] 5570560 rows in 142.55s; chunk in 1.74s (37567 rows/s); 3R :27281369
[vcf_to_zarr] 5570560 rows in 143.35s; chunk in 1.50s (43574 rows/s); 2L :34315234
[vcf_to_zarr] 5439488 rows in 143.88s; chunk in 1.61s (40789 rows/s); 2R :32339567
[vcf_to_zarr] 5373952 rows in 143.94s; chunk in 1.51s (43367 rows/s); 3L :35337062
[vcf_to_zarr] 5636096 rows in 144.13s; chunk in 1.57s (41659 rows/s); 3R :27608960
[vcf_to_zarr] 5636096 rows in 144.82s; chunk in 1.47s (44685 rows/s); 2L :34634091
[vcf_to_zarr] 5439488 rows in 145.56s; chunk in 1.62s (40529 rows/s); 3L :35657368
[vcf_to_zarr] 5505024 rows in 145.55s; chunk in 1.67s (39241 rows/s); 2R :32812615
[vcf_to_zarr] 5701632 rows in 145.81s; chunk in 1.68s (38945 rows/s); 3R :27917793
[vcf_to_zarr] 5701632 rows in 146.43s; chunk in 1.62s (40578 rows/s); 2L :34977152
[vcf

[vcf_to_zarr] 7274496 rows in 184.19s; chunk in 1.77s (37097 rows/s); 3R :36398016
[vcf_to_zarr] 7340032 rows in 184.85s; chunk in 1.57s (41787 rows/s); 2L :43352832
[vcf_to_zarr] 7077888 rows in 185.60s; chunk in 1.62s (40410 rows/s); 2R :44486650
[vcf_to_zarr] 7340032 rows in 185.77s; chunk in 1.58s (41488 rows/s); 3R :36837619
[vcf_to_zarr] 7405568 rows in 186.41s; chunk in 1.55s (42237 rows/s); 2L :43697091
[vcf_to_zarr] 7405568 rows in 187.28s; chunk in 1.52s (43206 rows/s); 3R :37198354
[vcf_to_zarr] 7143424 rows in 187.44s; chunk in 1.83s (35764 rows/s); 2R :45054974
[vcf_to_zarr] 7471104 rows in 187.86s; chunk in 1.45s (45153 rows/s); 2L :43999442
[vcf_to_zarr] 7208960 rows in 189.15s; chunk in 1.72s (38168 rows/s); 2R :45641780
[vcf_to_zarr] 7471104 rows in 189.22s; chunk in 1.94s (33810 rows/s); 3R :37661103
[vcf_to_zarr] 7536640 rows in 189.40s; chunk in 1.54s (42435 rows/s); 2L :44331032
[vcf_to_zarr] 7536640 rows in 190.89s; chunk in 1.67s (39215 rows/s); 3R :38023237
[vcf

# Load callset from zarr archive

In [5]:
# Open the converted archive read-only so later cells cannot modify it.
callset = zarr.open_group(store=OUTFN, mode='r')

In [7]:
%%time
callset['2L'].tree()

CPU times: user 997 µs, sys: 290 µs, total: 1.29 ms
Wall time: 789 µs


In [6]:
# NOTE(review): `t` is just a second reference to the same object as `callset`;
# no cell visible here uses it — consider removing if nothing else depends on it.
t = callset

In [9]:
# Names of the members stored in the 'variants' group of chromosome 2L.
[field_name for field_name in callset['2L/variants'].keys()]

['AC',
 'AF',
 'ALT',
 'AN',
 'CHROM',
 'DP',
 'POS',
 'QUAL',
 'REF',
 'TYPE',
 'numalt']