# Parsing barcode variants from whole genome variant files

Learn to subset variants from Zarr files that can be used in IDM models.

The Zarr format is similar to HD5 that will allows to subset large files without loading into memory. For any whole genome sequencing file, variants in a population will be in a VCF format.

Tutorials for guidance:

http://alimanfoo.github.io/2016/06/10/scikit-allel-tour.html

http://alimanfoo.github.io/2018/04/09/selecting-variants.html

In [3]:
# load packages - have been installed in vcf virtual env
import os, sys
import zarr
import numpy as np
import pandas as pd
import allel  
import dask.array as da
from itertools import compress

## Functions

In [4]:
def read_zarr(chrom, zarr_path):
    zarr_file = zarr_path + 'SNP_INDEL_Pf3D7_' + chrom + '_v3.zarr'
    callset = zarr.open(zarr_file, mode='r')
    return(callset)

def gt_subset(callset, variant_bool, sample_bool):
    gt_zarr = callset['calldata/GT']
    gt_dask = allel.GenotypeDaskArray(gt_zarr)
    gt_daskSub = gt_dask.subset(variant_bool, sample_bool).compute()
    return(gt_daskSub)

In [5]:
def overlap_avail(user_pos_list, callset):
    zarr_index   = callset['variants/POS'][:].tolist()
    user_overlap =  [i in user_pos_list for i in zarr_index]
    return(user_overlap)
    
def overlap_filter(user_snps, filter_snps):
    return(user_snps & filter_snps)

def user_snps(user_df, chrom):
    user_sub = user_df[user_df['chr'].str.contains(chrom)]
    return(user_sub["position"].tolist())
    
def variant_filter(callset, vsqlod_min, num_alt):
    quality_set = callset['variants/FILTER_PASS'][:]
    snp_set = callset['variants/is_snp'][:]
    vsqlod_set = callset['variants/VQSLOD'][:]  > vsqlod_min
    alt_set = callset['variants/numalt'][:] < num_alt + 1 
    variant_hq = quality_set & snp_set & vsqlod_set & alt_set
    return(variant_hq)


In [6]:
def allele_freq(callset, variant_bool, sample_bool, num_alt):
    gt_daskSub = gt_subset(callset, variant_bool, sample_bool)
    # count number of alleles
    ac = gt_daskSub.count_alleles(max_allele=num_alt)
    sub_af = ac/ac.sum(axis=1, keepdims=True)
    sub_df = pd.DataFrame(sub_af,
                 index=["Pf3D7_" + chrom + "_v3:" + str(x) for x in np.array(callset['variants/POS'][:])[variant_bool]],
                 columns=["REF"] + ["ALT" + str(x) for x in range(1, num_alt + 1)])
    return(sub_df)

In [7]:
def gt_merge(list1, list2, j):
    dic = {-1:"X", 
           0: np.array(callset['variants/REF'][:])[variant_bool][j], 
           1: np.array(callset['variants/ALT'][:])[variant_bool][j,0], 
           2: np.array(callset['variants/ALT'][:])[variant_bool][j,1],
           3: np.array(callset['variants/ALT'][:])[variant_bool][j,2]}
    merged_list = []
    for i in range(0, len(list1)):
        if list1[i] == list2[i]:
            if list1[i] == -1:
                merged_list.append("X")
            else:
                merged_list.append(list1[i])
        else:
            merged_list.append("N") 
    merged_replace=[dic.get(n, n) for n in merged_list]  
    return(merged_replace)

def match_genotype(callset, variant_bool, sample_bool):
    gt_daskSub = gt_subset(callset, variant_bool, sample_bool)
    data =[]
    for j in range(0, len(gt_daskSub)):
        data.append(gt_merge(gt_daskSub[j,:,0], gt_daskSub[j,:,1], j))
    gt_df = pd.DataFrame(data,
                 index=["Pf3D7_" + chrom + "_v3:" + str(x) for x in np.array(callset['variants/POS'][:])[variant_bool]],
                 columns=list(compress(np.array(callset['samples']).tolist(), sample_bool)))
    return(gt_df)

## Run it together 

This section is specifically set up to run a for loop since Pf3k was divivde by chromosome. For other data sets, the above functions except reading in zarr object, should hold asumping it contains the same information in the Zarr file. 

In [12]:
# import the SNPs of interest - i.e. 24 barcode positions
project_dir = '/home/jribado/Dropbox (IDM)/parasite_genetics/genomics/senegal'
user_snp_file = os.path.join(project_dir, "2008Daniels_BarcodePositions_Updated.txt")
user_snp_df = pd.read_csv(user_snp_file, sep='\t', header=0)
user_snp_df

Unnamed: 0,chr,position
0,Pf3D7_01_v3,130339
1,Pf3D7_01_v3,537322
2,Pf3D7_02_v3,842805
3,Pf3D7_04_v3,276127
4,Pf3D7_05_v3,931606
5,Pf3D7_06_v3,145475
6,Pf3D7_06_v3,937752
7,Pf3D7_07_v3,221722
8,Pf3D7_07_v3,435497
9,Pf3D7_07_v3,489666


In [13]:
user_samples = '/home/jribado/Dropbox (IDM)/parasite_genetics/genomics/senegal/pf3k_bc_sampleOverlap.txt'
with open(user_samples) as f:
    sampleList = f.read().splitlines() 
print(sampleList[1:5])
print(len(sampleList))

['SenP008', 'SenP011', 'SenP019', 'SenP027']
113


In [9]:
project_dir = '/home/jribado/Dropbox (IDM)/parasite_genetics/genomics/senegal/'
num_alt = 3
genotypes, frequencies = [], []


for i in range(1, 2):
    chrom = "{:02d}".format(i)
    # read in file
    zarr_path = '/home/jribado/Dropbox (IDM)/Data, Dynamics, and Analytics Folder/Projects/malaria_pfcommunity/malaria_pf3k/pf3k_zarr/'
    callset = read_zarr(chrom, zarr_path)
    # identify variants in VCF that match user inputs
    user_chrom_snps = user_snps(user_snp_df, chrom)
    variant_bool = overlap_filter(overlap_avail(user_chrom_snps, callset), 
                                  variant_filter(callset, 2, num_alt))
    # identify samples in VCF that match user inputs
    sample_list = [x.split(".")[0] for x in np.array(callset['samples']).tolist()]
    sample_bool = np.array([i in sampleList for i in sample_list]) 
    #sample_bool  = np.array(["Sen" in i for i in np.array(callset['samples']).tolist()])
    freq = allele_freq(callset, variant_bool, sample_bool, 3)
    frequencies.append(freq)
    geno = match_genotype(callset, variant_bool, sample_bool)
    genotypes.append(geno)

appended_freq = pd.concat(frequencies)
print(appended_freq)
#appended_freq.to_csv('~/Desktop/Pf3k_wgsSenegalBCOverlapFrequencies24bc.txt') 

appended_geno = pd.concat(genotypes)
print(appended_geno)
#appended_geno.to_csv('~/Desktop/Pf3k_wgsSenegalBCOverlapsHaploidGenotype24bc.txt')  

                         REF      ALT1  ALT2  ALT3
Pf3D7_01_v3:130339  0.302920  0.697080   0.0   0.0
Pf3D7_01_v3:537322  0.021898  0.978102   0.0   0.0
                   SenP005.02 SenP008.04 SenP011.02 SenP019.04 SenP027.02  \
Pf3D7_01_v3:130339          C          T          T          C          C   
Pf3D7_01_v3:537322          A          A          A          A          A   

                   SenP031.01 SenP051.02 SenP060.02 SenT001.08 SenT001.11  \
Pf3D7_01_v3:130339          T          C          T          T          N   
Pf3D7_01_v3:537322          A          A          A          A          A   

                    ... SenT230.08 SenT231.08 SenT232.08 SenT233.08  \
Pf3D7_01_v3:130339  ...          T          T          T          T   
Pf3D7_01_v3:537322  ...          A          A          A          A   

                   SenT235.08 SenT236.08 SenV034.04 SenV035.04 SenV042.05  \
Pf3D7_01_v3:130339          T          C          C          T          T   
Pf3D7_01_v3:537

## Incorporating depth and count data to improve called genotypes

We are going to improve the genotype calls by incorporating minimum read counts.  

In [47]:
chrom="01"
zarr_path = '/home/jribado/Dropbox (IDM)/Data, Dynamics, and Analytics Folder/Projects/malaria_pfcommunity/malaria_pf3k/pf3k_zarr/'
callset = read_zarr(chrom, zarr_path)
callset.tree()

In [18]:
user_chrom_snps = user_snps(user_snp_df, chrom)
variant_bool = overlap_filter(overlap_avail(user_chrom_snps, callset), variant_filter(callset, 2, 2))
    # identify samples in VCF that match user inputs
sample_list = [x.split(".")[0] for x in np.array(callset['samples']).tolist()]
sample_bool = np.array([i in sampleList for i in sample_list])

In [48]:
gt_zarr = callset['calldata/GT']
gt_dask = allel.GenotypeDaskArray(gt_zarr)
gt_daskSub = gt_dask.subset(variant_bool, sample_bool).compute()
gt_daskSub

Unnamed: 0,0,1,2,3,4,...,132,133,134,135,136
0,0/0,1/1,1/1,0/0,0/0,...,0/0,0/0,1/1,1/1,1/1
1,1/1,1/1,1/1,1/1,1/1,...,1/1,1/1,1/1,1/1,1/1


In [49]:
# pull total number of high quality reads that align to each position per sample
dp_zarr = callset['calldata/DP']
dp_dask = allel.AlleleCountsDaskArray(dp_zarr)
dp_variant_selection = dp_dask.compress(variant_bool, axis=0).compute()
dp_variant_selection = dp_variant_selection[:, sample_bool]
print(dp_variant_selection)

[[ 38  89  23  58  47  47  44  40 135  74 119 270 108 174 213  44  66  35
   35 309  25  82 242 273 101  70  58  84  45 352 197  37  42  18 110 112
  105  45 239 130  32  55  29  65  63  37  35  20 155  58 125 174  33 141
  234 118 213  18 131 162 185  70 221  27 165  36 258 534 223  50  88 128
   56 107 122 263 423 122  23  40  40  87  52 402 199 194  92  35  72 383
   60  67 148 543 179 118  43  44  36  35  42 221 148 282  36  48  87 269
  130 239 284 315 166  41 191 235 274 197  36  36 130  41 145  76  41  36
  115 103  98  94 159 153  36  36  49  84 105]
 [ 73  96  66  78 146  67 154  67 121  66 163 411 109 216 253  43  91  32
  130 387  23  86 299 355 116 535  76 650  26 406 249  37 287  26 117 181
  147 107 518 138  34  53  45  53 355 233 140  21 392  69  92 201  52 153
  282 204 384  11 193 161 197 257 219  25 249  32 327 695 235  49  81 113
  514 144  93 208 302  77  19 150  81  83 372 690 199 248 113 127  83 402
  545 112 178 594 124 793 128 165  67 156 535 357 233 347  37  37

In [50]:
cutoff=100
dp_lt_cutoff=np.where(dp_variant_selection<cutoff,0)
dp_lt_cutoff

ValueError: either both or neither of x and y should be given

In [30]:
# get the number of reads that align to each of the alleles
ad_zarr=callset['calldata/AD']
ad_array=da.from_zarr(ad_zarr)
ad_variant_selection=ad_array[variant_bool]
ad_variant_selection=ad_variant_selection[:,sample_bool,:]

In [67]:
user_chrom_snps

[130339, 537322]

In [None]:
dp_dask = allel.AlleleCountsDaskArray(dp_zarr)

user_chrom_snps = user_snps(user_snp_df, chrom)
variant_bool = overlap_filter(overlap_avail(user_chrom_snps, callset), variant_filter(callset, 2, 2))
dp_variant_selection = dp_dask.compress(variant_bool, axis=0).compute()
dp_variant_selection = dp_variant_selection[:, sample_bool]

def overlap_avail(user_pos_list, callset):
    zarr_index   = callset['variants/POS'][:].tolist()
    user_overlap =  [i in user_pos_list for i in zarr_index]
    return(user_overlap)

class SNP:
    def __init__(self, chromosome, position):
        self.chromosome = chromosome
        self.position = position
        variant_bool = overlap_filter(overlap_avail(position, callset), variant_filter(callset, 2, 2)
        
        continue
                                      

    
        

In [57]:
locus1_totalreads = dp_variant_selection[0]
locus2_totalreads = dp_variant_selection[1]

locus1_ad_array_ref = ad_variant_selection[0,:,0]
locus2_ad_array_ref = ad_variant_selection[1,:,0]



In [66]:
print(np.asarray(locus1_ad_array_ref))
print(locus1_totalreads )

locus1_refprop = (locus1_totalreads - locus1_ad_array_ref ) / locus1_totalreads
locus1_alt1prop = (locus1_totalreads = locus1_ada)
print(np.asarray(locus1_refprop))

[ 38   2   0  58  47   0  44   0   0  48   0   0   0  31   1   0   0   0
  35   0   0   0   1   0   0  70   1  84   0   0   0  37  42   0   0   0
   0  45   0   0   0   0   0   0  63  37  35   0   0   0   0   0  33   0
   0   0   0   0   0 162   0  70   0   0   0  36   0  12   0   1   0   0
  56   0  17   0   0   0  23  40  40   0  52   1   0   0   0  35   0   0
  60   0   0   1   1 118  43  44  36  35  42   1   0 247  36   0   0  48
   0   0  25 315   0  41   2   1   1   6  36  36   0   0   0   0  41  36
   0   0   0   0   1   1  36  36   0   0   0]
[ 38  89  23  58  47  47  44  40 135  74 119 270 108 174 213  44  66  35
  35 309  25  82 242 273 101  70  58  84  45 352 197  37  42  18 110 112
 105  45 239 130  32  55  29  65  63  37  35  20 155  58 125 174  33 141
 234 118 213  18 131 162 185  70 221  27 165  36 258 534 223  50  88 128
  56 107 122 263 423 122  23  40  40  87  52 402 199 194  92  35  72 383
  60  67 148 543 179 118  43  44  36  35  42 221 148 282  36  48  87 269
 130 

In [55]:
ad_array_ref

Unnamed: 0,Array,Chunk
Bytes,548 B,128 B
Shape,"(2, 137)","(1, 64)"
Count,223 Tasks,6 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 548 B 128 B Shape (2, 137) (1, 64) Count 223 Tasks 6 Chunks Type int16 numpy.ndarray",137  2,

Unnamed: 0,Array,Chunk
Bytes,548 B,128 B
Shape,"(2, 137)","(1, 64)"
Count,223 Tasks,6 Chunks
Type,int16,numpy.ndarray


In [51]:
# pull the first item in the allele count array to get the reference count
ad_array_ref=ad_variant_selection[:,:,0]
ad_array_ref=ad_array_ref.compute()
print(ad_array_ref)

# ignore zero division for site where one population may have an allele but other samples may not have adequate coverage
np.seterr(divide='ignore', invalid='ignore')
a=ad_array_ref/dp_variant_selection
print(a)

[[ 38   2   0  58  47   0  44   0   0  48   0   0   0  31   1   0   0   0
   35   0   0   0   1   0   0  70   1  84   0   0   0  37  42   0   0   0
    0  45   0   0   0   0   0   0  63  37  35   0   0   0   0   0  33   0
    0   0   0   0   0 162   0  70   0   0   0  36   0  12   0   1   0   0
   56   0  17   0   0   0  23  40  40   0  52   1   0   0   0  35   0   0
   60   0   0   1   1 118  43  44  36  35  42   1   0 247  36   0   0  48
    0   0  25 315   0  41   2   1   1   6  36  36   0   0   0   0  41  36
    0   0   0   0   1   1  36  36   0   0   0]
 [  0   2   0   1   0   0   0   0   0   0   0   0  27 179   0   0   0   0
    0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0
    0   0   3   0  34   0   0   0   1   0   0   0 339   0   0   0   0   0
    0   0  41   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
    0   0   0   0   0   0   0   0   0   0   2   1   0   0   0   0   0   0
    0   0   0   2   0   2   0   0   0   0   0   0   0   0   0   0

In [45]:
a[a<0.1] = 1
a[a>=0.9] = 0


In [46]:
a

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.64864865,
        0.        , 0.        , 0.        , 0.17816092, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  