# Parsing barcode variants from whole genome variant files

Learn to subset variants from Zarr files that can be used in IDM models.

The Zarr format is similar to HDF5 and allows large files to be subset without loading them entirely into memory. For whole genome sequencing data, the variants in a population are typically stored in VCF format.

In [1]:
# load packages - have been installed in vcf virtual env
import os, sys
import zarr
import numpy as np
import pandas as pd
import allel  
import dask.array as da
from itertools import compress

## Functions

In [2]:
def read_zarr(chrom, zarr_path):
    """Open the Pf3k Zarr callset of one chromosome in read-only mode.

    Parameters
    ----------
    chrom : str
        Chromosome label as it appears in the store name, e.g. ``"01"``.
    zarr_path : str
        Directory holding the per-chromosome ``*.zarr`` stores.  A trailing
        path separator is optional.

    Returns
    -------
    The object returned by ``zarr.open(..., mode='r')``.
    """
    # os.path.join only inserts a separator when it is missing, so a
    # directory given without the trailing "/" no longer produces a
    # mangled store name.
    zarr_file = os.path.join(zarr_path, 'SNP_INDEL_Pf3D7_' + chrom + '_v3.zarr')
    return zarr.open(zarr_file, mode='r')

def gt_subset(callset, variant_bool, sample_bool):
    """Return the genotype calls for the selected variants and samples.

    ``callset['calldata/GT']`` is wrapped in an ``allel.GenotypeDaskArray``,
    subset with the two selections, and then materialised with
    ``.compute()``.

    Parameters
    ----------
    callset : mapping
        Opened callset exposing ``'calldata/GT'``.
    variant_bool : selection passed as the first argument of ``subset``.
    sample_bool : selection passed as the second argument of ``subset``.
    """
    lazy_genotypes = allel.GenotypeDaskArray(callset['calldata/GT'])
    selection = lazy_genotypes.subset(variant_bool, sample_bool)
    return selection.compute()

In [13]:
def overlap_avail(user_pos_list, callset):
    """Flag the callset variants that sit at a user-requested position.

    Parameters
    ----------
    user_pos_list : iterable of int
        Positions of interest on the chromosome held in ``callset``.
    callset : mapping
        Opened callset exposing ``'variants/POS'``.

    Returns
    -------
    list of bool
        One entry per variant in ``callset['variants/POS']``, ``True`` where
        the variant position is in ``user_pos_list``.
    """
    # Build the lookup once: set membership is constant-time, whereas testing
    # against the list rescans it for every variant on the chromosome.
    wanted_positions = set(user_pos_list)
    return [pos in wanted_positions
            for pos in callset['variants/POS'][:].tolist()]
    
def overlap_filter(user_snps, filter_snps):
    """Intersect two per-variant masks with the ``&`` operator.

    Parameters
    ----------
    user_snps : mask marking the variants requested by the user.
    filter_snps : mask marking the variants that passed quality filtering.

    Returns
    -------
    The result of ``user_snps & filter_snps``.
    """
    selected_by_both = user_snps & filter_snps
    return selected_by_both

def user_snps(user_df, chrom):
    """Return the user-supplied positions that belong to one chromosome.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Table with a string column ``'chr'`` and a column ``'position'``.
    chrom : str
        Text that identifies the chromosome inside the ``'chr'`` values,
        e.g. ``"01"`` for ``"Pf3D7_01_v3"``.

    Returns
    -------
    list
        The ``'position'`` values of the rows whose ``'chr'`` contains
        ``chrom``.
    """
    # regex=False: match ``chrom`` literally instead of as a regular
    # expression.  na=False: rows with a missing 'chr' count as "no match"
    # rather than producing NaN, which cannot be used as a boolean mask.
    on_chrom = user_df['chr'].str.contains(chrom, regex=False, na=False)
    return user_df.loc[on_chrom, "position"].tolist()
    
def variant_filter(callset, vsqlod_min, num_alt):
    """Build a per-variant mask of high-quality SNPs.

    A variant is kept when all four conditions hold:
    ``variants/FILTER_PASS`` is set, ``variants/is_snp`` is set,
    ``variants/VQSLOD`` is strictly greater than ``vsqlod_min`` and
    ``variants/numalt`` is below ``num_alt + 1``.

    Parameters
    ----------
    callset : mapping
        Opened callset exposing the four ``variants/*`` fields above.
    vsqlod_min : number
        Exclusive lower bound on VQSLOD.
    num_alt : int
        Largest number of alternate alleles a kept variant may have.

    Returns
    -------
    The element-wise ``&`` of the four conditions.
    """
    criteria = (
        callset['variants/FILTER_PASS'][:],
        callset['variants/is_snp'][:],
        callset['variants/VQSLOD'][:] > vsqlod_min,
        callset['variants/numalt'][:] < num_alt + 1,
    )
    keep = criteria[0]
    for condition in criteria[1:]:
        keep = keep & condition
    return keep


In [4]:
def allele_freq(callset, variant_bool, sample_bool, num_alt, chrom=None):
    """Compute per-variant allele frequencies for a subset of samples.

    Parameters
    ----------
    callset : mapping
        Opened callset exposing ``'calldata/GT'`` and ``'variants/POS'``.
    variant_bool : boolean mask selecting the variants to report.
    sample_bool : boolean mask selecting the samples to count.
    num_alt : int
        Highest alternate-allele index to count; the result has the columns
        ``REF, ALT1 .. ALT<num_alt>``.
    chrom : str, optional
        Chromosome label used in the row names.  When omitted, the
        module-level variable ``chrom`` is used, which is where this function
        previously always took it from.

    Returns
    -------
    pandas.DataFrame
        One row per selected variant, labelled ``Pf3D7_<chrom>_v3:<POS>``,
        holding each allele's count divided by the row's total count.

    Raises
    ------
    NameError
        If ``chrom`` is not passed and no module-level ``chrom`` exists.
    """
    if chrom is None:
        # Backward-compatible fallback for callers that still rely on the
        # loop variable defined at module level.
        try:
            chrom = globals()["chrom"]
        except KeyError:
            raise NameError(
                "name 'chrom' is not defined: pass chrom=... to allele_freq"
            ) from None

    gt_daskSub = gt_subset(callset, variant_bool, sample_bool)
    # count number of alleles
    ac = gt_daskSub.count_alleles(max_allele=num_alt)
    # NOTE(review): a variant with no called alleles divides 0 by 0 here -
    # presumably yielding NaN for that row; confirm this is acceptable.
    sub_af = ac / ac.sum(axis=1, keepdims=True)
    positions = np.array(callset['variants/POS'][:])[variant_bool]
    sub_df = pd.DataFrame(
        sub_af,
        index=["Pf3D7_" + chrom + "_v3:" + str(pos) for pos in positions],
        columns=["REF"] + ["ALT" + str(k) for k in range(1, num_alt + 1)],
    )
    return sub_df

In [5]:
def gt_merge(list1, list2, j, ref_alleles=None, alt_alleles=None):
    """Collapse the two allele calls of every sample into one symbol.

    For each sample at variant ``j``:

    * both calls missing (``-1``)  -> ``"X"``
    * calls differ                 -> ``"N"``
    * calls agree                  -> the REF base for code 0, the k-th ALT
      base for code k, or the raw code if no base is known for it.

    Parameters
    ----------
    list1, list2 : sequence of int
        Allele codes of the first and second call, one entry per sample.
    j : int
        Row of the variant inside the filtered allele tables.
    ref_alleles : array-like, optional
        REF allele of every filtered variant.
    alt_alleles : array-like, optional
        ALT alleles of every filtered variant (one row per variant).
        When either table is omitted, both are read from the module-level
        ``callset`` and ``variant_bool``, as this function used to do on
        every call.

    Returns
    -------
    list
        One merged symbol per sample.
    """
    if ref_alleles is None or alt_alleles is None:
        # Legacy call path: fall back on the notebook-level variables.
        ref_alleles = np.array(callset['variants/REF'][:])[variant_bool]
        alt_alleles = np.array(callset['variants/ALT'][:])[variant_bool]

    base_for_code = {0: ref_alleles[j]}
    # atleast_1d keeps a single-ALT store (1-D ALT array) from being iterated
    # character by character; any number of ALT columns is supported.
    for code, base in enumerate(np.atleast_1d(alt_alleles[j]), start=1):
        base_for_code[code] = base

    merged = []
    for first, second in zip(list1, list2):
        if first != second:
            merged.append("N")
        elif first == -1:
            merged.append("X")
        else:
            merged.append(base_for_code.get(first, first))
    return merged


def match_genotype(callset, variant_bool, sample_bool, chrom=None):
    """Build a variants x samples table of merged (haploid) genotype calls.

    Parameters
    ----------
    callset : mapping
        Opened callset exposing ``'calldata/GT'``, ``'variants/REF'``,
        ``'variants/ALT'``, ``'variants/POS'`` and ``'samples'``.
    variant_bool : boolean mask selecting the variants to report.
    sample_bool : boolean mask selecting the samples to report.
    chrom : str, optional
        Chromosome label used in the row names.  When omitted, the
        module-level variable ``chrom`` is used, which is where this function
        previously always took it from.

    Returns
    -------
    pandas.DataFrame
        Rows labelled ``Pf3D7_<chrom>_v3:<POS>``, one column per selected
        sample, cells as produced by ``gt_merge``.

    Raises
    ------
    NameError
        If ``chrom`` is not passed and no module-level ``chrom`` exists.
    """
    if chrom is None:
        # Backward-compatible fallback for callers that still rely on the
        # loop variable defined at module level.
        try:
            chrom = globals()["chrom"]
        except KeyError:
            raise NameError(
                "name 'chrom' is not defined: pass chrom=... to match_genotype"
            ) from None

    gt_daskSub = gt_subset(callset, variant_bool, sample_bool)
    # Load the allele tables once, from the callset and mask passed in, and
    # hand them to gt_merge so they are not re-read for every variant.
    ref_alleles = np.array(callset['variants/REF'][:])[variant_bool]
    alt_alleles = np.array(callset['variants/ALT'][:])[variant_bool]
    positions = np.array(callset['variants/POS'][:])[variant_bool]

    data = [
        gt_merge(gt_daskSub[j, :, 0], gt_daskSub[j, :, 1], j,
                 ref_alleles, alt_alleles)
        for j in range(len(gt_daskSub))
    ]
    gt_df = pd.DataFrame(
        data,
        index=["Pf3D7_" + chrom + "_v3:" + str(pos) for pos in positions],
        columns=list(compress(np.array(callset['samples']).tolist(), sample_bool)),
    )
    return gt_df

## Run it together 

This section is specifically set up to run a for loop since Pf3k was divided by chromosome. For other data sets, the above functions, except the one that reads in the Zarr object, should hold, assuming the Zarr file contains the same information.

In [14]:
# import the SNPs of interest - i.e. 24 barcode positions
# NOTE: `project_dir` is assigned in a later cell of this notebook (In [28]),
# so that cell has to be run before this one.
user_snp_file = os.path.join(project_dir, "2008Daniels_BarcodePositions_Updated.txt")
# Tab-separated table whose first line is the header row.
user_snp_df = pd.read_csv(user_snp_file, sep='\t', header=0)
# Bare expression: displays the table as the cell output.
user_snp_df

Unnamed: 0,chr,position
0,Pf3D7_01_v3,130339
1,Pf3D7_01_v3,537322
2,Pf3D7_02_v3,842805
3,Pf3D7_04_v3,276127
4,Pf3D7_05_v3,931606
5,Pf3D7_06_v3,145475
6,Pf3D7_06_v3,937752
7,Pf3D7_07_v3,221722
8,Pf3D7_07_v3,435497
9,Pf3D7_07_v3,489666


In [29]:
# Text file listing the samples to keep, one name per line.
user_samples = '/home/jribado/Dropbox (IDM)/parasite_genetics/genomics/senegal/pf3k_bc_sampleOverlap.txt'
with open(user_samples) as f:
    # splitlines() drops the line endings, giving one list entry per line.
    sampleList = f.read().splitlines() 
# Quick check: show entries 1-4 (the slice skips entry 0) and the total count.
print(sampleList[1:5])
print(len(sampleList))

['SenP008', 'SenP011', 'SenP019', 'SenP027']
113


In [28]:
project_dir = '/home/jribado/Dropbox (IDM)/parasite_genetics/genomics/senegal/'
# Directory holding one Zarr store per chromosome.  It does not change
# between iterations, so it is set once instead of inside the loop.
zarr_path = '/home/jribado/Dropbox (IDM)/Data, Dynamics, and Analytics Folder/Projects/malaria_pfcommunity/malaria_pf3k/pf3k_zarr/'
# Largest number of alternate alleles kept per variant; used for both the
# variant filter and the frequency table so the two always agree.
num_alt = 3
genotypes, frequencies = [], []
# Set membership keeps the per-sample lookup below constant-time.
wanted_samples = set(sampleList)

# NOTE: allele_freq, gt_merge and match_genotype may read the module-level
# names `chrom`, `callset` and `variant_bool`, so these names must be kept.
for chrom_number in range(1, 15):
    chrom = "{:02d}".format(chrom_number)
    # read in file
    callset = read_zarr(chrom, zarr_path)
    # identify variants in VCF that match user inputs
    user_chrom_snps = user_snps(user_snp_df, chrom)
    variant_bool = overlap_filter(overlap_avail(user_chrom_snps, callset),
                                  variant_filter(callset, 2, num_alt))
    # identify samples in VCF that match user inputs; callset sample names
    # are compared on the part before the first "."
    sample_list = [x.split(".")[0] for x in np.array(callset['samples']).tolist()]
    sample_bool = np.array([name in wanted_samples for name in sample_list])
    # Alternative selection: every sample whose name contains "Sen".
    #sample_bool  = np.array(["Sen" in i for i in np.array(callset['samples']).tolist()])
    freq = allele_freq(callset, variant_bool, sample_bool, num_alt)
    frequencies.append(freq)
    geno = match_genotype(callset, variant_bool, sample_bool)
    genotypes.append(geno)

# Stack the per-chromosome tables and write them out.
appended_freq = pd.concat(frequencies)
print(appended_freq)
appended_freq.to_csv('~/Desktop/Pf3k_wgsSenegalBCOverlapFrequencies24bc.txt')

appended_geno = pd.concat(genotypes)
print(appended_geno)
appended_geno.to_csv('~/Desktop/Pf3k_wgsSenegalBCOverlapsHaploidGenotype24bc.txt')

137
137
137
137
137
137
137
137
137
137
137
137
137
137
                          REF      ALT1      ALT2  ALT3
Pf3D7_01_v3:130339   0.302920  0.697080  0.000000   0.0
Pf3D7_01_v3:537322   0.021898  0.978102  0.000000   0.0
Pf3D7_02_v3:842805   0.722222  0.277778  0.000000   0.0
Pf3D7_04_v3:276127   0.791971  0.208029  0.000000   0.0
Pf3D7_05_v3:931606   0.781022  0.218978  0.000000   0.0
Pf3D7_06_v3:145475   0.675182  0.324818  0.000000   0.0
Pf3D7_06_v3:937752   0.419708  0.580292  0.000000   0.0
Pf3D7_07_v3:221722   0.905109  0.094891  0.000000   0.0
Pf3D7_07_v3:435497   0.521898  0.478102  0.000000   0.0
Pf3D7_07_v3:489666   0.529197  0.470803  0.000000   0.0
Pf3D7_07_v3:616459   0.529197  0.470803  0.000000   0.0
Pf3D7_07_v3:628392   0.697080  0.302920  0.000000   0.0
Pf3D7_07_v3:736978   0.580292  0.419708  0.000000   0.0
Pf3D7_07_v3:1359804  0.562044  0.324818  0.113139   0.0
Pf3D7_08_v3:612596   0.733577  0.266423  0.000000   0.0
Pf3D7_09_v3:634019   0.496350  0.503650  0.00000