In [10]:
# load packages - have been installed in vcf virtual env
import os, sys
import zarr
import numpy as np
import pandas as pd
import allel  
import dask.array as da
from itertools import compress
import glob
from collections import defaultdict

In [11]:
def read_zarr(chrom, zarr_path):
    zarr_file = zarr_path + 'SNP_INDEL_Pf3D7_' + chrom + '_v3.zarr'
    print(zarr_file)
    callset = zarr.open(zarr_file, mode='r')
    return(callset)

def gt_subset(callset, variant_bool, sample_bool):
    gt_zarr = callset['calldata/GT']
    gt_dask = allel.GenotypeDaskArray(gt_zarr)
    gt_daskSub = gt_dask.subset(variant_bool, sample_bool).compute()
    return(gt_daskSub)

def overlap_avail(user_pos_list, callset):
    zarr_index   = callset['variants/POS'][:].tolist()
    user_overlap =  [i in user_pos_list for i in zarr_index]
    return(user_overlap)
    
def overlap_filter(user_snps, filter_snps):
    return(user_snps & filter_snps)

def user_snps(user_df, chrom):
    user_sub = user_df[user_df['chr'].str.contains(chrom)]
    return(user_sub["position"].tolist())
    
def variant_filter(callset, vsqlod_min, num_alt):
    quality_set = callset['variants/FILTER_PASS'][:]
    snp_set = callset['variants/is_snp'][:]
    vsqlod_set = callset['variants/VQSLOD'][:]  > vsqlod_min
    alt_set = callset['variants/numalt'][:] < num_alt + 1 
    variant_hq = quality_set & snp_set & vsqlod_set & alt_set
    return(variant_hq)


def allele_freq(callset, variant_bool, sample_bool, num_alt, chrom):
    gt_daskSub = gt_subset(callset, variant_bool, sample_bool)
    # count number of alleles
    ac = gt_daskSub.count_alleles(max_allele=num_alt)
    sub_af = ac/ac.sum(axis=1, keepdims=True)
    sub_df = pd.DataFrame(sub_af,
                 index=["Pf3D7_" + chrom + "_v3:" + str(x) for x in np.array(callset['variants/POS'][:])[variant_bool]],
                 columns=["REF"] + ["ALT" + str(x) for x in range(1, num_alt + 1)])
    return(sub_df)

def gt_merge(list1, list2, j, callset, variant_bool):
    dic = {-1:"X", 
           0: np.array(callset['variants/REF'][:])[variant_bool][j], 
           1: np.array(callset['variants/ALT'][:])[variant_bool][j,0], 
           2: np.array(callset['variants/ALT'][:])[variant_bool][j,1],
           3: np.array(callset['variants/ALT'][:])[variant_bool][j,2]}
    merged_list = []
    for i in range(0, len(list1)):
        if list1[i] == list2[i]:
            if list1[i] == -1:
                merged_list.append("X")
            else:
                merged_list.append(list1[i])
        else:
            merged_list.append("N") 
    merged_replace=[dic.get(n, n) for n in merged_list]  
    return(merged_replace)

def match_genotype(callset, variant_bool, sample_bool, chrom ):
    gt_daskSub = gt_subset(callset, variant_bool, sample_bool)
    data =[]
    for j in range(0, len(gt_daskSub)):
        data.append(gt_merge(gt_daskSub[j,:,0], gt_daskSub[j,:,1], j, callset, variant_bool))
    gt_df = pd.DataFrame(data,
                 index=["Pf3D7_" + chrom + "_v3:" + str(x) for x in np.array(callset['variants/POS'][:])[variant_bool]],
                 columns=list(compress(np.array(callset['samples']).tolist(), sample_bool)))
    return(gt_df)

In [12]:
# import the SNPs of interest - i.e. 24 barcode positions
project_dir = 'C:/Users/wwong/Dropbox (IDM)/parasite_genetics/genomics/senegal'
user_snp_file = os.path.join(project_dir, "2008Daniels_BarcodePositions_Updated.txt")
user_snp_df = pd.read_csv(user_snp_file, sep='\t', header=0)
user_snp_df

#user_samples = 'C:/Users/wwong/Dropbox (IDM)/parasite_genetics/genomics/senegal/pf3k_bc_sampleOverlap.txt'
#with open(user_samples) as f:
#    sampleList = f.read().splitlines() 
#print(sampleList[1:5])
#print(len(sampleList))

sampleList = ['SenP005.02','SenP008.04','SenP011.02','SenP019.04','SenP027.02','SenP031.01','SenP051.02','SenP060.02','SenT001.08']



In [42]:
project_dir = 'C:/Users/wwong/Dropbox (IDM)/parasite_genetics/genomics/senegal/'
num_alt = 3
genotypes, frequencies = [], []


class Zarr_Parse():
    def __init__(self, sample_list, position_df, chrom_list = [x for x in range(1,15)], zarr_directory = 'C:/Users/wwong/Dropbox (IDM)/parasite_genetics/genomics/pf3k_zarr/',):
        self.zarr_directory = zarr_directory
        self.zfile_chromosomes = [int(zfile.split('_')[-2]) for zfile in glob.glob(self.zarr_directory + '*.zarr')]
        self.chromosomes = []
        self.samples = sample_list
        self.positions_df = position_df #dataframe, columns = ['chr', 'position']
        #would prefer to convert this to a dictionary
        self.callset = {}
        self.variant_bools = {}
        self.sample_bools = {}
        self.freq = {}
        self.genotypes = {}        
        self.fields = defaultdict(dict)
        
        self.create_callset(chrom_list)
        
        
    def create_callset(self, chrom_list):
        for chromosome in chrom_list:
            if int(chromosome) in self.zfile_chromosomes:
                self.chromosomes.append(chromosome)
                chrom = "{:02d}".format(chromosome)
                callset = read_zarr(chrom, self.zarr_directory)
                self.callset[chromosome]= callset
                
                user_chrom_snps = user_snps(self.positions_df, chrom)
                variant_bool = overlap_filter(overlap_avail(user_chrom_snps, callset), 
                                          variant_filter(callset, 2, num_alt))
                self.variant_bools[chromosome] = variant_bool
                
                callset_samples = np.asarray(self.callset[chromosome]['samples']) #callset_samples
                sample_bool = np.array([i in self.samples for i in callset_samples]) 
                self.sample_bools[chromosome] = sample_bool
                
                self.freq[chromosome] = allele_freq(callset, variant_bool, sample_bool, 3, chrom)
                self.genotypes[chromosome] = match_genotype(callset, variant_bool, sample_bool, chrom)
                
    def extract_fields(self, dir1, field):
        #works only for GT, DP, and AD only!
        entry = '{dir1}/{field}'.format(dir1=dir1, field = field)
        for chromosome in self.chromosomes:
            if field in ['GT']:
                dask = allel.GenotypeDaskArray(self.callset[chromosome][entry])
                self.fields[entry][chromosome] = dask.subset(self.variant_bools[chromosome], self.sample_bools[chromosome]).compute()
            elif field in ['DP']: #check the differentiation between AlleleCountsDaskarray and GenotypeDaskArray
                dask = allel.AlleleCountsDaskArray(self.callset[chromosome][entry])
                variant_selection = dask.compress(self.variant_bools[chromosome], axis=0).compute()
                sample_selection = variant_selection[:, self.sample_bools[chromosome]]
                self.fields[entry][chromosome] = sample_selection
            elif field in ['AD']: #?? not sure why some things are Genotype/Allelecounts/ or simply an array
                zarr_array =self.callset[chromosome][entry]
                darray =da.from_zarr(zarr_array)
                variant_selection=darray[self.variant_bools[chromosome]]
                sample_selection=variant_selection[:,self.sample_bools[chromosome],:]
                self.fields[entry][chromosome] = sample_selection
    
    def calculate_readdepth_proportions(self):
        self.allele_props = {}
        self.extract_fields('calldata', 'DP')
        self.extract_fields('calldata', 'AD')
        for chromosome in self.chromosomes:
            allele_depths = np.asarray(self.fields['calldata/AD'][chromosome])
            allele_depths[allele_depths == -1] = 0 #missing call considered same as 0
            expanded_dim_depths = np.expand_dims(self.fields['calldata/DP'][chromosome], axis = 2) #expand the last dimension to be shape (position,samples,1) and allow direct division
            self.allele_props[chromosome] = allele_depths / expanded_dim_depths

        
    def make_variant_call(self):
        def call(stack):
            if stack[1] > 0.98:
                return stack[0]
            else:
                return -1   
        self.calculate_readdepth_proportions()
        self.variant_calls = {}
        for chromosome in self.chromosomes:
            max_idxes = np.argmax(self.allele_props[chromosome], axis = 2)
            max_values = np.max(self.allele_props[chromosome], axis = 2)
            stacked_array = np.stack((max_idxes, max_values), axis = 2)
            self.variant_calls[chromosome] = np.apply_along_axis(call, 2, stacked_array)
        #check reference statusd
        
        
Z = Zarr_Parse(sampleList, user_snp_df)
Z.extract_fields('calldata', 'GT')
Z.make_variant_call()

C:/Users/wwong/Dropbox (IDM)/parasite_genetics/genomics/pf3k_zarr/SNP_INDEL_Pf3D7_01_v3.zarr


In [16]:
Z.__dict__.keys()

dict_keys(['zarr_directory', 'zfile_chromosomes', 'chromosomes', 'samples', 'positions_df', 'callset', 'variant_bools', 'sample_bools', 'freq', 'genotypes', 'fields', 'allele_props', 'variant_calls'])

In [19]:
Z.callset[1].tree()

In [21]:
Z.variant_bools

{1: array([False, False, False, ..., False, False, False])}

In [23]:
Z.fields.keys()

dict_keys(['calldata/GT', 'calldata/DP', 'calldata/AD'])

In [25]:
Z.fields['calldata/GT'][1]

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0/0,1/1,1/1,0/0,0/0,1/1,0/0,1/1,1/1
1,1/1,1/1,1/1,1/1,1/1,1/1,1/1,1/1,1/1


In [26]:
Z.fields['calldata/DP'][1]

array([[ 38,  89,  23,  58,  47,  47,  44,  40, 135],
       [ 73,  96,  66,  78, 146,  67, 154,  67, 121]], dtype=int16)

In [28]:
np.asarray(Z.fields['calldata/AD'][1])

array([[[ 38,   0,  -1,  -1],
        [  2,  87,  -1,  -1],
        [  0,  23,  -1,  -1],
        [ 58,   0,  -1,  -1],
        [ 47,   0,  -1,  -1],
        [  0,  47,  -1,  -1],
        [ 44,   0,  -1,  -1],
        [  0,  40,  -1,  -1],
        [  0, 135,  -1,  -1]],

       [[  0,  73,  -1,  -1],
        [  2,  94,  -1,  -1],
        [  0,  66,  -1,  -1],
        [  1,  77,  -1,  -1],
        [  0, 146,  -1,  -1],
        [  0,  67,  -1,  -1],
        [  0, 154,  -1,  -1],
        [  0,  67,  -1,  -1],
        [  0, 121,  -1,  -1]]], dtype=int16)

In [30]:
np.expand_dims(Z.fields['calldata/DP'][1], axis=2)

array([[[ 38],
        [ 89],
        [ 23],
        [ 58],
        [ 47],
        [ 47],
        [ 44],
        [ 40],
        [135]],

       [[ 73],
        [ 96],
        [ 66],
        [ 78],
        [146],
        [ 67],
        [154],
        [ 67],
        [121]]], dtype=int16)

In [32]:
np.asarray(Z.fields['calldata/AD'][1]) / np.expand_dims(Z.fields['calldata/DP'][1], axis=2)

array([[[ 1.        ,  0.        , -0.02631579, -0.02631579],
        [ 0.02247191,  0.97752809, -0.01123596, -0.01123596],
        [ 0.        ,  1.        , -0.04347826, -0.04347826],
        [ 1.        ,  0.        , -0.01724138, -0.01724138],
        [ 1.        ,  0.        , -0.0212766 , -0.0212766 ],
        [ 0.        ,  1.        , -0.0212766 , -0.0212766 ],
        [ 1.        ,  0.        , -0.02272727, -0.02272727],
        [ 0.        ,  1.        , -0.025     , -0.025     ],
        [ 0.        ,  1.        , -0.00740741, -0.00740741]],

       [[ 0.        ,  1.        , -0.01369863, -0.01369863],
        [ 0.02083333,  0.97916667, -0.01041667, -0.01041667],
        [ 0.        ,  1.        , -0.01515152, -0.01515152],
        [ 0.01282051,  0.98717949, -0.01282051, -0.01282051],
        [ 0.        ,  1.        , -0.00684932, -0.00684932],
        [ 0.        ,  1.        , -0.01492537, -0.01492537],
        [ 0.        ,  1.        , -0.00649351, -0.00649351],
      

In [34]:
Z.allele_props

{1: array([[[1.        , 0.        , 0.        , 0.        ],
         [0.02247191, 0.97752809, 0.        , 0.        ],
         [0.        , 1.        , 0.        , 0.        ],
         [1.        , 0.        , 0.        , 0.        ],
         [1.        , 0.        , 0.        , 0.        ],
         [0.        , 1.        , 0.        , 0.        ],
         [1.        , 0.        , 0.        , 0.        ],
         [0.        , 1.        , 0.        , 0.        ],
         [0.        , 1.        , 0.        , 0.        ]],
 
        [[0.        , 1.        , 0.        , 0.        ],
         [0.02083333, 0.97916667, 0.        , 0.        ],
         [0.        , 1.        , 0.        , 0.        ],
         [0.01282051, 0.98717949, 0.        , 0.        ],
         [0.        , 1.        , 0.        , 0.        ],
         [0.        , 1.        , 0.        , 0.        ],
         [0.        , 1.        , 0.        , 0.        ],
         [0.        , 1.        , 0.        , 0.  

In [35]:
max_idxes = np.argmax(Z.allele_props[1], axis = 2)
print(max_idxes)

[[0 1 1 0 0 1 0 1 1]
 [1 1 1 1 1 1 1 1 1]]


In [36]:
max_values = np.max(Z.allele_props[1], axis = 2)
print(max_values)

[[1.         0.97752809 1.         1.         1.         1.
  1.         1.         1.        ]
 [1.         0.97916667 1.         0.98717949 1.         1.
  1.         1.         1.        ]]


In [37]:
stacked_array = np.stack((max_idxes, max_values), axis = 2)

In [38]:
stacked_array

array([[[0.        , 1.        ],
        [1.        , 0.97752809],
        [1.        , 1.        ],
        [0.        , 1.        ],
        [0.        , 1.        ],
        [1.        , 1.        ],
        [0.        , 1.        ],
        [1.        , 1.        ],
        [1.        , 1.        ]],

       [[1.        , 1.        ],
        [1.        , 0.97916667],
        [1.        , 1.        ],
        [1.        , 0.98717949],
        [1.        , 1.        ],
        [1.        , 1.        ],
        [1.        , 1.        ],
        [1.        , 1.        ],
        [1.        , 1.        ]]])

In [49]:
pd.DataFrame(Z.variant_calls[1], columns = Z.samples)

Unnamed: 0,SenP005.02,SenP008.04,SenP011.02,SenP019.04,SenP027.02,SenP031.01,SenP051.02,SenP060.02,SenT001.08
0,0.0,-1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
1,1.0,-1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [53]:
for chr, position in zip(Z.positions_df['chr'], Z.positions_df['position']):
    print(chr, position)

Pf3D7_01_v3 130339
Pf3D7_01_v3 537322
Pf3D7_02_v3 842805
Pf3D7_04_v3 276127
Pf3D7_05_v3 931606
Pf3D7_06_v3 145475
Pf3D7_06_v3 937752
Pf3D7_07_v3 221722
Pf3D7_07_v3 435497
Pf3D7_07_v3 489666
Pf3D7_07_v3 602559
Pf3D7_07_v3 616459
Pf3D7_07_v3 628392
Pf3D7_07_v3 736978
Pf3D7_07_v3 1359804
Pf3D7_08_v3 612596
Pf3D7_09_v3 634019
Pf3D7_10_v3 82375
Pf3D7_10_v3 1402510
Pf3D7_11_v3 119497
Pf3D7_11_v3 408600
Pf3D7_13_v3 158412
Pf3D7_13_v3 1429067
Pf3D7_14_v3 755731
