In [199]:
# load packages - have been installed in vcf virtual env
import os, sys
import zarr
import numpy as np
import pandas as pd
import allel  
import dask.array as da
from itertools import compress
import glob
from collections import defaultdict

In [200]:
def read_zarr(chrom, zarr_path):
    zarr_file = zarr_path + 'SNP_INDEL_Pf3D7_' + chrom + '_v3.zarr'
    print(zarr_file)
    callset = zarr.open(zarr_file, mode='r')
    return(callset)

def gt_subset(callset, variant_bool, sample_bool):
    gt_zarr = callset['calldata/GT']
    gt_dask = allel.GenotypeDaskArray(gt_zarr)
    gt_daskSub = gt_dask.subset(variant_bool, sample_bool).compute()
    return(gt_daskSub)

def overlap_avail(user_pos_list, callset):
    zarr_index   = callset['variants/POS'][:].tolist()
    user_overlap =  [i in user_pos_list for i in zarr_index]
    return(user_overlap)
    
def overlap_filter(user_snps, filter_snps):
    return(user_snps & filter_snps)

def user_snps(user_df, chrom):
    user_sub = user_df[user_df['chr'].str.contains(chrom)]
    return(user_sub["position"].tolist())
    
def variant_filter(callset, vsqlod_min, num_alt):
    quality_set = callset['variants/FILTER_PASS'][:]
    snp_set = callset['variants/is_snp'][:]
    vsqlod_set = callset['variants/VQSLOD'][:]  > vsqlod_min
    alt_set = callset['variants/numalt'][:] < num_alt + 1 
    variant_hq = quality_set & snp_set & vsqlod_set & alt_set
    return(variant_hq)


def allele_freq(callset, variant_bool, sample_bool, num_alt, chrom):
    gt_daskSub = gt_subset(callset, variant_bool, sample_bool)
    # count number of alleles
    ac = gt_daskSub.count_alleles(max_allele=num_alt)
    sub_af = ac/ac.sum(axis=1, keepdims=True)
    sub_df = pd.DataFrame(sub_af,
                 index=["Pf3D7_" + chrom + "_v3:" + str(x) for x in np.array(callset['variants/POS'][:])[variant_bool]],
                 columns=["REF"] + ["ALT" + str(x) for x in range(1, num_alt + 1)])
    return(sub_df)

def gt_merge(list1, list2, j, callset, variant_bool):
    dic = {-1:"X", 
           0: np.array(callset['variants/REF'][:])[variant_bool][j], 
           1: np.array(callset['variants/ALT'][:])[variant_bool][j,0], 
           2: np.array(callset['variants/ALT'][:])[variant_bool][j,1],
           3: np.array(callset['variants/ALT'][:])[variant_bool][j,2]}
    merged_list = []
    for i in range(0, len(list1)):
        if list1[i] == list2[i]:
            if list1[i] == -1:
                merged_list.append("X")
            else:
                merged_list.append(list1[i])
        else:
            merged_list.append("N") 
    merged_replace=[dic.get(n, n) for n in merged_list]  
    return(merged_replace)

def match_genotype(callset, variant_bool, sample_bool, chrom ):
    gt_daskSub = gt_subset(callset, variant_bool, sample_bool)
    data =[]
    for j in range(0, len(gt_daskSub)):
        data.append(gt_merge(gt_daskSub[j,:,0], gt_daskSub[j,:,1], j, callset, variant_bool))
    gt_df = pd.DataFrame(data,
                 index=["Pf3D7_" + chrom + "_v3:" + str(x) for x in np.array(callset['variants/POS'][:])[variant_bool]],
                 columns=list(compress(np.array(callset['samples']).tolist(), sample_bool)))
    return(gt_df)

In [201]:
# import the SNPs of interest - i.e. 24 barcode positions
project_dir = 'C:/Users/wwong/Dropbox (IDM)/parasite_genetics/genomics/senegal'
user_snp_file = os.path.join(project_dir, "2008Daniels_BarcodePositions_Updated.txt")
user_snp_df = pd.read_csv(user_snp_file, sep='\t', header=0)
user_snp_df

#user_samples = 'C:/Users/wwong/Dropbox (IDM)/parasite_genetics/genomics/senegal/pf3k_bc_sampleOverlap.txt'
#with open(user_samples) as f:
#    sampleList = f.read().splitlines() 
#print(sampleList[1:5])
#print(len(sampleList))

sampleList = ['SenP005.02','SenP008.04','SenP011.02','SenP019.04','SenP027.02','SenP031.01','SenP051.02','SenP060.02','SenT001.08']



In [202]:
project_dir = 'C:/Users/wwong/Dropbox (IDM)/parasite_genetics/genomics/senegal/'
num_alt = 3
genotypes, frequencies = [], []


class Zarr_Parse():
    def __init__(self, sample_list, position_df, chrom_list = [x for x in range(1,15)], zarr_directory = 'C:/Users/wwong/Dropbox (IDM)/parasite_genetics/genomics/pf3k_zarr/',):
        self.zarr_directory = zarr_directory
        self.zfile_chromosomes = [int(zfile.split('_')[-2]) for zfile in glob.glob(self.zarr_directory + '*.zarr')]
        self.chromosomes = []
        self.samples = sample_list
        self.positions = position_df #dataframe, columns = ['chr', 'position']
        self.callset = {}
        self.variant_bools = {}
        self.sample_bools = {}
        self.freq = {}
        self.genotypes = {}        
        self.fields = defaultdict(dict)
        
        self.create_callset(chrom_list)
        
    def create_callset(self, chrom_list):
        for chromosome in chrom_list:
            if int(chromosome) in self.zfile_chromosomes:
                self.chromosomes.append(chromosome)
                chrom = "{:02d}".format(chromosome)
                callset = read_zarr(chrom, self.zarr_directory)
                self.callset[chromosome]= callset
                
                user_chrom_snps = user_snps(self.positions, chrom)
                variant_bool = overlap_filter(overlap_avail(user_chrom_snps, callset), 
                                          variant_filter(callset, 2, num_alt))
                self.variant_bools[chromosome] = variant_bool
                
                callset_samples = np.asarray(self.callset[chromosome]['samples']) #callset_samples
                sample_bool = np.array([i in self.samples for i in callset_samples]) 
                self.sample_bools[chromosome] = sample_bool
                
                self.freq[chromosome] = allele_freq(callset, variant_bool, sample_bool, 3, chrom)
                self.genotypes[chromosome] = match_genotype(callset, variant_bool, sample_bool, chrom)
                
    def extract_fields(self, dir1, field):
        #works only for GT, DP, and AD only!
        entry = '{dir1}/{field}'.format(dir1=dir1, field = field)
        for chromosome in self.chromosomes:
            if field in ['GT']:
                dask = allel.GenotypeDaskArray(self.callset[chromosome][entry])
                self.fields[entry][chromosome] = dask.subset(self.variant_bools[chromosome], self.sample_bools[chromosome]).compute()
            elif field in ['DP']: #check the differentiation between AlleleCountsDaskarray and GenotypeDaskArray
                dask = allel.AlleleCountsDaskArray(self.callset[chromosome][entry])
                variant_selection = dask.compress(self.variant_bools[chromosome], axis=0).compute()
                sample_selection = variant_selection[:, self.sample_bools[chromosome]]
                self.fields[entry][chromosome] = sample_selection
            elif field in ['AD']: #?? not sure why some things are Genotype/Allelecounts/ or simply an array
                zarr_array =self.callset[chromosome][entry]
                darray =da.from_zarr(zarr_array)
                variant_selection=darray[self.variant_bools[chromosome]]
                sample_selection=variant_selection[:,self.sample_bools[chromosome],:]
                self.fields[entry][chromosome] = sample_selection
    
    def calculate_readdepth_proportions(self):
        self.allele_props = {}
        self.extract_fields('calldata', 'DP')
        self.extract_fields('calldata', 'AD')
        for chromosome in self.chromosomes:
            allele_depths = np.asarray(self.fields['calldata/AD'][chromosome])
            allele_depths[allele_depths == -1] = 0 #missing call considered same as 0
            expanded_dim_depths = np.expand_dims(self.fields['calldata/DP'][chromosome], axis = 2) #expand the last dimension to be shape (position,samples,1) and allow direct division
            self.allele_props[chromosome] = allele_depths / expanded_dim_depths

        
    def make_variant_call(self):
        def call(stack):
            if stack[1] > 0.9:
                return stack[0]
            else:
                return 'mixed'    
        self.variant_call = {}
        self.calculate_readdepth_proportions()
        self.variant_calls = {}
        for chromosome in self.chromosomes:
            max_idxes = np.argmax(self.allele_props[chromosome], axis = 2)
            max_values = np.max(self.allele_props[chromosome], axis = 2)
            stacked_array = np.stack((max_idxes, max_props), axis = 2)
            self.variant_calls[chromosome] = np.apply_along_axis(call, 2, stacked_array)
        #check reference status
        
        
Z = Zarr_Parse(sampleList, user_snp_df)
Z.extract_fields('calldata', 'GT')
Z.calculate_readdepth_proportions()
Z.make_variant_call()

C:/Users/wwong/Dropbox (IDM)/parasite_genetics/genomics/pf3k_zarr/SNP_INDEL_Pf3D7_01_v3.zarr


In [158]:
Z.allele_props[1]

array([[[1.        , 0.        , 0.        , 0.        ],
        [0.02247191, 0.97752809, 0.        , 0.        ],
        [0.        , 1.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        , 0.        ],
        [0.        , 1.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        , 0.        ],
        [0.        , 1.        , 0.        , 0.        ],
        [0.        , 1.        , 0.        , 0.        ]],

       [[0.        , 1.        , 0.        , 0.        ],
        [0.02083333, 0.97916667, 0.        , 0.        ],
        [0.        , 1.        , 0.        , 0.        ],
        [0.01282051, 0.98717949, 0.        , 0.        ],
        [0.        , 1.        , 0.        , 0.        ],
        [0.        , 1.        , 0.        , 0.        ],
        [0.        , 1.        , 0.        , 0.        ],
        [0.        , 1.        , 0.        , 0.        ],
        [0. 

In [198]:
Z.variant_calls

{1: array([[0., 1., 1., 0., 0., 1., 0., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1.]])}