In [None]:
# load packages - have been installed in vcf virtual env
import os, sys
import zarr
import numpy as np
import pandas as pd
import allel  
import dask.array as da
from itertools import compress
import glob
from collections import defaultdict

In [13]:
def read_zarr(chrom, zarr_path):
    zarr_file = zarr_path + 'SNP_INDEL_Pf3D7_' + chrom + '_v3.zarr'
    print(zarr_file)
    callset = zarr.open(zarr_file, mode='r')
    return(callset)

def gt_subset(callset, variant_bool, sample_bool):
    gt_zarr = callset['calldata/GT']
    gt_dask = allel.GenotypeDaskArray(gt_zarr)
    gt_daskSub = gt_dask.subset(variant_bool, sample_bool).compute()
    return(gt_daskSub)

def overlap_avail(user_pos_list, callset):
    zarr_index   = callset['variants/POS'][:].tolist()
    user_overlap =  [i in user_pos_list for i in zarr_index]
    return(user_overlap)
    
def overlap_filter(user_snps, filter_snps):
    return(user_snps & filter_snps)

def user_snps(user_df, chrom):
    user_sub = user_df[user_df['chr'].str.contains(chrom)]
    return(user_sub["position"].tolist())
    
def variant_filter(callset, vsqlod_min, num_alt):
    quality_set = callset['variants/FILTER_PASS'][:]
    snp_set = callset['variants/is_snp'][:]
    vsqlod_set = callset['variants/VQSLOD'][:]  > vsqlod_min
    alt_set = callset['variants/numalt'][:] < num_alt + 1 
    variant_hq = quality_set & snp_set & vsqlod_set & alt_set
    return(variant_hq)


def allele_freq(callset, variant_bool, sample_bool, num_alt, chrom):
    gt_daskSub = gt_subset(callset, variant_bool, sample_bool)
    # count number of alleles
    ac = gt_daskSub.count_alleles(max_allele=num_alt)
    sub_af = ac/ac.sum(axis=1, keepdims=True)
    sub_df = pd.DataFrame(sub_af,
                 index=["Pf3D7_" + chrom + "_v3:" + str(x) for x in np.array(callset['variants/POS'][:])[variant_bool]],
                 columns=["REF"] + ["ALT" + str(x) for x in range(1, num_alt + 1)])
    return(sub_df)

def gt_merge(list1, list2, j, callset, variant_bool):
    dic = {-1:"X", 
           0: np.array(callset['variants/REF'][:])[variant_bool][j], 
           1: np.array(callset['variants/ALT'][:])[variant_bool][j,0], 
           2: np.array(callset['variants/ALT'][:])[variant_bool][j,1],
           3: np.array(callset['variants/ALT'][:])[variant_bool][j,2]}
    merged_list = []
    for i in range(0, len(list1)):
        if list1[i] == list2[i]:
            if list1[i] == -1:
                merged_list.append("X")
            else:
                merged_list.append(list1[i])
        else:
            merged_list.append("N") 
    merged_replace=[dic.get(n, n) for n in merged_list]  
    return(merged_replace)

def match_genotype(callset, variant_bool, sample_bool, chrom ):
    gt_daskSub = gt_subset(callset, variant_bool, sample_bool)
    data =[]
    for j in range(0, len(gt_daskSub)):
        data.append(gt_merge(gt_daskSub[j,:,0], gt_daskSub[j,:,1], j, callset, variant_bool))
    gt_df = pd.DataFrame(data,
                 index=["Pf3D7_" + chrom + "_v3:" + str(x) for x in np.array(callset['variants/POS'][:])[variant_bool]],
                 columns=list(compress(np.array(callset['samples']).tolist(), sample_bool)))
    return(gt_df)

In [14]:
# import the SNPs of interest - i.e. 24 barcode positions
project_dir = 'C:/Users/wwong/Dropbox (IDM)/parasite_genetics/genomics/senegal'
user_snp_file = os.path.join(project_dir, "2008Daniels_BarcodePositions_Updated.txt")
user_snp_df = pd.read_csv(user_snp_file, sep='\t', header=0)
user_snp_df

#user_samples = 'C:/Users/wwong/Dropbox (IDM)/parasite_genetics/genomics/senegal/pf3k_bc_sampleOverlap.txt'
#with open(user_samples) as f:
#    sampleList = f.read().splitlines() 
#print(sampleList[1:5])
#print(len(sampleList))

sampleList = ['SenP005.02','SenP008.04','SenP011.02','SenP019.04','SenP027.02','SenP031.01','SenP051.02','SenP060.02','SenT001.08']



In [15]:
project_dir = 'C:/Users/wwong/Dropbox (IDM)/parasite_genetics/genomics/senegal/'
num_alt = 3
genotypes, frequencies = [], []


class Zarr_Parse():
    def __init__(self, sample_list, position_df, chrom_list = [x for x in range(1,15)], zarr_directory = 'C:/Users/wwong/Dropbox (IDM)/parasite_genetics/genomics/pf3k_zarr/',):
        self.zarr_directory = zarr_directory
        self.zfile_chromosomes = [int(zfile.split('_')[-2]) for zfile in glob.glob(self.zarr_directory + '*.zarr')]
        self.chromosomes = []
        self.samples = sample_list
        self.positions = position_df #dataframe, columns = ['chr', 'position']
        self.callset = {}
        self.variant_bools = {}
        self.sample_bools = {}
        self.freq = {}
        self.genotypes = {}        
        self.fields = defaultdict(dict)
        
        self.create_callset(chrom_list)
        
    def create_callset(self, chrom_list):
        for chromosome in chrom_list:
            if int(chromosome) in self.zfile_chromosomes:
                self.chromosomes.append(chromosome)
                chrom = "{:02d}".format(chromosome)
                callset = read_zarr(chrom, self.zarr_directory)
                self.callset[chromosome]= callset
                
                user_chrom_snps = user_snps(self.positions, chrom)
                variant_bool = overlap_filter(overlap_avail(user_chrom_snps, callset), 
                                          variant_filter(callset, 2, num_alt))
                self.variant_bools[chromosome] = variant_bool
                
                callset_samples = np.asarray(self.callset[chromosome]['samples']) #callset_samples
                sample_bool = np.array([i in self.samples for i in callset_samples]) 
                self.sample_bools[chromosome] = sample_bool
                
                self.freq[chromosome] = allele_freq(callset, variant_bool, sample_bool, 3, chrom)
                self.genotypes[chromosome] = match_genotype(callset, variant_bool, sample_bool, chrom)
                
    def extract_fields(self, dir1, field):
        entry = '{dir1}/{field}'.format(dir1=dir1, field = field)
        for chromosome in self.chromosomes:
            if field == 'GT':
                dask = allel.GenotypeDaskArray(self.callset[chromosome][entry])
                self.fields[entry][chromosome] = dask.subset(self.variant_bools[chromosome], self.sample_bools[chromosome]).compute()
            else: #check the differentiation between AlleleCountsDaskarray and GenotypeDaskArray
                dask = allel.AlleleCountsDaskArray(self.callset[chromosome][entry])
                variant_selection = dask.compress(self.variant_bools[chromosome], axis=0).compute()
                variant_selection = variant_selection[:, self.sample_bools[chromosome]]
                self.fields[entry][chromosome] = variant_selection
                
                
Z = Zarr_Parse(sampleList, user_snp_df)
Z.extract_fields('calldata', 'GT')
Z.extract_fields('calldata', 'DP')
Z.extract_fields('calldata', 'AD')


#appended_freq = pd.concat(frequencies)
#print(appended_freq)
#appended_freq.to_csv('~/Desktop/Pf3k_wgsSenegalBCOverlapFrequencies24bc.txt') 

#appended_geno = pd.concat(genotypes)
#print(appended_geno)
#appended_geno.to_csv('~/Desktop/Pf3k_wgsSenegalBCOverlapsHaploidGenotype24bc.txt')  



C:/Users/wwong/Dropbox (IDM)/parasite_genetics/genomics/pf3k_zarr/SNP_INDEL_Pf3D7_01_v3.zarr


TypeError: bad number of dimensions: expected 2; found 3

In [16]:
da.from_zarr(Z.callset[1]['calldata/AD'])

AttributeError: module 'dask.array' has no attribute 'from_zarr'

In [174]:
# get the number of reads that align to each of the alleles
ad_zarr=Z.callset[1]['calldata/AD']

ad_array=da.from_zarr(ad_zarr)
ad_variant_selection=ad_array[variant_bool]
ad_variant_selection=ad_variant_selection[:,sample_bool,:]

AttributeError: module 'dask.array' has no attribute 'from_zarr'

In [17]:
da.fr

dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__path__', '__file__', '__cached__', '__builtins__', 'absolute_import', 'division', 'print_function', 'ignoring', 'numpy_compat', 'chunk', 'slicing', 'optimization', 'core', 'Array', 'block', 'concatenate', 'stack', 'from_array', 'store', 'map_blocks', 'atop', 'to_hdf5', 'to_npy_stack', 'from_npy_stack', 'from_delayed', 'asarray', 'asanyarray', 'broadcast_to', 'wrap', 'creation', 'routines', 'take', 'choose', 'argwhere', 'where', 'coarsen', 'insert', 'ravel', 'roll', 'unique', 'squeeze', 'topk', 'ptp', 'diff', 'ediff1d', 'bincount', 'digitize', 'histogram', 'cov', 'array', 'dstack', 'vstack', 'hstack', 'compress', 'extract', 'round', 'count_nonzero', 'flatnonzero', 'nonzero', 'around', 'isnull', 'notnull', 'isclose', 'allclose', 'corrcoef', 'swapaxes', 'tensordot', 'transpose', 'dot', 'vdot', 'matmul', 'apply_along_axis', 'apply_over_axes', 'result_type', 'atleast_1d', 'atleast_2d', 'atleast_3d', 'flip', 'flipu

In [154]:
# pull total number of high quality reads that align to each position per sample
dp_zarr = callset['calldata/DP']
dp_dask = allel.AlleleCountsDaskArray(dp_zarr)
dp_variant_selection = dp_dask.compress(variant_bool, axis=0).compute()
dp_variant_selection = dp_variant_selection[:, sample_bool]
print(dp_variant_selection)


dp_dask = allel.AlleleCountsDaskArray(dp_zarr)

user_chrom_snps = user_snps(user_snp_df, chrom)
variant_bool = overlap_filter(overlap_avail(user_chrom_snps, callset), variant_filter(callset, 2, 2))
dp_variant_selection = dp_dask.compress(variant_bool, axis=0).compute()
dp_variant_selection = dp_variant_selection[:, sample_bool]

                                      

    

[[ 38  89  23  58  47  47  44  40 135  74 119 270 108 174 213  44  66  35
   35 309  25  82 242 273 101  70  58  84  45 352 197  37  42  18 110 112
  105  45 239 130  32  55  29  65  63  37  35  20 155  58 125 174  33 141
  234 118 213  18 131 162 185  70 221  27 165  36 258 534 223  50  88 128
   56 107 122 263 423 122  23  40  40  87  52 402 199 194  92  35  72 383
   60  67 148 543 179 118  43  44  36  35  42 221 148 282  36  48  87 269
  130 239 284 315 166  41 191 235 274 197  36  36 130  41 145  76  41  36
  115 103  98  94 159 153  36  36  49  84 105]
 [ 73  96  66  78 146  67 154  67 121  66 163 411 109 216 253  43  91  32
  130 387  23  86 299 355 116 535  76 650  26 406 249  37 287  26 117 181
  147 107 518 138  34  53  45  53 355 233 140  21 392  69  92 201  52 153
  282 204 384  11 193 161 197 257 219  25 249  32 327 695 235  49  81 113
  514 144  93 208 302  77  19 150  81  83 372 690 199 248 113 127  83 402
  545 112 178 594 124 793 128 165  67 156 535 357 233 347  37  37