# Haplotype alignments

Alignment of haplotypes around the *Ace1* duplication for further phylogenetic analysis.

## Input

Input files:

In [1]:
# input data
# root folder of the Ag1000G phase2.AR1 data; edit this single line to relocate the data
ag1k_dir    = "/home/xavi/Documents/VariationAg1k/data/phase2.AR1/"
outdir      = "results_admixture_phylo/"
metasam_fn  = "metadata/samples.meta_phenotypes_acegenotype.simple.txt"
callset_fn  = ag1k_dir + "variation/main/zarr2/ag1000g.phase2.ar1.pass/"
accessi_fn  = ag1k_dir + "accessibility/accessibility.h5"
haploty_fn  = ag1k_dir + "haplotypes/zarr2/ag1000g.phase2.ar1.samples/"
snpeff_fn   = ag1k_dir + "snpeff/zarr2/"
gffann_fn   = "metadata/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.9.gff3"

# define populations
popl    = ["BFcol","BFgam","CIcol","GHcol","GHgam","GNgam"]  # populations to keep
popc    = "population"                                       # metadata column holding the population
sub1l   = popl                                               # secondary grouping: same values...
sub1c   = popc                                               # ...and same column as the populations
chrom   = "2R"

# exclude these samples
excludec   = "ox_code"    # metadata column matched against the exclusion list
# one entry per excluded value; NB list("NO RES") would split the string into single characters
excludel   = ["NO RES"]

Libraries:

In [2]:
import itertools
import os

import allel
import h5py
import numpy as np
import pandas as pd
import zarr

## Load data

### Genotypes, haplotypes, variants & samples

Load the sample metadata and define the population and sample structure:

In [3]:
# load samples list with sample code, groupings, locations etc.
samples_df   = pd.read_csv(metasam_fn, sep='\t')

# keep samples that belong to the requested populations and sub-groupings,
# and whose `excludec` value is not in the exclusion list
samples_bool = (
    samples_df[popc].isin(popl).values &
    samples_df[sub1c].isin(sub1l).values &
    ~samples_df[excludec].isin(excludel).values
)
# new frame re-indexed from zero (no in-place change on a slice of samples_df),
# so that index labels are also row positions
samples_sub  = samples_df[samples_bool].reset_index(drop=True)

# indexed dictionary of populations: population name -> list of sample indices
popdict = {
    popi: samples_sub[samples_sub[popc] == popi].index.tolist()
    for popi in popl
}

# add an extra population pooling the samples of every population in popl
popdict["all"] = [sami for popi in popl for sami in popdict[popi]]

# report
print("Data:")
print("* Samples     = ", samples_sub.shape[0])
print("* Populations = ", set(samples_sub[popc]))
# group once by the configured population column
print(samples_sub.groupby(popc).size())


Data:
* Samples     =  345
* Populations =  {'GNgam', 'CIcol', 'GHgam', 'BFcol', 'GHcol', 'BFgam'}
population  population
BFcol       BFcol         75
BFgam       BFgam         92
CIcol       CIcol         71
GHcol       GHcol         55
GHgam       GHgam         12
GNgam       GNgam         40
dtype: int64




Load the accessibility data for the chromosome of interest (variants and haplotypes are loaded per region further below):

In [4]:
# Accessibility
print("Load accessibility array...")
# read the whole "is_accessible" dataset of this chromosome into memory; the HDF5 file
# is closed on leaving the block, so only accessi_arr should be used afterwards
with h5py.File(accessi_fn,mode="r") as accessi_df:
    accessi_arr = accessi_df[chrom]["is_accessible"][:]

Load accessibility array...


## Divergence

Calculate divergence of genotype frequencies between species and duplicated sequences:

In [5]:
# flag samples whose "pop_dup" value contains the string "TRUE"
has_dup = np.array(["TRUE" in dup_flag for dup_flag in samples_sub["pop_dup"].values])
no_dup  = np.logical_not(has_dup)

# "m_s" codes used to split the non-flagged samples into the col / gam groups
is_col  = samples_sub["m_s"] == "M"
is_gam  = samples_sub["m_s"] == "S"

# dictionary with species (nondup) and dups: group name -> list of sample indices
popdict_div = {
    "dup": np.where(has_dup)[0].tolist(),
    "col": samples_sub[np.logical_and(is_col, no_dup)].index.tolist(),
    "gam": samples_sub[np.logical_and(is_gam, no_dup)].index.tolist(),
}
# pooled group, concatenated in the order dup, gam, col
popdict_div["all"] = popdict_div["dup"] + popdict_div["gam"] + popdict_div["col"]

Region of interest:

In [6]:
# coordinates of the duplication: (start, end)
ace_dups, ace_dupe = 3436800, 3639600

In [7]:
# window used for the divergence statistics: the duplication padded by 1e5 on each side
export_name  = "duplication"
export_end   = ace_dupe + 1e5
export_start = ace_dups - 1e5

Load data from region of interest:

In [8]:
# haplotypes: variants
hapcall     = zarr.open(haploty_fn)
print("Load haplotype variants...")
hapcall_var = hapcall[chrom]["variants"]
hapvars     = allel.VariantChunkedTable(hapcall_var,names=["POS","REF","ALT"],index="POS")
# read the positions once, then keep variants inside [export_start, export_end]
hap_pos     = hapvars["POS"][:]
hap_bool    = np.logical_and(hap_pos >= export_start, hap_pos <= export_end)
hapvars_sub = hapvars.compress(hap_bool)

# haplotypes: phased genotypes
print("Load haplotype haplotypes...")
hapcall_gen = hapcall[chrom]["calldata/genotype"]
haploty_gen = allel.GenotypeChunkedArray(hapcall_gen)
# find samples in haplotype dataset that coincide with genotypes
haploty_sam = hapcall[chrom]["samples"][:].astype(str)
hapsam_bool = np.isin(haploty_sam, np.array(samples_sub["ox_code"]))
haploty_sub = haploty_gen.subset(sel0=hap_bool,sel1=hapsam_bool)

# popdict_div stores row positions of samples_sub, whereas the columns of haploty_sub follow
# the sample order of the haplotype callset. Translate every index through the sample code so
# each group points at the right columns even if the two orders differ or a sample has no
# phased data (identical to popdict_div when orders and membership coincide).
hapcol_of_code  = {code: coli for coli, code in enumerate(haploty_sam[hapsam_bool])}
sample_codes    = samples_sub["ox_code"].values
popdict_div_hap = {
    popi: [hapcol_of_code[code] for code in sample_codes[popidx] if code in hapcol_of_code]
    for popi, popidx in popdict_div.items()
}

# calculate population allele counts
hapalco_sub = haploty_sub.count_alleles_subpops(subpops=popdict_div_hap)

# filter haplotypes: segregating alleles, and each of the first two alleles seen more than
# twice in the pooled group (stricter than dropping singletons only)
is_hapseg   = hapalco_sub["all"].is_segregating()[:]
is_hapnosing= hapalco_sub["all"][:,:2].min(axis=1)>2
filhap_bool = (is_hapseg & is_hapnosing)

# subset
print("Subset haps...")
haploty_seg = haploty_sub.compress(filhap_bool)
hapvars_seg = hapvars_sub.compress(filhap_bool)
hapalco_seg = hapalco_sub.compress(filhap_bool)

Load haplotype variants...
Load haplotype haplotypes...
Subset haps...


Pairwise differences:

In [9]:
# allele counts of the "dup" group, compared below against each of the other two groups
ac_dup = hapalco_seg["dup"]

# pairwise difference between dup and gam / dup and col
mpd_dg = allel.mean_pairwise_difference_between(ac1=ac_dup, ac2=hapalco_seg["gam"])
mpd_dc = allel.mean_pairwise_difference_between(ac1=ac_dup, ac2=hapalco_seg["col"])

# jackknife of the mean over those values; elements [0] and [1] are reported below
mpd_dg_est, mpd_dc_est = (
    allel.stats.misc.jackknife(mpd_vals, np.mean) for mpd_vals in (mpd_dg, mpd_dc)
)

# report
print("dif gam to dup = %.7f +/- %.7f " % (mpd_dg_est[0], mpd_dg_est[1]))
print("dif col to dup = %.7f +/- %.7f " % (mpd_dc_est[0], mpd_dc_est[1]))

dif gam to dup = 0.0830800 +/- 0.0009444 
dif col to dup = 0.0850911 +/- 0.0009654 


Dxy divergence:

In [10]:
# settings shared by both comparisons: variant positions, window size and accessibility mask
dxy_args = dict(pos=hapvars_seg["POS"], size=100, is_accessible=accessi_arr)

# dxy per window, dup against gam and dup against col
dxy_dg = allel.windowed_divergence(ac1=hapalco_seg["dup"], ac2=hapalco_seg["gam"], **dxy_args)
dxy_dc = allel.windowed_divergence(ac1=hapalco_seg["dup"], ac2=hapalco_seg["col"], **dxy_args)

# jackknife over the first element returned by windowed_divergence, ignoring NaNs
dxy_dg_est, dxy_dc_est = (
    allel.stats.misc.jackknife(dxy_out[0], np.nanmean) for dxy_out in (dxy_dg, dxy_dc)
)

# report
print("dxy gam to dup = %.7f +/- %.7f " % (dxy_dg_est[0], dxy_dg_est[1]))
print("dxy col to dup = %.7f +/- %.7f " % (dxy_dc_est[0], dxy_dc_est[1]))

dxy gam to dup = 0.0088033 +/- 0.0003273 
dxy col to dup = 0.0090649 +/- 0.0003450 


PBS relative to duplicated sequences:

In [11]:
# allele counts of the three groups entering the PBS calculation
ac_col = hapalco_seg["col"]
ac_gam = hapalco_seg["gam"]
ac_dup = hapalco_seg["dup"]

# PBS with col (resp. gam) as first population, the other species second and dup third
pbs_col = allel.pbs(ac1=ac_col, ac2=ac_gam, ac3=ac_dup, window_size=100)
pbs_gam = allel.pbs(ac1=ac_gam, ac2=ac_col, ac3=ac_dup, window_size=100)

# jack-knifing of the NaN-ignoring mean
pbs_col_est, pbs_gam_est = (
    allel.stats.misc.jackknife(pbs_vals, np.nanmean) for pbs_vals in (pbs_col, pbs_gam)
)

# report
print("PBS col = %.7f +/- %.7f" % (pbs_col_est[0], pbs_col_est[1]))
print("PBS gam = %.7f +/- %.7f" % (pbs_gam_est[0], pbs_gam_est[1]))

PBS col = 0.0273723 +/- 0.0014900
PBS gam = 0.0102802 +/- 0.0011229


## Export alignment

Export `FASTA` alignments of haplotypes (one `hapalignment_<region>.fasta` file per region).

First, the entire duplication:

In [12]:
# export window: from the start to the end of the duplication
export_name = "duplication"
export_start, export_end = ace_dups, ace_dupe

In [13]:
poplhap = popl

# haplotypes: variants
hapcall     = zarr.open(haploty_fn)
print("Load haplotype variants...")
hapcall_var = hapcall[chrom]["variants"]
hapvars     = allel.VariantChunkedTable(hapcall_var,names=["POS","REF","ALT"],index="POS")
# read the positions once, then keep variants inside [export_start, export_end]
hap_pos     = hapvars["POS"][:]
hap_bool    = np.logical_and(hap_pos >= export_start, hap_pos <= export_end)
hapvars_sub = hapvars.compress(hap_bool)

# haplotypes: phased genotypes
print("Load haplotype haplotypes...")
hapcall_gen = hapcall[chrom]["calldata/genotype"]
haploty_gen = allel.GenotypeChunkedArray(hapcall_gen)
# find samples in haplotype dataset that coincide with genotypes
haploty_sam = hapcall[chrom]["samples"][:].astype(str)
hapsam_bool = np.isin(haploty_sam, np.array(samples_sub["ox_code"]))
haploty_sub = haploty_gen.subset(sel0=hap_bool,sel1=hapsam_bool)

# recast haplotypes: drop ploidy
print("Drop ploidy haplotypes...")
haploty_sub_hap = haploty_sub.to_haplotypes()

# haplotype dicts
# arrays of hap ids and populations of each hap (two entries, "a" and "b", per selected sample)
print("Samples dictionary for haps...")
is_samp_in_hap = np.isin(np.array(samples_sub["ox_code"]),haploty_sam)
hap_samples    = haploty_sam[hapsam_bool]
hap_ids        = np.array([sami + hapi for sami in hap_samples for hapi in ("a", "b")])
# look the population up by sample code rather than by position, so ids and populations
# stay paired even if the metadata and the haplotype callset list samples in different orders
pop_of_sample  = samples_sub.set_index("ox_code")[popc]
hap_pops       = np.repeat(pop_of_sample.loc[hap_samples].values, 2).astype(str)
hap_pops_df    = pd.DataFrame(data={ popc : hap_pops , "ids" : hap_ids})

# pop dicts for haplotype data: population name -> list of haplotype indices
popdicthap = {
    popi: hap_pops_df[hap_pops_df[popc] == popi].index.tolist()
    for popi in poplhap
}
popdicthap["all"] = [hapi for popi in poplhap for hapi in popdicthap[popi]]

# haplotypes: allele counts
print("Allele counts haplotypes...")
hapalco_sub = haploty_sub_hap.count_alleles_subpops(subpops=popdicthap)

# filter haplotypes: segregating alleles, and each of the first two alleles seen more than
# twice in the pooled group (stricter than dropping singletons only)
is_hapseg   = hapalco_sub["all"].is_segregating()[:]
is_hapnosing= hapalco_sub["all"][:,:2].min(axis=1)>2
filhap_bool = (is_hapseg & is_hapnosing)

# subset
print("Subset haps...")
haploty_seg = haploty_sub_hap.compress(filhap_bool)
hapvars_seg = hapvars_sub.compress(filhap_bool)
hapalco_seg = hapalco_sub.compress(filhap_bool)

# refs and alts
hapvars_seg_REF = hapvars_seg["REF"][:].astype(str)
hapvars_seg_ALT = hapvars_seg["ALT"][:].astype(str)


# output
print("FASTA...")
# variants x haplotypes matrix; allele 0 is written as REF, anything else as ALT
# (assumes REF and ALT hold one allele string per variant -- TODO confirm for other callsets)
hap_is_ref = np.asarray(haploty_seg[:]) == 0
hap_seqs   = [
    "".join(np.where(hap_is_ref[:, pn], hapvars_seg_REF, hapvars_seg_ALT))
    for pn in range(len(hap_pops_df))
]
# build the table in one go instead of filling a NaN column by chained assignment
happhy = pd.DataFrame({
    "hap": ">"+hap_pops_df["ids"]+"_"+hap_pops_df[popc],
    "seq": hap_seqs},
    columns=["hap", "seq"])

# write one header line and one sequence line per haplotype
os.makedirs(outdir, exist_ok=True)
with open("%s/hapalignment_%s.fasta" % (outdir,export_name), "w") as fasta_out:
    for hap_name, hap_seq in zip(happhy["hap"], happhy["seq"]):
        fasta_out.write("%s\n%s\n" % (hap_name, hap_seq))

Load haplotype variants...
Load haplotype haplotypes...
Drop ploidy haplotypes...
Samples dictionary for haps...
Allele counts haplotypes...
Subset haps...
FASTA...


Second, an admixed region **just** downstream of the duplication:

In [14]:
# export window: starts at the end of the duplication and extends 5e4 past it
export_name  = "breakdodu"
export_start = ace_dupe
export_end   = export_start + 5e4

In [15]:
poplhap = popl

# haplotypes: variants
hapcall     = zarr.open(haploty_fn)
print("Load haplotype variants...")
hapcall_var = hapcall[chrom]["variants"]
hapvars     = allel.VariantChunkedTable(hapcall_var,names=["POS","REF","ALT"],index="POS")
# read the positions once, then keep variants inside [export_start, export_end]
hap_pos     = hapvars["POS"][:]
hap_bool    = np.logical_and(hap_pos >= export_start, hap_pos <= export_end)
hapvars_sub = hapvars.compress(hap_bool)

# haplotypes: phased genotypes
print("Load haplotype haplotypes...")
hapcall_gen = hapcall[chrom]["calldata/genotype"]
haploty_gen = allel.GenotypeChunkedArray(hapcall_gen)
# find samples in haplotype dataset that coincide with genotypes
haploty_sam = hapcall[chrom]["samples"][:].astype(str)
hapsam_bool = np.isin(haploty_sam, np.array(samples_sub["ox_code"]))
haploty_sub = haploty_gen.subset(sel0=hap_bool,sel1=hapsam_bool)

# recast haplotypes: drop ploidy
print("Drop ploidy haplotypes...")
haploty_sub_hap = haploty_sub.to_haplotypes()

# haplotype dicts
# arrays of hap ids and populations of each hap (two entries, "a" and "b", per selected sample)
print("Samples dictionary for haps...")
is_samp_in_hap = np.isin(np.array(samples_sub["ox_code"]),haploty_sam)
hap_samples    = haploty_sam[hapsam_bool]
hap_ids        = np.array([sami + hapi for sami in hap_samples for hapi in ("a", "b")])
# look the population up by sample code rather than by position, so ids and populations
# stay paired even if the metadata and the haplotype callset list samples in different orders
pop_of_sample  = samples_sub.set_index("ox_code")[popc]
hap_pops       = np.repeat(pop_of_sample.loc[hap_samples].values, 2).astype(str)
hap_pops_df    = pd.DataFrame(data={ popc : hap_pops , "ids" : hap_ids})

# pop dicts for haplotype data: population name -> list of haplotype indices
popdicthap = {
    popi: hap_pops_df[hap_pops_df[popc] == popi].index.tolist()
    for popi in poplhap
}
popdicthap["all"] = [hapi for popi in poplhap for hapi in popdicthap[popi]]

# haplotypes: allele counts
print("Allele counts haplotypes...")
hapalco_sub = haploty_sub_hap.count_alleles_subpops(subpops=popdicthap)

# filter haplotypes: segregating alleles, and each of the first two alleles seen more than
# twice in the pooled group (stricter than dropping singletons only)
is_hapseg   = hapalco_sub["all"].is_segregating()[:]
is_hapnosing= hapalco_sub["all"][:,:2].min(axis=1)>2
filhap_bool = (is_hapseg & is_hapnosing)

# subset
print("Subset haps...")
haploty_seg = haploty_sub_hap.compress(filhap_bool)
hapvars_seg = hapvars_sub.compress(filhap_bool)
hapalco_seg = hapalco_sub.compress(filhap_bool)

# refs and alts
hapvars_seg_REF = hapvars_seg["REF"][:].astype(str)
hapvars_seg_ALT = hapvars_seg["ALT"][:].astype(str)


# output
print("FASTA...")
# variants x haplotypes matrix; allele 0 is written as REF, anything else as ALT
# (assumes REF and ALT hold one allele string per variant -- TODO confirm for other callsets)
hap_is_ref = np.asarray(haploty_seg[:]) == 0
hap_seqs   = [
    "".join(np.where(hap_is_ref[:, pn], hapvars_seg_REF, hapvars_seg_ALT))
    for pn in range(len(hap_pops_df))
]
# build the table in one go instead of filling a NaN column by chained assignment
happhy = pd.DataFrame({
    "hap": ">"+hap_pops_df["ids"]+"_"+hap_pops_df[popc],
    "seq": hap_seqs},
    columns=["hap", "seq"])

# write one header line and one sequence line per haplotype
os.makedirs(outdir, exist_ok=True)
with open("%s/hapalignment_%s.fasta" % (outdir,export_name), "w") as fasta_out:
    for hap_name, hap_seq in zip(happhy["hap"], happhy["seq"]):
        fasta_out.write("%s\n%s\n" % (hap_name, hap_seq))

Load haplotype variants...
Load haplotype haplotypes...
Drop ploidy haplotypes...
Samples dictionary for haps...
Allele counts haplotypes...
Subset haps...
FASTA...


Now, an unadmixed region upstream of the duplication:

In [16]:
# export window: 5e4 long, starting 1e6 before the start of the duplication
export_name  = "upstream"
export_start = ace_dups - 1e6
export_end   = export_start + 5e4

In [17]:
poplhap = popl

# haplotypes: variants
hapcall     = zarr.open(haploty_fn)
print("Load haplotype variants...")
hapcall_var = hapcall[chrom]["variants"]
hapvars     = allel.VariantChunkedTable(hapcall_var,names=["POS","REF","ALT"],index="POS")
# read the positions once, then keep variants inside [export_start, export_end]
hap_pos     = hapvars["POS"][:]
hap_bool    = np.logical_and(hap_pos >= export_start, hap_pos <= export_end)
hapvars_sub = hapvars.compress(hap_bool)

# haplotypes: phased genotypes
print("Load haplotype haplotypes...")
hapcall_gen = hapcall[chrom]["calldata/genotype"]
haploty_gen = allel.GenotypeChunkedArray(hapcall_gen)
# find samples in haplotype dataset that coincide with genotypes
haploty_sam = hapcall[chrom]["samples"][:].astype(str)
hapsam_bool = np.isin(haploty_sam, np.array(samples_sub["ox_code"]))
haploty_sub = haploty_gen.subset(sel0=hap_bool,sel1=hapsam_bool)

# recast haplotypes: drop ploidy
print("Drop ploidy haplotypes...")
haploty_sub_hap = haploty_sub.to_haplotypes()

# haplotype dicts
# arrays of hap ids and populations of each hap (two entries, "a" and "b", per selected sample)
print("Samples dictionary for haps...")
is_samp_in_hap = np.isin(np.array(samples_sub["ox_code"]),haploty_sam)
hap_samples    = haploty_sam[hapsam_bool]
hap_ids        = np.array([sami + hapi for sami in hap_samples for hapi in ("a", "b")])
# look the population up by sample code rather than by position, so ids and populations
# stay paired even if the metadata and the haplotype callset list samples in different orders
pop_of_sample  = samples_sub.set_index("ox_code")[popc]
hap_pops       = np.repeat(pop_of_sample.loc[hap_samples].values, 2).astype(str)
hap_pops_df    = pd.DataFrame(data={ popc : hap_pops , "ids" : hap_ids})

# pop dicts for haplotype data: population name -> list of haplotype indices
popdicthap = {
    popi: hap_pops_df[hap_pops_df[popc] == popi].index.tolist()
    for popi in poplhap
}
popdicthap["all"] = [hapi for popi in poplhap for hapi in popdicthap[popi]]

# haplotypes: allele counts
print("Allele counts haplotypes...")
hapalco_sub = haploty_sub_hap.count_alleles_subpops(subpops=popdicthap)

# filter haplotypes: segregating alleles, and each of the first two alleles seen more than
# twice in the pooled group (stricter than dropping singletons only)
is_hapseg   = hapalco_sub["all"].is_segregating()[:]
is_hapnosing= hapalco_sub["all"][:,:2].min(axis=1)>2
filhap_bool = (is_hapseg & is_hapnosing)

# subset
print("Subset haps...")
haploty_seg = haploty_sub_hap.compress(filhap_bool)
hapvars_seg = hapvars_sub.compress(filhap_bool)
hapalco_seg = hapalco_sub.compress(filhap_bool)

# refs and alts
hapvars_seg_REF = hapvars_seg["REF"][:].astype(str)
hapvars_seg_ALT = hapvars_seg["ALT"][:].astype(str)


# output
print("FASTA...")
# variants x haplotypes matrix; allele 0 is written as REF, anything else as ALT
# (assumes REF and ALT hold one allele string per variant -- TODO confirm for other callsets)
hap_is_ref = np.asarray(haploty_seg[:]) == 0
hap_seqs   = [
    "".join(np.where(hap_is_ref[:, pn], hapvars_seg_REF, hapvars_seg_ALT))
    for pn in range(len(hap_pops_df))
]
# build the table in one go instead of filling a NaN column by chained assignment
happhy = pd.DataFrame({
    "hap": ">"+hap_pops_df["ids"]+"_"+hap_pops_df[popc],
    "seq": hap_seqs},
    columns=["hap", "seq"])

# write one header line and one sequence line per haplotype
os.makedirs(outdir, exist_ok=True)
with open("%s/hapalignment_%s.fasta" % (outdir,export_name), "w") as fasta_out:
    for hap_name, hap_seq in zip(happhy["hap"], happhy["seq"]):
        fasta_out.write("%s\n%s\n" % (hap_name, hap_seq))

Load haplotype variants...
Load haplotype haplotypes...
Drop ploidy haplotypes...
Samples dictionary for haps...
Allele counts haplotypes...
Subset haps...
FASTA...


Now, a region downstream of the duplication:

In [18]:
# export window: 5e4 long, starting 1e6 after the end of the duplication
export_name  = "downstream"
export_start = ace_dupe + 1e6
export_end   = export_start + 5e4

In [19]:
poplhap = popl

# haplotypes: variants
hapcall     = zarr.open(haploty_fn)
print("Load haplotype variants...")
hapcall_var = hapcall[chrom]["variants"]
hapvars     = allel.VariantChunkedTable(hapcall_var,names=["POS","REF","ALT"],index="POS")
# read the positions once, then keep variants inside [export_start, export_end]
hap_pos     = hapvars["POS"][:]
hap_bool    = np.logical_and(hap_pos >= export_start, hap_pos <= export_end)
hapvars_sub = hapvars.compress(hap_bool)

# haplotypes: phased genotypes
print("Load haplotype haplotypes...")
hapcall_gen = hapcall[chrom]["calldata/genotype"]
haploty_gen = allel.GenotypeChunkedArray(hapcall_gen)
# find samples in haplotype dataset that coincide with genotypes
haploty_sam = hapcall[chrom]["samples"][:].astype(str)
hapsam_bool = np.isin(haploty_sam, np.array(samples_sub["ox_code"]))
haploty_sub = haploty_gen.subset(sel0=hap_bool,sel1=hapsam_bool)

# recast haplotypes: drop ploidy
print("Drop ploidy haplotypes...")
haploty_sub_hap = haploty_sub.to_haplotypes()

# haplotype dicts
# arrays of hap ids and populations of each hap (two entries, "a" and "b", per selected sample)
print("Samples dictionary for haps...")
is_samp_in_hap = np.isin(np.array(samples_sub["ox_code"]),haploty_sam)
hap_samples    = haploty_sam[hapsam_bool]
hap_ids        = np.array([sami + hapi for sami in hap_samples for hapi in ("a", "b")])
# look the population up by sample code rather than by position, so ids and populations
# stay paired even if the metadata and the haplotype callset list samples in different orders
pop_of_sample  = samples_sub.set_index("ox_code")[popc]
hap_pops       = np.repeat(pop_of_sample.loc[hap_samples].values, 2).astype(str)
hap_pops_df    = pd.DataFrame(data={ popc : hap_pops , "ids" : hap_ids})

# pop dicts for haplotype data: population name -> list of haplotype indices
popdicthap = {
    popi: hap_pops_df[hap_pops_df[popc] == popi].index.tolist()
    for popi in poplhap
}
popdicthap["all"] = [hapi for popi in poplhap for hapi in popdicthap[popi]]

# haplotypes: allele counts
print("Allele counts haplotypes...")
hapalco_sub = haploty_sub_hap.count_alleles_subpops(subpops=popdicthap)

# filter haplotypes: segregating alleles, and each of the first two alleles seen more than
# twice in the pooled group (stricter than dropping singletons only)
is_hapseg   = hapalco_sub["all"].is_segregating()[:]
is_hapnosing= hapalco_sub["all"][:,:2].min(axis=1)>2
filhap_bool = (is_hapseg & is_hapnosing)

# subset
print("Subset haps...")
haploty_seg = haploty_sub_hap.compress(filhap_bool)
hapvars_seg = hapvars_sub.compress(filhap_bool)
hapalco_seg = hapalco_sub.compress(filhap_bool)

# refs and alts
hapvars_seg_REF = hapvars_seg["REF"][:].astype(str)
hapvars_seg_ALT = hapvars_seg["ALT"][:].astype(str)


# output
print("FASTA...")
# variants x haplotypes matrix; allele 0 is written as REF, anything else as ALT
# (assumes REF and ALT hold one allele string per variant -- TODO confirm for other callsets)
hap_is_ref = np.asarray(haploty_seg[:]) == 0
hap_seqs   = [
    "".join(np.where(hap_is_ref[:, pn], hapvars_seg_REF, hapvars_seg_ALT))
    for pn in range(len(hap_pops_df))
]
# build the table in one go instead of filling a NaN column by chained assignment
happhy = pd.DataFrame({
    "hap": ">"+hap_pops_df["ids"]+"_"+hap_pops_df[popc],
    "seq": hap_seqs},
    columns=["hap", "seq"])

# write one header line and one sequence line per haplotype
os.makedirs(outdir, exist_ok=True)
with open("%s/hapalignment_%s.fasta" % (outdir,export_name), "w") as fasta_out:
    for hap_name, hap_seq in zip(happhy["hap"], happhy["seq"]):
        fasta_out.write("%s\n%s\n" % (hap_name, hap_seq))

Load haplotype variants...
Load haplotype haplotypes...
Drop ploidy haplotypes...
Samples dictionary for haps...
Allele counts haplotypes...
Subset haps...
FASTA...
