# impute_unified_genealogy

In [None]:
# python impute_unified_genealogy.py
# -t input.trees
# -i input vcf (sites to impute)
# -o output vcf (imputed sites)

In [None]:
# Pipeline to impute from modern genomes (unified genealogy) into ancient genomes:
# 1. Take the unified whole-genome genealogy from Wohns et al. (2022). DONE
# 2. Get an ancestors ts from it. (Keep leaves or not? Try both). DONE
# 3. Convert GLIMPSE output BCF to raw input BCF. DONE.
# 4. Get `.samples` file from input BCF.
# 5. match_samples -> output ts
# 6. ts.write_vcf()
# 
# Bonus:
# Impute from Wohns ts into the new 1KG Project Phase 3 data and evaluate using IQS.

### Downloading the unified genealogy

In [None]:
# "[A] unified tree sequence of 3601 modern and eight high-coverage ancient human genome sequences
# compiled from eight datasets. This structure is a lossless and compact representation of
# 27 million ancestral haplotype fragments and 231 million ancestral lineages linking genomes
# from these datasets back in time."
# 
# Modern genomes:
# a. 1000 Genomes
# b. Human Genome Diversity
# c. Simons Genome Diversity
# 
# Trees:
# a. Without ancient genomes (https://zenodo.org/record/5495535#.YuGjXi8w08R)
# b. With___ ancient genomes (https://zenodo.org/record/5512994#.YuGj4S8w08R)
# 
# Human reference: GRCh38

In [1]:
import tskit

# Load the tree sequences of the p- (short) and q- (long) arms of one contig.
data_dir = "../data/ref_panel/"

contig_id = "chr20"

# One file per chromosome arm: hgdp_tgp_sgdp_<contig>_<arm>.dated.trees
ts_p_file = f"{data_dir}hgdp_tgp_sgdp_{contig_id}_p.dated.trees"
ts_q_file = f"{data_dir}hgdp_tgp_sgdp_{contig_id}_q.dated.trees"

ts_p = tskit.load(ts_p_file)
ts_q = tskit.load(ts_q_file)

# Report the contig from `contig_id` so the message stays correct if the
# contig is changed above.
print(f"The short arm of {contig_id} contains {ts_p.num_trees} trees.")
print(f"The long arm of {contig_id} contains {ts_q.num_trees} trees.")

The short arm of chr20 contains 58756 trees.
The long arm of chr20 contains 74301 trees.


In [2]:
import tsinfer

# Derive an ancestors tree sequence from each chromosome arm of the unified
# genealogy (step 2 of the pipeline notes). Only the input tree sequence is
# passed, so any other options of make_ancestors_ts stay at their defaults.
# NOTE(review): the pipeline notes ask whether leaves should be kept or not;
# confirm which behaviour the default gives for the installed tsinfer version.
ts_p_anc = tsinfer.eval_util.make_ancestors_ts(ts_p)
ts_q_anc = tsinfer.eval_util.make_ancestors_ts(ts_q)

In [2]:
# Simulated data from 48 high coverage shotgun (SG) samples.
# 
# Annotation file:
# http://genomebrowser-uploads.hms.harvard.edu/data/aa681/tmp/anno.sim.tsv
# 
# Simulated data files:
# http://genomebrowser-uploads.hms.harvard.edu/data/aa681/tmp/simulations.tar
# 
# MD5 checksum:
# 3a98628e5c8015b3da4664837568e14c  simulations.tar
# e332d36afc87431d08453bc66b0eee28  anno.sim.tsv
# 
# File size:
# 8.1K anno.sim.tsv
# 166G simulations.tar
# 
# There are 374 bcf files, each with 48 samples.
# I have simulated both SG and 1240k for 22 autosomes and for 8 different coverages.
# 
# */merged/{DT}/DP{DP}/chr{CHROM}.bcf
# 
# DT in [SG, 1240k]
# DP in [0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5]
# CHROM in [1,2,...,22]
# 
# Imputed version of the original sample is also available from
#   */merged/SG/DPmax/chr{CHROM}.bcf
# 
# All 4 members of the Afanasievo family are also included in this list:
#   Mother I3388.SG
#   Father I3950.SG
#   Son 1 I3949.SG
#   Son 2 I6714.SG
# 
# Please note that coverage is >20x, except for one individual from the Afanasievo family with ~10x coverage (I3388.SG).
# 
# The imputed file format is BCF with the following fields:
# 
# Generated by GLIMPSE:
#   GT: Phased and imputed genotypes
#   DS: Genotype dosage
#   GP: Genotype posteriors
#
# Generated by genotype caller (mpileup):
#   PL: Phred-scaled genotype likelihoods
#   AD: Allelic depths (high-quality bases)
# 
# * Note that PL and AD are available only when the target variant is covered by at least 1 read; otherwise, their values are missing.
# * To minimize reference bias, only the PLs of SNPs are used to build the imputation model for both SNPs and indels.
#   Nevertheless, PL and AD for indels are also reported in the output.

In [5]:
# Absolute project directory; this rebinds `data_dir`, which an earlier cell
# set to the reference-panel directory.
data_dir = "/Users/szhan/Projects/impute_unified_genealogy/"
# chr20 BCF under merged/SG/DPmax/ (presumably the "raw" BCF from step 3 of
# the pipeline notes -- confirm).
bcf_file = f"{data_dir}merged/SG/DPmax/raw_chr20.bcf"