In [1]:
from copy import deepcopy
import math
import numpy as np
import os
import random
import sys
import time

from IPython.display import SVG

sys.path.append("../modules/")
import mask_genotype
import parse_vcf

import demes
import tsinfer
from tsinfer import make_ancestors_ts
import tskit
import msprime
import stdpopsim
import cyvcf2

print(f"tskit {tskit.__version__}")
print(f"tsinfer {tsinfer.__version__}")
print(f"msprime {msprime.__version__}")
print(f"stdpopsim {stdpopsim.__version__}")
print(f"cyvcf2 {cyvcf2.__version__}")

tskit 0.4.1
tsinfer 0.2.3.dev9+gc8568d5
msprime 1.1.1
stdpopsim 0.1.2
cyvcf2 0.30.14


In [2]:
def print_sample_data_to_vcf(sample_data,
                             individuals,
                             samples,
                             ploidy_level,
                             mask,
                             out_vcf_file,
                             contig_id,
                             sequence_length_max = 1e12):
    """
    Fields:
    CHROM contig_id
    POS row index in genotype_matrix
    ID .
    REF ancestral allele
    ALT derived allele(s)
    QUAL .
    FILTER PASS
    INFO
    FORMAT GT
    individual 0
    individual 1
    ...
    individual n - 1; n = number of individuals
    """
    CHROM = contig_id
    ID = '.'
    QUAL = '.'
    FILTER = 'PASS'
    FORMAT = 'GT'
    
    assert ploidy_level == 1 or ploidy_level == 2,\
        f"Specified ploidy_level {ploidy_level} is not recognized."
    
    assert ploidy_level * len(individuals) == len(samples),\
        f"Some individuals may not have the same ploidy level of {ploidy_level}."
    
    # Assume that both sample and individual ids are ordered the same way.
    #individual_id_map = np.repeat(individuals, 2)
    
    header  = "##fileformat=VCFv4.2\n"\
            + "##source=tskit " + tskit.__version__ + "\n"\
            + "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">\n"\
            + "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
    header += "##contig=<ID=" + contig_id + "," + "length=" + str(int(ts.sequence_length)) + ">\n"
    header += "\t".join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']\
                        + ["s" + str(x) for x in individuals])
    
    with open(out_vcf_file, "w") as vcf:
        vcf.write(header + "\n")
        for i, variant in enumerate(ts.variants()):
            site_id = variant.site.id
            POS = int(np.round(variant.site.position))
            if POS > sequence_length_max:
                break
            # Since the tree sequence was produced using simulation,
            #    there's no reference sequence other than the ancestral sequence.
            REF = variant.site.ancestral_state
            alt_alleles = list(set(variant.alleles) - {REF})
            AA = variant.site.ancestral_state
            ALT = ",".join(alt_alleles) if len(alt_alleles) > 0 else "."
            INFO = "AA" + "=" + AA
            record = [str(x)
                      for x
                      in [CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT]]
            
            for j in individuals:
                #sample_ids = [samples[x]
                #              for x
                #              in np.where(individual_id_map == j)[0].tolist()]
                #genotype = "|".join([str(variant.genotypes[k])
                #                     for k
                #                     in sample_ids])
                if ploidy_level == 1:
                    genotype = str(variant.genotypes[j])
                else:
                    genotype = str(variant.genotypes[2 * j]) + "|" + str(variant.genotypes[2 * j + 1])
                    
                if mask is not None and mask.query_position(individual = j, position = POS) == True:
                    if ploidy_level == 1:
                        genotype = '.'
                    else:
                        genotype = '.|.' # Or "./."
                record += [genotype]
                
            vcf.write("\t".join(record) + "\n")

In [3]:
# Sourced and modified from:
# https://tsinfer.readthedocs.io/en/latest/tutorial.html#data-example
def get_chromosome_length(vcf):
    assert len(vcf.seqlens) == 1
    return vcf.seqlens[0]


def add_populations(vcf,
                    samples):
    """
    TODO
    """
    pop_ids = [sample_name[0] for sample_name in vcf.samples]
    pop_codes = np.unique(pop_ids)
    pop_lookup = {}
    for p in pop_codes:
        pop_lookup[p] = samples.add_population(metadata = {"name" : p})
    return [pop_lookup[pop_id] for pop_id in pop_ids]


def add_individuals(vcf,
                    samples,
                    ploidy_level,
                    populations):
    for name, population in zip(vcf.samples, populations):
        samples.add_individual(ploidy = ploidy_level,
                               metadata = {"name": name},
                               population = population)


def add_sites(vcf,
              samples,
              ploidy_level,
              warn_monomorphic_sites = False):
    """
    Read the sites in the VCF and add them to the samples object,
    reordering the alleles to put the ancestral allele first,
    if it is available.
    """
    assert ploidy_level == 1 or ploidy_level == 2,\
        f"ploidy_level {ploidy_level} is not recognized."
    
    pos = 0
    for variant in vcf:
        # Check for duplicate site positions.
        if pos == variant.POS:
            raise ValueError("Duplicate positions for variant at position", pos)
        else:
            pos = variant.POS
        # Check that the genotypes are phased.
        #if any([not phased for _, _, phased in variant.genotypes]):
        #    raise ValueError("Unphased genotypes for variant at position", pos)
        alleles = [variant.REF] + variant.ALT # Exactly as in the input VCF file.
        if warn_monomorphic_sites:
            if len(set(alleles) - {'.'}) == 1:
                print(f"Monomorphic site at {pos}")
        ancestral = variant.INFO.get("AA", variant.REF) # Dangerous action!!!
        # Ancestral state must be first in the allele list.
        ordered_alleles = [ancestral] + list(set(alleles) - {ancestral})
        # Create an index mapping from the input VCF to tsinfer input.
        allele_index = {
            old_index: ordered_alleles.index(allele)
            for old_index, allele in enumerate(alleles)
        }
        # When genotype is missing...
        if variant.num_unknown > 0:
            allele_index[-1] = tskit.MISSING_DATA
            ordered_alleles += [None]
        # Map original allele indexes to their indexes in the new alleles list.
        genotypes = [
            allele_index[old_index]
            for row in variant.genotypes # cyvcf2 uses -1 to indicate missing data.
            for old_index in row[0:ploidy_level] # Each is a 3-tuple (allele 1, allele 2, is phased?).
        ]
        samples.add_site(pos,
                         genotypes = genotypes,
                         alleles = ordered_alleles)


def create_sample_data_from_vcf_file(vcf_file):
    vcf = cyvcf2.VCF(vcf_file,
                     gts012 = False, # 0=HOM_REF, 1=HET, 2=UNKNOWN, 3=HOM_ALT
                     strict_gt = True)
    with tsinfer.SampleData(
        sequence_length = get_chromosome_length(vcf)
    ) as samples:
        populations = add_populations(vcf, samples)
        add_individuals(vcf, samples, ploidy_level, populations)
        add_sites(vcf, samples, ploidy_level)
    return(samples)

In [4]:
def parse_vcf_file(vcf_file):
    """
    If gt_types = False, then 0=HOM_REF, 1=HET, 2=UNKNOWN, the coordinates are 0-based.
    It returns a list of dictionaries, each containing a VCF record.
    """
    parsed_vcf = []
    for variant in cyvcf2.VCF(vcf_file,
                              gts012 = False, # 0=HOM_REF, 1=HET, 2=UNKNOWN, 3=HOM_ALT
                              strict_gt = True):
        record = {
            'ref': variant.REF,
            'alt': variant.ALT,
            'ctg': variant.CHROM, # Contig id/name
            'pos': int(variant.start),
            'aa' : variant.INFO.get('AA'), # Ancestral allele
            'gt' : variant.genotypes
        }
        parsed_vcf.append(record)
    return(parsed_vcf)


def compare_vcf(vcf_1, vcf_2):
    assert len(vcf_1) == len(vcf_2)
    for i in range(len(vcf_1)):
        is_valid_ref = vcf_1[i].get('ref') == vcf_2[i].get('ref')
        is_valid_alt = vcf_1[i].get('alt') == vcf_2[i].get('alt')
        is_valid_ctg = vcf_1[i].get('ctg') == vcf_2[i].get('ctg')
        is_valid_pos = vcf_1[i].get('pos') == vcf_2[i].get('pos')
        is_valid_aa  = vcf_1[i].get('aa' ) == vcf_2[i].get('aa' )
        is_all_valid = np.all([is_valid_ref,
                               is_valid_alt,
                               is_valid_ctg,
                               is_valid_pos,
                               is_valid_aa])
        if not is_all_valid:
            return(False)
    return(True)


def get_common_positions_in_vcf(vcf_1, vcf_2):
    pos_1 = []
    pos_2 = []
    for i, record in enumerate(vcf_1):
        pos_1.append(record.get('pos'))
    for i, record in enumerate(vcf_2):
        pos_2.append(record.get('pos'))
    # All positions should be unique.
    assert len(pos_1) == len(set(pos_1)),\
        "The positions in vcf_1 are not all unique."
    assert len(pos_2) == len(set(pos_2)),\
        "The positions in vcf_2 are not all unique."
    common_pos = list(set.intersection(set(pos_1), set(pos_2)))
    return(common_pos)


def compare_variants(true_vcf_file,
                     miss_vcf_file,
                     imputed_vcf_file,
                     ploidy_level,
                     verbose = False):
    true_vcf    = parse_vcf_file(true_vcf_file)
    miss_vcf    = parse_vcf_file(miss_vcf_file)
    imputed_vcf = parse_vcf_file(imputed_vcf_file)
    
    assert compare_vcf(true_vcf, miss_vcf),\
        "true_vcf and miss_vcf are not comparable."
    
    # If diploid, then assume phased.
    MISSING_GENOTYPE_CONSTANT = [-1, -1, True] if ploidy_level == 2 else [-1, False]
    
    # Imputed VCF file must have at most the number of positions as the true/miss VCF files.
    common_pos = get_common_positions_in_vcf(miss_vcf, imputed_vcf)
    
    # Number of genotypes imputed, correctly or not.
    nbr_gt_total = 0
    # Number of instances of genotypes correctly imputed.
    nbr_gt_correct = 0
    nbr_gt_0 = 0
    nbr_gt_1 = 0
    
    for i in range(len(imputed_vcf)):
        if        true_vcf[i]['pos'] not in common_pos\
            or    miss_vcf[i]['pos'] not in common_pos\
            or imputed_vcf[i]['pos'] not in common_pos:
            continue
            
        imputed_bool = [x == MISSING_GENOTYPE_CONSTANT
                        for x
                        in miss_vcf[i]['gt']
                       ]
        true_gt_oi = [x
                      for x, y
                      in zip(true_vcf[i]['gt'], imputed_bool) if y
                     ]
        miss_gt_oi = [x
                      for x, y
                      in zip(miss_vcf[i]['gt'], imputed_bool) if y
                     ]
        imputed_gt_oi = [x
                         for x, y
                         in zip(imputed_vcf[i]['gt'], imputed_bool) if y
                        ]
        
        nbr_gt_0 += np.sum([x == [0, False]
                            for x, y
                            in zip(true_vcf[i]['gt'], imputed_bool) if y])
        nbr_gt_1 += np.sum([x == [1, False]
                            for x, y
                            in zip(true_vcf[i]['gt'], imputed_bool) if y])
        
        nbr_gt_total   += len(true_gt_oi)
        nbr_gt_correct += np.count_nonzero([x == y
                                            for x, y
                                            in zip(true_gt_oi, imputed_gt_oi)
                                           ])
        
        if verbose:
            print(f"Position {true_vcf[i]['pos']}")
            print(f"Boolean array to indicate which genotypes are missing.")
            print(imputed_bool)
            print(f"True genotypes")
            print(true_gt_oi)
            print(f"Missing genotypes")
            print(miss_gt_oi)
            print(f"Imputed genotypes")
            print(imputed_gt_oi)
            
    if verbose:
        print(f"Number of missing genotypes 0: {nbr_gt_0}")
        print(f"Number of missing genotypes 1: {nbr_gt_1}")
            
    concordance_rate = float(nbr_gt_correct) / float(nbr_gt_total)
    
    return((nbr_gt_total, nbr_gt_correct, concordance_rate))

In [5]:
def find_biallelic_sites(sample_data_1,
                         sample_data_2,
                         verbose = False):
    variants_1 = sample_data_1.variants()
    variants_2 = sample_data_2.variants()
    
    # Keep only biallelic sites
    sites_1 = []
    sites_2 = []
    
    for var_1, var_2 in zip(variants_1, variants_2):
        assert var_1.site.position == var_2.site.position
        
        alleles_1 = set(var_1.alleles) - {None}
        alleles_2 = set(var_2.alleles) - {None}
        
        if len(alleles_1) == 2\
            and len(alleles_2) == 2\
            and alleles_1 == alleles_2:
            sites_1.append(var_1.site.id)
            sites_2.append(var_2.site.id)
            
            if verbose:
                print(f"SD1: {var_1.site.position} {alleles_1}" + " "+\
                      f"SD2: {var_2.site.position} {alleles_2}")
                
    assert len(sites_1) == len(sites_2),\
        "The number of site positions in sites_1 and sites_2 are different."
    
    return(sites_1, sites_2)

In [6]:
def get_ts_with_discretized_coordinates(ts):
    ts_tables = ts.dump_tables()
    ts_tables.sites.position = np.round(ts_tables.sites.position)
    ts_tables.deduplicate_sites()
    ts_tables.sort()
    ts_tables.build_index()
    ts_tables.compute_mutation_times()
    ts_discretized = ts_tables.tree_sequence()
    return(ts_discretized)

In [7]:
def impute_genotypes_using_tsinfer(ref_vcf_file,
                                   miss_vcf_file,
                                   imputed_vcf_file,
                                   contig_id):
    sd_ref  = create_sample_data_from_vcf_file(ref_vcf_file)
    sd_miss = create_sample_data_from_vcf_file(miss_vcf_file)
    ad_ref     = tsinfer.generate_ancestors(sample_data = sd_ref)
    # This step is to infer a tree sequence from the sample data.
    ts_anc_ref = tsinfer.match_ancestors(sample_data   = sd_ref,
                                         ancestor_data = ad_ref)
    ts_matched = tsinfer.match_samples(sample_data  = sd_miss,
                                       ancestors_ts = ts_anc_ref)
    with open(imputed_vcf_file, "w") as vcf:
        ts_matched.write_vcf(vcf, contig_id = contig_id)

In [8]:
def impute_genotypes_using_ts_only(ref_vcf_file,
                                   miss_vcf_file,
                                   imputed_vcf_file,
                                   ts_anc_ref,
                                   contig_id):
    sd_ref  = create_sample_data_from_vcf_file(ref_vcf_file)
    sd_miss = create_sample_data_from_vcf_file(miss_vcf_file)
    ts_matched = tsinfer.match_samples(sample_data  = sd_miss,
                                       ancestors_ts = ts_anc_ref)
    with open(imputed_vcf_file, "w") as vcf:
        ts_matched.write_vcf(vcf, contig_id = contig_id)

## Create data sets via simulations.

In [17]:
size_query =   1_000

size_yri   =   2_500
size_ceu   =  95_000
size_chb   =   2_500
size_ref   = size_yri + size_ceu + size_chb

print(f"Size of reference panel    : {size_ref}")
print(f"Number of samples from YRI : {size_yri}")
print(f"Number of samples from CEU : {size_ceu}")
print(f"Number of samples from CHB : {size_chb}")

num_replicates = 10

num_missing_sites = 10_000

contig_id = '1'
ploidy_level = 1
sequence_length = 1_000_000

base_dir = "../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/"

Size of reference panel    : 100000
Number of samples from YRI : 2500
Number of samples from CEU : 95000
Number of samples from CHB : 2500


In [18]:
rate_map = msprime.RateMap.uniform(
    sequence_length = sequence_length,
    rate = 1e-8
)

In [19]:
yaml_file = "../demes/gutenkunst_ooa_2009.yaml"
ooa_graph = demes.load(yaml_file)
demography_model = msprime.Demography.from_demes(ooa_graph)

In [20]:
sample_set = [
    msprime.SampleSet(num_samples = size_yri + size_query,
                      population = "YRI", # id = 3
                      ploidy = ploidy_level),
    msprime.SampleSet(num_samples = size_ceu,
                      population = "CEU", # id = 4
                      ploidy = ploidy_level),
    msprime.SampleSet(num_samples = size_chb,
                      population = "CHB", # id = 5
                      ploidy = ploidy_level)
]

src_ts = [] # List of full simulated ts.

tic = time.time()

print(f"Simulating {num_replicates} ts without duplicate site positions.")
success = 0

while success < num_replicates:
    sim_ts = msprime.sim_ancestry(
        samples = sample_set,
        demography = demography_model,
        ploidy = ploidy_level,
        model = "hudson",
        recombination_rate = rate_map,
        discrete_genome = True
    )
    
    sim_mts = msprime.sim_mutations(
        sim_ts,
        rate = 1e-8,
        discrete_genome = True
    )
    
    src_ts.append(sim_mts)
    success += 1
    
toc = time.time()
print(f"Simulation of {num_replicates} ts took {round(toc - tic, 2)} seconds.")

Simulating 10 ts without duplicate site positions.
Simulation of 10 ts took 11.03 seconds.


In [21]:
# Impute into YRI samples.
ts = src_ts[0]

individuals_query = ts.samples(population = 3)[:size_query]
samples_query     = individuals_query # When haploid

individuals_ref   = np.concatenate([ts.samples(population = 3)[size_query:],
                                    ts.samples(population = 4),
                                    ts.samples(population = 5)])
samples_ref       = individuals_ref   # When haploid

gt_mask = mask_genotype.MissingGenotypeMask(individuals        = individuals_query,
                                            sequence_length    = sequence_length,
                                            proportion_missing = 0.05)

print(f"Number of reference samples : {len(individuals_ref)}")
print(f"Number of study     samples : {len(individuals_query)}")

Number of reference samples : 100000
Number of study     samples : 1000


In [22]:
anc_ts = [] # List of simulated ancestor ts.

for i, ts in enumerate(src_ts):
    print(f"Processing ts {i}.")
    ref_vcf_file  = base_dir + "ref/"  + "ref."  + str(i) + ".vcf"
    true_vcf_file = base_dir + "true/" + "true." + str(i) + ".vcf"
    miss_vcf_file = base_dir + "miss/" + "miss." + str(i) + ".vcf"
    ts_anc_ref_file = base_dir + "ts_anc_ref/" + "ts_anc_ref." + str(i) + ".trees"
    
    sd_all = tsinfer.SampleData.from_tree_sequence(ts, use_sites_time = False)
    
    sd_ref   = sd_all.subset(individuals = individuals_ref)
    sd_query = sd_all.subset(individuals = individuals_query)
    
    sites_to_keep     = find_biallelic_sites(sd_ref, sd_query)
    sd_ref_filtered   =   sd_ref.subset(sites = sites_to_keep[0])
    sd_query_filtered = sd_query.subset(sites = sites_to_keep[1])
    
    # TODO: Refactor.
    print("Printing ancestors ts.")
    sim_ts_anc_ref = make_ancestors_ts(samples = samples_ref,
                                       ts = ts,
                                       remove_leaves = True)
    tmp_tables = sim_ts_anc_ref.dump_tables()
    tmp_tables.populations.metadata_schema = tskit.MetadataSchema(schema = None)
    sim_ts_anc_ref = tmp_tables.tree_sequence()
    anc_ts.append(sim_ts_anc_ref)
    sim_ts_anc_ref.dump(ts_anc_ref_file)
    
    print("Printing reference panel VCF.")
    print_sample_data_to_vcf(sample_data = sd_ref_filtered,
                             individuals = individuals_ref,
                             samples = samples_ref,
                             ploidy_level = ploidy_level,
                             mask = None,
                             out_vcf_file = ref_vcf_file,
                             contig_id = contig_id,
                             sequence_length_max = 1e24)
    
    print("Printing query VCF with non-missing genotypes.")
    print_sample_data_to_vcf(sample_data = sd_query_filtered,
                             individuals = individuals_query,
                             samples = samples_query,
                             ploidy_level = ploidy_level,
                             mask = None,
                             out_vcf_file = true_vcf_file,
                             contig_id = contig_id,
                             sequence_length_max = 1e24)
    
    print("Printing query VCF with missing genotypes.")
    print_sample_data_to_vcf(sample_data = sd_query_filtered,
                             individuals = individuals_query,
                             samples = samples_query,
                             ploidy_level = ploidy_level,
                             mask = gt_mask,
                             out_vcf_file = miss_vcf_file,
                             contig_id = contig_id,
                             sequence_length_max = 1e24)

Processing ts 0.
Printing ancestors ts.
Printing reference panel VCF.
Printing query VCF with non-missing genotypes.
Printing query VCF with missing genotypes.
Processing ts 1.
Printing ancestors ts.
Printing reference panel VCF.
Printing query VCF with non-missing genotypes.
Printing query VCF with missing genotypes.
Processing ts 2.
Printing ancestors ts.
Printing reference panel VCF.
Printing query VCF with non-missing genotypes.
Printing query VCF with missing genotypes.
Processing ts 3.
Printing ancestors ts.
Printing reference panel VCF.
Printing query VCF with non-missing genotypes.
Printing query VCF with missing genotypes.
Processing ts 4.
Printing ancestors ts.
Printing reference panel VCF.
Printing query VCF with non-missing genotypes.
Printing query VCF with missing genotypes.
Processing ts 5.
Printing ancestors ts.
Printing reference panel VCF.
Printing query VCF with non-missing genotypes.
Printing query VCF with missing genotypes.
Processing ts 6.
Printing ancestors ts.


## Perform genotype imputation.

In [23]:
print("Doing imputation using ts only.")

for i in np.arange(len(src_ts)):
    print(f"Imputing VCF {i}")
    ref_vcf_file     = base_dir + "ref/"  + "ref."  + str(i) + ".vcf"
    miss_vcf_file    = base_dir + "miss/" + "miss." + str(i) + ".vcf"
    imputed_vcf_file = base_dir + "imputed_tsonly/" + "imputed." + str(i) + ".vcf"
    impute_genotypes_using_ts_only(ref_vcf_file = ref_vcf_file,
                                   miss_vcf_file = miss_vcf_file,
                                   imputed_vcf_file = imputed_vcf_file,
                                   ts_anc_ref = anc_ts[i],
                                   contig_id = contig_id)

Doing imputation using ts only.
Imputing VCF 0
Imputing VCF 1
Imputing VCF 2
Imputing VCF 3
Imputing VCF 4
Imputing VCF 5
Imputing VCF 6
Imputing VCF 7
Imputing VCF 8
Imputing VCF 9


In [28]:
print("Doing imputation using tsinfer.")

for i in np.arange(len(src_ts)):
    print(f"Imputing VCF {i}")
    ref_vcf_file     = base_dir + "ref/" + "ref."  + str(i) + ".vcf"
    miss_vcf_file    = base_dir + "miss/" + "miss." + str(i) + ".vcf"
    imputed_vcf_file = base_dir + "imputed_tsinfer/" + "imputed." + str(i) + ".vcf"
    impute_genotypes_using_tsinfer(ref_vcf_file = ref_vcf_file,
                                   miss_vcf_file = miss_vcf_file,
                                   imputed_vcf_file = imputed_vcf_file,
                                   contig_id = contig_id)

Doing imputation using tsinfer.
Imputing VCF 0
Imputing VCF 1
Imputing VCF 2
Imputing VCF 3
Imputing VCF 4
Imputing VCF 5
Imputing VCF 6
Imputing VCF 7
Imputing VCF 8
Imputing VCF 9


In [25]:
print("Doing imputation using BEAGLE.")

beagle_exe = "../analysis/beagle/beagle.28Jun21.220.jar"

for i in np.arange(len(src_ts)):
    print(f"Imputing VCF {i}")
    ref_vcf_file     = base_dir + "ref/"  + "ref."  + str(i) + ".vcf"
    miss_vcf_file    = base_dir + "miss/" + "miss." + str(i) + ".vcf"
    imputed_vcf_file = base_dir + "imputed_beagle/" + "imputed." + str(i)
    beagle_cmd = [
        "java", "-jar", beagle_exe,
        "ref=" + ref_vcf_file,
        "gt="  + miss_vcf_file,
        "out=" + imputed_vcf_file
    ]
    beagle_cmd = " ".join(beagle_cmd)
    print(beagle_cmd + "\n")

Doing imputation using BEAGLE.
Imputing VCF 0
java -jar ../analysis/beagle/beagle.28Jun21.220.jar ref=../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/ref/ref.0.vcf gt=../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/miss/miss.0.vcf out=../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/imputed_beagle/imputed.0

Imputing VCF 1
java -jar ../analysis/beagle/beagle.28Jun21.220.jar ref=../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/ref/ref.1.vcf gt=../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/miss/miss.1.vcf out=../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/imputed_beagle/imputed.1

Imputing VCF 2
java -jar ../analysis/beagle/beagle.28Jun21.220.jar ref=../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/ref/ref.2.vcf gt=../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/miss/miss.2.vcf out=../data/modern_ooa_unequal_900505_haploid_miss05_yri_demes_yri/imputed_beagle/imputed.2



## Get imputation accuracy metrics.

In [29]:
print("Computing imputation accuracy metrics for ts only.")
results = []

for i in np.arange(len(src_ts)):
    true_vcf_file    = base_dir + "true/" + "true."  + str(i) + ".vcf"
    miss_vcf_file    = base_dir + "miss/" + "miss."  + str(i) + ".vcf"
    imputed_vcf_file = base_dir + "imputed_tsonly/" + "imputed." + str(i) + ".vcf"
    stats = compare_variants(true_vcf_file,
                             miss_vcf_file,
                             imputed_vcf_file,
                             ploidy_level = 1)
    results.append(stats)
    
for y in results:
    print(",".join([str(x) for x in y]))

Computing imputation accuracy metrics for ts only.
376732,376697,0.9999070957603814
370184,370162,0.9999405700948717
375151,375121,0.9999200322003673
369503,369478,0.9999323415506776
367133,367113,0.999945523829239
363183,363166,0.9999531916416793
377197,377172,0.9999337216361742
367301,367276,0.9999319359326547
369965,369947,0.9999513467490168
373975,373942,0.9999117588074069


In [30]:
print("Computing imputation accuracy metrics for tsinfer.")
results = []

for i in np.arange(len(src_ts)):
    true_vcf_file    = base_dir + "true/" + "true."  + str(i) + ".vcf"
    miss_vcf_file    = base_dir + "miss/" + "miss."  + str(i) + ".vcf"
    imputed_vcf_file = base_dir + "imputed_tsinfer/" + "imputed." + str(i) + ".vcf"
    stats = compare_variants(true_vcf_file,
                             miss_vcf_file,
                             imputed_vcf_file,
                             ploidy_level = 1)
    results.append(stats)
    
for y in results:
    print(",".join([str(x) for x in y]))

Computing imputation accuracy metrics for tsinfer.
376732,376644,0.9997664121975304
370184,370088,0.9997406695048948
375151,375044,0.9997147815146434
369503,369411,0.9997510169064933
367133,367020,0.9996922096352003
363183,363107,0.9997907391039779
377197,377112,0.9997746535629923
367301,367209,0.9997495242321692
369965,369879,0.9997675455786358
373975,373887,0.9997646901530851


In [31]:
print("Computing imputation accuracy metrics for BEAGLE.")
results = []

for i in np.arange(len(src_ts)):
    true_vcf_file    = base_dir + "true/" + "true."  + str(i) + ".vcf"
    miss_vcf_file    = base_dir + "miss/" + "miss."  + str(i) + ".vcf"
    imputed_vcf_file = base_dir + "imputed_beagle/" + "imputed." + str(i) + ".vcf.gz"
    stats = compare_variants(true_vcf_file = true_vcf_file,
                             miss_vcf_file = miss_vcf_file,
                             imputed_vcf_file = imputed_vcf_file,
                             ploidy_level = 1)
    results.append(stats)
    
for y in results:
    print(",".join([str(x) for x in y]))

Computing imputation accuracy metrics for BEAGLE.


[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround

376732,376655,0.9997956106728391
370184,370126,0.9998433211592073
375151,375093,0.9998453955873768
369503,369441,0.9998322070456803
367133,367075,0.9998420191047931
363183,363117,0.9998182734324018
377197,377140,0.9998488853304772
367301,367234,0.9998175882995146
369965,369923,0.9998864757477058
373975,373914,0.9998368874924795
