In [35]:
from copy import deepcopy
import math
import numpy as np
import os
import random
import sys
import time

from IPython.display import SVG

sys.path.append("../modules/")
import mask_genotype
import parse_vcf

import tsinfer
from tsinfer import make_ancestors_ts
import tskit
import msprime
import stdpopsim
import cyvcf2

# Record the versions of the key dependencies next to the results.
for _module in (tskit, tsinfer, msprime, stdpopsim, cyvcf2):
    print(f"{_module.__name__} {_module.__version__}")

tskit 0.4.1
tsinfer 0.2.3.dev9+gc8568d5
msprime 1.1.1
stdpopsim 0.1.2
cyvcf2 0.30.14


In [36]:
def print_sample_data_to_vcf(sample_data,
                             individuals,
                             samples,
                             mask,
                             out_vcf_file,
                             contig_id,
                             sequence_length_max = 1e12):
    """
    Write phased genotypes of the given individuals to a VCF (v4.2) file,
    one record per site.

    NOTE(review): the sites are read from the module-level tree sequence `ts`
    (`ts.variants()` and `ts.sequence_length`), not from `sample_data`, which
    is currently unused. The function therefore only works while a global
    `ts` exists, and site filtering applied to `sample_data` is not reflected
    in the output. Other cells may rely on the unfiltered sites, so confirm
    the intent before changing this.

    Fields of each record:
    CHROM contig_id
    POS site position rounded to the nearest integer (no offset is added)
    ID .
    REF ancestral allele
    ALT derived allele(s), in the order of `variant.alleles`
    QUAL .
    FILTER PASS
    INFO AA=<ancestral allele>
    FORMAT GT
    individual 0
    individual 1
    ...
    individual n - 1; n = number of individuals

    Parameters:
    sample_data -- unused (see the note above).
    individuals -- ids of the individuals to write; each becomes one column
        named "s<id>" and is the id passed to `mask.query_position`.
    samples -- indexes into `variant.genotypes`, two per individual and
        ordered like `individuals`.
    mask -- None, or an object whose `query_position(individual, position)`
        is True where the genotype must be written as missing.
    out_vcf_file -- path of the VCF file to create or overwrite.
    contig_id -- contig name written to the header and the CHROM column.
    sequence_length_max -- writing stops at the first site whose rounded
        position exceeds this value.

    Raises AssertionError if there are not exactly two samples per individual.
    """
    CHROM = contig_id
    ID = '.'
    QUAL = '.'
    FILTER = 'PASS'
    FORMAT = 'GT'
    
    assert 2 * len(individuals) == len(samples),\
        "Some individuals may not be diploid."
    
    # Assume that both sample and individual ids are ordered the same way.
    individual_id_map = np.repeat(individuals, 2)
    # The individual -> samples lookup does not depend on the site, so it is
    # built once here rather than once per individual per site.
    sample_ids_by_individual = [
        (j, [samples[x]
             for x
             in np.where(individual_id_map == j)[0].tolist()])
        for j in individuals
    ]
    
    header  = "##fileformat=VCFv4.2\n"\
            + "##source=tskit " + tskit.__version__ + "\n"\
            + "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">\n"\
            + "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
    header += "##contig=<ID=" + contig_id + "," + "length=" + str(int(ts.sequence_length)) + ">\n"
    header += "\t".join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']\
                        + ["s" + str(x) for x in individuals])
    
    with open(out_vcf_file, "w") as vcf:
        vcf.write(header + "\n")
        for variant in ts.variants():
            POS = int(np.round(variant.site.position))
            if POS > sequence_length_max:
                break
            # Since the tree sequence was produced using simulation,
            #    there's no reference sequence other than the ancestral sequence.
            REF = variant.site.ancestral_state
            # Keep the derived alleles in the order of `variant.alleles`: the
            # genotype values index into that tuple, so a reordered ALT column
            # (as an unordered set gives) would mislabel multi-allelic sites.
            # `None` marks missing data and is not an allele.
            alt_alleles = list(dict.fromkeys(
                allele
                for allele in variant.alleles
                if allele is not None and allele != REF
            ))
            AA = variant.site.ancestral_state
            ALT = ",".join(alt_alleles) if len(alt_alleles) > 0 else "."
            INFO = "AA" + "=" + AA
            record = [str(x)
                      for x
                      in [CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT]]
            
            for j, sample_ids in sample_ids_by_individual:
                # A negative genotype value marks missing data; VCF spells it ".".
                genotype = "|".join(["." if variant.genotypes[k] < 0
                                     else str(variant.genotypes[k])
                                     for k
                                     in sample_ids])
                if mask is not None\
                    and mask.query_position(individual = j, position = POS) == True:
                    genotype = '.|.' # Or "./."
                record += [genotype]
                
            vcf.write("\t".join(record) + "\n")

In [37]:
# Sourced and modified from:
# https://tsinfer.readthedocs.io/en/latest/tutorial.html#data-example
def add_populations(vcf,
                    samples):
    """
    Register one population per distinct first character of the VCF sample
    names, and report the population of every sample.

    Populations are added to `samples` in the sorted order returned by
    `np.unique`, each with metadata {"name": <first character>}.

    Returns a list, parallel to `vcf.samples`, holding for each sample the
    value that `samples.add_population` returned for its population.
    """
    first_characters = [name[0] for name in vcf.samples]
    population_of = {
        code: samples.add_population(metadata = {"name" : code})
        for code in np.unique(first_characters)
    }
    return list(map(population_of.__getitem__, first_characters))


def add_diploid_individuals(vcf,
                            samples,
                            populations):
    """
    Add one diploid individual to `samples` for every name in `vcf.samples`.

    `populations` is paired with `vcf.samples` element by element; pairing
    stops at the end of the shorter of the two. The sample name is stored in
    the individual's metadata under "name".
    """
    for sample_name, population_id in zip(vcf.samples, populations):
        individual_metadata = {"name": sample_name}
        samples.add_individual(ploidy = 2,
                               metadata = individual_metadata,
                               population = population_id)


def get_chromosome_length(vcf):
    """
    Return the length of the only contig listed in `vcf.seqlens`.

    Raises AssertionError unless `vcf.seqlens` holds exactly one length.
    """
    contig_lengths = vcf.seqlens
    assert len(contig_lengths) == 1
    return contig_lengths[0]


def add_diploid_sites(vcf,
                      samples,
                      warn_monomorphic_sites = False):
    """
    Read the sites in the VCF and add them to the samples object,
    reordering the alleles to put the ancestral allele first,
    if it is available.

    Parameters:
    vcf -- iterable of variant records exposing POS, REF, ALT, INFO,
        genotypes and num_unknown (as cyvcf2 records do).
    samples -- object whose `add_site(position, genotypes=, alleles=)` receives
        each site.
    warn_monomorphic_sites -- if True, print a message for every site that has
        no ALT allele.

    The ancestral allele is INFO/AA when present, otherwise REF. The remaining
    alleles keep their order from the VCF, so the allele list is the same on
    every run.

    Raises ValueError if a record has the same position as the record before
    it, or if any genotype at a site is unphased.
    """
    # None (not 0) marks "no record seen yet", so a first record at position 0
    # is not mistaken for a duplicate.
    previous_pos = None
    for variant in vcf:
        pos = variant.POS
        # Check for duplicate site positions (only consecutive records are compared).
        if pos == previous_pos:
            raise ValueError("Duplicate positions for variant at position", pos)
        previous_pos = pos
        # Check that the genotypes are phased.
        if any(not phased for _, _, phased in variant.genotypes):
            raise ValueError("Unphased genotypes for variant at position", pos)
        alleles = [variant.REF] + variant.ALT # Exactly as in the input VCF file.
        if warn_monomorphic_sites and len(alleles) < 2:
            print(f"Monomorphic site at {pos}")
        # NOTE(review): falling back to REF silently treats the reference allele
        # as ancestral when AA is absent; an AA value that is not one of the
        # site's alleles is still placed first.
        ancestral = variant.INFO.get("AA", variant.REF) # Dangerous action!!!
        # Ancestral state must be first in the allele list.
        ordered_alleles = [ancestral]
        for allele in alleles:
            if allele not in ordered_alleles:
                ordered_alleles.append(allele)
        # Create an index mapping from the input VCF to tsinfer input.
        allele_index = {
            old_index: ordered_alleles.index(allele)
            for old_index, allele in enumerate(alleles)
        }
        # When genotype is missing...
        if variant.num_unknown > 0:
            allele_index[-1] = tskit.MISSING_DATA
            ordered_alleles += [None]
        # Map original allele indexes to their indexes in the new alleles list.
        genotypes = [
            allele_index[old_index]
            for row in variant.genotypes # cyvcf2 uses -1 to indicate missing data.
            for old_index in row[0:2] # Each is a 3-tuple (allele 1, allele 2, is phased?).
        ]
        samples.add_site(pos,
                         genotypes = genotypes,
                         alleles = ordered_alleles)


def create_sample_data_from_vcf_file(vcf_file):
    """
    Build a tsinfer.SampleData object from a VCF file.

    The VCF is opened with cyvcf2 (gts012 = False, strict_gt = True). Its
    populations, diploid individuals and sites are added, in that order, inside
    the SampleData context, and the resulting object is returned.
    """
    reader = cyvcf2.VCF(vcf_file,
                        gts012 = False, # 0=HOM_REF, 1=HET, 2=UNKNOWN, 3=HOM_ALT
                        strict_gt = True)
    contig_length = get_chromosome_length(reader)
    with tsinfer.SampleData(sequence_length = contig_length) as sample_data:
        population_ids = add_populations(reader, sample_data)
        add_diploid_individuals(reader, sample_data, population_ids)
        add_diploid_sites(reader, sample_data)
    return sample_data

In [38]:
def parse_vcf_file(vcf_file):
    """
    Load every record of a VCF file into memory.

    The file is read with cyvcf2 (gts012 = False, strict_gt = True).
    Returns a list with one dictionary per record, holding:
    'ref' REF allele
    'alt' list of ALT alleles
    'ctg' contig id/name
    'pos' the record's `start` as an int (0-based according to cyvcf2)
    'aa'  INFO/AA, the ancestral allele (None when absent)
    'gt'  the record's `genotypes`
    """
    reader = cyvcf2.VCF(vcf_file,
                        gts012 = False, # 0=HOM_REF, 1=HET, 2=UNKNOWN, 3=HOM_ALT
                        strict_gt = True)
    return [
        {
            'ref': entry.REF,
            'alt': entry.ALT,
            'ctg': entry.CHROM,
            'pos': int(entry.start),
            'aa' : entry.INFO.get('AA'),
            'gt' : entry.genotypes
        }
        for entry in reader
    ]


def compare_vcf(vcf_1, vcf_2):
    """
    Tell whether two parsed VCFs describe the same sites in the same order.

    Records are compared pairwise on 'ref', 'alt', 'ctg', 'pos' and 'aa';
    genotypes are not compared. Returns False at the first differing record,
    True otherwise.

    Raises AssertionError if the two lists differ in length.
    """
    assert len(vcf_1) == len(vcf_2)
    site_fields = ('ref', 'alt', 'ctg', 'pos', 'aa')
    for record_1, record_2 in zip(vcf_1, vcf_2):
        same_site = all([record_1.get(field) == record_2.get(field)
                         for field in site_fields])
        if not same_site:
            return False
    return True


def get_common_positions_in_vcf(vcf_1, vcf_2):
    """
    Return the list of 'pos' values present in both parsed VCFs.

    The list is built from a set, so its order is not meaningful.

    Raises AssertionError if either VCF contains a repeated position.
    """
    positions_1 = [record.get('pos') for record in vcf_1]
    positions_2 = [record.get('pos') for record in vcf_2]
    unique_1 = set(positions_1)
    unique_2 = set(positions_2)
    # All positions should be unique.
    assert len(positions_1) == len(unique_1),\
        "The positions in vcf_1 are not all unique."
    assert len(positions_2) == len(unique_2),\
        "The positions in vcf_2 are not all unique."
    return list(set.intersection(unique_1, unique_2))


def compare_variants(true_vcf_file,
                     miss_vcf_file,
                     imputed_vcf_file):
    """
    Measure how many masked genotypes were imputed correctly.

    A genotype counts as imputed when it is [-1, -1, True] (missing, phased)
    in the VCF with missing data. It counts as correct when the genotype in
    the imputed VCF equals the one in the true VCF, phase flag included.

    The true and missing-data VCFs must list the same sites in the same order.
    The imputed VCF may lack some of them: records are paired by position, not
    by their index in the file, and sites absent from the imputed VCF are
    ignored.

    Parameters:
    true_vcf_file -- VCF with the true genotypes.
    miss_vcf_file -- the same VCF with some genotypes masked.
    imputed_vcf_file -- VCF produced by the imputation method.

    Returns (number of imputed genotypes, number imputed correctly,
    concordance rate). The rate is NaN when no genotype was imputed.

    Raises AssertionError if the true and missing-data VCFs are not
    comparable, or if positions are repeated within a VCF.
    """
    true_vcf    = parse_vcf_file(true_vcf_file)
    miss_vcf    = parse_vcf_file(miss_vcf_file)
    imputed_vcf = parse_vcf_file(imputed_vcf_file)
    assert compare_vcf(true_vcf, miss_vcf),\
        "true_vcf and miss_vcf are not comparable."
    # Imputed VCF file must have at most the number of positions as the true/miss VCF files.
    # A set makes the per-record membership test constant-time.
    common_pos = set(get_common_positions_in_vcf(miss_vcf, imputed_vcf))
    # Look imputed records up by position: when the imputed VCF lacks sites,
    # its i-th record is not the i-th site of the true/miss VCFs.
    imputed_by_pos = {record['pos']: record for record in imputed_vcf}
    # Number of genotypes imputed, correctly or not.
    nbr_gt_total = 0
    # Number of instances of genotypes correctly imputed.
    nbr_gt_correct = 0
    for true_record, miss_record in zip(true_vcf, miss_vcf):
        pos = miss_record['pos']
        if pos not in common_pos:
            continue
        imputed_record = imputed_by_pos[pos]
        for true_gt, miss_gt, imputed_gt in zip(true_record['gt'],
                                                miss_record['gt'],
                                                imputed_record['gt']):
            was_masked = miss_gt == [-1, -1, True]
            if not was_masked:
                continue
            nbr_gt_total += 1
            if true_gt == imputed_gt:
                nbr_gt_correct += 1
    if nbr_gt_total == 0:
        # Nothing was masked at the shared sites: the rate is undefined.
        concordance_rate = float("nan")
    else:
        concordance_rate = float(nbr_gt_correct) / float(nbr_gt_total)
    return((nbr_gt_total,
            nbr_gt_correct,
            concordance_rate))

In [39]:
def find_biallelic_sites(sample_data_1,
                         sample_data_2):
    """
    Find the sites that are biallelic, with the same two alleles, in both
    sample data sets.

    The variants of the two data sets are walked in parallel and must sit at
    the same positions. `None` entries in the allele lists are ignored.

    Returns (site ids in sample_data_1, site ids in sample_data_2), two lists
    of equal length.

    Raises AssertionError if a pair of variants differs in position.
    """
    kept_1 = []
    kept_2 = []
    paired_variants = zip(sample_data_1.variants(), sample_data_2.variants())
    for variant_1, variant_2 in paired_variants:
        assert variant_1.site.position == variant_2.site.position
        observed_1 = set(variant_1.alleles) - {None}
        observed_2 = set(variant_2.alleles) - {None}
        both_biallelic = len(observed_1) == 2 and len(observed_2) == 2
        # Keep only biallelic sites
        if not (both_biallelic and observed_1 == observed_2):
            continue
        kept_1.append(variant_1.site.id)
        kept_2.append(variant_2.site.id)
    assert len(kept_1) == len(kept_2),\
        "The number of site positions in sites_1 and sites_2 are different."
    return (kept_1, kept_2)

In [40]:
def get_ts_with_discretized_coordinates(ts):
    """
    Return a copy of `ts` whose site positions are rounded to whole numbers.

    The work is done on a dump of the tables: positions are rounded, then the
    tables are deduplicated by site, sorted, indexed, and mutation times are
    recomputed before the tree sequence is rebuilt.
    """
    tables = ts.dump_tables()
    rounded_positions = np.round(tables.sites.position)
    tables.sites.position = rounded_positions
    tables.deduplicate_sites()
    tables.sort()
    tables.build_index()
    tables.compute_mutation_times()
    return tables.tree_sequence()

In [41]:
def impute_genotypes_using_tsinfer(ref_vcf_file,
                                   miss_vcf_file,
                                   imputed_vcf_file,
                                   contig_id):
    """
    Impute the missing genotypes of a query VCF with tsinfer.

    Ancestors are generated and matched from the reference panel alone; the
    query samples (with missing genotypes) are then matched against that
    ancestors tree sequence, and the result is written as a VCF.

    Parameters:
    ref_vcf_file -- VCF of the reference panel.
    miss_vcf_file -- VCF of the query samples, with missing genotypes.
    imputed_vcf_file -- path of the VCF to write.
    contig_id -- contig name passed to `write_vcf`.
    """
    reference_samples = create_sample_data_from_vcf_file(ref_vcf_file)
    query_samples = create_sample_data_from_vcf_file(miss_vcf_file)
    reference_ancestors = tsinfer.generate_ancestors(sample_data = reference_samples)
    # This step is to infer a tree sequence from the sample data.
    ancestors_ts = tsinfer.match_ancestors(sample_data   = reference_samples,
                                           ancestor_data = reference_ancestors)
    imputed_ts = tsinfer.match_samples(sample_data  = query_samples,
                                       ancestors_ts = ancestors_ts)
    with open(imputed_vcf_file, "w") as out_file:
        imputed_ts.write_vcf(out_file, contig_id = contig_id)

In [42]:
def impute_genotypes_using_ts_only(ref_vcf_file,
                                   miss_vcf_file,
                                   imputed_vcf_file,
                                   ts_anc_ref,
                                   contig_id):
    """
    Impute the missing genotypes of a query VCF by matching the query samples
    against an existing ancestors tree sequence.

    Parameters:
    ref_vcf_file -- VCF of the reference panel. Kept for interface
        compatibility but not read: the reference information comes from
        `ts_anc_ref`.
    miss_vcf_file -- VCF of the query samples, with missing genotypes.
    imputed_vcf_file -- path of the VCF to write.
    ts_anc_ref -- ancestors tree sequence to match the query samples against.
    contig_id -- contig name passed to `write_vcf`.
    """
    # Only the query data is needed here; parsing the reference VCF as well
    # was pure overhead because its sample data was never used.
    sd_miss = create_sample_data_from_vcf_file(miss_vcf_file)
    ts_matched = tsinfer.match_samples(sample_data  = sd_miss,
                                       ancestors_ts = ts_anc_ref)
    with open(imputed_vcf_file, "w") as vcf:
        ts_matched.write_vcf(vcf, contig_id = contig_id)

## Create data sets via simulations.

In [65]:
# Number of diploid individuals requested for the reference panel.
size_ref   = 9_999
#size_ref   = 1_000
# Number of diploid individuals in the query (the ones whose genotypes get masked).
size_query =   100

# Ploidy and population size handed to msprime.sim_ancestry.
ploidy_level = 2
eff_pop_size = 10_000

# Number of tree sequences to simulate.
num_replicates = 2

# NOTE(review): not referenced by any cell visible here; presumably consumed
# by the genotype mask. Confirm or remove.
num_missing_sites = 1_000

# Contig name written to the VCF files; also part of the genetic map file name.
#contig_id = '1'
contig_id = 'chr20'

# Root directory of the generated ref/, true/, miss/ and imputed_*/ files.
base_dir = "../data/modern_outofafrica_unequal_900505_p1000/"

In [66]:
# Recombination map used by the simulations, read from a HapMap-format file
# chosen by `contig_id`.
# Alternative kept for reference, a uniform map:
#rate_map = msprime.RateMap.uniform(
#    sequence_length = 1_000_000,
#    rate = 1e-8
#)
# chr20:1 - chr20:849,253
map_file = f"../hapmap/genetic_map_GRCh37_{contig_id}_reduced_v2.txt"
rate_map = msprime.RateMap.read_hapmap(fileobj = map_file)

In [51]:
# Simulate `num_replicates` tree sequences with msprime alone: a single
# population of size `eff_pop_size`, with the query individuals listed before
# the reference individuals.
#
# NOTE(review): this cell looks stale. Its execution count is lower than the
# configuration cell above it and its saved output reports other settings.
# Run top to bottom, it rebinds `contig_id` to '1' for every later cell, and
# the `src_ts` it builds is replaced by the stdpopsim cell further down.
# No random seed is passed, so the simulations are not reproducible.
contig_id = '1'

sample_set = [
    # Query individuals.
    msprime.SampleSet(num_samples = size_query,
                      time = 0,
                      #time = 100
                      ploidy = 2),
    # Reference panel individuals.
    msprime.SampleSet(num_samples = size_ref,
                      time = 0,
                      ploidy = 2)
]

print(f"Size of the reference panel is {size_ref}")
print(f"Size of the query is {size_query}")
print(f"Ploidy level is {ploidy_level}")
print(f"Population size is {eff_pop_size}")

src_ts = [] # List of full simulated ts.

tic = time.time()

# NOTE(review): the message promises "without duplicate site positions", but
# the check that rejected such simulations is commented out below, so every
# simulation is accepted and the loop runs exactly `num_replicates` times.
print(f"Simulating {num_replicates} ts without duplicate site positions.")
success = 0
while success < num_replicates:
    # Ancestry under the Hudson coalescent, using the recombination map.
    sim_ts = msprime.sim_ancestry(
        samples = sample_set,
        population_size = eff_pop_size,
        ploidy = ploidy_level,
        model = "hudson",
        recombination_rate = rate_map,
        discrete_genome = True
    )
    
    # Mutations are overlaid on the simulated ancestry.
    sim_mts = msprime.sim_mutations(
        sim_ts,
        rate = 1e-8, # per base per generation
        discrete_genome = True
    )
    
    # Disabled: reject simulations whose rounded site positions collide.
    #pos_discretized = np.round(sim_mts.tables.sites.position)
    #num_pos_total   = len(pos_discretized)
    #num_pos_unique  = len(np.unique(pos_discretized))
    #if num_pos_total != num_pos_unique:
    #    continue
    #sim_mts_discretized = get_ts_with_discretized_coordinates(sim_mts)
    #src_ts.append(sim_mts_discretized)
    src_ts.append(sim_mts)
    success += 1
    
toc = time.time()
print(f"Simulation of {num_replicates} ts took {round(toc - tic, 2)} seconds.")

Size of the reference panel is 1000
Size of the query is 100
Ploidy level is 2
Population size is 10000
Simulating 10 ts without duplicate site positions.
Simulation of 10 ts took 0.14 seconds.


In [67]:
# Simulate `num_replicates` tree sequences under a stdpopsim human demographic
# model, on a custom contig built from the recombination map loaded earlier.
# No random seed is passed, so the simulations are not reproducible.
species = stdpopsim.get_species("HomSap")
#contig  = species.get_contig(contig_id)
contig  = stdpopsim.Contig(mutation_rate = 1e-8,
                           recombination_map = rate_map)
model   = species.get_demographic_model('OutOfAfricaArchaicAdmixture_5R19')

print(f"Number of (sampling and non-sampling) populations is {model.num_populations}.")
print(f"Number of sampling populations is {model.num_sampling_populations}.")
print(f"Populations in this model are: {[pop.id for pop in model.populations]}.")

# Sample counts are haploid, hence the factor 2 per diploid individual. The
# first population supplies the query individuals plus 5% of the reference panel.
# NOTE(review): the population labels below may be swapped. The model prints
# its populations as YRI, CEU, CHB; if get_samples takes its counts in that
# order, the 2nd argument (5%) is CEU and the 3rd (90%) is CHB. TODO confirm.
samples = model.get_samples(2 * (size_query + int(size_ref * 0.05)), # 1st population: YRI
                            2 * int(size_ref * 0.05),                # 2nd population: labelled CHB (see note)
                            2 * int(size_ref * 0.90))                # 3rd population: labelled CEU (see note)

engine = stdpopsim.get_engine('msprime')

src_ts = [] # List of full simulated ts.

tic = time.time()

# NOTE(review): the message promises "without duplicate site positions", but
# the check that rejected such simulations is commented out below, so every
# simulation is accepted and the loop runs exactly `num_replicates` times.
print(f"Simulating {num_replicates} ts without duplicate site positions.")
success = 0
while success < num_replicates:
    sim_ts = engine.simulate(model,
                             contig,
                             samples,
                             discrete_genome = True)
    
    # keep = False asks msprime to discard any mutations already present on
    # `sim_ts` before adding new ones.
    sim_mts = msprime.sim_mutations(sim_ts,
                                    rate = 1e-8,
                                    discrete_genome = True,
                                    keep = False)
    
    # Disabled: reject simulations whose rounded site positions collide.
    #pos_discretized = np.round(sim_mts.tables.sites.position)
    #num_pos_total   = len(pos_discretized)
    #num_pos_unique  = len(np.unique(pos_discretized))
    #if num_pos_total != num_pos_unique:
    #    print(f"{num_pos_total} {num_pos_unique}")
    #    continue
    #sim_mts_discretized = get_ts_with_discretized_coordinates(sim_mts)
    #src_ts.append(sim_mts_discretized)
    src_ts.append(sim_mts)
    success += 1
    
toc = time.time()
print(f"Simulation of {num_replicates} ts took {round(toc - tic, 2)} seconds.")

Number of (sampling and non-sampling) populations is 5.
Number of sampling populations is 3.
Populations in this model are: ['YRI', 'CEU', 'CHB', 'Neanderthal', 'ArchaicAFR'].
Simulating 2 ts without duplicate site positions.
Simulation of 2 ts took 6.44 seconds.


In [68]:
# Number of reference individuals actually simulated. Each population share is
# truncated separately with int(), so the total can be smaller than `size_ref`
# (rounding errors).
# The count gets its own name rather than overwriting `size_ref`: with
# `size_ref = f(size_ref)`, re-running this cell shrank the value again each
# time, and the index arrays below stopped matching the simulated data.
num_ref_individuals = int(size_ref * 0.05) + int(size_ref * 0.05) + int(size_ref * 0.90)

# The query individuals come first, followed by the reference panel; every
# individual is diploid and owns two consecutive sample ids.
individuals_query = np.arange(size_query)
samples_query     = np.arange(2 * size_query)

individuals_ref   = np.arange(size_query,
                              size_query + num_ref_individuals)
samples_ref       = np.arange(2 * size_query,
                              2 * (size_query + num_ref_individuals))

# Mask deciding which query genotypes are written as missing.
gt_mask = mask_genotype.MissingGenotypeMask(individuals = individuals_query)

In [69]:
# For every simulated tree sequence: derive the ancestors tree sequence of the
# reference panel and write three VCF files (reference panel, query with true
# genotypes, query with masked genotypes).
# The output directories under `base_dir` must already exist.
anc_ts = [] # List of simulated ancestor ts.

# NOTE(review): the loop variable must stay named `ts`, because
# print_sample_data_to_vcf reads the global `ts` for its sites and sequence
# length instead of using its `sample_data` argument.
for i, ts in enumerate(src_ts):
    print(f"Processing ts {i}.")
    ref_vcf_file  = base_dir + "ref/"  + "ref."  + str(i) + ".vcf"
    true_vcf_file = base_dir + "true/" + "true." + str(i) + ".vcf"
    miss_vcf_file = base_dir + "miss/" + "miss." + str(i) + ".vcf"
    ts_anc_ref_file = base_dir + "ts_anc_ref/" + "ts_anc_ref." + str(i) + ".trees"
    
    sd_all = tsinfer.SampleData.from_tree_sequence(ts, use_sites_time = False)
    
    # Split the sample data into the reference panel and the query.
    sd_ref   = sd_all.subset(individuals = individuals_ref)
    sd_query = sd_all.subset(individuals = individuals_query)
    
    # Keep the sites that are biallelic, with the same alleles, in both subsets.
    # NOTE(review): as print_sample_data_to_vcf is written, the filtered sample
    # data passed to it below does not restrict the sites that get written.
    sites_to_keep     = find_biallelic_sites(sd_ref, sd_query)
    sd_ref_filtered   =   sd_ref.subset(sites = sites_to_keep[0])
    sd_query_filtered = sd_query.subset(sites = sites_to_keep[1])
    
    # TODO: Refactor.
    # TODO: Remove some monomorphic sites?
    print("Printing ancestors ts.")
    # Ancestors tree sequence built from the true (simulated) genealogy of the
    # reference samples.
    sim_ts_anc_ref = make_ancestors_ts(samples = samples_ref,
                                       ts = ts,
                                       remove_leaves = True)
    # Replace the population metadata schema with an empty one.
    # NOTE(review): presumably needed for tsinfer.match_samples to accept the
    # tree sequence; confirm.
    tmp_tables = sim_ts_anc_ref.dump_tables()
    tmp_tables.populations.metadata_schema = tskit.MetadataSchema(schema = None)
    sim_ts_anc_ref = tmp_tables.tree_sequence()
    anc_ts.append(sim_ts_anc_ref)
    sim_ts_anc_ref.dump(ts_anc_ref_file)
    
    print("Printing reference panel VCF.")
    print_sample_data_to_vcf(sample_data = sd_ref_filtered,
                             individuals = individuals_ref,
                             samples = samples_ref,
                             mask = None,
                             out_vcf_file = ref_vcf_file,
                             contig_id = contig_id,
                             sequence_length_max = 1e24)
    
    print("Printing query VCF with non-missing genotypes.")
    print_sample_data_to_vcf(sample_data = sd_query_filtered,
                             individuals = individuals_query,
                             samples = samples_query,
                             mask = None,
                             out_vcf_file = true_vcf_file,
                             contig_id = contig_id,
                             sequence_length_max = 1e24)
    
    print("Printing query VCF with missing genotypes.")
    # Same query data, with the genotypes selected by `gt_mask` written as missing.
    print_sample_data_to_vcf(sample_data = sd_query_filtered,
                             individuals = individuals_query,
                             samples = samples_query,
                             mask = gt_mask,
                             out_vcf_file = miss_vcf_file,
                             contig_id = contig_id,
                             sequence_length_max = 1e24)

Processing ts 0.
Printing ancestors ts.
Printing reference panel VCF.
Printing query VCF with non-missing genotypes.
Printing query VCF with missing genotypes.
Processing ts 1.
Printing ancestors ts.
Printing reference panel VCF.
Printing query VCF with non-missing genotypes.
Printing query VCF with missing genotypes.


## Impute genotypes using ts with true genealogy.

In [71]:
# Impute every replicate against its ancestors ts built from the true genealogy.
for i in range(len(anc_ts)):
    ref_vcf_file     = f"{base_dir}ref/ref.{i}.vcf"
    miss_vcf_file    = f"{base_dir}miss/miss.{i}.vcf"
    imputed_vcf_file = f"{base_dir}imputed_tsonly/imputed.{i}.vcf"
    impute_genotypes_using_ts_only(
        ref_vcf_file = ref_vcf_file,
        miss_vcf_file = miss_vcf_file,
        imputed_vcf_file = imputed_vcf_file,
        ts_anc_ref = anc_ts[i],
        contig_id = contig_id,
    )

## Impute genotypes using tsinfer.

In [61]:
# Impute every replicate with ancestors inferred by tsinfer from the reference VCF.
for i in range(len(src_ts)):
    ref_vcf_file     = f"{base_dir}ref/ref.{i}.vcf"
    miss_vcf_file    = f"{base_dir}miss/miss.{i}.vcf"
    imputed_vcf_file = f"{base_dir}imputed_tsinfer/imputed.{i}.vcf"
    impute_genotypes_using_tsinfer(
        ref_vcf_file = ref_vcf_file,
        miss_vcf_file = miss_vcf_file,
        imputed_vcf_file = imputed_vcf_file,
        contig_id = contig_id,
    )

## Impute genotypes using BEAGLE.

In [76]:
# Print (but do not run) one BEAGLE command line per replicate.
beagle_exe = "../analysis/beagle/beagle.28Jun21.220.jar"
# NOTE(review): this rebinds the global `map_file` that the recombination-map
# cell also sets; the value is unused while the "map=" argument stays disabled.
map_file = "../hapmap/genetic_map_GRCh37_chr20_reduced_plink_v2.txt"

for i in range(len(src_ts)):
    ref_vcf_file     = f"{base_dir}ref/ref.{i}.vcf"
    miss_vcf_file    = f"{base_dir}miss/miss.{i}.vcf"
    imputed_vcf_file = f"{base_dir}imputed_beagle/imputed.{i}"
    beagle_cmd = " ".join([
        "java", "-jar", beagle_exe,
        #"map=" + map_file,
        f"ref={ref_vcf_file}",
        f"gt={miss_vcf_file}",
        f"out={imputed_vcf_file}",
    ])
    print(beagle_cmd + "\n")

java -jar ../analysis/beagle/beagle.28Jun21.220.jar ref=../data/modern_outofafrica_unequal_900505_p1000/ref/ref.0.vcf gt=../data/modern_outofafrica_unequal_900505_p1000/miss/miss.0.vcf out=../data/modern_outofafrica_unequal_900505_p1000/imputed_beagle/imputed.0

java -jar ../analysis/beagle/beagle.28Jun21.220.jar ref=../data/modern_outofafrica_unequal_900505_p1000/ref/ref.1.vcf gt=../data/modern_outofafrica_unequal_900505_p1000/miss/miss.1.vcf out=../data/modern_outofafrica_unequal_900505_p1000/imputed_beagle/imputed.1



## Get imputation accuracy metrics.

In [72]:
# Get imputation accuracy metrics for ts with true genealogy.
print("Computing imputation accuracy metrics for ts only.")
results = []
for i in range(len(anc_ts)):
    true_vcf_file    = base_dir + "true/" + "true."  + str(i) + ".vcf"
    miss_vcf_file    = base_dir + "miss/" + "miss."  + str(i) + ".vcf"
    imputed_vcf_file = base_dir + "imputed_tsonly/" + "imputed." + str(i) + ".vcf"
    stats = compare_variants(true_vcf_file, miss_vcf_file, imputed_vcf_file)
    results.append(stats)
for y in results:
    print(",".join([str(x) for x in y]))

Computing imputation accuracy metrics for ts only.
11999,11989,0.9991665972164347
12206,12197,0.9992626577093233


In [49]:
# Get imputation accuracy metrics for tsinfer.
print("Computing imputation accuracy metrics for tsinfer WITH inference.")
results = []
for i in range(len(src_ts)):
    true_vcf_file    = base_dir + "true/" + "true."  + str(i) + ".vcf"
    miss_vcf_file    = base_dir + "miss/" + "miss."  + str(i) + ".vcf"
    imputed_vcf_file = base_dir + "imputed_tsinfer/" + "imputed." + str(i) + ".vcf"
    stats = compare_variants(true_vcf_file, miss_vcf_file, imputed_vcf_file)
    results.append(stats)
for y in results:
    print(",".join([str(x) for x in y]))

Computing imputation accuracy metrics for tsinfer WITH inference.
963,929,0.9646936656282451
883,851,0.9637599093997735
974,937,0.9620123203285421
946,912,0.9640591966173362
873,846,0.9690721649484536
910,886,0.9736263736263736
880,833,0.946590909090909
868,845,0.9735023041474654
916,895,0.9770742358078602
906,879,0.9701986754966887


In [77]:
# Get imputation accuracy metrics for BEAGLE.
print("Computing imputation accuracy metrics for BEAGLE.")
results = []
for i in range(len(src_ts)):
    true_vcf_file    = base_dir + "true/" + "true."  + str(i) + ".vcf"
    miss_vcf_file    = base_dir + "miss/" + "miss."  + str(i) + ".vcf"
    imputed_vcf_file = base_dir + "imputed_beagle/" + "imputed." + str(i) + ".vcf.gz"
    stats = compare_variants(true_vcf_file, miss_vcf_file, imputed_vcf_file)
    results.append(stats)
for y in results:
    print(",".join([str(x) for x in y]))

Computing imputation accuracy metrics for BEAGLE.


[W::vcf_parse] Contig 'chr20' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'chr20' is not defined in the header. (Quick workaround: index the file with tabix.)


11999,11931,0.994332861071756
12206,12146,0.9950843847288219
