In [1]:
import math
import os
import random
import sys
import time

import gzip

import numpy as np

import msprime
import tskit
import tsinfer
from tsinfer import make_ancestors_ts

import cyvcf2
import demes
import demesdraw

print(f"tskit {tskit.__version__}")
print(f"tsinfer {tsinfer.__version__}")
print(f"msprime {msprime.__version__}")
print(f"demes {demes.__version__}")
print(f"cyvcf2 {cyvcf2.__version__}")

tskit 0.5.0
tsinfer 0.2.3
msprime 1.1.1
demes 0.2.1
cyvcf2 0.30.14


In [2]:
# Sourced and modified from:
# https://tsinfer.readthedocs.io/en/latest/tutorial.html#data-example
def get_sequence_length(vcf):
    assert len(vcf.seqlens) == 1
    return vcf.seqlens[0]


def add_populations(vcf, samples):
    """
    TODO
    """
    pop_ids = [sample_name[0] for sample_name in vcf.samples]
    pop_codes = np.unique(pop_ids)
    pop_lookup = {}
    for p in pop_codes:
        pop_lookup[p] = samples.add_population(metadata={"name" : p})
    return [pop_lookup[pop_id] for pop_id in pop_ids]


def add_individuals(vcf, samples, ploidy_level, populations):
    for name, population in zip(vcf.samples, populations):
        samples.add_individual(ploidy=ploidy_level,
                               metadata={"name": name},
                               population=population)


def add_sites(vcf, samples, ploidy_level, warn_monomorphic_sites=False):
    """
    Read the sites in the VCF and add them to the SampleData object,
    reordering the alleles to put the ancestral allele first,
    if it is available.
    """
    assert ploidy_level == 1 or ploidy_level == 2,\
        f"ploidy_level {ploidy_level} is not recognized."
    
    pos = 0
    for variant in vcf:
        # Check for duplicate site positions.
        if pos == variant.POS:
            raise ValueError("Duplicate positions for variant at position", pos)
        else:
            pos = variant.POS
        # Check that the genotypes are phased.
        #if any([not phased for _, _, phased in variant.genotypes]):
        #    raise ValueError("Unphased genotypes for variant at position", pos)
        alleles = [variant.REF] + variant.ALT # Exactly as in the input VCF file.
        if warn_monomorphic_sites:
            if len(set(alleles) - {'.'}) == 1:
                print(f"Monomorphic site at {pos}")
        ancestral = variant.INFO.get("AA", variant.REF) # Dangerous action!!!
        # Ancestral state must be first in the allele list.
        ordered_alleles = [ancestral] + list(set(alleles) - {ancestral})
        # Create an index mapping from the input VCF to tsinfer input.
        allele_index = {
            old_index: ordered_alleles.index(allele)
            for old_index, allele in enumerate(alleles)
        }
        # When genotype is missing...
        if variant.num_unknown > 0:
            allele_index[-1] = tskit.MISSING_DATA
            ordered_alleles += [None]
        # Map original allele indexes to their indexes in the new alleles list.
        genotypes = [
            allele_index[old_index]
            for row in variant.genotypes # cyvcf2 uses -1 to indicate missing data.
            for old_index in row[0:ploidy_level] # Each is a 3-tuple (allele 1, allele 2, is phased?).
        ]
        samples.add_site(pos, genotypes=genotypes, alleles=ordered_alleles)


def create_sample_data_from_vcf_file(vcf_file):
    vcf = cyvcf2.VCF(vcf_file,
                     gts012 = False, # 0=HOM_REF, 1=HET, 2=UNKNOWN, 3=HOM_ALT
                     strict_gt = True)
    with tsinfer.SampleData(
        sequence_length = get_sequence_length(vcf)
    ) as samples:
        populations = add_populations(vcf, samples)
        add_individuals(vcf, samples, ploidy_level, populations)
        add_sites(vcf, samples, ploidy_level)
    return(samples)

In [3]:
def find_common_biallelic_sites(sd_1, sd_2):
    """
    Iterate through the variants in two SampleData objects
    to identify biallelic sites contained in both of the objects.
    """
    variants_1 = sd_1.variants()
    variants_2 = sd_2.variants()
    
    # Keep only biallelic sites
    sites_1 = []
    sites_2 = []
    
    for var_1, var_2 in zip(variants_1, variants_2):
        assert var_1.site.position == var_2.site.position
        alleles_1 = set(var_1.alleles) - {None}
        alleles_2 = set(var_2.alleles) - {None}
        if len(alleles_1) == 2 and alleles_1 == alleles_2:
            sites_1.append(var_1.site.id)
            sites_2.append(var_2.site.id)
            
    assert len(sites_1) == len(sites_2),\
        "The number of site positions in sites_1 and sites_2 are different."
    
    num_sites_1_all = len(sd_1.sites_position)
    num_sites_2_all = len(sd_2.sites_position)
    num_sites_1_biallelic = len(sites_1)
    num_sites_2_biallelic = len(sites_2)
    
    return(sites_1, sites_2)

In [4]:
def get_random_site_mask(ts, missing, mask=True):
    """
    TODO
    """
    assert missing >=0 and missing <= 1,\
        "Proportion of missing sites is not between 0 and 1."
    site_mask = np.random.random(ts.num_sites) < missing
    site_positions = np.array([s.position for s in ts.sites()])[site_mask]
    if mask:
        return(site_mask)
    else:
        return(site_positions)

In [5]:
def convert_into_ancestor_tree_sequence(ts, samples):
    """
    Remove the tips (or the sample nodes at time 0) from a tree sequence,
    and return an ancestor tree sequence.
    
    Presently, there is an extra step to remove the metadata from the
    ancestor tree sequence.
    """
    ts_tipless = make_ancestors_ts(samples=samples, ts=ts, remove_leaves=True)
    tmp_tables = ts_tipless.dump_tables()
    tmp_tables.populations.metadata_schema = tskit.MetadataSchema(schema=None)
    ts_new = tmp_tables.tree_sequence()
    return(ts_new)

In [6]:
def impute_genotypes_using_ts_only(ref_vcf_file,
                                   miss_vcf_file,
                                   imputed_vcf_file,
                                   imputed_ts_file,
                                   ts_anc_ref,
                                   contig_id):
    sd_ref = create_sample_data_from_vcf_file(ref_vcf_file)
    sd_miss = create_sample_data_from_vcf_file(miss_vcf_file)
    
    # Clean ts_anc_ref
    tmp_tables = ts_anc_ref.dump_tables()
    tmp_tables.individuals.clear()
    ts_anc_ref = tmp_tables.tree_sequence()
    
    ts_fixed = tsinfer.match_samples(sample_data=sd_miss,
                                     ancestors_ts=ts_anc_ref)
    
    with gzip.open(imputed_vcf_file, "wt") as f:
        ts_fixed.write_vcf(f, contig_id=contig_id)
    ts_fixed.dump(imputed_ts_file)
    
    return(ts_fixed)

In [7]:
def impute_genotypes_using_tsinfer(ref_vcf_file,
                                   miss_vcf_file,
                                   imputed_vcf_file,
                                   imputed_ts_file,
                                   contig_id):
    sd_ref = create_sample_data_from_vcf_file(ref_vcf_file)
    sd_miss = create_sample_data_from_vcf_file(miss_vcf_file)
    ad_ref = tsinfer.generate_ancestors(sample_data=sd_ref)
    
    # This step is to infer a tree sequence from the sample data.
    ts_anc_ref = tsinfer.match_ancestors(sample_data=sd_ref, ancestor_data=ad_ref)
    
    # Retain only the sites used during ts inference
    sites_infer_sd_ref = ts_anc_ref.tables.sites.position # Only sites used in inference are kept in ts
    sites_sd_miss = sd_miss.sites_position[:] # Not yet in the API doc
    site_ids_keep = np.where(np.isin(sites_sd_miss, sites_infer_sd_ref)[0])
    sd_miss.subset(sites=site_ids_keep)
    
    ts_matched = tsinfer.match_samples(sample_data=sd_miss, ancestors_ts=ts_anc_ref)
    
    with gzip.open(imputed_vcf_file, "wt") as f:
        ts_matched.write_vcf(vcf, contig_id=contig_id)
    ts_matched.dump(imputed_ts_file)
    
    return(ts_matched)

## Create data sets via simulations.

In [8]:
base_dir = "../data/ancient_panmictic_haploid_miss80_time1e2/"

sampling_time_query = 100

num_replicates = 10

size_query = 100
size_ref   = 1_000

eff_pop_size = 10_000
mutation_rate = 1e-8
recombination_rate = 1e-8

proportion_missing_sites = 0.80

contig_id = '1'
ploidy_level = 1
sequence_length = 10_000_000 # 10 Mbp

print(f"Size of the reference panel is {size_ref}")
print(f"Size of the query is {size_query}")
print(f"Ploidy level is {ploidy_level}")
print(f"Population size is {eff_pop_size}")
print(f"Sampling time query : {sampling_time_query}")
print(f"Base directory : {base_dir}")

Size of the reference panel is 1000
Size of the query is 100
Ploidy level is 1
Population size is 10000
Sampling time query : 100
Base directory : ../data/ancient_panmictic_haploid_miss80_time1e2/


In [9]:
rate_map = msprime.RateMap.uniform(
    sequence_length = sequence_length,
    rate = recombination_rate
)

In [10]:
sample_set = [
    msprime.SampleSet(num_samples = size_query,
                      time = sampling_time_query,
                      ploidy = ploidy_level),
    msprime.SampleSet(num_samples = size_ref,
                      time = 0,
                      ploidy = ploidy_level)
]

In [11]:
src_ts = [] # List of full ts.
anc_ts = [] # List of ancestor ts.

In [12]:
print(f"Simulating {num_replicates} tree sequences.")

tic = time.time()

for i in np.arange(num_replicates):
    sim_ts = msprime.sim_mutations(
        msprime.sim_ancestry(
            samples=sample_set,
            population_size=eff_pop_size,
            model="hudson",
            recombination_rate=rate_map,
            discrete_genome=True
        ),
        rate=mutation_rate,
        discrete_genome=True
    )
    src_ts.append(sim_ts)
    
toc = time.time()
print(f"Simulation of {num_replicates} ts took {round(toc - tic, 2)} seconds.")

Simulating 10 tree sequences.
Simulation of 10 ts took 5.14 seconds.


In [13]:
individuals_query = np.arange(size_query, dtype=int)
individual_names_query = ["query_" + str(i) for i in individuals_query]
samples_query = np.arange(ploidy_level * size_query, dtype=int)

individuals_ref = np.arange(size_query, size_query + size_ref, dtype=int)
individual_names_ref = ["ref_" + str(i) for i in individuals_ref]
samples_ref = np.arange(ploidy_level * size_query, ploidy_level * (size_query + size_ref), dtype=int)

In [16]:
for i, ts in enumerate(src_ts[:1]):
    print(f"Processing ts {i}.")
    tic = time.time()
    
    ref_vcf_file = base_dir + "ref/"  + "ref."  + str(i) + ".vcf.gz"
    true_vcf_file = base_dir + "true/" + "true." + str(i) + ".vcf.gz"
    miss_vcf_file = base_dir + "miss/" + "miss." + str(i) + ".vcf.gz"
    
    ts_full_ref_file = base_dir + "ref/" + "ts_full_ref." + str(i) + ".trees"
    ts_anc_ref_file = base_dir + "ts_anc_ref/" + "ts_anc_ref." + str(i) + ".trees"
    
    print("\tGetting ancestors ts...")
    ts_anc_ref = convert_into_ancestor_tree_sequence(ts, samples=samples_ref)
    anc_ts.append(ts_anc_ref)
    
    ts.dump(ts_full_ref_file)
    ts_anc_ref.dump(ts_anc_ref_file)
    
    masked_positions = get_random_site_mask(ts, missing=proportion_missing_sites, mask=False)
    
    def get_sample_mask(variant):
        num_samples = len(individuals_query)
        if variant.site.position not in masked_positions:
            return(np.ones(num_samples, dtype=bool))
        else:
            return(np.zeros(num_samples, dtype=bool))
    
    print("\tPrinting reference VCF...")
    with gzip.open(ref_vcf_file, "wt") as f:
        ts.write_vcf(f,
                     individuals=individuals_ref,
                     individual_names=individual_names_ref)
        
    print("\tPrinting query VCF with non-missing genotypes...")
    with gzip.open(true_vcf_file, "wt") as f:
        ts.write_vcf(f,
                     individuals=individuals_query,
                     individual_names=individual_names_query)
        
    print("\tPrinting query VCF with missing genotypes...")
    with gzip.open(miss_vcf_file, "wt") as f:
        ts.write_vcf(f,
                     individuals=individuals_query,
                     individual_names=individual_names_query,
                     sample_mask=get_sample_mask)
        
    toc = time.time()
    print(f"\tTook {toc - tic} seconds to process ts {i}.")

Processing ts 0.
	Getting ancestors ts...
	Printing query VCF with missing genotypes...


ValueError: Sample mask must be a numpy array of size num_samples

## Perform genotype imputation.

In [15]:
print("Doing imputation using ts only.")

for i in np.arange(len(src_ts[:1])):
    print(f"Imputing VCF {i}")
    tic = time.time()
    
    ref_vcf_file     = base_dir + "ref/"  + "ref."  + str(i) + ".vcf.gz"
    miss_vcf_file    = base_dir + "miss/" + "miss." + str(i) + ".vcf.gz"
    imputed_vcf_file = base_dir + "imputed_tsonly/" + "imputed." + str(i) + ".vcf.gz"
    ts_imputed_file  = base_dir + "imputed_tsonly/" + "imputed." + str(i) + ".trees"
    ts_imputed       = impute_genotypes_using_ts_only(ref_vcf_file=ref_vcf_file,
                                                      miss_vcf_file=miss_vcf_file,
                                                      imputed_vcf_file=imputed_vcf_file,
                                                      imputed_ts_file=ts_imputed_file,
                                                      ts_anc_ref=anc_ts[i],
                                                      contig_id=contig_id)
    
    toc = time.time()
    print(f"Took {toc - tic} seconds to process ts {i}.")

Doing imputation using ts only.
Imputing VCF 0


LibraryError: Individual out of bounds. (TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS)

In [20]:
print("Doing imputation using tsinfer.")

for i in np.arange(len(src_ts[:1])):
    print(f"Imputing VCF {i}")
    tic = time.time()
    
    ref_vcf_file     = base_dir + "ref/" + "ref."  + str(i) + ".vcf.gz"
    miss_vcf_file    = base_dir + "miss/" + "miss." + str(i) + ".vcf.gz"
    imputed_vcf_file = base_dir + "imputed_tsinfer/" + "imputed." + str(i) + ".vcf.gz"
    ts_imputed_file  = base_dir + "imputed_tsinfer/" + "imputed." + str(i) + ".trees"
    ts_imputed       = impute_genotypes_using_tsinfer(ref_vcf_file=ref_vcf_file,
                                                      miss_vcf_file=miss_vcf_file,
                                                      imputed_vcf_file=imputed_vcf_file,
                                                      imputed_ts_file=ts_imputed_file,
                                                      contig_id=contig_id)
    
    toc = time.time()
    print(f"Took {toc - tic} seconds to process ts {i}.")

Doing imputation using tsinfer.
Imputing VCF 0


IndexError: index 6121 is out of bounds for axis 0 with size 6121

In [27]:
sd_ref = create_sample_data_from_vcf_file(ref_vcf_file)
sd_ref_site_pos = [s.position for s in sd_ref.sites()]
print(sd_ref_site_pos)

[322.0, 1124.0, 1147.0, 2091.0, 2202.0, 2219.0, 2248.0, 2293.0, 2591.0, 4040.0, 4533.0, 5611.0, 5761.0, 6074.0, 6328.0, 6521.0, 7420.0, 7597.0, 7798.0, 7871.0, 8040.0, 8184.0, 8421.0, 8457.0, 9067.0, 9605.0, 9813.0, 10029.0, 10346.0, 10414.0, 10420.0, 10604.0, 10983.0, 11044.0, 11301.0, 11582.0, 11962.0, 12620.0, 12818.0, 13812.0, 13850.0, 13973.0, 14058.0, 14400.0, 14463.0, 14544.0, 14662.0, 14758.0, 14956.0, 15388.0, 15637.0, 15980.0, 16824.0, 16836.0, 16972.0, 17090.0, 17109.0, 17957.0, 18200.0, 18809.0, 19778.0, 19986.0, 20220.0, 20691.0, 21142.0, 21205.0, 21244.0, 21674.0, 22193.0, 22275.0, 22884.0, 23548.0, 23839.0, 23894.0, 24015.0, 24653.0, 24671.0, 24963.0, 25058.0, 27395.0, 27495.0, 28058.0, 28297.0, 28565.0, 28568.0, 29030.0, 30046.0, 30150.0, 30314.0, 30645.0, 30880.0, 31038.0, 31412.0, 31590.0, 32574.0, 32971.0, 33184.0, 33433.0, 33940.0, 34124.0, 35107.0, 36367.0, 36552.0, 36587.0, 36794.0, 37422.0, 37848.0, 37989.0, 38006.0, 38020.0, 38941.0, 38996.0, 39739.0, 39874.0, 3

In [28]:
sd_miss = create_sample_data_from_vcf_file(miss_vcf_file)
sd_miss_site_pos = [s.position for s in sd_miss.sites()]
print(sd_miss_site_pos)

[5611.0, 6521.0, 7420.0, 7597.0, 8184.0, 8421.0, 9605.0, 10346.0, 11301.0, 12620.0, 13812.0, 14544.0, 14662.0, 15388.0, 16824.0, 23894.0, 24015.0, 24671.0, 30645.0, 31038.0, 39739.0, 39874.0, 41619.0, 45425.0, 46187.0, 47033.0, 49623.0, 52616.0, 53452.0, 54823.0, 60158.0, 60526.0, 64562.0, 64888.0, 66300.0, 67698.0, 71131.0, 71946.0, 74318.0, 76798.0, 77595.0, 79108.0, 81144.0, 84613.0, 88159.0, 90624.0, 91575.0, 92205.0, 92359.0, 94146.0, 94799.0, 94973.0, 96391.0, 97059.0, 98139.0, 98461.0, 98973.0, 101592.0, 102672.0, 103199.0, 115103.0, 116651.0, 121684.0, 121882.0, 121933.0, 121999.0, 122944.0, 123460.0, 124843.0, 125155.0, 127127.0, 129402.0, 130444.0, 133988.0, 135098.0, 137995.0, 138869.0, 140345.0, 141948.0, 144252.0, 144931.0, 144960.0, 145198.0, 148201.0, 149083.0, 150134.0, 150243.0, 150419.0, 152921.0, 153044.0, 154461.0, 156074.0, 156103.0, 158170.0, 158891.0, 159716.0, 160312.0, 160491.0, 161324.0, 164551.0, 165206.0, 165288.0, 166287.0, 166992.0, 171054.0, 175669.0, 176

In [31]:
ad_ref = tsinfer.generate_ancestors(sample_data=sd_ref)

In [32]:
ts_anc_ref = tsinfer.match_ancestors(sample_data=sd_ref, ancestor_data=ad_ref)

In [47]:
ts_anc_ref.tables.sites.position

array([1.124000e+03, 2.091000e+03, 2.202000e+03, ..., 9.998793e+06,
       9.999200e+06, 9.999726e+06])

In [48]:
sd_miss.subset(sites=np.where(np.isin(sd_miss.sites_position[:], ts_anc_ref.tables.sites.position))[0])

<tsinfer.formats.SampleData at 0x7fbcc87281f0>

In [51]:
ts_anc_ref.tables.mutations

id,site,node,time,derived_state,parent,metadata,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72,Unnamed: 73,Unnamed: 74,Unnamed: 75,Unnamed: 76,Unnamed: 77,Unnamed: 78,Unnamed: 79,Unnamed: 80,Unnamed: 81,Unnamed: 82,Unnamed: 83,Unnamed: 84,Unnamed: 85,Unnamed: 86,Unnamed: 87,Unnamed: 88,Unnamed: 89,Unnamed: 90,Unnamed: 91,Unnamed: 92,Unnamed: 93,Unnamed: 94,Unnamed: 95,Unnamed: 96,Unnamed: 97,Unnamed: 98,Unnamed: 99,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103
0,0,15462,,T,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,16901,,C,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,4018,,A,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3,9997,,C,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,4,8823,,A,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,5,828,,A,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,6,6663,,G,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,7,10872,,C,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,8,6663,,C,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,9,12430,,G,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [12]:
print("Doing imputation using BEAGLE.")

beagle_exe = "../analysis/beagle/beagle.28Jun21.220.jar"

for i in np.arange(len(src_ts[:1])):
    ref_vcf_file     = base_dir + "ref/"  + "ref."  + str(i) + ".vcf"
    miss_vcf_file    = base_dir + "miss/" + "miss." + str(i) + ".vcf"
    imputed_vcf_file = base_dir + "imputed_beagle/" + "imputed." + str(i)
    beagle_cmd = [
        "java", "-jar", beagle_exe,
        "ref=" + ref_vcf_file,
        "gt="  + miss_vcf_file,
        "out=" + imputed_vcf_file
    ]
    beagle_cmd = " ".join(beagle_cmd)
    print(beagle_cmd + "\n")

Doing imputation using BEAGLE.
java -jar ../analysis/beagle/beagle.28Jun21.220.jar ref=../data/ancient_panmictic_haploid_miss80/ref/ref.0.vcf gt=../data/ancient_panmictic_haploid_miss80/miss/miss.0.vcf out=../data/ancient_panmictic_haploid_miss80/imputed_beagle/imputed.0

