In [1]:
import math
import os
import random
import sys
import time

import gzip

import numpy as np

import msprime
import tskit
import tsinfer
from tsinfer import make_ancestors_ts

import cyvcf2
import demes
import demesdraw

print(f"tskit {tskit.__version__}")
print(f"tsinfer {tsinfer.__version__}")
print(f"msprime {msprime.__version__}")
print(f"demes {demes.__version__}")
print(f"cyvcf2 {cyvcf2.__version__}")

tskit 0.5.0
tsinfer 0.2.3
msprime 1.1.1
demes 0.2.1
cyvcf2 0.30.14


### Simulate genealogy and genetic variation

In [2]:
sampling_time_query = 100

size_query = 100
size_ref   = 1_000

eff_pop_size = 10_000
mutation_rate = 1e-8
recombination_rate = 1e-8

proportion_missing_sites = 0.80

contig_id = '1'
ploidy_level = 1
sequence_length = 1_000_000

Size of the reference panel is 1000
Size of the query is 100
Ploidy level is 1
Population size is 10000
Sampling time query : 100


In [3]:
print(f"Size of the reference panel is {size_ref}")
print(f"Size of the query is {size_query}")
print(f"Ploidy level is {ploidy_level}")
print(f"Population size is {eff_pop_size}")
print(f"Sampling time query : {sampling_time_query}")

Size of the reference panel is 1000
Size of the query is 100
Ploidy level is 1
Population size is 10000
Sampling time query : 100


In [4]:
rate_map = msprime.RateMap.uniform(
    sequence_length=sequence_length,
    rate=recombination_rate,
)

In [5]:
sample_set = [
    msprime.SampleSet(num_samples=size_query,
                      time=sampling_time_query,
                      ploidy=ploidy_level),
    msprime.SampleSet(num_samples=size_ref,
                      time=0,
                      ploidy=ploidy_level),
]

In [7]:
ts_full = msprime.sim_mutations(
    msprime.sim_ancestry(
        samples=sample_set,
        population_size=eff_pop_size,
        model="hudson",
        recombination_rate=rate_map,
        discrete_genome=True,
    ),
    rate=mutation_rate,
    discrete_genome=True,
)

In [8]:
individuals_query = np.arange(size_query, dtype=int)
individual_names_query = ["query_" + str(i) for i in individuals_query]
samples_query = np.arange(ploidy_level * size_query, dtype=int)

In [9]:
individuals_ref = np.arange(size_query, size_query + size_ref, dtype=int)
individual_names_ref = ["ref_" + str(i) for i in individuals_ref]
samples_ref = np.arange(ploidy_level * size_query, ploidy_level * (size_query + size_ref), dtype=int)

In [10]:
print("Samples - query")
print(samples_query[:10])
print(samples_query[-10:])
print("\n")
print("Sample - reference panel")
print(samples_ref[:10])
print(samples_ref[-10:])

Samples - query
[0 1 2 3 4 5 6 7 8 9]
[90 91 92 93 94 95 96 97 98 99]


Sample - reference panel
[100 101 102 103 104 105 106 107 108 109]
[1090 1091 1092 1093 1094 1095 1096 1097 1098 1099]


### Match query genomes to an ancestor tree sequence

In [11]:
def create_sample_data_from_tree_sequence(ts, skip_multiallelic_sites=False):
    """
    Match samples in a SampleData object to an ancestors ts,
    without the need to read from a VCF file.
    
    Optionally, skip multi-allelic sites.
    """
    with tsinfer.SampleData(ts.sequence_length) as sample_data:
        for variant in ts.variants():
            # Skip multi-allelic sites
            if skip_multiallelic_sites and len(variant.alleles) > 2:
                continue
            sample_data.add_site(
                position=variant.site.position,
                genotypes=variant.genotypes,
                alleles=variant.alleles,
            )
    return(sample_data)

In [12]:
sd_full = create_sample_data_from_tree_sequence(ts_full, skip_multiallelic_sites=True)

sd_query = sd_full.subset(individuals=individuals_query) # Used for matching

In [13]:
ts_ref = ts_full.simplify(samples_ref, filter_sites=False) # Remove private branches

ts_anc = make_ancestors_ts(ts=ts_ref, remove_leaves=True, samples=None)

tmp_tables = ts_anc.dump_tables()
tmp_tables.individuals.clear()
tmp_tables.populations.metadata_schema = tskit.MetadataSchema(schema=None)

ts_anc = tmp_tables.tree_sequence() # Used for matching

In [14]:
ts_matched = tsinfer.match_samples(sample_data=sd_query, ancestors_ts=ts_anc)
ts_matched

Tree Sequence,Unnamed: 1
Trees,1297
Sequence Length,1000000.0
Time Units,generations
Sample Nodes,100
Total Size,579.0 KiB
Metadata,dict

Table,Rows,Size,Has Metadata
Edges,7447,232.7 KiB,
Individuals,100,3.0 KiB,✅
Migrations,0,8 Bytes,
Mutations,2090,75.5 KiB,
Nodes,1857,50.8 KiB,
Populations,0,8 Bytes,
Provenances,8,5.3 KiB,
Sites,3064,153.4 KiB,✅


### Impute into query genomes from the ancestor tree sequence

In [15]:
def mask_sites_in_sample_data(sd, sequence_length, sites_to_mask=[]):
    """
    Create a SampleData object from an existing SampleData object,
    while masking out specified sites listed in sites_to_mask.
    """
    with tsinfer.SampleData(sequence_length) as sample_data:
        for i, variant in enumerate(sd.variants()):
            if i in sites_to_mask:
                sample_data.add_site(
                    position=variant.site.position,
                    genotypes=np.repeat(tskit.MISSING_DATA, len(variant.genotypes)),
                    alleles=variant.alleles, # Keep the same
                )
            else:
                sample_data.add_site(
                    position=variant.site.position,
                    genotypes=variant.genotypes,
                    alleles=variant.alleles,
                )
    return(sample_data)

In [17]:
masked_sites = random.sample(
    [i for i in range(sd_query.num_sites)],
    sd_query.num_sites
)

sd_query_masked = mask_sites_in_sample_data(
    sd_query,
    sequence_length=sequence_length,
    sites_to_mask=masked_sites
)

In [18]:
ts_masked_matched = tsinfer.match_samples(sample_data=sd_query_masked, ancestors_ts=ts_anc)
ts_masked_matched

Tree Sequence,Unnamed: 1
Trees,627
Sequence Length,1000000.0
Time Units,generations
Sample Nodes,100
Total Size,253.4 KiB
Metadata,dict

Table,Rows,Size,Has Metadata
Edges,1564,48.9 KiB,
Individuals,100,3.0 KiB,✅
Migrations,0,8 Bytes,
Mutations,373,13.5 KiB,
Nodes,654,17.9 KiB,
Populations,0,8 Bytes,
Provenances,8,5.3 KiB,
Sites,3064,152.6 KiB,✅


### Compare the true and imputed tree sequences

In [19]:
ts_matched

Tree Sequence,Unnamed: 1
Trees,1297
Sequence Length,1000000.0
Time Units,generations
Sample Nodes,100
Total Size,579.0 KiB
Metadata,dict

Table,Rows,Size,Has Metadata
Edges,7447,232.7 KiB,
Individuals,100,3.0 KiB,✅
Migrations,0,8 Bytes,
Mutations,2090,75.5 KiB,
Nodes,1857,50.8 KiB,
Populations,0,8 Bytes,
Provenances,8,5.3 KiB,
Sites,3064,153.4 KiB,✅


In [20]:
ts_masked_matched

Tree Sequence,Unnamed: 1
Trees,627
Sequence Length,1000000.0
Time Units,generations
Sample Nodes,100
Total Size,253.4 KiB
Metadata,dict

Table,Rows,Size,Has Metadata
Edges,1564,48.9 KiB,
Individuals,100,3.0 KiB,✅
Migrations,0,8 Bytes,
Mutations,373,13.5 KiB,
Nodes,654,17.9 KiB,
Populations,0,8 Bytes,
Provenances,8,5.3 KiB,
Sites,3064,152.6 KiB,✅


### Write results to VCF

In [21]:
true_vcf_file = "test.true.vcf.gz"
imputed_vcf_file = "test.imputed.vcf.gz"

In [22]:
with gzip.open(true_vcf_file, "wt") as f:
    ts_matched.write_vcf(f)

In [23]:
with gzip.open(imputed_vcf_file, "wt") as f:
    ts_masked_matched.write_vcf(f)