In [1]:
import numpy as np
import tskit

import sys
sys.path.append("../src")
import simulate_ts


In [2]:
# Population-matched imputation: the reference panel and the query
# individuals are both drawn from the same population (CEU).
num_ref_inds = 1_000
num_query_inds = 500

# Simulation settings, collected in one place before the call.
sim_kwargs = dict(
    num_ref_inds=num_ref_inds,
    num_query_inds=num_query_inds,
    pop_ref='CEU',
    pop_query='CEU',
    sequence_length=1e6,    # 1 Mbp
    recombination_rate=0,   # CHANGE per scenario (0 here, matching the "rho_0" file naming)
)
ts_full = simulate_ts.get_ts_ten_pop(**sim_kwargs)
ts_full


Tree Sequence,Unnamed: 1
Trees,1
Sequence Length,1000000.0
Time Units,generations
Sample Nodes,3000
Total Size,843.3 KiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,5998,187.4 KiB,
Individuals,1500,41.0 KiB,
Migrations,0,8 Bytes,
Mutations,6529,235.9 KiB,
Nodes,5999,164.0 KiB,
Populations,0,183 Bytes,
Provenances,3,8.5 KiB,
Sites,6522,159.2 KiB,


In [3]:
# Prepare files for tsimpute.
# Every output file shares one prefix, which already contains the output directory.
out_dir = "./simulated_data/rho_0_mu_e-8"
prefix = f"{out_dir}/jacobs_ceu_ceu_2k.rho_0"
ts_full_file = f"{prefix}.full.trees"      # full simulated tree sequence
ts_ref_file = f"{prefix}.ref.trees"        # reference panel tree sequence
ts_query_file = f"{prefix}.query.trees"    # query tree sequence
npy_query_file = f"{prefix}.query.npy"     # query haplotype arrays


In [4]:
# Index bookkeeping for individuals and haplotypes (sample nodes).
# Assumed node-id layout: reference haplotypes come first, query haplotypes
# follow, and all remaining nodes are non-samples. The assertions below check
# that the sample flags agree with this layout.
ploidy = 2
num_ref_haps = ploidy * num_ref_inds
num_query_haps = ploidy * num_query_inds
num_sample_haps = num_ref_haps + num_query_haps

idx_ref_inds = np.arange(num_ref_inds)
idx_query_inds = np.arange(num_ref_inds, num_ref_inds + num_query_inds)
idx_ref_haps = np.arange(num_ref_haps)
idx_query_haps = np.arange(num_ref_haps, num_sample_haps)

# The first num_sample_haps nodes are flagged 1 (sample), the rest 0.
flags_samples = ts_full.nodes_flags[:num_sample_haps]
flags_others = ts_full.nodes_flags[num_sample_haps:]
assert (flags_samples == 1).all()
assert (flags_others == 0).all()
assert (ts_full.nodes_flags[idx_ref_haps] == 1).all()
assert (ts_full.nodes_flags[idx_query_haps] == 1).all()


In [5]:
# Simplify down to the reference haplotypes.
# filter_sites=True drops sites that no longer carry any mutation after
# simplification; sites that remain but are rare are handled separately below.
ts_ref = ts_full.simplify(samples=idx_ref_haps, filter_sites=True)
ts_ref


Tree Sequence,Unnamed: 1
Trees,1
Sequence Length,1000000.0
Time Units,generations
Sample Nodes,2000
Total Size,687.0 KiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,3998,124.9 KiB,
Individuals,1000,27.4 KiB,
Migrations,0,8 Bytes,
Mutations,6359,229.8 KiB,
Nodes,3999,109.4 KiB,
Populations,0,183 Bytes,
Provenances,4,9.0 KiB,
Sites,6352,155.1 KiB,


In [6]:
# Identify and remove sites with private mutations: sites whose rarest allele
# is carried by fewer than two reference haplotypes.
# `af` holds, per site, the smallest allele count among the reference samples.
af = np.fromiter(
    (min(variant.counts().values()) for variant in ts_ref.variants()),
    dtype=np.int32,
    count=ts_ref.num_sites,
)
sites_private_mutation = np.flatnonzero(af < 2)
print(f"Sites with private mutation: {len(sites_private_mutation)}")
ts_ref_filtered = ts_ref.delete_sites(site_ids=sites_private_mutation)
ts_ref_filtered


Sites with private mutation: 466


Tree Sequence,Unnamed: 1
Trees,1
Sequence Length,1000000.0
Time Units,generations
Sample Nodes,2000
Total Size,659.3 KiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,3998,124.9 KiB,
Individuals,1000,27.4 KiB,
Migrations,0,8 Bytes,
Mutations,5893,212.9 KiB,
Nodes,3999,109.4 KiB,
Populations,0,183 Bytes,
Provenances,5,9.6 KiB,
Sites,5886,143.7 KiB,


In [7]:
# Identify sites with high MAF (minor allele frequency of at least 5%).
# `maf` holds, per site, the smallest allele frequency among the reference samples.
maf = np.fromiter(
    (min(variant.frequencies().values()) for variant in ts_ref_filtered.variants()),
    dtype=np.float32,
    count=ts_ref_filtered.num_sites,
)
sites_high_maf = np.flatnonzero(maf >= 0.05)
print(f"Sites with high MAF: {len(sites_high_maf)}")


Sites with high MAF: 3068


In [9]:
# Randomly select genotyped markers among the high-MAF reference sites.
# A fixed seed keeps the marker set (and every file derived from it) identical
# across Restart & Run All; change MARKER_SEED to draw a different set.
MARKER_SEED = 12345
num_markers = int(3333 / 10) # Density of 3,333 markers per 10 Mb, scaled to the 1 Mb sequence
ref_idx_all = np.arange(ts_ref_filtered.num_sites)
rng = np.random.default_rng(MARKER_SEED)
ref_idx_m = rng.choice(sites_high_maf, size=num_markers, replace=False)
ref_idx_m.sort()    # In-place sort
# Every reference site that is not a genotyped marker is to be imputed.
ref_idx_x = np.setdiff1d(ref_idx_all, ref_idx_m)
assert np.union1d(ref_idx_m, ref_idx_x).size == ts_ref_filtered.num_sites


In [10]:
# Site positions of the genotyped (m) and ungenotyped (x) markers.
ref_site_positions = ts_ref_filtered.sites_position
ref_pos_m = ref_site_positions[ref_idx_m]
ref_pos_x = ref_site_positions[ref_idx_x]


In [11]:
# Summary of how the reference sites split into genotyped / ungenotyped markers.
print(
    f"Reference markers: {ts_ref_filtered.num_sites}",
    f"Genotyped markers: {len(ref_idx_m)}",
    f"Ungenotyped markers: {len(ref_idx_x)}",
    sep="\n",
)


Reference markers: 5886
Genotyped markers: 333
Ungenotyped markers: 5553


In [12]:
# Prepare query haplotypes by simplifying the full tree sequence down to the
# query sample nodes. filter_sites=False keeps the site table unchanged, so
# query sites can still be matched to reference sites by position afterwards.
# WARN: Extracting query haplotypes like this only works when using ACGT encoding.
ts_query = ts_full.simplify(samples=idx_query_haps, filter_sites=False)
ts_query


Tree Sequence,Unnamed: 1
Trees,1
Sequence Length,1000000.0
Time Units,generations
Sample Nodes,1000
Total Size,531.3 KiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,1998,62.4 KiB,
Individuals,500,13.7 KiB,
Migrations,0,8 Bytes,
Mutations,5989,216.4 KiB,
Nodes,1999,54.7 KiB,
Populations,0,183 Bytes,
Provenances,4,9.0 KiB,
Sites,6522,159.2 KiB,


In [13]:
# Filter sites in query haplotypes down to reference markers: drop every query
# site whose position does not occur among the filtered reference sites.
ref_marker_positions = ts_ref_filtered.sites_position
is_not_ref_marker = ~np.isin(ts_query.sites_position, ref_marker_positions)
remove_sites = np.flatnonzero(is_not_ref_marker)
ts_query_filtered = ts_query.delete_sites(site_ids=remove_sites)

# Query and reference must now describe exactly the same sites, in the same order.
assert ts_query_filtered.num_sites == ts_ref_filtered.num_sites
assert np.array_equal(ts_query_filtered.sites_position, ref_marker_positions)
ts_query_filtered


Tree Sequence,Unnamed: 1
Trees,1
Sequence Length,1000000.0
Time Units,generations
Sample Nodes,1000
Total Size,505.1 KiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,1998,62.4 KiB,
Individuals,500,13.7 KiB,
Migrations,0,8 Bytes,
Mutations,5679,205.2 KiB,
Nodes,1999,54.7 KiB,
Populations,0,183 Bytes,
Provenances,5,9.6 KiB,
Sites,5886,143.7 KiB,


In [14]:
# Unmasked query haplotypes: the true alleles at every reference marker.
# The matrix has one row per site and one column per query haplotype, with
# alleles encoded as integers under the fixed ACGT allele mapping.
ts_query_h = ts_query_filtered.genotype_matrix(alleles=tskit.ALLELES_ACGT)
print(ts_query_h.shape)
ts_query_h


(5886, 1000)


array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [3, 3, 3, ..., 3, 3, 3],
       [0, 0, 0, ..., 0, 2, 0],
       [2, 2, 2, ..., 2, 2, 2]], dtype=int32)

In [15]:
# Masked query haplotypes: a copy of the true haplotypes in which every
# ungenotyped marker (row) is overwritten with -1 for all query haplotypes.
ts_query_h_masked = np.copy(ts_query_h)
ts_query_h_masked[ref_idx_x] = -1
assert ts_query_h_masked.shape == ts_query_h.shape


In [16]:
# Write the query arrays back-to-back into a single file. The order below is
# the order in which they must be read back (one np.load call per array on the
# same open file handle).
arrays_in_save_order = (
    ts_query_h,          # true query haplotypes
    ts_query_h_masked,   # query haplotypes with ungenotyped markers set to -1
    ref_idx_m,           # site indices of genotyped markers
    ref_idx_x,           # site indices of ungenotyped markers
    ref_pos_m,           # positions of genotyped markers
    ref_pos_x,           # positions of ungenotyped markers
)
with open(npy_query_file, "wb") as f:
    for arr in arrays_in_save_order:
        np.save(f, arr)


In [17]:
# Persist the full, reference (filtered) and query (filtered) tree sequences.
for ts_out, path_out in (
    (ts_full, ts_full_file),
    (ts_ref_filtered, ts_ref_file),
    (ts_query_filtered, ts_query_file),
):
    ts_out.dump(path_out)


In [19]:
# Prepare files for BEAGLE 4.1.
import gzip

# Reference panel VCF: all filtered reference sites.
with gzip.open(prefix + ".ref.vcf.gz", "wt") as vcf_out:
    ts_ref_filtered.write_vcf(vcf_out)

# Query VCF: sites flagged True in site_mask are left out of the output, so
# only the genotyped markers are written and the rest is left to be imputed.
site_mask = np.isin(np.arange(ts_ref_filtered.num_sites), ref_idx_x)
assert np.count_nonzero(site_mask) == len(ref_idx_x)
with gzip.open(prefix + ".query.vcf.gz", "wt") as vcf_out:
    ts_query_filtered.write_vcf(vcf_out, site_mask=site_mask)


In [20]:
# Prepare files for tsinfer.
import tsinfer

# NOTE(review): the sample data is built from ts_ref (before the
# private-mutation sites were removed), not ts_ref_filtered — confirm this is intended.
sd_ref_file = f"{prefix}.ref.samples"
sd_ref = tsinfer.SampleData.from_tree_sequence(ts_ref, path=sd_ref_file)


### Enrich trees with ancestors

In [21]:
# Pick the "recent" ancestors: nodes with 0 < time <= max_node_age.
max_node_age = 100
node_times = ts_ref.nodes_time
is_recent_ancestor = (node_times > 0) & (node_times <= max_node_age)

num_ancestors_all = np.sum(ts_ref.nodes_flags != 1)   # every node not flagged as a sample
num_ancestors_recent = np.sum(is_recent_ancestor)
print(f"Ancestors (all)   : {num_ancestors_all}")
print(f"Ancestors (recent): {num_ancestors_recent}")

# Node ids of the recent ancestors, to be exposed as extra haplotypes.
extra_haps = np.flatnonzero(is_recent_ancestor)


Ancestors (all)   : 1999
Ancestors (recent): 1717


In [22]:
# Attach one new sample node beneath each selected ancestor. The new node sits
# at time -1 and is joined to the ancestor by an edge spanning the whole
# sequence (presumably so the ancestor's haplotype is available as a sample).
tables = ts_ref.dump_tables()
seq_len = ts_ref.sequence_length
for ancestor in extra_haps:
    new_sample = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=-1)
    tables.edges.add_row(left=0, right=seq_len, parent=ancestor, child=new_sample)
tables.sort()   # restore the required table ordering after appending rows
ts_enriched = tables.tree_sequence()
ts_enriched


Tree Sequence,Unnamed: 1
Trees,1
Sequence Length,1000000.0
Time Units,generations
Sample Nodes,3717
Total Size,801.0 KiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,5715,178.6 KiB,
Individuals,1000,27.4 KiB,
Migrations,0,8 Bytes,
Mutations,6359,229.8 KiB,
Nodes,5716,156.3 KiB,
Populations,0,183 Bytes,
Provenances,4,9.0 KiB,
Sites,6352,155.1 KiB,


In [23]:
# Save the enriched tree sequence next to the other outputs.
# `prefix` already starts with `out_dir`, so the directory must not be prepended again.
ts_enriched.dump(prefix + ".enriched.trees")
