In [1]:
import gzip

import msprime
import numpy as np
import tsinfer
import tskit


In [2]:
import sys
sys.path.append("../src")
import masks
import measures
import util
import simulate_ts


In [3]:
# Population-matched imputation: the reference and query panels are both
# requested from the same population label ("CEU").
num_ref_inds, num_query_inds = 1_500, 500
ts_full = simulate_ts.get_ts_ten_pop(
    num_ref_inds=num_ref_inds,
    num_query_inds=num_query_inds,
    sequence_length=1e7,  # 10 Mbp
    pop_ref="CEU",
    pop_query="CEU",
)
# Display the summary table of the simulated tree sequence.
ts_full


Tree Sequence,Unnamed: 1
Trees,68195
Sequence Length,10000000.0
Time Units,generations
Sample Nodes,4000
Total Size,15.7 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,252655,7.7 MiB,
Individuals,2000,54.7 KiB,
Migrations,0,8 Bytes,
Mutations,77673,2.7 MiB,
Nodes,51342,1.4 MiB,
Populations,0,183 Bytes,
Provenances,3,8.5 KiB,
Sites,77552,1.8 MiB,


In [4]:
# Prepare files for tsimpute.
# Every output file name is derived from one shared prefix.
prefix = "jacobs_ceu_ceu_2k"
ts_full_file = f"{prefix}.full.trees"
ts_ref_file = f"{prefix}.ref.trees"
ts_query_file = f"{prefix}.query.trees"
npy_query_file = f"{prefix}.query.npy"


In [5]:
# Index bookkeeping: individuals and haplotype (node) ids are laid out as
# [reference block | query block], with `ploidy` haplotypes per individual.
ploidy = 2
num_ref_haps = ploidy * num_ref_inds
num_query_haps = ploidy * num_query_inds

idx_ref_inds = np.arange(num_ref_inds)
idx_ref_haps = np.arange(num_ref_haps)
# Query indices start immediately after the reference indices.
idx_query_inds = num_ref_inds + np.arange(num_query_inds)
idx_query_haps = num_ref_haps + np.arange(num_query_haps)

# Sanity checks on node flags: the first (ref + query) nodes must all carry
# flag value 1 and every later node flag value 0.
# NOTE(review): 1 is presumably tskit's sample flag -- confirm.
assert (ts_full.nodes_flags[: num_ref_haps + num_query_haps] == 1).all()
assert (ts_full.nodes_flags[num_ref_haps + num_query_haps :] == 0).all()
assert (ts_full.nodes_flags[idx_ref_haps] == 1).all()
assert (ts_full.nodes_flags[idx_query_haps] == 1).all()


In [6]:
# Reduce the full tree sequence to the reference haplotypes only;
# filter_sites=True is used here to drop monoallelic sites.
ts_ref = ts_full.simplify(samples=idx_ref_haps, filter_sites=True)
ts_ref


Tree Sequence,Unnamed: 1
Trees,67250
Sequence Length,10000000.0
Time Units,generations
Sample Nodes,3000
Total Size,15.3 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,246862,7.5 MiB,
Individuals,1500,41.0 KiB,
Migrations,0,8 Bytes,
Mutations,76715,2.7 MiB,
Nodes,48571,1.3 MiB,
Populations,0,183 Bytes,
Provenances,4,9.0 KiB,
Sites,76594,1.8 MiB,


In [7]:
# Identify and remove sites with private mutations.
# For each site, `af` stores the smallest value reported by Variant.counts()
# (presumably per-allele haplotype counts -- TODO confirm). A value below 2
# means some allele is carried by at most one reference haplotype.
# np.fromiter fills the array directly from the variant iterator, so no
# manual index counter is left behind in the notebook namespace.
af = np.fromiter(
    (min(v.counts().values()) for v in ts_ref.variants()),
    dtype=np.int32,
    count=ts_ref.num_sites,
)
sites_private_mutation = np.flatnonzero(af < 2)
print(f"Sites with private mutation: {len(sites_private_mutation)}")
ts_ref_filtered = ts_ref.delete_sites(site_ids=sites_private_mutation)
ts_ref_filtered


Sites with private mutation: 4593


Tree Sequence,Unnamed: 1
Trees,67250
Sequence Length,10000000.0
Time Units,generations
Sample Nodes,3000
Total Size,15.0 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,246862,7.5 MiB,
Individuals,1500,41.0 KiB,
Migrations,0,8 Bytes,
Mutations,72107,2.5 MiB,
Nodes,48571,1.3 MiB,
Populations,0,183 Bytes,
Provenances,5,9.6 KiB,
Sites,72001,1.7 MiB,


In [8]:
# Identify sites with high MAF.
# `maf` stores, per site, the smallest value reported by Variant.frequencies();
# sites at or above the 0.05 threshold are candidates for genotyped markers.
# np.fromiter replaces the manual index counter used to fill the array.
maf = np.fromiter(
    (min(v.frequencies().values()) for v in ts_ref_filtered.variants()),
    dtype=np.float64,
    count=ts_ref_filtered.num_sites,
)
sites_high_maf = np.flatnonzero(maf >= 0.05)
print(f"Sites with high MAF: {len(sites_high_maf)}")


Sites with high MAF: 38662


In [9]:
# Randomly select genotyped markers
# The draw is seeded so that Restart & Run All selects the same markers and
# therefore writes the same masked query files every time.
marker_seed = 12345
rng = np.random.default_rng(marker_seed)

reference_markers = np.arange(ts_ref_filtered.num_sites)
num_markers = 3333 # Density of 3,333 markers per 10 Mb
# Sample without replacement from the high-MAF site ids only.
genotyped_markers = rng.choice(sites_high_maf, size=num_markers, replace=False)
genotyped_markers.sort()    # In-place sort
# Every reference site that was not drawn is treated as ungenotyped.
ungenotyped_markers = np.setdiff1d(reference_markers, genotyped_markers)
assert np.union1d(genotyped_markers,
                  ungenotyped_markers).size == ts_ref_filtered.num_sites


In [10]:
# Look up the site positions for the genotyped and ungenotyped site ids
# (same order as the two index arrays).
genotyped_site_pos, ungenotyped_site_pos = (
    ts_ref_filtered.sites_position[marker_ids]
    for marker_ids in (genotyped_markers, ungenotyped_markers)
)


In [11]:
# Report the three marker counts, one per line.
print(
    f"Reference markers: {ts_ref_filtered.num_sites}",
    f"Genotyped markers: {len(genotyped_markers)}",
    f"Ungenotyped markers: {len(ungenotyped_markers)}",
    sep="\n",
)


Reference markers: 72001
Genotyped markers: 3333
Ungenotyped markers: 68668


In [12]:
# Prepare query haplotypes: reduce the full tree sequence to the query samples.
# filter_sites=False is passed here, unlike for the reference panel; the site
# table is cut down to the reference markers in a later step.
ts_query = ts_full.simplify(samples=idx_query_haps, filter_sites=False)
ts_query


Tree Sequence,Unnamed: 1
Trees,61706
Sequence Length,10000000.0
Time Units,generations
Sample Nodes,1000
Total Size,13.9 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,220648,6.7 MiB,
Individuals,500,13.7 KiB,
Migrations,0,8 Bytes,
Mutations,71253,2.5 MiB,
Nodes,40724,1.1 MiB,
Populations,0,183 Bytes,
Provenances,4,9.0 KiB,
Sites,77552,1.8 MiB,


In [13]:
# Restrict the query sites to the reference markers, matching by position:
# any query site whose position is absent from the filtered reference panel
# is deleted.
remove_sites = np.flatnonzero(
    ~np.isin(ts_query.sites_position, ts_ref_filtered.sites_position)
)
ts_query_filtered = ts_query.delete_sites(site_ids=remove_sites)

# Both panels must now have the same sites, at the same positions.
assert ts_query_filtered.num_sites == ts_ref_filtered.num_sites
assert np.array_equal(
    ts_query_filtered.sites_position,
    ts_ref_filtered.sites_position,
)
ts_query_filtered


Tree Sequence,Unnamed: 1
Trees,61706
Sequence Length,10000000.0
Time Units,generations
Sample Nodes,1000
Total Size,13.7 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,220648,6.7 MiB,
Individuals,500,13.7 KiB,
Migrations,0,8 Bytes,
Mutations,68871,2.4 MiB,
Nodes,40724,1.1 MiB,
Populations,0,183 Bytes,
Provenances,5,9.6 KiB,
Sites,72001,1.7 MiB,


In [14]:
# Unmasked query haplotypes
# Genotype matrix of the filtered query tree sequence, encoded against
# tskit.ALLELES_ACGT. The printed shape is (sites, query haplotypes).
ts_query_h = ts_query_filtered.genotype_matrix(alleles=tskit.ALLELES_ACGT)
print(ts_query_h.shape)
ts_query_h


(72001, 1000)


array([[2, 2, 2, ..., 2, 2, 2],
       [2, 0, 2, ..., 0, 0, 2],
       [0, 0, 3, ..., 0, 0, 0],
       ...,
       [3, 3, 2, ..., 2, 3, 3],
       [2, 2, 2, ..., 2, 2, 2],
       [1, 1, 1, ..., 1, 1, 1]], dtype=int32)

In [15]:
# Masked query haplotypes
# Work on a copy so the unmasked matrix stays intact, then overwrite every
# ungenotyped site row with -1 (the marker used here for a masked entry).
ts_query_h_masked = np.copy(ts_query_h)
ts_query_h_masked[ungenotyped_markers, :] = -1
# Masking must not change the matrix dimensions.
assert ts_query_h.shape == ts_query_h_masked.shape


In [16]:
# Write the six arrays back-to-back into a single file, in this fixed order:
# unmasked genotypes, masked genotypes, genotyped / ungenotyped site ids,
# genotyped / ungenotyped site positions.
with open(npy_query_file, "wb") as f:
    for _array in (
        ts_query_h,
        ts_query_h_masked,
        genotyped_markers,
        ungenotyped_markers,
        genotyped_site_pos,
        ungenotyped_site_pos,
    ):
        np.save(f, _array)


In [17]:
# Persist the full, filtered-reference and filtered-query tree sequences
# to their respective .trees files.
for _ts, _path in (
    (ts_full, ts_full_file),
    (ts_ref_filtered, ts_ref_file),
    (ts_query_filtered, ts_query_file),
):
    _ts.dump(_path)


In [18]:
# Prepare files for BEAGLE 4.1
# Reference panel: write the filtered reference tree sequence as a
# gzip-compressed VCF (opened in text mode). `gzip` is imported in the
# notebook's top import cell rather than mid-notebook.
with gzip.open(prefix + ".ref.vcf.gz", "wt") as f:
    ts_ref_filtered.write_vcf(f)


In [20]:
# Boolean mask over the reference sites: True at every ungenotyped site id,
# False elsewhere.
site_mask = np.full(ts_ref_filtered.num_sites, False, dtype=bool)
site_mask[ungenotyped_markers] = True
# The number of True entries must equal the number of ungenotyped markers.
assert np.count_nonzero(site_mask) == len(ungenotyped_markers)


In [22]:
# Query panel: write the filtered query tree sequence as a gzip-compressed VCF,
# passing the ungenotyped-site mask to write_vcf.
# NOTE(review): presumably sites where site_mask is True are omitted from the
# output, leaving only the genotyped markers -- confirm against the tskit docs.
with gzip.open(f"{prefix}.query.vcf.gz", "wt") as vcf_out:
    ts_query_filtered.write_vcf(vcf_out, site_mask=site_mask)
