In [1]:
import json
import os
import sys

import numpy as np
import pandas as pd

import sgkit as sg
from sgkit.io.vcf import vcf_to_zarr

import tsinfer
import tskit
import tszip

# Extend the module search path BEFORE importing the project-local modules.
# Previously this ran after those imports, where it could no longer help them
# resolve on a fresh kernel.
# NOTE(review): presumably `perform_imputation_by_sample_matching` and `util`
# live under ../../tsimpute/src/ -- confirm.
sys.path.append("../../tsimpute/src/")

import perform_imputation_by_sample_matching as pism
import util


### Split into reference panel and target cohorts

In [2]:
# Location of the compressed tree sequence files, relative to this notebook.
data_dir = "../data/trees/"

# data_dir already ends with a separator, so joining yields the same path
# string as plain concatenation.
ts_hc_bi_file = os.path.join(
    data_dir, "hgdp_tgp_sgdp_chr20_p.dated.hc_bi.trees.tsz"
)

# Load the tszip-compressed tree sequence into memory.
ts = tszip.decompress(ts_hc_bi_file)


In [3]:
# Number of individuals assigned to the reference panel; all remaining
# individuals in the tree sequence make up the target cohort.
ref_size = 700
target_size = ts.num_individuals - ref_size

# Report both sizes, one per line.
print(
    f"Reference panel size: {ref_size}",
    f"Target cohort size: {target_size}",
    sep="\n",
)


Reference panel size: 700
Target cohort size: 176


In [4]:
# Seed NumPy's global RNG so the reference/target split is reproducible.
np.random.seed(1234)

all_inds = np.arange(ts.num_individuals)

# Draw the reference panel without replacement and keep the IDs sorted.
ref_inds = np.sort(np.random.choice(all_inds, ref_size, replace=False))

# Every individual not drawn into the reference panel is a target.
# np.setdiff1d returns sorted, unique values and preserves the integer dtype
# (even when the result is empty), whereas round-tripping through Python sets
# leaves the ordering to set iteration order and yields a float array when
# the difference is empty.
target_inds = np.setdiff1d(all_inds, ref_inds)

print(f"Reference panel size: {len(ref_inds)}")
print(f"Target cohort size: {len(target_inds)}")

# Sanity checks: the two cohorts have exactly the planned sizes.
assert len(ref_inds) == ref_size
assert len(target_inds) == target_size


Reference panel size: 700
Target cohort size: 176


In [5]:
# Node IDs 0 .. num_nodes - 1.
all_nodes = np.arange(ts.num_nodes)

# Boolean masks over the node table: True where the node's individual ID
# is one of the cohort's individual IDs.
is_ref_node = np.isin(ts.nodes_individual, ref_inds)
is_target_node = np.isin(ts.nodes_individual, target_inds)

# Node IDs belonging to each cohort.
ref_nodes = all_nodes[is_ref_node]
target_nodes = all_nodes[is_target_node]


In [6]:
# Simplify the full tree sequence down to the reference-panel nodes, with
# every filter_* option switched on.
ref_simplify_kwargs = {
    "filter_populations": True,
    "filter_individuals": True,
    "filter_nodes": True,
    "filter_sites": True,
}
ref_ts = ts.simplify(samples=ref_nodes, **ref_simplify_kwargs)


In [7]:
# Simplify the full tree sequence down to the target-cohort nodes, with
# every filter_* option switched on.
target_simplify_kwargs = {
    "filter_populations": True,
    "filter_individuals": True,
    "filter_nodes": True,
    "filter_sites": True,
}
target_ts = ts.simplify(samples=target_nodes, **target_simplify_kwargs)


In [8]:
# Show the reference-panel tree sequence summary (tree and sample-node counts,
# per-table row counts and sizes) as this cell's output.
ref_ts


Tree Sequence,Unnamed: 1
Trees,38054
Sequence Length,28050000.0
Time Units,unknown
Sample Nodes,1400
Total Size,122.2 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,984341,30.0 MiB,
Individuals,700,241.3 KiB,✅
Migrations,0,8 Bytes,
Mutations,907372,32.0 MiB,
Nodes,164145,12.3 MiB,✅
Populations,54,2.7 KiB,✅
Provenances,18,11.9 KiB,
Sites,504314,40.1 MiB,✅


In [9]:
# Show the target-cohort tree sequence summary (tree and sample-node counts,
# per-table row counts and sizes) as this cell's output.
target_ts


Tree Sequence,Unnamed: 1
Trees,26906
Sequence Length,28050000.0
Time Units,unknown
Sample Nodes,352
Total Size,62.1 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,443714,13.5 MiB,
Individuals,176,60.6 KiB,✅
Migrations,0,8 Bytes,
Mutations,452473,16.0 MiB,
Nodes,86696,6.5 MiB,✅
Populations,50,2.5 KiB,✅
Provenances,18,11.9 KiB,
Sites,285446,22.6 MiB,✅


In [10]:
# Destination directory for the two tszip-compressed tree sequences.
out_dir = "../data/trees/"

# out_dir ends with a separator, so these joins give the same strings as
# plain concatenation.
ref_ts_file = os.path.join(out_dir, "ref.tsz")
target_ts_file = os.path.join(out_dir, "target.tsz")

# Write the reference panel first, then the target cohort.
tszip.compress(ref_ts, ref_ts_file)
tszip.compress(target_ts, target_ts_file)
