In [1]:
import os

import numpy as np
import tszip


### Split into reference panel and target cohorts

In [2]:
# Directory holding the compressed tree sequence files. Keep the trailing
# slash so that paths built as `trees_dir + filename` resolve inside the
# directory rather than next to it (e.g. "../data/treesref.tsz").
trees_dir = "../data/trees/"


In [None]:
# Load the full tree sequence that will be split into the two cohorts.
# os.path.join inserts the path separator whether or not `trees_dir` ends
# with one; plain concatenation would glue the file name onto the directory.
ts_hc_bi_file = os.path.join(trees_dir, "hgdp_tgp_sgdp_chr20_p.dated.hc_bi.trees.tsz")
ts = tszip.decompress(ts_hc_bi_file)


In [None]:
# Number of individuals placed in the reference panel; every remaining
# individual in `ts` goes to the target cohort.
ref_size = 700
target_size = ts.num_individuals - ref_size

# Fail early with a clear message if the split would leave either cohort
# empty, instead of failing later when the individuals are drawn.
assert 0 < ref_size < ts.num_individuals, (
    f"ref_size ({ref_size}) must be between 1 and {ts.num_individuals - 1}"
)

print(f"Reference panel size: {ref_size}")
print(f"Target cohort size: {target_size}")


In [None]:
# Seed NumPy's global RNG so the same individuals are drawn on every run.
np.random.seed(1234)

all_inds = np.arange(ts.num_individuals)
# Draw the reference panel without replacement; sort the IDs for a stable order.
ref_inds = np.sort(np.random.choice(all_inds, ref_size, replace=False))
# Every individual not drawn belongs to the target cohort. np.setdiff1d
# returns a sorted integer array (also when the result is empty), whereas a
# round trip through Python sets gives no ordering guarantee.
target_inds = np.setdiff1d(all_inds, ref_inds)

print(f"Reference panel size: {len(ref_inds)}")
print(f"Target cohort size: {len(target_inds)}")

# Sanity checks: both cohorts have the expected sizes and share no individual.
assert len(ref_inds) == ref_size
assert len(target_inds) == target_size
assert not np.isin(target_inds, ref_inds).any()


In [None]:
# Boolean masks over the nodes of `ts`: True where the node's individual ID
# is one of the IDs selected for the corresponding cohort.
all_nodes = np.arange(ts.num_nodes)
is_ref_node = np.isin(ts.nodes_individual, ref_inds)
is_target_node = np.isin(ts.nodes_individual, target_inds)

# Node IDs belonging to each cohort.
ref_nodes = all_nodes[is_ref_node]
target_nodes = all_nodes[is_target_node]


In [None]:
# Simplify `ts` down to the reference-panel nodes with every filter option
# switched on.
ref_simplify_kwargs = dict(
    filter_populations=True,
    filter_individuals=True,
    filter_nodes=True,
    filter_sites=True,
)
ref_ts = ts.simplify(samples=ref_nodes, **ref_simplify_kwargs)


In [None]:
# Simplify `ts` down to the target-cohort nodes with every filter option
# switched on.
target_simplify_kwargs = dict(
    filter_populations=True,
    filter_individuals=True,
    filter_nodes=True,
    filter_sites=True,
)
target_ts = ts.simplify(samples=target_nodes, **target_simplify_kwargs)


In [None]:
# Bare last expression: shows the simplified reference-panel tree sequence as the cell output.
ref_ts


In [None]:
# Bare last expression: shows the simplified target-cohort tree sequence as the cell output.
target_ts


In [None]:
# Write each cohort's tree sequence to a compressed file inside `trees_dir`.
# os.path.join supplies the path separator whether or not `trees_dir` ends
# with one; plain concatenation would produce names like "../data/treesref.tsz".
ref_ts_file = os.path.join(trees_dir, "ref.tsz")
target_ts_file = os.path.join(trees_dir, "target.tsz")
tszip.compress(ref_ts, ref_ts_file)
tszip.compress(target_ts, target_ts_file)
