In [1]:
from tqdm import tqdm

import numpy as np

import sgkit as sg
from sgkit.io.vcf import vcf_to_zarr

import tszip

import sys
sys.path.append("../../tsimpute/src/")
import compare_vcfs as cv
import impute_by_sample_matching as impute
import util


### Run tskit.lshmm


In [2]:
# Input/output directories, relative to this notebook. The trailing slash is
# required because file names are appended by plain string concatenation.
vcf_dir = "../data/vcf/"
zarr_dir = "../data/zarr/"
trees_dir = "../data/trees/"
lshmm_dir = "../analysis/lshmm/"


In [3]:
# Decompress the tszip-compressed reference tree sequence into memory.
ref_ts_file = f"{trees_dir}ref.tsz"
ref_ts = tszip.decompress(ref_ts_file)


In [4]:
# Load the target datasets from zarr. target_chip_ds_compat is the input to
# the imputation below; target_ds supplies the sample names and the
# samples/ploidy sizes used to check the imputed paths.
target_chip_ds_compat = sg.load_dataset(f"{zarr_dir}target_chip_compat.zarr")
target_ds = sg.load_dataset(f"{zarr_dir}target.zarr")


In [5]:
# Parameters handed to the imputation: the same constant probability at every
# reference site for both switching and mismatch.
# `precision` is also embedded in the output file names and sample metadata.
# NOTE(review): its exact meaning inside the HMM is defined by the imputation
# module, not here — confirm there.
precision = 25
mismatch_prob = np.full(ref_ts.num_sites, 1e-08)
switch_prob = np.full(ref_ts.num_sites, 1e-08)


In [6]:
# Run the sample-matching imputation of the chip-genotyped targets against the
# reference tree sequence. This is the expensive step: in the recorded run the
# middle progress bar alone took about 46 minutes.
matching_result = impute.impute_by_sample_matching(
    precision=precision,
    mismatch_prob=mismatch_prob,
    switch_prob=switch_prob,
    target_ds=target_chip_ds_compat,
    ref_ts=ref_ts,
)
# Three arrays come back. Only h2 is consumed later in this notebook (one row
# per target haplotype, one column per reference site); h1 and h3 are just
# saved to disk.
h1, h2, h3 = matching_result


100%|██████████| 7899/7899 [00:54<00:00, 145.62it/s]
100%|██████████| 352/352 [46:21<00:00,  7.90s/it]
100%|██████████| 504314/504314 [00:03<00:00, 127704.31it/s]


In [7]:
# Persist each result array as "<name>_p<precision>.npy" in the lshmm
# directory so the long imputation step does not have to be repeated.
for array_name, array in (("h1", h1), ("h2", h2), ("h3", h3)):
    np.save(f"{lshmm_dir}{array_name}_p{precision}.npy", array)


In [8]:
# Read h2 back from the file written above; everything below works from this
# on-disk copy rather than the in-memory result.
h2 = np.load(f"{lshmm_dir}h2_p{precision}.npy")


In [9]:
# Thread every imputed target haplotype into the reference tree sequence.
# Each row of h2 is used as the per-site node path of one haplotype; rows
# 2*k and 2*k + 1 belong to the k-th sample, so the data must be diploid.
num_samples = target_ds.sizes["samples"]
ploidy = target_ds.sizes["ploidy"]
assert ploidy == 2, f"expected diploid samples, found ploidy {ploidy}"
assert h2.shape[0] == num_samples * ploidy
assert h2.shape[1] == ref_ts.num_sites

# Take the sample names from the "#CHROM" column-header line of the stored VCF
# header rather than from a fixed line index, which silently breaks as soon as
# the number of meta-information lines changes. The first nine columns are the
# fixed VCF fields; everything after them is a sample name.
column_header = next(
    (
        line
        for line in target_ds.vcf_header.splitlines()
        if line.startswith("#CHROM")
    ),
    None,
)
assert column_header is not None, "no #CHROM line found in target_ds.vcf_header"
sample_names = column_header.split("\t")[9:]
assert len(sample_names) == num_samples, (
    f"VCF header lists {len(sample_names)} samples, dataset has {num_samples}"
)

# Start from the reference tree sequence and grow it one individual at a time.
lshmm_ts = ref_ts

for sample_idx, sample_name in enumerate(tqdm(sample_names)):
    # NOTE(review): this is a bare key/value list without enclosing braces,
    # i.e. not a complete JSON document. The bytes are kept exactly as before
    # because downstream readers may rely on this format — confirm.
    metadata = (
        f'"name": "{sample_name}", '
        '"status": "imputed", '
        '"recomb": "uniform", '
        f'"precision": "{precision}"'
    ).encode("ascii")

    # One path per haplotype of this sample, both sharing the same metadata.
    paths = [
        util.SamplePath(
            individual=sample_name,
            nodes=h2[2 * sample_idx + haplotype],
            site_positions=ref_ts.sites_position,
            metadata=metadata,
        )
        for haplotype in range(2)
    ]

    # NOTE(review): this only checks anything if `is_valid` is a property or
    # attribute; a bound method would always be truthy — confirm in util.
    assert all(path.is_valid for path in paths)

    # Only the updated tree sequence is kept; the first return value is unused.
    _, lshmm_ts = util.add_individual_to_tree_sequence(
        ts=lshmm_ts,
        paths=paths,
        metadata=metadata,
    )


100%|██████████| 176/176 [05:04<00:00,  1.73s/it]


In [10]:
# Display the summary of the tree sequence after adding the imputed individuals.
lshmm_ts


Tree Sequence,Unnamed: 1
Trees,43101
Sequence Length,28050000.0
Time Units,unknown
Sample Nodes,1752
Total Size,123.6 MiB
Metadata,No Metadata

Table,Rows,Size,Has Metadata
Edges,1020322,31.1 MiB,
Individuals,876,259.9 KiB,✅
Migrations,0,8 Bytes,
Mutations,907372,32.0 MiB,
Nodes,164497,12.3 MiB,✅
Populations,54,2.7 KiB,✅
Provenances,18,11.9 KiB,
Sites,504314,40.1 MiB,✅


In [11]:
# Write the augmented tree sequence to disk in tszip-compressed form.
lshmm_ts_file = f"{lshmm_dir}target_lshmm.tsz"
tszip.compress(lshmm_ts, lshmm_ts_file)


In [12]:
# Export the imputed individuals, and only those, as an uncompressed VCF.
lshmm_vcf_file = f"{vcf_dir}target_lshmm.vcf"

# Exactly one individual per sample name must have been added on top of the
# reference individuals. The ID range selected below presumably covers those
# new individuals (assumes they were appended after the reference ones —
# confirm in util.add_individual_to_tree_sequence).
assert lshmm_ts.num_individuals - len(sample_names) == ref_ts.num_individuals

with open(lshmm_vcf_file, "w") as vcf_out:
    lshmm_ts.write_vcf(
        output=vcf_out,
        contig_id="20",
        individuals=np.arange(ref_ts.num_individuals, lshmm_ts.num_individuals),
        individual_names=sample_names,
    )


In [13]:
%%bash
# Fail fast: otherwise only the last command's exit status reaches the
# notebook, so a failed bgzip would go unnoticed and a stale .vcf.gz from an
# earlier run would be indexed and used downstream.
set -euo pipefail
vcf_dir="../data/vcf/"
# -f makes the cell re-runnable: without it bgzip and bcftools index refuse to
# overwrite an existing .vcf.gz / index file.
bgzip -f "${vcf_dir}target_lshmm.vcf"
bcftools index -f "${vcf_dir}target_lshmm.vcf.gz"


In [14]:
# Convert the bgzipped VCF to zarr and load it back as a dataset.
# Note that lshmm_vcf_file is re-pointed from the plain .vcf to the .vcf.gz.
lshmm_vcf_file = f"{vcf_dir}target_lshmm.vcf.gz"
vcf_to_zarr(lshmm_vcf_file, f"{zarr_dir}target_lshmm.zarr")
lshmm_ds = sg.load_dataset(f"{zarr_dir}target_lshmm.zarr")


In [15]:
# Remap the imputed dataset with compare_vcfs.remap_to_acgt using 6 workers.
# Presumably this re-encodes alleles into a fixed A/C/G/T order so the dataset
# is comparable with the other "*_compat" datasets — confirm in compare_vcfs.
# Slow: the two progress bars of the recorded run took about 50 and 80 minutes.
lshmm_ds_compat = cv.remap_to_acgt(lshmm_ds, num_workers=6)


100%|██████████| 504314/504314 [49:40<00:00, 169.21it/s]  
100%|██████████| 504314/504314 [1:19:17<00:00, 106.01it/s]


In [16]:
# Store the remapped dataset next to the other zarr datasets.
sg.save_dataset(lshmm_ds_compat, f"{zarr_dir}target_lshmm_compat.zarr")
