In [1]:
import json
import os
import sys

import numpy as np
import pandas as pd

import sgkit as sg
from sgkit.io.vcf import vcf_to_zarr

import tsinfer
import tskit
import tszip

sys.path.append("../../tsimpute/src/")
import util
import perform_imputation_by_sample_matching as pism


### Prepare data for imputation


In [2]:
# Location of the tszip-compressed inputs for the reference panel and the
# target cohort.
data_dir = "../data/trees/"
ref_ts_file, target_ts_file = (
    data_dir + file_name for file_name in ("ref.tsz", "target.tsz")
)

# Decompress both files (reference first, then target); the results are used
# below as tree-sequence objects (individuals(), num_sites, write_vcf, ...).
ref_ts, target_ts = map(tszip.decompress, (ref_ts_file, target_ts_file))


In [3]:
# Each individual's metadata is parsed as JSON and its "sample" entry is used
# as the individual's name, in the order individuals are stored in ref_ts.
ref_ind_names = [
    json.loads(individual.metadata)["sample"]
    for individual in ref_ts.individuals()
]
print(f"Number of individuals in ref_ts: {len(ref_ind_names)}")


Number of individuals in ref_ts: 700


In [4]:
# Each individual's metadata is parsed as JSON and its "sample" entry is used
# as the individual's name, in the order individuals are stored in target_ts.
target_ind_names = [
    json.loads(individual.metadata)["sample"]
    for individual in target_ts.individuals()
]
print(f"Number of individuals in target_ts: {len(target_ind_names)}")


Number of individuals in target_ts: 176


In [5]:
# Reference panel: export every site of ref_ts as a VCF for BEAGLE.
# Records are written on contig "20" and the sample columns are labelled with
# the names collected in ref_ind_names.
out_dir = "../data/vcf/"
out_file = out_dir + "ref.vcf"
with open(out_file, "w") as vcf_handle:
    ref_ts.write_vcf(output=vcf_handle, contig_id="20", individual_names=ref_ind_names)


In [6]:
# Target cohort: export every site of target_ts as a VCF, written next to the
# reference panel VCF in out_dir. Records are written on contig "20" and the
# sample columns are labelled with the names collected in target_ind_names.
out_file = out_dir + "target.vcf"
with open(out_file, "w") as vcf_handle:
    target_ts.write_vcf(output=vcf_handle, contig_id="20", individual_names=target_ind_names)


In [7]:
# Read the genotyping-array manifest and pull out its "MapInfo" column, which
# is compared against site positions further down to pick chip sites.
chip_dir = "../data/array/"
chip_file = chip_dir + "InfiniumOmniExpress-24v1-2_A2_chr20_pos.csv"
chip_df = pd.read_csv(chip_file)
chip_site_pos = chip_df.loc[:, "MapInfo"].values


In [8]:
# Report how many sites each dataset has, one line per dataset.
# The chip count covers both p- and q-arms.
print(
    f"Number of sites in ref. panel: {ref_ts.num_sites}",
    f"Number of sites in target cohort: {target_ts.num_sites}",
    f"Number of sites in chip: {len(chip_site_pos)}",
    sep="\n",
)


Number of sites in ref. panel: 504314
Number of sites in target cohort: 285446
Number of sites in chip: 18166


In [9]:
# Build a chip-like version of the target cohort by deleting every site whose
# position does not appear among the chip positions.
all_sites = np.arange(target_ts.num_sites)

# Boolean mask over target sites: True where the site position is NOT on the chip.
not_on_chip = np.isin(target_ts.sites_position, chip_site_pos, invert=True)
remove_sites = all_sites[not_on_chip]
target_chip_ts = target_ts.delete_sites(site_ids=remove_sites)

# Invariants of the filtering step: exactly the selected sites were removed,
# and every remaining site position is one of the chip positions.
assert target_chip_ts.num_sites == target_ts.num_sites - len(remove_sites)
assert np.isin(target_chip_ts.sites_position, chip_site_pos).all()

print(f"Number of sites in target cohort after removing sites not in chip: {target_chip_ts.num_sites}")


Number of sites in target cohort after removing sites not in chip: 7899


In [10]:
# Target cohort (chip-like)
# VCF for BEAGLE

# Removing sites must leave the set of individuals untouched. Check this
# before writing, so that a mismatch stops the cell without producing a VCF
# whose sample columns would not line up with target_ind_names.
assert target_chip_ts.num_individuals == target_ts.num_individuals, (
    "target_chip_ts and target_ts have different numbers of individuals"
)

out_file = out_dir + "target_chip.vcf"
with open(out_file, "w") as f:
    target_chip_ts.write_vcf(
        output=f,
        contig_id="20",
        individual_names=target_ind_names,
    )


In [13]:
%%bash
# Compress the three VCF files written above with bgzip (each FILE.vcf is
# replaced by FILE.vcf.gz).
#
# set -euo pipefail: stop at the first failing command so the failure is
# reported by the cell instead of being masked by a later successful command.
set -euo pipefail

data_dir="../data/vcf/"
for name in ref target target_chip; do
    # -f overwrites an existing .vcf.gz, so the cell can be re-run after the
    # VCFs have been regenerated.
    bgzip -f "${data_dir}${name}.vcf"
done


In [14]:
%%bash
# Index the three bgzip-compressed VCF files with bcftools.
#
# set -euo pipefail: stop at the first failing command so the failure is
# reported by the cell instead of being masked by a later successful command.
set -euo pipefail

data_dir="../data/vcf/"
for name in ref target target_chip; do
    # -f overwrites an existing index, so the cell can be re-run after the
    # compressed VCFs have been regenerated.
    bcftools index -f "${data_dir}${name}.vcf.gz"
done
