### Generate CADD-SV input bed and config files for all INTERVAL SVs 

* Input to CADD-SV cannot include SVs of class "INV" (inversions), so they are removed before the input files are generated

In [3]:
import sys
import pandas as pd
from pathlib import Path 
from pybedtools import BedTool

In [4]:
# Root of the misexpression analysis; everything below is created beneath it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

# CADD-SV score directory with its input/ and config/ sub-directories.
out_dir = wkdir_path / "5_misexp_vrnts/scores/cadd_sv/"
input_dir = out_dir / "input"
config_dir = out_dir / "config"

# Create the three directories (parents first); existing ones are left as-is.
for _new_dir in (out_dir, input_dir, config_dir):
    _new_dir.mkdir(parents=True, exist_ok=True)

In [5]:
# Tab-separated table of SV sites.
sv_info_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/info_table/final_sites_critical_info_allele.txt"

# Read the plinkID column as text, then expose it under the name "vrnt_id".
sv_info_df = pd.read_csv(sv_info_path, sep="\t", dtype={"plinkID": str})
sv_info_df = sv_info_df.rename(columns={"plinkID": "vrnt_id"})

# Distinct variant identifiers present in the table.
vrnt_ids = sv_info_df["vrnt_id"].unique()
print(f"Number of SVs: {len(vrnt_ids)}")

Number of SVs: 123801


In [10]:
# Keep every row whose SVTYPE is not "INV" (rows with a missing SVTYPE are kept).
is_inversion = sv_info_df["SVTYPE"] == "INV"
sv_info_no_inv_df = sv_info_df[~is_inversion]

# Report how many distinct variants remain after the filter.
vrnt_num_no_inv = len(sv_info_no_inv_df["vrnt_id"].unique())
print(f"Number of SVs no inversions: {vrnt_num_no_inv}")

# Persist the filtered table (tab-separated, header row, no index column).
sv_info_no_inversions_path = out_dir.joinpath("final_sites_critical_info_allele.no_inversions.txt")
sv_info_no_inv_df.to_csv(sv_info_no_inversions_path, sep="\t", index=False)

Number of SVs no inversions: 121042


In [11]:
# CADD-SV software directory. The per-chunk BED and config files generated
# later in this notebook are written under its input/ and config/ sub-directories.
caddsv_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/software/CADD-SV"
# Path object for the same location, used to build the output file paths.
caddsv_dir = Path(caddsv_path)

In [12]:
# Generate CADD-SV input BED files holding up to `lines_per_file` variants each,
# plus one config .yml per BED file naming the matching dataset.
#
# Outputs (collected for the cells that follow):
#   bed_file_list    - paths of the BED files written under <CADD-SV>/input/
#   config_path_list - paths of the config files written under <CADD-SV>/config/
config_path_list = []
bed_file_list = []
lines_per_file = 5000
bed_out = None
with open(sv_info_no_inversions_path) as f_in:
    # Skip the header row by position. The table was written from a dataframe
    # whose "plinkID" column had been renamed to "vrnt_id", so matching the
    # header on the text "plinkID" never fires and the header row would be
    # emitted as a bogus BED record.
    next(f_in, None)
    try:
        # row_idx counts data rows only, so every chunk (including the first)
        # holds exactly `lines_per_file` variants, except possibly the last.
        for row_idx, line in enumerate(f_in):
            if row_idx % lines_per_file == 0:
                # Start a new chunk: close the previous BED file first.
                if bed_out is not None:
                    bed_out.close()
                chunk_label = row_idx + lines_per_file
                bed_out_path = caddsv_dir.joinpath(f"input/id_intrvl_svs_no_inv_{chunk_label}.bed")
                bed_file_list.append(bed_out_path)
                bed_out = open(bed_out_path, "w")
                # Matching config: the dataset name is the BED file stem
                # without its "id_" prefix.
                config_out_path = caddsv_dir.joinpath(f"config/config_no_inv_{chunk_label}.yml")
                config_path_list.append(config_out_path)
                with open(config_out_path, "w") as config_out:
                    config_out.write(f"---\ndataset:\n        - intrvl_svs_no_inv_{chunk_label}\n...\n")
            # Split once and strip the newline so the last field is clean.
            # Columns 2-5 hold chromosome, start, end and SV type.
            fields = line.rstrip("\n").split("\t")
            chrom, start, end, sv_type = fields[2:6]
            # MEI calls are submitted as INS.
            # NOTE(review): presumably CADD-SV has no MEI class - confirm against its docs.
            if sv_type == "MEI":
                sv_type = "INS"
            bed_out.write(f"{chrom}\t{start}\t{end}\t{sv_type}\n")
    finally:
        # Always release the last BED handle, even if a row failed to parse.
        if bed_out is not None:
            bed_out.close()

In [16]:
# List every generated config file, one per line, as "<parent dir>/<file name>"
# (the last two components of its path).
config_path_out = out_dir.joinpath("config_paths_no_invs.txt")
with open(config_path_out, "w") as f:
    for config_path in config_path_list:
        print("/".join(config_path.parts[-2:]), file=f)

In [18]:
# List the full path of every generated BED file, one per line.
bed_path_out = out_dir.joinpath("bed_paths_no_invs.txt")
with open(bed_path_out, "w") as f:
    f.writelines(f"{bed_path}\n" for bed_path in bed_file_list)