### Process and check output from CADD-SV 

* Reformat header of output bed files 
* Check that the output variants match the input variants from the SV info file (with inversions removed)
* Merge with information in SV info file 

In [1]:
import sys
import pandas as pd
from pathlib import Path

In [2]:
# Root working directory, kept both as a plain string and as a Path.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

# SV info table path (plain string).
sv_info_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/info_table/final_sites_critical_info_allele.txt"

# CADD-SV directory under the working directory, and its "output" subdirectory.
cadd_sv_dir = wkdir_path / "5_misexp_vrnts" / "scores" / "cadd_sv"
cadd_sv_output_path = cadd_sv_dir / "output"

In [3]:
# Directories for the header-cleaned bed files and for the final score tables.
processed_output_dir = cadd_sv_dir / "processed"
scores_dir = cadd_sv_dir / "scores"
# Create both (and any missing parents); existing directories are left as they are.
for out_dir in (processed_output_dir, scores_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# clean all CADD-SV output files
# For every score file matching the glob, write a copy into processed_output_dir
# named "<stem>.clean_header.<ext>" in which:
#   * lines starting with "##" are dropped,
#   * the line starting with "chr" (treated as the space-delimited header) is
#     rewritten tab-delimited with an extra column name inserted,
#   * every other line is copied unchanged.
output_bed_files = cadd_sv_output_path.glob("intrvl_svs_no_inv_*_score.bed")
for output_bed in output_bed_files:
    bed_file = output_bed.name
    bed_file_header = f"{bed_file.split('.')[0]}.clean_header.{bed_file.split('.')[1]}"
    output_bed_clean_header = processed_output_dir.joinpath(bed_file_header)
    with open(output_bed, "r") as f_in, open(output_bed_clean_header, "w") as f_out:
        for line in f_in:
            if line.startswith("##"):
                continue
            if line.startswith('chr'):
                # Strip the line ending before splitting; otherwise the last
                # column name keeps its newline and the "\n" appended below
                # would leave an empty line after the header.
                header_list = line.rstrip("\r\n").split(" ")
                # add missing column
                # NOTE(review): presumably the raw header has no name for the
                # column at index 6 - confirm against the CADD-SV output format.
                header_list.insert(6, "Raw-Score-combined")
                updated_header = "\t".join(header_list) + "\n"
                f_out.write(updated_header)
            else:
                f_out.write(line)

In [6]:
# combine all processed output files
# Every header-cleaned bed file is read as a tab-separated table and the tables
# are stacked in glob order.
output_bed_files_clean = processed_output_dir.glob("intrvl_svs_no_inv_*_score.clean_header.bed")
clean_bed_list = [
    pd.read_csv(clean_bed_path, sep="\t") for clean_bed_path in output_bed_files_clean
]
intrvl_cadd_sv_scores_df = pd.concat(clean_bed_list)

# add variant ID
# Format: chr<chr>:<start>:<end>:<type>
score_chrom = "chr" + intrvl_cadd_sv_scores_df["chr"].astype(str)
score_start = intrvl_cadd_sv_scores_df["start"].astype(str)
score_end = intrvl_cadd_sv_scores_df["end"].astype(str)
score_type = intrvl_cadd_sv_scores_df["type"]
intrvl_cadd_sv_scores_df["variant_id"] = (
    score_chrom + ":" + score_start + ":" + score_end + ":" + score_type
)
cadd_sv_vrnt_ids = set(intrvl_cadd_sv_scores_df["variant_id"].unique())
print(f"Number of variants with CADD-SV scores: {len(cadd_sv_vrnt_ids)}")

# check all variants are included
# Load the SV info table (plinkID kept as a string and renamed to vrnt_id),
# drop inversions, and relabel MEI as INS before building IDs in the same
# <chr>:<pos>:<end>:<SVTYPE> layout used for the score table.
sv_info_df = pd.read_csv(sv_info_path, sep="\t", dtype={"plinkID": str})
sv_info_df = sv_info_df.rename(columns={"plinkID": "vrnt_id"})
is_not_inversion = sv_info_df["SVTYPE"] != "INV"
sv_info_no_inv_df = sv_info_df.loc[is_not_inversion].copy()
sv_info_no_inv_df["SVTYPE"] = sv_info_no_inv_df["SVTYPE"].replace("MEI", "INS")
sv_info_no_inv_df["variant_id"] = (
    sv_info_no_inv_df["chr"]
    + ":"
    + sv_info_no_inv_df["pos"].astype(str)
    + ":"
    + sv_info_no_inv_df["end"].astype(str)
    + ":"
    + sv_info_no_inv_df["SVTYPE"]
)
sv_info_vrnt_ids = sv_info_no_inv_df["variant_id"].tolist()
# The comparison is on sets of IDs, so duplicated IDs on either side are not detected here.
if cadd_sv_vrnt_ids != set(sv_info_vrnt_ids):
    raise ValueError("CADD-SV output variants are not the same as input")

Number of variants with CADD-SV scores: 121042


In [7]:
# add variant information
# sv_info_no_inv_df carries a rewritten SVTYPE (MEI -> INS), which was only
# needed to build variant IDs comparable with the CADD-SV output. Restore the
# SVTYPE values from the unmodified sv_info_df (the filtered copy shares its
# row index) instead of blanket-replacing INS with MEI after the merge, so
# that any row typed INS in the source table is not relabelled as MEI.
sv_info_orig_type_df = sv_info_no_inv_df.assign(
    SVTYPE=sv_info_df.loc[sv_info_no_inv_df.index, "SVTYPE"]
)
# Inner join on variant_id; the coordinate/type columns of the score table are
# dropped because the SV info table already provides that information.
intrvl_cadd_sv_scores_info_df = pd.merge(sv_info_orig_type_df,
                                         intrvl_cadd_sv_scores_df.drop(columns=["chr", "start", "end", "type"]),
                                         how="inner",
                                         on="variant_id")

In [9]:
# write to file
# Tab-separated, without the DataFrame index.
# NOTE(review): the 121042 in the file name is hard-coded - presumably the
# variant count; confirm it still matches if the inputs change.
cadd_sv_score_info_path = scores_dir / "intrvl_svs_no_inv_121042_cadd_sv_info.tsv"
intrvl_cadd_sv_scores_info_df.to_csv(
    cadd_sv_score_info_path,
    sep="\t",
    index=False,
)