### Check SV carrier count results 

In [1]:
from pathlib import Path
import pandas as pd
import pysam
from pybedtools import BedTool
from io import StringIO
from functools import reduce
import numpy as np

In [2]:
# Root of the misexpression analysis; project-relative inputs are built from it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

# Chromosome whose carrier counts are re-derived in this notebook.
chrom = "chr19"

# Inputs given as absolute path strings.
vcf_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/filtered_merged_gs_svp_10728.vcf.gz"
gencode_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/reference/gencode/gencode.v31.annotation.sorted.gtf.gz"
sv_info_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/info_table/final_sites_critical_info_allele.txt"

# Inputs and output root addressed relative to wkdir_path (pathlib.Path objects).
wgs_rna_paired_smpls_path = wkdir_path / "1_rna_seq_qc/wgs_rna_match/paired_wgs_rna_postqc_prioritise_wgs.tsv"
ge_matrix_flat_path = wkdir_path / "2_misexp_qc/misexp_gene_cov_corr/tpm_zscore_4568smpls_8610genes_flat_misexp_corr_qc.csv"
vep_msc_path = wkdir_path / "4_vrnt_enrich/sv_vep/msc/SV_vep_hg38_msc_parsed.tsv"
vep_all_path = wkdir_path / "4_vrnt_enrich/sv_vep/all/SV_vep_hg38_all_parsed.tsv"
root_dir = wkdir_path / "4_vrnt_enrich/sv_count_carriers/test/windows/200kb_window"

In [3]:
# Allele-frequency bounds (lower, upper); printed in the inputs summary.
af_cutoff_list = [0, 0.01]
af_lower = af_cutoff_list[0]
af_upper = af_cutoff_list[1]

# Window sweep in bp: start, increment per step, and largest window considered.
window_start = 0
window_step_size = 200_000
window_max = 1_000_000

# Z-score thresholds at which misexpression events are called.
z_cutoff_list = [2, 10, 20, 30, 40]

In [33]:
# Echo the run configuration (one line per setting, followed by a blank line).
run_summary_lines = [
    "Inputs:",
    f"- Chromosome: {chrom}",
    f"- AF range: {af_lower}-{af_upper}",
    f"- Window start: {window_start}",
    f"- Window size: {window_step_size}",
    f"- Max window: {window_max}",
    f"- Z-score cutoffs: {z_cutoff_list}",
    "",
]
print("\n".join(run_summary_lines))

# constants
# TPM threshold applied together with the z-score cutoff when calling misexpression
tpm_cutoff = 0.5

# create root directory (and any missing parents) for all outputs of this notebook
root_dir_path = Path(root_dir)
root_dir_path.mkdir(parents=True, exist_ok=True)

### Collect samples with VCF calls and RNA sequencing
# Flat expression table; its gene_id and rna_id columns define what passed QC.
ge_matrix_flat_df = pd.read_csv(ge_matrix_flat_path, sep=",")
gene_id_pass_qc_set = set(ge_matrix_flat_df["gene_id"].unique())
print(f"Gene IDs passing filters: {len(gene_id_pass_qc_set)}")
smpl_id_pass_qc_set = set(ge_matrix_flat_df["rna_id"].unique())
print(f"RNA-seq sample IDs passing QC: {len(smpl_id_pass_qc_set)}")
print("")

# EGAN ID <-> RNA ID links, restricted to RNA samples present in the expression table
wgs_rna_paired_smpls_df = pd.read_csv(wgs_rna_paired_smpls_path, sep="\t")
rna_in_expression = wgs_rna_paired_smpls_df["rna_id"].isin(smpl_id_pass_qc_set)
egan_ids_with_rna = wgs_rna_paired_smpls_df.loc[rna_in_expression, "egan_id"].tolist()

# Open the VCF and restrict it to EGAN IDs that also have RNA-seq
print("Loading input VCF ...")
vcf = pysam.VariantFile(vcf_path, mode="r")
print("VCF loaded.")
print("Subset VCF to samples with RNA-seq ...")
vcf_samples = list(vcf.header.samples)
vcf_egan_ids_with_rna = set(egan_ids_with_rna) & set(vcf_samples)
vcf.subset_samples(vcf_egan_ids_with_rna)
vcf_samples_with_rna = list(vcf.header.samples)
print(f"Number of samples in VCF with RNA ID and passing QC: {len(vcf_samples_with_rna)}")

# Keep only the EGAN-RNA links whose EGAN ID is among the VCF samples kept above
egan_in_vcf = wgs_rna_paired_smpls_df["egan_id"].isin(vcf_samples_with_rna)
wgs_rna_paired_smpls_with_sv_calls_df = wgs_rna_paired_smpls_df[egan_in_vcf]

# Persist the EGAN-RNA ID pairs
egan_rna_smpls_dir = root_dir_path / "egan_rna_smpls"
egan_rna_smpls_dir.mkdir(parents=True, exist_ok=True)
wgs_rna_paired_smpls_with_sv_calls_df.to_csv(
    egan_rna_smpls_dir / "egan_rna_ids_paired_pass_qc.tsv",
    sep="\t",
    index=False,
)
rna_id_pass_qc_sv_calls = wgs_rna_paired_smpls_with_sv_calls_df["rna_id"].unique().tolist()
print(f"Number of RNA IDs passing QC: {len(rna_id_pass_qc_sv_calls)}")

### write bed file for genes on chromosome passing QC
# Output: one BED6 line per QC-passing gene on `chrom`
# (chromosome without the "chr" prefix, start, end, gene ID, score 0, strand).
gene_bed_dir = root_dir_path.joinpath("genes_bed")
gene_bed_dir.mkdir(parents=True, exist_ok=True)
gene_bed_path = gene_bed_dir.joinpath(f"{chrom}_genes.bed")

# Collect every "gene" record per version-stripped gene ID BEFORE writing.
# Previously the bookkeeping lists were re-created for each record, so the
# "multiple entries" check could never trigger (and its message referenced
# gene_id before it was assigned). Grouping first makes the check effective.
gene_records_by_id = {}
gencode_tabix = pysam.TabixFile(gencode_path)
try:
    for gtf in gencode_tabix.fetch(chrom, parser=pysam.asGTF()):
        if gtf.feature != "gene":
            continue
        gene_id = gtf.gene_id.split('.')[0]
        if gene_id not in gene_id_pass_qc_set:
            continue
        # copy plain values out of the record while it is the current one
        gene_records_by_id.setdefault(gene_id, []).append(
            (gtf.contig, gtf.start, gtf.end, gtf.strand)
        )
finally:
    # release the tabix file handle even if parsing fails
    gencode_tabix.close()

# Genes are written in the order they were first seen in the GTF.
# A "no entries" case cannot occur here: only gene IDs actually found on
# `chrom` are present in gene_records_by_id.
gene_id_pass_qc_on_chrom = []
with open(gene_bed_path, "w") as f:
    for gene_id, gene_records in gene_records_by_id.items():
        if len(gene_records) > 1:
            print(f"{gene_id} has multiple entries in gencode file - excluded from output file.")
            continue
        gtf_chrom, start, end, strand = gene_records[0]
        gene_id_pass_qc_on_chrom.append(gene_id)
        chrom_num = gtf_chrom.split("chr")[1]
        f.write(f"{chrom_num}\t{start}\t{end}\t{gene_id}\t0\t{strand}\n")
print(f"Number of genes pass QC on {chrom}: {len(gene_id_pass_qc_on_chrom)}")

# Expression rows for QC-passing genes that were found on this chromosome
gene_on_chrom = ge_matrix_flat_df["gene_id"].isin(gene_id_pass_qc_on_chrom)
ge_matrix_flat_chrom_df = ge_matrix_flat_df[gene_on_chrom]
# Attach EGAN IDs; the inner join drops RNA samples without SV calls
ge_matrix_flat_chrom_egan_df = ge_matrix_flat_chrom_df.merge(
    wgs_rna_paired_smpls_with_sv_calls_df,
    how="inner",
    on="rna_id",
)
print(f"Sample IDs in gene expression matrix with SV calls: {len(ge_matrix_flat_chrom_egan_df.egan_id.unique())}")
# Write the per-chromosome matrix (before the gene-sample pair column is added)
ge_matrix_flat_chrom_egan_dir = root_dir_path / "express_mtx"
ge_matrix_flat_chrom_egan_dir.mkdir(parents=True, exist_ok=True)
ge_matrix_flat_chrom_egan_df.to_csv(
    ge_matrix_flat_chrom_egan_dir / f"{chrom}_ge_mtx_flat.tsv",
    sep="\t",
    index=False,
)
# Gene-sample pair key ("<gene_id>,<rna_id>") used to identify expression events
ge_matrix_flat_chrom_egan_df["gene_smpl_pair"] = (
    ge_matrix_flat_chrom_egan_df["gene_id"] + "," + ge_matrix_flat_chrom_egan_df["rna_id"]
)

### write SVs on chromosome to bed file
# Output columns: chromosome without the "chr" prefix, pos, end, variant ID.
vrnts_bed_dir = root_dir_path / "vrnts_bed"
vrnts_bed_dir.mkdir(parents=True, exist_ok=True)
vrnts_bed_path = vrnts_bed_dir / f"{chrom}_vrnts.bed"
with open(sv_info_path, "r") as f_in, open(vrnts_bed_path, "w") as f_out:
    for line in f_in:
        # skip the header row
        if line.startswith("plinkID"):
            continue
        fields = line.split("\t")
        vrnt_id, sv_chrom, pos, end = fields[0], fields[2], fields[3], fields[4]
        # only variants on the chromosome being checked
        if sv_chrom != chrom:
            continue
        chrom_num = sv_chrom.split("chr")[1]
        f_out.write(f"{chrom_num}\t{pos}\t{end}\t{vrnt_id}\n")

### SV information
# plinkID is forced to str so variant IDs are compared as text.
sv_info_df = pd.read_csv(sv_info_path, sep="\t", dtype={"plinkID": str})
sv_types_list = sv_info_df["SVTYPE"].unique().tolist()
print(f"SV types in SV info file: {sv_types_list}")
# Per-variant allele frequency and SV type, keyed by vrnt_id
sv_info_id_af_df = (
    sv_info_df
    .loc[:, ["plinkID", "AF", "SVTYPE"]]
    .rename(columns={"plinkID": "vrnt_id"})
)

### generate output directories
# variant-window intersection directory
intersect_bed_dir = root_dir_path / "intersect_bed"
intersect_bed_dir.mkdir(parents=True, exist_ok=True)
# genotypes of variants in windows directory
intersect_vrnt_gts_dir = root_dir_path / "intersect_vrnt_gts"
intersect_vrnt_gts_dir.mkdir(parents=True, exist_ok=True)

# load variant and gene bed files
vrnts_bed = BedTool(vrnts_bed_path)
genes_bed = BedTool(gene_bed_path)
# Positional column names for the gene/variant intersection output:
# six gene BED columns followed by four variant BED columns.
intersect_bed_columns = dict(enumerate([
    "chrom_gene", "start_gene", "end_gene", "gene_id", "score", "strand",
    "chrom_vrnt", "start_vrnt", "end_vrnt", "vrnt_id",
]))

# load most severe consequence
# The variant ID column in this file is "Uploaded_variation" (it is renamed to
# "vrnt_id" just below), so that is the column whose dtype has to be pinned to
# str. The original "vrnt_id" key is kept so behaviour is unchanged if such a
# column is ever present in the file.
vep_msc_df = pd.read_csv(vep_msc_path, sep="\t", dtype={"Uploaded_variation": str, "vrnt_id": str})
msc_list = vep_msc_df.Consequence.unique().tolist()
# variant ID -> most severe consequence
vep_msc_cnsqn_df = vep_msc_df.rename(columns={"Uploaded_variation": "vrnt_id"})[["vrnt_id", "Consequence"]]

Inputs:
- Chromosome: chr19
- AF range: 0-0.01
- Window start: 0
- Window size: 200000
- Max window: 1000000
- Z-score cutoffs: [2, 10, 20, 30, 40]

Gene IDs passing filters: 8610
RNA-seq sample IDs passing QC: 4568

Loading input VCF ...
VCF loaded.
Subset VCF to samples with RNA-seq ...
Number of samples in VCF with RNA ID and passing QC: 2640
Number of RNA IDs passing QC: 2640
Number of genes pass QC on chr19: 295
Sample IDs in gene expression matrix with SV calls: 2640
SV types in SV info file: ['DEL', 'MEI', 'DUP', 'INV']


In [34]:
### assign gene-variant pairs a window
# For each direction the window is widened step by step; a variant-gene pair is
# labelled with the first (smallest) window in which it appears for that direction.
vrnt_gene_pairs_in_windows_df_list = []
for direction in ["upstream", "downstream"]:
    window_size = window_start
    seen_sv_gene_pairs = set()
    while window_size <= window_max:
        print(f"Counting variants {direction}, {window_size}bp ...")
        window_name = f"{direction}_{window_size}"
        # extend the gene interval on one side only (l for upstream, r for downstream)
        l, r = (window_size, 0) if direction == "upstream" else (0, window_size)
        window_hits = genes_bed.window(vrnts_bed, l=l, r=r, sw=True)
        vrnts_window_intersect_str = StringIO(str(window_hits))
        intersect_bed_df = (
            pd.read_csv(vrnts_window_intersect_str, sep="\t", header=None)
            .rename(columns=intersect_bed_columns)
        )
        intersect_bed_df = intersect_bed_df[["vrnt_id", "gene_id"]].copy()
        # "<vrnt_id>,<gene_id>" key identifying a variant-gene pair
        intersect_bed_df["vrnt_gene_id"] = (
            intersect_bed_df["vrnt_id"].astype(str) + "," + intersect_bed_df["gene_id"].astype(str)
        )
        # keep only pairs not already assigned to a smaller window
        sv_gene_pairs_in_window = set(intersect_bed_df["vrnt_gene_id"].unique())
        new_sv_gene_pairs = sv_gene_pairs_in_window.difference(seen_sv_gene_pairs)
        is_new_pair = intersect_bed_df["vrnt_gene_id"].isin(new_sv_gene_pairs)
        new_sv_gene_pairs_df = intersect_bed_df[is_new_pair].copy()
        new_sv_gene_pairs_df["window"] = window_name
        vrnt_gene_pairs_in_windows_df_list.append(new_sv_gene_pairs_df)
        seen_sv_gene_pairs |= sv_gene_pairs_in_window
        window_size += window_step_size

# Zero-size windows from both directions are the same thing: the gene body.
# Renaming them makes those rows identical so drop_duplicates collapses them.
rename_window = {"upstream_0": "gene_body", "downstream_0": "gene_body"}
vrnt_gene_pairs_in_windows_df = pd.concat(vrnt_gene_pairs_in_windows_df_list)
vrnt_gene_pairs_in_windows_df["window"] = vrnt_gene_pairs_in_windows_df["window"].replace(rename_window)
vrnt_gene_pairs_in_windows_df = vrnt_gene_pairs_in_windows_df.drop_duplicates()

Counting variants upstream, 0bp ...
Counting variants upstream, 200000bp ...
Counting variants upstream, 400000bp ...
Counting variants upstream, 600000bp ...
Counting variants upstream, 800000bp ...
Counting variants upstream, 1000000bp ...
Counting variants downstream, 0bp ...
Counting variants downstream, 200000bp ...
Counting variants downstream, 400000bp ...
Counting variants downstream, 600000bp ...
Counting variants downstream, 800000bp ...
Counting variants downstream, 1000000bp ...


In [35]:
# assign variants MAF bins
# Bin edges are allele-frequency fractions; with right=False each bin is
# closed on the left and open on the right.
maf_bins = [0, 0.01, 0.05, 0.1, 0.5]
af_bin_labels = ["0-1", "1-5", "5-10", "10-50"]
sv_info_id_af_df["maf_bin"] = pd.cut(
    sv_info_id_af_df["AF"],
    bins=maf_bins,
    labels=af_bin_labels,
    right=False,
)

In [36]:
# add AF and SV type
# Inner join: variant-gene pairs without an entry in the SV info table are dropped.
vrnt_gene_pairs_windows_info_df = vrnt_gene_pairs_in_windows_df.merge(
    sv_info_id_af_df,
    on="vrnt_id",
    how="inner",
)

In [37]:
### get variant genotypes
intersect_vrnt_ids = vrnt_gene_pairs_in_windows_df["vrnt_id"].unique()
# One entry per (variant, sample): running row number -> [vrnt_id, sample name, GT]
vrnt_gt_egan_dict = {}
count = 0
for vrnt_id in intersect_vrnt_ids:
    # SV coordinates; .item() raises unless exactly one info row matches the ID
    sv_info_row = sv_info_df[sv_info_df["plinkID"] == vrnt_id]
    chrom_vrnt, pos, end = (sv_info_row[col].item() for col in ["chr", "pos", "end"])
    chrom_vrnt = chrom_vrnt.split("chr")[1]
    found_vrnt_id = False
    # scan records overlapping the SV interval and keep those with the same ID
    for rec in vcf.fetch(str(chrom_vrnt), pos - 1, end):
        if str(rec.id) != vrnt_id:
            continue
        found_vrnt_id = True
        for i, sample_call in enumerate(rec.samples.values()):
            vrnt_gt_egan_dict[count] = [vrnt_id, vcf.header.samples[i], sample_call["GT"]]
            count += 1
    if not found_vrnt_id:
        raise ValueError(f"Did not find {vrnt_id} in {vcf_path}")
vrnt_genotypes_df = pd.DataFrame.from_dict(
    vrnt_gt_egan_dict,
    orient="index",
    columns=["vrnt_id", "egan_id", "genotype"],
)
# genotypes are stored in their string form so they can be matched as text later
vrnt_genotypes_df = vrnt_genotypes_df.astype({"genotype": str})

In [38]:
# add genotypes
# Every variant-gene pair is expanded to one row per sample genotype of that variant.
vrnt_gene_pairs_windows_gts_df = vrnt_gene_pairs_windows_info_df.merge(
    vrnt_genotypes_df,
    on="vrnt_id",
    how="inner",
)

In [39]:
# add expression data
# Joined on gene and sample, so each row carries the expression of that gene in that sample.
vrnt_gene_pairs_windows_gts_express_df = vrnt_gene_pairs_windows_gts_df.merge(
    ge_matrix_flat_chrom_egan_df,
    on=["gene_id", "egan_id"],
    how="inner",
)

In [11]:
# check the number of unique values per column (the call below is commented out; the output shown is from an earlier run)
#vrnt_gene_pairs_windows_gts_express_df.nunique()

vrnt_id             4876
gene_id              295
vrnt_gene_id       48786
window                11
AF                   390
SVTYPE                 4
maf_bin                4
egan_id             2640
genotype               4
rna_id              2640
TPM                41142
z-score            41466
gene_smpl_pair    778800
dtype: int64

In [40]:
# subset to carriers
# Genotypes are matched on their string form; these two values are treated as carriers.
carrier_gts = ['(0, 1)', '(1, 1)']
is_carrier = vrnt_gene_pairs_windows_gts_express_df["genotype"].isin(carrier_gts)
vrnt_gene_carriers_df = vrnt_gene_pairs_windows_gts_express_df[is_carrier]

### Check gene body window enrichment

In [41]:
# subset to variants in +/- 200kb and gene body windows
# The three windows are pooled under a single label.
gene_body_windows = ["gene_body", "upstream_200000", "downstream_200000"]
in_gene_body_window = vrnt_gene_carriers_df["window"].isin(gene_body_windows)
vrnt_gene_body_window_carriers_df = (
    vrnt_gene_carriers_df[in_gene_body_window]
    .assign(window="gene_body_200000")
)

In [42]:
# create dataframe with all MAF bin, SV type combinations
# Distinct (maf_bin, window, SVTYPE) rows observed among carriers; rows without a MAF bin are dropped.
maf_window_svtype_df = (
    vrnt_gene_body_window_carriers_df[["maf_bin", "window", "SVTYPE"]]
    .drop_duplicates()
    .dropna(subset=["maf_bin"])
    .reset_index(drop=True)
)

In [48]:
# Re-derive, per z-score cutoff, the number of misexpression and control
# gene-sample pairs that carry an SV in the pooled gene-body window, overall and
# per SV type. One merged table per cutoff is appended to the list below.
count_misexp_cntrl_carriers_gene_body_df_list = []
for z_cutoff in z_cutoff_list:
    # misexpression events: z-score above the cutoff AND TPM above tpm_cutoff
    print(f"Z-score threshold: {z_cutoff}")
    misexp_df = ge_matrix_flat_chrom_egan_df[(ge_matrix_flat_chrom_egan_df["z-score"] > z_cutoff) & 
                                             (ge_matrix_flat_chrom_egan_df["TPM"] > tpm_cutoff)]
    misexp_genes = misexp_df.gene_id.unique()
    print(f"\tNumber of misexpressed genes: {len(misexp_genes)}")
    # NOTE(review): misexp_rna_id is not used anywhere else in this cell
    misexp_rna_id = misexp_df.rna_id.unique()
    misexp_gene_smpl = misexp_df.gene_smpl_pair.unique()
    print(f"\tNumber of misexpressed gene-sample pairs: {len(misexp_gene_smpl)}")
    # control events: all other gene-sample pairs for genes that have at least
    # one misexpression event at this cutoff
    cntrl_df = ge_matrix_flat_chrom_egan_df[(ge_matrix_flat_chrom_egan_df.gene_id.isin(misexp_genes)) & 
                                            (~ge_matrix_flat_chrom_egan_df.gene_smpl_pair.isin(misexp_gene_smpl))]
    cntrl_gene_smpl = cntrl_df.gene_smpl_pair.unique() 
    print(f"\tNumber of control gene-sample pairs: {len(cntrl_gene_smpl)}")
    # count misexpression SV carriers: distinct gene-sample pairs per (window, maf_bin)
    misexp_carriers_df = vrnt_gene_body_window_carriers_df[vrnt_gene_body_window_carriers_df.gene_smpl_pair.isin(misexp_gene_smpl)]
    count_misexp_carriers_df = pd.DataFrame(misexp_carriers_df.groupby(["window", "maf_bin"]).gene_smpl_pair.nunique())
    count_misexp_carriers_df = count_misexp_carriers_df.reset_index()
    count_misexp_carriers_df = count_misexp_carriers_df.rename(columns={"gene_smpl_pair": "all_sv_misexp_check"})

    # count control SV carriers: same counting, restricted to control pairs
    cntrl_carriers_df = vrnt_gene_body_window_carriers_df[vrnt_gene_body_window_carriers_df.gene_smpl_pair.isin(cntrl_gene_smpl)]
    count_cntrl_carriers_df = pd.DataFrame(cntrl_carriers_df.groupby(["window", "maf_bin"]).gene_smpl_pair.nunique())
    count_cntrl_carriers_df = count_cntrl_carriers_df.reset_index()
    count_cntrl_carriers_df = count_cntrl_carriers_df.rename(columns={"gene_smpl_pair": "all_sv_contrl_check"})

    # add SV type misexpression carriers: the pivot yields one column per SVTYPE
    # value present in the data, which is then renamed to "<SVTYPE>_misexp_check"
    count_misexp_carriers_svtype_df = pd.DataFrame(misexp_carriers_df.groupby(["window", "maf_bin", "SVTYPE"]).gene_smpl_pair.nunique())
    count_misexp_carriers_svtype_df = count_misexp_carriers_svtype_df.reset_index()
    count_misexp_carriers_svtype_df = count_misexp_carriers_svtype_df.pivot(index=["window", "maf_bin"], columns="SVTYPE", values="gene_smpl_pair")
    count_misexp_carriers_svtype_df = count_misexp_carriers_svtype_df.reset_index()
    rename_columns = {sv_type:f"{sv_type}_misexp_check" for sv_type in ["DEL", "DUP", "INV", "MEI"]}  
    count_misexp_carriers_svtype_df = count_misexp_carriers_svtype_df.rename(columns=rename_columns)

    # add SV type control carriers: as above, renamed to "<SVTYPE>_contrl_check"
    count_cntrl_carriers_svtype_df = pd.DataFrame(cntrl_carriers_df.groupby(["window", "maf_bin", "SVTYPE"]).gene_smpl_pair.nunique())
    count_cntrl_carriers_svtype_df = count_cntrl_carriers_svtype_df.reset_index()
    count_cntrl_carriers_svtype_df = count_cntrl_carriers_svtype_df.pivot(index=["window", "maf_bin"], columns="SVTYPE", values="gene_smpl_pair")
    count_cntrl_carriers_svtype_df = count_cntrl_carriers_svtype_df.reset_index()
    rename_columns = {sv_type:f"{sv_type}_contrl_check" for sv_type in ["DEL", "DUP", "INV", "MEI"]}  
    count_cntrl_carriers_svtype_df = count_cntrl_carriers_svtype_df.rename(columns=rename_columns)

    data_frames = [count_misexp_carriers_df, count_cntrl_carriers_df, count_misexp_carriers_svtype_df,
                   count_cntrl_carriers_svtype_df]
    ### combine all dataframes
    # successive inner merges: only (window, maf_bin) combinations present in
    # all four tables survive
    df_merged = reduce(lambda  left,right: pd.merge(left,right,on=["window", "maf_bin"],
                                            how='inner'), data_frames)

    # per-cutoff totals, repeated on every row of this cutoff's table
    df_merged["z_cutoff"] = z_cutoff
    df_merged["misexp_genes_check"] = len(misexp_genes)
    df_merged["total_misexp_check"] = len(misexp_gene_smpl)
    df_merged["total_control_check"] = len(cntrl_gene_smpl)
    count_misexp_cntrl_carriers_gene_body_df_list.append(df_merged)

Z-score threshold: 2
	Number of misexpressed genes: 150
	Number of misexpressed gene-sample pairs: 714
	Number of control gene-sample pairs: 395286
Z-score threshold: 10
	Number of misexpressed genes: 150
	Number of misexpressed gene-sample pairs: 438
	Number of control gene-sample pairs: 395562
Z-score threshold: 20
	Number of misexpressed genes: 110
	Number of misexpressed gene-sample pairs: 162
	Number of control gene-sample pairs: 290238
Z-score threshold: 30
	Number of misexpressed genes: 54
	Number of misexpressed gene-sample pairs: 57
	Number of control gene-sample pairs: 142503
Z-score threshold: 40
	Number of misexpressed genes: 25
	Number of misexpressed gene-sample pairs: 25
	Number of control gene-sample pairs: 65975


In [49]:
# Stack the per-cutoff tables, turn maf_bin into plain strings and
# replace remaining missing values with 0.
count_misexp_cntrl_carriers_gene_body_df = (
    pd.concat(count_misexp_cntrl_carriers_gene_body_df_list)
    .assign(maf_bin=lambda counts_df: counts_df["maf_bin"].astype(str))
    .fillna(0)
)

In [59]:
# add missing columns
# Carrier-count columns expected by the checks; any that are absent from the
# concatenated table are collected here in the order listed.
sv_carrier_cols = ['all_sv_misexp_check', 'all_sv_contrl_check', 'DEL_misexp_check', 'DUP_misexp_check',
                   'MEI_misexp_check', 'INV_misexp_check', 'DEL_contrl_check', 'DUP_contrl_check',
                   'INV_contrl_check', 'MEI_contrl_check']
present_cols = count_misexp_cntrl_carriers_gene_body_df.columns
missing_sv_carrier_cols = [col for col in sv_carrier_cols if col not in present_cols]
        

In [60]:
# New table (the input is left untouched) in which every missing
# carrier-count column is added with a constant 0.
count_misexp_cntrl_carriers_gene_body_all_df = count_misexp_cntrl_carriers_gene_body_df.assign(
    **dict.fromkeys(missing_sv_carrier_cols, 0)
)

In [61]:
# results
# Carrier-count table to compare the recomputed gene-body counts against.
sv_gene_window_carrier_count_check_path = (
    wkdir_path
    / f"4_vrnt_enrich/sv_count_carriers/gene_body/200kb_window/carrier_count_gene_msc_reg_af50/{chrom}_carrier_count_gene_msc.tsv"
)
sv_gene_window_carrier_count_check_df = pd.read_csv(
    sv_gene_window_carrier_count_check_path,
    sep="\t",
)

In [62]:
# Columns of the reference table that are compared: overall counts first,
# then per-SV-type misexpression and control counts.
misexp_cntrl_sv_cols = ["z_cutoff", "maf_bin", "misexp_genes", "all_sv_misexp", "total_misexp", "all_sv_contrl", "total_control"]
misexp_sv_type_cols = []
cntrl_sv_type_cols = []
for sv_type in ["DEL", "DUP", "INV", "MEI"]:
    misexp_sv_type_cols.append(f"{sv_type}_misexp")
    cntrl_sv_type_cols.append(f"{sv_type}_contrl")
cols_to_check = misexp_cntrl_sv_cols + misexp_sv_type_cols + cntrl_sv_type_cols
sv_gene_window_carrier_count_trunc_check_df = sv_gene_window_carrier_count_check_df[cols_to_check]
# keep only the z-score cutoffs recomputed in this notebook
z_cutoff_checked = sv_gene_window_carrier_count_trunc_check_df["z_cutoff"].isin(z_cutoff_list)
sv_gene_window_carrier_count_trunc_z_check_df = sv_gene_window_carrier_count_trunc_check_df[z_cutoff_checked]

In [63]:
# Recomputed counts side by side with the reference counts, matched on MAF bin and z cutoff.
check_gene_body_carriers_df = count_misexp_cntrl_carriers_gene_body_all_df.merge(
    sv_gene_window_carrier_count_trunc_z_check_df,
    on=["maf_bin", "z_cutoff"],
    how="inner",
)

In [64]:
# checks
# The comparison table comes from an inner merge. If no (maf_bin, z_cutoff)
# rows matched, every Series.equals() call below would compare two empty
# columns and pass without having checked anything, so fail loudly instead.
if check_gene_body_carriers_df.empty:
    raise ValueError("No rows shared between recomputed and reference carrier counts - nothing to check.")

# (recomputed column, reference column, error message), in the original check order.
# Series.equals also requires matching dtypes, exactly as before.
gene_body_count_checks = [
    ("all_sv_misexp_check", "all_sv_misexp", "Number of misexpression carriers does not match."),
    ("all_sv_contrl_check", "all_sv_contrl", "Number of control carriers does not match."),
    ("misexp_genes_check", "misexp_genes", "Number of misexpressed genes does not match."),
    ("total_control_check", "total_control", "Number of controls does not match."),
    ("total_misexp_check", "total_misexp", "Number of misexpression events does not match."),
]
for check_col, reference_col, error_message in gene_body_count_checks:
    if not check_gene_body_carriers_df[check_col].equals(check_gene_body_carriers_df[reference_col]):
        raise ValueError(error_message)

# Per-SV-type counts are cast to int on both sides before comparing.
for sv_type in ["DEL", "DUP", "INV", "MEI"]:
    for group in ["misexp", "contrl"]:
        recomputed_counts = check_gene_body_carriers_df[f"{sv_type}_{group}_check"].astype(int)
        reference_counts = check_gene_body_carriers_df[f"{sv_type}_{group}"].astype(int)
        if not recomputed_counts.equals(reference_counts):
            raise ValueError(f"Number of {sv_type} {group} events does not match.")

### Check SV window enrichment 

In [65]:
# Re-derive, per z-score cutoff, the number of misexpression and control
# gene-sample pairs carrying an SV in each window (all windows, not only the
# pooled gene-body window). One table per cutoff is appended to the list below.
count_misexp_cntrl_carriers_df_list = []
for z_cutoff in z_cutoff_list:
    # misexpression events: z-score above the cutoff AND TPM above tpm_cutoff
    print(f"Z-score threshold: {z_cutoff}")
    misexp_df = ge_matrix_flat_chrom_egan_df[(ge_matrix_flat_chrom_egan_df["z-score"] > z_cutoff) & 
                                             (ge_matrix_flat_chrom_egan_df["TPM"] > tpm_cutoff)]
    misexp_genes = misexp_df.gene_id.unique()
    print(f"\tNumber of misexpressed genes: {len(misexp_genes)}")
    # NOTE(review): misexp_rna_id is not used anywhere else in this cell
    misexp_rna_id = misexp_df.rna_id.unique()
    misexp_gene_smpl = misexp_df.gene_smpl_pair.unique()
    print(f"\tNumber of misexpressed gene-sample pairs: {len(misexp_gene_smpl)}")
    # control events: all other gene-sample pairs for genes that have at least
    # one misexpression event at this cutoff
    cntrl_df = ge_matrix_flat_chrom_egan_df[(ge_matrix_flat_chrom_egan_df.gene_id.isin(misexp_genes)) & 
                                            (~ge_matrix_flat_chrom_egan_df.gene_smpl_pair.isin(misexp_gene_smpl))]
    cntrl_gene_smpl = cntrl_df.gene_smpl_pair.unique() 
    print(f"\tNumber of control gene-sample pairs: {len(cntrl_gene_smpl)}")
    # count misexpression carriers: distinct gene-sample pairs per (window, maf_bin)
    misexp_carriers_df = vrnt_gene_carriers_df[vrnt_gene_carriers_df.gene_smpl_pair.isin(misexp_gene_smpl)]
    count_misexp_carriers_df = pd.DataFrame(misexp_carriers_df.groupby(["window", "maf_bin"]).gene_smpl_pair.nunique())
    count_misexp_carriers_df = count_misexp_carriers_df.reset_index()
    count_misexp_carriers_df = count_misexp_carriers_df.rename(columns={"gene_smpl_pair": "all_sv_misexp_check"})

    # count control carriers: same counting, restricted to control pairs
    cntrl_carriers_df = vrnt_gene_carriers_df[vrnt_gene_carriers_df.gene_smpl_pair.isin(cntrl_gene_smpl)]
    count_cntrl_carriers_df = pd.DataFrame(cntrl_carriers_df.groupby(["window", "maf_bin"]).gene_smpl_pair.nunique())
    count_cntrl_carriers_df = count_cntrl_carriers_df.reset_index()
    count_cntrl_carriers_df = count_cntrl_carriers_df.rename(columns={"gene_smpl_pair": "all_sv_contrl_check"})

    # merge control and misexpression carriers count
    # inner join: only (window, maf_bin) combinations present in both tables are kept
    count_misexp_cntrl_carriers_df = pd.merge(count_misexp_carriers_df, 
                                              count_cntrl_carriers_df, 
                                              on = ["window", "maf_bin"], 
                                              how = "inner"
                                             )
    # per-cutoff totals, repeated on every row of this cutoff's table
    count_misexp_cntrl_carriers_df["z_cutoff"] = z_cutoff
    count_misexp_cntrl_carriers_df["misexp_genes_check"] = len(misexp_genes)
    count_misexp_cntrl_carriers_df["total_misexp_check"] = len(misexp_gene_smpl)
    count_misexp_cntrl_carriers_df["total_control_check"] = len(cntrl_gene_smpl)
    count_misexp_cntrl_carriers_df_list.append(count_misexp_cntrl_carriers_df)

Z-score threshold: 2
	Number of misexpressed genes: 150
	Number of misexpressed gene-sample pairs: 714
	Number of control gene-sample pairs: 395286
Z-score threshold: 10
	Number of misexpressed genes: 150
	Number of misexpressed gene-sample pairs: 438
	Number of control gene-sample pairs: 395562
Z-score threshold: 20
	Number of misexpressed genes: 110
	Number of misexpressed gene-sample pairs: 162
	Number of control gene-sample pairs: 290238
Z-score threshold: 30
	Number of misexpressed genes: 54
	Number of misexpressed gene-sample pairs: 57
	Number of control gene-sample pairs: 142503
Z-score threshold: 40
	Number of misexpressed genes: 25
	Number of misexpressed gene-sample pairs: 25
	Number of control gene-sample pairs: 65975


In [66]:
# Stack the per-cutoff window tables and keep the rows in the "0-1" MAF bin.
count_misexp_cntrl_carriers_zscore_df = pd.concat(count_misexp_cntrl_carriers_df_list)
in_rare_maf_bin = count_misexp_cntrl_carriers_zscore_df["maf_bin"] == "0-1"
count_misexp_cntrl_carriers_rare_zscore_df = count_misexp_cntrl_carriers_zscore_df[in_rare_maf_bin]

In [67]:
# results
# Carrier-count table to compare the recomputed per-window counts against.
carrier_count_check_path = (
    wkdir_path
    / f"4_vrnt_enrich/sv_count_carriers/windows/200kb_window/carrier_count/{chrom}_carrier_count.tsv"
)
carrier_count_check_df = pd.read_csv(carrier_count_check_path, sep="\t")

In [68]:
# Build the same "<direction>_<window_size>" label used for the recomputed counts.
window_size_label = carrier_count_check_df["window_size"].astype(str)
carrier_count_check_df["window"] = carrier_count_check_df["direction"] + "_" + window_size_label
# columns that are compared, restricted to the z-score cutoffs recomputed here
window_check_cols = ["window", "z_cutoff", "misexp_genes", "all_sv_misexp", "total_misexp", "all_sv_contrl", "total_control"]
carrier_count_check_trunc_df = carrier_count_check_df[window_check_cols]
z_cutoff_recomputed = carrier_count_check_trunc_df["z_cutoff"].isin(z_cutoff_list)
carrier_count_check_trunc_z_df = carrier_count_check_trunc_df[z_cutoff_recomputed].copy()
# only the upstream zero-size window is relabelled as the gene body
carrier_count_check_trunc_z_df["window"] = carrier_count_check_trunc_z_df["window"].replace({"upstream_0": "gene_body"})

In [69]:
# Reference counts side by side with the recomputed counts, matched on window and z cutoff.
check_carriers_df = pd.merge(carrier_count_check_trunc_z_df, 
                             count_misexp_cntrl_carriers_rare_zscore_df, 
                             on=["window", "z_cutoff"], 
                             how="inner")

# checks
# The merge is an inner join. If no (window, z_cutoff) rows matched, every
# Series.equals() call below would compare two empty columns and pass without
# having checked anything, so fail loudly instead.
if check_carriers_df.empty:
    raise ValueError("No rows shared between recomputed and reference carrier counts - nothing to check.")

# (recomputed column, reference column, error message), in the original check order.
# Series.equals also requires matching dtypes, exactly as before.
window_count_checks = [
    ("all_sv_misexp_check", "all_sv_misexp", "Number of misexpression carriers does not match."),
    ("all_sv_contrl_check", "all_sv_contrl", "Number of control carriers does not match."),
    ("misexp_genes_check", "misexp_genes", "Number of misexpressed genes does not match."),
    ("total_control_check", "total_control", "Number of controls does not match."),
    ("total_misexp_check", "total_misexp", "Number of misexpression events does not match."),
]
for check_col, reference_col, error_message in window_count_checks:
    if not check_carriers_df[check_col].equals(check_carriers_df[reference_col]):
        raise ValueError(error_message)

### Check VEP consequence enrichment 

In [70]:
# annotate variants with VEP consequence in +/- 200kb window
# Distinct gene-variant pairs among carriers in the pooled gene-body window.
vrnt_gene_pairs_df = (
    vrnt_gene_body_window_carriers_df
    .loc[:, ["gene_id", "vrnt_id"]]
    .drop_duplicates()
)

In [71]:
# VEP consequence terms ordered from most to least severe. The position in this
# list is the rank used to pick a single consequence per variant-gene pair;
# "no_predicted_effect" is a notebook-specific placeholder kept last.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python, which would drop both terms from the ranking.
vep_msc_ranks = [
    "transcript_ablation",
    "splice_acceptor_variant",
    "splice_donor_variant",
    "stop_gained",
    "frameshift_variant",
    "stop_lost",
    "start_lost",
    "transcript_amplification",
    "inframe_insertion",
    "inframe_deletion",
    "missense_variant",
    "protein_altering_variant",
    "splice_donor_5th_base_variant",
    "splice_region_variant",
    "splice_donor_region_variant",
    "splice_polypyrimidine_tract_variant",
    "incomplete_terminal_codon_variant",
    "start_retained_variant",
    "stop_retained_variant",
    "synonymous_variant",
    "coding_sequence_variant",
    "mature_miRNA_variant",
    "5_prime_UTR_variant",
    "3_prime_UTR_variant",
    "non_coding_transcript_exon_variant",
    "intron_variant",
    "NMD_transcript_variant",
    "non_coding_transcript_variant",
    "coding_transcript_variant",
    "upstream_gene_variant",
    "downstream_gene_variant",
    "TFBS_ablation",
    "TFBS_amplification",
    "TF_binding_site_variant",
    "regulatory_region_ablation",
    "regulatory_region_amplification",
    "feature_elongation",
    "regulatory_region_variant",
    "feature_truncation",
    "intergenic_variant",
    "sequence_variant",
    "no_predicted_effect",
]

# from all VEP consequences that have no gene annotation
vep_msc_not_linked_to_gene = ['TFBS_ablation', 
                              'TF_binding_site_variant',
                              'regulatory_region_variant', 
                              'TFBS_amplification',
                              'intergenic_variant', 
                              'regulatory_region_ablation',
                              'regulatory_region_amplification']

In [72]:
# annotate variant-gene pairs with VEP consequence 
# Reads the per-gene VEP annotations; the vrnt_id, gene_id and Consequence columns are used.
vep_all_consq_df = pd.read_csv(vep_all_path, sep="\t")
# collapse to gene-variant consequence pairs 
vep_all_consq_drop_dups_df = vep_all_consq_df[["vrnt_id", "gene_id", "Consequence"]].drop_duplicates()
# add variant-gene consequences 
# left join: pairs without any annotation get Consequence "no_predicted_effect" via fillna
vrnt_gene_pairs_consq_df = pd.merge(vrnt_gene_pairs_df, 
                                    vep_all_consq_drop_dups_df, 
                                    on=["vrnt_id", "gene_id"], 
                                    how="left").fillna("no_predicted_effect")
# group consequences together for each variant-gene pair
# (comma-joined string of all Consequence values of the pair, repeated on each of its rows)
vrnt_gene_pairs_consq_df["gene_consequence"] = vrnt_gene_pairs_consq_df.groupby(["vrnt_id", "gene_id"])['Consequence'].transform(lambda x: ','.join(x))
# remove duplicates arising from variant have different consequences across gene transcripts 
vrnt_gene_pair_consq_collapse_df = vrnt_gene_pairs_consq_df.drop(columns=["Consequence"]).drop_duplicates()
# sanity check: collapsing must give back exactly one row per variant-gene pair
if vrnt_gene_pair_consq_collapse_df.shape[0] != vrnt_gene_pairs_df.shape[0]: 
    raise ValueError("Variant gene pair number does not match number of variant gene pairs with consequence.")
# assign each variant-gene pair a unique consequence based on VEP rank  
vrnt_gene_pair_msc_consq = []
for index, row in vrnt_gene_pair_consq_collapse_df.iterrows(): 
    gene_consequence = row["gene_consequence"]
    gene_consequence_list = gene_consequence.split(",")
    # First term in rank order that the pair has. If none of the ranked terms
    # is present the loop runs to completion and consq is left as the last
    # entry of vep_msc_ranks.
    # NOTE(review): that fallthrough is silent - a term missing from
    # vep_msc_ranks is indistinguishable from a genuine last-rank hit.
    for consq in vep_msc_ranks: 
        if consq in gene_consequence_list: 
            break 
    vrnt_gene_pair_msc_consq.append(consq)
if vrnt_gene_pair_consq_collapse_df.shape[0] != len(vrnt_gene_pair_msc_consq): 
    raise ValueError("Number of MSC per gene does not match number of variant-gene pairs.")
vrnt_gene_pair_consq_collapse_df["consequence"] = vrnt_gene_pair_msc_consq
vrnt_gene_pair_consq_collapse_df = vrnt_gene_pair_consq_collapse_df.drop(columns=["gene_consequence"])
# split into variants with annotated gene effect and no predicted gene effect
vrnt_gene_effect_df = vrnt_gene_pair_consq_collapse_df[vrnt_gene_pair_consq_collapse_df.consequence != "no_predicted_effect"]
no_predicted_effect_df = vrnt_gene_pair_consq_collapse_df[vrnt_gene_pair_consq_collapse_df.consequence == "no_predicted_effect"]
# load VEP MSC annotations
# (re-reads vep_msc_path and rebinds vep_msc_df, this time with Consequence renamed to "msc")
vep_msc_df = pd.read_csv(vep_msc_path, sep="\t").rename(columns={"Uploaded_variation": "vrnt_id", "Consequence": "msc"})
# merge VEP MSC with no predicted effect 
# NOTE(review): the result name is spelled "no_prediced_effect_df" (missing "t");
# it is a different variable from no_predicted_effect_df above.
no_prediced_effect_df = pd.merge(no_predicted_effect_df[["vrnt_id", "gene_id"]], 
                                 vep_msc_df[["vrnt_id", "msc"]], 
                                 on="vrnt_id", 
                                 how="left")
# check for NaNs 
if no_prediced_effect_df.msc.isnull().values.any(): 
    raise ValueError("Variants missing VEP most severe consequence.")
# keep the variant-level MSC only when it is one of the terms not linked to a
# gene; every other pair stays "no_predicted_effect"
no_prediced_effect_df["consequence"] = np.where(no_prediced_effect_df.msc.isin(vep_msc_not_linked_to_gene), 
                                                no_prediced_effect_df.msc, 
                                                "no_predicted_effect")
no_prediced_effect_trunc_df = no_prediced_effect_df[["vrnt_id", "gene_id", "consequence"]]
# combine variants with annotated gene effect and variants with no predicted effect with updated regulatory consequence
vrnt_gene_pair_consq_msc_added_df = pd.concat([vrnt_gene_effect_df, no_prediced_effect_trunc_df])
if vrnt_gene_pair_consq_msc_added_df[["vrnt_id", "gene_id"]].drop_duplicates().shape[0] != vrnt_gene_pairs_df.shape[0]: 
    raise ValueError("Number of gene pairs in expression input does not match number of gene pairs with annotated consequence.")

In [73]:
# Attach the per-pair consequence to every carrier record; the inner join
# keeps only records whose (vrnt_id, gene_id) pair has a consequence row.
vrnt_gene_body_window_carriers_vep_df = vrnt_gene_body_window_carriers_df.merge(
    vrnt_gene_pair_consq_msc_added_df,
    how="inner",
    on=["vrnt_id", "gene_id"],
)
# label each record as "<SVTYPE>_<consequence>"
vrnt_gene_body_window_carriers_vep_df["sv_type_consequence"] = (
    vrnt_gene_body_window_carriers_vep_df["SVTYPE"]
    + "_"
    + vrnt_gene_body_window_carriers_vep_df["consequence"]
)
# every SV type / consequence combination observed among carriers
sv_type_consequences = vrnt_gene_body_window_carriers_vep_df["sv_type_consequence"].unique()

In [74]:
# count carriers
def _count_carriers_by_sv_type_consq(carriers_df, suffix):
    """Count distinct gene-sample pairs per SV type/consequence.

    Groups ``carriers_df`` by window, MAF bin and ``sv_type_consequence``,
    counts unique ``gene_smpl_pair`` values and pivots the result to one row
    per (window, maf_bin) with one column per SV type/consequence.

    Parameters
    ----------
    carriers_df : pandas.DataFrame
        Must contain the columns ``window``, ``maf_bin``,
        ``sv_type_consequence`` and ``gene_smpl_pair``.
    suffix : str
        Appended (after an underscore) to every count column name.

    Returns
    -------
    pandas.DataFrame
        Columns ``window``, ``maf_bin`` and ``<sv_type_consequence>_<suffix>``;
        combinations with no carriers in a given row are NaN.
    """
    counts_df = (carriers_df
                 .groupby(["window", "maf_bin", "sv_type_consequence"])
                 .gene_smpl_pair.nunique()
                 .reset_index()
                 .pivot(index=["window", "maf_bin"], columns=["sv_type_consequence"], values="gene_smpl_pair")
                 .reset_index())
    rename_columns = {col: f"{col}_{suffix}" for col in counts_df.columns if col not in ["window", "maf_bin"]}
    return counts_df.rename(columns=rename_columns)


count_misexp_cntrl_carriers_gene_body_vep_df_list = []
for z_cutoff in z_cutoff_list:
    # misexpression events: z-score and TPM both above their cutoffs
    print(f"Z-score threshold: {z_cutoff}")
    misexp_df = ge_matrix_flat_chrom_egan_df[(ge_matrix_flat_chrom_egan_df["z-score"] > z_cutoff) &
                                             (ge_matrix_flat_chrom_egan_df["TPM"] > tpm_cutoff)]
    misexp_genes = misexp_df.gene_id.unique()
    print(f"\tNumber of misexpressed genes: {len(misexp_genes)}")
    # not used further in this cell; kept as a notebook-level variable
    misexp_rna_id = misexp_df.rna_id.unique()
    misexp_gene_smpl = misexp_df.gene_smpl_pair.unique()
    print(f"\tNumber of misexpressed gene-sample pairs: {len(misexp_gene_smpl)}")
    # control events: same genes, every gene-sample pair that is not misexpressed
    cntrl_df = ge_matrix_flat_chrom_egan_df[(ge_matrix_flat_chrom_egan_df.gene_id.isin(misexp_genes)) &
                                            (~ge_matrix_flat_chrom_egan_df.gene_smpl_pair.isin(misexp_gene_smpl))]
    cntrl_gene_smpl = cntrl_df.gene_smpl_pair.unique()
    print(f"\tNumber of control gene-sample pairs: {len(cntrl_gene_smpl)}")

    # carrier records belonging to misexpression / control events
    misexp_carriers_df = vrnt_gene_body_window_carriers_vep_df[vrnt_gene_body_window_carriers_vep_df.gene_smpl_pair.isin(misexp_gene_smpl)]
    cntrl_carriers_df = vrnt_gene_body_window_carriers_vep_df[vrnt_gene_body_window_carriers_vep_df.gene_smpl_pair.isin(cntrl_gene_smpl)]

    # SV type/consequence carrier counts for misexpression and control events
    count_misexp_carriers_svtype_df = _count_carriers_by_sv_type_consq(misexp_carriers_df, "misexp_check")
    count_cntrl_carriers_svtype_df = _count_carriers_by_sv_type_consq(cntrl_carriers_df, "contrl_check")

    # combine misexpression and control counts
    # NOTE(review): the inner join drops any (window, maf_bin) row that has
    # carriers in only one of the two groups, so those rows are not compared
    # later - confirm this is intended.
    df_merged = pd.merge(count_misexp_carriers_svtype_df,
                         count_cntrl_carriers_svtype_df,
                         on=["window", "maf_bin"],
                         how='inner')

    df_merged["z_cutoff"] = z_cutoff
    df_merged["misexp_genes_check"] = len(misexp_genes)
    df_merged["total_misexp_check"] = len(misexp_gene_smpl)
    df_merged["total_control_check"] = len(cntrl_gene_smpl)
    count_misexp_cntrl_carriers_gene_body_vep_df_list.append(df_merged)

Z-score threshold: 2
	Number of misexpressed genes: 150
	Number of misexpressed gene-sample pairs: 714
	Number of control gene-sample pairs: 395286
Z-score threshold: 10
	Number of misexpressed genes: 150
	Number of misexpressed gene-sample pairs: 438
	Number of control gene-sample pairs: 395562
Z-score threshold: 20
	Number of misexpressed genes: 110
	Number of misexpressed gene-sample pairs: 162
	Number of control gene-sample pairs: 290238
Z-score threshold: 30
	Number of misexpressed genes: 54
	Number of misexpressed gene-sample pairs: 57
	Number of control gene-sample pairs: 142503
Z-score threshold: 40
	Number of misexpressed genes: 25
	Number of misexpressed gene-sample pairs: 25
	Number of control gene-sample pairs: 65975


In [75]:
# Stack the per-z-cutoff count tables, store maf_bin as strings and treat
# SV type/consequence columns without a count as zero carriers.
count_misexp_cntrl_carriers_gene_body_vep_df = (
    pd.concat(count_misexp_cntrl_carriers_gene_body_vep_df_list)
    .assign(maf_bin=lambda counts_df: counts_df.maf_bin.astype(str))
    .fillna(0)
)

In [76]:
# Collect the "<sv_type_consequence>_<group>_check" column names that are
# absent from the count table, i.e. combinations without any counted carrier.
vep_consq_no_carrier = [
    f"{sv_vep}_{group}_check"
    for sv_vep in sv_type_consequences
    for group in ["misexp", "contrl"]
    if f"{sv_vep}_{group}_check" not in count_misexp_cntrl_carriers_gene_body_vep_df.columns
]

In [77]:
# Copy of the count table with a zero-filled column added for every
# missing SV type/consequence column.
count_misexp_cntrl_carriers_gene_body_vep_all_df = count_misexp_cntrl_carriers_gene_body_vep_df.assign(
    **{vep_consq: 0 for vep_consq in vep_consq_no_carrier}
)

In [78]:
# Put the counts recomputed in this notebook ("*_check" columns) next to the
# counts in sv_gene_window_carrier_count_check_df, matching rows on MAF bin
# and z-score cutoff only.
check_sv_type_vep_df = sv_gene_window_carrier_count_check_df.merge(
    count_misexp_cntrl_carriers_gene_body_vep_all_df,
    how="inner",
    on=["maf_bin", "z_cutoff"],
)

In [79]:
# Every recomputed "<sv_type_consequence>_<group>_check" column must equal its
# "<sv_type_consequence>_<group>" counterpart once both are cast to int.
for sv_vep in sv_type_consequences:
    for group in ["misexp", "contrl"]:
        count_col = f"{sv_vep}_{group}"
        recomputed_counts = check_sv_type_vep_df[f"{count_col}_check"].astype(int)
        compared_counts = check_sv_type_vep_df[count_col].astype(int)
        if not recomputed_counts.equals(compared_counts):
            raise ValueError(f"Number of {sv_vep} {group} events does not match.")