### Variant Enrichment Testing 

In [1]:
import pandas as pd 
import numpy as np
import sys 
import statsmodels.api as sm
from pathlib import Path
from scipy.stats import fisher_exact
from collections import Counter
from statsmodels.stats import multitest

In [15]:
### constants
# chr1..chr22 in numeric order, followed by the two sex chromosomes
CHROMOSOMES = [f"chr{number}" for number in range(1, 23)] + ["chrX", "chrY"]

In [32]:
### inputs
# root working directory; the project-relative paths below are resolved against it
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/"
wkdir_path = Path(wkdir)

# SV site information table (kept as a plain string path)
sv_info_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/info_table/final_sites_critical_info_allele.txt"
# directories holding the per-chromosome carrier count tables
sv_carrier_count_gene_body_dir = wkdir_path / "4_vrnt_enrich/sv_count_carriers/gene_body/200kb_window/carrier_count_gene_msc_reg_af50"
sv_carrier_count_windows_dir = wkdir_path / "4_vrnt_enrich/sv_count_carriers/windows/200kb_window/carrier_count"
snp_indel_root_dir = wkdir_path / "4_vrnt_enrich/snp_indel_count_carriers/count_snp_indel_carriers_af50"

vep_msc_path = wkdir_path / "4_vrnt_enrich/sv_vep/msc/SV_vep_hg38_msc_parsed.tsv"
# directory that receives the enrichment result tables
out_dir = wkdir_path / "4_vrnt_enrich/enrich_results_af50"

In [33]:
# create the results directory, including missing parents; an existing directory is not an error
out_dir_path = Path(out_dir)
out_dir_path.mkdir(exist_ok=True, parents=True)

In [24]:
# column names for the list returned by misexp_enrich, in the order the values are returned
enrich_results_cols = [
    "misexp_carrier", "total_misexp",
    "control_carrier", "total_control",
    "risk_ratio", "risk_ratio_lower", "risk_ratio_upper",
    "odds_ratio", "odds_ratio_lower", "odds_ratio_upper",
    "pval",
]

def misexp_enrich(conting_mtx):
    '''
    Calculate enrichment metrics from a 2x2 contingency matrix.

    Parameters
    ----------
    conting_mtx : 2x2 nested sequence (or array)
        [[misexp_carrier, misexp_noncarrier],
         [control_carrier, control_noncarrier]]

    Returns
    -------
    list
        Values in the order of enrich_results_cols: carriers and row total for
        the first row, carriers and row total for the second row, risk ratio
        with lower/upper 95% bounds, odds ratio with lower/upper 95% bounds,
        and the Fisher's exact test p-value.
    '''
    # work only from the argument: the result must not depend on whatever a
    # module-level variable happens to hold when the function is called
    conting_arr = np.array(conting_mtx)
    # build the table once and reuse it for every statistic
    table = sm.stats.Table2x2(conting_arr)
    oddsratio = table.oddsratio
    riskratio = table.riskratio
    _, pval = fisher_exact(conting_arr)
    # 95% confidence intervals by a normal approximation
    riskratio_confint_lower, riskratio_confint_upper = table.riskratio_confint(0.05, method="normal")
    oddsratio_confint_lower, oddsratio_confint_upper = table.oddsratio_confint(0.05, method="normal")
    # counts are taken from the input as passed so their types are preserved
    misexp_row, control_row = conting_mtx[0], conting_mtx[1]
    return [misexp_row[0], sum(misexp_row),
            control_row[0], sum(control_row),
            riskratio, riskratio_confint_lower, riskratio_confint_upper,
            oddsratio, oddsratio_confint_lower, oddsratio_confint_upper,
            pval]

In [25]:
# SV classes found in the SV info table, preceded by the "all_sv" group
sv_info_df = pd.read_csv(sv_info_path, sep="\t", dtype={"plinkID": str})
sv_info_keep_cols = ["plinkID", "AF", "SVTYPE"]
sv_info_id_af_df = sv_info_df.loc[:, sv_info_keep_cols].rename(columns={"plinkID": "vrnt_id"})
sv_type_list = sv_info_id_af_df["SVTYPE"].unique().tolist()
all_sv_type_list = ["all_sv", *sv_type_list]

# list of VEP consequences
# One term per entry: every entry needs its trailing comma, otherwise Python
# silently concatenates adjacent string literals into a single bogus term.
# NOTE(review): these terms are used to build column names that are looked up in
# the carrier count tables, so those tables must contain separate columns for
# "splice_polypyrimidine_tract_variant" and "incomplete_terminal_codon_variant"
# -- confirm the upstream counting step uses the same comma-separated list.
vep_msc_ranks = [
    "transcript_ablation",
    "splice_acceptor_variant",
    "splice_donor_variant",
    "stop_gained",
    "frameshift_variant",
    "stop_lost",
    "start_lost",
    "transcript_amplification",
    "inframe_insertion",
    "inframe_deletion",
    "missense_variant",
    "protein_altering_variant",
    "splice_donor_5th_base_variant",
    "splice_region_variant",
    "splice_donor_region_variant",
    "splice_polypyrimidine_tract_variant",
    "incomplete_terminal_codon_variant",
    "start_retained_variant",
    "stop_retained_variant",
    "synonymous_variant",
    "coding_sequence_variant",
    "mature_miRNA_variant",
    "5_prime_UTR_variant",
    "3_prime_UTR_variant",
    "non_coding_transcript_exon_variant",
    "intron_variant",
    "NMD_transcript_variant",
    "non_coding_transcript_variant",
    "coding_transcript_variant",
    "upstream_gene_variant",
    "downstream_gene_variant",
    "TFBS_ablation",
    "TFBS_amplification",
    "TF_binding_site_variant",
    "regulatory_region_ablation",
    "regulatory_region_amplification",
    "feature_elongation",
    "regulatory_region_variant",
    "feature_truncation",
    "intergenic_variant",
    "sequence_variant",
    "no_predicted_effect",
]


In [26]:
sv_carrier_count_gene_body_path = Path(sv_carrier_count_gene_body_dir)
# read one gene body carrier count table per chromosome (first 22 entries of CHROMOSOMES)
carrier_count_df_list = [
    pd.read_csv(sv_carrier_count_gene_body_path / f"{chrom}_carrier_count_gene_msc.tsv", sep="\t")
    for chrom in CHROMOSOMES[:22]
]
# stack the per-chromosome tables and discard the column that is not needed downstream
stacked_carrier_count_df = pd.concat(carrier_count_df_list)
all_chrom_carrier_count_df = stacked_carrier_count_df.drop(columns=["smpls_pass_qc"])

In [27]:
window_raw = "gene_body_200"
window_name = "gene body +/-200kb"
# one entry per tested combination, keyed by a running counter
sv_200kb_enrich_results_dict = {}
sv_200kb_enrich_count = 0
# [lower, upper] bounds; joined as "lower-upper" to match the maf_bin labels of the count table
af_range_list = [[0,1], [1, 5], [5, 10], [10,50]]

for af_lower, af_upper in af_range_list:
    af_range_name = f"{af_lower}-{af_upper}"
    carrier_count_maf_df = all_chrom_carrier_count_df[all_chrom_carrier_count_df.maf_bin == af_range_name]
    # sum the per-chromosome counts so there is a single row per z-score cutoff
    carrier_count_summed_df = carrier_count_maf_df.groupby(by=["z_cutoff"], as_index=False).sum()
    # calculate odds and risk ratio with confidence intervals
    for z_cutoff in carrier_count_summed_df.z_cutoff:
        name = f"> {round(z_cutoff)}"
        # select the row for this cutoff once and reuse it for every column lookup below
        cutoff_counts_df = carrier_count_summed_df[carrier_count_summed_df.z_cutoff == z_cutoff]
        # total misexpression and control events
        total_misexp = cutoff_counts_df["total_misexp"].item()
        total_control = cutoff_counts_df["total_control"].item()
        for sv_type in all_sv_type_list:
            # "all" tests every variant of the SV type; the breakdown by most severe
            # consequence is only done for individual SV classes, not the "all_sv" group
            consequence_list = ["all"] if sv_type == "all_sv" else ["all"] + vep_msc_ranks
            for consequence in consequence_list:
                col_prefix = sv_type if consequence == "all" else f"{sv_type}_{consequence}"
                misexp_carrier = cutoff_counts_df[f"{col_prefix}_misexp"].item()
                cntrl_carrier = cutoff_counts_df[f"{col_prefix}_contrl"].item()
                # rows: misexpression / control events; columns: carriers / non-carriers
                conting_mtx_list = [[misexp_carrier, total_misexp - misexp_carrier],
                                    [cntrl_carrier, total_control - cntrl_carrier]]
                enrich_results = misexp_enrich(conting_mtx_list)
                sv_200kb_enrich_results_dict[sv_200kb_enrich_count] = [f"{sv_type}", consequence, af_range_name, z_cutoff, name, window_raw, window_name] + enrich_results
                sv_200kb_enrich_count += 1

In [28]:
# descriptive columns that precede the enrichment statistics in every SV result row
sv_cols = [
    "vrnt_type", "consequence", "maf_range",
    "z_cutoff", "z_cutoff_name",
    "window_raw", "window_name",
]
sv_enrich_cols = [*sv_cols, *enrich_results_cols]
# one DataFrame row per dictionary entry
sv_200kb_enrich_results_df = pd.DataFrame.from_dict(
    sv_200kb_enrich_results_dict,
    orient="index",
    columns=sv_enrich_cols,
)

In [29]:
# proportion of misexpression events with a rare SV
# the part of the filter that does not depend on the z-score is applied once:
# all SVs combined, maf_range 0-1, all consequences, gene body +/-200kb window
all_rare_sv_200kb_df = sv_200kb_enrich_results_df[
    (sv_200kb_enrich_results_df.vrnt_type == "all_sv")
    & (sv_200kb_enrich_results_df.maf_range == "0-1")
    & (sv_200kb_enrich_results_df.consequence == "all")
    & (sv_200kb_enrich_results_df.window_raw == "gene_body_200")
]
perc_misexp_sv_carrier = {}
for i, z_score in enumerate([2, 10, 20, 30, 40]):
    # exactly one row is expected per cutoff; .item() raises otherwise
    misexp_cntrl_carrier_df = all_rare_sv_200kb_df[all_rare_sv_200kb_df.z_cutoff == z_score]
    misexp_carrier = misexp_cntrl_carrier_df.misexp_carrier.item()
    misexp_total = misexp_cntrl_carrier_df.total_misexp.item()
    perc_carrier = (misexp_carrier/misexp_total)*100
    perc_misexp_sv_carrier[i] = [f"> {z_score}", perc_carrier, misexp_carrier, misexp_total]
    print(f"Percentage misexpression events with rare SV {z_score}: {perc_carrier}% ({misexp_carrier}/{misexp_total})")

Percentage misexpression events with rare SV 2: 1.3176064441887225% (229/17380)
Percentage misexpression events with rare SV 10: 1.527934008248969% (163/10668)
Percentage misexpression events with rare SV 20: 1.8390307226308955% (85/4622)
Percentage misexpression events with rare SV 30: 2.4764735017335315% (50/2019)
Percentage misexpression events with rare SV 40: 4.732254047322541% (38/803)


In [30]:
# tabulate the carrier percentages, one row per z-score cutoff
perc_misexp_sv_carrier_cols = ["z-score", "perc_carrier", "misexp_carrier", "misexp_total"]
perc_misexp_sv_carrier_df = pd.DataFrame.from_dict(
    perc_misexp_sv_carrier,
    orient="index",
    columns=perc_misexp_sv_carrier_cols,
)
# tab-separated output in the results directory, without the row index
perc_misexp_sv_carrier_path = out_dir / "perc_misexp_carrier_zscores.tsv"
perc_misexp_sv_carrier_df.to_csv(perc_misexp_sv_carrier_path, sep="\t", index=False)

# SV windows enrichment results

In [34]:
# raw window identifier -> display label
window_names_dict = {
    # gene body (the zero-distance windows in either direction carry the same label)
    "gene_body": "gene body",
    "upstream_0": "gene body",
    "downstream_0": "gene body",
    "gene_body_window_10000": "gene body +/-10kb",
    # downstream windows, nearest first
    "downstream_200000": "TTS to 200kb",
    "downstream_400000": "200kb to 400kb",
    "downstream_600000": "400kb to 600kb",
    "downstream_800000": "600kb to 800kb",
    "downstream_1000000": "800kb to 1Mb",
    # upstream windows, nearest first
    "upstream_200000": "TSS to -200kb",
    "upstream_400000": "-200kb to -400kb",
    "upstream_600000": "-400kb to -600kb",
    "upstream_800000": "-600kb to -800kb",
    "upstream_1000000": "-800kb to -1Mb",
}

In [35]:
sv_carrier_count_windows_path = Path(sv_carrier_count_windows_dir)
# one entry per tested window / SV type combination, keyed by a running counter
sv_windows_enrich_results_dict = {}
sv_window_count = 0
af_range_list = [[0,1]]
for af_lower, af_upper in af_range_list:
    af_range_name = f"{af_lower}-{af_upper}"
    # combine control and misexpression count files from chromosomes
    carrier_count_df_list = [
        pd.read_csv(sv_carrier_count_windows_path.joinpath(f"{chrom}_carrier_count.tsv"), sep="\t")
        for chrom in CHROMOSOMES[:22]
    ]
    all_chrom_carrier_count_df = pd.concat(carrier_count_df_list).drop(columns=["smpls_pass_qc"])
    # fold cutoff, direction and window size into one key ("<z_cutoff>_<direction>_<window_size>")
    # so the counts can be summed over chromosomes with a single groupby
    all_chrom_carrier_count_df["name"] = all_chrom_carrier_count_df.z_cutoff.astype(str) + "_" + all_chrom_carrier_count_df.direction + "_" + all_chrom_carrier_count_df.window_size.astype(str)
    all_chrom_carrier_count_df = all_chrom_carrier_count_df.drop(columns=["z_cutoff", "direction", "window_size"])
    carrier_count_summed_df = all_chrom_carrier_count_df.groupby(["name"]).sum()
    all_enrich_results = {}

    for window in carrier_count_summed_df.index:
        # split the key once; only the two fields after the cutoff form the raw window name
        window_fields = window.split("_")
        z_cutoff = float(window_fields[0])
        name = f"> {round(z_cutoff)}"
        window_raw = window_fields[1] + "_" + window_fields[2]
        window_name = window_names_dict[window_raw]
        # select the row for this window once and reuse it for every column lookup below
        window_counts_df = carrier_count_summed_df[carrier_count_summed_df.index == window]
        total_misexp = window_counts_df["total_misexp"].item()
        total_control = window_counts_df["total_control"].item()
        for sv_type in all_sv_type_list:
            misexp_carrier = window_counts_df[f"{sv_type}_misexp"].item()
            cntrl_carrier = window_counts_df[f"{sv_type}_contrl"].item()
            # rows: misexpression / control events; columns: carriers / non-carriers
            conting_mtx_list = [[misexp_carrier, total_misexp - misexp_carrier],
                                [cntrl_carrier, total_control - cntrl_carrier]]
            enrich_results = misexp_enrich(conting_mtx_list)
            sv_windows_enrich_results_dict[sv_window_count] = [f"{sv_type}", "all", af_range_name, z_cutoff, name, window_raw, window_name] + enrich_results
            sv_window_count += 1

sv_window_enrich_results_df = pd.DataFrame.from_dict(sv_windows_enrich_results_dict, orient="index", columns=sv_enrich_cols)

In [36]:
# drop duplicate results: keep every window except "downstream_0"
# (presumably redundant with "upstream_0", which carries the same "gene body" label -- confirm)
is_downstream_zero = sv_window_enrich_results_df["window_raw"] == "downstream_0"
sv_window_enrich_results_no_dupl_df = sv_window_enrich_results_df[~is_downstream_zero]

In [37]:
# stack the gene body +/-200kb results on top of the de-duplicated window results
sv_result_frames = [
    sv_200kb_enrich_results_df,
    sv_window_enrich_results_no_dupl_df,
]
sv_combined_results_df = pd.concat(sv_result_frames)

### Enrichment calculations for SNVs and indels 

In [38]:
snp_indel_root_path = Path(snp_indel_root_dir)
# read one SNV/indel carrier count table per chromosome (first 22 entries of CHROMOSOMES)
snp_indel_carrier_count_df_list = [
    pd.read_csv(snp_indel_root_path / f"{chrom}_carrier_count.tsv", sep="\t")
    for chrom in CHROMOSOMES[:22]
]
all_chrom_snp_indel_carrier_count_df = pd.concat(snp_indel_carrier_count_df_list)
# sum the counts over chromosomes for each cutoff / window / variant type / MAF range
snp_indel_group_cols = ["z_cutoff", "window", "vrnt_type", "maf_range"]
all_chrom_snp_indel_carrier_sum_df = (
    all_chrom_snp_indel_carrier_count_df
    .groupby(by=snp_indel_group_cols, as_index=False)
    .sum()
)
# non-carriers = total events minus carriers, for both event classes
all_chrom_snp_indel_carrier_sum_df["misexp_noncarrier"] = (
    all_chrom_snp_indel_carrier_sum_df["misexp_total"] - all_chrom_snp_indel_carrier_sum_df["misexp_carrier"]
)
all_chrom_snp_indel_carrier_sum_df["control_noncarrier"] = (
    all_chrom_snp_indel_carrier_sum_df["control_total"] - all_chrom_snp_indel_carrier_sum_df["control_carrier"]
)

In [39]:
# two test sets are built from the summed SNV/indel counts:
#   1. the gene body +/-10kb window, for every MAF range
#   2. all other windows, restricted to the "0-0.01" MAF range
is_10kb_window = all_chrom_snp_indel_carrier_sum_df["window"] == "gene_body_window_10000"
is_lowest_maf_range = all_chrom_snp_indel_carrier_sum_df["maf_range"] == "0-0.01"
snp_indel_10kb_set_df = all_chrom_snp_indel_carrier_sum_df[is_10kb_window].copy()
snp_indel_window_set_df = all_chrom_snp_indel_carrier_sum_df[is_lowest_maf_range & ~is_10kb_window].copy()
# combine
snp_indel_test_set_df = pd.concat([snp_indel_10kb_set_df, snp_indel_window_set_df])

# enrichment testing, one result per row, keyed by the row's index label
snp_indel_test_set_results = {}
for index, row in snp_indel_test_set_df.iterrows():
    # rows: misexpression / control events; columns: carriers / non-carriers
    conting_mtx_list = [
        [row["misexp_carrier"], row["misexp_noncarrier"]],
        [row["control_carrier"], row["control_noncarrier"]],
    ]
    entries = misexp_enrich(conting_mtx_list)
    # the first four row values are carried through ahead of the statistics
    snp_indel_test_set_results[index] = row.tolist()[:4] + entries

In [40]:
# result columns: the first four columns of the test set followed by the enrichment statistics
snp_indel_enrich_cols = list(snp_indel_test_set_df.columns[:4]) + enrich_results_cols
snp_indel_test_set_results_df = pd.DataFrame.from_dict(
    snp_indel_test_set_results,
    orient="index",
    columns=snp_indel_enrich_cols,
)
# SNV/indel results are not split by consequence
snp_indel_test_set_results_df["consequence"] = "all"
# second name for the same DataFrame object (not a copy)
snp_indel_all_results_df = snp_indel_test_set_results_df

In [41]:
# rename MAF ranges from fractions to the "lower-upper" labels used for the SV results
rename_maf_range={"0-0.01": "0-1", "0.01-0.05": "1-5",
                  "0.05-0.1": "5-10", "0.1-0.5": "10-50"}
snp_indel_all_results_df["maf_range"]  = snp_indel_all_results_df["maf_range"].replace(rename_maf_range)
# label z-score cutoffs exactly like the SV results (f"> {round(z_cutoff)}"), so a cutoff
# stored as a float (e.g. 2.0) is labelled "> 2" rather than "> 2.0"
snp_indel_all_results_df["z_cutoff_name"]  = [f"> {round(z_cutoff)}" for z_cutoff in snp_indel_all_results_df["z_cutoff"]]
# window naming: raw identifiers without an entry in window_names_dict are left unchanged
snp_indel_all_results_df["window_name"]  = snp_indel_all_results_df["window"].replace(window_names_dict)
# match the SV results, where the raw window identifier column is called "window_raw"
snp_indel_all_results_df = snp_indel_all_results_df.rename(columns={"window": "window_raw"})

In [42]:
# stack the SV results on top of the SNV/indel results
snp_indel_sv_results_df = pd.concat([sv_combined_results_df, snp_indel_all_results_df])
# display label for the consequence: underscores become spaces, then str.capitalize
# upper-cases the first character and lower-cases the rest
consequence_words = snp_indel_sv_results_df["consequence"].str.split("_")
snp_indel_sv_results_df["consequence_name"] = consequence_words.str.join(" ").str.capitalize()

In [45]:
# write the combined enrichment table to the results directory, tab-separated, without the row index
all_enrich_results_path = f"{out_dir}/snp_indel_sv_all_enrich_results_z_cutoff.tsv"
snp_indel_sv_results_df.to_csv(all_enrich_results_path, sep="\t", index=False)