### Enrichment testing and multiple testing correction

* SNV, indel and SV across z-score cutoffs and allele frequency bins (3 x 4 x 5 = 60)
* SNV, indel and SV window tests across z-score cutoffs (3 x 11 x 5 = 165)
* DEL, DUP, INV, MEI rare z-score cutoffs (4 x 5 = 20)
* DEL, DUP, INV, MEI rare window tests across z-score cutoffs (4 x 11 x 5 = 220)
* SV consequences by z-score cutoffs (47 x 5 = 235), only includes consequences actually seen in the dataset
* Total: 60 + 165 + 20 + 220 + 235 = 700 tests, the number of tests used for the multiple testing correction

In [1]:
import pandas as pd 
from pathlib import Path 
import statsmodels.api as sm
from scipy.stats import fisher_exact
from statsmodels.stats import multitest
import numpy as np

In [2]:
# Project working directory; the input and output paths below are built from it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

# Directory for the multiple-testing-corrected enrichment results
# (created together with any missing parents; reused if it already exists).
outdir_path = wkdir_path / "4_vrnt_enrich/enrich_results_mul_test"
outdir_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Tab-separated table of misexpression/control carrier counts; later cells filter it by
# variant type, consequence, MAF range, z-score cutoff and window.
all_carrier_count_path = wkdir_path / "4_vrnt_enrich/combine_count_carriers/snp_indel_sv_all_carrier_count_z_cutoff.tsv"
all_carrier_count_df = pd.read_csv(all_carrier_count_path, delimiter="\t")

In [4]:
# Test family 1: SNVs, indels and SVs (all consequences pooled) in the
# gene body +/-10kb window, at every z-score cutoff and in every allele frequency bin.
z_cutoff_bins = [2, 10, 20, 30, 40]
snv_indel_sv_zscore_af_df = all_carrier_count_df.loc[
    lambda df: df["vrnt_type"].isin(["all_sv", "snp", "indel"])
    & df["consequence"].eq("all")
    & df["z_cutoff"].isin(z_cutoff_bins)
    & df["window_name"].isin(["gene body +/-10kb"])
]
print(f"Number of tests: {len(snv_indel_sv_zscore_af_df)}")

Number of tests: 60


In [5]:
# Test family 2: rare (maf_range "0-1") SNVs, indels and SVs across z-score cutoffs
# and positional windows, i.e. every window except the two gene-body +/- flank windows.
z_cutoff_bins = [2, 10, 20, 30, 40]
snv_indel_sv_rare_windows_zscore_df = all_carrier_count_df.loc[
    lambda df: df["vrnt_type"].isin(["all_sv", "snp", "indel"])
    & df["consequence"].eq("all")
    & df["maf_range"].eq("0-1")
    & df["z_cutoff"].isin(z_cutoff_bins)
    & ~df["window_name"].isin(["gene body +/-10kb", "gene body +/-200kb"])
]
print(f"Number of tests: {len(snv_indel_sv_rare_windows_zscore_df)}")

Number of tests: 165


In [6]:
# Test family 3: rare (maf_range "0-1") SVs split by type (DEL, DUP, INV, MEI),
# all consequences pooled, in the gene body +/-200kb window at every z-score cutoff.
sv_types_rare_zscore_df = all_carrier_count_df.loc[
    lambda df: df["vrnt_type"].isin(["DEL", "DUP", "INV", "MEI"])
    & df["consequence"].eq("all")
    & df["z_cutoff"].isin(z_cutoff_bins)
    & df["maf_range"].eq("0-1")
    & df["window_name"].isin(["gene body +/-200kb"])
]
print(f"Number of tests: {len(sv_types_rare_zscore_df)}")

Number of tests: 20


In [7]:
# Test family 4: rare (maf_range "0-1") SVs split by type (DEL, DUP, INV, MEI),
# all consequences pooled, across z-score cutoffs and positional windows
# (every window except the two gene-body +/- flank windows).
sv_types_rare_windows_zscore_df = all_carrier_count_df.loc[
    lambda df: df["vrnt_type"].isin(["DEL", "DUP", "INV", "MEI"])
    & df["consequence"].eq("all")
    & df["z_cutoff"].isin(z_cutoff_bins)
    & df["maf_range"].eq("0-1")
    & ~df["window_name"].isin(["gene body +/-10kb", "gene body +/-200kb"])
]
print(f"Number of tests: {len(sv_types_rare_windows_zscore_df)}")

Number of tests: 220


In [8]:
# Identify the (SV type, consequence) pairs that have at least one carrier in the dataset.
# The z-score cutoff of 2 (the lowest cutoff tested) is used for this lookup, on rare SVs
# in the gene body +/-200kb window.
sv_msc_zscore_2_df = all_carrier_count_df.loc[
    lambda df: df["vrnt_type"].isin(["DEL", "DUP", "INV", "MEI"])
    & df["consequence"].ne("all")
    & df["z_cutoff"].eq(2)
    & df["maf_range"].eq("0-1")
    & df["window_name"].eq("gene body +/-200kb")
].copy()
# carriers among misexpression events plus carriers among controls
sv_msc_zscore_2_df["total_carrier"] = sv_msc_zscore_2_df["misexp_carrier"] + sv_msc_zscore_2_df["control_carrier"]
sv_msc_zscore_2_total_carriers_df = sv_msc_zscore_2_df[["vrnt_type", "consequence", "total_carrier"]]
# keep pairs with a non-zero carrier total, then reduce to the two key columns
sv_msc_nonzero_carrier_df = (
    sv_msc_zscore_2_total_carriers_df
    .loc[lambda df: df["total_carrier"].ne(0)]
    .drop_duplicates()
    .drop(columns=["total_carrier"])
)
print(f"SV consequences with carriers: {len(sv_msc_nonzero_carrier_df)}")

SV consequences with carriers: 47


In [9]:
# Test family 5: rare SV consequences (per SV type) in the gene body +/-200kb window
# at every z-score cutoff ...
sv_msc_zscore_df = all_carrier_count_df.loc[
    lambda df: df["vrnt_type"].isin(["DEL", "DUP", "INV", "MEI"])
    & df["consequence"].ne("all")
    & df["z_cutoff"].isin(z_cutoff_bins)
    & df["maf_range"].eq("0-1")
    & df["window_name"].eq("gene body +/-200kb")
].drop_duplicates()


# ... restricted, via an inner merge, to the (SV type, consequence) pairs that have carriers.
sv_msc_zscore_2_nonzero_carrier_df = sv_msc_zscore_df.merge(
    sv_msc_nonzero_carrier_df,
    how="inner",
    on=["vrnt_type", "consequence"],
)
print(f"Number of tests: {len(sv_msc_zscore_2_nonzero_carrier_df)}")

Number of tests: 235


In [10]:
# Stack the five test families into the single table that is tested and corrected together.
all_carrier_count_df_list = [
    snv_indel_sv_zscore_af_df,
    snv_indel_sv_rare_windows_zscore_df,
    sv_types_rare_zscore_df,
    sv_types_rare_windows_zscore_df,
    sv_msc_zscore_2_nonzero_carrier_df,
]
# NOTE: this rebinds all_carrier_count_df from the full carrier-count table to the
# selected tests only, so the filtering cells above must not be re-run after this cell
# without reloading the table first.
all_carrier_count_df = pd.concat(all_carrier_count_df_list).drop("consequence_name", axis=1)
# total number of tests entering the multiple testing correction
test_num = len(all_carrier_count_df)
print(f"Total number of tests required: {test_num}")

Total number of tests required: 700


In [11]:
### enrichment testing 
def enrichment_test(row):
    """Test one row of carrier counts for enrichment of carriers among misexpression events.

    Builds the 2x2 contingency table

        [[misexp_carrier,  misexp_total - misexp_carrier],
         [control_carrier, control_total - control_carrier]]

    and derives the risk ratio, odds ratio, their 95% confidence intervals and a
    Fisher's exact test p-value (scipy's default alternative) from it.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide "misexp_carrier", "misexp_total", "control_carrier" and
        "control_total".

    Returns
    -------
    pandas.Series
        Keys, in order: "risk_ratio", "risk_ratio_lower", "risk_ratio_upper",
        "odds_ratio", "odds_ratio_lower", "odds_ratio_upper", "pval".
    """
    misexp_carrier = row["misexp_carrier"]
    misexp_total = row["misexp_total"]
    control_carrier = row["control_carrier"]
    control_total = row["control_total"]
    # contingency matrix: rows = misexpression / control, columns = carrier / non-carrier
    conting_mtx = np.array([
        [misexp_carrier, misexp_total - misexp_carrier],
        [control_carrier, control_total - control_carrier],
    ])
    # build the statsmodels table once and reuse it for every statistic
    # (it was previously re-created for each of the four quantities)
    table = sm.stats.Table2x2(conting_mtx)
    oddsratio = table.oddsratio
    riskratio = table.riskratio
    # Fisher's exact test on the raw counts
    _, pval = fisher_exact(conting_mtx)
    # 95% confidence intervals by a normal approximation
    riskratio_confint_lower, riskratio_confint_upper = table.riskratio_confint(0.05, method="normal")
    oddsratio_confint_lower, oddsratio_confint_upper = table.oddsratio_confint(0.05, method="normal")
    return pd.Series({
        'risk_ratio': riskratio,
        'risk_ratio_lower': riskratio_confint_lower,
        'risk_ratio_upper': riskratio_confint_upper,
        'odds_ratio': oddsratio,
        'odds_ratio_lower': oddsratio_confint_lower,
        'odds_ratio_upper': oddsratio_confint_upper,
        'pval': pval,
    })

In [12]:
# Run the enrichment test on every row (one row = one test); the Series returned by
# enrichment_test is expanded into one column per statistic.
enrichment_results_df = all_carrier_count_df.apply(enrichment_test, axis=1, result_type="expand")
# Column-wise concat so each test's statistics sit next to its carrier counts.
# NOTE(review): this relies on both frames carrying the same row index; the input frame
# was built with pd.concat without ignore_index, so index labels may repeat -- confirm.
all_enrich_tests_df = pd.concat([all_carrier_count_df, enrichment_results_df], axis=1)

In [13]:
### multiple testing correction
# Correct the p-values of all tests jointly with two methods (Benjamini-Hochberg FDR and
# Bonferroni); for each method store a pass flag and the adjusted p-value.
pval_as_array = all_enrich_tests_df["pval"].to_numpy()
for method in ("fdr_bh", "bonferroni"):
    # only the first two returned items (reject flags, adjusted p-values) are needed
    pass_test, pval_adj = multitest.multipletests(pval_as_array, alpha=0.05, method=method)[:2]
    all_enrich_tests_df[f"{method}_pass"] = pass_test
    all_enrich_tests_df[f"{method}_pval_adj"] = pval_adj

In [14]:
# Sanity check: no test flagged as passing Bonferroni may have a raw p-value above
# the expected per-test threshold of 0.05 / number of tests.
max_passing_pval = all_enrich_tests_df.loc[all_enrich_tests_df["bonferroni_pass"], "pval"].max()
if max_passing_pval > 0.05 / test_num:
    raise ValueError("Max p-value passing Bonferroni greater than expected cutoff.")

In [15]:
# Save every test with its statistics and adjusted p-values; the number of tests
# is embedded in the file name.
all_enrich_tests_bonf_path = outdir_path / f"snp_indel_sv_all_enrich_results_{test_num}_bonf_adj_gene_body_10kb.tsv"
all_enrich_tests_df.to_csv(all_enrich_tests_bonf_path, sep="\t", index=False)

### Enrichment Results 

In [17]:
# Check that rare SVs (all SV types pooled) pass Bonferroni in the gene body +/-10kb
# window at every z-score cutoff.
# The filter selects window_raw == "gene_body_10", so the printed message reports +/-10kb
# (it previously said +/-200kb). The variable names keep their "_200" suffix so that any
# other cell referring to them keeps working.
all_sv_rare_gene_body_200_df = all_enrich_tests_df[(all_enrich_tests_df.maf_range == "0-1") &
                                                   (all_enrich_tests_df.vrnt_type == "all_sv") &
                                                   (all_enrich_tests_df.window_raw == "gene_body_10")]
# .all() on an empty selection is vacuously True, which would hide a filter that
# matches no tests -- fail loudly instead
if all_sv_rare_gene_body_200_df.empty:
    raise ValueError("No rare all_sv tests found for window_raw == 'gene_body_10'.")
all_sv_rare_gene_body_200 = all_sv_rare_gene_body_200_df.bonferroni_pass.all()
print(f"At all z-score cutoffs rare SVs are enriched in gene body +/-10kb window: {all_sv_rare_gene_body_200}")

At all z-score cutoffs rare SVs are enriched in gene body +/-200kb window: True


In [18]:
# Count significant enrichments (risk ratio > 1, passing Bonferroni) for SVs outside the
# rare MAF range, i.e. low frequency and common SVs, across all z-score cutoffs.
# The window selected here is window_raw == "gene_body_10".
all_sv_not_rare_gene_body_200_df = all_enrich_tests_df.loc[
    lambda df: df["maf_range"].ne("0-1")
    & df["vrnt_type"].eq("all_sv")
    & df["window_raw"].eq("gene_body_10")
    & df["risk_ratio"].gt(1)
]
all_sv_not_rare_gene_body_200_pass = len(
    all_sv_not_rare_gene_body_200_df.loc[lambda df: df["bonferroni_pass"]]
)
print(f"Number of tests where non-rare SVs are significantly enriched in gene body window: {all_sv_not_rare_gene_body_200_pass}")

Number of tests where non-rare SVs are significantly enriched in gene body window: 1


In [19]:
# Count significant enrichments (risk ratio > 1, passing Bonferroni) for SNVs and indels
# in the gene body +/-10kb window, over all MAF bins and z-score cutoffs.
snv_indel_all_maf_gene_body_10_df = all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].isin(["indel", "snp"])
    & df["window_raw"].eq("gene_body_window_10000")
    & df["risk_ratio"].gt(1)
]
snv_indel_all_maf_gene_body_10_pass = len(
    snv_indel_all_maf_gene_body_10_df.loc[lambda df: df["bonferroni_pass"]]
)
print(f"Number of tests where SNVs/indels (any MAF) are significantly enriched in gene body window: {snv_indel_all_maf_gene_body_10_pass}")
# largest risk ratio for SNVs in the gene body +/-10kb window, regardless of significance
snv_max_nominal_enrich = all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].isin(["snp"])
    & df["window_raw"].eq("gene_body_window_10000")
]["risk_ratio"].max()
print(f"Max risk ratio SNVs in gene body +/- 10kb all MAF cutoffs all z-score cutoffs: {snv_max_nominal_enrich}")
# largest risk ratio for indels in the same window, regardless of significance
indel_max_nominal_enrich = all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].isin(["indel"])
    & df["window_raw"].eq("gene_body_window_10000")
]["risk_ratio"].max()
print(f"Max risk ratio indels in gene body +/- 10kb all MAF cutoffs all z-score cutoffs: {indel_max_nominal_enrich}")

Number of tests where SNVs/indels (any MAF) are significantly enriched in gene body window: 0
Max risk ratio SNVs in gene body +/- 10kb all MAF cutoffs all z-score cutoffs: 1.0379045440191041
Max risk ratio indels in gene body +/- 10kb all MAF cutoffs all z-score cutoffs: 1.1159430987899683


In [21]:
# List the positional windows (gene-body +/- flank windows excluded) and z-score cutoffs
# at which rare SVs pass Bonferroni.
all_sv_rare_windows_df = all_enrich_tests_df.loc[
    lambda df: df["maf_range"].eq("0-1")
    & df["vrnt_type"].eq("all_sv")
    & ~df["window_raw"].isin(["gene_body_200", "gene_body_10"])
    & df["bonferroni_pass"]
]
print(all_sv_rare_windows_df[["z_cutoff", "window_name"]])

       z_cutoff    window_name
16769      10.0      gene body
16779      10.0  TSS to -200kb
16879       2.0      gene body
16889       2.0  TSS to -200kb
16934      20.0      gene body
16944      20.0  TSS to -200kb
17044      30.0      gene body
17054      30.0  TSS to -200kb
17154      40.0      gene body
17164      40.0  TSS to -200kb


In [22]:
# Significant enrichments (risk ratio > 1, passing Bonferroni) for rare SNVs and indels
# across windows other than the gene body +/-10kb window.
all_snv_indel_rare_windows_df = all_enrich_tests_df.loc[
    lambda df: df["maf_range"].eq("0-1")
    & df["vrnt_type"].isin(["indel", "snp"])
    & df["window_raw"].ne("gene_body_window_10000")
    & df["bonferroni_pass"]
    & df["risk_ratio"].gt(1)
]
# observe enrichment for indels in some windows but not consistent across windows and z-scores
# risk ratios very low too

In [23]:
# Largest risk ratio among the significant rare indel window enrichments selected above.
max_indel_enrich_windows = all_snv_indel_rare_windows_df.loc[
    lambda df: df["vrnt_type"].eq("indel")
]["risk_ratio"].max()
print(f"Maximum significant rare indel enrichment across windows: {max_indel_enrich_windows}")

Maximum significant rare indel enrichment across windows: 1.1090835902065395


In [24]:
### SV type enrichment - gene body +/- 200kb and windows

# 200kb window around genes only DELs and DUPs significantly enriched
# Deletions (all consequences pooled): tests passing Bonferroni, ordered by risk ratio.
all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].isin(["DEL"])
    & df["consequence"].eq("all")
    & df["bonferroni_pass"]
].sort_values("risk_ratio")

# Interpretation (per the recorded output of this cell):
# Deletions significantly enriched in gene body +/-200kb window at all z-scores
# Deletions significantly enriched in the upstream window (TSS to -200kb) at all z-scores
# NOTE(review): an earlier note said deletions were also enriched in the gene body window
# at high z-scores (30, 40); no gene body rows appear in the recorded output -- confirm.

Unnamed: 0,vrnt_type,consequence,maf_range,z_cutoff,z_cutoff_name,window_raw,window_name,misexp_carrier,misexp_total,control_carrier,...,risk_ratio_lower,risk_ratio_upper,odds_ratio,odds_ratio_lower,odds_ratio_upper,pval,fdr_bh_pass,fdr_bh_pval_adj,bonferroni_pass,bonferroni_pval_adj
8373,DEL,all,0-1,2.0,> 2,gene_body_200,gene body +/-200kb,187,17380,80411,...,1.185375,1.576959,1.371213,1.186995,1.584021,4.326669e-05,True,0.0003563139,True,0.03028669
9017,DEL,all,0-1,10.0,> 10,gene_body_200,gene body +/-200kb,132,10668,80234,...,1.328671,1.865497,1.581563,1.331911,1.878009,9.504309e-07,True,1.108836e-05,True,0.0006653016
16890,DEL,all,0-1,2.0,> 2,upstream_200000,TSS to -200kb,108,17380,38088,...,1.380955,2.012405,1.671217,1.382783,2.019814,7.638483e-07,True,9.492433e-06,True,0.0005346938
9339,DEL,all,0-1,20.0,> 20,gene_body_200,gene body +/-200kb,68,4622,63210,...,1.500565,2.405959,1.913519,1.505865,2.431529,1.301025e-06,True,1.445584e-05,True,0.0009107176
16780,DEL,all,0-1,10.0,> 10,upstream_200000,TSS to -200kb,85,10668,38008,...,1.731307,2.645421,2.149259,1.73576,2.661262,3.145948e-10,True,8.156161e-09,True,2.202164e-07
9661,DEL,all,0-1,30.0,> 30,gene_body_200,gene body +/-200kb,38,2019,36970,...,1.770464,3.324942,2.453609,1.779645,3.382808,1.274055e-06,True,1.438449e-05,True,0.0008918384
16945,DEL,all,0-1,20.0,> 20,upstream_200000,TSS to -200kb,45,4622,29889,...,1.98785,3.55726,2.675504,1.994336,3.589326,1.045641e-08,True,1.829871e-07,True,7.319486e-06
17055,DEL,all,0-1,30.0,> 30,upstream_200000,TSS to -200kb,24,2019,17514,...,2.172666,4.815724,3.261533,2.18027,4.879028,9.986397e-07,True,1.14598e-05,True,0.0006990478
9983,DEL,all,0-1,40.0,> 40,gene_body_200,gene body +/-200kb,28,803,15706,...,3.244114,6.72126,4.802109,3.292669,7.003513,4.68506e-11,True,1.311817e-09,True,3.279542e-08
17165,DEL,all,0-1,40.0,> 40,upstream_200000,TSS to -200kb,17,803,7392,...,3.761657,9.646207,6.13242,3.790784,9.920527,8.151503e-09,True,1.46309e-07,True,5.706052e-06


In [25]:
# Duplications (all consequences pooled): tests passing Bonferroni, ordered by risk ratio.
all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].isin(["DUP"])
    & df["consequence"].eq("all")
    & df["bonferroni_pass"]
].sort_values("risk_ratio")
# Duplications significantly enriched in gene body +/-200kb window at all z-scores
# Duplications significantly enriched in gene body window at all z-scores

Unnamed: 0,vrnt_type,consequence,maf_range,z_cutoff,z_cutoff_name,window_raw,window_name,misexp_carrier,misexp_total,control_carrier,...,risk_ratio_lower,risk_ratio_upper,odds_ratio,odds_ratio_lower,odds_ratio_upper,pval,fdr_bh_pass,fdr_bh_pval_adj,bonferroni_pass,bonferroni_pval_adj
8453,DUP,all,0-1,2.0,> 2,gene_body_200,gene body +/-200kb,36,17380,7906,...,1.930256,3.712788,2.680538,1.931458,3.720134,2.641502e-07,True,3.555869e-06,True,0.0001849052
9097,DUP,all,0-1,10.0,> 10,gene_body_200,gene body +/-200kb,26,10668,7901,...,2.143761,4.625803,3.154317,2.145324,4.637861,6.369462e-07,True,8.106589e-06,True,0.0004458624
9419,DUP,all,0-1,20.0,> 20,gene_body_200,gene body +/-200kb,17,4622,6242,...,2.991064,7.736091,4.824383,2.994565,7.772304,2.124559e-07,True,2.974382e-06,True,0.0001487191
9741,DUP,all,0-1,30.0,> 30,gene_body_200,gene body +/-200kb,15,2019,3807,...,5.611961,15.413631,9.362705,5.628203,15.575174,2.165972e-10,True,5.831464e-09,True,1.516181e-07
16882,DUP,all,0-1,2.0,> 2,upstream_0,gene body,22,17380,964,...,8.794992,20.468198,13.432803,8.800693,20.502952,1.2203900000000002e-17,True,9.491925e-16,True,8.542733e-15
10063,DUP,all,0-1,40.0,> 40,gene_body_200,gene body +/-200kb,12,803,1680,...,10.648617,32.870975,18.977759,10.71019,33.627351,5.427455e-12,True,1.651834e-10,True,3.799219e-09
16772,DUP,all,0-1,10.0,> 10,upstream_0,gene body,19,10668,959,...,12.044643,29.844,18.991483,12.055418,29.918202,3.894176e-18,True,4.543206e-16,True,2.725923e-15
16937,DUP,all,0-1,20.0,> 20,upstream_0,gene body,13,4622,772,...,17.204574,51.41681,29.823368,17.22528,51.635345,2.500025e-15,True,1.166678e-13,True,1.750018e-12
17047,DUP,all,0-1,30.0,> 30,upstream_0,gene body,13,2019,496,...,35.732239,107.118858,62.262029,35.835576,108.176309,2.3637209999999998e-19,True,5.5153490000000006e-17,True,1.654605e-16
17157,DUP,all,0-1,40.0,> 40,upstream_0,gene body,10,803,267,...,52.378475,183.73271,99.324615,52.62924,187.450534,3.515968e-17,True,2.461178e-15,True,2.461178e-14


In [26]:
# Inversions (all consequences pooled): tests passing Bonferroni, ordered by risk ratio.
all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].isin(["INV"])
    & df["consequence"].eq("all")
    & df["bonferroni_pass"]
].sort_values("risk_ratio")
# INVs only enriched over gene body for >2 and >10 z-cutoffs

Unnamed: 0,vrnt_type,consequence,maf_range,z_cutoff,z_cutoff_name,window_raw,window_name,misexp_carrier,misexp_total,control_carrier,...,risk_ratio_lower,risk_ratio_upper,odds_ratio,odds_ratio_lower,odds_ratio_upper,pval,fdr_bh_pass,fdr_bh_pval_adj,bonferroni_pass,bonferroni_pval_adj
16883,INV,all,0-1,2.0,> 2,upstream_0,gene body,7,17380,88,...,21.662808,100.957767,46.784119,21.665116,101.026638,3.942279e-10,True,9.515845e-09,True,2.759595e-07
16773,INV,all,0-1,10.0,> 10,upstream_0,gene body,7,10668,88,...,35.264085,164.315721,76.170566,35.270407,164.49924,1.374748e-11,True,4.009683e-10,True,9.623239e-09


In [27]:
# MEIs (all consequences pooled): tests passing Bonferroni, ordered by risk ratio.
all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].isin(["MEI"])
    & df["consequence"].eq("all")
    & df["bonferroni_pass"]
].sort_values("risk_ratio")

Unnamed: 0,vrnt_type,consequence,maf_range,z_cutoff,z_cutoff_name,window_raw,window_name,misexp_carrier,misexp_total,control_carrier,...,risk_ratio_lower,risk_ratio_upper,odds_ratio,odds_ratio_lower,odds_ratio_upper,pval,fdr_bh_pass,fdr_bh_pval_adj,bonferroni_pass,bonferroni_pval_adj


In [28]:
# Deletion consequences passing Bonferroni in the gene body +/-200kb window,
# ordered by risk ratio.
all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].isin(["DEL"])
    & df["consequence"].ne("all")
    & df["window_raw"].eq("gene_body_200")
    & df["bonferroni_pass"]
].sort_values("risk_ratio")

Unnamed: 0,vrnt_type,consequence,maf_range,z_cutoff,z_cutoff_name,window_raw,window_name,misexp_carrier,misexp_total,control_carrier,...,risk_ratio_lower,risk_ratio_upper,odds_ratio,odds_ratio_lower,odds_ratio_upper,pval,fdr_bh_pass,fdr_bh_pval_adj,bonferroni_pass,bonferroni_pval_adj
80,DEL,no_predicted_effect,0-1,2.0,> 2,gene_body_200,gene body +/-200kb,139,17380,53793,...,1.287061,1.793093,1.523336,1.288885,1.800436,3.596051e-06,True,3.701817e-05,True,0.002517
81,DEL,no_predicted_effect,0-1,10.0,> 10,gene_body_200,gene body +/-200kb,97,10668,53619,...,1.419819,2.110835,1.737895,1.422732,2.122871,5.88902e-07,True,7.633915e-06,True,0.000412
82,DEL,no_predicted_effect,0-1,20.0,> 20,gene_body_200,gene body +/-200kb,50,4622,41569,...,1.61232,2.799281,2.136759,1.616772,2.823985,1.445409e-06,True,1.556594e-05,True,0.001012
83,DEL,no_predicted_effect,0-1,30.0,> 30,gene_body_200,gene body +/-200kb,25,2019,23555,...,1.696621,3.699404,2.524165,1.701074,3.745519,4.32434e-05,True,0.0003563139,True,0.03027
45,DEL,upstream_gene_variant,0-1,2.0,> 2,gene_body_200,gene body +/-200kb,12,17380,1524,...,2.623654,8.167887,4.631731,2.624052,8.175499,1.844943e-05,True,0.0001655718,True,0.012915
84,DEL,no_predicted_effect,0-1,40.0,> 40,gene_body_200,gene body +/-200kb,19,803,9741,...,3.274796,7.970355,5.208521,3.302892,8.213618,1.483367e-08,True,2.532578e-07,True,1e-05
46,DEL,upstream_gene_variant,0-1,10.0,> 10,gene_body_200,gene body +/-200kb,10,10668,1526,...,3.368277,11.675195,6.275933,3.368979,11.691181,7.104095e-06,True,6.812146e-05,True,0.004973
39,DEL,non_coding_transcript_exon_variant,0-1,40.0,> 40,gene_body_200,gene body +/-200kb,4,803,329,...,11.910231,85.147102,31.999696,11.909887,85.97735,9.58854e-06,True,8.83155e-05,True,0.006712


In [29]:
# Duplication consequences passing Bonferroni in the gene body +/-200kb window,
# ordered by risk ratio.
all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].isin(["DUP"])
    & df["consequence"].ne("all")
    & df["window_raw"].eq("gene_body_200")
    & df["bonferroni_pass"]
].sort_values("risk_ratio")

Unnamed: 0,vrnt_type,consequence,maf_range,z_cutoff,z_cutoff_name,window_raw,window_name,misexp_carrier,misexp_total,control_carrier,...,risk_ratio_lower,risk_ratio_upper,odds_ratio,odds_ratio_lower,odds_ratio_upper,pval,fdr_bh_pass,fdr_bh_pval_adj,bonferroni_pass,bonferroni_pval_adj
140,DUP,non_coding_transcript_exon_variant,0-1,2.0,> 2,gene_body_200,gene body +/-200kb,8,17380,349,...,6.687598,27.157057,13.482224,6.688336,27.177221,2.468041e-07,True,3.387507e-06,True,0.000173
120,DUP,transcript_amplification,0-1,2.0,> 2,gene_body_200,gene body +/-200kb,9,17380,284,...,9.596524,36.170803,18.64013,9.598025,36.200617,2.942343e-09,True,6.241334e-08,True,2e-06
121,DUP,transcript_amplification,0-1,10.0,> 10,gene_body_200,gene body +/-200kb,6,10668,287,...,8.915413,44.892958,20.016677,8.916202,44.936996,8.348578e-07,True,9.905092e-06,True,0.000584
141,DUP,non_coding_transcript_exon_variant,0-1,10.0,> 10,gene_body_200,gene body +/-200kb,8,10668,349,...,10.88659,44.199589,21.951592,10.888784,44.254011,6.156241e-09,True,1.165118e-07,True,4e-06
142,DUP,non_coding_transcript_exon_variant,0-1,20.0,> 20,gene_body_200,gene body +/-200kb,5,4622,314,...,11.630692,68.009897,28.154135,11.63188,68.145078,1.332198e-06,True,1.457091e-05,True,0.000933
122,DUP,transcript_amplification,0-1,20.0,> 20,gene_body_200,gene body +/-200kb,4,4622,209,...,12.574495,90.872781,33.831954,12.574486,91.025679,7.767531e-06,True,7.347665e-05,True,0.005437
127,DUP,coding_sequence_variant,0-1,20.0,> 20,gene_body_200,gene body +/-200kb,3,4622,104,...,16.173894,160.493773,50.981527,16.172468,160.712401,3.43899e-05,True,0.0002971967,True,0.024073
143,DUP,non_coding_transcript_exon_variant,0-1,30.0,> 30,gene_body_200,gene body +/-200kb,5,2019,230,...,21.180003,124.325765,51.439823,21.185939,124.896773,7.150757e-08,True,1.11234e-06,True,5e-05
123,DUP,transcript_amplification,0-1,30.0,> 30,gene_body_200,gene body +/-200kb,4,2019,124,...,28.160751,205.889974,76.293877,28.161975,206.688472,3.279864e-07,True,4.331896e-06,True,0.00023
128,DUP,coding_sequence_variant,0-1,30.0,> 30,gene_body_200,gene body +/-200kb,3,2019,76,...,29.418491,295.11939,93.314243,29.413195,296.042233,5.853279e-06,True,5.690688e-05,True,0.004097


In [30]:
# Inversion consequences passing Bonferroni in the gene body +/-200kb window,
# ordered by risk ratio.
all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].isin(["INV"])
    & df["consequence"].ne("all")
    & df["window_raw"].eq("gene_body_200")
    & df["bonferroni_pass"]
].sort_values("risk_ratio")

Unnamed: 0,vrnt_type,consequence,maf_range,z_cutoff,z_cutoff_name,window_raw,window_name,misexp_carrier,misexp_total,control_carrier,...,risk_ratio_lower,risk_ratio_upper,odds_ratio,odds_ratio_lower,odds_ratio_upper,pval,fdr_bh_pass,fdr_bh_pval_adj,bonferroni_pass,bonferroni_pval_adj
192,INV,coding_sequence_variant,0-1,20.0,> 20,gene_body_200,gene body +/-200kb,2,4622,12,...,65.902509,1314.899323,294.499495,65.894318,1316.197749,2.899973e-05,True,0.0002537476,True,0.02029981
190,INV,coding_sequence_variant,0-1,2.0,> 2,gene_body_200,gene body +/-200kb,7,17380,7,...,206.23929,1675.916381,588.147873,206.278641,1676.94493,1.380879e-16,True,8.78741e-15,True,9.666151e-14
191,INV,coding_sequence_variant,0-1,10.0,> 10,gene_body_200,gene body +/-200kb,7,10668,7,...,335.721268,2727.73477,957.580433,335.825865,2730.463555,4.597358e-18,True,4.597358e-16,True,3.21815e-15


In [31]:
# Relabel the "TES to 200kb" window as "TTS to 200kb" (TTS = transcription termination site).
# This overwrites the window_name column of all_enrich_tests_df.
tes_to_tts_replace = {"TES to 200kb": "TTS to 200kb"}
all_enrich_tests_df["window_name"] = all_enrich_tests_df["window_name"].replace(tes_to_tts_replace)

In [32]:
# Check whether rare SVs in the upstream window (window_raw "upstream_200000") pass
# Bonferroni at every z-score cutoff: show the distinct pass flags.
all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].eq("all_sv")
    & df["window_raw"].eq("upstream_200000")
    & df["maf_range"].eq("0-1")
]["bonferroni_pass"].unique()
# the upstream 200kb window remains significant at every z-score cutoff

array([ True])

### Subset enrichments for figures  

1. SNV, indel and SV - z-score cutoffs and MAF cutoff 
2. SNV, indel and SV windows analysis 
3. SV types - Cutoff, Bonferroni at z-score = 10 
    * Only includes DELs and DUPs 
    * Supplementary figure with other z-score cutoffs 
4. Windows - DEL, DUP, INV 
    * Supplementary figure with other z-score cutoffs
5. SV consequences 

In [33]:
# Directory for the per-figure subsets of the enrichment results.
subset_enrich_path = outdir_path.joinpath("grouped_enrich_gene_body_10kb")
# parents takes a boolean; the string "True" only worked because any non-empty string is truthy
subset_enrich_path.mkdir(parents=True, exist_ok=True)

In [39]:
### SNV, Indel and SV comparison
# SNV, indel and SV - gene body +/-10kb window results, all MAF bins
snp_indel_sv_gene_window_df = all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].isin(["all_sv", "indel", "snp"])
    & df["window_raw"].isin(["gene_body_10", "gene_body_window_10000"])
]
snp_indel_sv_gene_window_df.to_csv(subset_enrich_path / "snp_indel_sv_gene_window_results.tsv", sep="\t", index=False)

# SNV, indel and SV - positional windows, MAF < 1%
snp_indel_sv_window_df = all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].isin(["all_sv", "indel", "snp"])
    & df["maf_range"].eq("0-1")
    & ~df["window_raw"].isin(["gene_body_10", "gene_body_200", "gene_body_window_10000"])
]
snp_indel_sv_window_df.to_csv(subset_enrich_path / "snp_indel_sv_maf_0_1perc_window_bin_results.tsv", sep="\t", index=False)

### SV variant types enrich, MAF < 1%, z cutoffs
# rare (MAF < 1%) SV types, all consequences pooled, gene body +/- 200kb
sv_types_rare_200kb_results_df = all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].isin(["DEL", "DUP", "INV", "MEI"])
    & df["window_raw"].isin(["gene_body_200"])
    & df["maf_range"].isin(["0-1"])
    & df["consequence"].eq("all")
]
sv_types_rare_200kb_results_df.to_csv(subset_enrich_path / "sv_types_maf_0_1perc_200kb_all_z_results.tsv", sep="\t", index=False)

# rare (MAF < 1%) SV types, all consequences pooled, positional windows
sv_types_rare_windows_results_df = all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].isin(["DEL", "DUP", "INV", "MEI"])
    & ~df["window_raw"].isin(["gene_body_10", "gene_body_200"])
    & df["maf_range"].isin(["0-1"])
    & df["consequence"].eq("all")
]
sv_types_rare_windows_results_df.to_csv(subset_enrich_path / "sv_types_maf_0_1perc_windows_all_z_results.tsv", sep="\t", index=False)

In [36]:
# SV consequence results (DEL, DUP, INV) in the gene body +/-200kb window, keyed by
# "<SV type>|<consequence>"
sv_vep_enrich_tests_df = all_enrich_tests_df.loc[
    lambda df: df["vrnt_type"].isin(["INV", "DUP", "DEL"])
    & df["consequence"].ne("all")
    & df["window_raw"].eq("gene_body_200")
].copy()
sv_vep_enrich_tests_df["vrnt_type_consequence"] = sv_vep_enrich_tests_df["vrnt_type"] + "|" + sv_vep_enrich_tests_df["consequence"]

# keep every row (all z-score cutoffs) of the type|consequence pairs that pass
# Bonferroni significance at least once
vrnt_type_consq_pass_bonf = sv_vep_enrich_tests_df.loc[
    lambda df: df["bonferroni_pass"]
]["vrnt_type_consequence"].unique()
sv_vep_enrich_tests_consq_pass_bonf_df = sv_vep_enrich_tests_df.loc[
    lambda df: df["vrnt_type_consequence"].isin(vrnt_type_consq_pass_bonf)
].copy()
# display names for the consequences
consq_names_dict = {
    "non_coding_transcript_exon_variant": "Non-coding transcript",
    "upstream_gene_variant": "Upstream",
    "no_predicted_effect": "No predicted effect",
    'transcript_amplification': "Transcript amplification",
    "coding_sequence_variant": "Coding",
}
sv_vep_enrich_tests_consq_pass_bonf_df["consq_name"] = sv_vep_enrich_tests_consq_pass_bonf_df["consequence"].replace(consq_names_dict)
# mask (set to NaN) the risk ratio and its confidence bounds wherever the raw p-value
# does not pass the nominal 0.05 cutoff
nominal_pass = sv_vep_enrich_tests_consq_pass_bonf_df["pval"] < 0.05
for masked_col, source_col in [
    ("risk_ratio_pass", "risk_ratio"),
    ("risk_upper_pass", "risk_ratio_upper"),
    ("risk_lower_pass", "risk_ratio_lower"),
]:
    sv_vep_enrich_tests_consq_pass_bonf_df[masked_col] = np.where(
        nominal_pass,
        sv_vep_enrich_tests_consq_pass_bonf_df[source_col],
        np.nan,
    )

# write to file
sv_consq_pass_bonf_path = subset_enrich_path / "sv_consq_pass_bonf.tsv"
sv_vep_enrich_tests_consq_pass_bonf_df.to_csv(sv_consq_pass_bonf_path, sep="\t", index=False)

In [37]:
# Main-text figure input: the same consequence table restricted to z-cutoff = 10.
# Note that sv_consq_pass_bonf_path is rebound here to the z10 output file.
sv_consq_pass_z10_bonf_df = sv_vep_enrich_tests_consq_pass_bonf_df.loc[
    lambda df: df["z_cutoff"].eq(10)
].copy()
sv_consq_pass_bonf_path = subset_enrich_path / "sv_consq_pass_bonf_z10.tsv"
sv_consq_pass_z10_bonf_df.to_csv(sv_consq_pass_bonf_path, sep="\t", index=False)