### Enrichment of misexpression-associated SVs in regulatory annotations 

* 3D genome architecture 
    * TAD boundaries from: 
        * GM12878 (shared across multiple cell types) 
    * A/B compartments 
        * GM12878 cell line 
* CTCF-binding sites 
    * All CTCF-only 
    * B-cells 
    * Neutrophils 
    * CD14 monocytes 
* Chromatin features 
    * ChromHMM chromatin states (15-state core-marks model, PBMCs) 
* CpG islands 

In [1]:
import pandas as pd 
import numpy as np
from pybedtools import BedTool
from io import StringIO
import statsmodels.api as sm
from statsmodels.discrete import discrete_model
import sys
from collections import Counter
from scipy.stats import fisher_exact
from patsy import dmatrices
from pathlib import Path
import math
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style
from matplotlib import pyplot
from functools import reduce
from statsmodels.stats import multitest

In [2]:
# Root of the misexpression analysis; every project-relative path below hangs off it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/"
wkdir_path = Path(wkdir)

# --- inputs ---
# SV site information table (kept as a plain string; it lives outside the working directory)
sv_info_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/info_table/final_sites_critical_info_allele.txt"
# variant IDs in windows, the matching BED file, and the misexpression-associated variant IDs
all_vrnts_path = wkdir_path / "5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_window_cntrl_misexp_genes.txt"
all_vrnts_bed_path = wkdir_path / "5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_windows_misexp_genes.bed"
misexp_vrnts_path = wkdir_path / "5_misexp_vrnts/test_cntrl_sets/vrnt_id_misexp_tpm_zscore_median.txt"

# --- reference annotations ---
# TAD boundaries shared across cell lines
shared_tad_boundaries_path = wkdir_path / "reference/4d_nucleome/shared_boundaries/4DNFIVK5JOFU_imr90_huvec_hnek_hmec.bed"
# directory holding the ENCODE cCRE BED files
encode_c_cres_dir = wkdir_path / "reference/encode/encode_c_cres"
# CpG islands
cpg_isl_bed_path = wkdir_path / "reference/cpg_islands/cpgIslandExt.bed"
# ChromHMM 15-state segmentation (PBMCs)
pbmcs_chromhmm_15marks_path = wkdir_path / "reference/chromhmm/E062_15_coreMarks_hg38lift_mnemonics.bed"
# A/B compartment scores in GM12878
gm12878_ab_path = wkdir_path / "reference/4d_nucleome/gm12878_hi_c/compartments/4DNFILYQ1PAY.bg"

In [3]:
# Output directory for this notebook's results; created together with any
# missing parents, and left untouched if it already exists.
out_dir = wkdir_path / "5_misexp_vrnts/functional"
out_dir_path = Path(out_dir)
out_dir_path.mkdir(exist_ok=True, parents=True)

In [4]:
# Load the BED file of variants in windows and sort it once; every annotation
# intersect further down reuses this sorted BedTool.
vrnts_in_windows_bed = BedTool(all_vrnts_bed_path)
vrnts_in_windows_bed_sorted = vrnts_in_windows_bed.sort()

# All variants in windows.
# * IDs are read explicitly as strings: every overlap table and the
#   misexpression ID list are cast to str before `isin`, so letting pandas infer
#   a numeric dtype here would make every membership test silently return 0.
# * Exact duplicate rows are dropped because the feature tables derived from this
#   frame are later inner-merged on its columns, which would multiply duplicates.
all_vrnts_in_windows_df = (
    pd.read_csv(all_vrnts_path, sep="\t", header=None, dtype={0: str})
    .rename(columns={0: "vrnt_id"})
    .drop_duplicates()
    .reset_index(drop=True)
)
all_vrnts_in_windows = all_vrnts_in_windows_df.vrnt_id.unique()
print(f"Number of variant IDs in windows: {len(all_vrnts_in_windows)}")

# Load misexpression-associated variants and flag them (1/0) in the window table
misexp_vrnts_ids = pd.read_csv(misexp_vrnts_path, sep="\t", header=None)[0].astype(str).unique()
print(f"Number of unique misexpression-associated SVs: {len(misexp_vrnts_ids)}")
all_vrnts_in_windows_df["misexp_uniq"] = np.where(all_vrnts_in_windows_df.vrnt_id.isin(misexp_vrnts_ids), 1, 0)

Number of variant IDs in windows: 20255
Number of unique misexpression-associated SVs: 105


In [5]:
### shared TAD boundaries

# Column labels for the `bedtools intersect -wo` output parsed below: the four
# variant BED columns, then the boundary BED columns, then the overlap column.
bed_intersect_cols = {
    0: "vrnt_chrom", 1: "vrnt_start", 2: "vrnt_end", 3: "vrnt_id",
    4: "boundary_chrom", 5: "boundary_start", 6: "boundary_end",
    7: "boundary_no", 8: "boundary_score", 9: "overlap",
}

# NOTE(review): the three names below are defined but never referenced in the
# visible part of this notebook; kept so any later cell relying on them still runs.
all_chrom_vrnts_tad_count_dist = []
window_intersect_cols = {
    0: "boundary_chrom", 1: "boundary_start", 2: "boundary_end",
    3: "boundary_strength", 4: "boundary_score",
    5: "vrnt_chrom", 6: "vrnt_start", 7: "vrnt_end", 8: "vrnt_id",
}
window = 5000

# Only the shared boundary set is annotated here; this label becomes the prefix
# of the feature column name.
cell_type = "gm12878_shared"
vrnt_tad_features_df = all_vrnts_in_windows_df.copy()
print(cell_type)

# TAD boundaries BED file, sorted before intersecting
tad_boundaries_path = shared_tad_boundaries_path
tad_boundaries_bed = BedTool(tad_boundaries_path)
input_tad_bed = tad_boundaries_bed.sort()

# SVs that directly overlap a TAD boundary
tad_overlaps = vrnts_in_windows_bed_sorted.intersect(input_tad_bed, wo=True)
sv_intersect_tad_str = StringIO(str(tad_overlaps))
sv_intersect_tad_df = pd.read_csv(sv_intersect_tad_str, sep="\t", header=None)
sv_intersect_tad_df = sv_intersect_tad_df.rename(columns=bed_intersect_cols).astype({"vrnt_id": str})
sv_intersect_tad = sv_intersect_tad_df.vrnt_id.unique()

# 1 if the variant overlaps at least one boundary, otherwise 0
overlaps_boundary = vrnt_tad_features_df.vrnt_id.isin(sv_intersect_tad)
vrnt_tad_features_df[f"{cell_type}_intersect_tad_boundary"] = overlaps_boundary.astype(int)

# check for NaNs
print(vrnt_tad_features_df.isnull().values.any())

gm12878_shared
False


In [6]:
# Load A/B compartment scores in GM12878 (bedGraph) and sort before intersecting
gm12878_ab_bed = BedTool(gm12878_ab_path)
gm12878_ab_input = gm12878_ab_bed.sort()

# Column labels for `bedtools intersect -wo`: four variant BED columns, four
# bedGraph columns (chrom, start, end, score), then the overlap column.
ab_bed_intersect_cols = {0: 'vrnt_chrom', 1: 'vrnt_start', 2: 'vrnt_end', 3: 'vrnt_id',
                           4: 'compartment_chrom', 5: 'compartment_start', 6: 'compartment_end',
                           7: 'score', 8: 'overlap'}
sv_intersect_ab_str = StringIO(str(vrnts_in_windows_bed_sorted.intersect(gm12878_ab_input, wo=True)))
sv_intersect_ab_df = pd.read_csv(sv_intersect_ab_str, sep="\t", header=None).rename(columns=ab_bed_intersect_cols).astype({"vrnt_id":str})

# Label compartment types from the sign of the score: >= 0 is "A", < 0 is "B".
# Comparisons against NaN are False, so bins with a missing score match neither
# of the first two conditions and are labelled "Unassigned".
conditions = [sv_intersect_ab_df.score >= 0,
              sv_intersect_ab_df.score < 0,
              sv_intersect_ab_df.score.isnull()]
values = ["A", "B", "Unassigned"]

# `default` must be a string: np.select's implicit default is the integer 0,
# which recent NumPy releases refuse to combine with string choices.
sv_intersect_ab_df["compartment_type"] = np.select(conditions, values, default="Unassigned")

# One binary column per compartment: 1 if the variant overlaps at least one bin
# of that type. A variant spanning both A and B bins gets 1 in both columns.
vrnt_compartment_features_df = all_vrnts_in_windows_df.copy()
for overlap in ["A", "B"]:
    vrnt_overlap_compartment = sv_intersect_ab_df[sv_intersect_ab_df.compartment_type == overlap].vrnt_id.unique()
    vrnt_compartment_features_df[f"{overlap}_overlap"] = np.where(vrnt_compartment_features_df.vrnt_id.isin(vrnt_overlap_compartment), 1, 0)


### CTCF elements 

* cCREs CTCF: all cell-types 
    - CTCF-only elements from all cell-types 
* Specific primary cells: 
    * CD14 monocytes 
    * B-cells 
    * Neutrophils 

In [7]:
# Only the four variant columns are named; the cCRE annotation columns keep
# their integer labels and are addressed by position from the end of the row.
bed_intersect_cols = {0: "vrnt_chrom", 1: "vrnt_start", 2: "vrnt_end", 3: "vrnt_id"}
encode_c_cres_dir_path = Path(encode_c_cres_dir)

# cCRE BED files, paired by position with the cell-type labels below
bed_path_list = [
    "GRCh38-cCREs.CTCF-only.bed",
    "ENCFF389PZY_ENCFF587XGD_ENCFF184NWF_ENCFF496PSJ.7group.bed",
    "ENCFF035DJL.7group.bed",
    "ENCFF685DZI_ENCFF311TAY_ENCFF300LXQ.7group.bed",
]
cell_types_list = [
    "all",
    "CD14_monocyte",
    "B_cell",
    "Neutrophil",
]

# characters removed from cCRE type labels when building column names
cre_label_strip = str.maketrans("", "", "-,")

vrnt_ccre_features_df = all_vrnts_in_windows_df.copy()
for cell_type, bed_file in zip(cell_types_list, bed_path_list):
    ctcf_peaks_path = encode_c_cres_dir_path / bed_file
    peak_bed = BedTool(ctcf_peaks_path).sort()

    all_chrom_peak_info = []
    # SVs overlapping any element in this cCRE file
    sv_intersect_str = StringIO(str(vrnts_in_windows_bed_sorted.intersect(peak_bed, wo=True)))
    # an empty intersect cannot be parsed, so stop with an explicit error
    if sv_intersect_str.getvalue() == "":
        raise ValueError("No variants overlap ChIP-seq peaks")
    sv_intersect_df = pd.read_csv(sv_intersect_str, sep="\t", header=None).astype({3: str}).rename(columns=bed_intersect_cols)

    # the cCRE type label sits second from the end in the all-cell-type file
    # and third from the end in the cell-type-specific files
    cre_type_index = -2 if cell_type == "all" else -3
    cre_type_col = sv_intersect_df.columns[cre_type_index]
    cre_types_list = sv_intersect_df[cre_type_col].unique().tolist()

    # one binary column per cCRE type seen in this file, named "<type>_<cell type>"
    for cre_type in cre_types_list:
        is_cre_type = sv_intersect_df[cre_type_col] == cre_type
        vrnt_id_intersect_cre_type = sv_intersect_df.loc[is_cre_type, "vrnt_id"].unique()
        feature_col = f"{cre_type.translate(cre_label_strip)}_{cell_type}"
        vrnt_ccre_features_df[feature_col] = vrnt_ccre_features_df.vrnt_id.isin(vrnt_id_intersect_cre_type).astype(int)
    print(f"{cell_type}: number of unique variants in dataframe: {len(vrnt_ccre_features_df.vrnt_id.unique())}")

# subset to CTCF features
ctcf_feature_cols = [
    "vrnt_id",
    "misexp_uniq",
    "CTCFonlyCTCFbound_all",
    "CTCFonlyCTCFbound_CD14_monocyte",
    "HighCTCF_B_cell",
    "HighCTCF_Neutrophil",
]
vrnt_ctcf_ccre_features_df = vrnt_ccre_features_df[ctcf_feature_cols]

all: number of unique variants in dataframe: 20255
CD14_monocyte: number of unique variants in dataframe: 20255
B_cell: number of unique variants in dataframe: 20255
Neutrophil: number of unique variants in dataframe: 20255


### CpG Islands 

In [8]:
### CpG Islands
# Labels for the `bedtools intersect -wo` columns that are used; column 7 is
# not renamed and keeps its integer label.
bed_intersect_cols_cpg = {
    0: "vrnt_chrom", 1: "vrnt_start", 2: "vrnt_end", 3: "vrnt_id",
    4: "cpg_isl_chrom", 5: "cpg_isl_start", 6: "cpg_isl_end",
    8: "overlap",
}

cpg_isl_bed = BedTool(cpg_isl_bed_path).sort()

# SVs that overlap at least one island
cpg_overlaps = vrnts_in_windows_bed_sorted.intersect(cpg_isl_bed, wo=True)
sv_intersect_cpg_isl_str = StringIO(str(cpg_overlaps))
sv_intersect_cpg_isl_df = pd.read_csv(sv_intersect_cpg_isl_str, sep="\t", header=None)
sv_intersect_cpg_isl_df = sv_intersect_cpg_isl_df.rename(columns=bed_intersect_cols_cpg).astype({"vrnt_id": str})
sv_intersect_cpg_isl = sv_intersect_cpg_isl_df.vrnt_id.unique()

# binary feature: 1 if the variant intersects any CpG island, otherwise 0
vrnts_cpg_island_df = all_vrnts_in_windows_df.copy()
vrnts_cpg_island_df["intersect_cpg_isl"] = vrnts_cpg_island_df.vrnt_id.isin(sv_intersect_cpg_isl).astype(int)

### Chromatin States 

In [9]:
# The 15 ChromHMM state mnemonics, matched against the state-name column of the
# segmentation BED file.
chromhmm_15states = [
    '1_TssA', '2_TssAFlnk', '3_TxFlnk', '4_Tx', '5_TxWk',
    '6_EnhG', '7_Enh', '8_ZNF/Rpts', '9_Het', '10_TssBiv',
    '11_BivFlnk', '12_EnhBiv', '13_ReprPC', '14_ReprPCWk', '15_Quies',
]
# NOTE(review): vrnt_bed_columns is not referenced in the visible notebook
vrnt_bed_columns = {0: "chrom", 1: "start", 2: "end", 3: "vrnt_id"}
bed_intersect_cols_chrom_hmm = {
    0: "vrnt_chrom", 1: "vrnt_start", 2: "vrnt_end", 3: "vrnt_id",
    4: "state_chrom", 5: "state_start", 6: "state_end", 7: "state_name",
    8: "overlap",
}

# PBMC ChromHMM segmentation
pbmcs_chromhmm_15marks_bed = BedTool(pbmcs_chromhmm_15marks_path)

# every (variant, state segment) overlap
chromhmm_overlaps = vrnts_in_windows_bed_sorted.intersect(pbmcs_chromhmm_15marks_bed, wo=True)
sv_intersect_str = StringIO(str(chromhmm_overlaps))
sv_intersect_df = pd.read_csv(sv_intersect_str, sep="\t", header=None).astype({3: str}).rename(columns=bed_intersect_cols_chrom_hmm)

# one binary column per state: 1 if the variant overlaps any segment in that state
vrnts_chrom_hmm_df = all_vrnts_in_windows_df.copy()
for state in chromhmm_15states:
    in_state = sv_intersect_df.state_name == state
    vrnt_ids_with_state = sv_intersect_df.loc[in_state, "vrnt_id"]
    vrnts_chrom_hmm_df[state] = vrnts_chrom_hmm_df.vrnt_id.isin(vrnt_ids_with_state.unique()).astype(int)

### Combine features 

In [10]:
# Feature tables to combine. Each is a copy of the variant table plus its own
# annotation columns, so they are joined on the variant table's columns.
dfs_to_merge = [
    vrnt_tad_features_df,
    vrnt_compartment_features_df,
    vrnt_ctcf_ccre_features_df,
    vrnts_cpg_island_df,
    vrnts_chrom_hmm_df,
]

# successive inner joins, left to right
merge_keys = all_vrnts_in_windows_df.columns.tolist()
vrnt_features_merged_df = dfs_to_merge[0]
for features_df in dfs_to_merge[1:]:
    vrnt_features_merged_df = pd.merge(vrnt_features_merged_df, features_df, on=merge_keys, how='inner')

# ChromHMM column names: drop "/" and the leading state number,
# e.g. "8_ZNF/Rpts" becomes "ZNFRpts"
rename_columns = {}
for col in chromhmm_15states:
    rename_columns[col] = col.replace("/", "").split("_")[1]
vrnt_features_merged_df = vrnt_features_merged_df.rename(columns=rename_columns)

In [11]:
# Structural variant information; "plinkID" is read as a string and renamed so
# it can serve as the join key.
sv_info_df = pd.read_csv(sv_info_path, sep="\t", dtype={"plinkID": str})
sv_info_df = sv_info_df.rename(columns={"plinkID": "vrnt_id"})

# left join: every annotated variant is kept, with its SV information attached
vrnt_features_merged_info_df = vrnt_features_merged_df.merge(sv_info_df, on="vrnt_id", how="left")

In [12]:
# Prints True if any cell of the merged table is missing (for example a variant
# with no match in the SV information table), otherwise False.
print(vrnt_features_merged_info_df.isna().to_numpy().any())

False


In [13]:
# Directory for the per-variant feature table (created if missing)
features_dir_path = out_dir_path / "features"
features_dir_path.mkdir(exist_ok=True, parents=True)

# write the merged features plus SV information as CSV, without the index
vrnt_features_out = features_dir_path / "vrnt_features_reg_annot.csv"
vrnt_features_merged_info_df.to_csv(vrnt_features_out, index=False)

### Enrichment analysis 

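* Standardise every feature (including the binary ones) to zero mean and unit variance, then fit one logistic regression per feature, separately for deletions and duplications, with standardised SV length as a covariate 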

In [14]:
# Feature columns tested for enrichment, grouped by category.
genome_3d_features = [
    "gm12878_shared_intersect_tad_boundary",
    "A_overlap",
    "B_overlap",
    "CTCFonlyCTCFbound_all",
    "CTCFonlyCTCFbound_CD14_monocyte",
    "HighCTCF_B_cell",
    "HighCTCF_Neutrophil",
]
# CpG islands followed by the renamed ChromHMM state columns
epigenetic_features = ["intersect_cpg_isl", *rename_columns.values()]

features_dict = {
    "3d_genome": genome_3d_features,
    "epigenetics": epigenetic_features,
}

In [15]:
# Single-feature logistic regressions of misexpression status on each
# standardised feature, adjusted for standardised SV length, fitted separately
# for deletions and duplications.
# Each result row is [category, feature, sv_type, log_odds, lower, upper, pval].
logistic_regr_results = {}
logr_count = 0
for sv_type in ["DEL", "DUP"]:
    input_df = vrnt_features_merged_info_df[vrnt_features_merged_info_df.SVTYPE == sv_type].copy()
    # z-score SV length within this SV type
    input_df["svlen_norm"] = (input_df["SVLEN"] - input_df["SVLEN"].mean())/input_df["SVLEN"].std()
    for key in features_dict.keys():
        features = features_dict[key]
        for feature in features:
            # z-score the feature within this SV type; the standardised column
            # name is also the label of its coefficient in the fitted model
            term = f"{feature}_norm"
            input_df[term] = (input_df[feature] - input_df[feature].mean())/input_df[feature].std()
            y, X = dmatrices(f'misexp_uniq ~ {term} + svlen_norm', input_df, return_type = 'dataframe')
            logit_fit = sm.Logit(endog=y, exog=X).fit(maxiter=1000)
            # Look the coefficient up by label rather than by integer position:
            # integer keys on a string-labelled Series rely on a deprecated
            # pandas fallback and on the design-matrix column order.
            log_odds, pval = logit_fit.params[term], logit_fit.pvalues[term]
            # normal approximation confidence intervals (columns 0 = lower, 1 = upper),
            # computed once instead of once per bound
            conf_int_df = logit_fit.conf_int(alpha=0.05)
            lower_conf = conf_int_df.loc[term, 0]
            upper_conf = conf_int_df.loc[term, 1]
            logistic_regr_results[logr_count] = [key, feature, sv_type, log_odds, lower_conf, upper_conf, pval]
            logr_count += 1

Optimization terminated successfully.
         Current function value: 0.030222
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030460
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030443
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030402
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030368
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030115
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030180
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030096
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030312
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030462
  

In [16]:
# Collect the per-test results into a table (one row per feature and SV type)
columns_logr_enrich = ["category", "feature", "sv_type", "log_odds", "lower", "upper", "pval"]
logistic_regr_results_df = pd.DataFrame.from_dict(logistic_regr_results, orient="index", columns=columns_logr_enrich)

# Multiple testing correction across all tests at alpha = 0.05:
# Benjamini-Hochberg FDR first, then Bonferroni. Each adds a reject flag and
# the corrected p-values.
pval_list = logistic_regr_results_df.pval.to_numpy()
correction_outputs = (
    ("fdr_bh", "pass_fdr_bh", "pvals_corrected_fdr_bh"),
    ("bonferroni", "pass_bonf", "pvals_corrected_bonf"),
)
for method, pass_col, corrected_col in correction_outputs:
    reject, pvals_corrected = multitest.multipletests(pval_list, alpha=0.05, method=method)[:2]
    logistic_regr_results_df[pass_col] = reject
    logistic_regr_results_df[corrected_col] = pvals_corrected

### Significant results 

In [17]:
# Bonferroni-significant associations, largest log-odds first
logistic_regr_results_pass_bonf_df = (
    logistic_regr_results_df
    .loc[logistic_regr_results_df.pass_bonf]
    .sort_values(by="log_odds", ascending=False)
)
# one line per association: feature, SV type, log-odds
for feature, sv_type, log_odds in zip(
    logistic_regr_results_pass_bonf_df["feature"].tolist(),
    logistic_regr_results_pass_bonf_df["sv_type"].tolist(),
    logistic_regr_results_pass_bonf_df["log_odds"].tolist(),
):
    print(feature, sv_type, log_odds)

Enh DUP 1.2620667347930006
TxWk DUP 1.2483521290660107
intersect_cpg_isl DUP 0.8790587581418782
TssA DUP 0.8279293576899485
TssAFlnk DUP 0.813158618928767
ReprPC DUP 0.716611544333929
Tx DUP 0.6961995041587603
EnhG DUP 0.5916922200004098
Tx DEL 0.3626878232536936
ReprPCWk DEL 0.30791525341755954
HighCTCF_B_cell DEL 0.2971438567503857
TxWk DEL 0.29588000202608405
intersect_cpg_isl DEL 0.27403255506016183
Enh DEL 0.25040211979759047
HighCTCF_Neutrophil DEL 0.23921334073247189
gm12878_shared_intersect_tad_boundary DEL 0.19924272289615444
EnhBiv DEL 0.19562396283062797


### Write results to file 

In [18]:
# Display names for categories and features, used when writing the results.
category_names = {
    "3d_genome": "3D Genome",
    "epigenetics": "Regulatory",
}

feature_names = {
    # TAD boundaries and CTCF
    "gm12878_shared_intersect_tad_boundary": "TAD boundaries (shared)",
    "CTCFonlyCTCFbound_all": "CTCF (all)",
    "CTCFonlyCTCFbound_CD14_monocyte": "CTCF (CD14-monocytes)",
    "HighCTCF_B_cell": "CTCF (B-cells)",
    "HighCTCF_Neutrophil": "CTCF (Neutrophils)",
    # CpG islands
    "intersect_cpg_isl": "CpG islands",
    # ChromHMM states
    "TssA": "Active TSS",
    "TssAFlnk": "Flanking active TSS",
    "TxFlnk": "Transcr. at gene 5' and 3'",
    "Tx": "Strong transcription",
    "TxWk": "Weak transcription",
    "EnhG": "Genic enhancers",
    "Enh": "Enhancers",
    "ZNFRpts": "ZNF genes & repeats",
    "Het": "Heterochromatin",
    "TssBiv": "Bivalent/poised TSS",
    "BivFlnk": "Flanking bivalent TSS/enhancer",
    "EnhBiv": "Bivalent enhancer",
    "ReprPC": "Repressed polyComb",
    "ReprPCWk": "Weak repressed polycomb",
    "Quies": "Quiescent",
    # A/B compartments
    "A_overlap": "A compartment",
    "B_overlap": "B compartment",
}

# values with no entry in the mapping are left unchanged by `replace`
for source_col, display_col, display_names in (
    ("feature", "feature_name", feature_names),
    ("category", "category_name", category_names),
):
    logistic_regr_results_df[display_col] = logistic_regr_results_df[source_col].replace(display_names)

In [19]:
# Keep the log-odds only for tests passing the Bonferroni cutoff; all other
# rows get NaN in `log_odds_adj`.
logistic_regr_results_df["log_odds_adj"] = logistic_regr_results_df.log_odds.where(logistic_regr_results_df.pass_bonf)

# results directory (created if missing)
results_dir_path = out_dir_path / "results"
results_dir_path.mkdir(exist_ok=True, parents=True)

# write every test (deletions and duplications) as a tab-separated file
all_results_path = results_dir_path / "logr_func_enrich_results_all.tsv"
logistic_regr_results_df.to_csv(all_results_path, sep="\t", index=False)

### Non-length adjusted enrichments 

In [20]:
# Same single-feature logistic regressions as the length-adjusted analysis, but
# without the SV length covariate.
# Each result row is [category, feature, sv_type, log_odds, lower, upper, pval].
logistic_regr_results_non_len_adj = {}
logr_count = 0
for sv_type in ["DEL", "DUP"]:
    input_df = vrnt_features_merged_info_df[vrnt_features_merged_info_df.SVTYPE == sv_type].copy()
    for key in features_dict.keys():
        features = features_dict[key]
        for feature in features:
            # z-score the feature within this SV type; the standardised column
            # name is also the label of its coefficient in the fitted model
            term = f"{feature}_norm"
            input_df[term] = (input_df[feature] - input_df[feature].mean())/input_df[feature].std()
            y, X = dmatrices(f'misexp_uniq ~ {term}', input_df, return_type = 'dataframe')
            logit_fit = sm.Logit(endog=y, exog=X).fit(maxiter=1000)
            # Look the coefficient up by label rather than by integer position:
            # integer keys on a string-labelled Series rely on a deprecated
            # pandas fallback and on the design-matrix column order.
            log_odds, pval = logit_fit.params[term], logit_fit.pvalues[term]
            # normal approximation confidence intervals (columns 0 = lower, 1 = upper),
            # computed once instead of once per bound
            conf_int_df = logit_fit.conf_int(alpha=0.05)
            lower_conf = conf_int_df.loc[term, 0]
            upper_conf = conf_int_df.loc[term, 1]
            logistic_regr_results_non_len_adj[logr_count] = [key, feature, sv_type, log_odds, lower_conf, upper_conf, pval]
            logr_count += 1

logistic_regr_results_non_len_adj_df = pd.DataFrame.from_dict(logistic_regr_results_non_len_adj, orient="index", columns=columns_logr_enrich)

# multiple testing correction across all tests at alpha = 0.05
pval_list = logistic_regr_results_non_len_adj_df.pval.to_numpy()
# BH FDR
reject, pvals_corrected, _, _ = multitest.multipletests(pval_list, alpha=0.05, method="fdr_bh")
logistic_regr_results_non_len_adj_df["pass_fdr_bh"] = reject
logistic_regr_results_non_len_adj_df["pvals_corrected_fdr_bh"] = pvals_corrected
# Bonferroni correction
reject, pvals_corrected, _, _ = multitest.multipletests(pval_list, alpha=0.05, method="bonferroni")
logistic_regr_results_non_len_adj_df["pass_bonf"] = reject
logistic_regr_results_non_len_adj_df["pvals_corrected_bonf"] = pvals_corrected

Optimization terminated successfully.
         Current function value: 0.030299
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030644
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030633
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030489
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030485
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030172
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030239
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030176
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030417
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030625
  



Optimization terminated successfully.
         Current function value: 0.030490
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030164
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030430
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.030296
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.030624
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.050388
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.050647
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.051551
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.050535
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.048839


In [21]:
# Bonferroni-significant associations without length adjustment, largest log-odds first
logistic_regr_results_non_len_adj_pass_bonf_df = (
    logistic_regr_results_non_len_adj_df
    .loc[logistic_regr_results_non_len_adj_df.pass_bonf]
    .sort_values(by="log_odds", ascending=False)
)
# one line per association: feature, SV type, log-odds
for feature, sv_type, log_odds in zip(
    logistic_regr_results_non_len_adj_pass_bonf_df["feature"].tolist(),
    logistic_regr_results_non_len_adj_pass_bonf_df["sv_type"].tolist(),
    logistic_regr_results_non_len_adj_pass_bonf_df["log_odds"].tolist(),
):
    print(feature, sv_type, log_odds)

Enh DUP 1.3483934099463208
TxWk DUP 1.3371875314903858
intersect_cpg_isl DUP 0.9900244119091234
TssA DUP 0.9199302894668124
TssAFlnk DUP 0.914645477756096
HighCTCF_B_cell DUP 0.9067217923366574
ReprPC DUP 0.818066833360135
Tx DUP 0.8038045067062993
HighCTCF_Neutrophil DUP 0.7961936907052427
CTCFonlyCTCFbound_CD14_monocyte DUP 0.719790682335864
EnhG DUP 0.6773341661297582
EnhBiv DUP 0.6090085756204552
gm12878_shared_intersect_tad_boundary DUP 0.521749321061759
TssBiv DUP 0.470566765240087
Tx DEL 0.3771288931606897
HighCTCF_B_cell DEL 0.329535373226274
ReprPCWk DEL 0.32921977095901356
TxWk DEL 0.30888578279639145
intersect_cpg_isl DEL 0.30070858471699463
Enh DEL 0.27679842065477195
TxFlnk DUP 0.2751730516402117
HighCTCF_Neutrophil DEL 0.27019044147293353
gm12878_shared_intersect_tad_boundary DEL 0.2269056914946834
EnhBiv DEL 0.21838755116601496
TssA DEL 0.20067859853742478


In [22]:
# Attach display names; values with no entry in the mapping are left unchanged.
for source_col, display_col, display_names in (
    ("feature", "feature_name", feature_names),
    ("category", "category_name", category_names),
):
    logistic_regr_results_non_len_adj_df[display_col] = logistic_regr_results_non_len_adj_df[source_col].replace(display_names)

# Keep the log-odds only for tests passing the Bonferroni cutoff; all other
# rows get NaN in `log_odds_adj`.
logistic_regr_results_non_len_adj_df["log_odds_adj"] = logistic_regr_results_non_len_adj_df.log_odds.where(
    logistic_regr_results_non_len_adj_df.pass_bonf
)

# write every test (deletions and duplications) as a tab-separated file
all_results_no_len_adj_path = results_dir_path / "logr_func_enrich_results_all_no_len_adj.tsv"
logistic_regr_results_non_len_adj_df.to_csv(all_results_no_len_adj_path, sep="\t", index=False)