### Structural variant scores, constraint and conservation 

* CADD-SV 
* PhyloP 
* PhastCons conserved elements 
* HARs
* gnomAD constraint 
* gwRVIS
* Excluded JARVIS

---
* Included MSC SV types - severely underpowered to detect significant differences 
* Excluded non-coding vs coding - currently annotating exons from protein-coding genes (so could be non-coding)

In [2]:
import pandas as pd 
import numpy as np
from collections import Counter
import sys 
from patsy import dmatrices
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from statsmodels.discrete import discrete_model
from pybedtools import BedTool
from io import StringIO
from functools import reduce
from statsmodels.stats import multitest

In [3]:
# inputs: every project-relative path below is built from the working directory
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/"
wkdir_path = Path(wkdir)

# control and misexpressed SVs
all_vrnts_path = wkdir_path / "5_misexp_vrnts/test_cntrl_sets/misexp_cntrl_final/vrnt_id_in_window_cntrl_misexp_genes.txt"
all_vrnts_bed_path = wkdir_path / "5_misexp_vrnts/test_cntrl_sets/misexp_cntrl_final/vrnt_id_in_windows_misexp_genes.bed"
misexp_vrnts_path = wkdir_path / "5_misexp_vrnts/test_cntrl_sets/misexp_cntrl_final/vrnt_id_misexp_tpm_zscore_median.txt"

# SV site information table (absolute path, outside the working directory)
sv_info_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/info_table/final_sites_critical_info_allele.txt"

# score paths
cadd_sv_score_info_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression/5_functional_analysis/sv/data/cadd_sv/interval_svs/intrvl_all_svs/bed_out/merged/intrvl_svs_no_inv_121042_cadd_sv_info.tsv"
phylop_dir = wkdir_path / "5_misexp_vrnts/sv_scores/phylop"
gwrvis_dir = wkdir_path / "5_misexp_vrnts/sv_scores/gwrvis/v2"
# JARVIS is excluded from this analysis:
#jarvis_dir = wkdir_path.joinpath("5_misexp_vrnts/sv_scores/jarvis")
gnomad_constraint_path = wkdir_path / "reference/gnomad/constraint_z_genome_1kb.qc.download.txt.clean"
gerp_elements_path = wkdir_path / "reference/conservation/gerp/gerpElements_hg38_multiz120Mammals.bed"
phastcons_elements_path = wkdir_path / "reference/conservation/phastcons/phastConsElements_hg38_multiz120Mammals.bed"
hars_bed_path = wkdir_path / "reference/conservation/hars/hg38_adc_hars.clean.bed"
fantom_5enh_path = wkdir_path / "reference/conservation/fantom5_enh/hg38_conserved10mer_mammal_FANTOMenhancer.clean.bed"

# output directory (created on first run, including missing parents)
out_dir = wkdir_path / "5_misexp_vrnts/sv_scores/results/misexp_sv_zscore_final"
out_dir.mkdir(parents=True, exist_ok=True)

In [4]:
# constants 
CHROMOSOMES = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6',
               'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12',
               'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18',
               'chr19', 'chr20', 'chr21', 'chr22']

In [5]:
# load bed file with variants in windows; the sorted copy is what the intersect cells use
vrnts_in_windows_bed = BedTool(all_vrnts_bed_path)
vrnts_in_windows_bed_sorted = vrnts_in_windows_bed.sort()
# all variants in windows df
# vrnt_id is read as str explicitly: the misexpression IDs below and the score/intersect
# tables in later cells are all cast to str, so leaving this column to dtype inference
# would silently break the isin() match and the merges for numeric-looking IDs
all_vrnts_in_windows_df = pd.read_csv(all_vrnts_path, sep="\t", header=None, dtype={0: str}).rename(columns={0:"vrnt_id"})
print(f"Total number of variants: {len(all_vrnts_in_windows_df.vrnt_id.unique())}")

# load misexpression-associated variants 
misexp_vrnts_ids = pd.read_csv(misexp_vrnts_path, sep="\t", header=None)[0].astype(str).unique()
print(f"Number of misexpression-associated SVs: {len(misexp_vrnts_ids)}")
# binary outcome: 1 if the variant is in the misexpression-associated set, otherwise 0
all_vrnts_in_windows_df["misexp_uniq"] = np.where(all_vrnts_in_windows_df.vrnt_id.isin(misexp_vrnts_ids), 1, 0)

Total number of variants: 20262
Number of misexpression-associated SVs: 105


### CADD-SV 

In [6]:
# CADD-SV score table: plinkID is parsed as a string and renamed to the shared vrnt_id key
vrnts_cadd_sv_scores_df = (
    pd.read_csv(cadd_sv_score_info_path, sep="\t", dtype={"plinkID": str})
    .rename(columns={"plinkID": "vrnt_id"})
)

In [7]:
# no INV or MEI annotations
# left join keeps every variant; variants without a CADD-SV score get NaN
cadd_sv_score_cols = ["vrnt_id", "Raw-Score-combined"]
vrnts_cadd_sv_df = (
    all_vrnts_in_windows_df
    .merge(vrnts_cadd_sv_scores_df[cadd_sv_score_cols], how="left", on="vrnt_id")
    .rename(columns={"Raw-Score-combined": "CADD_sv_raw_score"})
)

### Conservation (PhyloP)

* Max approach 
* Phastcons score seems to max at 1.0, perhaps just PhyloP - larger range of values 

In [8]:
### phylop - missing some values
phylop_dir_path = Path(phylop_dir)
# read one PhyloP summary table per chromosome
phylop_score_metrics_list = [
    pd.read_csv(phylop_dir_path.joinpath(f"{chrom}_sv_in_windows_phylop_mean_median_max.tsv"), sep="\t")
    for chrom in CHROMOSOMES[:22]
]
vrnt_phylop_scored_full_df = pd.concat(phylop_score_metrics_list)
# only the per-variant maximum is carried forward
vrnt_phylop_scored_df = (
    vrnt_phylop_scored_full_df[["vrnt_id", "max"]]
    .rename(columns={"max": "phylop_max"})
)
# left join keeps every variant; unscored variants get NaN
vrnt_phylop_df = all_vrnts_in_windows_df.merge(vrnt_phylop_scored_df, on="vrnt_id", how="left")

### GnomAD constraint score 

* Max constraint in window approach 
* Other potential options: 
    * Weighted sum approach: overlap x constraint  

In [9]:
# load GnomAD constraint scores 
gnomad_constraint = BedTool(gnomad_constraint_path)
# intersect with bed file 
# column names for the intersect(wo=True) output: 4 variant fields, the constraint
# window fields and a final overlap column
gnomad_bed_intersect_cols = {0:"vrnt_chrom", 1:"vrnt_start", 2:"vrnt_end", 3:"vrnt_id", 4:"window_chrom", 
                             5:"window_start", 6:"window_end", 7:"window_id", 8:"pos", 9:"exp", 10:"obs", 11:"oe", 12:"zscore", 13:"overlap"}
sv_intersect_gnomad_constraint_str = StringIO(str(vrnts_in_windows_bed_sorted.intersect(gnomad_constraint, wo=True)))
sv_intersect_gnomad_constraint_df = (
    pd.read_csv(sv_intersect_gnomad_constraint_str, sep="\t", header=None)
    .rename(columns=gnomad_bed_intersect_cols)
    .astype({"vrnt_id": str})
)
# max constraint score across all windows a variant overlaps
sv_intersect_max_constraint_df = (
    sv_intersect_gnomad_constraint_df
    .groupby("vrnt_id", as_index=False)["zscore"].max()
    .rename(columns={"zscore": "gnomad_constraint_max_zscore"})
)
# around n=5041 are NaNs, observe enrichment of NaNs in misexpression-associated SVs 
# left join (not outer): only variants in all_vrnts_in_windows_df are kept, in their
# original order. An outer join would also add variants present only in the BED file,
# with a NaN misexp_uniq and a 0 in the "_nonan" binary column below.
vrnt_gnomad_constraint_df = pd.merge(all_vrnts_in_windows_df, 
                                     sv_intersect_max_constraint_df, 
                                     how="left", 
                                     on="vrnt_id")
## add binary term for SV > z-score 4
# "_nonan" codes a missing score as 0; the second column keeps missing scores as NaN
vrnt_gnomad_constraint_df["gnomad_max_constraint_grtr_z4_nonan"] = np.where(vrnt_gnomad_constraint_df.gnomad_constraint_max_zscore > 4, 1, 0)
vrnt_gnomad_constraint_df["gnomad_max_constraint_grtr_z4"] = np.where(vrnt_gnomad_constraint_df.gnomad_constraint_max_zscore.isna(), 
                                                                      np.nan, 
                                                                      vrnt_gnomad_constraint_df.gnomad_max_constraint_grtr_z4_nonan
                                                                     )

### Gerp Conserved elements 

In [10]:
# GERP conserved elements
gerp_elements = BedTool(gerp_elements_path)
# column names for the intersect(wo=True) output: 4 variant fields, 4 element fields, overlap
gerp_bed_intersect_cols = dict(enumerate([
    "vrnt_chrom", "vrnt_start", "vrnt_end", "vrnt_id",
    "element_chrom", "element_start", "element_end", "element_id",
    "overlap",
]))
sv_intersect_gerp_elements_str = StringIO(str(vrnts_in_windows_bed_sorted.intersect(gerp_elements, wo=True)))
sv_intersect_gerp_elements_df = (
    pd.read_csv(sv_intersect_gerp_elements_str, sep="\t", header=None)
    .rename(columns=gerp_bed_intersect_cols)
    .astype({"vrnt_id": str})
)
# IDs of variants with at least one GERP element intersection
sv_intersect_gerp_elements = sv_intersect_gerp_elements_df.vrnt_id.unique()
# number of intersect rows per variant
vrnt_gerp_element_count_df = (
    sv_intersect_gerp_elements_df
    .groupby("vrnt_id", as_index=False)["element_id"].count()
    .rename(columns={"element_id": "gerp_element_count"})
)
# variants with no intersection get a count of 0
vrnt_gerp_elements_df = (
    all_vrnts_in_windows_df
    .merge(vrnt_gerp_element_count_df, on="vrnt_id", how="left")
    .fillna(0)
)
# binary overlap indicator
has_gerp_element = vrnt_gerp_elements_df.vrnt_id.isin(sv_intersect_gerp_elements)
vrnt_gerp_elements_df["gerp_element_overlap"] = np.where(has_gerp_element, 1, 0)

### PhastCons

In [11]:
# PhastCons conserved elements
phastcons_elements = BedTool(phastcons_elements_path)
# column names for the intersect(wo=True) output: 4 variant fields, 4 element fields, overlap
phastcons_bed_intersect_cols = dict(enumerate([
    "vrnt_chrom", "vrnt_start", "vrnt_end", "vrnt_id",
    "element_chrom", "element_start", "element_end", "element_id",
    "overlap",
]))
sv_intersect_phastcons_elements_str = StringIO(str(vrnts_in_windows_bed_sorted.intersect(phastcons_elements, wo=True)))
sv_intersect_phastcons_elements_df = (
    pd.read_csv(sv_intersect_phastcons_elements_str, sep="\t", header=None)
    .rename(columns=phastcons_bed_intersect_cols)
    .astype({"vrnt_id": str})
)
# IDs of variants with at least one PhastCons element intersection
sv_intersect_phastcons_elements = sv_intersect_phastcons_elements_df.vrnt_id.unique()
# phastcons element overlap and count
vrnt_phastcons_element_count_df = (
    sv_intersect_phastcons_elements_df
    .groupby("vrnt_id", as_index=False)["element_id"].count()
    .rename(columns={"element_id": "phastcons_element_count"})
)
# variants with no intersection get a count of 0
vrnt_phastcons_elements_df = (
    all_vrnts_in_windows_df
    .merge(vrnt_phastcons_element_count_df, on="vrnt_id", how="left")
    .fillna(0)
)
# binary overlap indicator
has_phastcons_element = vrnt_phastcons_elements_df.vrnt_id.isin(sv_intersect_phastcons_elements)
vrnt_phastcons_elements_df["phastcons_element_overlap"] = np.where(has_phastcons_element, 1, 0)

### gwRVIS 

In [12]:
gwrvis_dir_path = Path(gwrvis_dir)
# read one gwRVIS summary table per chromosome
gwrvis_score_metrics_list = [
    pd.read_csv(gwrvis_dir_path.joinpath(f"{chrom}_sv_in_windows_gwrvis.tsv"), sep="\t")
    for chrom in CHROMOSOMES[:22]
]
vrnt_gwrvis_scored_full_df = pd.concat(gwrvis_score_metrics_list)
# only the per-variant minimum is carried forward
vrnt_gwrvis_scored_df = (
    vrnt_gwrvis_scored_full_df[["vrnt_id", "min"]]
    .rename(columns={"min": "gwrvis_min"})
)
# left join keeps every variant; unscored variants get NaN
vrnt_gwrvis_df = all_vrnts_in_windows_df.merge(vrnt_gwrvis_scored_df, on="vrnt_id", how="left")

### Conserved Enhancer elements 

* Too few elements overlapping across entire test set to use 

In [13]:
# FANTOM5 conserved enhancer elements
fantom5_conserved_elements = BedTool(fantom_5enh_path)
# column names for the intersect(wo=True) output: 4 variant fields, 4 element fields, overlap
fantom5_conserved_bed_intersect_cols = dict(enumerate([
    "vrnt_chrom", "vrnt_start", "vrnt_end", "vrnt_id",
    "element_chrom", "element_start", "element_end", "element_id",
    "overlap",
]))
sv_intersect_f5_elements_str = StringIO(str(vrnts_in_windows_bed_sorted.intersect(fantom5_conserved_elements, wo=True)))
sv_intersect_f5_elements_df = (
    pd.read_csv(sv_intersect_f5_elements_str, sep="\t", header=None)
    .rename(columns=fantom5_conserved_bed_intersect_cols)
    .astype({"vrnt_id": str})
)
# IDs of variants with at least one conserved enhancer intersection
sv_intersect_f5_elements = sv_intersect_f5_elements_df.vrnt_id.unique()
# element overlap: binary indicator on a fresh copy of the variant table
vrnt_f5_elements_df = all_vrnts_in_windows_df.copy()
has_f5_element = vrnt_f5_elements_df.vrnt_id.isin(sv_intersect_f5_elements)
vrnt_f5_elements_df["f5_consv_element_overlap"] = np.where(has_f5_element, 1, 0)

### Human accelerated regions (HARs)

In [14]:
# load HARs (sorted)
hars_bed = BedTool(hars_bed_path).sort()
# column names for the intersect(wo=True) output: 4 variant fields, 4 HAR fields, overlap
bed_intersect_cols_hars = dict(enumerate([
    "vrnt_chrom", "vrnt_start", "vrnt_end", "vrnt_id",
    "har_chrom", "har_start", "har_end", "har_name",
    "overlap",
]))
# identify SVs that overlap HARs
sv_intersect_hars_str = StringIO(str(vrnts_in_windows_bed_sorted.intersect(hars_bed, wo=True)))
sv_intersect_hars_df = (
    pd.read_csv(sv_intersect_hars_str, sep="\t", header=None)
    .rename(columns=bed_intersect_cols_hars)
    .astype({"vrnt_id": str})
)
sv_intersect_hars = sv_intersect_hars_df.vrnt_id.unique()
# annotate variants intersecting at least one HAR
vrnt_har_intersect_df = all_vrnts_in_windows_df.copy()
overlaps_har = vrnt_har_intersect_df.vrnt_id.isin(sv_intersect_hars)
vrnt_har_intersect_df["intersect_har"] = np.where(overlaps_har, 1, 0)

### Combine Features 

In [15]:
# merge different features
dfs_to_merge = [
    vrnt_gnomad_constraint_df,
    vrnt_phylop_df,
    vrnts_cadd_sv_df,
    vrnt_gerp_elements_df,
    vrnt_har_intersect_df,
    vrnt_phastcons_elements_df,
    vrnt_f5_elements_df,
    vrnt_gwrvis_df,
]
# every per-feature frame carries the base variant columns, so those are the join keys
feature_merge_keys = all_vrnts_in_windows_df.columns.tolist()
# fold the frames together left to right with inner joins
vrnt_features_merged_df = dfs_to_merge[0]
for feature_df in dfs_to_merge[1:]:
    vrnt_features_merged_df = pd.merge(vrnt_features_merged_df, feature_df,
                                       on=feature_merge_keys, how='inner')

In [16]:
# structural variant information: plinkID is parsed as a string and renamed to vrnt_id
sv_info_df = (
    pd.read_csv(sv_info_path, sep="\t", dtype={"plinkID": str})
    .rename(columns={"plinkID": "vrnt_id"})
)

# left join keeps every variant in the merged feature table
vrnt_features_merged_info_df = vrnt_features_merged_df.merge(sv_info_df, on="vrnt_id", how="left")

In [17]:
# add coding/non-coding information 
#sv_test_control_coding_info_path = wkdir_path.joinpath("5_misexp_vrnts/test_cntrl_sets/non_coding_svs/vrnt_id_in_windows_misexp_genes_coding_no_exon_1kb.tsv")
#sv_test_control_coding_info_df = pd.read_csv(sv_test_control_coding_info_path, sep="\t")
#vrnt_features_merged_info_coding_df = pd.merge(vrnt_features_merged_info_df, sv_test_control_coding_info_df[["vrnt_id", "coding"]], on=["vrnt_id"], how="left")

In [18]:
# write the merged per-variant feature table to CSV (no index column)
vrnt_features_out = out_dir / "vrnt_features_scores.csv"
vrnt_features_merged_info_df.to_csv(vrnt_features_out, index=False)

### Logistic regression

In [33]:
# features tested one at a time; each gets its own model per SV type
feature_list = ["gnomad_constraint_max_zscore", 
                "gnomad_max_constraint_grtr_z4",
                "phylop_max", 
                "CADD_sv_raw_score", 
                "intersect_har",
                "phastcons_element_count", 
                "gerp_element_count", 
                "gwrvis_min",
               ] 

region = "all"
# result rows keyed by a running counter: [feature, sv_type, region, log_odds, lower, upper, pval]
sv_types_logistic_regr_results = {}
logr_count = 0          
# position of the feature term in the fitted parameters; the formula below puts the
# intercept first, then the feature, then SV length
FEATURE_TERM_POS = 1
for sv_type in ["DEL", "DUP"]:
    for feature in feature_list:
        print(sv_type, feature)
        # restrict to this SV type and drop missing feature values before z-scoring,
        # so SV length and the feature are standardised on the rows actually modelled
        input_df = vrnt_features_merged_info_df[(vrnt_features_merged_info_df.SVTYPE == sv_type) & 
                                                (vrnt_features_merged_info_df[feature].notna())].copy()
        input_df["svlen_norm"] = (input_df["SVLEN"] - input_df["SVLEN"].mean())/input_df["SVLEN"].std()
        input_df[f"{feature}_norm"] = (input_df[feature] - input_df[feature].mean())/input_df[feature].std()
        y, X = dmatrices(f'misexp_uniq ~ {feature}_norm + svlen_norm', input_df, return_type = 'dataframe')
        logit_fit = discrete_model.Logit(endog=y, exog=X).fit()
        # use .iloc explicitly: params/pvalues are label-indexed, and an integer key in []
        # only works through the deprecated positional fallback
        log_odds = logit_fit.params.iloc[FEATURE_TERM_POS]
        pval = logit_fit.pvalues.iloc[FEATURE_TERM_POS]
        # normal approximation confidence intervals (computed once; column 0 = lower, 1 = upper)
        conf_int_df = logit_fit.conf_int(alpha=0.05)
        lower_conf = conf_int_df.iloc[FEATURE_TERM_POS, 0]
        upper_conf = conf_int_df.iloc[FEATURE_TERM_POS, 1]
        sv_types_logistic_regr_results[logr_count] = [feature, sv_type, region, log_odds, lower_conf, upper_conf, pval]
        logr_count += 1

DEL gnomad_constraint_max_zscore
Optimization terminated successfully.
         Current function value: 0.030201
         Iterations 10
DEL gnomad_max_constraint_grtr_z4
Optimization terminated successfully.
         Current function value: 0.030150
         Iterations 9
DEL phylop_max
Optimization terminated successfully.
         Current function value: 0.029485
         Iterations 10
DEL CADD_sv_raw_score
Optimization terminated successfully.
         Current function value: 0.030165
         Iterations 10
DEL intersect_har
Optimization terminated successfully.
         Current function value: 0.030431
         Iterations 9
DEL phastcons_element_count
Optimization terminated successfully.
         Current function value: 0.030422
         Iterations 9
DEL gerp_element_count
Optimization terminated successfully.
         Current function value: 0.030400
         Iterations 9
DEL gwrvis_min
Optimization terminated successfully.
         Current function value: 0.030512
         Iterat

In [34]:
# column names matching the order of each result row
columns_logr_enrich = [
    "feature", "sv_type", "region",
    "log_odds", "lower", "upper", "pval",
]
# one row per fitted model, indexed by the running model counter
sv_types_logistic_regr_results_df = pd.DataFrame(
    list(sv_types_logistic_regr_results.values()),
    index=list(sv_types_logistic_regr_results.keys()),
    columns=columns_logr_enrich,
)

### Multiple Testing Correction 

In [35]:
pval_list = sv_types_logistic_regr_results_df.pval.to_numpy()
# multiple testing correction across all fitted models:
# Benjamini-Hochberg FDR first, then Bonferroni, each adding a pass flag and corrected p-values
for correction_method, column_suffix in (("fdr_bh", "fdr_bh"), ("bonferroni", "bonf")):
    reject, pvals_corrected, _, _ = multitest.multipletests(pval_list, alpha=0.05, method=correction_method)
    sv_types_logistic_regr_results_df[f"pass_{column_suffix}"] = reject
    sv_types_logistic_regr_results_df[f"pvals_corrected_{column_suffix}"] = pvals_corrected

### Significant results 

In [36]:
# print the models that pass the Bonferroni threshold
bonf_significant_df = sv_types_logistic_regr_results_df[sv_types_logistic_regr_results_df["pass_bonf"]]
for significant_row in bonf_significant_df.itertuples(index=False):
    print(significant_row.sv_type,
          significant_row.feature,
          significant_row.log_odds,
          significant_row.pvals_corrected_bonf)

DEL phylop_max 0.4471033940033347 2.6303286921224432e-05
DEL CADD_sv_raw_score 0.3324963545651643 0.015100119714700133
DUP gnomad_constraint_max_zscore 0.9162837441442854 0.0037933639350685184
DUP gnomad_max_constraint_grtr_z4 0.9473559926527871 0.0006432827739679205
DUP phylop_max 1.104646071751581 0.027927586760368675
DUP CADD_sv_raw_score 0.7566751604334726 0.001987429601630485
DUP gwrvis_min -0.7881086546231187 0.030020598875784076


### Write to file 

In [37]:
### clean output
# feature -> display name for the rows kept in the written results;
# commented-out entries are left out of the output
features_to_keep = {
    #'gerp_element_count': "Conserved elements",
    "CADD_sv_raw_score": "CADD-SV",
    "phylop_max": "Conservation",
    #'gnomad_max_constraint_grtr_z4': "gnomAD constraint z > 4",
    "gnomad_constraint_max_zscore": "gnomAD constraint",
    "gwrvis_min": "gwRVIS",
    "intersect_har": "HARs",
}
is_kept_feature = sv_types_logistic_regr_results_df["feature"].isin(features_to_keep.keys())
sv_types_logistic_regr_results_features_to_keep_df = sv_types_logistic_regr_results_df.loc[is_kept_feature].copy()
# add the display name alongside the raw feature identifier
sv_types_logistic_regr_results_features_to_keep_df["feature_name"] = (
    sv_types_logistic_regr_results_features_to_keep_df["feature"].replace(features_to_keep)
)

In [38]:
# write the length-adjusted enrichment results as a tab-separated file (no index column)
features_to_keep_path = out_dir / "misexp_sv_scores_features.tsv"
sv_types_logistic_regr_results_features_to_keep_df.to_csv(
    features_to_keep_path,
    sep="\t",
    index=False,
)

### Non length-adjusted enrichment 

In [39]:
# result rows keyed by logr_count: [feature, sv_type, region, log_odds, lower, upper, pval]
# NOTE: feature_list, region, logr_count and columns_logr_enrich come from the
# length-adjusted regression cells; logr_count keeps counting from there, so the
# index of the resulting frame does not start at 0
sv_types_logistic_regr_results_no_len_adj = {}
# position of the feature term in the fitted parameters (intercept first, then the feature)
FEATURE_TERM_POS = 1
for sv_type in ["DEL", "DUP"]:
    for feature in feature_list:
        print(sv_type, feature)
        # restrict to this SV type and drop missing feature values before z-scoring the feature
        input_df = vrnt_features_merged_info_df[(vrnt_features_merged_info_df.SVTYPE == sv_type) & 
                                                (vrnt_features_merged_info_df[feature].notna())].copy()
        input_df[f"{feature}_norm"] = (input_df[feature] - input_df[feature].mean())/input_df[feature].std()
        # single-predictor model: no SV length covariate
        y, X = dmatrices(f'misexp_uniq ~ {feature}_norm', input_df, return_type = 'dataframe')
        logit_fit = discrete_model.Logit(endog=y, exog=X).fit()
        # use .iloc explicitly: params/pvalues are label-indexed, and an integer key in []
        # only works through the deprecated positional fallback
        log_odds = logit_fit.params.iloc[FEATURE_TERM_POS]
        pval = logit_fit.pvalues.iloc[FEATURE_TERM_POS]
        # normal approximation confidence intervals (computed once; column 0 = lower, 1 = upper)
        conf_int_df = logit_fit.conf_int(alpha=0.05)
        lower_conf = conf_int_df.iloc[FEATURE_TERM_POS, 0]
        upper_conf = conf_int_df.iloc[FEATURE_TERM_POS, 1]
        sv_types_logistic_regr_results_no_len_adj[logr_count] = [feature, sv_type, region, log_odds, lower_conf, upper_conf, pval]
        logr_count += 1
sv_types_logistic_regr_results_no_len_adj_df = pd.DataFrame.from_dict(sv_types_logistic_regr_results_no_len_adj, orient="index", columns=columns_logr_enrich)
# p-values of all unadjusted models, corrected together
pval_list = sv_types_logistic_regr_results_no_len_adj_df.pval.to_numpy()
# multiple testing correction
reject, pvals_corrected, _, _ = multitest.multipletests(pval_list, alpha=0.05, method="fdr_bh")
sv_types_logistic_regr_results_no_len_adj_df["pass_fdr_bh"] = reject
sv_types_logistic_regr_results_no_len_adj_df["pvals_corrected_fdr_bh"] = pvals_corrected
# Bonferroni correction 
reject, pvals_corrected, _, _ = multitest.multipletests(pval_list, alpha=0.05, method="bonferroni")
sv_types_logistic_regr_results_no_len_adj_df["pass_bonf"] = reject
sv_types_logistic_regr_results_no_len_adj_df["pvals_corrected_bonf"] = pvals_corrected

DEL gnomad_constraint_max_zscore
Optimization terminated successfully.
         Current function value: 0.030361
         Iterations 10
DEL gnomad_max_constraint_grtr_z4
Optimization terminated successfully.
         Current function value: 0.030293
         Iterations 9
DEL phylop_max
Optimization terminated successfully.
         Current function value: 0.029516
         Iterations 10
DEL CADD_sv_raw_score
Optimization terminated successfully.
         Current function value: 0.030233
         Iterations 10
DEL intersect_har
Optimization terminated successfully.
         Current function value: 0.030519
         Iterations 9
DEL phastcons_element_count
Optimization terminated successfully.
         Current function value: 0.030425
         Iterations 9
DEL gerp_element_count
Optimization terminated successfully.
         Current function value: 0.030401
         Iterations 9
DEL gwrvis_min
Optimization terminated successfully.
         Current function value: 0.030642
         Iterat

In [40]:
# print the unadjusted models that pass the Bonferroni threshold
for index, row in sv_types_logistic_regr_results_no_len_adj_df.iterrows(): 
    pass_bonf = row["pass_bonf"]
    if pass_bonf: 
        print(row["sv_type"], row["feature"], row["log_odds"], row["pvals_corrected_bonf"])

### clean output 
# feature -> display name for the rows kept in the written results;
# commented-out entries are left out of the output
features_to_keep = {#'gerp_element_count': "Conserved elements", 
                    'CADD_sv_raw_score': "CADD-SV",
                    'phylop_max': "Conservation",
                    #'gnomad_max_constraint_grtr_z4': "gnomAD constraint z > 4", 
                    "gnomad_constraint_max_zscore": "gnomAD constraint",
                    'gwrvis_min': "gwRVIS",
                    'intersect_har': "HARs"
                   }
sv_types_logistic_regr_results_no_len_adj_features_to_keep_df = sv_types_logistic_regr_results_no_len_adj_df[sv_types_logistic_regr_results_no_len_adj_df.feature.isin(features_to_keep.keys())].copy()
# take the display name from the filtered frame itself rather than from the unfiltered
# results frame, so the assignment does not depend on index alignment between the two
sv_types_logistic_regr_results_no_len_adj_features_to_keep_df["feature_name"] = sv_types_logistic_regr_results_no_len_adj_features_to_keep_df.feature.replace(features_to_keep)

DEL phylop_max 0.47652916239774784 1.0460493711318187e-06
DEL CADD_sv_raw_score 0.3767640884019741 0.0014388999253719206
DEL phastcons_element_count 0.13084673607783107 0.002278090365857613
DEL gerp_element_count 0.1384361682458371 0.0009721632539310822
DUP gnomad_constraint_max_zscore 1.0432872344083846 6.679166661325797e-05
DUP gnomad_max_constraint_grtr_z4 1.025166559823696 3.943793444888182e-05
DUP phylop_max 1.2456936227266526 0.0029161015656206635
DUP CADD_sv_raw_score 0.7856179248653821 0.00025153269374883025
DUP phastcons_element_count 0.4493134581338546 4.423355630459484e-05
DUP gerp_element_count 0.4294918442204741 0.00028237353460778224
DUP gwrvis_min -0.9254566335119306 0.00048781561514272756


In [41]:
# write the non length-adjusted enrichment results as a tab-separated file (no index column)
features_to_keep_no_len_adj_path = out_dir / "misexp_sv_scores_features_no_len_adj.tsv"
sv_types_logistic_regr_results_no_len_adj_features_to_keep_df.to_csv(
    features_to_keep_no_len_adj_path,
    sep="\t",
    index=False,
)