### Misexpressed gene enrichment analysis 

In [3]:
import pandas as pd
from statsmodels.discrete import discrete_model
from statsmodels.stats import multitest
import numpy as np
from patsy import dmatrices
from pathlib import Path

In [4]:
# inputs: project working directory, per-gene feature table and results directory
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

gene_features_path = wkdir_path / "3_misexp_genes" / "inactive_gene_features_8650.csv"
out_dir_path = wkdir_path / "3_misexp_genes" / "results"

In [5]:
# load the per-gene feature table; every column except the identifier
# columns ("gene_id", "gene_type") is treated as a feature to test
gene_features_df = pd.read_csv(gene_features_path)
features = [
    col
    for col in gene_features_df.columns
    if col not in ("gene_id", "gene_type")
]
print(f"Number of features: {len(features)}")
gene_ids_with_features = gene_features_df["gene_id"].unique()
print(f"Number of genes: {len(gene_ids_with_features)}")

Number of features: 82
Number of genes: 8650


In [6]:
# Univariate logistic regression of misexpression status on each feature,
# repeated for every z-score cutoff. Each feature is standardised (mean 0,
# s.d. 1) before fitting, so the stored coefficient is the log-odds change per
# one standard deviation of the feature.
# enrich_logr_results maps a running model index to
# [z_score, feature, log_odds, lower_conf, upper_conf, pval].
enrich_logr_results, count = {}, 0
for z_score in [2, 10, 20, 30]:
    gene_misexp_gene_path = wkdir_path.joinpath(f"2_misexp_qc/misexp_metrics/misexp_genes_tpm0.5_z{z_score}.txt")
    gene_misexp = set(pd.read_csv(gene_misexp_gene_path, sep="\t", header=None)[0].tolist())
    gene_never_misexp_path = wkdir_path.joinpath(f"2_misexp_qc/misexp_metrics/never_misexp_genes_tpm0.5_z{z_score}.txt")
    gene_never_misexp = set(pd.read_csv(gene_never_misexp_path, sep="\t", header=None)[0].tolist())
    # sanity check: the two gene lists together must account for every gene with features
    total_genes = len(gene_misexp) + len(gene_never_misexp)
    if total_genes != len(gene_ids_with_features):
        raise ValueError("Mismatch between total number of genes and number of genes with features.")
    # binary outcome: 0 = never misexpressed, 1 = every other gene
    gene_features_df["misexp_group"] = np.where(gene_features_df.gene_id.isin(gene_never_misexp), 0, 1)
    # logistic regression, one model per feature
    for feature in features:
        # copy only the two columns the model uses rather than the whole table
        input_df = gene_features_df[["misexp_group", feature]].copy()
        input_df[f"{feature}_norm"] = (input_df[feature] - input_df[feature].mean())/input_df[feature].std()
        y, X = dmatrices(f"misexp_group ~ {feature}_norm", input_df, return_type = 'dataframe')
        logit_fit = discrete_model.Logit(endog=y, exog=X).fit()
        # params/pvalues are indexed by term name ("Intercept", "<feature>_norm");
        # select the slope by position explicitly (integer keys on a labelled
        # Series are deprecated in pandas)
        log_odds, pval = logit_fit.params.iloc[1], logit_fit.pvalues.iloc[1]
        # normal approximation confidence intervals (computed once per model);
        # row 1 is the slope, columns 0/1 are the lower/upper bounds
        conf_int_df = logit_fit.conf_int(alpha=0.05)
        lower_conf = conf_int_df.iloc[1, 0]
        upper_conf = conf_int_df.iloc[1, 1]
        enrich_logr_results[count] = [z_score, feature, log_odds, lower_conf, upper_conf, pval]
        count += 1

Optimization terminated successfully.
         Current function value: 0.657092
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.692673
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692559
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.680872
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688081
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688669
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688824
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688182
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.680269
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687728
  

Optimization terminated successfully.
         Current function value: 0.692673
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692559
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.680872
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688081
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688669
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688824
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688182
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.680269
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687728
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688371
  

Optimization terminated successfully.
         Current function value: 0.687650
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.686950
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.672124
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.681694
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.681877
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.682159
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.682215
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.671405
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.680103
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.681044
  



Optimization terminated successfully.
         Current function value: 0.688022
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687935
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688063
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688102
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687550
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687854
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.686928
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.687253
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687427
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.684516
  



Optimization terminated successfully.
         Current function value: 0.611925
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.612275
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.612324
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.612324
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.612290
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.612252
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.612183
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611861
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611708
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611323
  

In [7]:
# results: one row per fitted model, indexed by the running model counter
enrich_logr_results_df = pd.DataFrame.from_dict(
    enrich_logr_results,
    orient="index",
    columns=["z_score", "feature", "log_odds", "lower", "upper", "pval"],
)

In [8]:
# multiple testing correction across all (z-score, feature) tests
pval_as_array = enrich_logr_results_df["pval"].to_numpy()
# FDR BH and Bonferroni correction: store the pass flag and adjusted p-value per method
for method in ["fdr_bh", "bonferroni"]:
    pass_test, pval_adj, _, _ = multitest.multipletests(pval_as_array, method=method, alpha=0.05)
    enrich_logr_results_df[f"{method}_pass"] = pass_test
    enrich_logr_results_df[f"{method}_pval_adj"] = pval_adj
# `method` still holds the last method of the loop ("bonferroni"), so this is
# the set of tests passing Bonferroni correction
enrich_logr_results_pass_df = enrich_logr_results_df[enrich_logr_results_df[f"{method}_pass"]]

In [9]:
# feature names: maps feature column names to human-readable display labels.
# Every key appears exactly once; a repeated key in a dict literal is silently
# overridden by its last occurrence.
# NOTE(review): the label 'Enhancer nt bp (activity-linking)' may be a typo for
# 'Enhancer bp (activity-linking)' (compare the proximity-linking label); left
# unchanged because the label is written to the output files.
feature_name_dict = {"gene_length": "Gene length",
                     "gene_count_1Mb": "Genes within 1Mb",
                     "gene_distance_min": "Min. gene distance",
                     "pHaplo": "pHaplo",
                     "pTriplo": "pTriplo",
                     "EDS": "Enhancer Domain Score (EDS)",
                     'oe_lof_upper': "gnomAD LOEUF",
                     'oe_mis_upper': "gnomAD missense OEUF",
                     'Episcore': 'Episcore',
                     'gene_fraction_TssA': 'ChromHMM Active TSS',
                     'gene_fraction_TssAFlnk': 'ChromHMM Flanking active TSS',
                     'gene_fraction_TxFlnk': "ChromHMM Transcr. at gene 5' and 3'",
                     'gene_fraction_Tx': "ChromHMM Strong transcription",
                     'gene_fraction_TxWk': "ChromHMM Weak transcription",
                     'gene_fraction_EnhG': "ChromHMM Genic enhancers",
                     'gene_fraction_Enh': "ChromHMM Enhancers",
                     'gene_fraction_ZNFRpts': "ChromHMM ZNF genes and repeats",
                     'gene_fraction_Het': "ChromHMM Heterochromatin",
                     'gene_fraction_TssBiv': "ChromHMM Bivalent/Poised TSS",
                     'gene_fraction_BivFlnk': "ChromHMM Flanking Bivalent TSS/Enh",
                     'gene_fraction_EnhBiv': "ChromHMM Bivalent Enhancer",
                     'gene_fraction_ReprPC': "ChromHMM Repressed PolyComb",
                     'gene_fraction_ReprPCWk': "ChromHMM Weak Repressed PolyComb",
                     'gene_fraction_Quies': "ChromHMM Quiescent/Low",
                     'ActivityLinking_Conserved_nt_count': 'Conserved enhancer bp (activity-linking)',
                     'ActivityLinking_nt_count': 'Enhancer nt bp (activity-linking)',
                     'ProximityLinking_Conserved_nt_count': 'Conserved enhancer bp (proximity-linking)',
                     'ProximityLinking_nt_count': 'Enhancer bp (proximity-linking)',
                     'ActivityLinking_EnhancerNumber': "# enhancers (activity-linking)",
                     'ActivityLinking_NumberConservedElements': "Conserved # enhancers (activity-linking)",
                     'ProximityLinking_EnhancerNumber': "# enhancers (proximity-linking)",
                     'ProximityLinking_NumberConservedElements': "Conserved # enhancers (proximity-linking)",
                     'active_gene_count_1Mb': "Expressed gene within 1Mb",
                     'gnomad_autosomal_dominant': 'Autosomal dominant gene',
                     'gnomad_haploinsufficient': 'Haploinsufficient gene',
                     'gnomad_autosomal_recessive': 'Autosomal recessive gene',
                     'gnomad_olfactory_genes': "Olfactory gene",
                     'oncogene': 'High-confidence oncogenes',
                     "approved_target": "Approved drug target",
                     "gwas_gene": "GWAS gene",
                     "decipher_gene": "DECIPHER gene",
                     'adipose_expression': 'Adipose expression',
                     'adrenal_gland_expression': 'Adrenal gland expression',
                     'artery_expression': 'Artery',
                     'bladder_expression': 'Bladder',
                     # single entry for brain_expression: an earlier duplicate
                     # ("Brain expression") was always overridden by this value
                     'brain_expression': 'Brain',
                     'breast_expression': 'Breast',
                     'cervix_expression': 'Cervix',
                     'colon_expression': 'Colon',
                     'esophagus_expression': 'Esophagus',
                     'fallopian_tube_expression': 'Fallopian tube',
                     'heart_expression': 'Heart',
                     'kidney_expression': 'Kidney',
                     'liver_expression': 'Liver',
                     'lung_expression': 'Lung',
                     'minor_salivary_gland_expression': 'Minor salivary gland',
                     'muscle_expression': 'Muscle',
                     'nerve_expression': 'Nerve',
                     'ovary_expression': 'Ovary',
                     'pancreas_expression': 'Pancreas',
                     'pituitary_expression': 'Pituitary',
                     'prostate_expression': 'Prostate',
                     'skin_expression': 'Skin',
                     'small_intestine_expression': 'Small intestine',
                     'spleen_expression': 'Spleen',
                     'stomach_expression': 'Stomach',
                     'testis_expression': 'Testis',
                     'thyroid_expression': 'Thyroid',
                     'uterus_expression': 'Uterus',
                     'vagina_expression': 'Vagina',
                     'tissue_expression': "# tissues expressed",
                     'active_gene_distance': 'Distance to expressed gene',
                     'fraction_A': "A compartment overlap",
                     'fraction_B': "B compartment overlap",
                     'fraction_unassigned': "Unassigned compartment overlap",
                     'pLI': 'pLI',
                     'pNull': 'Probability of complete haplosufficiency',
                     'pRec': 'Probability of recessive lethality',
                     'gerp_element_per_bp': '# conserved elements per bp',
                     'tad_distance': 'Min. TAD boundary distance',
                     'phylop_mean': "Mean gene body conservation",
                     'omim_gene': 'OMIM',
                     "protein_coding": "Protein-coding"
                    }

# attach the display label; features without an entry keep their raw column name
enrich_logr_results_df["feature_name"] = enrich_logr_results_df.feature.replace(feature_name_dict)

In [10]:
# feature groups: group label -> feature column names belonging to that group
feature_groups_dict = {
    "Expression": [
        "adipose_expression", "adrenal_gland_expression",
        "artery_expression", "bladder_expression", "brain_expression",
        "breast_expression", "cervix_expression", "colon_expression",
        "esophagus_expression", "fallopian_tube_expression", "heart_expression",
        "kidney_expression", "liver_expression", "lung_expression",
        "minor_salivary_gland_expression", "muscle_expression",
        "nerve_expression", "ovary_expression", "pancreas_expression",
        "pituitary_expression", "prostate_expression", "skin_expression",
        "spleen_expression", "stomach_expression", "testis_expression",
        "thyroid_expression", "uterus_expression", "vagina_expression",
        "small_intestine_expression", "tissue_expression",
    ],
    "Genomic": [
        "active_gene_distance", "active_gene_count_1Mb",
        "gene_count_1Mb", "gene_length", "gene_distance_min",
    ],
    "Constraint &\nconservation": [
        "pHaplo", "pTriplo", "EDS", "oe_lof_upper",
        "oe_mis_upper", "Episcore", "pLI", "pNull", "pRec",
        "phylop_mean", "gerp_element_per_bp",
    ],
    "Regulation": [
        "ProximityLinking_Conserved_nt_count", "ProximityLinking_nt_count",
        "ActivityLinking_EnhancerNumber", "ActivityLinking_NumberConservedElements",
        "ProximityLinking_EnhancerNumber", "ProximityLinking_NumberConservedElements",
        "gene_fraction_TssA", "gene_fraction_TssAFlnk", "gene_fraction_TxFlnk",
        "gene_fraction_Tx", "gene_fraction_TxWk", "gene_fraction_EnhG",
        "gene_fraction_Enh", "gene_fraction_ZNFRpts", "gene_fraction_Het",
        "gene_fraction_TssBiv", "gene_fraction_BivFlnk", "gene_fraction_EnhBiv",
        "gene_fraction_ReprPC", "gene_fraction_ReprPCWk", "gene_fraction_Quies",
        "fraction_A", "fraction_B", "fraction_unassigned",
        "ActivityLinking_Conserved_nt_count", "ActivityLinking_nt_count",
        "tad_distance",
    ],
    "Gene sets": [
        "gnomad_autosomal_dominant", "gnomad_haploinsufficient",
        "gnomad_autosomal_recessive", "gnomad_olfactory_genes", "oncogene",
        "approved_target", "decipher_gene", "omim_gene", "protein_coding",
    ],
}
# invert the grouping into a feature -> group lookup
features_to_group_dict = {}
for group, group_features in feature_groups_dict.items():
    for feature in group_features:
        features_to_group_dict[feature] = group
# features without a group keep their raw column name in "feature_group"
enrich_logr_results_df["feature_group"] = enrich_logr_results_df["feature"].replace(features_to_group_dict)

In [11]:
# effect-size magnitude, used later to rank features regardless of direction
enrich_logr_results_df["log_odds_abs"] = enrich_logr_results_df["log_odds"].abs()
# write the full results table (all z-score cutoffs, all features);
# create the output directory first if it does not exist yet
out_dir_path.mkdir(exist_ok=True, parents=True)
enrich_logr_results_path = out_dir_path / "enrich_logr_results_all_rev_zscores.csv"
enrich_logr_results_df.to_csv(enrich_logr_results_path, index=False)

In [13]:
# z-score > 2 cutoff, tests passing Bonferroni correction, largest |log-odds| first
enrich_logr_results_pass_z2_df = (
    enrich_logr_results_df
    .loc[enrich_logr_results_df["bonferroni_pass"] & (enrich_logr_results_df["z_score"] == 2)]
    .sort_values(by="log_odds_abs", ascending=False)
)
# write all passing features
enrich_logr_results_pass_z2_path = out_dir_path.joinpath("enrich_logr_results_pass_bonf_z2_all.csv")
enrich_logr_results_pass_z2_df.to_csv(enrich_logr_results_pass_z2_path, index=False)

In [49]:
# keep the 15 features with the largest |log-odds| for the main figure
# (the table is already sorted by log_odds_abs, descending)
top_results = 15
enrich_logr_results_pass_z2_top_results_df = enrich_logr_results_pass_z2_df.iloc[:top_results].copy()
all_top_features = set(enrich_logr_results_pass_z2_top_results_df["feature"].unique())

In [16]:
# display label -> shortened label used in the figure
feature_names_in_plot = {
    "Gene length": "Gene length",
    "gnomAD LOEUF": "gnomAD LOEUF",
    "OMIM": "OMIM",
    "# enhancers (activity-linking)": "# enhancers (activity)",
    "# enhancers (proximity-linking)": "# enhancers (proximity)",
    "Conserved enhancer bp (proximity-linking)": "Conserved enhancer bp (proximity)",
    "Conserved # enhancers (proximity-linking)": "Conserved # enhancers (proximity)",
    "Enhancer bp (proximity-linking)": "Enhancer bp (proximity)",
    "Brain": "Brain expression",
    "# tissues expressed": "# tissues expressed",
    "Enhancer Domain Score (EDS)": "Enhancer Domain Score",
    "Episcore": "Episcore",
    "gnomAD missense OEUF": "gnomAD missense OEUF",
    "Pituitary": "Pituitary expression",
    "Heart": "Heart expression",
    "Protein-coding": "Protein-coding",
}


# add the shortened plot label; labels without an entry are kept as they are
enrich_logr_results_pass_z2_top_results_df["feature_name_trunc"] = (
    enrich_logr_results_pass_z2_top_results_df["feature_name"].replace(feature_names_in_plot)
)
# binary (gene set) features: all tissue-expression flags and gene sets,
# except the tissue count, which is quantitative
gene_sets = [*feature_groups_dict["Expression"], *feature_groups_dict["Gene sets"]]
gene_sets.remove("tissue_expression")
print(f"Number of gene set features: {len(gene_sets)}")
# quantitative features
quant_features = [
    *feature_groups_dict["Regulation"],
    *feature_groups_dict["Constraint &\nconservation"],
    *feature_groups_dict["Genomic"],
    "tissue_expression",
]
print(f"Number of quantitative features: {len(quant_features)}")
# feature -> "Quantitative" / "Binary"
features_class_dict = dict.fromkeys(quant_features, "Quantitative")
features_class_dict.update(dict.fromkeys(gene_sets, "Binary"))

enrich_logr_results_pass_z2_top_results_df["class"] = (
    enrich_logr_results_pass_z2_top_results_df["feature"].replace(features_class_dict)
)

Number of gene set features: 38
Number of quantitative features: 44


In [17]:
# write the top features (z-score > 2, Bonferroni-passing) to file
enrich_logr_results_pass_z2_top_results_path = out_dir_path / f"enrich_logr_results_pass_bonf_z2_top{top_results}.csv"
enrich_logr_results_pass_z2_top_results_df.to_csv(enrich_logr_results_pass_z2_top_results_path, index=False)

In [18]:
# for the heatmap: keep the log-odds only where the test passes Bonferroni
# correction, NaN otherwise; then write the full table again
enrich_logr_results_df["log_odds_adj"] = enrich_logr_results_df["log_odds"].where(
    enrich_logr_results_df["bonferroni_pass"]
)
enrich_logr_results_log_odds_adj_path = out_dir_path.joinpath("enrich_logr_results_all_zscores_bonf_log_odds_adj.csv")
enrich_logr_results_df.to_csv(enrich_logr_results_log_odds_adj_path, index=False)

### Enrichment test splitting protein-coding and lncRNA genes 

Enrichment testing for lncRNAs only

In [23]:
# restrict to genes flagged as not protein-coding (protein_coding == 0)
gene_lncrna_features_df = gene_features_df.loc[gene_features_df["protein_coding"] == 0].copy()

# columns excluded from testing: identifiers, the outcome column, and features
# that have all entries = 0 in this subset
lncrna_features_to_rmv = [
    "gene_id", "gene_type", "gene_fraction_TxFlnk", "gnomad_autosomal_dominant",
    "gnomad_haploinsufficient", "gnomad_autosomal_recessive", "gnomad_olfactory_genes",
    "oncogene", "approved_target", "decipher_gene", "protein_coding", "misexp_group",
]

lncrna_features = [
    col
    for col in gene_lncrna_features_df.columns
    if col not in lncrna_features_to_rmv
]

print(f"Number of features: {len(lncrna_features)}")
lncrna_gene_ids_with_features = gene_lncrna_features_df["gene_id"].unique()
print(f"Number of genes: {len(lncrna_gene_ids_with_features)}")

Number of features: 73
Number of genes: 5558


In [24]:
# Univariate logistic regression of misexpression status on each standardised
# feature, restricted to the lncRNA gene subset and repeated for every z-score
# cutoff. lncrna_enrich_logr_results maps a running model index to
# [z_score, feature, log_odds, lower_conf, upper_conf, pval].
lncrna_enrich_logr_results, count = {}, 0
for z_score in [2, 10, 20, 30]:
    gene_misexp_gene_path = wkdir_path.joinpath(f"2_misexp_qc/misexp_metrics/misexp_genes_tpm0.5_z{z_score}.txt")
    gene_misexp = set(pd.read_csv(gene_misexp_gene_path, sep="\t", header=None)[0].tolist())
    gene_never_misexp_path = wkdir_path.joinpath(f"2_misexp_qc/misexp_metrics/never_misexp_genes_tpm0.5_z{z_score}.txt")
    gene_never_misexp = set(pd.read_csv(gene_never_misexp_path, sep="\t", header=None)[0].tolist())
    # binary outcome: 0 = never misexpressed, 1 = every other gene
    gene_lncrna_features_df["misexp_group"] = np.where(gene_lncrna_features_df.gene_id.isin(gene_never_misexp), 0, 1)
    # logistic regression, one model per feature
    for feature in lncrna_features:
        print(feature)
        # copy only the two columns the model uses rather than the whole table
        input_df = gene_lncrna_features_df[["misexp_group", feature]].copy()
        input_df[f"{feature}_norm"] = (input_df[feature] - input_df[feature].mean())/input_df[feature].std()
        y, X = dmatrices(f"misexp_group ~ {feature}_norm", input_df, return_type = 'dataframe')
        logit_fit = discrete_model.Logit(endog=y, exog=X).fit()
        # params/pvalues are indexed by term name ("Intercept", "<feature>_norm");
        # select the slope by position explicitly (integer keys on a labelled
        # Series are deprecated in pandas)
        log_odds, pval = logit_fit.params.iloc[1], logit_fit.pvalues.iloc[1]
        # normal approximation confidence intervals (computed once per model);
        # row 1 is the slope, columns 0/1 are the lower/upper bounds
        conf_int_df = logit_fit.conf_int(alpha=0.05)
        lower_conf = conf_int_df.iloc[1, 0]
        upper_conf = conf_int_df.iloc[1, 1]
        lncrna_enrich_logr_results[count] = [z_score, feature, log_odds, lower_conf, upper_conf, pval]
        count += 1

gene_length
Optimization terminated successfully.
         Current function value: 0.635029
         Iterations 6
gene_count_1Mb
Optimization terminated successfully.
         Current function value: 0.669514
         Iterations 4
gene_distance_min
Optimization terminated successfully.
         Current function value: 0.670562
         Iterations 4
tissue_expression
Optimization terminated successfully.
         Current function value: 0.668792
         Iterations 4
adipose_expression
Optimization terminated successfully.
         Current function value: 0.670192
         Iterations 4
adrenal_gland_expression
Optimization terminated successfully.
         Current function value: 0.670105
         Iterations 4
artery_expression
Optimization terminated successfully.
         Current function value: 0.670187
         Iterations 4
bladder_expression
Optimization terminated successfully.
         Current function value: 0.670316
         Iterations 4
brain_expression
Optimization terminated

Optimization terminated successfully.
         Current function value: 0.669430
         Iterations 4
tad_distance
Optimization terminated successfully.
         Current function value: 0.669676
         Iterations 4
phylop_mean
Optimization terminated successfully.
         Current function value: 0.670615
         Iterations 4
omim_gene
Optimization terminated successfully.
         Current function value: 0.668648
         Iterations 4
gene_length
Optimization terminated successfully.
         Current function value: 0.635029
         Iterations 6
gene_count_1Mb
Optimization terminated successfully.
         Current function value: 0.669514
         Iterations 4
gene_distance_min
Optimization terminated successfully.
         Current function value: 0.670562
         Iterations 4
tissue_expression
Optimization terminated successfully.
         Current function value: 0.668792
         Iterations 4
adipose_expression
Optimization terminated successfully.
         Current function val

Optimization terminated successfully.
         Current function value: 0.688059
         Iterations 4
heart_expression
Optimization terminated successfully.
         Current function value: 0.686949
         Iterations 4
kidney_expression
Optimization terminated successfully.
         Current function value: 0.688701
         Iterations 4
liver_expression
Optimization terminated successfully.
         Current function value: 0.688731
         Iterations 4
lung_expression
Optimization terminated successfully.
         Current function value: 0.687800
         Iterations 4
minor_salivary_gland_expression
Optimization terminated successfully.
         Current function value: 0.688982
         Iterations 4
muscle_expression
Optimization terminated successfully.
         Current function value: 0.687910
         Iterations 4
nerve_expression
Optimization terminated successfully.
         Current function value: 0.688020
         Iterations 4
ovary_expression
Optimization terminated successf

Optimization terminated successfully.
         Current function value: 0.648787
         Iterations 5
breast_expression
Optimization terminated successfully.
         Current function value: 0.653152
         Iterations 5
cervix_expression
Optimization terminated successfully.
         Current function value: 0.653532
         Iterations 5
colon_expression
Optimization terminated successfully.
         Current function value: 0.654213
         Iterations 5
esophagus_expression
Optimization terminated successfully.
         Current function value: 0.654105
         Iterations 5
fallopian_tube_expression
Optimization terminated successfully.
         Current function value: 0.653221
         Iterations 5
heart_expression
Optimization terminated successfully.
         Current function value: 0.653528
         Iterations 5
kidney_expression
Optimization terminated successfully.
         Current function value: 0.655226
         Iterations 5
liver_expression
Optimization terminated successf

In [57]:
# results: one row per fitted lncRNA model, indexed by the running model counter
lncrna_enrich_logr_results_df = pd.DataFrame.from_dict(
    lncrna_enrich_logr_results,
    orient="index",
    columns=["z_score", "feature", "log_odds", "lower", "upper", "pval"],
)
# multiple testing correction across all lncRNA tests
pval_as_array = lncrna_enrich_logr_results_df["pval"].to_numpy()
# FDR BH and Bonferroni correction: store the pass flag and adjusted p-value per method
for method in ["fdr_bh", "bonferroni"]:
    pass_test, pval_adj, _, _ = multitest.multipletests(pval_as_array, method=method, alpha=0.05)
    lncrna_enrich_logr_results_df[f"{method}_pass"] = pass_test
    lncrna_enrich_logr_results_df[f"{method}_pval_adj"] = pval_adj
# display label for each feature
lncrna_enrich_logr_results_df["feature_name"] = lncrna_enrich_logr_results_df["feature"].replace(feature_name_dict)
# add feature group: rebuild the feature -> group lookup from the grouping
features_to_group_dict = {}
for group, group_features in feature_groups_dict.items():
    for feature in group_features:
        features_to_group_dict[feature] = group
lncrna_enrich_logr_results_df["feature_group"] = lncrna_enrich_logr_results_df["feature"].replace(features_to_group_dict)
# effect-size magnitude, used to rank features regardless of direction
lncrna_enrich_logr_results_df["log_odds_abs"] = lncrna_enrich_logr_results_df["log_odds"].abs()
# z-score > 2 cutoff, tests passing Bonferroni correction, largest |log-odds| first
lncrna_enrich_logr_results_pass_z2_df = (
    lncrna_enrich_logr_results_df
    .loc[lncrna_enrich_logr_results_df["bonferroni_pass"] & (lncrna_enrich_logr_results_df["z_score"] == 2)]
    .sort_values(by="log_odds_abs", ascending=False)
)

In [58]:
# keep the top `top_results` lncRNA features by |log-odds|
# (the table is already sorted by log_odds_abs, descending)
lncrna_enrich_logr_results_pass_z2_top_results_df = lncrna_enrich_logr_results_pass_z2_df.iloc[:top_results].copy()
lncrna_top_features = set(lncrna_enrich_logr_results_pass_z2_top_results_df["feature"].unique())

Enrichment testing for protein-coding genes 

In [59]:
# reload the feature table from disk and restrict to protein-coding genes
gene_features_df = pd.read_csv(gene_features_path)
gene_protein_features_df = gene_features_df.loc[gene_features_df["protein_coding"] == 1].copy()

# columns excluded from testing: identifiers, the outcome column, and
# protein_coding, which is 1 for every gene in this subset
protein_features_to_rmv = ["gene_id", "gene_type", "protein_coding", "misexp_group"]

protein_features = [
    col
    for col in gene_protein_features_df.columns
    if col not in protein_features_to_rmv
]

print(f"Number of features: {len(protein_features)}")
protein_gene_ids_with_features = gene_protein_features_df["gene_id"].unique()
print(f"Number of genes: {len(protein_gene_ids_with_features)}")

Number of features: 81
Number of genes: 3092


In [60]:
# Univariate logistic regression of misexpression status on each standardised
# feature, restricted to the protein-coding gene subset and repeated for every
# z-score cutoff. protein_enrich_logr_results maps a running model index to
# [z_score, feature, log_odds, lower_conf, upper_conf, pval].
protein_enrich_logr_results, count = {}, 0
for z_score in [2, 10, 20, 30]:
    gene_misexp_gene_path = wkdir_path.joinpath(f"2_misexp_qc/misexp_metrics/misexp_genes_tpm0.5_z{z_score}.txt")
    gene_misexp = set(pd.read_csv(gene_misexp_gene_path, sep="\t", header=None)[0].tolist())
    gene_never_misexp_path = wkdir_path.joinpath(f"2_misexp_qc/misexp_metrics/never_misexp_genes_tpm0.5_z{z_score}.txt")
    gene_never_misexp = set(pd.read_csv(gene_never_misexp_path, sep="\t", header=None)[0].tolist())
    # binary outcome: 0 = never misexpressed, 1 = every other gene
    gene_protein_features_df["misexp_group"] = np.where(gene_protein_features_df.gene_id.isin(gene_never_misexp), 0, 1)
    # logistic regression, one model per feature
    for feature in protein_features:
        print(feature)
        # copy only the two columns the model uses rather than the whole table
        input_df = gene_protein_features_df[["misexp_group", feature]].copy()
        input_df[f"{feature}_norm"] = (input_df[feature] - input_df[feature].mean())/input_df[feature].std()
        y, X = dmatrices(f"misexp_group ~ {feature}_norm", input_df, return_type = 'dataframe')
        logit_fit = discrete_model.Logit(endog=y, exog=X).fit()
        # params/pvalues are indexed by term name ("Intercept", "<feature>_norm");
        # select the slope by position explicitly (integer keys on a labelled
        # Series are deprecated in pandas)
        log_odds, pval = logit_fit.params.iloc[1], logit_fit.pvalues.iloc[1]
        # normal approximation confidence intervals (computed once per model);
        # row 1 is the slope, columns 0/1 are the lower/upper bounds
        conf_int_df = logit_fit.conf_int(alpha=0.05)
        lower_conf = conf_int_df.iloc[1, 0]
        upper_conf = conf_int_df.iloc[1, 1]
        protein_enrich_logr_results[count] = [z_score, feature, log_odds, lower_conf, upper_conf, pval]
        count += 1

gene_length
Optimization terminated successfully.
         Current function value: 0.623480
         Iterations 7
gene_count_1Mb
Optimization terminated successfully.
         Current function value: 0.634177
         Iterations 5
gene_distance_min
Optimization terminated successfully.
         Current function value: 0.644017
         Iterations 4
tissue_expression
Optimization terminated successfully.
         Current function value: 0.642521
         Iterations 5
adipose_expression
Optimization terminated successfully.
         Current function value: 0.644026
         Iterations 4
adrenal_gland_expression
Optimization terminated successfully.
         Current function value: 0.644684
         Iterations 4
artery_expression
Optimization terminated successfully.
         Current function value: 0.644437
         Iterations 4
bladder_expression
Optimization terminated successfully.
         Current function value: 0.644333
         Iterations 4
brain_expression
Optimization terminated

phylop_mean
Optimization terminated successfully.
         Current function value: 0.644970
         Iterations 4
approved_target
Optimization terminated successfully.
         Current function value: 0.641278
         Iterations 5
decipher_gene
Optimization terminated successfully.
         Current function value: 0.642596
         Iterations 5
omim_gene
Optimization terminated successfully.
         Current function value: 0.643091
         Iterations 4
gene_length
Optimization terminated successfully.
         Current function value: 0.623480
         Iterations 7
gene_count_1Mb
Optimization terminated successfully.
         Current function value: 0.634177
         Iterations 5
gene_distance_min
Optimization terminated successfully.
         Current function value: 0.644017
         Iterations 4
tissue_expression
Optimization terminated successfully.
         Current function value: 0.642521
         Iterations 5
adipose_expression
Optimization terminated successfully.
         Cur

Optimization terminated successfully.
         Current function value: 0.612370
         Iterations 5
oe_mis_upper
Optimization terminated successfully.
         Current function value: 0.631497
         Iterations 5
gerp_element_per_bp
Optimization terminated successfully.
         Current function value: 0.644716
         Iterations 4
tad_distance
Optimization terminated successfully.
         Current function value: 0.643607
         Iterations 5
phylop_mean
Optimization terminated successfully.
         Current function value: 0.644970
         Iterations 4
approved_target
Optimization terminated successfully.
         Current function value: 0.641278
         Iterations 5
decipher_gene
Optimization terminated successfully.
         Current function value: 0.642596
         Iterations 5
omim_gene
Optimization terminated successfully.
         Current function value: 0.643091
         Iterations 4
gene_length
Optimization terminated successfully.
         Current function value: 0.5



Optimization terminated successfully.
         Current function value: 0.596939
         Iterations 11
gene_fraction_Enh
Optimization terminated successfully.
         Current function value: 0.596662
         Iterations 5
gene_fraction_ZNFRpts
Optimization terminated successfully.
         Current function value: 0.598122
         Iterations 5
gene_fraction_Het
Optimization terminated successfully.
         Current function value: 0.597780
         Iterations 5
gene_fraction_TssBiv
Optimization terminated successfully.
         Current function value: 0.597905
         Iterations 5
gene_fraction_BivFlnk
Optimization terminated successfully.
         Current function value: 0.597999
         Iterations 5
gene_fraction_EnhBiv
Optimization terminated successfully.
         Current function value: 0.598074
         Iterations 5
gene_fraction_ReprPC
Optimization terminated successfully.
         Current function value: 0.598049
         Iterations 5
gene_fraction_ReprPCWk
Optimization term

Optimization terminated successfully.
         Current function value: 0.480662
         Iterations 5
active_gene_distance
Optimization terminated successfully.
         Current function value: 0.480728
         Iterations 5
active_gene_count_1Mb
Optimization terminated successfully.
         Current function value: 0.479112
         Iterations 5
gene_fraction_TssA
Optimization terminated successfully.
         Current function value: 0.479182
         Iterations 9
gene_fraction_TssAFlnk
Optimization terminated successfully.
         Current function value: 0.478632
         Iterations 10
gene_fraction_TxFlnk
         Current function value: 0.480661
         Iterations: 35
gene_fraction_Tx
Optimization terminated successfully.
         Current function value: 0.480540
         Iterations 6
gene_fraction_TxWk
Optimization terminated successfully.
         Current function value: 0.480384
         Iterations 6
gene_fraction_EnhG
         Current function value: 0.479657
         Iterati

  return 1/(1+np.exp(-X))


gnomad_autosomal_dominant
Optimization terminated successfully.
         Current function value: 0.480638
         Iterations 5
gnomad_haploinsufficient
Optimization terminated successfully.
         Current function value: 0.480675
         Iterations 6
gnomad_autosomal_recessive
Optimization terminated successfully.
         Current function value: 0.479909
         Iterations 6
gnomad_olfactory_genes
Optimization terminated successfully.
         Current function value: 0.480487
         Iterations 5
oncogene
Optimization terminated successfully.
         Current function value: 0.480548
         Iterations 6
EDS
Optimization terminated successfully.
         Current function value: 0.472641
         Iterations 6
ActivityLinking_Conserved_nt_count
Optimization terminated successfully.
         Current function value: 0.475606
         Iterations 6
ActivityLinking_nt_count
Optimization terminated successfully.
         Current function value: 0.477730
         Iterations 5
ProximityL

In [61]:
# Turn the collected regression results into a table, one row per stored entry.
protein_enrich_logr_results_df = pd.DataFrame.from_dict(
    protein_enrich_logr_results,
    orient="index",
    columns=["z_score", "feature", "log_odds", "lower", "upper", "pval"],
)

# Multiple testing correction (FDR Benjamini-Hochberg and Bonferroni), applied
# to the full p-value column, i.e. across all rows of the table at once.
pval_as_array = protein_enrich_logr_results_df.pval.to_numpy()
for method in ("fdr_bh", "bonferroni"):
    pass_test, pval_adj = multitest.multipletests(pval_as_array, alpha=0.05, method=method)[:2]
    protein_enrich_logr_results_df[f"{method}_pass"] = pass_test
    protein_enrich_logr_results_df[f"{method}_pval_adj"] = pval_adj

# Readable feature names; values without an entry in the mapping are left unchanged.
protein_enrich_logr_results_df["feature_name"] = protein_enrich_logr_results_df.feature.replace(feature_name_dict)

# Invert the group -> features mapping into feature -> group and annotate each row.
features_to_group_dict = {
    member: group_name
    for group_name, members in feature_groups_dict.items()
    for member in members
}
protein_enrich_logr_results_df["feature_group"] = protein_enrich_logr_results_df.feature.replace(features_to_group_dict)

# Effect-size magnitude used for ranking.
protein_enrich_logr_results_df["log_odds_abs"] = protein_enrich_logr_results_df.log_odds.abs()

# Rows at the z-score 2 cutoff that pass Bonferroni correction, largest |log-odds| first.
passes_bonferroni = protein_enrich_logr_results_df.bonferroni_pass
at_z2 = protein_enrich_logr_results_df["z_score"] == 2
protein_enrich_logr_results_pass_z2_df = (
    protein_enrich_logr_results_df[passes_bonferroni & at_z2]
    .sort_values(by="log_odds_abs", ascending=False)
)

# Keep the first `top_results` rows and record which features they cover.
protein_enrich_logr_top_results_pass_z2_df = protein_enrich_logr_results_pass_z2_df.head(top_results).copy()
protein_top_features = set(protein_enrich_logr_top_results_pass_z2_df.feature.unique())

In [77]:
# Label each results table with the gene set it describes, then stack all three.
for results_table, gene_type_label in (
    (protein_enrich_logr_results_df, "protein-coding"),
    (lncrna_enrich_logr_results_df, "lncRNA"),
    (enrich_logr_results_df, "all"),
):
    results_table["gene_type"] = gene_type_label

# The all-genes table carries an extra `log_odds_adj` column that is dropped
# before concatenation.
tables_to_stack = [
    enrich_logr_results_df.drop(columns=["log_odds_adj"]),
    protein_enrich_logr_results_df,
    lncrna_enrich_logr_results_df,
]
all_enrich_test_results_df = pd.concat(tables_to_stack)

In [78]:
# Pool the top features of the three analyses and keep only their rows at the
# z-score 2 cutoff.
combined_top15_features = protein_top_features.union(lncrna_top_features, all_top_features)

is_top_feature = all_enrich_test_results_df.feature.isin(combined_top15_features)
is_z2 = all_enrich_test_results_df["z_score"] == 2
all_enrich_test_results_top15_z2_df = all_enrich_test_results_df[is_top_feature & is_z2].copy()

In [79]:
# Mapping from the full feature name to the label written for plotting.
# Several entries map a name to itself.
feature_names_in_plot = {
    "Gene length": "Gene length",
    "gnomAD LOEUF": "gnomAD LOEUF",
    "OMIM": "OMIM",
    "# enhancers (activity-linking)": "# enhancers (activity)",
    "# enhancers (proximity-linking)": "# enhancers (proximity)",
    "Conserved enhancer bp (proximity-linking)": "Conserved enhancer bp (proximity)",
    "Conserved # enhancers (proximity-linking)": "Conserved # enhancers (proximity)",
    "Enhancer bp (proximity-linking)": "Enhancer bp (proximity)",
    "Brain": "Brain expression",
    "# tissues expressed": "# tissues expressed",
    "Enhancer Domain Score (EDS)": "Enhancer Domain Score",
    "Episcore": "Episcore",
    "gnomAD missense OEUF": "gnomAD missense OEUF",
    "Pituitary": "Pituitary expression",
    "Heart": "Heart expression",
    "Protein-coding": "Protein-coding",
}

# Add the plot label as a new column; names absent from the mapping are kept as-is.
plot_labels = all_enrich_test_results_top15_z2_df.feature_name.replace(feature_names_in_plot)
all_enrich_test_results_top15_z2_df["feature_name_trunc"] = plot_labels

In [80]:
# Save the selected rows as CSV in the results directory, without the index column.
all_enrich_test_results_top15_z2_path = out_dir_path / "enrich_logr_results_gene_types_z2_bonf.csv"
all_enrich_test_results_top15_z2_df.to_csv(all_enrich_test_results_top15_z2_path, index=False)