### Misexpressed gene enrichment analysis 

In [1]:
import pandas as pd
from statsmodels.discrete import discrete_model
from statsmodels.stats import multitest
import numpy as np
from patsy import dmatrices
from pathlib import Path

In [2]:
# inputs 
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

gene_features_path = wkdir_path.joinpath("3_misexp_genes/inactive_gene_features_8650.csv")
out_dir_path = wkdir_path.joinpath("3_misexp_genes/results")

In [3]:
# load features 
gene_features_df = pd.read_csv(gene_features_path)
features = [col for col in gene_features_df.columns if col != "gene_id"]
print(f"Number of features: {len(features)}")
gene_ids_with_features = gene_features_df.gene_id.unique()
print(f"Number of genes: {len(gene_ids_with_features)}")

Number of features: 81
Number of genes: 8650


In [4]:
enrich_logr_results, count = {}, 0
for z_score in [2, 10, 20, 30]:
    gene_misexp_gene_path = wkdir_path.joinpath(f"2_misexp_qc/misexp_metrics/misexp_genes_tpm0.5_z{z_score}.txt")
    gene_misexp = set(pd.read_csv(gene_misexp_gene_path, sep="\t", header=None)[0].tolist())
    gene_never_misexp_path = wkdir_path.joinpath(f"2_misexp_qc/misexp_metrics/never_misexp_genes_tpm0.5_z{z_score}.txt")
    gene_never_misexp = set(pd.read_csv(gene_never_misexp_path, sep="\t", header=None)[0].tolist())
    total_genes = len(gene_misexp) + len(gene_never_misexp)
    if total_genes != len(gene_ids_with_features): 
        raise ValueError("Mismatch between gene total genes and numeber of genes with features.")
    # categorical variable: misexpressed, never misexpressed 
    gene_features_df["misexp_group"] = np.where(gene_features_df.gene_id.isin(gene_never_misexp), 0, 1)
    # logistic regression 
    for feature in features:
        input_df = gene_features_df.copy()
        input_df[f"{feature}_norm"] = (input_df[feature] - input_df[feature].mean())/input_df[feature].std()
        y, X = dmatrices(f"misexp_group ~ {feature}_norm", input_df, return_type = 'dataframe')
        logit_fit = discrete_model.Logit(endog=y, exog=X).fit()
        log_odds, pval = logit_fit.params[1], logit_fit.pvalues[1]
        # normal approximation confidence intervals
        lower_conf = logit_fit.conf_int(alpha=0.05)[0][1]
        upper_conf = logit_fit.conf_int(alpha=0.05)[1][1]
        enrich_logr_results[count] = [z_score, feature, log_odds, lower_conf, upper_conf, pval]
        count += 1

Optimization terminated successfully.
         Current function value: 0.657092
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.692673
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692559
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.680872
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688081
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688669
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688824
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688182
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.680269
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687728
  

Optimization terminated successfully.
         Current function value: 0.687728
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688371
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687301
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.689166
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687713
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.685497
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687344
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688178
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688615
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.689769
  

Optimization terminated successfully.
         Current function value: 0.681227
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.683524
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.682082
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.681829
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.681561
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.681869
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.676299
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.681132
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.684976
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.682191
  



Optimization terminated successfully.
         Current function value: 0.687935
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688063
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688102
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687550
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687854
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.686928
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.687253
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687427
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.684516
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.687519
  



Optimization terminated successfully.
         Current function value: 0.612275
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.612324
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.612324
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.612290
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.612252
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.612183
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611861
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611708
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611323
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.611295
  

In [14]:
# results 
enrich_logr_results_df = pd.DataFrame.from_dict(enrich_logr_results, 
                                                orient="index", 
                                                columns=["z_score", "feature", "log_odds", "lower", "upper", "pval"])

In [15]:
# multiple testing correction 
pval_as_array = enrich_logr_results_df.pval.to_numpy()
# FDR BH and Bonferroni correction 
for method in ["fdr_bh", "bonferroni"]:
    pass_test, pval_adj, _, _ = multitest.multipletests(pval_as_array, alpha=0.05, method=method)
    enrich_logr_results_df[f"{method}_pass"] = pass_test
    enrich_logr_results_df[f"{method}_pval_adj"] = pval_adj
    enrich_logr_results_pass_df = enrich_logr_results_df[enrich_logr_results_df[f"{method}_pass"]]

In [16]:
# feature names
feature_name_dict = {"gene_length": "Gene length", 
                     "gene_count_1Mb" : "Genes within 1Mb", 
                     "gene_distance_min": "Min. gene distance", 
                     "pHaplo": "pHaplo", 
                     "pTriplo": "pTriplo", 
                     "EDS": "Enhancer Domain Score (EDS)", 
                     'oe_lof_upper': "gnomAD LOEUF", 
                     'oe_mis_upper': "gnomAD missense OEUF", 
                     'Episcore': 'Episcore', 
                     'gene_fraction_TssA': 'ChromHMM Active TSS', 
                     'gene_fraction_TssAFlnk': 'ChromHMM Flanking active TSS', 
                     'gene_fraction_TxFlnk': "ChromHMM Transcr. at gene 5' and 3'",
                     'gene_fraction_Tx': "ChromHMM Strong transcription", 
                     'gene_fraction_TxWk': "ChromHMM Weak transcription",
                     'gene_fraction_EnhG': "ChromHMM Genic enhancers", 
                     'gene_fraction_Enh': "ChromHMM Enhancers", 
                     'gene_fraction_ZNFRpts': "ChromHMM ZNF genes and repeats",
                     'gene_fraction_Het': "ChromHMM Heterochromatin", 
                     'gene_fraction_TssBiv': "ChromHMM Bivalent/Poised TSS", 
                     'gene_fraction_BivFlnk': "ChromHMM Flanking Bivalent TSS/Enh",
                     'gene_fraction_EnhBiv': "ChromHMM Bivalent Enhancer", 
                     'gene_fraction_ReprPC': "ChromHMM Repressed PolyComb", 
                     'gene_fraction_ReprPCWk': "ChromHMM Weak Repressed PolyComb",
                     'gene_fraction_Quies': "ChromHMM Quiescent/Low",
                     'ActivityLinking_Conserved_nt_count': 'Conserved enhancer bp (activity-linking)', 
                     'ActivityLinking_nt_count': 'Enhancer nt bp (activity-linking)',
                     'ProximityLinking_Conserved_nt_count': 'Conserved enhancer bp (proximity-linking)', 
                     'ProximityLinking_nt_count': 'Enhancer bp (proximity-linking)',
                     'ActivityLinking_EnhancerNumber': "# enhancers (activity-linking)",
                     'ActivityLinking_NumberConservedElements': "Conserved # enhancers (activity-linking)",
                     'ProximityLinking_EnhancerNumber': "# enhancers (proximity-linking)",
                     'ProximityLinking_NumberConservedElements': "Conserved # enhancers (proximity-linking)",
                     'brain_expression': "Brain expression", 
                     'active_gene_count_1Mb': "Expressed gene within 1Mb",
                     'gnomad_autosomal_dominant': 'Autosomal dominant gene',
                     'gnomad_haploinsufficient': 'Haploinsufficient gene',
                     'gnomad_autosomal_recessive': 'Autosomal recessive gene',
                     'gnomad_olfactory_genes': "Olfactory gene",
                     'oncogene': 'High-confidence oncogenes',
                     "approved_target": "Approved drug target",
                     "gwas_gene": "GWAS gene",
                     "decipher_gene": "DECIPHER gene", 
                     'adipose_expression': 'Adipose expression', 
                     'adrenal_gland_expression': 'Adrenal gland expression', 
                     'artery_expression': 'Artery', 
                     'bladder_expression': 'Bladder', 
                     'brain_expression': 'Brain', 
                     'breast_expression': 'Breast', 
                     'cervix_expression': 'Cervix', 
                     'colon_expression': 'Colon', 
                     'esophagus_expression': 'Esophagus', 
                     'fallopian_tube_expression': 'Fallopian tube', 
                     'heart_expression': 'Heart', 
                     'kidney_expression': 'Kidney', 
                     'liver_expression': 'Liver', 
                     'lung_expression': 'Lung', 
                     'minor_salivary_gland_expression': 'Minor salivary gland', 
                     'muscle_expression': 'Muscle', 
                     'nerve_expression': 'Nerve', 
                     'ovary_expression': 'Ovary', 
                     'pancreas_expression': 'Pancreas', 
                     'pituitary_expression': 'Pituitary', 
                     'prostate_expression': 'Prostate', 
                     'skin_expression': 'Skin', 
                     'small_intestine_expression': 'Small intestine', 
                     'spleen_expression': 'Spleen', 
                     'stomach_expression': 'Stomach', 
                     'testis_expression': 'Testis', 
                     'thyroid_expression': 'Thyroid', 
                     'uterus_expression': 'Uterus', 
                     'vagina_expression': 'Vagina', 
                     'tissue_expression': "# tissues expressed",
                     'active_gene_distance': 'Distance to expressed gene',
                     'fraction_A': "A compartment overlap",
                     'fraction_B': "B compartment overlap",
                     'fraction_unassigned': "Unassigned compartment overlap",
                     'pLI': 'pLI', 
                     'pNull': 'Probability of complete haplosufficiency', 
                     'pRec': 'Probability of recessive lethality', 
                     'gerp_element_per_bp': '# conserved elements per bp',
                     'tad_distance': 'Min. TAD boundary distance', 
                      'phylop_mean': "Mean gene body conservation",
                     'omim_gene': 'OMIM', 
                    }

enrich_logr_results_df["feature_name"] = enrich_logr_results_df.feature.replace(feature_name_dict)

In [17]:
# feature groups 
feature_groups_dict = {"Expression": ["adipose_expression", "adrenal_gland_expression",
                                      "artery_expression", "bladder_expression", "brain_expression", 
                                      "breast_expression", "cervix_expression", "colon_expression", 
                                      "esophagus_expression", "fallopian_tube_expression", "heart_expression",
                                      "kidney_expression", "liver_expression", "lung_expression", 
                                      "minor_salivary_gland_expression", "muscle_expression", 
                                      "nerve_expression", "ovary_expression", "pancreas_expression", 
                                      "pituitary_expression", "prostate_expression", "skin_expression", 
                                      "spleen_expression", "stomach_expression", "testis_expression", 
                                      "thyroid_expression", "uterus_expression", "vagina_expression", 
                                      "small_intestine_expression", "tissue_expression"
                                     ],
                       "Genomic": ["active_gene_distance","active_gene_count_1Mb", 
                                  "gene_count_1Mb", "gene_length", "gene_distance_min"],
                       "Constraint &\nconservation": ['pHaplo', 'pTriplo', 'EDS', 'oe_lof_upper', 
                                      'oe_mis_upper','Episcore', "pLI", "pNull", "pRec", "phylop_mean", "gerp_element_per_bp"],
                       "Regulation": ['ProximityLinking_Conserved_nt_count', 'ProximityLinking_nt_count',
                                      'ActivityLinking_EnhancerNumber', 'ActivityLinking_NumberConservedElements',
                                      'ProximityLinking_EnhancerNumber','ProximityLinking_NumberConservedElements', 
                                     "gene_fraction_TssA", "gene_fraction_TssAFlnk", "gene_fraction_TxFlnk", 
                                      "gene_fraction_Tx", "gene_fraction_TxWk", "gene_fraction_EnhG", 
                                      "gene_fraction_Enh", "gene_fraction_ZNFRpts", "gene_fraction_Het", 
                                      "gene_fraction_TssBiv", "gene_fraction_BivFlnk", "gene_fraction_EnhBiv", 
                                     "gene_fraction_ReprPC", "gene_fraction_ReprPCWk", "gene_fraction_Quies", 
                                      "fraction_A", "fraction_B", "fraction_unassigned", "ActivityLinking_Conserved_nt_count", 
                                      "ActivityLinking_nt_count", "tad_distance"
                                     ],
                       "Gene sets": ["gnomad_autosomal_dominant", "gnomad_haploinsufficient", 
                                     "gnomad_autosomal_recessive", "gnomad_olfactory_genes", "oncogene", 
                                    "approved_target", "decipher_gene", "omim_gene", 
                                    ]
                      }
features_to_group_dict = {}
for group in feature_groups_dict.keys(): 
    for feature in feature_groups_dict[group]: 
        features_to_group_dict[feature] = group 
enrich_logr_results_df["feature_group"] = enrich_logr_results_df.feature.replace(features_to_group_dict)

In [18]:
# calculate absolute value of log-odds 
enrich_logr_results_df["log_odds_abs"] = enrich_logr_results_df.log_odds.abs()
# write all results to file 
out_dir_path.mkdir(parents=True, exist_ok=True)
enrich_logr_results_path = out_dir_path.joinpath("enrich_logr_results_all_rev_zscores.csv")
enrich_logr_results_df.to_csv(enrich_logr_results_path, index=False)

In [19]:
# z-score > 2, passing Bonferroni correction  
enrich_logr_results_pass_z2_df = enrich_logr_results_df[(enrich_logr_results_df["bonferroni_pass"]) & 
                                                     (enrich_logr_results_df["z_score"] == 2)
                                                    ].sort_values(by="log_odds_abs", ascending=False)
# write all features 
enrich_logr_results_pass_z2_path = out_dir_path.joinpath(f"enrich_logr_results_pass_bonf_z2_all.csv")
enrich_logr_results_pass_z2_df.to_csv(enrich_logr_results_pass_z2_path, index=False)

In [21]:
# show top 15 features in main figure
top_results = 15
enrich_logr_results_pass_z2_top_results_df = enrich_logr_results_pass_z2_df.head(top_results).copy()

feature_names_in_plot = {'Gene length':'Gene length',
                         'gnomAD LOEUF':'gnomAD LOEUF',
                         'OMIM':'OMIM',
                         '# enhancers (activity-linking)': '# enhancers (activity)',
                         '# enhancers (proximity-linking)': '# enhancers (proximity)',
                         'Conserved enhancer bp (proximity-linking)': 'Conserved enhancer bp (proximity)',
                         'Conserved # enhancers (proximity-linking)': 'Conserved # enhancers (proximity)',
                         'Enhancer bp (proximity-linking)': 'Enhancer bp (proximity)',
                         'Brain': 'Brain expression',
                         '# tissues expressed': '# tissues expressed',
                         'Enhancer Domain Score (EDS)': 'Enhancer Domain Score', 
                         'Episcore': 'Episcore',
                         "gnomAD missense OEUF": "gnomAD missense OEUF", 
                         'Pituitary': 'Pituitary expression', 
                         'Heart': 'Heart expression'
                        }


# write results adjusting name for plot 
enrich_logr_results_pass_z2_top_results_df["feature_name_trunc"] = enrich_logr_results_pass_z2_top_results_df.feature_name.replace(feature_names_in_plot)
# feature groups 
gene_sets = feature_groups_dict["Expression"] + feature_groups_dict["Gene sets"]
gene_sets.remove("tissue_expression")
print(f"Number of gene set features: {len(gene_sets)}")
# quantitative features
quant_features = feature_groups_dict["Regulation"] + feature_groups_dict["Constraint &\nconservation"] + feature_groups_dict["Genomic"] + ["tissue_expression"]
print(f"Number of quantitative features: {len(quant_features)}")
features_class_dict = {**{feature: "Quantitative" for feature in quant_features}, **{feature: "Binary" for feature in gene_sets}}

enrich_logr_results_pass_z2_top_results_df["class"] = enrich_logr_results_pass_z2_top_results_df.feature.replace(features_class_dict)

Number of gene set features: 37
Number of quantitative features: 44


In [22]:
# write to file 
enrich_logr_results_pass_z2_top_results_path = out_dir_path.joinpath(f"enrich_logr_results_pass_bonf_z2_top{top_results}.csv")
enrich_logr_results_pass_z2_top_results_df.to_csv(enrich_logr_results_pass_z2_top_results_path, index=False)

In [23]:
# write all results, adjusting log odds where failing bonferroni correction (for heatmap)
enrich_logr_results_df["log_odds_adj"] = np.where(enrich_logr_results_df["bonferroni_pass"], 
                                                  enrich_logr_results_df.log_odds, 
                                                  np.nan 
                                                 )
enrich_logr_results_log_odds_adj_path = out_dir_path.joinpath(f"enrich_logr_results_all_zscores_bonf_log_odds_adj.csv")
enrich_logr_results_df.to_csv(enrich_logr_results_log_odds_adj_path, index=False)