### Odds ratio from gProfiler results 

In [1]:
import pandas as pd 
from pathlib import Path 
import numpy as np

from scipy.stats import fisher_exact

In [2]:
# Root working directory of the misexpression analysis.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

# Load the gProfiler enrichment table (comma-separated) from under the working directory.
gprofiler_results_path = wkdir_path / "3_misexp_genes/gprofiler_enrichment/gprofiler_hp_enrich_misexp_8650genes_protein_coding.csv"
gprofiler_results_df = pd.read_csv(gprofiler_results_path, sep=",")

In [4]:
def sum_nested_list(nested_list):
    """Return the sum of every number in an arbitrarily nested list.

    Elements that are themselves lists are summed recursively; every other
    element is added as-is. An empty list sums to 0.
    """
    running = 0
    for element in nested_list:
        running += sum_nested_list(element) if isinstance(element, list) else element
    return running

In [5]:
# For every gProfiler result row, rebuild the 2x2 contingency table from the
# reported sizes and run Fisher's exact test on it.
# Maps the row label of gprofiler_results_df -> [term_name, odds_ratio, p_value].
enrich_results_dict = {}
for index, row in gprofiler_results_df.iterrows():
    term_name, intersect, term, query, domain = row["term_name"], row["intersection_size"], row["term_size"], row["query_size"], row["effective_domain_size"]
    # Rows of the table: genes in the query vs. the rest of the domain.
    # Columns of the table: genes annotated to the term vs. not annotated.
    misexp_overlap = intersect
    misexp_nonoverlap = query - intersect
    nonmisexp_overlap = term - intersect
    nonmisexp_nonoverlap = domain - query - nonmisexp_overlap
    conting_mtx = [[misexp_overlap, misexp_nonoverlap], [nonmisexp_overlap, nonmisexp_nonoverlap]]
    # Sanity check before testing: the four cells must add up to the domain
    # size. The cells sum to `domain` by construction, so a mismatch points at
    # unusable input values (e.g. NaN, which never compares equal).
    total = sum_nested_list(conting_mtx)
    if total != domain:
        # Pass the message to the exception itself; wrapping it in print()
        # would hand ValueError a None and lose the message from the traceback.
        raise ValueError(f"Total domain size not equal: {total}, {domain}")
    odds_ratio, p_value = fisher_exact(conting_mtx, alternative="two-sided")
    enrich_results_dict[index] = [term_name, odds_ratio, p_value]
                   

In [14]:
# One row per entry of enrich_results_dict; the dict keys become the index.
enrich_results_df = pd.DataFrame.from_dict(enrich_results_dict, orient="index", columns=["term_name", "odds_ratio", "pval"])
# Natural log of the odds ratio.
enrich_results_df["log_odds"] = np.log(enrich_results_df.odds_ratio)
# merge with results
# Align on the shared row index instead of on "term_name": the dict is keyed by
# the row labels of gprofiler_results_df, so this is an exact one-to-one match,
# whereas a merge on the name would duplicate rows for any repeated term name.
# "term_name" is dropped from the right side so it appears only once.
gprofiler_enrich_results_df = pd.merge(
    gprofiler_results_df,
    enrich_results_df.drop(columns="term_name"),
    left_index=True,
    right_index=True,
    how="inner",
)
# write to file
gprofiler_enrich_results_path = wkdir_path.joinpath("3_misexp_genes/gprofiler_enrichment/gprofiler_hp_enrich_misexp_8650genes_protein_coding_log_odds.csv")
gprofiler_enrich_results_df.to_csv(gprofiler_enrich_results_path, index=False)