In [None]:
import networkx as nx
import pandas as pd
from ora import run_ora, gmt_parser
import numpy as np
import gseapy
import pickle
import igraph
import json
import networkx as nx
import pandas as pd
import json

In [None]:
# Read the two tab-separated BEL edge tables; later cells use their
# "source" and "target" columns.
sc_bel_df = pd.read_csv("../bel_graphs/sc_bel_extend.tsv", sep="\t")
bp_bel_df = pd.read_csv("../bel_graphs/bp_bel_extend.tsv", sep="\t")

In [None]:
# One (source, target) tuple per row of each edge table, in row order.
# The columns are zipped positionally instead of looking every cell up with
# `.at[row, col]`: this avoids a Python-level index lookup per cell and still
# yields scalars when an index label is repeated (where `.at` would return a
# Series instead of a single value).
sc_common_edges = list(zip(sc_bel_df["source"], sc_bel_df["target"]))
bp_common_edges = list(zip(bp_bel_df["source"], bp_bel_df["target"]))

In [None]:
# Flatten each edge list into the set of distinct nodes that occur at either
# end of any edge.
sc_set = set().union(*sc_common_edges)
bp_set = set().union(*bp_common_edges)

In [None]:
# Display the number of distinct nodes in each network as (sc, bp).
tuple(len(node_set) for node_set in (sc_set, bp_set))

In [None]:
# Both runs share every setting except the input node set.
# NOTE(review): the meaning of min_size / max_size is defined by ora.run_ora
# (presumably bounds on gene-set size) -- confirm against that module.
shared_ora_kwargs = {"gmt_path": "kegg.gmt", "min_size": 15, "max_size": 500}

sc_ora_df = run_ora(set_gene_symbols=sc_set, **shared_ora_kwargs)
bp_ora_df = run_ora(set_gene_symbols=bp_set, **shared_ora_kwargs)

In [None]:
# Lookup from pathway_id to pathway name, read from kegg.json.
with open("kegg.json", "r") as names_file:
    pathway_names = json.load(names_file)

# sc: keep a copy of the rows with q_value <= 0.05, then label each row with
# the name looked up from its pathway_id (an unknown id raises KeyError).
sc_is_enriched = sc_ora_df["q_value"] <= 0.05
enriched_sc_ora_df = sc_ora_df.loc[sc_is_enriched].copy()
enriched_sc_ora_df["pathway_name"] = enriched_sc_ora_df["pathway_id"].apply(
    lambda pathway_id: pathway_names[pathway_id]
)

# bp: same filter and labelling.
bp_is_enriched = bp_ora_df["q_value"] <= 0.05
enriched_bp_ora_df = bp_ora_df.loc[bp_is_enriched].copy()
enriched_bp_ora_df["pathway_name"] = enriched_bp_ora_df["pathway_id"].apply(
    lambda pathway_id: pathway_names[pathway_id]
)

In [None]:
# Write each filtered result table to a tab-separated file in the working
# directory (index column included, as by to_csv's default).
for out_path, enriched_df in (
    ("sc_enrich_ora.tsv", enriched_sc_ora_df),
    ("bp_enrich_ora.tsv", enriched_bp_ora_df),
):
    enriched_df.to_csv(out_path, sep="\t")

In [None]:
# Lookup from pathway id to pathway name. Opened with `with` so the file
# handle is closed deterministically instead of being left to the garbage
# collector.
with open("kegg.json", "r") as kegg_file:
    kegg_map = json.load(kegg_file)

# Build {pathway name: [gene, ...]} from the tab-separated GMT file.
# Per line: field 0 is the id translated through kegg_map, field 1 is
# skipped, and fields 2+ are kept as the gene list.
with open("kegg.gmt") as genesets:
    # Split each line once, lazily, while the file is still open.
    gmt_rows = (line.strip().split("\t") for line in genesets)
    gmt_data = {
        kegg_map[fields[0]]: fields[2:]
        for fields in gmt_rows
        if fields[0]  # skip blank lines (e.g. a trailing newline at EOF)
    }

In [None]:
# Report, for each enriched pathway, the share of its genes that occur as a
# node in the sc BEL edge table; only pathways above 10 % are printed.

# Every distinct value of the "source" and "target" columns, built once so
# each gene test below is a set lookup rather than two full column scans.
sc_network_nodes = set(sc_bel_df["source"]) | set(sc_bel_df["target"])

for pathway in enriched_sc_ora_df["pathway_name"]:
    pathway_data = gmt_data[pathway]
    if not pathway_data:
        continue  # no genes listed: coverage is undefined (avoids ZeroDivisionError)

    # Exact membership on purpose. `Series.str.contains(gene)` treats the gene
    # as a regex and matches substrings, so a short symbol is found inside
    # longer node names, and NaN nodes yield NaN, which `any()` counts as True.
    # NOTE(review): assumes nodes are bare gene symbols, as they are passed to
    # run_ora as `set_gene_symbols` -- confirm against the edge tables.
    count = sum(gene in sc_network_nodes for gene in pathway_data)
    coverage = count / len(pathway_data)

    if coverage > 0.10:
        print(f"{pathway}: {coverage * 100:.2f} ({count}/{len(pathway_data)})")

In [None]:
# Report, for each enriched pathway, the share of its genes that occur as a
# node in the bp BEL edge table; only pathways above 10 % are printed.

# Every distinct value of the "source" and "target" columns, built once so
# each gene test below is a set lookup rather than two full column scans.
bp_network_nodes = set(bp_bel_df["source"]) | set(bp_bel_df["target"])

for pathway in enriched_bp_ora_df["pathway_name"]:
    pathway_data = gmt_data[pathway]
    if not pathway_data:
        continue  # no genes listed: coverage is undefined (avoids ZeroDivisionError)

    # Exact membership on purpose. `Series.str.contains(gene)` treats the gene
    # as a regex and matches substrings, so a short symbol is found inside
    # longer node names, and NaN nodes yield NaN, which `any()` counts as True.
    # NOTE(review): assumes nodes are bare gene symbols, as they are passed to
    # run_ora as `set_gene_symbols` -- confirm against the edge tables.
    count = sum(gene in bp_network_nodes for gene in pathway_data)
    coverage = count / len(pathway_data)

    if coverage > 0.10:
        print(f"{pathway}: {coverage * 100:.2f} ({count}/{len(pathway_data)})")