## Enrichment analysis for ELGP

In [6]:
import pandas as pd
from gseapy import gseaplot
import gseapy as gp
import matplotlib.pyplot as plt

# Prerank GSEA: are known essential mouse lncRNAs enriched at the top of each
# ELGP model's (SVM / MLP) tissue-specific prediction ranking?
import os  # needed only to create the output directory used by this cell

# Table of known essential lncRNAs; its 'lncRNA_ID' column is the single gene set tested.
ess = pd.read_csv("../../data/benchMarking/mouse/ess_lpi.csv")
# Hand gseapy a plain list of identifiers instead of a pandas Series.
gene_sets = {'Essential LncRNA': ess['lncRNA_ID'].tolist()}
# Key of the only gene set; used for every lookup into the prerank results.
ESSENTIAL_TERM = 'Essential LncRNA'

# Short tissue key -> display name. The human mapping is not used in this cell;
# it is kept so that any other cell relying on it keeps working.
human_tissue_dict={'heart':'heart left ventricle','lung':'left lung','stomach':'stomach'}
mouse_tissue_dict={'heart':'heart','lung':'lung','brain':'forebrain'}
model_names = ['SVM','MLP']

# All CSV/SVG outputs below go to ./mouse; create it so a fresh checkout does not fail on write.
os.makedirs("mouse", exist_ok=True)

for model_name in model_names:
    for tissue, tissue_name in mouse_tissue_dict.items():

        # Two-column ranking (identifier, score) passed to gseapy's prerank.
        prediction = pd.read_csv(f"../../results/mouse/{model_name}_predictions_{tissue}.csv")
        prediction = prediction[['lncRNA_ID','Score']]

        # NOTE(review): the recorded run log warns that "the order of those genes
        # will be arbitrary" -- presumably caused by tied Score values; confirm
        # whether a deterministic tie-break is wanted before ranking.
        pre_res = gp.prerank(rnk=prediction,
                            gene_sets=gene_sets,
                            threads=4,
                            min_size=5,
                            max_size=4000,
                            permutation_num=1000,
                            outdir=None, # don't write to disk
                            seed=6, # fixed seed so repeated runs use the same permutations
                            verbose=True, # emit gseapy progress logs
                            )

        # Pair each ranked lncRNA with the value in the term's "RES" vector at the same position.
        es_df = pd.DataFrame({
            "lncRNA_ID": pre_res.ranking.index,
            "ES_score": pre_res.results[ESSENTIAL_TERM]["RES"]
        })
        results = pd.merge(es_df, prediction, on='lncRNA_ID', how='inner')
        # label = 1 when the lncRNA is in the essential list, else 0.
        results['label'] = results['lncRNA_ID'].isin(ess['lncRNA_ID']).astype(int)
        results.to_csv(f"mouse/res_{model_name}_{tissue}.csv", index=False)

        # Summary statistics table for the tested gene set.
        gsea_res = pre_res.res2d
        gsea_res.to_csv(f"mouse/gsea_{model_name}_{tissue}.csv", index=False)

        # Enrichment plot saved as SVG; the title shows model and tissue display name.
        gseaplot(
            term=f"{model_name}({tissue_name})",
            **pre_res.results[ESSENTIAL_TERM],
            color="green",
            figsize=(4, 3),
            ofname=f"mouse/{model_name}_{tissue}.svg",
            rank_metric=None
        )


The order of those genes will be arbitrary, which may produce unexpected results.


2025-04-17 22:52:20,053 [INFO] Parsing data files for GSEA.............................
2025-04-17 22:52:20,115 [INFO] 0000 gene_sets have been filtered out when max_size=4000 and min_size=5
2025-04-17 22:52:20,118 [INFO] 0001 gene_sets used for further statistical testing.....
2025-04-17 22:52:20,119 [INFO] Start to run GSEA...Might take a while..................
2025-04-17 22:52:22,553 [INFO] Congratulations. GSEApy runs successfully................

The order of those genes will be arbitrary, which may produce unexpected results.
2025-04-17 22:52:22,951 [INFO] Parsing data files for GSEA.............................
2025-04-17 22:52:23,009 [INFO] 0000 gene_sets have been filtered out when max_size=4000 and min_size=5
2025-04-17 22:52:23,011 [INFO] 0001 gene_sets used for further statistical testing.....
2025-04-17 22:52:23,012 [INFO] Start to run GSEA...Might take a while..................
2025-04-17 22:52:25,153 [INFO] Congratulations. GSEApy runs successfully................

The 

## Enrichment analysis for iEssLnc and GIC.

In [7]:
import pandas as pd
from gseapy import gseaplot
import gseapy as gp
import matplotlib.pyplot as plt

# Prerank GSEA: are known essential mouse lncRNAs enriched at the top of the
# GIC score ranking?
import os  # needed only to create the output directory used by this cell

# Table of known essential lncRNAs; its 'lncRNA_ID' column is the single gene set tested.
ess = pd.read_csv("../../data/benchMarking/mouse/ess_lpi.csv")
# Hand gseapy a plain list of identifiers instead of a pandas Series.
gene_sets = {'Essential LncRNA': ess['lncRNA_ID'].tolist()}
# Key of the only gene set; used for every lookup into the prerank results.
ESSENTIAL_TERM = 'Essential LncRNA'

# Two-column ranking (identifier, score) passed to gseapy's prerank.
prediction = pd.read_csv("../../results/mouse/GIC_score_mouse.csv")
prediction = prediction[['lncRNA_ID','Score']]

# All CSV/SVG outputs below go to ./mouse; create it so a fresh checkout does not fail on write.
os.makedirs("mouse", exist_ok=True)

# NOTE(review): the recorded run log warns that "the order of those genes will be
# arbitrary" -- presumably caused by tied Score values; confirm whether a
# deterministic tie-break is wanted before ranking.
pre_res = gp.prerank(rnk=prediction,
                    gene_sets=gene_sets,
                    threads=4,
                    min_size=5,
                    max_size=4000,
                    permutation_num=1000,
                    outdir=None, # don't write to disk
                    seed=6, # fixed seed so repeated runs use the same permutations
                    verbose=True, # emit gseapy progress logs
                    )

# Pair each ranked lncRNA with the value in the term's "RES" vector at the same position.
es_df = pd.DataFrame({
    "lncRNA_ID": pre_res.ranking.index,
    "ES_score": pre_res.results[ESSENTIAL_TERM]["RES"]
})
results = pd.merge(es_df, prediction, on='lncRNA_ID', how='inner')
# label = 1 when the lncRNA is in the essential list, else 0.
results['label'] = results['lncRNA_ID'].isin(ess['lncRNA_ID']).astype(int)
results.to_csv("mouse/res_GIC_mouse.csv", index=False)

# Summary statistics table for the tested gene set.
gsea_res = pre_res.res2d
gsea_res.to_csv("mouse/gsea_GIC_mouse.csv", index=False)

# Enrichment plot saved as SVG. The result is assigned so that the cell's last
# statement does not echo a return value as cell output.
fig = gseaplot(
    term="GIC(mouse)",
    **pre_res.results[ESSENTIAL_TERM],
    color="green",
    figsize=(4, 3),
    ofname="mouse/GIC_mouse.svg",
    rank_metric=None
)


The order of those genes will be arbitrary, which may produce unexpected results.
2025-04-17 23:25:41,451 [INFO] Parsing data files for GSEA.............................
2025-04-17 23:25:41,493 [INFO] 0000 gene_sets have been filtered out when max_size=4000 and min_size=5
2025-04-17 23:25:41,495 [INFO] 0001 gene_sets used for further statistical testing.....
2025-04-17 23:25:41,496 [INFO] Start to run GSEA...Might take a while..................
2025-04-17 23:25:43,748 [INFO] Congratulations. GSEApy runs successfully................

