## Enrichment analysis for ELGP

In [None]:
import pandas as pd
from gseapy import gseaplot
import gseapy as gp

import os  # cell-local import: only needed to create the per-species output folder

# Name of the single gene set tested; also the key under which GSEApy stores
# its per-term results, so it is defined once and reused below.
gene_set_name = 'Essential LncRNA'

# Models whose per-tissue prediction files are evaluated (same for both species).
model_names = ['MLP', 'SVM']

# Short tissue key (used in file names) -> display name (used in plot titles).
tissues_by_species = {
    'human': {'heart': 'heart left ventricle', 'lung': 'left lung', 'stomach': 'stomach'},
    'mouse': {'heart': 'heart', 'lung': 'lung', 'brain': 'forebrain'},
}

for species in ['human', 'mouse']:
    # Benchmark list of essential lncRNAs for this species.
    ess = pd.read_csv(f"../../data/benchmark/{species}/ess_lnc.csv")
    # Hand GSEApy a plain, de-duplicated list of IDs rather than a pandas Series.
    gene_sets = {gene_set_name: ess['lncRNA_id'].dropna().unique().tolist()}

    tissue_dict = tissues_by_species[species]

    # All outputs are written under "<species>/"; create it so a fresh
    # checkout does not fail on the first to_csv / savefig call.
    os.makedirs(species, exist_ok=True)

    for model_name in model_names:
        for tissue, tissue_name in tissue_dict.items():
            # Two-column ranking table: lncRNA ID and its predicted score.
            prediction = pd.read_csv(f"../../results/{species}/{model_name}_predictions_{tissue}.csv")
            prediction = prediction[['lncRNA_id', 'Score']]

            # Pre-ranked GSEA of the essential-lncRNA set against the prediction scores.
            pre_res = gp.prerank(rnk=prediction,
                                 gene_sets=gene_sets,
                                 threads=4,
                                 min_size=5,
                                 max_size=4000,
                                 permutation_num=1000,  # lower this for a quicker test run
                                 outdir=None,  # don't write GSEApy's own report to disk
                                 seed=6,  # fixed seed so the permutation test is reproducible
                                 verbose=True,  # log progress of each run
                                 )
            term_result = pre_res.results[gene_set_name]

            # Running enrichment score paired with the ranked lncRNA IDs,
            # then joined back to the prediction scores and benchmark labels.
            es_df = pd.DataFrame({
                "lncRNA_id": pre_res.ranking.index,
                "ES_score": term_result["RES"]
            })
            results = pd.merge(es_df, prediction, on='lncRNA_id', how='inner')
            results['label'] = results['lncRNA_id'].isin(ess['lncRNA_id']).astype(int)
            results.to_csv(f"{species}/res_{model_name}_{tissue}.csv", index=False)

            # Summary statistics table produced by GSEApy for this run.
            gsea_res = pre_res.res2d
            gsea_res.to_csv(f"{species}/gsea_{model_name}_{tissue}.csv", index=False)

            # Enrichment plot saved as SVG; rank_metric=None is passed
            # explicitly, as in the original call.
            gseaplot(
                term=f"{model_name}({tissue_name})",
                **term_result,
                color="green",
                figsize=(4, 3),
                ofname=f"{species}/{model_name}_{tissue}.svg",
                rank_metric=None
            )


2025-09-12 18:51:01,142 [INFO] Parsing data files for GSEA.............................
2025-09-12 18:51:01,206 [INFO] 0000 gene_sets have been filtered out when max_size=4000 and min_size=5
2025-09-12 18:51:01,210 [INFO] 0001 gene_sets used for further statistical testing.....
2025-09-12 18:51:01,211 [INFO] Start to run GSEA...Might take a while..................
2025-09-12 18:51:05,281 [INFO] Congratulations. GSEApy runs successfully................

2025-09-12 18:51:06,275 [INFO] Parsing data files for GSEA.............................
2025-09-12 18:51:06,337 [INFO] 0000 gene_sets have been filtered out when max_size=4000 and min_size=5
2025-09-12 18:51:06,340 [INFO] 0001 gene_sets used for further statistical testing.....
2025-09-12 18:51:06,341 [INFO] Start to run GSEA...Might take a while..................
2025-09-12 18:51:09,646 [INFO] Congratulations. GSEApy runs successfully................

2025-09-12 18:51:09,998 [INFO] Parsing data files for GSEA............................

## Enrichment analysis for iEssLnc and GIC.

In [2]:
import pandas as pd
from gseapy import gseaplot
import gseapy as gp
import matplotlib.pyplot as plt

import os  # cell-local import: only needed to create the per-species output folder

# Name of the single gene set tested; also the key under which GSEApy stores
# its per-term results, so it is defined once and reused below.
gene_set_name = 'Essential LncRNA'

for species in ['human', 'mouse']:
    # Benchmark list of essential lncRNAs for this species.
    ess = pd.read_csv(f"../../data/benchmark/{species}/ess_lnc.csv")
    # Hand GSEApy a plain, de-duplicated list of IDs rather than a pandas Series.
    gene_sets = {gene_set_name: ess['lncRNA_id'].dropna().unique().tolist()}

    # All outputs are written under "<species>/"; create it so a fresh
    # checkout does not fail on the first to_csv / savefig call.
    os.makedirs(species, exist_ok=True)

    for method in ['GIC', 'iEssLnc']:
        # Two-column ranking table: lncRNA ID and the method's score.
        prediction = pd.read_csv(f"../../results/{species}/{method}_score.csv")
        prediction = prediction[['lncRNA_id', 'Score']]

        # Pre-ranked GSEA of the essential-lncRNA set against the method's scores.
        pre_res = gp.prerank(rnk=prediction,
                             gene_sets=gene_sets,
                             threads=4,
                             min_size=5,
                             max_size=4000,
                             permutation_num=1000,  # lower this for a quicker test run
                             outdir=None,  # don't write GSEApy's own report to disk
                             seed=6,  # fixed seed so the permutation test is reproducible
                             verbose=True,  # log progress of each run
                             )
        term_result = pre_res.results[gene_set_name]

        # Running enrichment score paired with the ranked lncRNA IDs,
        # then joined back to the scores and benchmark labels.
        es_df = pd.DataFrame({
            "lncRNA_id": pre_res.ranking.index,
            "ES_score": term_result["RES"]
        })
        results = pd.merge(es_df, prediction, on='lncRNA_id', how='inner')
        results['label'] = results['lncRNA_id'].isin(ess['lncRNA_id']).astype(int)
        results.to_csv(f"{species}/res_{method}_{species}.csv", index=False)

        # Summary statistics table produced by GSEApy for this run.
        gsea_res = pre_res.res2d
        gsea_res.to_csv(f"{species}/gsea_{method}_{species}.csv", index=False)

        # Enrichment plot saved as SVG; rank_metric=None is passed
        # explicitly, as in the original call.
        gseaplot(
            term=f"{method}({species})",
            **term_result,
            color="green",
            figsize=(4, 3),
            ofname=f"{species}/{method}_{species}.svg",
            rank_metric=None
        )




The order of those genes will be arbitrary, which may produce unexpected results.
2025-09-12 17:10:26,632 [INFO] Parsing data files for GSEA.............................
2025-09-12 17:10:26,665 [INFO] 0000 gene_sets have been filtered out when max_size=4000 and min_size=5
2025-09-12 17:10:26,666 [INFO] 0001 gene_sets used for further statistical testing.....
2025-09-12 17:10:26,666 [INFO] Start to run GSEA...Might take a while..................
2025-09-12 17:10:28,532 [INFO] Congratulations. GSEApy runs successfully................

2025-09-12 17:10:28,852 [INFO] Parsing data files for GSEA.............................
2025-09-12 17:10:28,903 [INFO] 0000 gene_sets have been filtered out when max_size=4000 and min_size=5
2025-09-12 17:10:28,906 [INFO] 0001 gene_sets used for further statistical testing.....
2025-09-12 17:10:28,906 [INFO] Start to run GSEA...Might take a while..................
2025-09-12 17:10:30,706 [INFO] Congratulations. GSEApy runs successfully................

The 