## Enrichment analysis for ELGP

In [2]:
import pandas as pd
from gseapy import gseaplot
import gseapy as gp
import matplotlib.pyplot as plt

import os  # standard library; needed only to create the output directory below

# ---- Configuration ---------------------------------------------------------
species = 'human'
gene_set_name = 'Essential LncRNA'
model_names = ['SVM','MLP']

# Maps the short tissue key used in file names to the label shown in the
# plot title.
if species == 'human':
    tissue_dict={'heart':'heart left ventricle','lung':'left lung','stomach':'stomach'}
else:
    tissue_dict={'heart':'heart','lung':'lung','brain':'forebrain'}

# ---- Reference gene set ----------------------------------------------------
# The benchmark file supplies the lncRNA IDs that form the single gene set
# tested below.
ess = pd.read_csv(f"../../data/benchmark/{species}/ess_lpi.csv")
gene_sets = {gene_set_name: ess['lncRNA_ID']}

# All tables and figures are written to ./<species>/. Create the directory up
# front: pandas' to_csv does not create missing parent directories, and
# without this the cell would only fail after the first permutation run.
os.makedirs(species, exist_ok=True)

# ---- One preranked GSEA run per (model, tissue) ------------------------------
for model_name in model_names:
    for tissue, tissue_name in tissue_dict.items():

        # Per-lncRNA prediction scores; only the ID and Score columns are
        # passed on as the ranking.
        prediction = pd.read_csv(f"../../results/{species}/{model_name}_predictions_{tissue}.csv")
        prediction = prediction[['lncRNA_ID','Score']]

        pre_res = gp.prerank(rnk=prediction,
                            gene_sets=gene_sets,
                            threads=4,
                            min_size=5,
                            max_size=4000,
                            permutation_num=1000, # reduce number to speed up testing
                            outdir=None, # don't write to disk
                            seed=6,
                            verbose=True, # see what's going on behind the scenes
                            )

        # Only one gene set is tested, so its result entry is looked up once
        # by name and reused for both the table and the plot.
        term_stats = pre_res.results[gene_set_name]

        # Pair each ranked lncRNA with the value at the same position in RES.
        # NOTE(review): assumes RES is ordered like pre_res.ranking - confirm
        # against the gseapy documentation.
        es_df = pd.DataFrame({
            "lncRNA_ID": pre_res.ranking.index,
            "ES_score": term_stats["RES"]
        })
        results = pd.merge(es_df, prediction, on='lncRNA_ID', how='inner')
        # label = 1 when the lncRNA is listed in the benchmark file, else 0.
        results['label'] = results['lncRNA_ID'].isin(ess['lncRNA_ID']).astype(int)
        results.to_csv(f"{species}/res_{model_name}_{tissue}.csv", index=False)

        # Summary table produced by gseapy for this run.
        gsea_res = pre_res.res2d
        gsea_res.to_csv(f"{species}/gsea_{model_name}_{tissue}.csv", index=False)

        # Enrichment plot written as SVG next to the tables.
        fig = gseaplot(
            term=f"{model_name}({tissue_name})",
            **term_stats,
            color="green",
            figsize=(4, 3),
            ofname=f"{species}/{model_name}_{tissue}.svg",
            rank_metric=None
        )


2025-04-26 19:26:39,297 [INFO] Parsing data files for GSEA.............................
2025-04-26 19:26:39,488 [INFO] 0000 gene_sets have been filtered out when max_size=4000 and min_size=5
2025-04-26 19:26:39,493 [INFO] 0001 gene_sets used for further statistical testing.....
2025-04-26 19:26:39,496 [INFO] Start to run GSEA...Might take a while..................
2025-04-26 19:26:52,230 [INFO] Congratulations. GSEApy runs successfully................

2025-04-26 19:26:54,231 [INFO] Parsing data files for GSEA.............................
2025-04-26 19:26:54,516 [INFO] 0000 gene_sets have been filtered out when max_size=4000 and min_size=5
2025-04-26 19:26:54,519 [INFO] 0001 gene_sets used for further statistical testing.....
2025-04-26 19:26:54,521 [INFO] Start to run GSEA...Might take a while..................
2025-04-26 19:27:07,643 [INFO] Congratulations. GSEApy runs successfully................

2025-04-26 19:27:09,856 [INFO] Parsing data files for GSEA............................

## Enrichment analysis for iEssLnc and GIC.

In [None]:
import pandas as pd
from gseapy import gseaplot
import gseapy as gp
import matplotlib.pyplot as plt

import os  # standard library; needed only to create the output directory below

# ---- Configuration ---------------------------------------------------------
species = 'mouse'
gene_set_name = 'Essential LncRNA'

# ---- Inputs ----------------------------------------------------------------
# The benchmark file supplies the lncRNA IDs that form the single gene set
# tested below.
ess = pd.read_csv(f"../../data/benchmark/{species}/ess_lpi.csv")
gene_sets = {gene_set_name: ess['lncRNA_ID']}

# GIC scores per lncRNA; only the ID and Score columns are passed on as the
# ranking.
prediction = pd.read_csv(f"../../results/{species}/GIC_score_{species}.csv")
prediction = prediction[['lncRNA_ID','Score']]

# All tables and figures are written to ./<species>/. Create the directory up
# front: pandas' to_csv does not create missing parent directories, and
# without this the cell would only fail after the permutation run.
os.makedirs(species, exist_ok=True)

# ---- Preranked GSEA --------------------------------------------------------
# NOTE(review): the recorded output of this cell contains the warning "The
# order of those genes will be arbitrary, which may produce unexpected
# results." - presumably caused by tied Score values; confirm and decide
# whether ties need an explicit tie-break before ranking.
pre_res = gp.prerank(rnk=prediction,
                    gene_sets=gene_sets,
                    threads=4,
                    min_size=5,
                    max_size=4000,
                    permutation_num=1000, # reduce number to speed up testing
                    outdir=None, # don't write to disk
                    seed=6,
                    verbose=True, # see what's going on behind the scenes
                    )

# Only one gene set is tested, so its result entry is looked up once by name
# and reused for both the table and the plot.
term_stats = pre_res.results[gene_set_name]

# Pair each ranked lncRNA with the value at the same position in RES.
# NOTE(review): assumes RES is ordered like pre_res.ranking - confirm against
# the gseapy documentation.
es_df = pd.DataFrame({
    "lncRNA_ID": pre_res.ranking.index,
    "ES_score": term_stats["RES"]
})
results = pd.merge(es_df, prediction, on='lncRNA_ID', how='inner')
# label = 1 when the lncRNA is listed in the benchmark file, else 0.
results['label'] = results['lncRNA_ID'].isin(ess['lncRNA_ID']).astype(int)
results.to_csv(f"{species}/res_GIC_{species}.csv", index=False)

# Summary table produced by gseapy for this run.
gsea_res = pre_res.res2d
gsea_res.to_csv(f"{species}/gsea_GIC_{species}.csv", index=False)

# Enrichment plot written as SVG next to the tables.
fig = gseaplot(
    term=f"GIC({species})",
    **term_stats,
    color="green",
    figsize=(4, 3),
    ofname=f"{species}/GIC_{species}.svg",
    rank_metric=None
)


The order of those genes will be arbitrary, which may produce unexpected results.
2025-04-17 23:25:41,451 [INFO] Parsing data files for GSEA.............................
2025-04-17 23:25:41,493 [INFO] 0000 gene_sets have been filtered out when max_size=4000 and min_size=5
2025-04-17 23:25:41,495 [INFO] 0001 gene_sets used for further statistical testing.....
2025-04-17 23:25:41,496 [INFO] Start to run GSEA...Might take a while..................
2025-04-17 23:25:43,748 [INFO] Congratulations. GSEApy runs successfully................

