## weighted scoring scheme

In [1]:
import os

import gseapy as gp
import matplotlib.pyplot as plt
import pandas as pd
from gseapy import gseaplot

# Benchmark table of essential lncRNAs; its `lncRNA_ID` column is used as the
# single gene set ("Essential LncRNA") in the enrichment runs further down.
ess = pd.read_csv("../../data/benchMarking/human/ess_lpi.csv")
ess.head()  # interactive peek; the value is discarded when not last in a cell
gene_sets = {"Essential LncRNA": ess["lncRNA_ID"]}

# Short tissue key (used in file names) -> descriptive tissue name (used as
# the plot title).
human_tissue_dict = {
    "heart": "heart left ventricle",
    "lung": "left lung",
    "stomach": "stomach",
}
mouse_tissue_dict = {
    "heart": "heart",
    "lung": "lung",
    "brain": "forebrain",
}

# Model prefixes of the prediction CSV files to evaluate.
model_names = ["SVM", "MLP"]

# Pre-ranked GSEA of the essential-lncRNA gene set against every
# (model, human tissue) prediction table. For each pair this writes:
#   human/res_<model>_<tissue>.csv   per-lncRNA table (ES_score, Score, label)
#   human/gsea_<model>_<tissue>.csv  GSEA summary table (pre_res.res2d)
#   human/<model>_<tissue>.svg       enrichment plot
#
# pandas' to_csv does not create missing parent directories, so make sure the
# output folder exists before the first write.
os.makedirs("human", exist_ok=True)

for model_name in model_names:
    for tissue, tissue_name in human_tissue_dict.items():
        prediction = pd.read_csv(f"../../results/human/{model_name}_predictions_{tissue}.csv")
        # Only the identifier and the score are handed to prerank as the ranking.
        prediction = prediction[['lncRNA_ID', 'Score']]

        pre_res = gp.prerank(
            rnk=prediction,
            gene_sets=gene_sets,
            threads=4,
            min_size=5,
            max_size=4000,
            permutation_num=1000,  # reduce number to speed up testing
            outdir=None,  # don't write to disk
            seed=6,  # fixed seed so the permutation results are reproducible
            verbose=True,  # see what's going on behind the scenes
        )

        # Pair each ID in prerank's ranking with the matching "RES" entry.
        # NOTE(review): assumes "RES" holds one value per ranked ID, in the
        # same order as pre_res.ranking; confirm against the gseapy docs.
        es_df = pd.DataFrame({
            "lncRNA_ID": pre_res.ranking.index,
            "ES_score": pre_res.results['Essential LncRNA']["RES"],
        })
        results = pd.merge(es_df, prediction, on='lncRNA_ID', how='inner')
        # label = 1 when the lncRNA is listed in the essential benchmark set.
        results['label'] = results['lncRNA_ID'].isin(ess['lncRNA_ID']).astype(int)
        results.to_csv(f"human/res_{model_name}_{tissue}.csv", index=False)

        gsea_res = pre_res.res2d
        gsea_res.to_csv(f"human/gsea_{model_name}_{tissue}.csv", index=False)

        # Plot the first term of the summary table; the plot title is
        # "<model>(<descriptive tissue name>)".
        terms = pre_res.res2d.Term
        fig = gseaplot(
            term=f"{model_name}({tissue_name})",
            **pre_res.results[terms[0]],
            color="green",
            figsize=(4, 3),
            ofname=f"human/{model_name}_{tissue}.svg",
            rank_metric=None,
        )


2025-04-15 22:07:01,453 [INFO] Parsing data files for GSEA.............................
2025-04-15 22:07:01,539 [INFO] 0000 gene_sets have been filtered out when max_size=4000 and min_size=5
2025-04-15 22:07:01,541 [INFO] 0001 gene_sets used for further statistical testing.....
2025-04-15 22:07:01,542 [INFO] Start to run GSEA...Might take a while..................
2025-04-15 22:07:04,699 [INFO] Congratulations. GSEApy runs successfully................

2025-04-15 22:07:05,609 [INFO] Parsing data files for GSEA.............................
2025-04-15 22:07:05,690 [INFO] 0000 gene_sets have been filtered out when max_size=4000 and min_size=5
2025-04-15 22:07:05,692 [INFO] 0001 gene_sets used for further statistical testing.....
2025-04-15 22:07:05,693 [INFO] Start to run GSEA...Might take a while..................
2025-04-15 22:07:08,513 [INFO] Congratulations. GSEApy runs successfully................

2025-04-15 22:07:10,253 [INFO] Parsing data files for GSEA............................