In [21]:
import gseapy as gp
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def gsea_plot(cluster, base_dir="/BiO2/Research/ipmi/gastric_cancer/"):
    """Run a pre-ranked GSEA for one cluster and save a plot per significant pathway.

    Reads ``RNA_{cluster}_log2_fold_change_value.csv`` from ``base_dir``, ranks
    the rows by their row-wise mean, writes the ranking to
    ``{cluster}_ranked_genes_uppercase.rnk`` and passes that file to
    ``gseapy.prerank`` with the ``KEGG_2016`` gene-set library. Every term whose
    ``FDR q-val`` is below 0.05 is plotted and saved as a PDF in ``base_dir``.

    Parameters
    ----------
    cluster : str
        Cluster label; used in the input/output file names and the plot title.
    base_dir : str, optional
        Directory that holds the input CSV and receives the ``.rnk`` file and
        the PDF plots. Defaults to the project directory used originally.

    Returns
    -------
    None
        Results are written to disk only.
    """
    # Paths below are built by string concatenation, so make sure the
    # directory ends with a separator.
    if not base_dir.endswith("/"):
        base_dir += "/"

    cluster_data = pd.read_csv(
        f"{base_dir}RNA_{cluster}_log2_fold_change_value.csv", index_col=0
    )

    # One score per row: the mean across all columns, highest first.
    ranked_genes = (
        cluster_data.mean(axis=1)
        .sort_values(ascending=False)
        .reset_index()
    )
    ranked_genes.columns = ["Gene", "Score"]
    # Assumes the CSV index holds gene symbols as strings - TODO confirm.
    ranked_genes["Gene"] = ranked_genes["Gene"].str.upper()

    rnk_path = f"{base_dir}{cluster}_ranked_genes_uppercase.rnk"
    ranked_genes.to_csv(rnk_path, sep="\t", index=False, header=False)

    # NOTE(review): `outdir` is the same relative directory for every cluster,
    # so gseapy's own report files from successive calls presumably overwrite
    # each other - confirm whether that is intended.
    pre_res = gp.prerank(
        rnk=rnk_path,
        gene_sets="KEGG_2016",
        outdir="gsea_results",
        min_size=5,
        max_size=5000,
        permutation_num=100,
    )

    gsea_result = pre_res.res2d
    # Cast once: the column is not guaranteed to be numeric, and the same
    # float values are reused for filtering and for the title.
    fdr_q = gsea_result["FDR q-val"].astype(float)
    is_significant = fdr_q < 0.05
    significant_pathways = gsea_result[is_significant]

    for term, q_val in zip(significant_pathways["Term"], fdr_q[is_significant]):
        # Drop everything from " Homo" onwards to get a shorter display name.
        pathway = term.split(" Homo")[0]

        fig = pre_res.plot(terms=[term])
        # The value shown is the FDR q-value (not a nominal p-value), and the
        # title names the cluster actually being processed.
        fig.suptitle(
            f"GSEA result of RNA {cluster} of {pathway} (FDR q-val: {q_val:.3f})",
            fontsize=12,
            y=1.05,
        )

        # Term names can contain path separators (e.g. "A / B"), which would
        # otherwise be interpreted as sub-directories by savefig.
        safe_term = term.replace("/", "-").replace("\\", "-")
        fig.savefig(
            f"{base_dir}{cluster}_gsea_plot_{safe_term}.pdf", bbox_inches="tight"
        )
        # Close each figure so a long list of pathways does not accumulate
        # open figures in memory.
        plt.close(fig)

In [None]:
# Run the GSEA pipeline once for each cluster.
cluster_names = ["cluster1", "cluster2"]
for cluster_name in cluster_names:
    gsea_plot(cluster_name)