In [None]:
# Notebook setup (IPython magics only).
# Load the autoreload extension and select mode 2, which re-imports modules
# before each cell executes so edits to imported .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Ask the inline plotting backend to emit figures in 'retina' format.
%config InlineBackend.figure_format='retina'

In [None]:
import os
import json

from tqdm import tqdm
import numpy as np
import scipy.spatial
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import anndata
import scanpy as sc

In [None]:
import gseapy as gp

In [None]:
# Folder (relative to the working directory) that receives every PDF
# written by the enrichment cells of this notebook.
plot_output_path = 'hesc_pseudobulk_gene_overrep_gene_set'
# exist_ok=True keeps this cell re-runnable once the folder already exists.
os.makedirs(name=plot_output_path, exist_ok=True)

In [None]:
# Ask gseapy which gene-set library names it can provide. As the last
# expression of the cell, the returned value is shown as the cell output,
# which is handy for checking the names requested in the next cell.
# NOTE(review): presumably this queries the Enrichr service and therefore
# needs network access — confirm before running offline.
gp.get_library_name()

In [None]:
# Fetch the four gene-set libraries used by the enrichment cells below.
# The generator is consumed by the tuple unpacking, so the libraries are
# requested in the listed order and each one is bound to a variable that
# carries the library's own name.
GO_Biological_Process_2023, Reactome_2022, CORUM, KEGG_2021_Human = (
    gp.get_library(name=library_name, organism='human')
    for library_name in (
        'GO_Biological_Process_2023',
        'Reactome_2022',
        'CORUM',
        'KEGG_2021_Human',
    )
)

In [None]:
# Over-representation analysis of the readout (downstream) gene clusters.
# For every Leiden cluster in the table, the cluster's genes are tested
# against the four preloaded gene-set libraries and the top terms per
# library are saved as a bar plot PDF under `plot_output_path`.
dg_cluster_df = pd.read_csv('clustering/downstream_gene_clusters_hESC.csv', index_col=0)

# The libraries are passed to enrichr as objects rather than by name, and the
# results label them positionally ('gs_ind_<i>', following the order of the
# `gene_sets` list). This map restores readable names for the plot legend.
# It is loop-invariant, so it is built once.
gene_set_map = {
    'gs_ind_0' : 'GO_Biological_Process_2023',
    'gs_ind_1' : 'Reactome_2022',
    'gs_ind_2' : 'CORUM',
    'gs_ind_3' : 'KEGG_2021_Human',
}

for ds_gene_module in np.unique(dg_cluster_df['leiden']):
    # .loc with a boolean mask avoids chained indexing.
    selected_genes = dg_cluster_df.loc[
        dg_cluster_df['leiden'] == ds_gene_module, 'gene_name'
    ].tolist()

    fig = None
    try:
        enr = gp.enrichr(gene_list=selected_genes,
                         gene_sets=[GO_Biological_Process_2023, Reactome_2022, CORUM, KEGG_2021_Human],
                         organism='human',
                         outdir=None, # don't write to disk
                        )
        enr.results['Gene_set'] = enr.results['Gene_set'].map(gene_set_map)

        ax = gp.barplot(enr.results,
                        column="Adjusted P-value",
                        group='Gene_set',
                        size=10,
                        top_term=4,
                        figsize=(3,5),
                        color=['darkred', 'darkblue', 'darkgreen', 'orange'] # set colors for group
                       )
        ax.set_title(f'readout gene cluster {ds_gene_module}')

        # Save through the Axes' own figure rather than plt.gcf()/plt.savefig,
        # so the output does not depend on pyplot's "current figure" state.
        fig = ax.figure
        fig.savefig(f'{plot_output_path}/hESC_readout_cluster_{ds_gene_module}_overrep_g_sets.pdf')

    except Exception as exc:
        # Best-effort: a cluster that cannot be analysed or plotted is skipped,
        # but the reason is reported instead of being silently dropped.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop
        # the loop.
        print(f'Skipping readout gene cluster {ds_gene_module}: '
              f'{type(exc).__name__}: {exc}')

    finally:
        # Release the figure even when saving failed, so figures do not pile
        # up in memory across iterations.
        if fig is not None:
            plt.close(fig)

In [None]:
# Over-representation analysis of the perturbed gene clusters.
# For every Leiden cluster in the table, the cluster's perturbed genes are
# tested against the four preloaded gene-set libraries and the top terms per
# library are saved as a bar plot PDF under `plot_output_path`.
# NOTE: this rebinds `dg_cluster_df` to the perturbed-gene table.
dg_cluster_df = pd.read_csv('clustering/perturbed_gene_clusters_hESC.csv', index_col=0)

# The libraries are passed to enrichr as objects rather than by name, and the
# results label them positionally ('gs_ind_<i>', following the order of the
# `gene_sets` list). This map restores readable names for the plot legend.
# It is loop-invariant, so it is built once.
gene_set_map = {
    'gs_ind_0' : 'GO_Biological_Process_2023',
    'gs_ind_1' : 'Reactome_2022',
    'gs_ind_2' : 'CORUM',
    'gs_ind_3' : 'KEGG_2021_Human',
}

for ds_gene_module in np.unique(dg_cluster_df['leiden']):
    # .loc with a boolean mask avoids chained indexing.
    selected_genes = dg_cluster_df.loc[
        dg_cluster_df['leiden'] == ds_gene_module, 'perturbed_gene_name'
    ].tolist()

    fig = None
    try:
        enr = gp.enrichr(gene_list=selected_genes,
                         gene_sets=[GO_Biological_Process_2023, Reactome_2022, CORUM, KEGG_2021_Human],
                         organism='human',
                         outdir=None, # don't write to disk
                        )
        enr.results['Gene_set'] = enr.results['Gene_set'].map(gene_set_map)

        ax = gp.barplot(enr.results,
                        column="Adjusted P-value",
                        group='Gene_set',
                        size=10,
                        top_term=4,
                        figsize=(3,5),
                        color=['darkred', 'darkblue', 'darkgreen', 'orange'] # set colors for group
                       )
        ax.set_title(f'perturbed gene cluster {ds_gene_module}')

        # Save through the Axes' own figure rather than plt.gcf()/plt.savefig,
        # so the output does not depend on pyplot's "current figure" state.
        fig = ax.figure
        fig.savefig(f'{plot_output_path}/hESC_perturbed_cluster_{ds_gene_module}_overrep_g_sets.pdf')

    except Exception as exc:
        # Best-effort: a cluster that cannot be analysed or plotted is skipped,
        # but the reason is reported instead of being silently dropped.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop
        # the loop.
        print(f'Skipping perturbed gene cluster {ds_gene_module}: '
              f'{type(exc).__name__}: {exc}')

    finally:
        # Release the figure even when saving failed, so figures do not pile
        # up in memory across iterations.
        if fig is not None:
            plt.close(fig)