In [None]:
# Notebook-wide IPython configuration.
# autoreload mode 2: re-import modules before running each cell, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Ask the inline matplotlib backend for 'retina' (high-DPI) figure output.
%config InlineBackend.figure_format='retina'

In [None]:
import os
import json

from tqdm import tqdm
import numpy as np
import scipy.spatial
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import anndata
import scanpy as sc

In [None]:
import gseapy as gp

In [None]:
# Folder that receives one over-representation bar plot (PDF) per cluster.
plot_output_path = 'hesc_sc_cluster_overrep_gene_set_plots'
# exist_ok keeps this cell re-runnable when the folder is already there.
os.makedirs(name=plot_output_path, exist_ok=True)

In [None]:
# List the gene-set library names gseapy reports as available, for reference
# when choosing the libraries loaded in the next cell.
enrichr_library_names = gp.get_library_name()
enrichr_library_names

In [None]:
# Fetch the four gene-set libraries used for the over-representation analysis.
# The calls run in the listed order and each result is bound to a variable
# named after its library.
GO_Biological_Process_2023, Reactome_2022, CORUM, KEGG_2021_Human = (
    gp.get_library(name=library_name, organism='human')
    for library_name in (
        'GO_Biological_Process_2023',
        'Reactome_2022',
        'CORUM',
        'KEGG_2021_Human',
    )
)

In [None]:
# Manual annotation: cluster id (as a string of the form '<a>_<b>') -> class label.
# Several clusters may share one label.
class_anno_map = {
    '0_0': 'non-targeting enriched',
    '10_0': 'non-targeting like',
    '11_0': 'upregulation of lipid biosynthesis',
    '12_0': 'upregulation of stress response',
    '13_0': 'non-targeting enriched',
    '14_0': 'pert cell cycle',
    '14_1': 'pert spliceosome',
    '14_2': 'pert mRNA-3 processing',
    '14_3': 'pert mRNA transcription',
    '14_4': 'pert mRNA transcription',
    '14_5': 'pert mRNA transcription',
    '15_0': 'germ layer differentiation',
    '15_1': 'germ layer differentiation',
    '15_10': 'germ layer differentiation',
    '15_11': 'germ layer differentiation',
    '15_12': 'germ layer differentiation',
    '15_13': 'germ layer differentiation',
    '15_14': 'mesenchymal differentiation',
    '15_2': 'germ layer differentiation',
    '15_3': 'germ layer differentiation',
    '15_4': 'germ layer differentiation',
    '15_5': 'non-targeting enriched',
    '15_6': 'germ layer differentiation',
    '15_7': 'germ layer differentiation',
    '15_8': 'germ layer differentiation',
    '15_9': 'germ layer differentiation',
    '16_0': 'low UMI count',
    '16_1': 'low UMI count',
    '16_2': 'low UMI count',
    '16_3': 'low UMI count',
    '16_4': 'low UMI count',
    '16_5': 'low UMI count',
    '16_6': 'low UMI count',
    '16_7': 'pert DBR1',
    '17_0': 'non-targeting enriched',
    '18_0': 'pert translation',
    '18_1': 'pert mTOR signaling',
    '18_10': 'pert translation',
    '18_11': 'pert translation',
    '18_12': 'pert translation',
    '18_2': 'pert translation',
    '18_3': 'pert translation',
    '18_4': 'pert translation',
    '18_5': 'pert mTOR signaling',
    '18_6': 'pert translation',
    '18_7': 'pert translation',
    '18_8': 'pert translation',
    '18_9': 'pert translation',
    '19_0': 'non-targeting enriched',
    '19_1': 'non-targeting like',
    '1_0': 'non-targeting enriched',
    '20_0': 'pert mRNA transcription',
    '20_1': 'pert mRNA transcription',
    '20_10': 'pert mRNA deadenylation',
    '20_11': 'pert mRNA transcription',
    '20_2': 'pert GNB2L1',
    '20_3': 'pert mRNA deadenylation',
    '20_4': 'pert mRNA transcription',
    '20_5': 'pert mRNA transcription',
    '20_6': 'pert mRNA transcription',
    '20_7': 'pert mRNA transcription',
    '20_8': 'pert mRNA transcription',
    '20_9': 'pert mRNA transcription',
    '21_0': 'non-targeting enriched',
    '21_1': 'pert DBR1',
    '22_0': 'non-targeting enriched',
    '23_0': 'pert ubiquitin E3 ligase',
    '23_1': 'pert protein neddylation',
    '24_0': 'low mito-genes',
    '24_1': 'low mito-genes',
    '24_2': 'upregulation of stress response',
    '25_0': 'mesenchymal differentiation',
    '25_1': 'mesenchymal differentiation',
    '25_2': 'mesenchymal differentiation',
    '25_3': 'low UMI count',
    '25_4': 'mesenchymal differentiation',
    '26_0': 'pert DBR1',
    '27_0': 'pert RNA methylation',
    '28_0': 'pert DNA damage checkpoint',
    '28_1': 'pert DNA damage checkpoint',
    '2_0': 'non-targeting enriched',
    '3_0': 'non-targeting enriched',
    '4_0': 'non-targeting enriched',
    '5_0': 'non-targeting enriched',
    '6_0': 'non-targeting enriched',
    '7_0': 'non-targeting enriched',
    '8_0': 'non-targeting enriched',
    '9_0': 'non-targeting enriched',
    '9_1': 'non-targeting enriched',
}

# Unique display name per cluster: '<class label>_<cluster id>'. Appending the
# id keeps clusters that share a class label distinguishable.
cluster_annotation_map = {
    cluster_id: f'{class_label}_{cluster_id}'
    for cluster_id, class_label in class_anno_map.items()
}

In [None]:
# Load the per-cluster gene enrichment table and attach the display name of
# each row's cluster (looked up from its 'cluster' value).
c_enrich_gene_df = pd.read_parquet(
    'enrichment_results/l2_c_enrich_gene.parquet'
).assign(
    cluster_name=lambda enrich_df: enrich_df['cluster'].map(cluster_annotation_map)
)
c_enrich_gene_df

In [None]:
# Over-representation analysis per cluster.
#
# For every cluster, select the perturbed genes that pass the enrichment
# thresholds below, test that gene list against the four gene-set libraries,
# and save a bar plot of the top terms per library as a PDF in
# `plot_output_path`. Clusters with no qualifying genes are skipped silently;
# clusters whose enrichment or plotting step raises are reported and skipped.

# Thresholds a row must exceed (strictly) / stay below (strictly) to count a
# perturbed gene as enriched in a cluster.
LOG2FC_MIN = 1
POSITIVE_COUNT_MIN = 2
PVAL_ADJ_MAX = 0.001

# Libraries passed to gp.enrichr, keyed by the label shown in the plot legend.
# Keeping names and libraries in one dict guarantees they stay aligned.
gene_set_libraries = {
    'GO_Biological_Process_2023': GO_Biological_Process_2023,
    'Reactome_2022': Reactome_2022,
    'CORUM': CORUM,
    'KEGG_2021_Human': KEGG_2021_Human,
}
# The results table identifies the supplied gene sets by position
# ('gs_ind_0', 'gs_ind_1', ...); this maps those ids back to library names.
gene_set_map = {
    f'gs_ind_{position}': library_name
    for position, library_name in enumerate(gene_set_libraries)
}

for c in np.unique(c_enrich_gene_df['cluster']):
    cluster_name = cluster_annotation_map[c]

    passes_thresholds = (
        (c_enrich_gene_df['cluster'] == c)
        & (c_enrich_gene_df['log2fc'] > LOG2FC_MIN)
        & (c_enrich_gene_df['positive_count'] > POSITIVE_COUNT_MIN)
        & (c_enrich_gene_df['pval_adj'] < PVAL_ADJ_MAX)
    )
    enriched_genes = list(
        c_enrich_gene_df.loc[passes_thresholds, 'perturbed_gene'].unique()
    )

    if len(enriched_genes) == 0:
        continue

    fig = None
    try:
        enr = gp.enrichr(
            gene_list=enriched_genes,
            gene_sets=list(gene_set_libraries.values()),
            organism='human',
            outdir=None,  # don't write to disk
        )
        enr.results['Gene_set'] = enr.results['Gene_set'].map(gene_set_map)

        ax = gp.barplot(
            enr.results,
            column="Adjusted P-value",
            group='Gene_set',
            size=10,
            top_term=4,
            figsize=(3, 5),
            color=['darkred', 'darkblue', 'darkgreen', 'orange'],  # set colors for group
        )
        ax.set_title(f'{cluster_name}')

        # Save through the figure that owns the axes rather than pyplot's
        # "current figure", so the right figure is written regardless of how
        # gseapy created it.
        fig = ax.figure
        fig.savefig(f'{plot_output_path}/hESC_sc_{cluster_name}_overrep_g_sets.pdf')
    except Exception as err:
        # Best effort: one failing cluster must not abort the whole run, but
        # the failure is reported instead of being swallowed silently.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        print(f'Skipping {cluster_name}: {type(err).__name__}: {err}')
    finally:
        # Always release the figure, including when saving failed, so figures
        # do not accumulate across iterations.
        if fig is not None:
            plt.close(fig)