In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy
import scipy.stats
import seaborn as sns
from matplotlib.colors import to_rgba
from tqdm import tqdm

import anndata
import scanpy as sc

from scmg.preprocessing.data_standardization import GeneNameMapper

# Shared GeneNameMapper instance from scmg's data-standardization module.
# NOTE(review): not referenced by any of the cells visible here — confirm it is
# still needed before keeping it in the setup cell.
gene_name_mapper = GeneNameMapper()

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Global figure styling. The three matplotlib settings are independent of each
# other, so they are applied in a single update.
plt.rcParams.update({
    "figure.autolayout": False,
    'pdf.fonttype': 42,          # same setting as matplotlib.rc('pdf', fonttype=42)
    'font.family': 'FreeSans',
})

# scanpy applies its own figure defaults here ...
sc.set_figure_params(vector_friendly=True, dpi_save=300)

# ... so the grid is switched off afterwards (presumably scanpy's defaults
# would otherwise turn it on — keep this line after the scanpy call).
plt.rcParams['axes.grid'] = False

In [None]:
# Output folders (relative to the working directory) for figures and tables.
plot_output_path = 'hesc_pseudobulk_plots'
table_output_path = 'hesc_pseudobulk_tables'

# Create both folders up front; existing folders are left untouched.
for output_dir in (plot_output_path, table_output_path):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Input AnnData file for this analysis (hESC Perturb-seq pseudo-bulk, going by
# the file name).
# Alternative input kept for reference:
#adata = sc.read_h5ad('/GPUData_xingjie/SCMG/perturbation_data/ReplogleWeissman2022_K562_gwps.h5ad')
pseudo_bulk_h5ad = '/GPUData_xingjie/SCMG/hESC_perturb_seq/pseudo_bulk.h5ad'
adata = sc.read_h5ad(pseudo_bulk_h5ad)

# Show the AnnData summary as the cell output.
adata

In [None]:
# Cluster assignment tables for perturbed genes and downstream genes; the first
# CSV column is used as the index and is matched against adata.obs / adata.var
# index labels below.
pert_cluster_df = pd.read_csv('clustering/perturbed_gene_clusters_hESC.csv', index_col=0)
dg_cluster_df = pd.read_csv('clustering/downstream_gene_clusters_hESC.csv', index_col=0)

# Keep only entries present in both the cluster tables and the AnnData object.
# The intersection is sorted because plain set iteration order for strings
# depends on the per-process hash seed; without sorting, the row/column order
# of `adata` (and everything derived from it) could change between kernel
# restarts.
common_perts = sorted(set(pert_cluster_df.index) & set(adata.obs.index))
common_dgs = sorted(set(dg_cluster_df.index) & set(adata.var.index))

# Restrict all three objects to the shared entries, in the same order.
pert_cluster_df = pert_cluster_df.loc[common_perts].copy()
dg_cluster_df = dg_cluster_df.loc[common_dgs].copy()
adata = adata[common_perts, common_dgs].copy()

In [None]:
# Expression matrix as a labelled DataFrame: rows are labelled with the
# 'perturbed_gene_name' column of adata.obs and columns with the 'gene_name'
# column of adata.var.
perturbed_gene_labels = list(adata.obs['perturbed_gene_name'])
downstream_gene_labels = list(adata.var['gene_name'])

pert_df = pd.DataFrame(adata.X, index=perturbed_gene_labels, columns=downstream_gene_labels)

In [None]:
# NOTE: both selections made here are reassigned by the following cell.

# Perturbed genes: all of them (a cluster-based subset is kept below for reference).
#selected_p_genes = list(pert_cluster_df[pert_cluster_df['leiden'].isin([8, 18, 20, 32, 17, 3, 13])]['perturbed_gene_name'])
selected_p_genes = pert_df.index.tolist()

# Downstream genes: members of leiden cluster 11 in the downstream-gene table.
in_selected_cluster = dg_cluster_df['leiden'].isin([11])
selected_d_genes = dg_cluster_df.loc[in_selected_cluster, 'gene_name'].tolist()

In [None]:
# Perturbed genes: every row of pert_df.
selected_p_genes = pert_df.index.tolist()

# Downstream genes: a fixed, hand-specified list (order preserved).
selected_d_genes = [
    'AP1S2', 'CTSC', 'PYCARD', 'ZFP36L2', 'PIM2', 'AKIRIN1', 'EBPL',
    'PEBP1', 'UGP2', 'ADM', 'ERBB2', 'ESRP1', 'CYP2S1', 'TGIF1',
    'CLDN7', 'CD9', 'CD24', 'MAD2L2', 'CDCA7L', 'PSIP1', 'PAICS',
    'HELLS', 'SEPHS1', 'TERF1', 'RBPMS2', 'DNMT3B', 'DPPA4', 'L1TD1',
    'POU5F1', 'FOXH1', 'ZSCAN10', 'PODXL', 'RRAS2', 'ETV4', 'TET1',
    'PHC1', 'USP44', 'VASH2', 'DPYSL3', 'SCG3', 'PTPRZ1', 'SOX2',
    'CNMD', 'MFGE8', 'JADE1', 'EIF2AK4', 'GPR176', 'VSNL1', 'SNRPN',
]

In [None]:
# Sub-matrix: selected perturbed genes (rows) x selected downstream genes (columns).
selected_pert_df = pert_df.loc[selected_p_genes, selected_d_genes].copy()

# Order rows and columns by their summed values, ascending.
per_perturbation_total = selected_pert_df.sum(axis=1)
per_downstream_total = selected_pert_df.sum(axis=0)
optimal_pert_gene_order = per_perturbation_total.sort_values().index.tolist()
optimal_downstream_gene_order = per_downstream_total.sort_values().index.tolist()

# Alternative ordering by hierarchical clustering with optimal leaf ordering,
# kept for reference:
#Z_pg = scipy.cluster.hierarchy.linkage(selected_pert_df.values, method='average', metric='euclidean')
#Z_pg_optimal = scipy.cluster.hierarchy.optimal_leaf_ordering(Z_pg, selected_pert_df.values, metric='euclidean')
#optimal_pert_gene_order = selected_pert_df.index.values[scipy.cluster.hierarchy.leaves_list(Z_pg_optimal)]
#
#Z_dg = scipy.cluster.hierarchy.linkage(selected_pert_df.T.values, method='average', metric='euclidean')
#Z_dg_optimal = scipy.cluster.hierarchy.optimal_leaf_ordering(Z_dg, selected_pert_df.T.values, metric='euclidean')
#optimal_downstream_gene_order = selected_pert_df.T.index.values[scipy.cluster.hierarchy.leaves_list(Z_dg_optimal)]

In [None]:
# Overview heatmap of the selected sub-matrix in the sum-based row/column order,
# with a diverging colour scale centred on 0 and clipped to [-0.5, 0.5].
ordered_shift_df = selected_pert_df.loc[optimal_pert_gene_order, optimal_downstream_gene_order]

fig, ax = plt.subplots(figsize=(22, 20))
sns.heatmap(ordered_shift_df, ax=ax, cmap='seismic', center=0, vmin=-0.5, vmax=0.5)

In [None]:
import gseapy as gp
# Fetch the four human gene-set libraries through gseapy, in the same order as
# the names listed here; each result is bound to a variable named after its
# library.
(GO_Biological_Process_2023, Reactome_2022, CORUM, KEGG_2021_Human) = (
    gp.get_library(name=library_name, organism='human')
    for library_name in (
        'GO_Biological_Process_2023',
        'Reactome_2022',
        'CORUM',
        'KEGG_2021_Human',
    )
)

In [None]:
# Ranking table for the pre-ranked enrichment: one row per perturbed gene
# (index 'gene'), with 'rank' set to the negated row sum of selected_pert_df.
summed_shift = selected_pert_df.sum(axis=1).loc[optimal_pert_gene_order]

rnk_df = pd.DataFrame(
    {'gene': optimal_pert_gene_order, 'rank': -summed_shift.to_numpy()}
).set_index('gene')

rnk_df

In [None]:
# Pre-ranked enrichment of the perturbed-gene ranking against all four libraries.
gene_set_libraries = [GO_Biological_Process_2023, Reactome_2022, CORUM, KEGG_2021_Human]

prerank_options = dict(
    threads=4,
    min_size=5,
    max_size=1000,
    permutation_num=1000,  # reduce number to speed up testing
    outdir=None,           # don't write to disk
    seed=6,
    verbose=True,          # see what's going on behind the scenes
)

pre_res = gp.prerank(rnk=rnk_df, gene_sets=gene_set_libraries, **prerank_options)

In [None]:
# Write the enrichment table, sorted by NES (ascending), to the tables folder
# and show the same sorted table as the cell output.
results_by_nes = pre_res.res2d.sort_values('NES')
results_csv_path = os.path.join(table_output_path, 'regulatory_submodules_16_pluripotency_markers.csv')

results_by_nes.to_csv(results_csv_path)
results_by_nes

In [None]:
# First 60 rows (in the table's existing order) with a positive NES.
has_positive_nes = pre_res.res2d['NES'] > 0
pre_res.res2d.loc[has_positive_nes].head(60)

In [None]:
# First 20 rows (in the table's existing order) with a negative NES.
has_negative_nes = pre_res.res2d['NES'] < 0
pre_res.res2d.loc[has_negative_nes].head(20)

In [None]:
# Regulatory module = the 'Lead_genes' of one row of the enrichment results.
# NOTE(review): 43 is a hard-coded row label of pre_res.res2d. The y-axis label
# and output file name below treat that row as the Mediator complex term —
# confirm the row still refers to that term whenever the enrichment is re-run.
reg_module_row = 43
reg_module_genes = pre_res.res2d.loc[reg_module_row, 'Lead_genes'].split(';')

# Profiles of the module's perturbed genes, extracted once and reused below.
reg_module_df = selected_pert_df.loc[reg_module_genes]
reg_module_profiles = reg_module_df.values

# Order the module genes by average-linkage hierarchical clustering (euclidean
# distance) with optimal leaf ordering.
# Note: this rebinds optimal_pert_gene_order, which earlier cells set to the
# sum-based ordering of all perturbed genes.
Z_pg = scipy.cluster.hierarchy.linkage(reg_module_profiles, method='average', metric='euclidean')
Z_pg_optimal = scipy.cluster.hierarchy.optimal_leaf_ordering(Z_pg, reg_module_profiles, metric='euclidean')
optimal_pert_gene_order = reg_module_df.index.values[scipy.cluster.hierarchy.leaves_list(Z_pg_optimal)]

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(selected_pert_df.loc[optimal_pert_gene_order, optimal_downstream_gene_order],
            center=0, cmap='seismic', vmax=0.5, vmin=-0.5, ax=ax, rasterized=True,
            cbar_kws={'label': 'gene expression shift'})

# Label every row and column; ticks sit at cell centres (+0.5).
ax.set_xticks(np.arange(len(optimal_downstream_gene_order)) + 0.5, optimal_downstream_gene_order, size=7)
ax.set_yticks(np.arange(len(optimal_pert_gene_order)) + 0.5, optimal_pert_gene_order, size=7)

ax.set_xlabel('Pluripotency markers')
ax.set_ylabel('Mediator complex')

fig.savefig(os.path.join(plot_output_path, 'regulatory_submodules_16_pluripotency_markers_mediator_complex.pdf'))
