In [None]:
import os
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import to_rgba
import seaborn as sns
import scipy.stats
from tqdm import tqdm

import anndata
import scanpy as sc

from scmg.preprocessing.data_standardization import GeneNameMapper

# Shared GeneNameMapper instance. Later in this notebook it relabels the global
# gene-correlation matrix via map_gene_names(..., 'human', 'human', 'id', 'name')
# (presumably gene IDs -> gene names; confirm against the scmg API).
gene_name_mapper = GeneNameMapper()

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide matplotlib defaults. These three are written before the scanpy
# call below, in the same order as individual assignments would apply them.
plt.rcParams.update({
    'figure.autolayout': False,
    'pdf.fonttype': 42,        # embed text as Type 42 (TrueType) fonts in PDFs
    'font.family': 'FreeSans',
})

# scanpy figure settings: vector-friendly output, 300 dpi for saved figures.
sc.set_figure_params(vector_friendly=True, dpi_save=300)

# Written after the scanpy call so this is the value left in effect.
plt.rcParams['axes.grid'] = False

In [None]:
# Directory that receives the figures saved by this notebook. It is created if
# missing and left as-is when it already exists.
plot_output_path = 'hesc_pseudobulk_plots'
os.makedirs(name=plot_output_path, exist_ok=True)

In [None]:
# Downstream-gene cluster table; the first CSV column becomes the index.
# The bare name on the last line renders the table as the cell output.
dg_cluster_df = pd.read_csv(
    'clustering/downstream_gene_clusters_hESC.csv',
    index_col=0,
)
dg_cluster_df

In [None]:
# Pseudo-bulk perturbation data: rows (obs) carry 'perturbed_gene_name',
# columns (var) carry 'gene_name'.
adata = sc.read_h5ad('/GPUData_xingjie/SCMG/hESC_perturb_seq/pseudo_bulk.h5ad')

# Cluster assignments for perturbed genes and for downstream genes; the first
# CSV column is used as the index in both tables.
pert_cluster_df = pd.read_csv('clustering/perturbed_gene_clusters_hESC.csv', index_col=0)
dg_cluster_df = pd.read_csv('clustering/downstream_gene_clusters_hESC.csv', index_col=0)

# Labels shared by the cluster tables and the AnnData object, kept in the order
# they appear in the cluster tables. A plain set intersection would order the
# string labels differently after every kernel restart (string hashing is
# randomised), which makes row/column order -- and any tie-breaking in later
# clustering -- irreproducible between runs.
pert_labels = pert_cluster_df.index.unique()
dg_labels = dg_cluster_df.index.unique()
common_perts = list(pert_labels[pert_labels.isin(adata.obs.index)])
common_dgs = list(dg_labels[dg_labels.isin(adata.var.index)])

# Restrict all three objects to the shared labels, in the same order.
pert_cluster_df = pert_cluster_df.loc[common_perts].copy()
dg_cluster_df = dg_cluster_df.loc[common_dgs].copy()
adata = adata[common_perts, common_dgs].copy()

# AnnData may store X as a sparse matrix, which the DataFrame constructor does
# not accept; densify in that case and pass a dense matrix through unchanged.
expression_matrix = adata.X.toarray() if hasattr(adata.X, 'toarray') else adata.X

# Perturbation x downstream-gene matrix labelled by gene names.
pert_df = pd.DataFrame(
    data=expression_matrix,
    index=list(adata.obs['perturbed_gene_name']),
    columns=list(adata.var['gene_name'])
)

# Gene x gene cosine similarity (1 - cosine distance). pert_df is transposed so
# that each downstream gene is compared through its values across perturbations.
downstream_gene_sim_df = pd.DataFrame(
    data=1 - scipy.spatial.distance.cdist(pert_df.values.T, pert_df.values.T, metric='cosine'),
    index=list(pert_df.columns),
    columns=list(pert_df.columns)
)

In [None]:
# Gene x gene correlation table read from the "direct" parquet file.
# Alternative input kept for reference (currently unused):
#   /GPUData_xingjie/Softwares/SCMG_dev/tests/manifold_generator/global_gene_correlation/gene_corr_df_measured.parquet
all_gene_corr_df = pd.read_parquet(
    '/GPUData_xingjie/Softwares/SCMG_dev/tests/manifold_generator/global_gene_correlation/direct_gene_corr_df_measured.parquet'
)

# Relabel both axes of a copy through the gene-name mapper, leaving the loaded
# table untouched. The same mapper arguments are used for rows and columns
# (presumably human gene IDs -> human gene names; confirm against the scmg API).
id_to_name_args = ('human', 'human', 'id', 'name')

named_all_gene_corr_df = all_gene_corr_df.copy()
named_all_gene_corr_df.index = gene_name_mapper.map_gene_names(
    named_all_gene_corr_df.index, *id_to_name_args)
named_all_gene_corr_df.columns = gene_name_mapper.map_gene_names(
    named_all_gene_corr_df.columns, *id_to_name_args)

In [None]:
# Downstream-gene cluster to visualise ('leiden' column of dg_cluster_df).
selected_cluster = 19
selected_genes = list(
    dg_cluster_df.loc[dg_cluster_df['leiden'] == selected_cluster, 'gene_name']
)

# Sub-matrix of the name-labelled global correlation table for these genes.
selected_gene_df = named_all_gene_corr_df.loc[selected_genes, selected_genes]

# Order the genes by average-linkage hierarchical clustering of their rows in
# the correlation sub-matrix (Euclidean distance), then refine the dendrogram
# with optimal leaf ordering.
Z_dg = scipy.cluster.hierarchy.linkage(
    selected_gene_df.values, method='average', metric='euclidean')
Z_dg_optimal = scipy.cluster.hierarchy.optimal_leaf_ordering(
    Z_dg, selected_gene_df.values, metric='euclidean')
optimal_downstream_gene_order = selected_gene_df.index.values[
    scipy.cluster.hierarchy.leaves_list(Z_dg_optimal)]

# Heatmap cells are one unit wide, so +0.5 centres each tick label on its cell.
tick_positions = np.arange(len(optimal_downstream_gene_order)) + 0.5

# Figure 1: global correlation among the cluster's genes (saved as PDF).
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(selected_gene_df.loc[optimal_downstream_gene_order, optimal_downstream_gene_order],
            center=0, cmap='PiYG_r', vmax=0.7, vmin=-0.7, ax=ax, rasterized=True,
            cbar_kws={'label': 'global manifold correlation'})
ax.set_xticks(tick_positions, optimal_downstream_gene_order, size=7)
ax.set_yticks(tick_positions, optimal_downstream_gene_order, size=7)
ax.set_title(f'Downstream gene cluster {selected_cluster}', size=20)

fig.savefig(os.path.join(plot_output_path, f'downstream_gene_cluster_{selected_cluster}_corr_mtx.pdf'))

# Figure 2: cosine similarity of the same genes across perturbations
# (downstream_gene_sim_df), drawn in the same gene order. It carries the same
# tick labels plus a title and colorbar label so it can be read on its own and
# compared cell-by-cell with figure 1. This figure is displayed only, not saved.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(downstream_gene_sim_df.loc[optimal_downstream_gene_order, optimal_downstream_gene_order],
            center=0, cmap='PuOr_r', vmax=1, vmin=-1, ax=ax,
            cbar_kws={'label': 'cosine similarity across perturbations'})
ax.set_xticks(tick_positions, optimal_downstream_gene_order, size=7)
ax.set_yticks(tick_positions, optimal_downstream_gene_order, size=7)
# Trailing semicolon keeps the Text repr out of the cell output.
ax.set_title(f'Downstream gene cluster {selected_cluster}: perturbation-response similarity', size=20);

In [None]:
# Display the gene names in the leaf order computed by the clustering cell above
# (the same order used for both heatmap axes).
optimal_downstream_gene_order

In [None]:
# Hand-curated gene modules: module label -> list of gene names.
# The numeric prefix of each label appears to be a downstream-gene cluster id
# (e.g. '19_...' alongside cluster 19 plotted above) -- TODO confirm.
gene_exp_modules = {
    '12_neuronal': [
        'ADGRL3', 'ANK2', 'MAGI2', 'NLGN1', 'GRID2', 'LINGO2', 'LRRTM4',
        'KCND2', 'DLGAP1', 'RBFOX1', 'CADM2', 'NRXN1', 'NRG3', 'GRIA4',
        'GABRB3', 'SYT1', 'RIMS2', 'MGAT4C', 'FGF12', 'PPP2R2B', 'ADCY2',
        'TMEM132D', 'ZMAT4', 'UNC5D', 'PCDH11X', 'EDIL3', 'NALF1',
        'HS3ST5', 'TRPM3', 'DST', 'FAT3', 'LARGE1', 'PRKG1', 'RASAL2',
        'KIAA0825', 'PTPN4', 'TOX', 'RYR2', 'CTNNA3', 'GPM6B', 'PHF21B',
    ],
    '17_exit_pluripotency': [
        'CECR2', 'NRF1', 'TCF4', 'ZFAND3', 'TRIO', 'KANSL1', 'EXOC4',
        'EXT1', 'MACF1', 'ARID1B', 'PLEKHA5', 'PTBP2', 'MBTD1', 'FBXO11',
        'ASH1L', 'ACACA', 'ERBIN', 'MKLN1', 'FNDC3B', 'SND1', 'INO80',
        'KDM2B', 'PTPN14', 'CTDSPL', 'GLG1', 'WAC',
    ],
    '17_immediate_early_gene': [
        'EGR1', 'FOS', 'IER2',
    ],
    '18_mesenchymal': [
        'COL4A1', 'COL4A2', 'IGFBP5', 'COL12A1', 'COL5A2', 'POSTN',
        'COL8A1', 'FRZB', 'CXCL14', 'LUM', 'CCN2', 'CCN1', 'IGFBP7',
        'TIMP3', 'PRSS23', 'ACTG2', 'LGALS1', 'VIM',
    ],
    '19_mesenchymal': [
        'FN1', 'FSTL1', 'MMP2', 'OLFML3',
        'COL1A1', 'COL1A2', 'SPARC', 'BGN', 'PDLIM7', 'TPM1', 'ACTA2',
        'TAGLN', 'MYL9', 'CALD1', 'CSRP1', 'FLNA', 'ITGB1', 'RGS5',
        'ANXA6', 'TUBB6', 'TPM4', 'TIMP1', 'IER3', 'ANXA2', 'ANXA1',
        'TAGLN2', 'ACTG1', 'MYL12B', 'ZNF428',
    ],
    '4_differentiation': [
        'PCDH10', 'SPOCK3', 'PCDH9', 'IL1RAPL1', 'LRP1B', 'CNTNAP2',
        'MAPK10', 'TTC3', 'MEIS2', 'EFNA5', 'MAML3', 'DACH1', 'SLIT2',
        'CADM1', 'CDH2', 'DCC', 'ERBB4', 'PCDH7', 'SOX5', 'SSBP2',
        'GREB1L', 'PRTG', 'ROR2', 'FBN2', 'CDH11', 'FLRT2', 'PAM', 'NRP2',
        'EBF2', 'ADAMTS6', 'DNAJC1', 'NRIP1', 'TLE4', 'ZEB2', 'ATP2B1',
        'TGFB1', 'VAMP8', 'S100A4', 'COTL1', 'CCDC167', 'GLIPR2', 'IFITM1',
        'SAMD3', 'EOMES', 'DNAJC15', 'CITED2', 'DCTN3', 'SVBP', 'CRABP1',
        'CRABP2', 'MACROH2A2', 'ID4', 'TUSC3', 'ENC1', 'MLLT11', 'CPE',
        'LHX1', 'CER1', 'CYP26A1', 'FGF8', 'FGF17', 'SIX6', 'SIX3',
        'CCKBR', 'GMPR', 'NPPB', 'MYL7', 'FLNC', 'DLK1', 'HAPLN1',
        'COL2A1', 'P3H2', 'LAMB1', 'WLS', 'GPC3', 'LRIG3', 'SEPTIN11',
        'MRC2', 'HAS2', 'TNC', 'SERPINE2', 'RBP1', 'PCOLCE', 'FBLN1',
        'BMP4', 'CYP1B1', 'GJA1', 'IGFBP3', 'TMEM88', 'SELENOP', 'TFPI',
        'GNG11', 'IFITM3', 'RHOC', 'CD151', 'SPRY1', 'PTGR1', 'DSP',
        'PERP', 'WFDC2',
    ],
    # NOTE(review): the '16_' and '9_' pluripotency lists below are identical
    # gene-for-gene -- confirm this is intended and not a copy-paste leftover.
    '16_pluripotency_marker': [
        'AP1S2', 'CTSC', 'PYCARD', 'ZFP36L2', 'PIM2', 'AKIRIN1', 'EBPL',
        'PEBP1', 'UGP2', 'ADM', 'ERBB2', 'ESRP1', 'CYP2S1', 'TGIF1',
        'CLDN7', 'CD9', 'CD24', 'MAD2L2', 'CDCA7L', 'PSIP1', 'PAICS',
        'HELLS', 'SEPHS1', 'TERF1', 'RBPMS2', 'DNMT3B', 'DPPA4', 'L1TD1',
        'POU5F1', 'FOXH1', 'ZSCAN10', 'PODXL', 'RRAS2', 'ETV4', 'TET1',
        'PHC1', 'USP44', 'VASH2', 'DPYSL3', 'SCG3', 'PTPRZ1', 'SOX2',
        'CNMD', 'MFGE8', 'JADE1', 'EIF2AK4', 'GPR176', 'VSNL1', 'SNRPN',
    ],
    '9_pluripotency_marker': [
        'AP1S2', 'CTSC', 'PYCARD', 'ZFP36L2', 'PIM2', 'AKIRIN1', 'EBPL',
        'PEBP1', 'UGP2', 'ADM', 'ERBB2', 'ESRP1', 'CYP2S1', 'TGIF1',
        'CLDN7', 'CD9', 'CD24', 'MAD2L2', 'CDCA7L', 'PSIP1', 'PAICS',
        'HELLS', 'SEPHS1', 'TERF1', 'RBPMS2', 'DNMT3B', 'DPPA4', 'L1TD1',
        'POU5F1', 'FOXH1', 'ZSCAN10', 'PODXL', 'RRAS2', 'ETV4', 'TET1',
        'PHC1', 'USP44', 'VASH2', 'DPYSL3', 'SCG3', 'PTPRZ1', 'SOX2',
        'CNMD', 'MFGE8', 'JADE1', 'EIF2AK4', 'GPR176', 'VSNL1', 'SNRPN',
    ],
    '7_p53_signaling': [
        'BAX', 'BBC3', 'CCNG1', 'RRM2B', 'SESN1', 'DDB2', 'MDM2', 'CDKN1A',
        'EI24',
    ],
    '6_Eukaryotic_Translation_Elongation': [
        'RPL4', 'RPLP1', 'EEF2', 'RPS15A', 'RPL30', 'RPS28', 'FAU', 'RPS7',
        'RPL27A', 'RPS3A', 'RPL14', 'RPL27', 'RPL37A', 'RPL7A', 'RPS10',
        'RPS6', 'RPL23A', 'RPL34', 'RPL6', 'RPS29', 'RPL10A', 'RPL32',
        'RPS8', 'RPS25', 'RPS13', 'RPS20', 'RPS4X', 'RPL36', 'RPS16',
        'RPS17', 'RPL3', 'RPL7', 'RPL13A', 'RPS9', 'RPL8', 'RPS19', 'RPS5',
        'RPL12', 'RPSA', 'EEF1D', 'RPL28', 'RPL39', 'EEF1B2', 'RPS2',
        'RPS15', 'RPL26', 'RPS14', 'RPL11', 'RPLP2',
    ],
    '3_Metabolism_Of_Lipids': [
        'ACLY', 'CYP51A1', 'DHCR24', 'DHCR7', 'ELOVL6', 'FABP3', 'FABP7',
        'FASN', 'FDFT1', 'FDPS', 'HMGCR', 'HMGCS1', 'HSD17B12', 'IDI1',
        'INSIG1', 'MSMO1', 'MVD', 'PCYT2', 'SC5D', 'SCD', 'SLC25A1',
        'SQLE', 'TM7SF2',
    ],
    '10_Cellular_Responses_To_Stress': [
        'ATF3', 'ATF4', 'ATF5', 'BAG1', 'CDK6', 'CEBPG', 'CHAC1', 'DDIT3',
        'EIF2S2', 'H1-0', 'HERPUD1', 'HMOX1', 'HSPA9', 'PHGDH', 'PPP1R15A',
        'PRDX5', 'RSL24D1', 'SESN2', 'SLC7A11', 'TRIB3',
    ],
    '15_Protein_processing_in_endoplasmic_reticulum': [
        'CALR', 'CANX', 'DNAJB11', 'HSP90B1', 'HSPA5', 'P4HB', 'PDIA3',
        'PDIA4', 'PDIA6', 'SEC61G', 'SEC62',
    ],
}

In [None]:
# Perturbed-gene names of the rows in pert_cluster_df whose 'leiden' value is 34,
# shown as a NumPy array.
np.array(pert_cluster_df.loc[pert_cluster_df['leiden'] == 34, 'perturbed_gene_name'])

In [None]:
# Gene names of the rows in dg_cluster_df whose 'leiden' value is 23,
# shown as a NumPy array.
np.array(dg_cluster_df.loc[dg_cluster_df['leiden'] == 23, 'gene_name'])