In [None]:
# Enable the autoreload extension; mode 2 re-imports all modules before each cell runs,
# so edits to local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Ask the inline matplotlib backend for high-resolution ("retina") figure output.
%config InlineBackend.figure_format='retina'

In [None]:
import os
import json

from tqdm import tqdm
import numpy as np
import scipy.spatial
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import anndata
import scanpy as sc

In [None]:
# Directory that receives every figure saved by this notebook (used in the savefig calls below).
plot_output_path = 'hesc_sc_analysis_plots'
# exist_ok=True keeps this cell re-runnable when the directory is already there.
os.makedirs(plot_output_path, exist_ok=True)

In [None]:
# Guide-level enrichment table; the columns used below are 'class_anno',
# 'perturbed_gene', 'log2fc', 'positive_count' and 'pval_adj'.
guide_enrichment_file = 'enrichment_results/class_anno_enrich_guide.parquet'
c_enrich_guide_df = pd.read_parquet(guide_enrichment_file)
c_enrich_guide_df

In [None]:
# Relabel one annotated class so it matches the name used in the rest of the notebook.
class_anno_renames = {
    'upregulation of cholesterol biosynthesis': 'upregulation of lipid biosynthesis',
}
c_enrich_guide_df['class_anno'] = c_enrich_guide_df['class_anno'].replace(class_anno_renames)

In [None]:
# Collect, for every annotated class, the perturbed genes that are enriched in it.
class_anno_enrich_genes_dict = {}
for class_name in np.unique(c_enrich_guide_df['class_anno']):
    # All guide-level rows belonging to this class.
    class_rows = c_enrich_guide_df[c_enrich_guide_df['class_anno'] == class_name]

    # Step 1 (lenient): count, per gene, the rows with positive log2fc and more than
    # one positive cell, ignoring non-targeting controls. A gene passes only if it
    # has at least two such rows.
    lenient_mask = (
        (class_rows['log2fc'] > 0)
        & (class_rows['positive_count'] > 1)
        & (class_rows['perturbed_gene'] != 'non-targeting')
    )
    guides_per_gene = class_rows.loc[lenient_mask, 'perturbed_gene'].value_counts()
    multi_guide_genes = guides_per_gene[guides_per_gene > 1].index

    # Step 2 (strict): among the pre-selected genes, keep those with at least one
    # significantly enriched row (log2fc > 1, > 2 positive cells, adjusted p < 0.001).
    strict_mask = (
        (class_rows['log2fc'] > 1)
        & (class_rows['positive_count'] > 2)
        & (class_rows['pval_adj'] < 0.001)
        & class_rows['perturbed_gene'].isin(multi_guide_genes)
    )
    class_anno_enrich_genes_dict[class_name] = class_rows.loc[strict_mask, 'perturbed_gene'].unique()

In [None]:
# Fixed display order of the annotated classes for the bar chart.
# NOTE(review): this list appears to have been produced once by sorting the keys of
# class_anno_enrich_genes_dict by their number of enriched genes (ascending, via
# np.argsort) and then pasted in; it is not recomputed when the data changes.
anno_class_order = [
    'non-targeting enriched',
    'pert GNB2L1',
    'pert DBR1',
    'pert protein neddylation',
    'pert mRNA deadenylation',
    'pert mRNA-3 processing',
    'non-targeting like',
    'pert ubiquitin E3 ligase',
    'pert RNA methylation',
    'pert mTOR signaling',
    'germ layer differentiation',
    'pert DNA damage checkpoint',
    'low mito-genes',
    'pert mRNA transcription',
    'pert translation',
    'low UMI count',
    'mesenchymal differentiation',
    'pert spliceosome',
    'pert cell cycle',
    'upregulation of stress response',
    'upregulation of lipid biosynthesis',
]

# Number of enriched genes per class, aligned with anno_class_order.
anno_class_sizes = np.array(
    [len(class_anno_enrich_genes_dict[class_name]) for class_name in anno_class_order]
)

In [None]:
# Horizontal bar chart of the number of enriched perturbed genes per annotated
# cell class, in the order given by anno_class_order.
fig, ax = plt.subplots(figsize=(3, 5), dpi=200)

ax.barh(anno_class_order, anno_class_sizes, color='grey')
# Axis label spelling fixed ("peturbations" -> "perturbations"); it is written into the saved PDF.
ax.set_xlabel('Number of enriched perturbations')
ax.set_ylabel('Cell class')

fig.savefig(f'{plot_output_path}/perturb_class_enriched_gene_count.pdf', bbox_inches='tight')

In [None]:
# Manual annotation of every cluster id ('<level-1>_<level-2>') with a class label.
class_anno_map = {
    '0_0': 'non-targeting enriched',
    '10_0': 'non-targeting like',
    '11_0': 'upregulation of lipid biosynthesis',
    '12_0': 'upregulation of stress response',
    '13_0': 'non-targeting enriched',
    '14_0': 'pert cell cycle',
    '14_1': 'pert spliceosome',
    '14_2': 'pert mRNA-3 processing',
    '14_3': 'pert mRNA transcription',
    '14_4': 'pert mRNA transcription',
    '14_5': 'pert mRNA transcription',
    '15_0': 'germ layer differentiation',
    '15_1': 'germ layer differentiation',
    '15_10': 'germ layer differentiation',
    '15_11': 'germ layer differentiation',
    '15_12': 'germ layer differentiation',
    '15_13': 'germ layer differentiation',
    '15_14': 'mesenchymal differentiation',
    '15_2': 'germ layer differentiation',
    '15_3': 'germ layer differentiation',
    '15_4': 'germ layer differentiation',
    '15_5': 'non-targeting enriched',
    '15_6': 'germ layer differentiation',
    '15_7': 'germ layer differentiation',
    '15_8': 'germ layer differentiation',
    '15_9': 'germ layer differentiation',
    '16_0': 'low UMI count',
    '16_1': 'low UMI count',
    '16_2': 'low UMI count',
    '16_3': 'low UMI count',
    '16_4': 'low UMI count',
    '16_5': 'low UMI count',
    '16_6': 'low UMI count',
    '16_7': 'pert DBR1',
    '17_0': 'non-targeting enriched',
    '18_0': 'pert translation',
    '18_1': 'pert mTOR signaling',
    '18_10': 'pert translation',
    '18_11': 'pert translation',
    '18_12': 'pert translation',
    '18_2': 'pert translation',
    '18_3': 'pert translation',
    '18_4': 'pert translation',
    '18_5': 'pert mTOR signaling',
    '18_6': 'pert translation',
    '18_7': 'pert translation',
    '18_8': 'pert translation',
    '18_9': 'pert translation',
    '19_0': 'non-targeting enriched',
    '19_1': 'non-targeting like',
    '1_0': 'non-targeting enriched',
    '20_0': 'pert mRNA transcription',
    '20_1': 'pert mRNA transcription',
    '20_10': 'pert mRNA deadenylation',
    '20_11': 'pert mRNA transcription',
    '20_2': 'pert GNB2L1',
    '20_3': 'pert mRNA deadenylation',
    '20_4': 'pert mRNA transcription',
    '20_5': 'pert mRNA transcription',
    '20_6': 'pert mRNA transcription',
    '20_7': 'pert mRNA transcription',
    '20_8': 'pert mRNA transcription',
    '20_9': 'pert mRNA transcription',
    '21_0': 'non-targeting enriched',
    '21_1': 'pert DBR1',
    '22_0': 'non-targeting enriched',
    '23_0': 'pert ubiquitin E3 ligase',
    '23_1': 'pert protein neddylation',
    '24_0': 'low mito-genes',
    '24_1': 'low mito-genes',
    '24_2': 'upregulation of stress response',
    '25_0': 'mesenchymal differentiation',
    '25_1': 'mesenchymal differentiation',
    '25_2': 'mesenchymal differentiation',
    '25_3': 'low UMI count',
    '25_4': 'mesenchymal differentiation',
    '26_0': 'pert DBR1',
    '27_0': 'pert RNA methylation',
    '28_0': 'pert DNA damage checkpoint',
    '28_1': 'pert DNA damage checkpoint',
    '2_0': 'non-targeting enriched',
    '3_0': 'non-targeting enriched',
    '4_0': 'non-targeting enriched',
    '5_0': 'non-targeting enriched',
    '6_0': 'non-targeting enriched',
    '7_0': 'non-targeting enriched',
    '8_0': 'non-targeting enriched',
    '9_0': 'non-targeting enriched',
    '9_1': 'non-targeting enriched',
}

# Unique display name per cluster: '<class label>_<cluster id>'.
cluster_annotation_map = {
    cluster_id: f'{class_label}_{cluster_id}'
    for cluster_id, class_label in class_anno_map.items()
}

In [None]:
# Gene-level enrichment table, extended with a readable cluster name
# ('<class label>_<cluster id>') looked up from cluster_annotation_map.
gene_enrichment_raw = pd.read_parquet('enrichment_results/l2_c_enrich_gene.parquet')
c_enrich_gene_df = gene_enrichment_raw.assign(
    cluster_name=gene_enrichment_raw['cluster'].map(cluster_annotation_map)
)
c_enrich_gene_df

In [None]:
# Genes enriched in either of the two differentiation classes (sorted, de-duplicated).
differentiation_classes = ['germ layer differentiation', 'mesenchymal differentiation']
selected_genes = np.unique([
    gene
    for class_name in differentiation_classes
    for gene in class_anno_enrich_genes_dict[class_name]
])

# Differentiation clusters to display, named '<class label>_<cluster id>'.
selected_clusters = [
    'germ layer differentiation_15_0',
    'germ layer differentiation_15_12',
    'germ layer differentiation_15_9',
    'germ layer differentiation_15_10',
    'germ layer differentiation_15_11',
    'germ layer differentiation_15_7',
    'germ layer differentiation_15_1',
    'germ layer differentiation_15_2',
    'germ layer differentiation_15_8',
    'germ layer differentiation_15_13',
    'germ layer differentiation_15_3',
    'germ layer differentiation_15_6',
    'germ layer differentiation_15_4',
    'mesenchymal differentiation_15_14',
    'mesenchymal differentiation_25_1',
    'mesenchymal differentiation_25_0',
    'mesenchymal differentiation_25_2',
    'mesenchymal differentiation_25_4',
]

# Keep only the (gene, cluster) rows that take part in the plot.
in_selection = (
    c_enrich_gene_df['perturbed_gene'].isin(selected_genes)
    & c_enrich_gene_df['cluster_name'].isin(selected_clusters)
)
selected_cep_df = c_enrich_gene_df[in_selection]

# Reshape to gene x cluster matrices.
pivot_axes = dict(index='perturbed_gene', columns='cluster_name')
log2fc_df = selected_cep_df.pivot(values='log2fc', **pivot_axes)
pval_df = selected_cep_df.pivot(values='pval_adj', **pivot_axes)

# The 1e-20 offset keeps log10 finite for p == 0 and caps -log10(p) at 20.
mlog_pval_df = -np.log10(pval_df.add(1e-20))

In [None]:
#pre_ordered_genes = [
#    'MED19', 'SP1', 'SUPT20H', 'NANOG', 'SOX2', 'POU5F1',
#
#    'HARS', 'TARS', 'RRP9', 'GPN3', 
#
#    'CHAF1B', 'MMS22L', 'RNGTT', 'BCL2L1', 'PHB', 'REV3L', 'BRCA2',
#    'C22orf15', 'EIF2B4', 'MNAT1', 'TANGO6', 'BRIP1', 'SDE2', 'POLR2M',
#    'SMNDC1', 'EIF2B5', 'RNF214', 'PPIE', 'EXOC3', 
#
#    'MRP63', 'RPP14', 'PDCD11', 'DDX56', 'UBE2T',
#    'DDX21', 'MED22', 'HEATR1', 'ZC3H8', 'BRIX1',
#    'EIF2B3', 'ZNF574', 'SKA3', 'CCNH', 'SKA1', 'IPO7', 'TSEN2',
#    'FOXD3', 'EIF2S2', 'PDCD7', 'CENPC', 
#]
#
#mlog_pval_df.loc[~mlog_pval_df.index.isin(pre_ordered_genes), :
#              ]['mesenchymal differentiation_25_2'].sort_values(ascending=False).index.values

In [None]:
# Manually curated display order of the perturbed genes (columns of the dot plot).
# This definition now precedes the coverage check below: the check used to live in
# an earlier cell and raised NameError on a fresh "Restart & Run All" because
# gene_order did not exist yet.
gene_order = [
    'MED19', 'SP1', 'SUPT20H', 'NANOG', 'SOX2', 'POU5F1',

    'HARS', 'TARS', 'RPP14', 'RPP30', 'TSEN2',

    'RRP9', 'PDCD11', 'DDX56', 'DDX21', 'HEATR1', 'BRIX1', 'MRP63', 'RCL1', 'KRR1',

    'EIF2B4', 'EIF2B5', 'EIF2S2', 'EIF2B3',

    'RNF214', 'UBE2T', 'SHFM1',

    'GEMIN5', 'SMNDC1', 'PDCD7', 'PPIE', 'RNGTT', 'EXOSC10',

    'GPN3', 'POLR2M', 'TAF6', 'MED22', 'ZC3H8', 'ZNF574', 'FOXD3',

    'SKA1', 'SKA3',
    'CENPI', 'CCNH', 'MNAT1', 'CHAF1B', 'TIMELESS',

    'MMS22L', 'BRCA2', 'BRIP1', 'SDE2', 'REV3L', 'RFWD3',

    'TANGO6', 'EXOC3', 'DERL2',

    'PHB', 'IPO7', 'BCL2L1', 'C22orf15',

    'MTBP', 'NCAPH',
]

# Display order of the differentiation clusters (rows of the dot plot).
cluster_order = [
    'germ layer differentiation_15_0',
    'germ layer differentiation_15_12',
    'germ layer differentiation_15_9',
    'germ layer differentiation_15_10',
    'germ layer differentiation_15_11',
    'germ layer differentiation_15_7',
    'germ layer differentiation_15_1',
    'germ layer differentiation_15_2',
    'germ layer differentiation_15_8',
    'germ layer differentiation_15_13',
    'germ layer differentiation_15_3',
    'germ layer differentiation_15_6',
    'germ layer differentiation_15_4',
    'mesenchymal differentiation_15_14',
    'mesenchymal differentiation_25_1',
    'mesenchymal differentiation_25_0',
    'mesenchymal differentiation_25_2',
    'mesenchymal differentiation_25_4',
]

# Selected genes that have not been placed in gene_order; they are left out of the
# matrices built below and therefore out of the plot.
genes_missing_from_order = np.setdiff1d(selected_genes, gene_order)

# Reorder to the curated layout and transpose to cluster x gene.
# NOTE(review): label-based .loc is expected to raise KeyError if gene_order or
# cluster_order names something absent from the pivoted tables - confirm that is
# the desired failure mode.
log2fc_df_to_show = log2fc_df.loc[gene_order, cluster_order].T
mlog_pval_df_to_show = mlog_pval_df.loc[gene_order, cluster_order].T

# Displayed as the cell output, as the original stand-alone check was.
genes_missing_from_order

In [None]:
# NOTE(review): these imports would normally sit in the top import cell; they are
# kept here so this cell stays runnable on its own.
import sys
sys.path.append('../')
from scattermap import scattermap

fig, ax = plt.subplots(figsize=(25, 8))

# Dot size encodes significance: 7 * (-log10 adjusted p) + 0.5; colour encodes log2 fold change.
dot_sizes = 7 * mlog_pval_df_to_show.values.astype(float) + 0.5
ax = scattermap(
    log2fc_df_to_show,
    marker_size=dot_sizes,
    vmin=-6,
    vmax=6,
    cmap='coolwarm',
    linewidths=0.2,
    linecolor='black',
    ax=ax,
    cbar_kws={'shrink': 0.5, 'anchor': (0, 0.7)},
)

ax.tick_params(axis='both', which='major', labelsize=7)
# presumably the second axes of the figure is the colour bar added by scattermap - TODO confirm
colorbar_ax = ax.figure.axes[1]
colorbar_ax.tick_params(axis="y", labelsize=7)
colorbar_ax.set_ylabel('log2 fold change', fontsize=7)

# Dot-size legend: dummy points drawn at (-1, -1), sized with the same
# 7 * x + 0.5 rule as the data for p = 1e-20, 1e-10 and 1e-3.
for neg_log10_p in (20, 10, 3):
    ax.scatter(
        -1, -1,
        label=f'$10^{{-{neg_log10_p}}}$',
        marker="o", linewidths=0, c="grey",
        s=7 * neg_log10_p + 0.5,
    )
leg = ax.legend(loc="upper left", bbox_to_anchor=(1, 0.2), fontsize=7)
leg.set_title('adjusted p-val', prop={'size': 7})

fig.savefig(
    f'{plot_output_path}/perturbed_genes_enriched_in_differentiation_clusters_dot_plot.pdf',
    bbox_inches='tight',
)

In [None]:
# Free-text functional notes for perturbed genes, keyed by gene symbol.
# Blank lines separate the groups used in the original layout.
mesenchymal_diff_gene_annotation = {
    'HARS': 'tRNA synthetase',
    'TARS': 'tRNA synthetase',
    'RPP14': 'tRNA biogenesis',
    'TSEN2': 'tRNA splicing',

    'RRP9': 'Ribosome biogenesis',
    'PDCD11': 'Ribosome biogenesis',
    'DDX56': 'Ribosome biogenesis',
    'DDX21': 'Ribosome biogenesis',
    'HEATR1': 'Ribosome biogenesis',
    'BRIX1': 'Ribosome biogenesis',
    'RCL1': 'Ribosome biogenesis',
    'KRR1': 'Ribosome biogenesis',
    'MRP63': 'Mitochondrial Ribosomal Protein',

    'EIF2B4': 'Translation initiation',
    'EIF2B3': 'Translation initiation',
    'EIF2B5': 'Translation initiation',
    'EIF2S2': 'Translation initiation',

    'SKA1': 'chromosome segregation in mitosis',
    'SKA3': 'chromosome segregation in mitosis',
    'CENPC': 'chromosome segregation in mitosis',
    'CENPI': 'chromosome segregation in mitosis',
    'CCNH': 'Cyclin H',
    'MNAT1': 'forms the CDK-activating kinase (CAK) enzymatic complex',
    'CHAF1B': 'assembly of histone octamers onto newly-replicated DNA',
    'TIMELESS': 'Plays an important role in the control of DNA replication',

    'SMNDC1': 'Involved in spliceosome assembly',
    'PDCD7': 'a component of the minor U12-type spliceosome',
    'PPIE': 'Involved in pre-mRNA splicing',
    'GEMIN5': 'splicing of cellular pre-mRNAs',
    'RNGTT': 'mRNA-capping enzyme',

    'GPN3': 'Small GTPase required for proper localization of RNA polymerase II',
    'POLR2M': 'a subunit of RNA polymerase II',
    'TAF6': 'initiation of Pol II transcription',
    'MED22': 'component of the Mediator complex',
    'ZC3H8': 'transcriptional repressor of the GATA3 promoter',
    'ZNF574': 'May be involved in transcriptional regulation',
    'FOXD3': 'TF',

    'MMS22L': 'promotes homologous recombination-mediated repair of double-strand breaks',
    'BRCA2': 'double-strand break repair',
    'BRIP1': 'maintenance of chromosomal stability',
    'RFWD3': 'response to DNA damage',
    'SDE2': 'cellular response to UV; mitotic G1 DNA damage checkpoint signaling; and protein ubiquitination',
    'REV3L': 'catalytic subunit of DNA polymerase zeta',

    'RNF214': 'Predicted to enable ubiquitin-protein transferase activity',
    'UBE2T': 'Ubiquitin conjugating enzyme',
    'SHFM1': 'Component of the 26S proteasome',

    'TANGO6': 'Predicted to be involved in protein secretion',
    'EXOC3': 'docking of exocytic vesicles with fusion sites on the plasma membrane',
    'DERL2': 'degradation of misfolded glycoproteins in the ER',

    'IPO7': 'nuclear protein import',
    'PHB': 'pleiotropic, mitochondrial chaperone, and transcriptional co-regulator',
    'BCL2L1': 'inhibitor of cell death',
    'C22orf15': '',  # no annotation recorded
}

In [None]:
# Translation-initiation (EIF*) genes to show, in plotting order.
# NOTE: this rebinds selected_genes / selected_clusters and the pivot tables from
# the differentiation analysis above; the cells below read the EIF selection.
# Gene sets explored earlier and currently disabled:
#   'HARS', 'TARS', 'RPP14', 'TSEN2',
#   'RRP9', 'PDCD11', 'DDX56', 'DDX21', 'HEATR1', 'BRIX1', 'RCL1', 'KRR1', 'MRP63',
#   'EIF2B4', 'EIF2B3', 'EIF2B5', 'EIF2S2',
#   'SKA1', 'SKA3', 'CENPC', 'CENPI', 'CCNH', 'MNAT1', 'CHAF1B',
#   'SMNDC1', 'PDCD7', 'PPIE', 'GEMIN5', 'RNGTT',
selected_genes = [
    'EIF1AX', 'EIF2B2', 'EIF2S2', 'EIF2S3', 'EIF4A2', 'EIF4E', 'EIF4G1',
    'EIF5A', 'EIF5B',
    'EIF3J', 'EIF3I', 'EIF3A', 'EIF3C', 'EIF3CL', 'EIF3D', 'EIF3F',
    'EIF3M', 'EIF3G', 'EIF4A1', 'EIF3H',
    'EIF4G2', 'EIF2B3', 'EIF2B4', 'EIF2B5', 'EIF6',
]

# Clusters to show, in plotting order.
selected_clusters = [
    'upregulation of stress response_12_0',
    'pert cell cycle_14_0',
    'pert mTOR signaling_18_1',

    'pert translation_18_3',
    'pert translation_18_0',
    'pert translation_18_4',

    'pert translation_18_6',
    'pert translation_18_2',

    'pert translation_18_7',
    'pert translation_18_12',

    'mesenchymal differentiation_25_0',
    'mesenchymal differentiation_25_1',
]

# Restrict the gene-level enrichment table to the chosen genes and clusters.
in_selection = (
    c_enrich_gene_df['perturbed_gene'].isin(selected_genes)
    & c_enrich_gene_df['cluster_name'].isin(selected_clusters)
)
selected_cep_df = c_enrich_gene_df[in_selection]

# Gene x cluster matrices; the 1e-20 offset caps -log10(p) at 20.
pivot_axes = dict(index='perturbed_gene', columns='cluster_name')
log2fc_df = selected_cep_df.pivot(values='log2fc', **pivot_axes)
pval_df = selected_cep_df.pivot(values='pval_adj', **pivot_axes)
mlog_pval_df = -np.log10(pval_df.add(1e-20))

# Reorder to the lists above and transpose to cluster x gene.
log2fc_df_to_show = log2fc_df.loc[selected_genes, selected_clusters].T
mlog_pval_df_to_show = mlog_pval_df.loc[selected_genes, selected_clusters].T

fig, ax = plt.subplots(figsize=(8, 4))

# Dot size encodes significance: 7 * (-log10 adjusted p) + 0.5; colour encodes log2 fold change.
dot_sizes = 7 * mlog_pval_df_to_show.values.astype(float) + 0.5
ax = scattermap(
    log2fc_df_to_show,
    marker_size=dot_sizes,
    vmin=-6,
    vmax=6,
    cmap='coolwarm',
    linewidths=0.2,
    linecolor='black',
    ax=ax,
    cbar_kws={'shrink': 0.5, 'anchor': (0, 0.7)},
)

ax.tick_params(axis='both', which='major', labelsize=7)
# presumably the second axes of the figure is the colour bar added by scattermap - TODO confirm
colorbar_ax = ax.figure.axes[1]
colorbar_ax.tick_params(axis="y", labelsize=7)
colorbar_ax.set_ylabel('log2 fold change', fontsize=7)

# Dot-size legend: dummy points drawn at (-1, -1), sized with the same
# 7 * x + 0.5 rule as the data for p = 1e-20, 1e-10 and 1e-3.
for neg_log10_p in (20, 10, 3):
    ax.scatter(
        -1, -1,
        label=f'$10^{{-{neg_log10_p}}}$',
        marker="o", linewidths=0, c="grey",
        s=7 * neg_log10_p + 0.5,
    )
leg = ax.legend(loc="upper left", bbox_to_anchor=(1, 0.2), fontsize=7)
leg.set_title('adjusted p-val', prop={'size': 7})

fig.savefig(
    f'{plot_output_path}/perturbed_eif_genes_enriched_in_differentiation_clusters_dot_plot.pdf',
    bbox_inches='tight',
)

In [None]:
# Clusters in which at least one of the selected genes is enriched
# (log2fc > 1, adjusted p < 0.01, more than one positive cell).
is_selected_gene = c_enrich_gene_df['perturbed_gene'].isin(selected_genes)
passes_enrichment = (
    (c_enrich_gene_df['log2fc'] > 1)
    & (c_enrich_gene_df['pval_adj'] < 0.01)
    & (c_enrich_gene_df['positive_count'] > 1)
)
c_enrich_gene_df.loc[is_selected_gene & passes_enrichment, 'cluster_name'].unique()

In [None]:
# Every perturbed gene whose symbol starts with 'EIF' (sorted, de-duplicated).
has_eif_prefix = c_enrich_gene_df['perturbed_gene'].str.startswith('EIF')
eif_genes = np.unique(c_enrich_gene_df.loc[has_eif_prefix, 'perturbed_gene'])

# Of those, the genes enriched in at least one cluster
# (log2fc > 1, adjusted p < 0.01, more than one positive cell).
is_eif_gene = c_enrich_gene_df['perturbed_gene'].isin(eif_genes)
passes_enrichment = (
    (c_enrich_gene_df['log2fc'] > 1)
    & (c_enrich_gene_df['pval_adj'] < 0.01)
    & (c_enrich_gene_df['positive_count'] > 1)
)
c_enrich_gene_df.loc[is_eif_gene & passes_enrichment, 'perturbed_gene'].unique()

In [None]:
# Report every (gene, class) pair where a selected gene is among the genes
# enriched in that annotated class.
for g in selected_genes:
    for c, class_genes in class_anno_enrich_genes_dict.items():
        if g in class_genes:
            print(g, c)