In [None]:
# Notebook setup.
# Load IPython's autoreload extension and use mode 2, which reloads imported
# modules before each cell is executed so that edits to local .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render inline matplotlib figures in the high-resolution 'retina' format.
%config InlineBackend.figure_format='retina'

In [None]:
import os
import json

from tqdm import tqdm
import numpy as np
import scipy.spatial
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import anndata
import scanpy as sc

In [None]:
# Load the enrichment table for class annotations (presumably one row per
# guide and class annotation -- confirm against the file that produced it).
class_enrich_guide_df = pd.read_parquet('enrichment_results/class_anno_enrich_guide.parquet')

# Rename the cholesterol label to the lipid label, which is the name used for
# cluster '11_0' in class_anno_map further down in this notebook.
class_anno_renames = {
    'upregulation of cholesterol biosynthesis': 'upregulation of lipid biosynthesis',
}
class_enrich_guide_df['class_anno'] = class_enrich_guide_df['class_anno'].replace(class_anno_renames)

class_enrich_guide_df

In [None]:
# Collect, for every annotated class, the perturbed genes that are enriched in it.
#
# A gene is reported for a class when both of the following hold:
#   1. more than one of its rows (presumably one row per guide -- confirm) in that
#      class has log2fc > 0 and positive_count > 1, and
#   2. at least one of its rows in that class has log2fc > 1, positive_count > 2
#      and pval_adj < 0.001.
# 'non-targeting' entries are never counted in step 1, so they are never reported.
#
# NOTE(review): the key 'n_enrcihed_guides' is misspelled ("enriched"). It becomes
# a column header of the CSV written below, so it is left unchanged here; confirm
# that nothing downstream reads that header before renaming it.
class_enriched_genes_dict = {
    'class': [],
    'enriched_perturbed_gene': [],
    'n_enrcihed_guides': [],
}

# Row filters that do not depend on the class are evaluated once, up front.
guide_is_loosely_enriched = (
    (class_enrich_guide_df['log2fc'] > 0)
    & (class_enrich_guide_df['positive_count'] > 1)
    & (class_enrich_guide_df['perturbed_gene'] != 'non-targeting')
)
guide_is_significant = (
    (class_enrich_guide_df['log2fc'] > 1)
    & (class_enrich_guide_df['positive_count'] > 2)
    & (class_enrich_guide_df['pval_adj'] < 0.001)
)

for class_name in np.unique(class_enrich_guide_df['class_anno']):
    guide_in_class = class_enrich_guide_df['class_anno'] == class_name

    # Step 1: per gene, count the loosely enriched rows in this class and keep
    # the genes that have at least 2 of them.
    enriched_guide_counts = class_enrich_guide_df.loc[
        guide_in_class & guide_is_loosely_enriched, 'perturbed_gene'
    ].value_counts()
    pre_selected_genes = enriched_guide_counts[enriched_guide_counts > 1].index

    # Step 2: of those genes, keep the ones with at least 1 significantly
    # enriched row in this class.
    guide_of_pre_selected_gene = class_enrich_guide_df['perturbed_gene'].isin(pre_selected_genes)
    enriched_genes = class_enrich_guide_df.loc[
        guide_in_class & guide_is_significant & guide_of_pre_selected_gene, 'perturbed_gene'
    ].unique()

    # One output row per (class, gene); the count recorded is the step-1 count.
    for gene in enriched_genes:
        class_enriched_genes_dict['class'].append(class_name)
        class_enriched_genes_dict['enriched_perturbed_gene'].append(gene)
        class_enriched_genes_dict['n_enrcihed_guides'].append(enriched_guide_counts[gene])

class_enriched_genes_df = pd.DataFrame(class_enriched_genes_dict)
class_enriched_genes_df.to_csv('class_enriched_perturbed_genes.csv', index=False)
class_enriched_genes_df

In [None]:
# Class annotation for every cluster ID. Keys are the string cluster IDs that
# are looked up from the 'cluster' column of the cluster enrichment table later
# in this notebook; several clusters can share one class annotation.
# NOTE(review): IDs look like '<cluster>_<sub-cluster>' -- confirm with the
# clustering step that produced them.
class_anno_map = {
    '0_0': 'non-targeting enriched',
    '10_0': 'non-targeting like',
    '11_0': 'upregulation of lipid biosynthesis',
    '12_0': 'upregulation of stress response',
    '13_0': 'non-targeting enriched',
    '14_0': 'pert cell cycle',
    '14_1': 'pert spliceosome',
    '14_2': 'pert mRNA-3 processing',
    '14_3': 'pert mRNA transcription',
    '14_4': 'pert mRNA transcription',
    '14_5': 'pert mRNA transcription',
    '15_0': 'germ layer differentiation',
    '15_1': 'germ layer differentiation',
    '15_10': 'germ layer differentiation',
    '15_11': 'germ layer differentiation',
    '15_12': 'germ layer differentiation',
    '15_13': 'germ layer differentiation',
    '15_14': 'mesenchymal differentiation',
    '15_2': 'germ layer differentiation',
    '15_3': 'germ layer differentiation',
    '15_4': 'germ layer differentiation',
    '15_5': 'non-targeting enriched',
    '15_6': 'germ layer differentiation',
    '15_7': 'germ layer differentiation',
    '15_8': 'germ layer differentiation',
    '15_9': 'germ layer differentiation',
    '16_0': 'low UMI count',
    '16_1': 'low UMI count',
    '16_2': 'low UMI count',
    '16_3': 'low UMI count',
    '16_4': 'low UMI count',
    '16_5': 'low UMI count',
    '16_6': 'low UMI count',
    '16_7': 'pert DBR1',
    '17_0': 'non-targeting enriched',
    '18_0': 'pert translation',
    '18_1': 'pert mTOR signaling',
    '18_10': 'pert translation',
    '18_11': 'pert translation',
    '18_12': 'pert translation',
    '18_2': 'pert translation',
    '18_3': 'pert translation',
    '18_4': 'pert translation',
    '18_5': 'pert mTOR signaling',
    '18_6': 'pert translation',
    '18_7': 'pert translation',
    '18_8': 'pert translation',
    '18_9': 'pert translation',
    '19_0': 'non-targeting enriched',
    '19_1': 'non-targeting like',
    '1_0': 'non-targeting enriched',
    '20_0': 'pert mRNA transcription',
    '20_1': 'pert mRNA transcription',
    '20_10': 'pert mRNA deadenylation',
    '20_11': 'pert mRNA transcription',
    '20_2': 'pert GNB2L1',
    '20_3': 'pert mRNA deadenylation',
    '20_4': 'pert mRNA transcription',
    '20_5': 'pert mRNA transcription',
    '20_6': 'pert mRNA transcription',
    '20_7': 'pert mRNA transcription',
    '20_8': 'pert mRNA transcription',
    '20_9': 'pert mRNA transcription',
    '21_0': 'non-targeting enriched',
    '21_1': 'pert DBR1',
    '22_0': 'non-targeting enriched',
    '23_0': 'pert ubiquitin E3 ligase',
    '23_1': 'pert protein neddylation',
    '24_0': 'low mito-genes',
    '24_1': 'low mito-genes',
    '24_2': 'upregulation of stress response',
    '25_0': 'mesenchymal differentiation',
    '25_1': 'mesenchymal differentiation',
    '25_2': 'mesenchymal differentiation',
    '25_3': 'low UMI count',
    '25_4': 'mesenchymal differentiation',
    '26_0': 'pert DBR1',
    '27_0': 'pert RNA methylation',
    '28_0': 'pert DNA damage checkpoint',
    '28_1': 'pert DNA damage checkpoint',
    '2_0': 'non-targeting enriched',
    '3_0': 'non-targeting enriched',
    '4_0': 'non-targeting enriched',
    '5_0': 'non-targeting enriched',
    '6_0': 'non-targeting enriched',
    '7_0': 'non-targeting enriched',
    '8_0': 'non-targeting enriched',
    '9_0': 'non-targeting enriched',
    '9_1': 'non-targeting enriched',
}

# Unique display name per cluster: '<class annotation>_<cluster ID>',
# e.g. 'pert spliceosome_14_1'.
cluster_annotation_map = {
    cluster_id: f'{class_label}_{cluster_id}'
    for cluster_id, class_label in class_anno_map.items()
}

In [None]:
# Load the per-cluster enrichment table and add a 'cluster_name' column by
# looking up each row's 'cluster' value in cluster_annotation_map. Cluster
# values that are not keys of the map end up as missing values.
cluster_enrich_gene_df = pd.read_parquet('enrichment_results/l2_c_enrich_gene.parquet')
cluster_enrich_gene_df = cluster_enrich_gene_df.assign(
    cluster_name=cluster_enrich_gene_df['cluster'].map(cluster_annotation_map)
)
cluster_enrich_gene_df

In [None]:
# Keep only the rows that are significantly enriched in their cluster:
# log2fc > 1, positive_count > 2, pval_adj < 0.001, and not 'non-targeting'.
# These are the same cut-offs used for the significant-guide filter in the
# class-level analysis earlier in this notebook.
cluster_row_is_significant = (
    cluster_enrich_gene_df['log2fc'].gt(1)
    & cluster_enrich_gene_df['positive_count'].gt(2)
    & cluster_enrich_gene_df['pval_adj'].lt(0.001)
    & cluster_enrich_gene_df['perturbed_gene'].ne('non-targeting')
)
cluster_enrich_gene_df = cluster_enrich_gene_df.loc[cluster_row_is_significant]

# Persist the filtered table (without the index) and display it.
cluster_enrich_gene_df.to_csv('cluster_enriched_perturbed_genes.csv', index=False)
cluster_enrich_gene_df