In [None]:
# Load IPython's autoreload extension and select mode 2, which reloads
# imported modules before each cell is executed.
%load_ext autoreload
%autoreload 2
# Ask the inline matplotlib backend for high-resolution ("retina") figures.
%config InlineBackend.figure_format='retina'

In [None]:
import os
import json

from tqdm import tqdm
import numpy as np
import scipy.spatial
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import anndata
import scanpy as sc

In [None]:
from scmg.preprocessing.data_standardization import GeneNameMapper, standardize_adata
from scmg.model.causal_prediction import CausalGenePredictor

# Build a GeneNameMapper with its default arguments.
# NOTE(review): not referenced by any cell visible in this part of the
# notebook -- presumably used further down; confirm before removing.
gene_name_mapper = GeneNameMapper()

In [None]:
# Directory (relative to the current working directory) that receives every
# figure saved by the plotting cells below.
output_path = 'pert_dataset_stats'
# exist_ok=True lets the cell be re-run without failing once the directory exists.
os.makedirs(output_path, exist_ok=True)

In [None]:
# Directory holding the per-dataset perturbation .h5ad files. Kept in a single
# place so the notebook can be pointed at another copy of the data with one edit.
pert_data_dir = '/GPUData_xingjie/SCMG/perturbation_data'

# File names (inside `pert_data_dir`) of the datasets included in the statistics.
pert_data_file_names = [
    'AdamsonWeissman2016_GSM2406681_10X010.h5ad',
    'FrangiehIzar2021_RNA.h5ad',
    'hESC_TF_screen.h5ad',
    'JiangSatija2024_IFNB.h5ad',
    'JiangSatija2024_IFNG.h5ad',
    'JiangSatija2024_INS.h5ad',
    'JiangSatija2024_TGFB.h5ad',
    'JiangSatija2024_TNFA.h5ad',
    'Joung_TFScreen_HS_2023.h5ad',
    'knockTF_human.h5ad',
    'knockTF_mouse.h5ad',
    #'omnipath.h5ad',
    'PertOrg.h5ad',
    'ReplogleWeissman2022_K562_essential.h5ad',
    'ReplogleWeissman2022_K562_gwps.h5ad',
    'ReplogleWeissman2022_rpe1.h5ad',
    'TianKampmann2021_CRISPRa.h5ad',
    'TianKampmann2021_CRISPRi.h5ad',
]

# Full paths; same values as the previously hard-coded list.
pert_data_files = [
    os.path.join(pert_data_dir, file_name) for file_name in pert_data_file_names
]

adata_pert_list = []
for pert_path in pert_data_files:
    adata_local = sc.read_h5ad(pert_path)

    # Tag every row with its source file name minus the extension. The
    # per-dataset lookup tables in the following cells are keyed by this value.
    # splitext only strips the final extension, so a name containing an extra
    # dot would not be truncated the way split('.')[0] would truncate it.
    file_name = os.path.basename(pert_path)
    adata_local.obs['data_file'] = os.path.splitext(file_name)[0]

    adata_pert_list.append(adata_local)
    print(file_name, adata_local.shape[0])

# Stack all datasets along the observation axis.
adata_pert = anndata.concat(adata_pert_list, axis=0)

# Re-attach the gene names from the first dataset; pandas aligns the assigned
# column on the var index of the concatenated object.
# NOTE(review): assumes every dataset shares the first dataset's var index /
# gene_name annotation -- confirm.
adata_pert.var['gene_name'] = adata_pert_list[0].var['gene_name']

adata_pert

In [None]:
# Display how many rows of the combined object come from each source file.
adata_pert.obs['data_file'].value_counts()

In [None]:
# Species of origin for each source file, keyed by the `data_file` tag.
species_map = {
    'AdamsonWeissman2016_GSM2406681_10X010' : 'human',
    'FrangiehIzar2021_RNA' : 'human',
    'hESC_TF_screen' : 'human',
    'JiangSatija2024_IFNB' : 'human',
    'JiangSatija2024_IFNG' : 'human',
    'JiangSatija2024_INS' : 'human',
    'JiangSatija2024_TGFB' : 'human',
    'JiangSatija2024_TNFA' : 'human',
    'Joung_TFScreen_HS_2023' : 'human',
    'knockTF_human' : 'human',
    'knockTF_mouse' : 'mouse',
    'PertOrg' : 'mouse',
    'ReplogleWeissman2022_K562_essential' : 'human',
    'ReplogleWeissman2022_K562_gwps' : 'human',
    'ReplogleWeissman2022_rpe1' : 'human',
    'TianKampmann2021_CRISPRa' : 'human',
    'TianKampmann2021_CRISPRi' : 'human',
}

adata_pert.obs['species'] = adata_pert.obs['data_file'].map(species_map)
species_counts = adata_pert.obs['species'].value_counts()


def _species_count_label(pct):
    """Turn a wedge percentage back into the absolute row count it represents."""
    return '{:.0f}'.format(pct * species_counts.sum() / 100)


# Pie chart of rows per species, labelled with absolute counts.
fig, ax = plt.subplots(figsize=(3, 3), dpi=300)
ax.pie(
    species_counts.values,
    labels=species_counts.index,
    autopct=_species_count_label,
)
ax.set_title('species')
fig.savefig(f'{output_path}/species_pie.pdf')

In [None]:
# Experiment type (bulk vs. single-cell) of each source file, keyed by the
# `data_file` tag.
exp_type_map = {
    'ReplogleWeissman2022_K562_gwps' : 'single-cell',
    'PertOrg' : 'bulk',
    'ReplogleWeissman2022_K562_essential' : 'single-cell',
    'ReplogleWeissman2022_rpe1' : 'single-cell',
    'knockTF_human' : 'bulk',
    'knockTF_mouse' : 'bulk',
    'hESC_TF_screen' : 'bulk',
    'JiangSatija2024_IFNB' : 'single-cell',
    'JiangSatija2024_TNFA' : 'single-cell',
    'JiangSatija2024_TGFB' : 'single-cell',
    'JiangSatija2024_IFNG' : 'single-cell',
    'JiangSatija2024_INS' : 'single-cell',
    'FrangiehIzar2021_RNA' : 'single-cell',
    'TianKampmann2021_CRISPRi' : 'single-cell',
    'TianKampmann2021_CRISPRa' : 'single-cell',
    'AdamsonWeissman2016_GSM2406681_10X010' : 'single-cell',
    'Joung_TFScreen_HS_2023' : 'single-cell',
}

adata_pert.obs['exp_type'] = adata_pert.obs['data_file'].map(exp_type_map)
exp_type_counts = adata_pert.obs['exp_type'].value_counts()

# Total of the plotted counts. The wedge labels must be scaled by THIS total
# (not by the total of another cell's counts) so that each label equals the
# count of the wedge it sits on and the cell does not depend on the species
# cell having been run first.
exp_type_total = exp_type_counts.sum()

fig, ax = plt.subplots(figsize=(3, 3), dpi=300)

# autopct receives each wedge's percentage; convert it back to an absolute count.
ax.pie(
    exp_type_counts.values,
    labels=exp_type_counts.index,
    autopct=lambda p: '{:.0f}'.format(p * exp_type_total / 100),
)
ax.set_title('experiment type')
fig.savefig(f'{output_path}/exp_type_pie.pdf')

In [None]:
pert_dir_counts = adata_pert.obs['perturbation_sign'].value_counts()

# Total of the plotted counts. The wedge labels must be scaled by THIS total
# (not by the total of another cell's counts): rows with a missing
# perturbation sign are excluded by value_counts, so the totals can differ.
pert_dir_total = pert_dir_counts.sum()

# Human-readable wedge labels. Signs other than -1 / 1 keep their own value as
# the label instead of turning into NaN.
pert_dir_names = {-1: 'down', 1: 'up'}
pert_dir_labels = [pert_dir_names.get(sign, str(sign)) for sign in pert_dir_counts.index]

fig, ax = plt.subplots(figsize=(3, 3), dpi=300)

# autopct receives each wedge's percentage; convert it back to an absolute count.
ax.pie(
    pert_dir_counts.values,
    labels=pert_dir_labels,
    autopct=lambda p: '{:.0f}'.format(p * pert_dir_total / 100),
)
ax.set_title('perturbation direction')
fig.savefig(f'{output_path}/pert_dir_pie.pdf')

In [None]:
# Display name of the study each source file belongs to, keyed by the
# `data_file` tag. Several files can share one display name.
dataset_map = {
    'AdamsonWeissman2016_GSM2406681_10X010' : 'Adamson2016',
    'FrangiehIzar2021_RNA' : 'Frangieh2021',
    'hESC_TF_screen' : 'Nakatake2020',
    'JiangSatija2024_IFNB' : 'Jiang2024',
    'JiangSatija2024_IFNG' : 'Jiang2024',
    'JiangSatija2024_INS' : 'Jiang2024',
    'JiangSatija2024_TGFB' : 'Jiang2024',
    'JiangSatija2024_TNFA' : 'Jiang2024',
    'Joung_TFScreen_HS_2023' : 'Joung2023',
    'knockTF_human' : 'knockTF',
    'knockTF_mouse' : 'knockTF',
    'PertOrg' : 'PertOrg',
    'ReplogleWeissman2022_K562_essential' : 'Replogle2022_K562',
    'ReplogleWeissman2022_K562_gwps' : 'Replogle2022_K562',
    'ReplogleWeissman2022_rpe1' : 'Replogle2022_rpe1',
    'TianKampmann2021_CRISPRa' : 'Tian2021_CRISPRa',
    'TianKampmann2021_CRISPRi' : 'Tian2021_CRISPRi',
}

adata_pert.obs['dataset'] = adata_pert.obs['data_file'].map(dataset_map)
dataset_counts = adata_pert.obs['dataset'].value_counts()

# Bar chart of the number of rows per dataset, in value_counts order.
fig, ax = plt.subplots(figsize=(3, 3), dpi=300)
ax.bar(
    dataset_counts.index,
    dataset_counts.values,
    color='gray',
)

# Dataset names are long, so draw the x tick labels vertically.
ax.tick_params(axis='x', rotation=90)
ax.set_xlabel('Dataset')
ax.set_ylabel('Number of perturbations')

fig.savefig(f'{output_path}/dataset_bar.pdf')

In [None]:
# Display the per-dataset row counts behind the bar chart.
dataset_counts

In [None]:
# Number of rows in which each gene appears as the perturbed gene.
pert_gene_counts = adata_pert.obs['perturbed_gene'].value_counts()

# Largest per-gene count shown on the histogram. Bins, ticks and axis limits
# are all derived from this one value so they cannot drift apart.
# Genes with more rows than this fall outside the bins and are not drawn.
max_shown_count = 10

fig, ax = plt.subplots(figsize=(3, 3), dpi=300)

# Unit-width bins centred on the integers 0..max_shown_count.
ax.hist(
    pert_gene_counts,
    bins=np.arange(-0.5, max_shown_count + 1.5, 1),
    color='gray',
)
ax.set_xticks(range(0, max_shown_count + 1))
ax.grid(False)

# Limits sit on the outer bin edges so the first and last bars are drawn at
# full width instead of being cut in half.
ax.set_xlim(-0.5, max_shown_count + 0.5)
ax.set_xlabel('Number of perturbations')
ax.set_ylabel('Gene counts')
fig.savefig(f'{output_path}/pert_gene_hist.pdf')