In [None]:
import os
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
from tqdm import tqdm

import anndata
import scanpy as sc

from scmg.preprocessing.data_standardization import GeneNameMapper

# Instantiate the gene-name mapper from scmg's data-standardization module.
# NOTE(review): no other cell visible in this notebook references
# `gene_name_mapper` — confirm whether it is still needed.
gene_name_mapper = GeneNameMapper()

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Global figure styling for every plot produced below.
# These three settings are applied before scanpy's own figure defaults.
plt.rcParams.update({
    "figure.autolayout": False,
    # Font type 42 for PDF output (same setting as matplotlib.rc('pdf', fonttype=42)).
    "pdf.fonttype": 42,
    "font.family": 'FreeSans',
})

# Scanpy figure defaults: vector-friendly output, 300 dpi when saving.
sc.set_figure_params(vector_friendly=True, dpi_save=300)

# Set after set_figure_params — presumably so scanpy's defaults cannot
# re-enable the axes grid; TODO confirm.
plt.rcParams['axes.grid'] = False

In [None]:
# Directory that receives the figures saved by later cells. The path is
# relative, so it resolves against the notebook's working directory.
plot_output_path = '../hesc_pseudobulk_plots'
# Create the directory (and any missing parents); no error if it already exists.
os.makedirs(plot_output_path, exist_ok=True)

In [None]:
# Input AnnData file for the hESC Perturb-seq pseudo-bulk data.
# Alternative dataset kept for reference:
#   '/GPUData_xingjie/SCMG/perturbation_data/ReplogleWeissman2022_K562_gwps.h5ad'
pseudo_bulk_h5ad = '/GPUData_xingjie/SCMG/hESC_perturb_seq/pseudo_bulk.h5ad'
adata = sc.read_h5ad(pseudo_bulk_h5ad)

# Display the AnnData summary as the cell output.
adata

In [None]:
# Mask out the direct target genes: for every observation, zero the entry of
# adata.X in the column of the gene named in obs['perturbed_gene'].
# Observations whose perturbed gene is not among var_names are left untouched.
# Note: this modifies adata.X in place.
for row_idx, target_gene in enumerate(adata.obs['perturbed_gene']):
    if target_gene not in adata.var_names:
        continue

    target_col = adata.var_names.get_loc(target_gene)
    adata.X[row_idx, target_col] = 0

In [None]:
# Distribution of all entries of adata.X, flattened into one vector.
flat_shifts = adata.X.flatten()

# Histogram restricted to [-0.5, 0.5]; the y-axis is capped at 1e6 counts
# so that tall bins do not hide the rest of the distribution.
fig, ax = plt.subplots()
ax.hist(flat_shifts, bins=100, range=(-0.5, 0.5))
ax.set_ylim(0, 1e6)
plt.show()

# Overall mean and standard deviation of the flattened values.
shift_mean = np.mean(flat_shifts)
shift_std = np.std(flat_shifts)
print(shift_mean, shift_std)

In [None]:
# Only keep the perturbations with strong effects
adata = adata[(np.abs(adata.X) > 0.3).sum(axis=1) > 0].copy()
adata

In [None]:
# Neighbor graph computed directly on adata.X (use_rep='X') with cosine
# distance and 5 neighbors per observation.
sc.pp.neighbors(
    adata,
    n_neighbors=5,
    use_rep='X',
    metric='cosine',
)

# UMAP embedding with a fixed seed.
sc.tl.umap(adata, random_state=0)

# Leiden clustering (igraph flavor, 2 iterations, resolution 2) with a fixed seed.
sc.tl.leiden(
    adata,
    flavor="igraph",
    n_iterations=2,
    resolution=2,
    random_state=0,
)

In [None]:
# UMAP coloured by Leiden cluster, with cluster labels drawn on the
# embedding, saved as a PDF in the plot output directory.
umap_pdf_path = f'{plot_output_path}/hesc_perturbed_genes_umap.pdf'

fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
sc.pl.umap(
    adata,
    color='leiden',
    title='perturbed genes',
    legend_loc='on data',
    legend_fontsize=10,
    legend_fontoutline=2,
    ax=ax,
    show=False,  # keep the figure open so it can be saved below
)
plt.savefig(umap_pdf_path, dpi=300)

In [None]:
# Pair each Leiden category with the colour at the same position in
# adata.uns['leiden_colors'].
# NOTE(review): 'leiden_colors' is presumably populated by the sc.pl.umap call
# in the plotting cell above, so that cell must run first — confirm.
leiden_labels = adata.obs['leiden'].cat.categories
leiden_palette = adata.uns['leiden_colors']
cluster_color_map = dict(zip(leiden_labels, leiden_palette))

# Attach the per-observation colour and write the obs table to the current
# working directory.
adata.obs['leiden_color'] = adata.obs['leiden'].map(cluster_color_map)
adata.obs.to_csv('perturbed_gene_clusters_hESC.csv')

In [None]:
# Persist the filtered AnnData object, with whatever annotations the earlier
# cells attached to it, to the current working directory (relative path).
adata.write_h5ad('perturbed_gene_clusters_hESC.h5ad')