In [None]:
# Enable IPython's autoreload extension (mode 2) so edited modules are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Ask the inline backend for 'retina' figure output.
%config InlineBackend.figure_format='retina'

In [None]:
import os
import json

from tqdm import tqdm
import numpy as np
import scipy.spatial
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import anndata
import scanpy as sc

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Global figure defaults. The statement order is significant: the scanpy call
# sits between the rcParams writes exactly as before.
matplotlib.rcParams['figure.autolayout'] = False
# Same setting as matplotlib.rc('pdf', fonttype=42): Type 42 fonts in PDF output.
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['font.family'] = 'FreeSans'
sc.set_figure_params(dpi_save=300, vector_friendly=True)
# NOTE(review): written after sc.set_figure_params, presumably because that
# call touches the grid setting -- confirm against the scanpy docs.
matplotlib.rcParams['axes.grid'] = False

In [None]:
# Folder for exported figures; created up front and left alone if it exists.
plot_output_path = 'hesc_sc_analysis_plots'
os.makedirs(name=plot_output_path, exist_ok=True)

In [None]:
# Pseudo-bulk AnnData (file name: pseudo_bulk.h5ad); its obs carries a
# 'perturbed_gene' column and its var a 'gene_name' column, both used below.
adata_bulk = sc.read_h5ad(
    '/GPUData_xingjie/SCMG/hESC_perturb_seq/pseudo_bulk.h5ad'
)
adata_bulk

In [None]:
# Mask out the direct target genes: in every row of adata_bulk, zero the entry
# of the gene named in that row's obs['perturbed_gene'] (presumably so the
# perturbation's effect on its own target is not counted below -- confirm).
# The write happens in place on adata_bulk.X and is idempotent, so re-running
# this cell gives the same result.
perturbed_genes = adata_bulk.obs['perturbed_gene']
bulk_var_names = adata_bulk.var_names
for row, gene in enumerate(perturbed_genes):
    # Labels with no matching var name are left untouched.
    if gene in bulk_var_names:
        adata_bulk.X[row, bulk_var_names.get_loc(gene)] = 0

# Keep genes whose absolute value exceeds 0.2 in more than one row.
# np.asarray(...).ravel() forces the per-gene count to be 1-D: for a scipy
# sparse matrix the axis sum is a 2-D np.matrix, which cannot be used as a
# boolean row selector on adata_bulk.var. Dense input is unaffected.
n_rows_above_threshold = np.asarray((np.abs(adata_bulk.X) > 0.2).sum(axis=0)).ravel()
hv_genes = adata_bulk.var.loc[n_rows_above_threshold > 1, 'gene_name'].tolist()

print(len(hv_genes))

In [None]:
# Enrichment table; the cells below read its 'perturbed_gene', 'log2fc' and
# 'cluster' columns.
enrichment_df = pd.read_parquet(
    'enrichment_results/l2_c_enrich_gene.parquet'
)
enrichment_df

In [None]:
# Rows for the 'non-targeting' label, ordered by ascending log2fc, with the
# first 60 rows of that ordering skipped.
(
    enrichment_df
    .loc[enrichment_df['perturbed_gene'].eq('non-targeting')]
    .sort_values(by='log2fc')
    .iloc[60:]
)

In [None]:
# Clusters whose 'non-targeting' enrichment row has a negative log2fc.
targeting_clusters = enrichment_df.loc[
    enrichment_df['perturbed_gene'].eq('non-targeting')
    & enrichment_df['log2fc'].lt(0),
    'cluster',
].values
targeting_clusters

In [None]:
# Load the single-cell AnnData and de-duplicate its observation names so they
# can serve as a unique index.
adata = sc.read_h5ad('/GPUData_xingjie/SCMG/hESC_perturb_seq/adata_single_gene_pert.h5ad')
adata.obs_names_make_unique()

# Normalize every cell to a total of 1e4, then apply log1p; both calls modify
# `adata` in place.
# NOTE(review): assumes adata.X holds raw counts at load time -- confirm.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata

In [None]:
# Copy the 'leiden_l1' and 'cluster' labels from the saved obs table onto the
# cells as strings; the assignment aligns rows to adata.obs by index.
l2_obs_df = pd.read_csv('adata_obs_l2.csv', index_col=0)
for label_col in ('leiden_l1', 'cluster'):
    adata.obs[label_col] = l2_obs_df[label_col].astype(str)
adata

In [None]:
# Keep only the cells whose cluster is one of `targeting_clusters`.
# adata.obs['cluster'] holds string labels, so the reference labels are cast to
# str as well: a numeric dtype in `targeting_clusters` would otherwise match
# nothing and silently produce an empty subset. Labels that are already
# strings are unchanged by the cast.
targeting_cluster_labels = pd.Index(targeting_clusters).astype(str)
adata_targeting = adata[adata.obs['cluster'].isin(targeting_cluster_labels)].copy()
adata_targeting

In [None]:
# Store a copy of the current all-gene object in `.raw` before the gene axis is
# restricted, so the values from before subsetting and scaling stay reachable.
adata_targeting.raw = adata_targeting.copy()
# Restrict to the genes in `hv_genes` (selected by var name).
# NOTE(review): hv_genes comes from adata_bulk.var['gene_name']; this presumes
# every entry is also a var name of this object -- confirm.
adata_targeting = adata_targeting[:, hv_genes].copy()
# Scale the genes with max_value=10, then run PCA with the 'arpack' solver;
# both calls operate on adata_targeting in place.
sc.pp.scale(adata_targeting, max_value=10)
sc.tl.pca(adata_targeting, svd_solver='arpack')
adata_targeting

In [None]:
# Build the neighbor graph with 20 neighbors per cell. No representation is
# passed explicitly, so scanpy's default is used (presumably the PCA computed
# in the previous cell -- confirm).
sc.pp.neighbors(adata_targeting, n_neighbors=20)

In [None]:
# Compute the UMAP embedding; the coordinates are read back from
# adata_targeting.obsm['X_umap'] when they are exported further down.
sc.tl.umap(adata_targeting)

In [None]:
# UMAP of the targeting-cluster cells, colored by cluster, with the cluster
# labels drawn on the embedding itself.
fig, ax = plt.subplots(figsize=(10, 10))
sc.pl.umap(
    adata_targeting,
    color='cluster',
    ax=ax,
    legend_loc='on data',
    legend_fontsize=5,
)

In [None]:
# Write the first two UMAP coordinates of each cell, indexed like
# adata_targeting.obs, to targeting_umap.csv.
targeting_umap_df = pd.DataFrame(
    adata_targeting.obsm['X_umap'][:, :2].copy(),
    index=adata_targeting.obs.index,
    columns=['umap_targeting_x', 'umap_targeting_y'],
)
targeting_umap_df.to_csv('targeting_umap.csv')