In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render inline figures at retina (high-DPI) resolution.
%config InlineBackend.figure_format='retina'

In [None]:
import os
import json

from tqdm import tqdm
import numpy as np
import scipy.spatial
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import anndata
import scanpy as sc

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Global figure styling shared by every plot in this notebook.
matplotlib.rcParams["figure.autolayout"] = False
# Font type 42 embeds TrueType fonts, keeping text editable in exported PDFs.
matplotlib.rcParams["pdf.fonttype"] = 42
matplotlib.rcParams["font.family"] = "FreeSans"
sc.set_figure_params(vector_friendly=True, dpi_save=300)
# Kept after set_figure_params, as in the original ordering (presumably because
# scanpy's defaults would otherwise re-enable the grid -- TODO confirm).
matplotlib.rcParams["axes.grid"] = False

In [None]:
# Directory that receives the exported figures; created if it does not exist.
plot_output_path = 'hesc_sc_analysis_plots'
os.makedirs(name=plot_output_path, exist_ok=True)

In [None]:
# Load the perturbation AnnData object from disk.
adata_h5ad_path = '/GPUData_xingjie/SCMG/hESC_perturb_seq/adata_single_gene_pert.h5ad'
adata = sc.read_h5ad(adata_h5ad_path)
# Guarantee unique observation names before any index-based joins.
adata.obs_names_make_unique()
adata

In [None]:
# Attach precomputed cluster labels and UMAP coordinates to the cells.
l2_obs_df = pd.read_csv('adata_obs_l2.csv', index_col=0)

# Reorder the table to the AnnData cell order before using it.  The obs columns
# below are aligned by index on assignment, but `.values` is purely positional:
# without this step a CSV stored in a different row order would silently pair
# cells with the wrong UMAP coordinates.  `.loc` raises KeyError if any cell of
# `adata` is missing from the CSV, instead of producing NaN labels.
l2_obs_df = l2_obs_df.loc[adata.obs_names]

adata.obs['leiden_l1'] = l2_obs_df['leiden_l1'].astype(str)
adata.obs['cluster'] = l2_obs_df['cluster'].astype(str)
adata.obsm['X_umap'] = l2_obs_df[['umap_x', 'umap_y']].values
adata

In [None]:
# UMAP of all cells, coloured by cluster with labels drawn on the data.
fig, ax = plt.subplots(figsize=(10, 10), dpi=200)
sc.pl.umap(
    adata,
    color='cluster',
    legend_loc='on data',
    legend_fontsize=5,
    s=1,
    ax=ax,
    show=False,
)

all_clusters_pdf = f'{plot_output_path}/hesc_sc_all_clusters_umap.pdf'
fig.savefig(all_clusters_pdf, bbox_inches='tight')

In [None]:
# Save the cluster -> colour mapping so other analyses can reuse the palette.
# Relies on `cluster` being categorical and `cluster_colors` existing in
# `adata.uns` (presumably both set by the preceding sc.pl.umap call -- verify
# when running cells out of order).
cluster_names = adata.obs['cluster'].cat.categories.tolist()
cluster_color_table = pd.DataFrame(
    {'cluster': cluster_names, 'color': adata.uns['cluster_colors']}
)
cluster_color_table.to_csv('cluster_colors.csv', index=False)

In [None]:
# Per-cluster tally of all cells and of non-targeting cells, followed by the
# enrichment of non-targeting cells relative to their dataset-wide fraction.
cluster_labels = adata.obs['cluster']
is_non_targeting = adata.obs['perturbed_gene'] == 'non-targeting'

non_target_frac_dict = {
    key: [] for key in ('cluster', 'color', 'total_count', 'non_target_count')
}
for i, cluster in enumerate(cluster_labels.cat.categories):
    in_cluster = cluster_labels == cluster
    non_target_frac_dict['cluster'].append(cluster)
    # Colours are stored in the same order as the cluster categories.
    non_target_frac_dict['color'].append(adata.uns['cluster_colors'][i])
    non_target_frac_dict['total_count'].append(np.sum(in_cluster))
    non_target_frac_dict['non_target_count'].append(np.sum(in_cluster & is_non_targeting))

non_target_frac_df = pd.DataFrame(non_target_frac_dict)
non_target_frac_df['non_target_frac'] = (
    non_target_frac_df['non_target_count'] / non_target_frac_df['total_count']
)
# Fraction of non-targeting cells across all clusters combined.
global_nt_frac = (
    np.sum(non_target_frac_df['non_target_count'])
    / np.sum(non_target_frac_df['total_count'])
)
non_target_frac_df['enrichment'] = non_target_frac_df['non_target_frac'] / global_nt_frac
non_target_frac_df = non_target_frac_df.sort_values(by='enrichment', ascending=False)

In [None]:
# Scatter of cluster size against non-targeting enrichment, one dot per
# cluster, drawn in that cluster's UMAP colour.
fig, ax = plt.subplots(figsize=(5, 5), dpi=200)

ax.scatter(
    x=non_target_frac_df['total_count'],
    y=non_target_frac_df['enrichment'],
    c=non_target_frac_df['color'],
    s=10,
)
# Enrichment of 1 means the cluster matches the dataset-wide fraction.
ax.axhline(y=1, color='grey', linestyle='--')
ax.set_xscale('log')
ax.set_xticks([1e1, 1e2, 1e3, 1e4, 1e5], [10, 100, 1000, 10000, 100000])
ax.set(xlabel='Cluster size', ylabel='Enrichment of non-targeting cells')

nt_enrichment_pdf = f'{plot_output_path}/hesc_cluster_size_nt_enrichment_scatter.pdf'
fig.savefig(nt_enrichment_pdf, bbox_inches='tight')

In [None]:
# Build the subset of cells listed in targeting_umap.csv and give it the
# UMAP embedding stored in that file.
targeting_umap_df = pd.read_csv('targeting_umap.csv', index_col=0)

# Subsetting by the CSV index puts the cells in the CSV's row order, so the
# positional coordinate assignment below lines up with the observations.
targeting_cell_ids = targeting_umap_df.index.values
adata_targeting = adata[targeting_cell_ids].copy()
targeting_coords = targeting_umap_df[['umap_targeting_x', 'umap_targeting_y']]
adata_targeting.obsm['X_umap'] = targeting_coords.values

fig, ax = plt.subplots(figsize=(10, 10), dpi=200)
sc.pl.umap(
    adata_targeting,
    color='cluster',
    legend_loc='on data',
    legend_fontsize=5,
    s=1,
    ax=ax,
    show=False,
)

targeting_clusters_pdf = f'{plot_output_path}/hesc_sc_targeting_clusters_umap.pdf'
fig.savefig(targeting_clusters_pdf, bbox_inches='tight')

In [None]:
# Flag genes whose name starts with "MT-" and compute per-cell QC metrics,
# written into adata_targeting in place.
is_mt_gene = adata_targeting.var_names.str.startswith("MT-")
adata_targeting.var["MT"] = is_mt_gene
sc.pp.calculate_qc_metrics(
    adata_targeting,
    qc_vars=["MT"],
    percent_top=None,
    log1p=False,
    inplace=True,
)
adata_targeting

In [None]:
# Targeting-cell UMAP coloured by total counts per cell (colour scale capped at 30000).
fig, ax = plt.subplots(figsize=(5, 5), dpi=100)
sc.pl.umap(adata_targeting, color='total_counts', vmax=30000, ax=ax, show=False)

total_counts_pdf = f'{plot_output_path}/hesc_sc_targeting_total_counts_umap.pdf'
fig.savefig(total_counts_pdf)

In [None]:
# Targeting-cell UMAP coloured by number of detected genes (colour scale capped at 7000).
fig, ax = plt.subplots(figsize=(5, 5), dpi=100)
sc.pl.umap(adata_targeting, color='n_genes_by_counts', vmax=7000, ax=ax, show=False)

n_genes_pdf = f'{plot_output_path}/hesc_sc_targeting_n_genes_by_counts_umap.pdf'
fig.savefig(n_genes_pdf)

In [None]:
# Targeting-cell UMAP coloured by percentage of MT-gene counts (colour scale capped at 16).
fig, ax = plt.subplots(figsize=(5, 5), dpi=100)
sc.pl.umap(adata_targeting, color='pct_counts_MT', vmax=16, ax=ax, show=False)

pct_mt_pdf = f'{plot_output_path}/hesc_sc_targeting_pct_counts_MT_umap.pdf'
fig.savefig(pct_mt_pdf)

In [None]:
# Depth-normalise each cell to 1e4 total counts, then log1p-transform.
# NOTE: both calls modify adata_targeting in place, so re-running this cell
# would apply the transformation a second time.
sc.pp.normalize_total(
    adata_targeting,
    target_sum=1e4,
)
sc.pp.log1p(
    adata_targeting,
)

In [None]:
# Targeting-cell UMAP coloured by POU5F1 expression.
fig, ax = plt.subplots(figsize=(5, 5), dpi=100)
sc.pl.umap(adata_targeting, color='POU5F1', cmap='inferno_r', ax=ax, show=False)

pou5f1_pdf = f'{plot_output_path}/hesc_sc_targeting_POU5F1_umap.pdf'
fig.savefig(pou5f1_pdf)

In [None]:
# One UMAP panel per gene, each with its own automatic colour range.
sc.pl.umap(
    adata_targeting,
    color=['DOCK1', 'CLTC', 'EPHA7'],
    cmap='inferno_r',
    vmax=None,
)

In [None]:
# Downstream-gene cluster table produced by the pseudo-bulk analysis; the
# `leiden` and `gene_name` columns are used below.
dg_cluster_df = pd.read_csv(
    '../pseudo_bulk_analysis/clustering/downstream_gene_clusters_hESC.csv',
    index_col=0,
)

In [None]:
# Scale a copy so that adata_targeting keeps its log-normalised values;
# scaled values are clipped at 10.
adata_t_scaled = adata_targeting.copy()
sc.pp.scale(
    adata_t_scaled,
    max_value=10,
)

In [None]:
# Mean scaled expression per cell over the genes of one downstream-gene module.
ds_gene_module = 17
in_module_rows = dg_cluster_df['leiden'] == ds_gene_module
selected_genes = dg_cluster_df.loc[in_module_rows, 'gene_name'].values

# Only genes that are present in adata_t_scaled.var contribute to the mean.
module_gene_mask = adata_t_scaled.var.index.isin(selected_genes)
adata_t_scaled.obs['mean_z_score'] = adata_t_scaled[:, module_gene_mask].X.mean(axis=1)


fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
sc.pl.umap(
    adata_t_scaled,
    color='mean_z_score',
    cmap='seismic',
    vmin=-3,
    vmax=3,
    title='anti-pluripotency module z-score',
    ax=ax,
)
#fig.savefig(f'{plot_output_path}/hESC_anti-pluripotency_module_targeting_umap.pdf')

In [None]:
# Highlight the cells whose perturbed gene is in `selected_genes`.
selected_genes = ['TARS']

# 1 for cells perturbed for one of the selected genes, 0 for all other cells.
adata_t_scaled.obs['selected_genes'] = (
    adata_t_scaled.obs['perturbed_gene'].isin(selected_genes)
).astype(int)

fig, ax = plt.subplots(figsize=(4, 4), dpi=300)
# The title is derived from `selected_genes`; the previous hard-coded
# 'non-targeting' title did not describe the cells highlighted here.
sc.pl.umap(adata_t_scaled, color='selected_genes', cmap='Reds', ax=ax,
       title=f"{', '.join(selected_genes)} perturbed cells",
       show=False)

#fig.savefig(f'{plot_output_path}/hESC_perturb_{"_".join(selected_genes)}_targeting_umap.pdf')

In [None]:
# Draw every cell without a colour key as a backdrop, then overlay the
# DDX21-perturbed cells coloured by their `feature_call` value.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=200)
sc.pl.umap(adata_t_scaled, ax=ax, show=False)

ddx21_cell_mask = adata_t_scaled.obs['perturbed_gene'].isin(['DDX21'])
sc.pl.umap(
    adata_t_scaled[ddx21_cell_mask],
    color='feature_call',
    palette='rainbow',
    s=5,
    ax=ax,
    show=False,
)

In [None]:
# Overlay the cells perturbed for any of the selected genes on a backdrop of
# all cells, coloured by which gene was perturbed.
selected_genes = ['MED19', 'SP1', 'SUPT20H', 'NANOG', 'SOX2']  # 'POU5F1' left out

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=200)
sc.pl.umap(adata_t_scaled, ax=ax, show=False)

selected_cell_mask = adata_t_scaled.obs['perturbed_gene'].isin(selected_genes)
sc.pl.umap(
    adata_t_scaled[selected_cell_mask],
    color='perturbed_gene',
    palette='jet',
    s=5,
    ax=ax,
    show=False,
)

In [None]:
# Overlay the cells perturbed for any of the listed EIF genes on a backdrop of
# all cells, coloured by which gene was perturbed, and export the figure.
selected_genes = [
    'EIF1AX', 'EIF2B2', 'EIF2S2', 'EIF2S3', 'EIF4A2',
    'EIF4E', 'EIF4G1', 'EIF5A', 'EIF5B', 'EIF3J',
    'EIF3I', 'EIF3A', 'EIF3C', 'EIF3CL', 'EIF3D',
    'EIF3F', 'EIF3M', 'EIF3G', 'EIF4A1', 'EIF3H',
    'EIF4G2', 'EIF2B3', 'EIF2B4', 'EIF2B5', 'EIF6',
]

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=200)
sc.pl.umap(adata_t_scaled, ax=ax, show=False)

eif_cell_mask = adata_t_scaled.obs['perturbed_gene'].isin(selected_genes)
sc.pl.umap(
    adata_t_scaled[eif_cell_mask],
    color='perturbed_gene',
    palette='jet',
    s=5,
    ax=ax,
    show=False,
)

eif_umap_pdf = f'{plot_output_path}/hesc_sc_targeting_EIFs_umap.pdf'
fig.savefig(eif_umap_pdf)