In [None]:
# Enable the autoreload extension in mode 2, so imported modules are reloaded
# before each cell is executed.
%load_ext autoreload
%autoreload 2
# Ask the inline matplotlib backend for 'retina' (high-DPI) figures.
%config InlineBackend.figure_format='retina'

In [None]:
import os
import json

from tqdm import tqdm
import numpy as np
import scipy.spatial
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import anndata
import scanpy as sc

In [None]:
# Read the AnnData stored in pseudo_bulk.h5ad; the bare name on the last line
# displays its summary as the cell output.
pseudo_bulk_path = '/GPUData_xingjie/SCMG/hESC_perturb_seq/pseudo_bulk.h5ad'
adata_bulk = sc.read_h5ad(pseudo_bulk_path)
adata_bulk

In [None]:
# Mask out the direct target genes: for every row, zero the entry of the gene
# named in that row's 'perturbed_gene', provided the gene is one of var_names.
bulk_var_names = adata_bulk.var_names
for row_idx, target_gene in enumerate(adata_bulk.obs['perturbed_gene']):
    if target_gene not in bulk_var_names:
        continue
    adata_bulk.X[row_idx, bulk_var_names.get_loc(target_gene)] = 0

# Keep the genes whose absolute value exceeds 0.2 in more than one row; the
# list holds their 'gene_name' entries.
n_rows_above_cutoff = (np.abs(adata_bulk.X) > 0.2).sum(axis=0)
hv_genes = list(adata_bulk.var[n_rows_above_cutoff > 1]['gene_name'])

print(len(hv_genes))

In [None]:
# Read the AnnData stored in adata_single_gene_pert.h5ad and display its summary.
single_gene_pert_path = '/GPUData_xingjie/SCMG/hESC_perturb_seq/adata_single_gene_pert.h5ad'
adata_all = sc.read_h5ad(single_gene_pert_path)
adata_all

In [None]:
# Bring in the level-1 results saved in adata_obs_l1.csv: the 'leiden' labels
# (as strings) and the UMAP coordinates.
l1_obs_df = pd.read_csv('adata_obs_l1.csv', index_col=0)

# Series assignment aligns on the obs index.
l1_labels = l1_obs_df['leiden'].astype(str)
adata_all.obs['leiden_l1'] = l1_labels

# Rows are looked up in adata_all's cell order before taking the raw array.
umap_coords = l1_obs_df.loc[adata_all.obs.index, ['umap_x', 'umap_y']]
adata_all.obsm['X_umap'] = umap_coords.to_numpy()
adata_all

In [None]:
# UMAP of all cells, one panel per entry in `color`.
sc.pl.umap(
    adata_all,
    color=['leiden_l1', 'n_genes'],
    cmap='inferno_r',
    legend_loc='on data',
)

In [None]:
# UMAP of all cells coloured by POU5F1, with the colour scale capped at 50.
sc.pl.umap(
    adata_all,
    color=['POU5F1'],
    cmap='viridis',
    legend_loc='on data',
    vmax=50,
)

In [None]:
# Default combined label for every cell: its level-1 label followed by '_0'.
cluster_map = {
    cell_id: l1_label + '_0'
    for cell_id, l1_label in zip(adata_all.obs.index, adata_all.obs['leiden_l1'])
}

In [None]:
leiden_l1 = '9'

adata = adata_all[adata_all.obs['leiden_l1'] == leiden_l1].copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)


adata.raw = adata.copy()
adata = adata[:, hv_genes].copy()
hv_genes
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

sc.pp.neighbors(adata, n_neighbors=20)
sc.tl.umap(adata)

sc.tl.leiden(adata, resolution=0.1)
for i in range(adata.shape[0]):
    cluster_map[adata.obs.index[i]] = leiden_l1 + '_' + adata.obs['leiden'].iloc[i]

sc.pl.umap(adata, color=['leiden'], legend_loc='on data')

In [None]:
leiden_l1 = '14'

adata = adata_all[adata_all.obs['leiden_l1'] == leiden_l1].copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)


adata.raw = adata.copy()
adata = adata[:, hv_genes].copy()
hv_genes
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

sc.pp.neighbors(adata, n_neighbors=20)
sc.tl.umap(adata)

sc.tl.leiden(adata, resolution=0.2)
for i in range(adata.shape[0]):
    cluster_map[adata.obs.index[i]] = leiden_l1 + '_' + adata.obs['leiden'].iloc[i]

sc.pl.umap(adata, color=['leiden'], legend_loc='on data')

In [None]:
leiden_l1 = '15'

adata = adata_all[adata_all.obs['leiden_l1'] == leiden_l1].copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)


adata.raw = adata.copy()
adata = adata[:, hv_genes].copy()
hv_genes
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

sc.pp.neighbors(adata, n_neighbors=20)
sc.tl.umap(adata)

sc.tl.leiden(adata, resolution=0.8)
for i in range(adata.shape[0]):
    cluster_map[adata.obs.index[i]] = leiden_l1 + '_' + adata.obs['leiden'].iloc[i]

sc.pl.umap(adata, color=['leiden'], legend_loc='on data')

In [None]:
leiden_l1 = '16'

adata = adata_all[adata_all.obs['leiden_l1'] == leiden_l1].copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

adata.raw = adata.copy()
adata = adata[:, hv_genes].copy()
hv_genes
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

sc.pp.neighbors(adata, n_neighbors=20)
sc.tl.umap(adata)

sc.tl.leiden(adata, resolution=0.3)
for i in range(adata.shape[0]):
    cluster_map[adata.obs.index[i]] = leiden_l1 + '_' + adata.obs['leiden'].iloc[i]

sc.pl.umap(adata, color=['leiden'], legend_loc='on data')

In [None]:
leiden_l1 = '18'

adata = adata_all[adata_all.obs['leiden_l1'] == leiden_l1].copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

adata.raw = adata.copy()
adata = adata[:, hv_genes].copy()
hv_genes
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

sc.pp.neighbors(adata, n_neighbors=20)
sc.tl.umap(adata)

sc.tl.leiden(adata, resolution=0.8)
for i in range(adata.shape[0]):
    cluster_map[adata.obs.index[i]] = leiden_l1 + '_' + adata.obs['leiden'].iloc[i]

sc.pl.umap(adata, color=['leiden'], legend_loc='on data')

In [None]:
leiden_l1 = '19'

adata = adata_all[adata_all.obs['leiden_l1'] == leiden_l1].copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

adata.raw = adata.copy()
adata = adata[:, hv_genes].copy()
hv_genes
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

sc.pp.neighbors(adata, n_neighbors=20)
sc.tl.umap(adata)

sc.tl.leiden(adata, resolution=0.15)
for i in range(adata.shape[0]):
    cluster_map[adata.obs.index[i]] = leiden_l1 + '_' + adata.obs['leiden'].iloc[i]

sc.pl.umap(adata, color=['leiden'], legend_loc='on data')

In [None]:
leiden_l1 = '20'

adata = adata_all[adata_all.obs['leiden_l1'] == leiden_l1].copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

adata.raw = adata.copy()
adata = adata[:, hv_genes].copy()
hv_genes
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

sc.pp.neighbors(adata, n_neighbors=20)
sc.tl.umap(adata)

sc.tl.leiden(adata, resolution=0.8)
for i in range(adata.shape[0]):
    cluster_map[adata.obs.index[i]] = leiden_l1 + '_' + adata.obs['leiden'].iloc[i]

sc.pl.umap(adata, color=['leiden'], legend_loc='on data')

In [None]:
leiden_l1 = '21'

adata = adata_all[adata_all.obs['leiden_l1'] == leiden_l1].copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

adata.raw = adata.copy()
adata = adata[:, hv_genes].copy()
hv_genes
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

sc.pp.neighbors(adata, n_neighbors=20)
sc.tl.umap(adata)

sc.tl.leiden(adata, resolution=0.1)
for i in range(adata.shape[0]):
    cluster_map[adata.obs.index[i]] = leiden_l1 + '_' + adata.obs['leiden'].iloc[i]

sc.pl.umap(adata, color=['leiden'], legend_loc='on data')

In [None]:
leiden_l1 = '23'

adata = adata_all[adata_all.obs['leiden_l1'] == leiden_l1].copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

adata.raw = adata.copy()
adata = adata[:, hv_genes].copy()
hv_genes
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

sc.pp.neighbors(adata, n_neighbors=20)
sc.tl.umap(adata)

sc.tl.leiden(adata, resolution=0.1)
for i in range(adata.shape[0]):
    cluster_map[adata.obs.index[i]] = leiden_l1 + '_' + adata.obs['leiden'].iloc[i]

sc.pl.umap(adata, color=['leiden'], legend_loc='on data')

In [None]:
leiden_l1 = '24'

adata = adata_all[adata_all.obs['leiden_l1'] == leiden_l1].copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

adata.raw = adata.copy()
adata = adata[:, hv_genes].copy()
hv_genes
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

sc.pp.neighbors(adata, n_neighbors=20)
sc.tl.umap(adata)

sc.tl.leiden(adata, resolution=0.2)
for i in range(adata.shape[0]):
    cluster_map[adata.obs.index[i]] = leiden_l1 + '_' + adata.obs['leiden'].iloc[i]

sc.pl.umap(adata, color=['leiden'], legend_loc='on data')

In [None]:
leiden_l1 = '25'

adata = adata_all[adata_all.obs['leiden_l1'] == leiden_l1].copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

adata.raw = adata.copy()
adata = adata[:, hv_genes].copy()
hv_genes
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

sc.pp.neighbors(adata, n_neighbors=20)
sc.tl.umap(adata)

sc.tl.leiden(adata, resolution=0.2)
for i in range(adata.shape[0]):
    cluster_map[adata.obs.index[i]] = leiden_l1 + '_' + adata.obs['leiden'].iloc[i]

sc.pl.umap(adata, color=['leiden'], legend_loc='on data')

In [None]:
leiden_l1 = '28'

adata = adata_all[adata_all.obs['leiden_l1'] == leiden_l1].copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

adata.raw = adata.copy()
adata = adata[:, hv_genes].copy()
hv_genes
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')

sc.pp.neighbors(adata, n_neighbors=20)
sc.tl.umap(adata)

sc.tl.leiden(adata, resolution=0.1)
for i in range(adata.shape[0]):
    cluster_map[adata.obs.index[i]] = leiden_l1 + '_' + adata.obs['leiden'].iloc[i]

sc.pl.umap(adata, color=['leiden'], legend_loc='on data')

In [None]:
# Store the labels collected in cluster_map and the two UMAP coordinates as
# obs columns, then export the obs table.
adata_all.obs['cluster'] = adata_all.obs.index.map(cluster_map)
for axis_idx, column in enumerate(('umap_x', 'umap_y')):
    adata_all.obs[column] = adata_all.obsm['X_umap'][:, axis_idx]

adata_all.obs.to_csv('adata_obs_l2.csv')

In [None]:
# Flag mitochondrial genes by the "MT-" name prefix (the human convention).
is_mito = adata_all.var_names.str.startswith("MT-")
adata_all.var["mt"] = is_mito

# Compute QC metrics in place, including the "mt" gene set.
sc.pp.calculate_qc_metrics(adata_all, qc_vars=["mt"], inplace=True, log1p=True)

# Violin plots of three of the resulting obs columns, one panel each.
qc_columns = ["n_genes_by_counts", "total_counts", "pct_counts_mt"]
sc.pl.violin(adata_all, qc_columns, jitter=0.0, multi_panel=True)

In [None]:
# UMAP panels for the level-1 label, the combined cluster label and total counts.
sc.pl.umap(
    adata_all,
    color=['leiden_l1', 'cluster', 'total_counts'],
    vmax=50000,
    legend_loc='on data',
)

In [None]:
# UMAP of all cells coloured by IFNA2, with the colour scale capped at 0.1.
sc.pl.umap(
    adata_all,
    color=['IFNA2'],
    vmax=0.1,
    legend_loc='on data',
)

In [None]:
# One wide box plot per QC column, grouped by the combined cluster label.
# A single loop draws all three figures and avoids echoing an Axes repr
# as cell output.
for qc_metric in ('total_counts', 'n_genes_by_counts', 'pct_counts_mt'):
    fig, ax = plt.subplots(figsize=(30, 4))
    sns.boxplot(data=adata_all.obs, x='cluster', y=qc_metric, ax=ax,
                showfliers=False)

In [None]:
# Restrict to level-1 cluster '15' and show its combined cluster labels and HESX1.
cluster_15_cells = adata_all[adata_all.obs['leiden_l1'] == '15']
sc.pl.umap(cluster_15_cells, color=['cluster', 'HESX1'])

In [None]:
# Overlay the PDCD11-perturbed cells on the UMAP of level-1 cluster '15'.
adata = adata_all[adata_all.obs['leiden_l1'] == '15']
is_pdcd11 = adata.obs['perturbed_gene'] == 'PDCD11'

fig, ax = plt.subplots(1, 1, figsize=(5, 5))
# Background layer: every cell of the subset, drawn without a `color` key.
sc.pl.umap(adata, ax=ax, show=False)
# Foreground layer: only the selected cells, on the same axes.
sc.pl.umap(adata[is_pdcd11], color='perturbed_gene', ax=ax, show=False, s=20)

In [None]:
# Overlay the TIMELESS-perturbed cells on the UMAP of all cells.
is_timeless = adata_all.obs['perturbed_gene'] == 'TIMELESS'

fig, ax = plt.subplots(1, 1, figsize=(5, 5))
# Background layer: all cells, drawn without a `color` key.
sc.pl.umap(adata_all, ax=ax, show=False)
# Foreground layer: only the selected cells, on the same axes.
sc.pl.umap(adata_all[is_timeless], color='perturbed_gene', ax=ax, show=False, s=3)