In [1]:
import os
import scanpy as sc
import numpy as np
import pandas as pd
import scipy.sparse

In [2]:
# Set scanpy's logging verbosity to level 3.
sc.settings.verbosity = 3
# Use 200 dpi as the figure resolution for scanpy plots.
sc.set_figure_params(dpi=200)

In [None]:
# Load the step-08 atlas checkpoint; everything below attaches extra
# annotations and embeddings to this object.
adata = sc.read(
    "/storage/data/organoid_atlas_adatas/230510_08_organoids_labelled_.h5ad"
)

# Loading morphogen annotation

In [4]:
# Read the morphogen information table; the first CSV column becomes the index.
df = pd.read_csv("/storage/data/organoid_atlas_adatas/230521_integrations_labels/230523_morphogens_obs.csv", index_col=0)

# The table is joined to adata.obs column-wise, so it must list exactly the
# same observations in exactly the same order.
assert df.index.equals(adata.obs.index), (
    "morphogen table index does not match adata.obs.index "
    f"({len(df.index)} rows vs {len(adata.obs.index)} observations)"
)

# Drop columns that already exist in obs before concatenating, so that
# re-running this cell replaces them instead of creating duplicate column names.
overlap = df.columns.intersection(adata.obs.columns)
if len(overlap) > 0:
    print(f"Replacing {len(overlap)} existing obs column(s): {list(overlap)}")
adata.obs = pd.concat((adata.obs.drop(columns=overlap), df), axis=1)

# Loading integrations for benchmarking

In [6]:
# Show the AnnData summary before the integration embeddings are attached.
adata

AnnData object with n_obs × n_vars = 1770578 × 36842
    obs: 'assay_sc', 'assay_differentiation', 'assay_type_differentiation', 'bio_sample', 'cell_line', 'cell_type', 'development_stage', 'disease', 'ethnicity', 'gm', 'id', 'individual', 'organ', 'organism', 'sex', 'state_exact', 'sample_source', 'source_doi', 'suspension_type_original', 'tech_sample', 'treatment', 'assay_sc_original', 'cell_line_original', 'cell_type_original', 'development_stage_original', 'disease_original', 'ethnicity_original', 'organ_original', 'organism_original', 'sex_original', 'suspension_type', 'obs_names_original', 'organoid_age_days', 'publication', 'doi', 'batch', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'leiden_pca_unintegrated_1', 'leiden_pca_unintegrated_80', 'leiden_pca_rss_1', 'leiden_pca_rss_80', 'snapseed_pca_unintegrated_level_1', 'snapseed_pca_unintegrated_level_2', 'snapseed_pca_unintegrat

In [7]:
# Maps each integration result file name to the `adata.obsm` key under which
# its embedding is stored by the loading cell below.
# NOTE(review): the "aggr" embeddings only have level1 and level123 entries
# (no level12) — confirm this is intentional.
namedict = {
    # .h5ad results: the loader stores their `.X` matrix in obsm.
    "scvi_results.h5ad": "X_benchmark_scvi",
    "scanvi_results_level_1.h5ad": "X_benchmark_scanvi_level1",
    "scanvi_results_level_12.h5ad": "X_benchmark_scanvi_level12",
    "scanvi_results_level_123.h5ad": "X_benchmark_scanvi_level123",
    "scpoli_results_level_1.h5ad": "X_benchmark_scpoli_level1",
    "scpoli_results_level_12.h5ad": "X_benchmark_scpoli_level12",
    "scpoli_results_level_123.h5ad": "X_benchmark_scpoli_level123",
    # .npy results: the loader stores the loaded array in obsm as-is.
    "emb_scpoli_aggr_level1.npy": "X_benchmark_aggr_scpoli_level1",
    "emb_scpoli_aggr_level123.npy": "X_benchmark_aggr_scpoli_level123",
    # These keys contain "umap", so the benchmarking clean-up cell removes them again.
    "emb_umap_scpoli_aggr_level1.npy": "X_umap_benchmark_aggr_scpoli_level1",
    "emb_umap_scpoli_aggr_level123.npy": "X_umap_benchmark_aggr_scpoli_level123",
}

In [8]:
# Attach every integration embedding listed in `namedict` to `adata.obsm`.
#
# The loop is driven by `namedict` rather than by the directory listing, so
# that (a) unrelated .npy/.h5ad files in the folder are ignored instead of
# raising KeyError, (b) the load order is deterministic, and (c) an expected
# file that is missing is reported instead of being skipped silently.
basepath = "/storage/data/organoid_atlas_adatas/230521_integrations_labels/"
for f, obsm_key in namedict.items():
    path = os.path.join(basepath, f)
    if not os.path.exists(path):
        print(f"WARNING: {path} not found; '{obsm_key}' was not loaded")
        continue
    if f.endswith(".npy"):
        emb = np.load(path)
        # A .npy file carries no observation index, so the row count is the
        # only consistency check available here.
        if emb.shape[0] != adata.n_obs:
            raise ValueError(
                f"{f}: {emb.shape[0]} rows, expected {adata.n_obs} observations"
            )
        adata.obsm[obsm_key] = emb
    elif f.endswith(".h5ad"):
        a = sc.read(path)
        # The embedding is copied positionally, so the observation order of
        # the result file must match the atlas exactly.
        assert a.obs.index.equals(adata.obs.index), (
            f"{f}: obs index does not match adata.obs.index"
        )
        adata.obsm[obsm_key] = a.X.copy()

In [9]:
# Show the AnnData summary again after the integration embeddings were added to obsm.
adata

AnnData object with n_obs × n_vars = 1770578 × 36842
    obs: 'assay_sc', 'assay_differentiation', 'assay_type_differentiation', 'bio_sample', 'cell_line', 'cell_type', 'development_stage', 'disease', 'ethnicity', 'gm', 'id', 'individual', 'organ', 'organism', 'sex', 'state_exact', 'sample_source', 'source_doi', 'suspension_type_original', 'tech_sample', 'treatment', 'assay_sc_original', 'cell_line_original', 'cell_type_original', 'development_stage_original', 'disease_original', 'ethnicity_original', 'organ_original', 'organism_original', 'sex_original', 'suspension_type', 'obs_names_original', 'organoid_age_days', 'publication', 'doi', 'batch', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'leiden_pca_unintegrated_1', 'leiden_pca_unintegrated_80', 'leiden_pca_rss_1', 'leiden_pca_rss_80', 'snapseed_pca_unintegrated_level_1', 'snapseed_pca_unintegrated_level_2', 'snapseed_pca_unintegrat

In [10]:
# Checkpoint the full object (gzip-compressed) before it is stripped down
# for benchmarking; this file is read back in further below.
adata.write_h5ad(
    "/storage/data/organoid_atlas_adatas/230510_09_organoids_morphogens_manualannot_integrations.h5ad",
    compression="gzip",
)

In [10]:
# Strip the in-memory object down for benchmarking. This mutates `adata`
# destructively; the full version was written to disk in the previous cell.

# Remove every pairwise (obsp) entry. Keys are snapshotted first because the
# mapping is modified while looping.
for obsp_key in list(adata.obsp.keys()):
    del adata.obsp[obsp_key]

# Remove "X_rss" and every embedding whose key contains "umap".
obsm_to_drop = [
    obsm_key
    for obsm_key in adata.obsm.keys()
    if obsm_key == "X_rss" or "umap" in obsm_key
]
for obsm_key in obsm_to_drop:
    del adata.obsm[obsm_key]

# Remove every layer.
for layer_key in list(adata.layers.keys()):
    del adata.layers[layer_key]

# Replace X with an empty sparse matrix of the same shape.
adata.X = scipy.sparse.csr_matrix(adata.shape)

In [11]:
# Save the stripped-down object (gzip-compressed) under a separate
# "cleanedbenchmarking" file name.
adata.write_h5ad(
    "/storage/data/organoid_atlas_adatas/230510_09_organoids_morphogens_manualannot_integrations_cleanedbenchmarking.h5ad",
    compression="gzip",
)

# Loading final (curated) celltype annotation

In [5]:
# Reload the full step-09 checkpoint, replacing the stripped-down object
# that the benchmarking clean-up left in `adata`.
adata = sc.read(
    "/storage/data/organoid_atlas_adatas/230510_09_organoids_morphogens_manualannot_integrations.h5ad"
)

In [7]:
# Read the curated annotation table (tab-separated); the first column becomes
# the index. low_memory=False makes pandas read the file in one pass.
annot = pd.read_csv(
    "/storage/data/organoid_atlas_adatas/metadata_0627.tsv.gz",
    sep="\t",
    index_col=0,
    low_memory=False,
)

# Later cells copy columns from `annot` into `adata.obs`, so both must list
# the same observations in the same order. Index.equals also copes with a
# length mismatch, where an elementwise `==` would raise ValueError instead.
assert annot.index.equals(adata.obs.index), (
    "annotation table index does not match adata.obs.index "
    f"({len(annot.index)} rows vs {len(adata.obs.index)} observations)"
)

In [16]:
# Store the "final_region2" column of the annotation table in obs as "annot_region".
adata.obs["annot_region"] = annot["final_region2"]

In [11]:
# Copy the four annotation levels from the annotation table into obs,
# keeping the column names unchanged.
for level_col in (
    "annot_level_1",
    "annot_level_2",
    "annot_level_3",
    "annot_level_4",
):
    adata.obs[level_col] = annot[level_col]

In [18]:
# Number of observations per region label.
adata.obs["annot_region"].value_counts()

Dorsal telencephalon     757626
Unspecific               369569
Ventral telencephalon    158506
Medulla                  136526
Cerebellum                94494
Thalamus                  75016
Hypothalamus              70103
Pons                      63328
Dorsal midbrain           30651
Ventral midbrain          14759
Name: annot_region, dtype: int64

In [12]:
# Number of observations per level-1 annotation label.
adata.obs["annot_level_1"].value_counts()

Neuron             856654
NPC                528922
Neuroepithelium     91689
Glioblast           79506
CP                  77315
MC                  49273
Astrocyte           48956
IP                  15433
NC Derivatives       9771
OPC                  8325
EC                   2451
PSC                  2187
Microglia              96
Name: annot_level_1, dtype: int64

In [13]:
# Number of observations per level-2 annotation label.
adata.obs["annot_level_2"].value_counts()

Dorsal Telencephalic Neuron     518082
Non-telencephalic NPC           258311
Non-telencephalic Neuron        226566
Dorsal Telencephalic NPC        224111
Ventral Telencephalic Neuron    112006
Neuroepithelium                  91689
Glioblast                        79506
CP                               77315
MC                               49273
Astrocyte                        48956
Ventral Telencephalic NPC        46500
Dorsal Telencephalic IP          15433
NC Derivatives                    9771
OPC                               8325
EC                                2451
PSC                               2187
Microglia                           96
Name: annot_level_2, dtype: int64

In [14]:
# Number of observations per level-3 annotation label.
adata.obs["annot_level_3"].value_counts()

Dorsal Telencephalic Neuron     518082
Dorsal Telencephalic NPC        224111
Ventral Telencephalic Neuron    112006
Neuroepithelium                  91689
Medulla NPC                      88818
Glioblast                        79506
CP                               77315
Cerebellar NPC                   55673
Hypothalamic Neuron              49813
MC                               49273
Astrocyte                        48956
Medulla Neuron                   47708
Ventral Telencephalic NPC        46500
Thalamic NPC                     40158
Cerebellar Neuron                38821
Pons Neuron                      36808
Thalamic Neuron                  34858
Pons NPC                         26520
Hypothalamic NPC                 20290
Dorsal Midbrain NPC              17928
Dorsal Telencephalic IP          15433
Dorsal Midbrain Neuron           12723
NC Derivatives                    9771
Ventral Midbrain NPC              8924
OPC                               8325
Ventral Midbrain Neuron  

In [15]:
# Number of observations per level-4 annotation label.
adata.obs["annot_level_4"].value_counts()

Dorsal Telencephalic Neuron NT-VGLUT    518082
Dorsal Telencephalic NPC                224111
Ventral Telencephalic Neuron NT-GABA    112006
Neuroepithelium                          91689
Medulla NPC                              88818
Glioblast                                79506
CP                                       77315
Cerebellar NPC                           55673
MC                                       49273
Astrocyte                                48956
Ventral Telencephalic NPC                46500
Thalamic NPC                             40158
Medulla Neuron NT-VGLUT                  33086
Thalamic Neuron NT-VGLUT                 27184
Pons NPC                                 26520
Cerebellar Neuron NT-GABA                26050
Hypothalamic Neuron NT-GABA              25112
Hypothalamic Neuron NT-VGLUT             24701
Pons Neuron NT-VGLUT                     21518
Hypothalamic NPC                         20290
Dorsal Midbrain NPC                      17928
Dorsal Telenc

In [19]:
# Write the object with the curated annotation columns attached
# (gzip-compressed) as the step-10 checkpoint.
adata.write_h5ad(
    "/storage/data/organoid_atlas_adatas/230620_10_updated_manual_annotation.h5ad",
    compression="gzip",
)