In [1]:
import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np
import scipy.sparse

In [2]:
# Set scanpy's global figure resolution to 200 dpi.
sc.set_figure_params(dpi=200)

In [3]:
# Directory holding the author-provided .h5ad inputs; the harmonised outputs
# of this notebook are written back into the same directory.
basepath = "/storage/data/2404_revision/ce_data/testa_data"

In [4]:
def harmonise_genespace(adata, genes, keep_layer=None):
    """Project ``adata`` onto the gene space ``genes`` (same order, zero-padded).

    Genes listed in ``genes`` that are missing from ``adata.var_names`` are
    appended as empty (all-zero) columns; the columns are then subset and
    reordered to exactly ``genes``. Genes of ``adata`` that are not listed in
    ``genes`` are dropped. The percentage of ``genes`` already present in
    ``adata`` is printed.

    Parameters
    ----------
    adata : AnnData
        Input object. ``X`` (and every kept layer) may be in any scipy sparse
        format or dense; it is converted to CSR before the columns are padded.
    genes : list of str
        Target gene names, in the desired output order.
    keep_layer : iterable of str, optional
        Names of the layers to carry over. Layers not listed are dropped.
        Defaults to carrying over no layers.

    Returns
    -------
    AnnData
        New object with ``obs`` taken from ``adata``, a ``var`` table that
        only carries the gene names as its index, and columns equal to
        ``genes``.

    Raises
    ------
    ValueError
        If ``genes`` is empty (the overlap percentage would be undefined).
    """
    if len(genes) == 0:
        raise ValueError("genes must contain at least one gene name")
    if keep_layer is None:
        # Avoid a shared mutable default argument.
        keep_layer = ()

    genes_to_add = [g for g in genes if g not in adata.var_names]
    print(f"{(len(genes)-len(genes_to_add))/len(genes)*100:.0f} % gene overlap with reference")

    n_obs = adata.shape[0]
    n_vars_padded = adata.shape[1] + len(genes_to_add)

    def _pad_columns(matrix):
        # The padding trick below reuses the CSR buffers under a wider shape,
        # which is only valid for CSR input. Coerce first so CSC or dense
        # matrices are handled correctly instead of failing or being misread.
        csr = scipy.sparse.csr_matrix(matrix)
        return scipy.sparse.csr_matrix(
            (csr.data, csr.indices, csr.indptr),
            shape=(n_obs, n_vars_padded),
        )

    padded = ad.AnnData(
        X=_pad_columns(adata.X),
        obs=adata.obs,
        var=pd.DataFrame(index=adata.var.index.tolist() + genes_to_add),
        layers={name: _pad_columns(adata.layers[name]) for name in keep_layer},
    )

    # Subset and reorder to the reference gene list; the padded genes are now
    # selectable by name.
    return padded[:, genes].copy()

---

From: https://www.biorxiv.org/content/10.1101/2023.08.21.553507v1.full.pdf

Data provided directly by the authors

In [5]:
# Load the author-provided object, then rebuild the AnnData from its `.raw`
# slot so the processed `.X` stored in the file is discarded.
# NOTE(review): presumably `.raw` holds the un-logged expression values — confirm.
adata_mosaic = ad.read_h5ad(f"{basepath}/adataPagaRawMultiplexing.h5ad")
adata_mosaic = adata_mosaic.raw.to_adata()

In [6]:
# Reference feature table; its index defines the target gene space.
hnoca_genes = pd.read_csv("/storage/data/2404_revision/hnoca_features.csv", index_col=0)
# Subset/reorder the genes to the reference; reference genes missing from
# this dataset are added as all-zero columns.
adata_mosaic = harmonise_genespace(adata_mosaic, hnoca_genes.index.tolist())
# Swap in the reference annotation; rows line up because the columns were
# just ordered by hnoca_genes.index.
adata_mosaic.var = hnoca_genes

60 % gene overlap with reference


In [7]:
# Combine the cell line ID and the dataset label into a single key
# ("<cellID>_<dataset>"); it is used as the `batch` column below.
adata_mosaic.obs["cellID_dataset"] = adata_mosaic.obs["cellID"].astype(str) + "_" + adata_mosaic.obs["dataset"].astype(str)
adata_mosaic.obs["cellID_dataset"] = adata_mosaic.obs["cellID_dataset"].astype("category")

In [8]:
# Inspect the object after gene-space harmonisation.
adata_mosaic

AnnData object with n_obs × n_vars = 14913 × 36842
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'stage', 'type', 'id_stage', 'cellID_newName_type', 'S_score', 'G2M_score', 'phase', 'leidenAnnotated', 'leiden_1.2', 'endpoint_GlutamatergicNeurons_late', 'endpoint_GlutamatergicNeurons_early', 'endpoint_MigratingNeurons', 'endpoint_OuterRadialGliaAstrocytes', 'endpoint_Interneurons', 'endpoint_Interneurons_GAD2', 'endpoint_CajalR_like', 'Exc_Lineage', 'endpoint_GlutamatergicNeurons_both', 'cellID_dataset'
    var: 'ensembl', 'gene_symbol'

In [9]:
# Build the harmonised obs table for the mosaic dataset from scratch, keeping
# the original cell index. Columns are created in the order they should
# appear in the final table.

# Dataset label -> organoid age in days.
age_by_dataset = {
    'UpD300': 300,
    'UpD50': 50,
    'DownD50': 50,
    'DownD250': 250,
    'DownD100': 100,
    'UpD100_2': 100,
    'UpD100_1': 100,
}

# Author cluster annotation -> harmonised cell type label.
cell_type_by_cluster = {
    'Neurons': 'neuron',
    'GlutamatergicNeurons_late': 'glutamatergic neuron',
    'RadialGliaProgenitors': 'radial glial cell',
    'ProliferatingProgenitors': 'neural progenitor cell',
    'GlutamatergicNeurons_early': 'glutamatergic neuron',
    'MigratingNeurons': 'neuron',
    'intermediateProgenitors': 'neural progenitor cell',
    'Interneurons': 'interneuron',
    'Interneurons_GAD2': 'interneuron',
    'OuterRadialGliaAstrocytes': 'glioblast',
    'CajalR_like': 'Cajal-Retzius cell',
}

source_obs = adata_mosaic.obs
obs_new = pd.DataFrame(index=source_obs.index)

# Dataset-wide constants (sample level).
for column, value in [
    ("sample_source", "3d_culture"),
    ("organism", "Homo sapiens"),
    ("disease", "healthy"),
]:
    obs_new[column] = value

# Per-cell values taken from the authors' annotation.
obs_new["cell_line_original"] = source_obs["cellID"]
obs_new["organoid_age_days"] = source_obs["dataset"].replace(age_by_dataset)
obs_new["batch"] = source_obs["cellID_dataset"]
obs_new["treatment"] = ""
obs_new["cell_type_original"] = source_obs["leidenAnnotated"]

# Dataset-wide constants (assay / donor level).
for column, value in [
    ("suspension_type", "cell"),
    ("organ", "cerebral cortex"),
    ("assay_sc", "10x 3' v2"),
    ("ethnicity", "unknown"),
    ("sex", "unknown"),
    ("development_stage", "unknown"),
]:
    obs_new[column] = value

obs_new["cell_type"] = obs_new["cell_type_original"].replace(cell_type_by_cluster)

# Provenance.
obs_new["obs_names_original"] = obs_new.index
for column, value in [
    ("publication", "Caporale, 2023"),
    ("doi", "10.1101/2023.08.21.553507"),
    ("hnoca_core", False),
]:
    obs_new[column] = value

In [10]:
# Replace the authors' obs table with the harmonised one built above.
adata_mosaic.obs = obs_new

# Keep a copy of the input values before X is normalised in place.
# NOTE(review): the layer name implies these are length-normalised counts — confirm with the data provider.
adata_mosaic.layers["counts_lengthnorm"] = adata_mosaic.X.copy()
# Scale every cell to a total of 1e6, then log1p-transform X.
sc.pp.normalize_total(adata_mosaic, target_sum=1e6)
sc.pp.log1p(adata_mosaic)

In [11]:
# Keep only cells with more than 200 detected genes (non-zero entries in X)
# and report the share of cells that was dropped.
ncells_before = adata_mosaic.n_obs
genes_detected = np.asarray((adata_mosaic.X > 0).sum(axis=1)).ravel()
enough_genes = genes_detected > 200
adata_mosaic = adata_mosaic[enough_genes].copy()
print(f"{(ncells_before-adata_mosaic.n_obs)/ncells_before*100:.1f} % cells removed")

0.0 % cells removed


In [12]:
# Inspect the final object before writing it to disk.
adata_mosaic

AnnData object with n_obs × n_vars = 14913 × 36842
    obs: 'sample_source', 'organism', 'disease', 'cell_line_original', 'organoid_age_days', 'batch', 'treatment', 'cell_type_original', 'suspension_type', 'organ', 'assay_sc', 'ethnicity', 'sex', 'development_stage', 'cell_type', 'obs_names_original', 'publication', 'doi', 'hnoca_core'
    var: 'ensembl', 'gene_symbol'
    uns: 'log1p'
    layers: 'counts_lengthnorm'

In [13]:
# Save the harmonised mosaic dataset next to the input files (gzip-compressed h5ad).
adata_mosaic.write(f"{basepath}/testa_mosaic.h5ad", compression="gzip")

---

From: https://www.science.org/doi/10.1126/sciadv.adh2726

Data provided directly by the authors

In [14]:
# Load the author-provided object for the second dataset.
adata_ndd = ad.read_h5ad(f"{basepath}/Corrected_Velo_RawNolog_filtered.h5ad")

In [15]:
# Rebuild the AnnData from its `.raw` slot, then keep only the cells whose
# `Group` is "CTL".
# NOTE(review): "CTL" presumably marks the control lines — confirm against the publication.
adata_ndd = adata_ndd.raw.to_adata()
adata_ndd = adata_ndd[adata_ndd.obs["Group"] == "CTL"].copy()


These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.
  warn(


In [16]:
# Reference feature table; its index defines the target gene space.
hnoca_genes = pd.read_csv("/storage/data/2404_revision/hnoca_features.csv", index_col=0)
# Subset/reorder the genes to the reference; reference genes missing from
# this dataset are added as all-zero columns.
adata_ndd = harmonise_genespace(adata_ndd, hnoca_genes.index.tolist())
# Swap in the reference annotation; rows line up because the columns were
# just ordered by hnoca_genes.index.
adata_ndd.var = hnoca_genes

54 % gene overlap with reference


In [17]:
# Build the harmonised obs table for the NDD control dataset from scratch,
# keeping the original cell index. Columns are created in the order they
# should appear in the final table.

# Stage label -> organoid age in days.
age_by_stage = {
    'd50': 50,
    'd100': 100,
}

# Author cluster annotation -> harmonised cell type label.
cell_type_by_cluster = {
    'RG_IPC2': 'radial glial cell',
    'EN1': 'glutamatergic neuron',
    'Astro': 'astrocyte',
    'ENE': 'glutamatergic neuron',
    'RG1': 'radial glial cell',
    'IPC': 'neural progenitor cell',
    'RG_IPC': 'radial glial cell',
    'IN': 'interneuron',
    '5': 'unknown',
}

source_obs = adata_ndd.obs
obs_new = pd.DataFrame(index=source_obs.index)

# Dataset-wide constants (sample level).
for column, value in [
    ("sample_source", "3d_culture"),
    ("organism", "Homo sapiens"),
    ("disease", "healthy"),
]:
    obs_new[column] = value

# Per-cell values taken from the authors' annotation.
obs_new["cell_line_original"] = source_obs["CellLine"]
obs_new["organoid_age_days"] = source_obs["Stage"].replace(age_by_stage)
obs_new["batch"] = source_obs["easyNaming"]
obs_new["treatment"] = source_obs["Group"]
obs_new["cell_type_original"] = source_obs["lOfficial"]

# Dataset-wide constants (assay / donor level).
for column, value in [
    ("suspension_type", "cell"),
    ("organ", "cerebral cortex"),
    ("assay_sc", "10x 3' v2"),
    ("ethnicity", "unknown"),
    ("sex", "unknown"),
    ("development_stage", "unknown"),
]:
    obs_new[column] = value

obs_new["cell_type"] = obs_new["cell_type_original"].replace(cell_type_by_cluster)

# Provenance.
obs_new["obs_names_original"] = obs_new.index
for column, value in [
    ("publication", "Lopez-Tobon, 2023"),
    ("doi", "10.1126/sciadv.adh2726"),
    ("hnoca_core", False),
]:
    obs_new[column] = value

In [18]:
# Replace the authors' obs table with the harmonised one built above.
adata_ndd.obs = obs_new

# Keep a copy of the input values before X is normalised in place.
# NOTE(review): the layer name implies these are length-normalised counts — confirm with the data provider.
adata_ndd.layers["counts_lengthnorm"] = adata_ndd.X.copy()
# Scale every cell to a total of 1e6, then log1p-transform X.
sc.pp.normalize_total(adata_ndd, target_sum=1e6)
sc.pp.log1p(adata_ndd)

In [19]:
# Keep only cells with more than 200 detected genes (non-zero entries in X)
# and report the share of cells that was dropped.
ncells_before = adata_ndd.n_obs
genes_detected = np.asarray((adata_ndd.X > 0).sum(axis=1)).ravel()
enough_genes = genes_detected > 200
adata_ndd = adata_ndd[enough_genes].copy()
print(f"{(ncells_before-adata_ndd.n_obs)/ncells_before*100:.1f} % cells removed")

0.0 % cells removed


In [20]:
# Inspect the final object before writing it to disk.
adata_ndd

AnnData object with n_obs × n_vars = 29585 × 36842
    obs: 'sample_source', 'organism', 'disease', 'cell_line_original', 'organoid_age_days', 'batch', 'treatment', 'cell_type_original', 'suspension_type', 'organ', 'assay_sc', 'ethnicity', 'sex', 'development_stage', 'cell_type', 'obs_names_original', 'publication', 'doi', 'hnoca_core'
    var: 'ensembl', 'gene_symbol'
    uns: 'log1p'
    layers: 'counts_lengthnorm'

In [21]:
# Save the harmonised NDD control dataset next to the input files (gzip-compressed h5ad).
adata_ndd.write(f"{basepath}/testa_ndd.h5ad", compression="gzip")