In [1]:
import anndata as ad
import pandas as pd
import numpy as np
import scipy.sparse
import scanpy as sc

In [2]:
def harmonise_genespace(adata, genes, keep_layer=[]):
    genes_to_add = []
    for g in genes:
        if g not in adata.var_names:
            genes_to_add += [g]
    print(f"{(len(genes)-len(genes_to_add))/len(genes)*100:.0f} % gene overlap with reference")
    adata = ad.AnnData(
        X=scipy.sparse.csr_matrix((adata.X.data, adata.X.indices, adata.X.indptr), shape=(adata.shape[0], adata.shape[1] + len(genes_to_add))),
        obs=adata.obs,
        var=pd.DataFrame(index=adata.var.index.tolist() + genes_to_add),
        layers={i:scipy.sparse.csr_matrix((adata.layers[i].data, adata.layers[i].indices, adata.layers[i].indptr), shape=(adata.shape[0], adata.shape[1] + len(genes_to_add))) for i in keep_layer}
    )[:, genes].copy()
    
    return adata

Paper: https://www.nature.com/articles/s41593-019-0350-2

Data: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE124174

In [3]:
# Directory that holds both the input h5ad read below and the curated h5ad written at the end.
basepath = "/storage/data/2404_revision/ce_data/giandomenico_data"

In [4]:
# Load the Giandomenico et al. 2019 dataset. The h5ad file was generated from the
# GEO-deposited data using the giandomenico step 1 R script.
adata = ad.read_h5ad(f"{basepath}/Giandomenico_2019.h5ad")

In [5]:
# Store X in CSR (row-compressed) format before the gene space is harmonised below.
adata.X = adata.X.tocsr()

In [6]:
# Reference feature table; its index defines the target gene space and gene order.
hnoca_genes = pd.read_csv("/storage/data/2404_revision/hnoca_features.csv", index_col=0)
# Reorder/pad the genes to match the reference (prints the overlap with the reference).
adata = harmonise_genespace(adata, hnoca_genes.index.tolist())
# harmonise_genespace returns an index-only var, so attach the reference annotation.
adata.var = hnoca_genes

60 % gene overlap with reference


In [7]:
# Curated per-cell metadata, built in one ordered pass on top of the original
# cell index. Keyword order below is the final column order.
obs_new = pd.DataFrame(index=adata.obs.index).assign(
    # --- sample-level description ---
    sample_source="3d_culture",
    organism="Homo sapiens",
    disease="healthy",
    # --- values taken from / describing the original annotation ---
    cell_line_original=adata.obs["cell_line"],
    organoid_age_days=75,
    batch=adata.obs["BioSample"],
    treatment="none",
    cell_type_original="unknown",
    # --- assay and donor description ---
    suspension_type="cell",
    organ="brain",
    assay_sc="10x 3' v2",
    ethnicity="unknown",
    # Sex is derived from the cell line assigned just above.
    sex=lambda frame: frame["cell_line_original"].replace(
        {"embryonic stem cell line H1": "male", "embryonic stem cell line H9": "female"}
    ),
    development_stage="blastula stage",
    cell_type="unknown",
    # --- provenance ---
    obs_names_original=lambda frame: frame.index,
    publication="Giandomenico, 2019",
    doi="10.1038/s41593-019-0350-2",
    hnoca_core=False,
)

In [8]:
# Inspect the harmonised object; obs still holds the original columns at this point.
adata

AnnData object with n_obs × n_vars = 13280 × 36842
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'BioSample', 'cell_line', 'AGE'
    var: 'ensembl', 'gene_symbol'

In [9]:
# Swap in the curated metadata built above.
adata.obs = obs_new

# Keep an untouched copy of X in a layer before X is normalised in place.
adata.layers["counts_lengthnorm"] = adata.X.copy()
# Scale every cell to a total of 1e6, then apply log1p to X.
sc.pp.normalize_total(adata, target_sum=1e6)
sc.pp.log1p(adata)

In [10]:
ncells_before = adata.n_obs
# Number of genes with a non-zero value in each cell. np.asarray accepts every
# return type of `.sum(axis=1)` (np.matrix for sparse matrices, ndarray for
# sparse arrays and dense X), whereas `.A` exists only on np.matrix.
genes_per_cell = np.asarray((adata.X > 0).sum(axis=1)).ravel()
# Keep only cells with more than 200 detected genes.
adata = adata[genes_per_cell > 200].copy()
print(f"{(ncells_before-adata.n_obs)/ncells_before*100:.1f} % cells removed")

0.0 % cells removed


In [11]:
# Inspect the final object: curated obs columns, normalised X, original values kept in a layer.
adata

AnnData object with n_obs × n_vars = 13280 × 36842
    obs: 'sample_source', 'organism', 'disease', 'cell_line_original', 'organoid_age_days', 'batch', 'treatment', 'cell_type_original', 'suspension_type', 'organ', 'assay_sc', 'ethnicity', 'sex', 'development_stage', 'cell_type', 'obs_names_original', 'publication', 'doi', 'hnoca_core'
    var: 'ensembl', 'gene_symbol'
    uns: 'log1p'
    layers: 'counts_lengthnorm'

In [12]:
# Persist the curated dataset as a gzip-compressed h5ad in the same directory as the input.
adata.write(f"{basepath}/giandomenico.h5ad", compression="gzip")