In [1]:
import anndata as ad
import pandas as pd
import numpy as np
import scipy.sparse
import scanpy as sc

In [2]:
def _widen_to_csr(matrix, n_extra_cols):
    """Return ``matrix`` as CSR with ``n_extra_cols`` all-zero columns appended on the right."""
    # Coerce first: the (data, indices, indptr) triplet only describes rows
    # correctly when the input really is CSR (not CSC or a dense array).
    csr = scipy.sparse.csr_matrix(matrix)
    return scipy.sparse.csr_matrix(
        (csr.data, csr.indices, csr.indptr),
        shape=(csr.shape[0], csr.shape[1] + n_extra_cols),
    )


def harmonise_genespace(adata, genes, keep_layer=None):
    """Project ``adata`` onto a reference gene list.

    Genes from ``genes`` that are absent from ``adata.var_names`` are appended
    as all-zero columns; the result is then subset and ordered to ``genes``.
    The share of reference genes already present is printed.

    Parameters
    ----------
    adata
        AnnData object whose ``X`` (and the requested layers) can be converted
        to a SciPy CSR matrix.
    genes
        Reference gene names defining the columns (and their order) of the result.
    keep_layer
        Names of layers in ``adata.layers`` to carry over. Defaults to none.

    Returns
    -------
    A new AnnData with ``obs`` taken from ``adata``, a ``var`` frame that holds
    only the gene index, and CSR matrices for ``X`` and the kept layers. Nothing
    else from ``adata`` is passed to the new object.
    """
    genes = list(genes)
    layer_names = list(keep_layer) if keep_layer is not None else []

    present = set(adata.var_names)
    # De-duplicate while preserving order so that var names stay unique.
    genes_to_add = list(dict.fromkeys(g for g in genes if g not in present))

    n_shared = sum(g in present for g in genes)
    # Guard the empty reference list instead of dividing by zero.
    overlap_pct = n_shared / len(genes) * 100 if genes else 0.0
    print(f"{overlap_pct:.0f} % gene overlap with reference")

    n_new = len(genes_to_add)
    adata = ad.AnnData(
        X=_widen_to_csr(adata.X, n_new),
        obs=adata.obs,
        var=pd.DataFrame(index=adata.var.index.tolist() + genes_to_add),
        layers={name: _widen_to_csr(adata.layers[name], n_new) for name in layer_names},
    )[:, genes].copy()

    return adata

Preprint: https://www.biorxiv.org/content/10.1101/2024.01.21.576532

Data: https://doi.org/10.5281/zenodo.10391945

In [3]:
# Directory holding the input .h5ad files read below and the output written at the end.
basepath = "/storage/data/2404_revision/ce_data/binder_data"

In [4]:
# Read the two processed per-cell-line objects from disk.
path_409b2 = f"{basepath}/rna_409b2_processed.h5ad"
path_fok4 = f"{basepath}/rna_fok4_processed.h5ad"
adata_409b2 = ad.read_h5ad(path_409b2)
adata_fok4 = ad.read_h5ad(path_fok4)

# Stack both cell lines along the observation axis into one object.
adata = ad.concat([adata_409b2, adata_fok4])

In [5]:
# Reference feature table; its index defines the target gene space.
hnoca_genes = pd.read_csv("/storage/data/2404_revision/hnoca_features.csv", index_col=0)
reference_genes = hnoca_genes.index.tolist()

# Re-index the data onto the reference genes, carrying the two layers along.
adata = harmonise_genespace(adata, reference_genes, keep_layer=["counts", "lognorm"])

# The columns now follow the reference order, so the reference table becomes var.
adata.var = hnoca_genes

53 % gene overlap with reference


In [6]:
# Value mappings from the original annotations to the harmonised vocabulary.
age_days_by_label = {"D70":70, "D90": 90}
dev_stage_by_line = {"409b2": "36-year-old human stage", "FOK4": "unknown"}
cell_type_by_label = {
    'Imm.ChP': 'neural progenitor cell',
    'RG': 'radial glial cell',
    'ChP': 'choroid plexus epithelial cell',
    'IP': 'neural progenitor cell',
    'Cycling': 'neural progenitor cell',
    'Ex.Neurons': 'glutamatergic neuron',
    'Inh.Neurons': 'GABAergic neuron',
    'RGS5Neurons': 'neuron',
}

# Fresh metadata table sharing the cell index of the current object.
obs_new = pd.DataFrame(index=adata.obs.index)

# Dataset-wide constants.
for _col, _val in {
    "sample_source": "3d_culture",
    "organism": "Homo sapiens",
    "disease": "healthy",
}.items():
    obs_new[_col] = _val

# Per-cell annotations taken (and partly recoded) from the original obs.
obs_new["cell_line_original"] = adata.obs["cellline"]
obs_new["organoid_age_days"] = adata.obs["day"].replace(age_days_by_label)
for _col, _src in {
    "batch": "sample",
    "treatment": "treatment",
    "cell_type_original": "celltypes",
}.items():
    obs_new[_col] = adata.obs[_src]

# More dataset-wide constants.
# NOTE(review): ethnicity and sex are assigned identically to every cell of
# both cell lines — confirm this against the cell-line metadata.
for _col, _val in {
    "suspension_type": "cell",
    "organ": "brain",
    "assay_sc": "10x 3' v2",
    "ethnicity": "European",
    "sex": "female",
}.items():
    obs_new[_col] = _val

# Columns derived from the ones filled in above; unmapped values pass through unchanged.
obs_new["development_stage"] = obs_new["cell_line_original"].replace(dev_stage_by_line)
obs_new["cell_type"] = obs_new["cell_type_original"].replace(cell_type_by_label)

# Provenance fields.
obs_new["obs_names_original"] = obs_new.index
for _col, _val in {
    "publication": "Dony, 2024",
    "doi": "10.1101/2024.01.21.576532",
    "hnoca_core": False,
}.items():
    obs_new[_col] = _val

In [7]:
# Swap in the harmonised metadata table.
adata.obs = obs_new

# Keep only cells whose treatment label is 'Veh-Veh' or 'Veh'
# (presumably the vehicle controls — confirm with the dataset description).
adata = adata[adata.obs["treatment"].isin(['Veh-Veh', 'Veh'])].copy()

# X gets its OWN copy of the counts. Without the copy, X and the
# "counts_lengthnorm" layer would be the same matrix object, and the
# normalisation / log1p below can modify X in place, which would silently
# overwrite the counts kept in the layer as well.
adata.X = adata.layers["counts"].copy()
adata.layers["counts_lengthnorm"] = adata.layers["counts"]
del adata.layers["lognorm"], adata.layers["counts"]

# Scale every cell to a total of 1e6, then log1p-transform X.
sc.pp.normalize_total(adata, target_sum=1e6)
sc.pp.log1p(adata)

# Drop the remaining annotation containers so only X, obs, var and layers remain.
del adata.obsm
del adata.uns
del adata.obsp
del adata.varm

In [8]:
ncells_before = adata.n_obs

# Count, per cell, the genes with a value above zero. np.asarray accepts both
# the np.matrix returned by sparse-matrix sums and a plain ndarray, whereas
# the `.A` attribute exists only on np.matrix.
genes_per_cell = np.asarray((adata.X > 0).sum(axis=1)).ravel()

# Keep cells with more than 200 detected genes.
adata = adata[genes_per_cell > 200].copy()
print(f"{(ncells_before-adata.n_obs)/ncells_before*100:.1f} % cells removed")

0.0 % cells removed


In [9]:
# Display a summary of the final object (dimensions, obs/var columns, layers).
adata

AnnData object with n_obs × n_vars = 16114 × 36842
    obs: 'sample_source', 'organism', 'disease', 'cell_line_original', 'organoid_age_days', 'batch', 'treatment', 'cell_type_original', 'suspension_type', 'organ', 'assay_sc', 'ethnicity', 'sex', 'development_stage', 'cell_type', 'obs_names_original', 'publication', 'doi', 'hnoca_core'
    var: 'ensembl', 'gene_symbol'
    layers: 'counts_lengthnorm'

In [10]:
# Write the harmonised object next to the input files, gzip-compressed.
adata.write(f"{basepath}/binder.h5ad", compression="gzip")