# Generate anndata for the section cellular structure

This notebook fetches the NeurIPS human bone marrow dataset and preprocesses it following the steps in the preprocessing and visualization section.
(The dataset is first subset to the sample `site4-donor8`.)

## Packages

In [52]:
import logging
from pathlib import Path

import lamindb as ln
import numpy as np
import rpy2.rinterface_lib.callbacks as rcb
import rpy2.robjects as ro
import rpy2.robjects.packages as rpackages
import scanpy as sc
from rpy2.robjects import default_converter, numpy2ri, pandas2ri, r
from rpy2.robjects.conversion import localconverter
from scipy.sparse import csc_matrix, csr_matrix, issparse
from scipy.stats import median_abs_deviation

## Downloading and Subsetting the dataset

The original dataset is from: https://figshare.com/ndownloader/files/51994907.
This dataset is uploaded to LaminDB under the key `cellular_structure/human_bmmcs.h5ad`, and it is loaded from there to be subset and preprocessed in this notebook.

In [2]:
# Refuse to run against any lamindb instance other than the expected one.
expected_slug = "theislab/sc-best-practices"
assert ln.setup.settings.instance.slug == expected_slug

# Start tracking this notebook run under its transform uid.
ln.track("639B7kNp8Fcb")

[92m→[0m loaded Transform('639B7kNp8Fcb0000', key='anndata_clustering.ipynb'), re-started Run('74JDugGQURpsTHdh') at 2026-02-16 14:39:09 UTC
[92m→[0m notebook imports: lamindb==2.0a2 numpy==2.1.3 rpy2==3.5.11 scanpy==1.11.1 scipy==1.14.1


In [3]:
# Upload the locally downloaded raw dataset to lamindb.
# The path is derived from the current user's home directory rather than a
# hard-coded absolute path, so the cell is not tied to one user account.
raw_h5ad_path = Path.home() / "Downloads" / "human_bmmcs.h5ad"

af = ln.Artifact(
    raw_h5ad_path,
    key="cellular_structure/human_bmmcs.h5ad",
    description="Human bone marrow mononuclear cells dataset before subsetting and preprocessing.",
).save()
af

... uploading human_bmmcs.h5ad: 100.0%


Artifact(uid='kJw3eFAHPHO3Tfo90000', version_tag=None, is_latest=True, key='cellular_structure/human_bmmcs.h5ad', description='Human bone marrow mononuclear cells dataset before subsetting and preprocessing.', suffix='.h5ad', kind=None, otype='AnnData', size=403427025, hash='vSA467cLfvtnqBe2_UU7Sn', n_files=None, n_observations=None, branch_id=1, space_id=1, storage_id=1, run_id=38, schema_id=None, created_by_id=5, created_at=2026-02-16 14:39:11 UTC, is_locked=False)

In [5]:
# Look up the latest version of the uploaded raw dataset by its key and load it.
af = ln.Artifact.get(key="cellular_structure/human_bmmcs.h5ad", is_latest=True)
adata = af.load()
adata

AnnData object with n_obs × n_vars = 69249 × 13431
    obs: 'pct_counts_mt', 'n_counts', 'n_genes', 'size_factors', 'phase', 'cell_type', 'batch', 'pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality', 'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType', 'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker', '_scvi_batch', '_scvi_labels'
    var: 'feature_types', 'gene_id'
    uns: 'DonorNumber_colors', 'Samplename_colors', 'Site_colors', '_scvi_manager_uuid', '_scvi_uuid', 'batch_colors', 'cell_type_colors', 'dataset_id', 'genome', 'neighbors', 'organism', 'phase_colors', 'umap'
    obsm: 'X_pca', 'X_scANVI', 'X_umap', '_scvi_extra_categorical_covs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [6]:
# Keep only the cells of site 4 / donor 8: their barcodes contain "s4d8".
is_s4d8 = adata.obs_names.str.contains("s4d8")
adata_s4d8 = adata[is_s4d8].copy()

# Snapshot of the subset before any QC filtering; used later as SoupX input.
adata_raw = adata_s4d8.copy()

In [7]:
# Subsetting an AnnData by cells keeps every gene, so genes that have no
# counts left in the s4d8 cells are removed explicitly here.
gene_totals = adata_s4d8.X.sum(axis=0).A1
adata_s4d8 = adata_s4d8[:, gene_totals > 0].copy()

In [8]:
# To make it easier to work with: from here on `adata` is the s4d8 subset
# (this rebinds the name that previously held the full dataset).
adata = adata_s4d8.copy()
adata

AnnData object with n_obs × n_vars = 9876 × 13396
    obs: 'pct_counts_mt', 'n_counts', 'n_genes', 'size_factors', 'phase', 'cell_type', 'batch', 'pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality', 'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType', 'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker', '_scvi_batch', '_scvi_labels'
    var: 'feature_types', 'gene_id'
    uns: 'DonorNumber_colors', 'Samplename_colors', 'Site_colors', '_scvi_manager_uuid', '_scvi_uuid', 'batch_colors', 'cell_type_colors', 'dataset_id', 'genome', 'neighbors', 'organism', 'phase_colors', 'umap'
    obsm: 'X_pca', 'X_scANVI', 'X_umap', '_scvi_extra_categorical_covs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

## Preprocessing

### Quality Control

In [9]:
gene_names = adata.var_names

# Mitochondrial genes: symbols prefixed with "MT-".
adata.var["mt"] = gene_names.str.startswith("MT-")
# Ribosomal genes: symbols prefixed with "RPS" or "RPL".
adata.var["ribo"] = gene_names.str.startswith(("RPS", "RPL"))
# Hemoglobin genes: "HB" followed by any character that is not "(", "P" or ")".
# Note that `[^(P)]` is a character class, so the parentheses are literals.
adata.var["hb"] = gene_names.str.contains("^HB[^(P)]")

In [10]:
# Add per-cell and per-gene QC metrics to adata.obs / adata.var in place,
# including the share of counts in the mt / ribo / hb gene sets flagged above.
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=["mt", "ribo", "hb"],
    percent_top=[20],
    log1p=True,
    inplace=True,
)
adata

AnnData object with n_obs × n_vars = 9876 × 13396
    obs: 'pct_counts_mt', 'n_counts', 'n_genes', 'size_factors', 'phase', 'cell_type', 'batch', 'pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality', 'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType', 'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker', '_scvi_batch', '_scvi_labels', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb'
    var: 'feature_types', 'gene_id', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'DonorNumber_colors', 'Samplename_colors', 'Site_colors', '_scvi_manager_uuid', '_scvi_uuid', 'batch_colors', 'cell_type_colors', 'dataset_id', 'genom

In [11]:
def is_outlier(adata, metric: str, nmads: int):
    """Flag observations whose ``metric`` is far from the median.

    An observation is an outlier when its value lies strictly more than
    ``nmads`` median absolute deviations (MADs) below or above the median of
    ``adata.obs[metric]``.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` table that can be indexed by column name.
    metric
        Name of the column in ``adata.obs`` to test.
    nmads
        Number of MADs that defines the accepted band around the median.

    Returns
    -------
    Boolean mask with one entry per observation; ``True`` marks an outlier.
    """
    values = adata.obs[metric]
    # Median and MAD are loop-invariant: compute each once instead of twice.
    center = np.median(values)
    spread = nmads * median_abs_deviation(values)
    return (values < center - spread) | (center + spread < values)

In [14]:
# A cell is an outlier if any of these metrics is more than 5 MADs off the median.
count_metrics = (
    "log1p_total_counts",
    "log1p_n_genes_by_counts",
    "pct_counts_in_top_20_genes",
)
total_flag, genes_flag, top20_flag = (
    is_outlier(adata, metric, 5) for metric in count_metrics
)
adata.obs["outlier"] = total_flag | genes_flag | top20_flag
adata.obs["outlier"].value_counts()

outlier
False    9654
True      222
Name: count, dtype: int64

In [15]:
# Mitochondrial outliers: more than 3 MADs from the median, or above 8 % mt counts.
pct_mt = adata.obs["pct_counts_mt"]
exceeds_mt_cap = pct_mt > 8
adata.obs["mt_outlier"] = is_outlier(adata, "pct_counts_mt", 3) | exceeds_mt_cap
adata.obs["mt_outlier"].value_counts()

mt_outlier
False    9075
True      801
Name: count, dtype: int64

In [16]:
print(f"Total number of cells: {adata.n_obs}")

# Keep a cell only if it is neither a count outlier nor a mitochondrial outlier.
is_low_quality = adata.obs["outlier"] | adata.obs["mt_outlier"]
adata = adata[~is_low_quality].copy()

print(f"Number of cells after filtering of low quality cells: {adata.n_obs}")

Total number of cells: 9876
Number of cells after filtering of low quality cells: 8874


**Correction of ambient RNA using SoupX**

In [17]:
import logging

rcb.logger.setLevel(logging.ERROR)


%load_ext rpy2.ipython

In [18]:
%%R
# Attach SoupX, used below for the ambient RNA correction.
library(SoupX)


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

In [19]:
# Work on a copy so that `adata` itself is left unchanged: the copy is scaled
# to 10,000 counts per cell and log1p-transformed only to derive clusters.
adata_pp = adata.copy()
sc.pp.normalize_total(adata_pp, target_sum=1e4)
sc.pp.log1p(adata_pp)

In [20]:
# PCA -> neighbour graph -> Leiden clustering on the temporary normalized copy.
sc.pp.pca(adata_pp)
sc.pp.neighbors(adata_pp)
sc.tl.leiden(
    adata_pp,
    key_added="soupx_groups",
    flavor="igraph",
    n_iterations=2,
    directed=False,
)

# Carry the cluster labels over to the main object; SoupX receives them later.
adata.obs["soupx_groups"] = adata_pp.obs["soupx_groups"]

In [21]:
# The temporary copy is no longer needed once its cluster labels are stored on `adata`.
del adata_pp

In [22]:
# Inputs for SoupX from the QC-filtered object: cell names, gene names and
# the matrix transposed to genes x cells.
cells = adata.obs_names
genes = adata.var_names
data = adata.X.T

In [23]:
# Synchronise the genes for SoupX: restrict the unfiltered snapshot to the
# genes (and gene order) of the filtered object.
common_genes = adata.var_names
adata_raw = adata_raw[:, common_genes].copy()

In [24]:
# `adata_raw` is the s4d8 subset taken before QC filtering; it is handed to
# SoupX as the first argument of SoupChannel (the "table of droplets").
adata_raw.var_names_make_unique()

genes_raw = adata_raw.var_names
cells_raw = adata_raw.obs_names

# Transposed to genes x cells, like `data`.
data_tod = adata_raw.X.T

In [25]:
# Everything needed from the snapshot has been extracted above.
del adata_raw

In [26]:
def _csc_parts(matrix):
    """Split a CSC matrix into (values, row indices, column pointers, shape).

    Values are cast to float64 and the index arrays / shape to int32.
    """
    return (
        matrix.data.astype(np.float64),
        matrix.indices.astype(np.int32),
        matrix.indptr.astype(np.int32),
        np.array(matrix.shape, dtype=np.int32),
    )


data_csc = data.tocsc()
data_tod_csc = data_tod.tocsc()

# Extract sparse components and cast to correct types
x, i, p, dims = _csc_parts(data_csc)
x_tod, i_tod, p_tod, dims_tod = _csc_parts(data_tod_csc)

# Everything the SoupX R cell reads from the R global environment, by R name.
r_inputs = {
    "x": x,
    "i": i,
    "p": p,
    "dims": dims,
    "x_tod": x_tod,
    "i_tod": i_tod,
    "p_tod": p_tod,
    "dims_tod": dims_tod,
    "genes": np.array(genes),
    "genes_raw": np.array(genes_raw),
    "cells": np.array(cells),
    "cells_raw": np.array(cells_raw),
    "soupx_groups": adata.obs["soupx_groups"].to_numpy(),
}

with localconverter(ro.default_converter + pandas2ri.converter + numpy2ri.converter):
    for r_name, py_value in r_inputs.items():
        ro.globalenv[r_name] = py_value

In [27]:
%%R -o out 

# Rebuild the two genes x cells matrices from the components pushed from
# Python, then run SoupX and leave the corrected counts in `out`.
library(Matrix)

# Manually coerce types to avoid "array" class errors
x <- as.numeric(x)
i <- as.integer(i)
p <- as.integer(p)
dims <- as.integer(dims)

x_tod <- as.numeric(x_tod)
i_tod <- as.integer(i_tod)
p_tod <- as.integer(p_tod)
dims_tod <- as.integer(dims_tod)

# Reconstruct sparse matrices (compressed-column: values x, row indices i,
# column pointers p)
data <- new("dgCMatrix",
            Dim = dims,
            x = x,
            i = i,
            p = p)

data_tod <- new("dgCMatrix",
                Dim = dims_tod,
                x = x_tod,
                i = i_tod,
                p = p_tod)

# Assign row (gene) and column (cell) names
rownames(data) <- genes
colnames(data) <- cells
rownames(data_tod) <- genes_raw
colnames(data_tod) <- cells_raw

# Alternative SoupX pipeline, kept for reference: let SoupChannel compute the
# soup profile itself (calcSoupProfile = TRUE).
# sc = SoupChannel(data_tod, data, calcSoupProfile = TRUE)
# sc = setClusters(sc, soupx_groups)
# sc = autoEstCont(sc, doPlot = FALSE)
# out = adjustCounts(sc, roundToInt = TRUE)

#### OR

# SoupX - we don't have empty droplets, so the soup profile is not computed
# by SoupChannel but set manually below.
sc = SoupChannel(data_tod, data, calcSoupProfile = FALSE)

# Soup profile from the filtered matrix: each gene's share of all counts
# (est) and its total counts (counts).
soupProf = data.frame(row.names = rownames(data), est = rowSums(data)/sum(data), counts = rowSums(data))
sc = setSoupProfile(sc, soupProf)
sc = setClusters(sc, soupx_groups)

# Estimate contamination fraction
sc  = autoEstCont(sc, doPlot=FALSE)
# Infer corrected table of counts and round to integer
out = adjustCounts(sc, roundToInt = TRUE)

In [30]:
# Fetch the SoupX result `out` from the R global environment.
with localconverter(ro.default_converter + pandas2ri.converter + numpy2ri.converter):
    out_py = ro.conversion.rpy2py(ro.globalenv["out"])

# Read the compressed-column slots (values, row indices, column pointers,
# dimensions) and rebuild the matrix as a scipy CSC matrix.
x = np.array(out_py.slots["x"])
i = np.array(out_py.slots["i"])
p = np.array(out_py.slots["p"])
shape = tuple(out_py.slots["Dim"])

out_matrix = csc_matrix((x, i, p), shape=shape)

In [31]:
# NOTE: this replaces the "counts" layer that shipped with the dataset by a
# copy of the current (pre-SoupX) adata.X.
adata.layers["counts"] = adata.X.copy()
# The SoupX output is genes x cells; transpose it back to cells x genes.
adata.layers["soupX_counts"] = out_matrix.T
# Subsequent steps read the ambient-corrected counts from adata.X.
adata.X = adata.layers["soupX_counts"]

In [32]:
print(f"Total number of genes: {adata.n_vars}")

# Keep only genes detected in at least 20 cells; this also removes genes
# left with zero counts.
sc.pp.filter_genes(adata, min_cells=20)
print(f"Number of genes after cell filter: {adata.n_vars}")

Total number of genes: 13396
Number of genes after cell filter: 12850


**Doublet detection**

In [33]:
%%R
# R packages attached for the doublet-detection step; the cell below calls
# scDblFinder() and SingleCellExperiment().
library(Seurat)
library(scater)
library(scDblFinder)
library(SingleCellExperiment)
library(BiocParallel)

In [34]:
# genes x cells, compressed-column layout
data_mat = adata.X.T.tocsc()

# Values as float64; row indices, column pointers and shape as int32.
x, i, p = (
    data_mat.data.astype(np.float64),
    data_mat.indices.astype(np.int32),
    data_mat.indptr.astype(np.int32),
)
dims = np.array(data_mat.shape, dtype=np.int32)

# Hand the four components to R under the names the next %%R cell reads.
with localconverter(ro.default_converter + numpy2ri.converter):
    for r_name, py_array in (("x", x), ("i", i), ("p", p), ("dims", dims)):
        ro.globalenv[r_name] = py_array

In [35]:
%%R -o doublet_score -o doublet_class

# Coerce the vectors pushed from Python to plain numeric / integer vectors.
x <- as.numeric(x)
i <- as.integer(i)
p <- as.integer(p)
dims <- as.integer(dims)

# Rebuild the genes x cells sparse matrix from its compressed-column parts.
data_mat <- new("dgCMatrix", Dim = dims, x = x, i = i, p = p)

# Fixed seed so that the doublet calls are reproducible.
set.seed(123)
sce <- scDblFinder(SingleCellExperiment(list(counts = data_mat)))

# Per-cell doublet score and singlet/doublet call, exported to Python via -o.
doublet_score <- sce$scDblFinder.score
doublet_class <- sce$scDblFinder.class

In [36]:
# `doublet_score` and `doublet_class` were exported from the previous %%R cell
# through its -o flags. Cells are only annotated here, none are removed.
adata.obs["scDblFinder_score"] = doublet_score
adata.obs["scDblFinder_class"] = doublet_class
adata.obs.scDblFinder_class.value_counts()

scDblFinder_class
singlet    7906
doublet     968
Name: count, dtype: int64

### Normalization

In [37]:
# With inplace=False the normalized result is returned (adata.X is left as it
# is); its "X" entry holds the scaled matrix.
scales_counts = sc.pp.normalize_total(adata, target_sum=None, inplace=False)
# log1p transform of the scaled counts, stored as its own layer
adata.layers["log1p_norm"] = sc.pp.log1p(scales_counts["X"], copy=True)

In [38]:
%%R
# R packages attached for the scran size-factor step below.
library(scran)
library(BiocParallel)

In [39]:
# Preliminary clustering for differentiated normalisation.
# The copy is normalized and log1p-transformed only to obtain cluster labels.
adata_pp = adata.copy()
sc.pp.normalize_total(adata_pp)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.leiden(
    adata_pp,
    key_added="groups",
    flavor="igraph",
    n_iterations=2,
    directed=False,
)

In [44]:
# scran's computeSumFactors receives this matrix as the `counts` assay, so it
# must hold counts. Export `adata.X` (genes x cells after transposing), not
# `adata_pp.X`, which was normalized and log1p-transformed for clustering.
data_mat = adata.X.T
# The matrix is handed to R as a dense numpy array. The previous
# tocoo()/tocsc() branching had no effect because toarray() always followed.
if issparse(data_mat):
    data_mat = data_mat.toarray()

with localconverter(ro.default_converter + numpy2ri.converter):
    ro.globalenv["data_mat"] = data_mat

# Only the cluster labels are taken from the temporary clustering copy.
with localconverter(ro.default_converter + pandas2ri.converter):
    ro.globalenv["input_groups"] = adata_pp.obs["groups"]

In [45]:
# The clustering copy is no longer needed once its labels have been passed to R.
del adata_pp

In [46]:
%%R -o size_factors

# Wrap the matrix pushed from Python as the `counts` assay.
sce_counts <- SingleCellExperiment(list(counts = data_mat))

# Pooled size factors, computed within the preliminary clusters.
sce_counts <- computeSumFactors(
    sce_counts,
    clusters = input_groups,
    min.mean = 0.1,
    BPPARAM = MulticoreParam()
)

# One size factor per cell; exported to Python through -o size_factors.
size_factors <- sizeFactors(sce_counts)

In [47]:
# `size_factors` was exported from the previous %%R cell (-o size_factors).
# NOTE: this overwrites the `size_factors` column that shipped with the dataset.
adata.obs["size_factors"] = size_factors
# Divide every cell (row) by its size factor, log1p-transform the result and
# store it as a CSR layer.
scran = adata.X / adata.obs["size_factors"].values[:, None]
scran_logged = np.log1p(scran)
adata.layers["scran_normalization"] = csr_matrix(scran_logged)

### Feature selection

In [48]:
%%R
# R packages attached for the deviance-based feature selection below.
library(scry)
library(SingleCellExperiment)

In [49]:
# genes x cells in COO layout, i.e. explicit (row, column, value) triplets
X_sparse = adata.X.T.tocoo()

# Load R's Matrix package, which provides sparseMatrix() used below.
Matrix = rpackages.importr("Matrix")

# Cell and gene annotations become the colData / rowData of the R object.
with localconverter(ro.default_converter + pandas2ri.converter + numpy2ri.converter):
    ro.globalenv["obs"] = adata.obs
    ro.globalenv["var"] = adata.var

i, j = X_sparse.row, X_sparse.col
x = X_sparse.data

ro.globalenv["i"] = ro.IntVector((i + 1).tolist())  # R is 1-indexed
ro.globalenv["j"] = ro.IntVector((j + 1).tolist())
ro.globalenv["x"] = ro.FloatVector(x.tolist())

# Assemble the sparse matrix `X` on the R side with the same dimensions.
r("X <- sparseMatrix(i = i, j = j, x = x, dims = c({}, {}))".format(*X_sparse.shape))

<rpy2.robjects.methods.RS4 object at 0x1870c7750> [25]
R classes: ('dgCMatrix',)

In [50]:
%%R
# Bundle the matrix with its cell (obs) and gene (var) annotations.
sce <- SingleCellExperiment(
  assays = list(X = X),
  colData = obs,
  rowData = var
)

# Compute a per-gene deviance on assay "X"; the next Python cell reads it
# from rowData(sce)$binomial_deviance.
sce <- devianceFeatureSelection(sce, assay = "X")

In [53]:
# Pull the per-gene deviance computed in R back into Python.
with localconverter(default_converter + pandas2ri.converter + numpy2ri.converter):
    binomial_deviance = ro.r("rowData(sce)$binomial_deviance")

In [54]:
# argsort is ascending, so the last 4000 positions are the genes with the
# largest deviance.
idx = binomial_deviance.argsort()[-4000:]
# Boolean mask over all genes, True for the selected ones.
mask = np.zeros(adata.var_names.shape, dtype=bool)
mask[idx] = True

adata.var["highly_deviant"] = mask
adata.var["binomial_deviance"] = binomial_deviance

In [55]:
# Additionally annotate highly variable genes, computed on the
# scran-normalized layer.
sc.pp.highly_variable_genes(adata, layer="scran_normalization")

## Saving it to Lamindb

In [56]:
# Write the processed object to a file in the current working directory.
adata.write("s4d8_subset.h5ad")

In [57]:
# Register the written file in lamindb under the given key and upload it.
af = ln.Artifact(
    "s4d8_subset.h5ad",
    key="cellular_structure/s4d8_subset.h5ad",
    description="s4d8 subset of bone marrow mononuclear cells (BMMCs) after quality control and normalization with SoupX and scran",
).save()
af

[92m→[0m creating new artifact version for key 'cellular_structure/s4d8_subset.h5ad' in storage 's3://lamin-eu-central-1/VPwcjx3CDAa2'
... uploading s4d8_subset.h5ad: 100.0%
[94m•[0m replacing the existing cache path /Users/seohyon/Library/Caches/lamindb/lamin-eu-central-1/VPwcjx3CDAa2/cellular_structure/s4d8_subset.h5ad


Artifact(uid='NHvJu326LXI8mgTx0001', version_tag=None, is_latest=True, key='cellular_structure/s4d8_subset.h5ad', description='s4d8 subset of bone marrow mononuclear cells (BMMCs) after quality control and normalization with SoupX and scran', suffix='.h5ad', kind=None, otype='AnnData', size=363333946, hash='B9RQHbqQYpgpMa0BFiW3WV', n_files=None, n_observations=None, branch_id=1, space_id=1, storage_id=1, run_id=38, schema_id=None, created_by_id=5, created_at=2026-02-16 14:53:36 UTC, is_locked=False)