# Generate anndata for the section cellular structure

Fetching the NeurIPS human bone marrow dataset and then preprocessing it following the steps in the section preprocessing and visualization.
(Subsetting the dataset for `site4-donor8`.)

## Packages

In [39]:
import numpy as np
import scanpy as sc
from rpy2.robjects import numpy2ri
from rpy2.robjects.conversion import localconverter
from scipy.stats import median_abs_deviation

## Downloading and Subsetting the dataset

In [149]:
# adata = sc.read(
#     filename="human_bmmcs.h5ad",
#     backup_url="https://figshare.com/ndownloader/files/51994907",
# )
# adata

# Somehow scanpy  can't fetch the file from the URL, so I downloaded it manually and read it from the local path.

adata = sc.read(filename="/Users/seohyon/Downloads/human_bmmcs.h5ad")
adata

AnnData object with n_obs × n_vars = 69249 × 13431
    obs: 'pct_counts_mt', 'n_counts', 'n_genes', 'size_factors', 'phase', 'cell_type', 'batch', 'pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality', 'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType', 'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker', '_scvi_batch', '_scvi_labels'
    var: 'feature_types', 'gene_id'
    uns: 'DonorNumber_colors', 'Samplename_colors', 'Site_colors', '_scvi_manager_uuid', '_scvi_uuid', 'batch_colors', 'cell_type_colors', 'dataset_id', 'genome', 'neighbors', 'organism', 'phase_colors', 'umap'
    obsm: 'X_pca', 'X_scANVI', 'X_umap', '_scvi_extra_categorical_covs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [150]:
# subset with site 4 donor 8
adata_s4d8 = adata[adata.obs_names.str.contains("s4d8")].copy()
adata_raw = adata_s4d8.copy()

In [151]:
# filtering out zero count genes in the subset,
# since when subsetting anndata the cells are subsetted but not the genes
adata_s4d8 = adata_s4d8[:, adata_s4d8.X.sum(axis=0).A1 > 0].copy()

In [152]:
# to make it easier to work with
adata = adata_s4d8.copy()
adata

AnnData object with n_obs × n_vars = 9876 × 13396
    obs: 'pct_counts_mt', 'n_counts', 'n_genes', 'size_factors', 'phase', 'cell_type', 'batch', 'pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality', 'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType', 'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker', '_scvi_batch', '_scvi_labels'
    var: 'feature_types', 'gene_id'
    uns: 'DonorNumber_colors', 'Samplename_colors', 'Site_colors', '_scvi_manager_uuid', '_scvi_uuid', 'batch_colors', 'cell_type_colors', 'dataset_id', 'genome', 'neighbors', 'organism', 'phase_colors', 'umap'
    obsm: 'X_pca', 'X_scANVI', 'X_umap', '_scvi_extra_categorical_covs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

## Preprocessing

### Quality Control

In [153]:
# mitochondrial genes
adata.var["mt"] = adata.var_names.str.startswith("MT-")
# ribosomal genes
adata.var["ribo"] = adata.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes.
adata.var["hb"] = adata.var_names.str.contains("^HB[^(P)]")

In [154]:
sc.pp.calculate_qc_metrics(
    adata, qc_vars=["mt", "ribo", "hb"], inplace=True, percent_top=[20], log1p=True
)
adata

AnnData object with n_obs × n_vars = 9876 × 13396
    obs: 'pct_counts_mt', 'n_counts', 'n_genes', 'size_factors', 'phase', 'cell_type', 'batch', 'pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality', 'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType', 'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker', '_scvi_batch', '_scvi_labels', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb'
    var: 'feature_types', 'gene_id', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'DonorNumber_colors', 'Samplename_colors', 'Site_colors', '_scvi_manager_uuid', '_scvi_uuid', 'batch_colors', 'cell_type_colors', 'dataset_id', 'genom

In [155]:
def is_outlier(adata, metric: str, nmads: int):
    M = adata.obs[metric]
    outlier = (M < np.median(M) - nmads * median_abs_deviation(M)) | (
        np.median(M) + nmads * median_abs_deviation(M) < M
    )
    return outlier

In [156]:
adata.obs["outlier"] = (
    is_outlier(adata, "log1p_total_counts", 5)
    | is_outlier(adata, "log1p_n_genes_by_counts", 5)
    | is_outlier(adata, "pct_counts_in_top_20_genes", 5)
)
adata.obs.outlier.value_counts()

outlier
False    9654
True      222
Name: count, dtype: int64

In [157]:
adata.obs["mt_outlier"] = is_outlier(adata, "pct_counts_mt", 3) | (
    adata.obs["pct_counts_mt"] > 8
)
adata.obs.mt_outlier.value_counts()

mt_outlier
False    9075
True      801
Name: count, dtype: int64

In [158]:
print(f"Total number of cells: {adata.n_obs}")
adata = adata[(~adata.obs.outlier) & (~adata.obs.mt_outlier)].copy()

print(f"Number of cells after filtering of low quality cells: {adata.n_obs}")

Total number of cells: 9876
Number of cells after filtering of low quality cells: 8874


Correction of ambient RNA using SoupX

In [159]:
import logging

import rpy2.rinterface_lib.callbacks as rcb
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri

rcb.logger.setLevel(logging.ERROR)


%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [160]:
%%R
library(SoupX)

In [161]:
adata_pp = adata.copy()
sc.pp.normalize_total(adata_pp, target_sum=1e4)
sc.pp.log1p(adata_pp)

In [162]:
sc.pp.pca(adata_pp)
sc.pp.neighbors(adata_pp)
sc.tl.leiden(
    adata_pp, key_added="soupx_groups", flavor="igraph", n_iterations=2, directed=False
)

# Preprocess variables for SoupX
adata.obs["soupx_groups"] = adata_pp.obs["soupx_groups"]

In [163]:
del adata_pp

In [164]:
cells = adata.obs_names
genes = adata.var_names
data = adata.X.T

In [165]:
# synchronising the genes.. for soupx
common_genes = adata.var_names
adata_raw = adata_raw[:, common_genes].copy()

In [166]:
adata_raw.var_names_make_unique()

genes_raw = adata_raw.var_names
cells_raw = adata_raw.obs_names

data_tod = adata_raw.X.T

In [167]:
del adata_raw

In [168]:
data_csc = data.tocsc()
data_tod_csc = data_tod.tocsc()

# Extract sparse components and cast to correct types
x = data_csc.data.astype(np.float64)
i = data_csc.indices.astype(np.int32)
p = data_csc.indptr.astype(np.int32)
dims = np.array(data_csc.shape, dtype=np.int32)

x_tod = data_tod_csc.data.astype(np.float64)
i_tod = data_tod_csc.indices.astype(np.int32)
p_tod = data_tod_csc.indptr.astype(np.int32)
dims_tod = np.array(data_tod_csc.shape, dtype=np.int32)

with localconverter(ro.default_converter + pandas2ri.converter + numpy2ri.converter):
    ro.globalenv["x"] = x
    ro.globalenv["i"] = i
    ro.globalenv["p"] = p
    ro.globalenv["dims"] = dims

    ro.globalenv["x_tod"] = x_tod
    ro.globalenv["i_tod"] = i_tod
    ro.globalenv["p_tod"] = p_tod
    ro.globalenv["dims_tod"] = dims_tod

    ro.globalenv["genes"] = np.array(genes)
    ro.globalenv["genes_raw"] = np.array(genes_raw)
    ro.globalenv["cells"] = np.array(cells)
    ro.globalenv["cells_raw"] = np.array(cells_raw)
    ro.globalenv["soupx_groups"] = adata.obs["soupx_groups"].to_numpy()

In [None]:
%%R -o out 

library(Matrix)

# Manually coerce types to avoid "array" class errors
x <- as.numeric(x)
i <- as.integer(i)
p <- as.integer(p)
dims <- as.integer(dims)

x_tod <- as.numeric(x_tod)
i_tod <- as.integer(i_tod)
p_tod <- as.integer(p_tod)
dims_tod <- as.integer(dims_tod)

# Reconstruct sparse matrices
data <- new("dgCMatrix",
            Dim = dims,
            x = x,
            i = i,
            p = p)

data_tod <- new("dgCMatrix",
                Dim = dims_tod,
                x = x_tod,
                i = i_tod,
                p = p_tod)

# Assign row and column names
rownames(data) <- genes
colnames(data) <- cells
rownames(data_tod) <- genes_raw
colnames(data_tod) <- cells_raw

# SoupX pipeline
# sc = SoupChannel(data_tod, data, calcSoupProfile = TRUE)
# sc = setClusters(sc, soupx_groups)
# sc = autoEstCont(sc, doPlot = FALSE)
# out = adjustCounts(sc, roundToInt = TRUE)

#### OR

# SoupX - don't find empty droplets?
# sc = SoupChannel(data_tod, data, calcSoupProfile = FALSE)
# sc = setClusters(sc, as.character(soupx_groups))

# # 3. Use the clusters to "estimate" what the ambient RNA is
# sc = estimateSoup(sc)

# # 4. Final safety wipe of NaNs
# sc$soupProfile$est[is.na(sc$soupProfile$est)] = 0

# # 5. Run the rest
# sc = autoEstCont(sc, tfidfMin = 0.3, soupQuantile = 0.5, forceAccept = TRUE, doPlot = FALSE)
# out = adjustCounts(sc, roundToInt = TRUE)


Error in autoEstCont(sc, tfidfMin = 0.3, soupQuantile = 0.5, forceAccept = TRUE,  : 
  No plausible marker genes found.  Is the channel low complexity (see help)?  If not, reduce tfidfMin or soupQuantile


RInterpreterError: Failed to parse and evaluate line '\nlibrary(Matrix)\n\n# Manually coerce types to avoid "array" class errors\nx <- as.numeric(x)\ni <- as.integer(i)\np <- as.integer(p)\ndims <- as.integer(dims)\n\nx_tod <- as.numeric(x_tod)\ni_tod <- as.integer(i_tod)\np_tod <- as.integer(p_tod)\ndims_tod <- as.integer(dims_tod)\n\n# Reconstruct sparse matrices\ndata <- new("dgCMatrix",\n            Dim = dims,\n            x = x,\n            i = i,\n            p = p)\n\ndata_tod <- new("dgCMatrix",\n                Dim = dims_tod,\n                x = x_tod,\n                i = i_tod,\n                p = p_tod)\n\n# Assign row and column names\nrownames(data) <- genes\ncolnames(data) <- cells\nrownames(data_tod) <- genes_raw\ncolnames(data_tod) <- cells_raw\n\n# SoupX pipeline\n# sc = SoupChannel(data_tod, data, calcSoupProfile = TRUE)\n# sc = setClusters(sc, soupx_groups)\n# sc = autoEstCont(sc, doPlot = FALSE)\n# out = adjustCounts(sc, roundToInt = TRUE)\n\n#### OR\n\n# SoupX - don\'t find empty droplets?\nsc = SoupChannel(data_tod, data, calcSoupProfile = FALSE)\nsc = setClusters(sc, as.character(soupx_groups))\n\n# 3. Use the clusters to "estimate" what the ambient RNA is\nsc = estimateSoup(sc)\n\n# 4. Final safety wipe of NaNs\nsc$soupProfile$est[is.na(sc$soupProfile$est)] = 0\n\n# 5. Run the rest\nsc = autoEstCont(sc, tfidfMin = 0.3, soupQuantile = 0.5, forceAccept = TRUE, doPlot = FALSE)\nout = adjustCounts(sc, roundToInt = TRUE)\n'.
R error message: 'Error in autoEstCont(sc, tfidfMin = 0.3, soupQuantile = 0.5, forceAccept = TRUE,  : \n  No plausible marker genes found.  Is the channel low complexity (see help)?  If not, reduce tfidfMin or soupQuantile'