In [1]:
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import anndata2ri
import scipy
import scipy.io
from rpy2.robjects import r

# Turn on anndata2ri's automatic conversion so AnnData objects handed to R
# cells (via `%%R -i`) arrive there as SingleCellExperiment objects.
anndata2ri.activate()
# Print the Python package versions used for this run (provenance).
sc.logging.print_versions()

scanpy==1.4.6 anndata==0.7.5 umap==0.5.1 numpy==1.20.1 scipy==1.6.0 pandas==1.2.3 scikit-learn==0.24.1 statsmodels==0.12.2


In [2]:
# Load the rpy2 IPython extension; it provides the `%%R` cell magic used below.
%load_ext rpy2.ipython

In [3]:
%%R
# Attach the R packages needed below (same order as before) while hiding
# their start-up banners.
suppressPackageStartupMessages(
    invisible(lapply(
        c("SingleCellExperiment", "Seurat", "SeuratDisk"),
        library,
        character.only = TRUE
    ))
)

# Expressions

## Download the data

In [4]:
#!wget "https://hmgubox2.helmholtz-muenchen.de/index.php/s/r2W5dMJdq6mFMZY/download?path=%2Fhao-2020&files=expressions.h5ad" -O hao-expressions.h5ad

In [5]:
# Load the RNA modality. Its 'count' layer is what gets passed to Seurat as the
# count matrix further down.
adata = sc.read('hao-expressions.h5ad')
adata

AnnData object with n_obs × n_vars = 161764 × 4000
    obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'cell_type'
    var: 'features', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'celltype.l1_colors', 'celltype.l2_colors', 'celltype.l3_colors', 'neighbors'
    obsm: 'X_apca', 'X_aumap', 'X_pca', 'X_spca', 'X_umap', 'X_wnn.umap'
    varm: 'PCs', 'SPCA'
    layers: 'count'
    obsp: 'distances'

In [6]:
# Slim AnnData for the hand-off to R: only the 'count' layer as X, the cell and
# feature names, and the cell-type annotation.
adata_ = ad.AnnData(
    X=adata.layers['count'],
    obs=adata.obs[['cell_type']].copy(),
    var=pd.DataFrame(index=adata.var_names),
)

In [7]:
%%R -i adata_
# `-i adata_` pushes the Python object into the R session; printing it confirms
# that it arrived as a SingleCellExperiment with the expected dimensions.
adata_

class: SingleCellExperiment 
dim: 4000 161764 
metadata(0):
assays(1): X
rownames(4000): HES4 ISG15 ... VWA2 LINC01228
rowData names(0):
colnames(161764): L1_AAACCCAAGAAACTCA L1_AAACCCAAGACATACA ...
  E2L8_TTTGTTGGTTGGCTAT E2L8_TTTGTTGTCTCATGGA
colData names(1): cell_type
reducedDimNames(0):
altExpNames(0):


In [8]:
%%R
rna = as.Seurat(adata_, counts='X', data=NULL)
rna

An object of class Seurat 
4000 features across 161764 samples within 1 assay 
Active assay: RNA (4000 features, 0 variable features)


# Protein

## Download the data

In [9]:
x=3
!wget "https://hmgubox2.helmholtz-muenchen.de/index.php/s/r2W5dMJdq6mFMZY/download?path=%2Fhao-2020&files=protein.h5ad" -O hao-proteins.h5ad

--2021-03-04 22:11:46--  https://hmgubox2.helmholtz-muenchen.de/index.php/s/r2W5dMJdq6mFMZY/download?path=%2Fhao-2020&files=protein.h5ad
Resolving hmgubox2.helmholtz-muenchen.de (hmgubox2.helmholtz-muenchen.de)... 146.107.5.51
Connecting to hmgubox2.helmholtz-muenchen.de (hmgubox2.helmholtz-muenchen.de)|146.107.5.51|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 904554908 (863M) [application/octet-stream]
Saving to: 'hao-proteins.h5ad'


2021-03-04 22:11:54 (111 MB/s) - 'hao-proteins.h5ad' saved [904554908/904554908]



In [10]:
# Load the protein (ADT) modality.
# NOTE: this rebinds `adata`, replacing the RNA object loaded earlier; every
# later use of `adata` refers to the protein data.
adata = sc.read('hao-proteins.h5ad')

In [11]:
# Slim AnnData for the hand-off to R, built from the protein object now bound
# to `adata`: the 'count' layer as X, cell and feature names, and cell types.
adata_ = ad.AnnData(
    X=adata.layers['count'],
    obs=adata.obs[['cell_type']].copy(),
    var=pd.DataFrame(index=adata.var_names),
)

In [12]:
%%R -i adata_
# Push the protein counts into R (overwriting the R-side `adata_` from the RNA
# step) and print the resulting SingleCellExperiment as a sanity check.
adata_

class: SingleCellExperiment 
dim: 224 161764 
metadata(0):
assays(1): X
rownames(224): CD80 CD86 ... CLEC2 CD26-2
rowData names(0):
colnames(161764): L1_AAACCCAAGAAACTCA L1_AAACCCAAGACATACA ...
  E2L8_TTTGTTGGTTGGCTAT E2L8_TTTGTTGTCTCATGGA
colData names(1): cell_type
reducedDimNames(0):
altExpNames(0):


In [13]:
%%R
cite = as.Seurat(adata_, counts='X', data=NULL)
cite

An object of class Seurat 
224 features across 161764 samples within 1 assay 
Active assay: RNA (224 features, 0 variable features)


# Run Seurat v4 WNN
This section follows the Seurat v4 weighted nearest neighbor (WNN) analysis vignette: https://satijalab.org/seurat/v4.0/weighted_nearest_neighbor_analysis.html

In [14]:
%%R
bm <- rna
bm[["ADT"]] <- CreateAssayObject(counts = cite@assays$RNA@counts)

rm(rna)
rm(cite)

print('Preprocessing RNA...')
DefaultAssay(bm) <- "RNA"
bm <- NormalizeData(bm, verbose=FALSE)
VariableFeatures(bm) <- rownames(bm[["RNA"]])
bm <- ScaleData(bm, verbose=FALSE)

bm <- RunPCA(bm, verbose=FALSE)

print('Preprocessing ADT...')
DefaultAssay(bm) <- "ADT"
VariableFeatures(bm) <- rownames(bm[["ADT"]])
bm <- NormalizeData(bm, normalization.method = 'CLR', margin = 2, verbose=FALSE)
bm <- ScaleData(bm, verbose=FALSE)
bm <- RunPCA(bm, reduction.name = "apca", verbose=FALSE)

bm <- FindMultiModalNeighbors(
       bm, reduction.list = list("pca", "apca"), 
       dims.list = list(1:30, 1:18), modality.weight.name = "RNA.weight"
   )

bm <- RunSPCA(bm, assay = 'RNA', graph = 'wsnn', npcs = 20)

SaveH5Seurat(bm, 'hao-seurat.h5seurat', overwrite = TRUE)

[1] "Preprocessing RNA..."
[1] "Preprocessing ADT..."


R[write to console]: Calculating cell-specific modality weights

R[write to console]: Finding 20 nearest neighbors for each modality.



  |                                                  | 0 % ~calculating   |+++++++++++++++++++++++++                         | 50% ~01m 44s       |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=03m 25s


R[write to console]: Calculating kernel bandwidths



  |                                                  | 0 % ~calculating   |+++++++++++++++++++++++++                         | 50% ~23s           |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=39s  


R[write to console]: Finding multimodal neighbors



  |                                                  | 0 % ~calculating   |+++++++++++++++++++++++++                         | 50% ~05m 04s       |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=10m 24s
  |                                                  | 0 % ~calculating   |+++++++++++++++++++++++++                         | 50% ~50s           |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=01m 17s


R[write to console]: Constructing multimodal KNN graph

R[write to console]: Constructing multimodal SNN graph

R[write to console]: Computing sPCA transformation

R[write to console]: Creating h5Seurat file for version 3.1.5.9900

R[write to console]: Adding counts for RNA

R[write to console]: Adding data for RNA

R[write to console]: Adding scale.data for RNA

R[write to console]: Adding variable features for RNA

R[write to console]: No feature-level metadata found for RNA

R[write to console]: Adding counts for ADT

R[write to console]: Adding data for ADT

R[write to console]: Adding scale.data for ADT

R[write to console]: Adding variable features for ADT

R[write to console]: No feature-level metadata found for ADT

R[write to console]: Adding cell embeddings for pca

R[write to console]: Adding loadings for pca

R[write to console]: No projected loadings for pca

R[write to console]: Adding standard deviations for pca

R[write to console]: No JackStraw data for pca

R[write to

In [15]:
%%R
Convert("hao-seurat.h5seurat", assay="RNA", "hao-seurat.h5ad", overwrite=TRUE)

R[write to console]: Validating h5Seurat file

R[write to console]: Adding scale.data from RNA as X

R[write to console]: Adding data from RNA as raw

R[write to console]: Transfering meta.data to obs

R[write to console]: Adding dimensional reduction information for pca

R[write to console]: Adding feature loadings for pca

R[write to console]: Adding dimensional reduction information for spca

R[write to console]: Adding feature loadings for spca

R[write to console]: Adding wsnn as neighbors



In [16]:
# Read the converted Seurat result back into Python and display its contents.
integrated = sc.read('hao-seurat.h5ad')
integrated

AnnData object with n_obs × n_vars = 161764 × 4000
    obs: 'cell_type', 'nCount_ADT', 'nFeature_ADT', 'RNA.weight', 'ADT.weight'
    var: 'features'
    uns: 'neighbors'
    obsm: 'X_pca', 'X_spca'
    varm: 'PCs', 'SPCA'
    obsp: 'distances'

In [17]:
# Package the integration result as a small AnnData: X is the sPCA embedding
# and obs carries the per-cell metadata (cell type, modality weights, ...).
latent = ad.AnnData(integrated.obsm['X_spca'])
latent.obs = integrated.obs
# SeuratDisk exported the "wsnn" graph as the neighbors graph, which anndata
# exposes as .obsp['distances'] (see the printed `integrated` object). Read it
# from .obsp directly: the old .uns['neighbors']['distances'] access path is a
# deprecated compatibility alias.
latent.obsp['connectivities'] = integrated.obsp['distances']

In [18]:
# Persist the latent representation. Create the target directory first so the
# write cannot fail on a missing folder after the long-running Seurat step.
latent_path = '../data/integrated/seurat/hao-seurat.h5ad'
os.makedirs(os.path.dirname(latent_path), exist_ok=True)
latent.write(latent_path)

In [19]:
!rm 'hao-expressions.h5ad'
!rm 'hao-proteins.h5ad'
!rm 'hao-seurat.h5ad'
!rm 'hao-seurat.h5seurat'

In [20]:
%%R
# Record the R version and attached package versions used for this run.
sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /mnt/home/icb/anastasia.litinetskaya/miniconda3/envs/signac/lib/libopenblasp-r0.3.10.so

locale:
 [1] LC_CTYPE=C                 LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
 [1] parallel  stats4    tools     stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] Matrix_1.3-2                SeuratDisk_0.0.0.9018      
 [3] SeuratObject_4.0.0          Seurat_4.0.0               
 [5] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
 [7] Biobase_2.50.0              GenomicRanges_1.42.0     

# Visualize results

In [None]:
# Number of positions at which the cell names of `latent` and `adata` differ;
# 0 means both objects list the same cells in the same order.
(latent.obs_names != adata.obs_names).sum()

In [None]:
# Take the cell-type labels from `adata` (at this point the protein AnnData);
# the assignment is aligned on the cell names.
latent.obs['cell_type'] = adata.obs['cell_type']

In [None]:
# Build a kNN graph on the sPCA embedding held in latent.X (scanpy defaults),
# then compute a UMAP from it for visualisation.
# NOTE(review): this presumably replaces the Seurat WNN graph stored earlier in
# latent.obsp['connectivities'] (the file on disk was written before this) —
# confirm that visualising the sPCA-based graph is intended.
sc.pp.neighbors(latent)
sc.tl.umap(latent)

In [None]:
# Show the UMAP of the integrated latent space, coloured by cell type.
sc.pl.umap(latent, color='cell_type', frameon=False)