In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import anndata2ri
import scipy
import scipy.io
from rpy2.robjects import r

anndata2ri.activate()

In [2]:
%load_ext rpy2.ipython

In [3]:
%%R
suppressPackageStartupMessages({
    library(SingleCellExperiment)
    library(Seurat)
    library(SeuratDisk)
})

# Load data

In [4]:
adata = sc.read('../data/10x-cite/10x-cite.h5ad')
adata

AnnData object with n_obs × n_vars = 10849 × 15792
    obs: 'n_genes', 'percent_mito', 'n_counts', 'batch', 'labels'
    var: 'encode', 'n_cells-0', 'highly_variable-0', 'hvg_encode-0', 'n_cells-1', 'highly_variable-1', 'hvg_encode-1'
    obsm: 'protein_expression'

In [5]:
# Seurat categories have to be strings
adata.obs.batch = adata.obs.batch.astype('category')
adata.obs.batch = adata.obs.batch.cat.rename_categories(["0", "1"])

In [6]:
query = adata[adata.obs.batch == '1']
query

View of AnnData object with n_obs × n_vars = 3994 × 15792
    obs: 'n_genes', 'percent_mito', 'n_counts', 'batch', 'labels'
    var: 'encode', 'n_cells-0', 'highly_variable-0', 'hvg_encode-0', 'n_cells-1', 'highly_variable-1', 'hvg_encode-1'
    obsm: 'protein_expression'

In [7]:
adata = adata[adata.obs.batch == '0']
adata

View of AnnData object with n_obs × n_vars = 6855 × 15792
    obs: 'n_genes', 'percent_mito', 'n_counts', 'batch', 'labels'
    var: 'encode', 'n_cells-0', 'highly_variable-0', 'hvg_encode-0', 'n_cells-1', 'highly_variable-1', 'hvg_encode-1'
    obsm: 'protein_expression'

In [8]:
adata_ = ad.AnnData(adata.X)
adata_.obs_names = adata.obs_names
adata_.var_names = adata.var_names
adata_.obs['cell_type'] = adata.obs['labels']

In [9]:
%%R -i adata_
rna = as.Seurat(adata_, counts='X', data=NULL)
rna

An object of class Seurat 
15792 features across 6855 samples within 1 assay 
Active assay: RNA (15792 features, 0 variable features)


In [10]:
adata_ = ad.AnnData(adata.obsm['protein_expression'])
adata_.obs['cell_type'] = adata.obs['labels']

In [11]:
%%R -i adata_
cite = as.Seurat(adata_, counts='X', data=NULL)
cite

R[write to console]:  Feature names cannot have underscores ('_'), replacing with dashes ('-')



An object of class Seurat 
14 features across 6855 samples within 1 assay 
Active assay: RNA (14 features, 0 variable features)


In [12]:
%%R
bm <- rna
bm[["ADT"]] <- CreateAssayObject(counts = cite@assays$RNA@counts)

rm(rna)
rm(cite)

print('Preprocessing RNA...')
DefaultAssay(bm) <- "RNA"
bm <- SCTransform(bm, verbose = FALSE)
VariableFeatures(bm) <- rownames(bm[["RNA"]])
bm <- ScaleData(bm, verbose=FALSE)
bm <- RunPCA(bm, verbose=FALSE)

print('Preprocessing ADT...')
DefaultAssay(bm) <- "ADT"
VariableFeatures(bm) <- rownames(bm[["ADT"]])
bm <- NormalizeData(bm, normalization.method = 'CLR', margin = 2, verbose=FALSE)
bm <- ScaleData(bm, verbose=FALSE)
bm <- RunPCA(bm, reduction.name = "apca", verbose=FALSE)

[1] "Preprocessing RNA..."


R[write to console]:  Not all features provided are in this Assay object, removing the following feature(s): AL034417.3, FHAD1, AL121992.2, SLC30A2, AL512408.1, SFN, CFAP57, AL139220.2, RAB3B, TMEM56, AL603832.1, AL390066.1, SPAG17, PDZK1, NUDT4B, AL450992.1, AL139011.2, AL356441.1, GPR52, AL596220.1, C4BPB, TGFB2, CAPN8, MIR3681HG, TCF7L1, NT5DC4, CKAP2L, LINC01412, SATB2, AC007879.3, MAP2, ABCB6, CHL1, ARPC4-TTLL3, AC011816.2, AC104184.1, AC134772.1, FAM3D, C3ORF67, SCAANT1, ZBTB20-AS4, SLC12A8, LINC02029, AC078802.1, AC008040.1, AC131235.3, AC092546.1, AC114811.2, AC097460.1, AC109347.2, GYPE, WWC2-AS2, AC084871.2, CCDC110, AC092354.1, MAST4-AS1, AC010273.1, ADGRV1, AC008522.1, GPRIN1, AL138831.2, HIST1H2BI, AL358933.1, TRIM31, KCNK5, LRFN2, GUCA1B, AL136304.1, RSPH9, AL133406.3, ENPP3, TAB2-AS1, AL078581.1, ULBP1, SOD2-OT1, AC004948.1, RAPGEF5, ADCY1, CLDN12, AC002456.1, AC006329.1, AC087071.1, AC008264.2, TMEM139, GHET1, AC093458.1, INE2, FAM156A, TRO, OTUD6A, CENPI, ALG13-AS1, KL

[1] "Preprocessing ADT..."


R[write to console]: 
 
R[write to console]:  You're computing too large a percentage of total singular values, use a standard svd instead.



In [13]:
%%R
bm <- FindMultiModalNeighbors(
       bm, reduction.list = list("pca", "apca"), 
       dims.list = list(1:30, 1:13), modality.weight.name = "RNA.weight"
   )

R[write to console]: Calculating cell-specific modality weights

R[write to console]: Finding 20 nearest neighbors for each modality.



  |                                                  | 0 % ~calculating   |+++++++++++++++++++++++++                         | 50% ~04s           |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=07s  


R[write to console]: Calculating kernel bandwidths



  |                                                  | 0 % ~calculating   |+++++++++++++++++++++++++                         | 50% ~00s           |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=01s  


R[write to console]: Finding multimodal neighbors



  |                                                  | 0 % ~calculating   |+++++++++++++++++++++++++                         | 50% ~11s           |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=23s  
  |                                                  | 0 % ~calculating   |+++++++++++++++++++++++++                         | 50% ~01s           |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=02s  


R[write to console]: Constructing multimodal KNN graph

R[write to console]: Constructing multimodal SNN graph



In [14]:
adata_ = ad.AnnData(query.X)
adata_.obs_names = query.obs_names
adata_.var_names = query.var_names
adata_.obs['cell_type'] = query.obs['labels']

In [15]:
%%R -i adata_
query = as.Seurat(adata_, counts='X', data=NULL)
query

An object of class Seurat 
15792 features across 3994 samples within 1 assay 
Active assay: RNA (15792 features, 0 variable features)


In [16]:
%%R
query <- SCTransform(query, verbose = FALSE)

In [17]:
%%R
DefaultAssay(bm) <- "RNA"
VariableFeatures(bm) <- rownames(bm[["RNA"]])
bm

An object of class Seurat 
31380 features across 6855 samples within 3 assays 
Active assay: RNA (15792 features, 15792 variable features)
 2 other assays present: ADT, SCT
 2 dimensional reductions calculated: pca, apca


In [18]:
%%R
bm <- RunSPCA(bm, assay = 'SCT', graph = 'wsnn')

R[write to console]: Computing sPCA transformation



In [19]:
%%R
bm

An object of class Seurat 
31380 features across 6855 samples within 3 assays 
Active assay: RNA (15792 features, 15792 variable features)
 2 other assays present: ADT, SCT
 3 dimensional reductions calculated: pca, apca, spca


In [20]:
%%R
DefaultAssay(bm) <- "RNA"
bm[['SCT']]

SCTAssay data with 15574 features for 6855 cells, and 1 SCTModel(s) 
Top 10 variable features:
 AL627309.1, AL669831.5, LINC00115, FAM41C, NOC2L, KLHL17, PLEKHN1,
AL645608.8, HES4, ISG15 


In [21]:
%%R
anchors <- FindTransferAnchors(
  reference = bm,
  reference.assay = 'SCT',
  query = query,
  normalization.method = "SCT",
  reference.reduction = "spca"
)

R[write to console]: Normalizing query using reference SCT model

R[write to console]: Projecting cell embeddings

R[write to console]: Finding neighborhoods

R[write to console]: Finding anchors

R[write to console]: 	Found 2203 anchors



In [22]:
%%R
query <- TransferData(
  anchorset = anchors, 
  reference = bm,
  query = query,
  refdata = list(
    predicted_ADT = "ADT")
)

R[write to console]: Finding integration vectors

R[write to console]: Finding integration vector weights

R[write to console]: 0%   10   20   30   40   50   60   70   80   90   100%

R[write to console]: [----|----|----|----|----|----|----|----|----|----|

R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[write to console]: *
R[writ

In [23]:
%%R
DefaultAssay(query) <- "predicted_ADT"
query

An object of class Seurat 
31329 features across 3994 samples within 3 assays 
Active assay: predicted_ADT (14 features, 0 variable features)
 2 other assays present: RNA, SCT


In [24]:
%%R -o query_sc
query_sc <- as.SingleCellExperiment(query)
query_sc <- as(query_sc, 'SingleCellExperiment')

In [25]:
query_sc.obsm['predicted_protein_expression'] = query_sc.X.A

In [26]:
from scipy.stats import pearsonr

protein_corrs = []
for i, protein in enumerate(query_sc.var_names):
    protein = protein[:-10]
    protein_corrs.append(protein + ': Corr=' + str(np.round(pearsonr(np.log1p(query_sc.obsm['predicted_protein_expression'][:, i]), np.log1p(query.obsm['protein_expression'].iloc[:, i].values))[0], 3)))
    
protein_corrs

['CD3: Corr=0.937',
 'CD4: Corr=0.9',
 'CD8a: Corr=0.737',
 'CD14: Corr=0.956',
 'CD15: Corr=0.216',
 'CD16: Corr=0.813',
 'CD56: Corr=0.766',
 'CD19: Corr=0.888',
 'CD25: Corr=0.555',
 'CD45RA: Corr=0.836',
 'CD45RO: Corr=0.792',
 'PD-1: Corr=0.455',
 'TIGIT: Corr=0.571',
 'CD127: Corr=0.849']