# FCA analysis - ambient RNA decontamination with decontX (input: doublet-scored data)

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import os
import sys
import warnings
import anndata
import anndata
import rpy2

%load_ext rpy2.ipython
%matplotlib inline


# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings from the
# analysis libraries — consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')


def MovePlots(plotpattern, subplotdir):
    """Move saved figures whose names contain ``plotpattern`` into a subfolder.

    Every entry of ``sc.settings.figdir`` matching the glob ``*<plotpattern>*``
    is moved into ``<figdir>/<subplotdir>``, which is created if missing.
    Existing files with the same name in the destination are overwritten.

    The work is done with ``os``/``glob``/``shutil`` instead of shell commands,
    so names containing spaces or shell metacharacters are handled literally.

    Parameters
    ----------
    plotpattern : str
        Substring (glob wildcards allowed) that selects the figures to move.
    subplotdir : str
        Destination folder, relative to ``sc.settings.figdir``.
    """
    import glob
    import shutil

    figdir = str(sc.settings.figdir)
    destination = os.path.join(figdir, subplotdir)
    os.makedirs(destination, exist_ok=True)

    destination_abs = os.path.abspath(destination)
    # Escape the directory part so only `plotpattern` is treated as a glob.
    selector = os.path.join(glob.escape(figdir), '*' + plotpattern + '*')
    for source in glob.glob(selector):
        source_abs = os.path.abspath(source)
        # Never try to move the destination folder (or one of its parents)
        # into itself when its own name happens to match the pattern.
        if destination_abs == source_abs or destination_abs.startswith(source_abs + os.sep):
            continue
        shutil.move(source, os.path.join(destination, os.path.basename(source)))


# Scanpy session configuration: logging level, figure output folder, figure size.
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Folder where scanpy saves figures; MovePlots() sorts files inside this folder.
sc.settings.figdir = './figures-sn/preprocessing/'
# Print the versions of the loaded packages for provenance.
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

# Last expression of the cell: displays the path of the running Python interpreter.
sys.executable

-----
anndata     0.7.5
scanpy      1.6.0
sinfo       0.3.1
-----
PIL                 8.0.1
anndata             0.7.5
backcall            0.2.0
cairo               1.20.0
cffi                1.14.3
cloudpickle         1.6.0
colorama            0.4.4
cycler              0.10.0
cython_runtime      NA
cytoolz             0.11.0
dask                1.0.0
dateutil            2.8.1
decorator           4.4.2
get_version         2.1
h5py                3.1.0
igraph              0.8.3
ipykernel           5.3.4
ipython_genutils    0.2.0
jedi                0.17.2
jinja2              2.11.2
joblib              0.17.0
kiwisolver          1.3.1
legacy_api_wrap     0.0.0
leidenalg           0.8.3
llvmlite            0.34.0
louvain             0.7.0
markupsafe          1.1.1
matplotlib          3.3.3
mpl_toolkits        NA
natsort             7.1.0
numba               0.51.2
numexpr             2.7.1
numpy               1.19.4
packaging           20.4
pandas              1.1.4
parso               0.7

'/opt/conda/bin/python'

# Load data

In [2]:
# Load the AnnData object written by the doublet-scoring step (path is relative
# to the notebook's working directory).
adata = sc.read('FCAsn-M1doubletscores.h5ad')
# Display the matrix dimensions as the cell output.
adata.X.shape

(7924, 29383)

In [3]:
# Use rpy2 to transfer the raw sparse count matrix to R as three plain vectors
# (values, row pointers, column indices).
# The R cell interprets these vectors as row-compressed (CSR) data with one row
# per observation, so force CSR here: a CSC-backed raw.X would otherwise be read
# transposed without any error. `.tocsr()` is a no-op when raw.X is already CSR.
raw_counts = adata.raw.X.tocsr()
spx = rpy2.robjects.FloatVector(raw_counts.data)
spp = rpy2.robjects.IntVector(raw_counts.indptr)
spi = rpy2.robjects.IntVector(raw_counts.indices)
spo = adata.obs  # observation annotations, passed to R with `-i spo`
spv = adata.var  # variable annotations, passed to R with `-i spv`

In [4]:
# Preview the first rows of the observation table that will be sent to R.
spo.head()

Unnamed: 0,n_genes,sample,donor,Library_ATAC Library_RNA iRods_path,percent_mito,n_counts,batch,scrublet_score,scrublet_cluster_score,zscore,bh_pval,bonf_pval,is_doublet
HD_F_GON9525419_AAACAGCCACCTCAGG,4816,HD_F_GON9525419,Hrv15,HD_F_GON9525611 HD_F_GON9525419 ...,0.000665,12037.0,0,0.070248,0.145821,0.607078,0.764257,1.0,False
HD_F_GON9525419_AAACATGCAGGCTAGA,4581,HD_F_GON9525419,Hrv15,HD_F_GON9525611 HD_F_GON9525419 ...,0.000544,12873.0,0,0.140127,0.12069,0.309535,0.764257,1.0,False
HD_F_GON9525419_AAACCAACATACTCCT,1754,HD_F_GON9525419,Hrv15,HD_F_GON9525611 HD_F_GON9525419 ...,0.0,2951.0,0,0.050967,0.050967,-0.515946,0.764257,1.0,False
HD_F_GON9525419_AAACCAACATGGTTAT,1880,HD_F_GON9525419,Hrv15,HD_F_GON9525611 HD_F_GON9525419 ...,0.000713,2807.0,0,0.129909,0.151515,0.674491,0.764257,1.0,False
HD_F_GON9525419_AAACCGAAGCGTGCAC,3804,HD_F_GON9525419,Hrv15,HD_F_GON9525611 HD_F_GON9525419 ...,0.000357,8394.0,0,0.151515,0.164286,0.825687,0.764257,1.0,False


In [8]:
%%R -i spx -i spp -i spi -i spo -i spv -o spox -o spop -o spoi
library(SingleCellExperiment)
library(Matrix)
source("Rcode.R")

# Seed for the stochastic rounding step at the end of this cell, so that
# re-running the notebook reproduces the same integer counts.
rounding_seed <- 42

# Rebuild the raw count matrix (genes x cells) from the three vectors passed in
# from Python. The row-compressed layout of a cells x genes matrix is exactly the
# column-compressed layout of its transpose, so `spi` holds zero-based gene
# indices and `spp` the per-cell column pointers.
# Dimensions are taken from the annotation tables instead of max(spi)+1, which
# undercounts the genes whenever the trailing genes have no counts at all.
stopifnot(length(spp) - 1 == nrow(spo))
rawdata <- sparseMatrix(
    i = as.integer(spi),
    p = as.integer(spp),
    x = as.numeric(spx),
    dims = c(nrow(spv), nrow(spo)),
    index1 = FALSE
)
rownames(rawdata) <- rownames(spv)
colnames(rawdata) <- rownames(spo)

# Wrap the counts and both annotation tables in a SingleCellExperiment.
sce <- SingleCellExperiment(rawdata)
names(assays(sce)) <- c("rawdata")
for (column in colnames(spo)) {colData(sce)[column] <- spo[[column]]}
for (column in colnames(spv)) {rowData(sce)[column] <- spv[[column]]}

samples <- names(table(spo$sample))
print(samples)
library(celda)

# Run decontX separately on each sample and collect the decontaminated counts
# into one genes x cells matrix with the same column order as `rawdata`.
denoisedmatrix <- Matrix(0, nrow(rawdata), ncol(rawdata), sparse = TRUE)
for (sample_id in samples) {
    in_sample <- spo$sample == sample_id
    # table() also reports unused factor levels; skip samples without cells.
    if (!any(in_sample)) next
    print(paste("Processing", sample_id, "with", sum(in_sample), "cells"))
    # drop = FALSE keeps a matrix even if the sample has a single cell.
    submat <- as.matrix(rawdata[, in_sample, drop = FALSE])
    print(dim(submat))
    mode(submat) <- "integer"
    res <- decontX(submat)
    print('Contamination summary:')
    print(summary(res$contamination))
    denoisedmatrix[, in_sample] <- res$decontXcounts
}

# Sample fractional counts to integers: keep the integer part and add 1 with
# probability equal to the fractional part.
set.seed(rounding_seed)
fractional_part <- denoisedmatrix@x %% 1
denoisedmatrix@x <- floor(denoisedmatrix@x) + (runif(length(fractional_part)) < fractional_part)

# Transpose to cells x genes and export the column-compressed slots to Python.
denoisedmatrix <- t(denoisedmatrix)
spox <- denoisedmatrix@x
spoi <- denoisedmatrix@i
spop <- denoisedmatrix@p

[1] "HD_F_GON9525419" "HD_F_GON9525420" "HD_F_GON9525421" "HD_F_GON9525422"
[1] "Processing HD_F_GON9525419 with 1396 cells"
[1] 29383  1396


R[write to console]: --------------------------------------------------

R[write to console]: Starting DecontX

R[write to console]: --------------------------------------------------

R[write to console]: Tue Feb 16 22:08:52 2021 .. Analyzing all cells

R[write to console]: Tue Feb 16 22:08:52 2021 .... Converting to sparse matrix

R[write to console]: Tue Feb 16 22:08:52 2021 .... Generating UMAP and estimating cell types

R[write to console]: Tue Feb 16 22:08:57 2021 .... Estimating contamination

R[write to console]: Tue Feb 16 22:08:58 2021 ...... Completed iteration: 10 | converge: 0.02685

R[write to console]: Tue Feb 16 22:08:58 2021 ...... Completed iteration: 20 | converge: 0.01859

R[write to console]: Tue Feb 16 22:08:59 2021 ...... Completed iteration: 30 | converge: 0.01181

R[write to console]: Tue Feb 16 22:08:59 2021 ...... Completed iteration: 40 | converge: 0.006875

R[write to console]: Tue Feb 16 22:09:00 2021 ...... Completed iteration: 50 | converge: 0.004131

R[

[1] 29383  1396
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
0.0001642 0.0267815 0.0571044 0.1130662 0.1264615 0.9990556 
[1] "Processing HD_F_GON9525420 with 1426 cells"
[1] 29383  1426


R[write to console]: --------------------------------------------------

R[write to console]: Starting DecontX

R[write to console]: --------------------------------------------------

R[write to console]: Tue Feb 16 22:10:16 2021 .. Analyzing all cells

R[write to console]: Tue Feb 16 22:10:16 2021 .... Converting to sparse matrix

R[write to console]: Tue Feb 16 22:10:16 2021 .... Generating UMAP and estimating cell types

R[write to console]: Tue Feb 16 22:10:21 2021 .... Estimating contamination

R[write to console]: Tue Feb 16 22:10:22 2021 ...... Completed iteration: 10 | converge: 0.02579

R[write to console]: Tue Feb 16 22:10:22 2021 ...... Completed iteration: 20 | converge: 0.01182

R[write to console]: Tue Feb 16 22:10:23 2021 ...... Completed iteration: 30 | converge: 0.006791

R[write to console]: Tue Feb 16 22:10:23 2021 ...... Completed iteration: 40 | converge: 0.004353

R[write to console]: Tue Feb 16 22:10:24 2021 ...... Completed iteration: 50 | converge: 0.003024

R

[1] 29383  1426
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.000209 0.029923 0.062638 0.117151 0.125154 0.997333 
[1] "Processing HD_F_GON9525421 with 1731 cells"
[1] 29383  1731


R[write to console]: --------------------------------------------------

R[write to console]: Starting DecontX

R[write to console]: --------------------------------------------------

R[write to console]: Tue Feb 16 22:11:39 2021 .. Analyzing all cells

R[write to console]: Tue Feb 16 22:11:40 2021 .... Converting to sparse matrix

R[write to console]: Tue Feb 16 22:11:40 2021 .... Generating UMAP and estimating cell types

R[write to console]: Tue Feb 16 22:11:45 2021 .... Estimating contamination

R[write to console]: Tue Feb 16 22:11:46 2021 ...... Completed iteration: 10 | converge: 0.02331

R[write to console]: Tue Feb 16 22:11:46 2021 ...... Completed iteration: 20 | converge: 0.009291

R[write to console]: Tue Feb 16 22:11:46 2021 ...... Completed iteration: 30 | converge: 0.005361

R[write to console]: Tue Feb 16 22:11:47 2021 ...... Completed iteration: 40 | converge: 0.002684

R[write to console]: Tue Feb 16 22:11:47 2021 ...... Completed iteration: 50 | converge: 0.002024



[1] 29383  1731
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
0.0005148 0.0537918 0.1073617 0.1795773 0.2427796 0.9816453 
[1] "Processing HD_F_GON9525422 with 3371 cells"
[1] 29383  3371


R[write to console]: --------------------------------------------------

R[write to console]: Starting DecontX

R[write to console]: --------------------------------------------------

R[write to console]: Tue Feb 16 22:12:45 2021 .. Analyzing all cells

R[write to console]: Tue Feb 16 22:12:45 2021 .... Converting to sparse matrix

R[write to console]: Tue Feb 16 22:12:46 2021 .... Generating UMAP and estimating cell types

R[write to console]: Tue Feb 16 22:12:55 2021 .... Estimating contamination

R[write to console]: Tue Feb 16 22:12:56 2021 ...... Completed iteration: 10 | converge: 0.02702

R[write to console]: Tue Feb 16 22:12:57 2021 ...... Completed iteration: 20 | converge: 0.009933

R[write to console]: Tue Feb 16 22:12:58 2021 ...... Completed iteration: 30 | converge: 0.005964

R[write to console]: Tue Feb 16 22:12:58 2021 ...... Completed iteration: 40 | converge: 0.00374

R[write to console]: Tue Feb 16 22:12:59 2021 ...... Completed iteration: 50 | converge: 0.002581

R

[1] 29383  3371
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
0.0000489 0.0637749 0.1592577 0.2487973 0.3583391 0.9970615 


In [6]:
# Replace normalized and raw data with the denoised counts returned by the R cell.
from scipy import sparse

# spox/spoi/spop are the column-compressed slots (values, row indices, column
# pointers) of the transposed denoised matrix exported by the R cell, so they
# are read back as a CSC matrix with the same shape as adata.X.
denoised_mat = sparse.csc_matrix((spox, spoi, spop), shape=adata.X.shape)
denoised_mat.eliminate_zeros()  # some counts might have been down sampled to 0
# Store the counts row-compressed (CSR). The previous `.tocsc()` call was a no-op
# on a matrix that is already CSC, which left `raw.X` column-compressed, while
# the sparse-transfer code earlier in this notebook reads `raw.X.indptr` /
# `raw.X.indices` as row pointers / column indices.
denoised_mat = denoised_mat.tocsr()
adata = anndata.AnnData(denoised_mat, obs=adata.obs, var=adata.var)
# Keep a copy of the denoised counts as the new raw layer.
adata.raw = adata.copy()

# Save

In [7]:
# Write the denoised AnnData object to disk (path is relative to the notebook's
# working directory).
adata.write('FCAsn-M2denoised.h5ad')