# FCA analysis - ambient RNA decontamination with DecontX (donor Hrv39)

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import os
import sys
import warnings
import anndata
import anndata
import rpy2

%load_ext rpy2.ipython
%matplotlib inline


# Ignore all Python warnings from here on (no category or module filter is given).
warnings.filterwarnings('ignore')


def MovePlots(plotpattern, subplotdir):
    """Move saved figures whose name contains ``plotpattern`` into a sub-folder.

    Every entry directly under ``sc.settings.figdir`` whose name matches
    ``*<plotpattern>*`` is moved into ``<figdir>/<subplotdir>``. The
    destination folder is created when it does not exist yet. An existing
    file with the same name in the destination is replaced.

    Parameters
    ----------
    plotpattern : str
        Text that the file name must contain (glob wildcards are honoured).
    subplotdir : str
        Name of the destination folder, relative to ``sc.settings.figdir``.
    """
    # Local imports keep this helper self-contained.
    import glob
    import shutil

    figdir = str(sc.settings.figdir)
    target = os.path.join(figdir, subplotdir)
    os.makedirs(target, exist_ok=True)

    target_abs = os.path.abspath(target)
    for src in glob.glob(os.path.join(figdir, '*' + plotpattern + '*')):
        src_abs = os.path.abspath(src)
        # The destination (or one of its parent folders) can itself match the
        # pattern; it must never be moved into itself.
        if target_abs == src_abs or target_abs.startswith(src_abs + os.sep):
            continue
        # Naming the full destination path lets an existing file be overwritten.
        shutil.move(src, os.path.join(target, os.path.basename(src)))


# Scanpy session settings for this notebook.
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Base figure folder; MovePlots reads it back through sc.settings.figdir.
sc.settings.figdir = './figures-sn/preprocessing/'
# Print the versions of the loaded packages (the table shown in the cell output).
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

# Last expression of the cell: displays the path of the Python interpreter in use.
sys.executable

-----
anndata     0.7.5
scanpy      1.6.0
sinfo       0.3.1
-----
PIL                 8.0.1
anndata             0.7.5
backcall            0.2.0
cairo               1.20.0
cffi                1.14.3
cloudpickle         1.6.0
colorama            0.4.4
cycler              0.10.0
cython_runtime      NA
cytoolz             0.11.0
dask                1.0.0
dateutil            2.8.1
decorator           4.4.2
get_version         2.1
h5py                3.1.0
igraph              0.8.3
ipykernel           5.3.4
ipython_genutils    0.2.0
jedi                0.17.2
jinja2              2.11.2
joblib              0.17.0
kiwisolver          1.3.1
legacy_api_wrap     0.0.0
leidenalg           0.8.3
llvmlite            0.34.0
louvain             0.7.0
markupsafe          1.1.1
matplotlib          3.3.3
mpl_toolkits        NA
natsort             7.1.0
numba               0.51.2
numexpr             2.7.1
numpy               1.19.4
packaging           20.4
pandas              1.1.4
parso               0.7

'/opt/conda/bin/python'

# Load data

In [2]:
# Root folder of the gonads revision data. The trailing slash is required
# because file names are appended to it by plain string concatenation.
path_to_gonads = '/nfs/team292/lg18/with_valentina/gonadsV2_revision/'

# Read the multiome RNA count matrix and display its dimensions.
rna_counts_file = path_to_gonads + 'multiomics_rna_counts.h5ad'
adata = sc.read(rna_counts_file)
adata.X.shape

(73218, 33997)

In [3]:
# adata = adata[['Hrv39' in i for i in adata.obs.donor]]
# set(adata.obs['sample'])

In [4]:
# Donor ID lists, named by sex / stage (not referenced by the cells shown in this notebook).
males = ['Hrv41', 'Hrv3', 'Hrv15']
female_early = ['Hrv93', 'Hrv92']
female_late = ['Hrv58', 'Hrv39', 'Hrv65', 'Hrv91']

# Keep only the cells of a single donor.
# The donor is matched exactly: a substring test is fragile with these IDs
# ('Hrv3' is itself a prefix of 'Hrv39').
donor_id = 'Hrv39'
donor_mask = (adata.obs['donor'] == donor_id).to_numpy()
assert donor_mask.any(), f"no cells found for donor {donor_id!r}"

# .copy() turns the subset into a stand-alone AnnData instead of a view.
adata = adata[donor_mask].copy()

In [5]:
# Use rpy2 to transfer the sparse matrix to R: the three arrays of the sparse
# raw count matrix are wrapped as R vectors, and the obs / var tables are
# passed along with them.
# NOTE(review): the way the R cell rebuilds the matrix suggests these are the CSR
# arrays of a cells x genes matrix - confirm adata.raw.X is stored as CSR.
raw_counts = adata.raw.X
spx = rpy2.robjects.FloatVector(raw_counts.data)     # non-zero values
spi = rpy2.robjects.IntVector(raw_counts.indices)    # index of each value
spp = rpy2.robjects.IntVector(raw_counts.indptr)     # pointers into data / indices
spo, spv = adata.obs, adata.var

In [6]:
# Preview the first rows of the per-cell metadata table that is handed to R as `spo`.
spo.head()

Unnamed: 0,n_genes,sample,donor,Library_ATAC,Library_RNA,iRods_path,percent_mito,n_counts,batch,scrublet_score,scrublet_cluster_score,zscore,bh_pval,bonf_pval
HD_F_GON9525421_AAACAGCCAACAACAA,986,HD_F_GON9525421,Hrv39,HD_F_GON9525613,HD_F_GON9525421,/seq/illumina/cellranger-arc/cellranger-arc101...,0.0,1423.0,2,0.205882,0.176471,0.674491,0.943466,1.0
HD_F_GON9525421_AAACATGCACACCAAC,2474,HD_F_GON9525421,Hrv39,HD_F_GON9525613,HD_F_GON9525421,/seq/illumina/cellranger-arc/cellranger-arc101...,0.000189,5278.0,2,0.098039,0.152941,0.202347,0.943466,1.0
HD_F_GON9525421_AAACCAACAACACTTG,3723,HD_F_GON9525421,Hrv39,HD_F_GON9525613,HD_F_GON9525421,/seq/illumina/cellranger-arc/cellranger-arc101...,0.000604,6619.0,2,0.243697,0.223529,1.618778,0.452691,1.0
HD_F_GON9525421_AAACCAACACCTACTT,3237,HD_F_GON9525421,Hrv39,HD_F_GON9525613,HD_F_GON9525421,/seq/illumina/cellranger-arc/cellranger-arc101...,0.000695,7199.0,2,0.12532,0.12532,-0.351908,0.943466,1.0
HD_F_GON9525421_AAACCGAAGCCTGGTA,6198,HD_F_GON9525421,Hrv39,HD_F_GON9525613,HD_F_GON9525421,/seq/illumina/cellranger-arc/cellranger-arc101...,0.001147,17437.0,2,0.12532,0.170279,0.550242,0.943466,1.0


In [None]:
%%R -i spx -i spp -i spi -i spo -i spv -o spox -o spop -o spoi
# Run DecontX (celda) separately on every sample and hand the denoised,
# integer-rounded counts back to Python as the three arrays of a sparse matrix.
#   inputs : spx / spi / spp  - values, indices and pointers of the raw counts
#            spo / spv        - per-cell and per-gene tables
#   outputs: spox / spoi / spop - slots x / i / p of the transposed
#            (cells x genes) denoised dgCMatrix
library(SingleCellExperiment)
library(Matrix)
library(celda)
source("Rcode.R")  # project-local script; nothing defined in it is referenced by name in this cell

# Matrix dimensions come from the annotation tables rather than from max(spi)+1:
# the largest index underestimates the number of genes when the last genes
# have no counts in the selected cells.
n_genes <- nrow(spv)
n_cells <- nrow(spo)
stopifnot(length(spp) - 1 == n_cells,
          length(spi) == 0 || max(spi) < n_genes)

# The row-compressed arrays of the cells x genes matrix are read here as the
# column-compressed arrays of its transpose, giving a genes x cells matrix.
# sparseMatrix() validates the indices and sorts them within each column.
rawdata <- sparseMatrix(i = as.integer(spi), p = as.integer(spp), x = spx,
                        dims = c(n_genes, n_cells), index1 = FALSE)
rownames(rawdata) <- rownames(spv)
colnames(rawdata) <- rownames(spo)

# SingleCellExperiment wrapper carrying the annotations (not used further in this cell).
sce <- SingleCellExperiment(rawdata)
names(assays(sce)) <- c("rawdata")
for (col in colnames(spo)) {colData(sce)[col] <- spo[[col]]}
for (col in colnames(spv)) {rowData(sce)[col] <- spv[[col]]}

# Only samples that actually have cells: unique() on the values ignores
# factor levels that are empty after subsetting.
samples <- sort(unique(as.character(spo$sample)))
print(samples)

denoisedmatrix <- Matrix(0, n_genes, n_cells, sparse = TRUE)
for (sample_id in samples) {
    in_sample <- spo$sample == sample_id
    print(paste("Processing", sample_id, "with", sum(in_sample), "cells"))
    # drop = FALSE keeps a matrix even when a sample holds a single cell
    submat <- as.matrix(rawdata[, in_sample, drop = FALSE])
    print(dim(submat))
    mode(submat) <- "integer"
    res <- decontX(submat)
    print('Contamination summary:')
    print(summary(res$contamination))
    denoisedmatrix[, in_sample] <- res$decontXcounts
}

# Stochastic rounding of fractional counts to integers: each value is rounded
# up with probability equal to its fractional part, down otherwise.
# The seed makes the rounding reproducible between runs.
rounding_seed <- 12345
set.seed(rounding_seed)
denoisedmatrix@x <- floor(denoisedmatrix@x) + (runif(length(denoisedmatrix@x)) < (denoisedmatrix@x %% 1))

# Transpose back to cells x genes before exporting the sparse slots.
denoisedmatrix <- t(denoisedmatrix)
spox <- denoisedmatrix@x
spoi <- denoisedmatrix@i
spop <- denoisedmatrix@p

R[write to console]: Loading required package: SummarizedExperiment

R[write to console]: Loading required package: MatrixGenerics

R[write to console]: Loading required package: matrixStats

R[write to console]: 
Attaching package: ‘MatrixGenerics’


R[write to console]: The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiff

[1] "FCA_GND10287602" "HD_F_GON9525421" "HD_F_GON9525422"


R[write to console]: 
Attaching package: ‘celda’


R[write to console]: The following object is masked from ‘package:S4Vectors’:

    params




[1] "Processing FCA_GND10287602 with 1575 cells"
[1] 33997  1575


R[write to console]: --------------------------------------------------

R[write to console]: Starting DecontX

R[write to console]: --------------------------------------------------

R[write to console]: Mon Sep 20 15:18:35 2021 .. Analyzing all cells

R[write to console]: Mon Sep 20 15:18:35 2021 .... Converting to sparse matrix

R[write to console]: Mon Sep 20 15:18:36 2021 .... Generating UMAP and estimating cell types

R[write to console]: Mon Sep 20 15:18:42 2021 .... Estimating contamination

R[write to console]: Mon Sep 20 15:18:43 2021 ...... Completed iteration: 10 | converge: 0.04537

R[write to console]: Mon Sep 20 15:18:44 2021 ...... Completed iteration: 20 | converge: 0.01232

R[write to console]: Mon Sep 20 15:18:45 2021 ...... Completed iteration: 30 | converge: 0.006068

R[write to console]: Mon Sep 20 15:18:46 2021 ...... Completed iteration: 40 | converge: 0.003548

R[write to console]: Mon Sep 20 15:18:46 2021 ...... Completed iteration: 50 | converge: 0.002301

R

[1] "Contamination summary:"
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
0.0002116 0.0553910 0.1261761 0.2667507 0.4235339 0.9996762 
[1] "Processing HD_F_GON9525421 with 1731 cells"
[1] 33997  1731


R[write to console]: --------------------------------------------------

R[write to console]: Starting DecontX

R[write to console]: --------------------------------------------------

R[write to console]: Mon Sep 20 15:20:44 2021 .. Analyzing all cells

R[write to console]: Mon Sep 20 15:20:45 2021 .... Converting to sparse matrix

R[write to console]: Mon Sep 20 15:20:45 2021 .... Generating UMAP and estimating cell types

R[write to console]: Mon Sep 20 15:20:51 2021 .... Estimating contamination

R[write to console]: Mon Sep 20 15:20:51 2021 ...... Completed iteration: 10 | converge: 0.02035

R[write to console]: Mon Sep 20 15:20:52 2021 ...... Completed iteration: 20 | converge: 0.008352

R[write to console]: Mon Sep 20 15:20:52 2021 ...... Completed iteration: 30 | converge: 0.005426

R[write to console]: Mon Sep 20 15:20:52 2021 ...... Completed iteration: 40 | converge: 0.003511

R[write to console]: Mon Sep 20 15:20:53 2021 ...... Completed iteration: 50 | converge: 0.002305



[1] "Contamination summary:"
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
0.0005938 0.0461560 0.0758898 0.1103087 0.1307258 0.8397358 


In [None]:
# Replace normalized and raw data with the denoised counts returned by the R cell.
from scipy import sparse

# spox / spoi / spop are the x / i / p slots of the column-compressed
# cells x genes matrix exported from R, hence the CSC constructor.
# NOTE(review): the shape is taken from adata.X although the counts sent to R
# came from adata.raw.X - this assumes both have the same number of genes.
denoised_mat = sparse.csc_matrix((spox, spoi, spop), shape=adata.X.shape)
denoised_mat.eliminate_zeros()  # some counts might have been down-sampled to 0

# Store the result row-compressed (CSR). The matrix is already CSC, so the
# previous .tocsc() call did nothing, and the export cell of this notebook
# reads adata.raw.X as a row-compressed matrix.
denoised_mat = denoised_mat.tocsr()

adata = anndata.AnnData(denoised_mat, obs=adata.obs, var=adata.var)
adata.raw = adata.copy()

# Save

In [None]:
# Write the denoised AnnData into the same folder the input was read from.
denoised_file = path_to_gonads+'multiomics_rna_counts_denoised_Hrv39.h5ad'
adata.write(denoised_file)