# NVF Mouse Islets - Preprocessing - Prepare Data for DropletUtils

2023-11-25 10:44:48    

# Setup

In [1]:
# Widen the notebook container so wide outputs are easier to read.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated and warns.
from IPython.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

  from IPython.core.display import display, HTML


In [2]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging
import gc # Free memory #gc.collect()

# Analysis
import scanpy as sc

In [3]:
# Settings

## Scanpy settings
### Logging verbosity; per the scanpy docs the scale is 0=errors, 1=warnings, 2=info, 3=hints
sc.settings.verbosity = 3
### Print the versions of the loaded packages so the run can be reproduced
sc.logging.print_versions()

-----
anndata     0.8.0
scanpy      1.9.3
-----
PIL                 9.4.0
asttokens           NA
backcall            0.2.0
cffi                1.15.1
comm                0.1.2
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
debugpy             1.6.6
decorator           5.1.1
defusedxml          0.7.1
executing           1.2.0
h5py                3.8.0
igraph              0.10.8
importlib_resources NA
ipykernel           6.21.2
ipython_genutils    0.2.0
jedi                0.18.2
joblib              1.2.0
kiwisolver          1.4.4
leidenalg           0.10.1
llvmlite            0.39.1
louvain             0.8.1
matplotlib          3.7.1
mpl_toolkits        NA
natsort             8.3.1
numba               0.56.4
numexpr             2.8.6
numpy               1.23.5
packaging           23.0
pandas              1.5.3
parso               0.8.3
pexpect             4.8.0
pickleshare         0.7.5
pkg_resources       NA
platformdirs        3.0.0
prompt_toolkit      3.0

# Setup R

In [4]:
# R setup: select the R installation and load the rpy2 bridge
import os
# R_HOME is set before rpy2 is imported below (presumably so rpy2 binds to this R installation).
# NOTE(review): the sessionInfo() output further down reports BLAS/LAPACK from a
# 'scverse_2023_test' environment while R_HOME points to 'scverse_2023' - confirm this is intended.
os.environ['R_HOME'] = '/home/michi/Software/venvs/scverse_2023/lib/R' # path to your R installation (machine-specific, adjust as needed)

import rpy2
import rpy2.robjects as ro
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri

## R settings

### Ignore R warning messages (only log records of level ERROR or above are emitted)
#### Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

### Activate the automatic rpy2 converters for pandas and AnnData objects
pandas2ri.activate()
anndata2ri.activate()
### Load the rpy2 IPython extension, which provides the %%R cell magic used in later cells
%load_ext rpy2.ipython

In [5]:
%%R

# Show the library search path of the embedded R session to verify which R library is in use.
.libPaths()

[1] "/home/michi/Software/venvs/scverse_2023/lib/R/library"


In [6]:
%%R
# General
#library(tidyverse) # incl. ggplot2 (optional, currently not loaded)
library(rhdf5)
library(Matrix)
library(SingleCellExperiment)
library(DropletUtils)

# Parallelization
# Single place to change the number of workers used by all three backends below.
n_workers <- 64

library(BiocParallel)
register(MulticoreParam(n_workers, progressbar = TRUE))

library(future)
plan(multicore, workers = n_workers)
# Upper limit for the total size of globals exported to future workers:
# 100 * 1024^3 bytes = 100 GiB.
options(future.globals.maxSize = 100 * 1024 ^ 3)
plan() # print the active future plan

library(doParallel)
registerDoParallel(n_workers)

# Record the R session (versions of R and attached packages) for reproducibility.
sessionInfo()

R version 4.2.2 (2022-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS/LAPACK: /home/michi/Software/venvs/scverse_2023_test/lib/libopenblasp-r0.3.21.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
 [1] parallel  stats4    tools     stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] doParallel_1.0.17           iterators_1.0.14           
 [3] foreach_1.5.2               future_1.33.0              
 [5] BiocParallel_1.32.5         DropletUtils_1.18.1        
 [7] SingleCellExperiment_1.20.1 SummarizedExperiment_1.28.0
 [9] Biobase_2.

# Functions

In [7]:
def run_dropletUtils(sample=None, # sample name found in base_path e.g. '21L011182'
                     base_path=None, # path to cellranger output folder containing different samples e.g. '/storage/scRNA-seq/scRNA-seq_iPSC_IIR-KO_S5_P21092B/data/cr_rev_7/cellranger/'
                     outs_path=None, # folder name containing count matrices (include "/" before) e.g. '/count_matrices'
                     fdr=0.05, # a barcode is called a cell when its emptyDrops FDR is <= this value
                     seed=None # optional integer seed for R's RNG; None leaves the RNG state untouched
                    ):
    """Run DropletUtils' emptyDrops on one sample's raw GEX matrix and save the results.

    The input file is
    ``<base_path><sample><outs_path>/<sample>_raw_gex_bc_matrix.h5ad``.
    Paths are built by plain string concatenation, so ``base_path`` must end
    with "/" and ``outs_path`` must start with "/".

    Three CSV files are written next to the input file:

    * ``<sample>_DropletUtils_CellBarcodes.csv``     - barcodes with FDR <= ``fdr``
    * ``<sample>_DropletUtils_AmbientGenes.csv``     - ambient profile from the emptyDrops metadata
    * ``<sample>_DropletUtils_LogProbabilities.csv`` - per-barcode ``LogProb``

    The R code is evaluated in R's global environment, so the R objects it
    creates (``sce``, ``drops``, ``barcode_ranks``, ...) remain available to
    later ``%%R`` cells.

    Parameters
    ----------
    sample : str
        Sample (sub-folder) name.
    base_path : str
        Folder containing the sample folders, with trailing "/".
    outs_path : str
        Sub-folder holding the count matrices, with leading "/".
    fdr : float, default 0.05
        FDR threshold used to call cells.
    seed : int or None, default None
        If given, ``set.seed(seed)`` is called in R before the analysis so the
        Monte Carlo based emptyDrops results can be reproduced.

    Raises
    ------
    ValueError
        If ``sample``, ``base_path`` or ``outs_path`` is not provided.
    """
    # Fail early with a clear message instead of an opaque TypeError on `None + '/'`.
    missing = [name for name, value in (('sample', sample),
                                        ('base_path', base_path),
                                        ('outs_path', outs_path)) if value is None]
    if missing:
        raise ValueError(f"run_dropletUtils() missing required argument(s): {', '.join(missing)}")

    ro.globalenv['sample'] = sample
    ro.globalenv['base_path'] = base_path
    ro.globalenv['outs_path'] = outs_path + '/'
    ro.globalenv['fdr_threshold'] = float(fdr)
    if seed is not None:
        ro.r('set.seed(%d)' % int(seed))
    ro.r('''
    path_to_adata = paste0(base_path, sample, outs_path, sample, "_raw_gex_bc_matrix.h5ad")

    print(paste0('Loading ',path_to_adata))

    adata <- h5read(path_to_adata, "/", compoundAsDataFrame=FALSE)

    barcodes <- as.character(adata$obs$`_index`)
    genes <- as.character(adata$var$`_index`)
    counts <- adata$X$data
    indices <- adata$X$indices
    pointer <- adata$X$indptr

    print("Construct SingleCellExperiment object as input for DropUtils")
    # Assumes X was written row-compressed as barcodes x genes: reading the same
    # arrays as column-compressed then yields the genes x barcodes matrix.
    # Passing `dims` explicitly makes a layout/size mismatch an immediate error
    # instead of inferring the number of genes from the largest index present.
    sparse_mat <- sparseMatrix(p = as.numeric(pointer), x = as.numeric(counts), i = as.numeric(indices)+1,
                               dims = c(length(genes), length(barcodes)))
    sce <- SingleCellExperiment(assays = list(counts = sparse_mat), colData = DataFrame(barcode = barcodes))
    rownames(sce) <- genes
    colnames(sce) <- barcodes

    print("Computing barcode ranks")
    # NOTE(review): the ranks are not written to disk; they only stay in R's global environment.
    barcode_ranks <- barcodeRanks(counts(sce))

    print("Run DropUtils")
    drops <- emptyDrops(counts(sce))
    rownames(drops) <- barcodes
    is_cell <- drops$FDR <= fdr_threshold

    print("Save output")
    # which() drops NA entries of is_cell, so barcodes without an FDR are not reported as cells
    cell_barcodes <- barcodes[which(is_cell)]
    write.csv(cell_barcodes, file=paste0(base_path, sample, outs_path, sample, "_DropletUtils_CellBarcodes.csv"))

    ambient_genes <- metadata(drops)$ambient
    write.csv(data.frame(ambient_genes), file=paste0(base_path, sample, outs_path, sample, "_DropletUtils_AmbientGenes.csv"))

    cell_probs <- drops$LogProb
    write.csv(data.frame(barcodes=rownames(drops), cell_probs=cell_probs), file=paste0(base_path, sample, outs_path, sample, "_DropletUtils_LogProbabilities.csv"))
    ''')

# Prepare Data for DropletUtils

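Load the raw data, filter out droplets with fewer than 1 count (and features detected in no droplet), save the combined, GEX-only and ATAC-only matrices, and then run DropletUtils on the GEX matrix of each sample.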

In [8]:
samples = ['E14_5','E15_5','NVF_E15-5_Rep2','NVF_E16-5_Rep1']
base_path = '/storage/scRNA-seq/scMultiome_Mouse-Islets_NVF_E14-E16_P23044/cr_arc_rev9/cr_count/'
outs_path = '/outs'

# One entry per modality: (label printed before the shape, value in var.feature_types, output file suffix)
modalities = (
    ('Shape GEX:', 'Gene Expression', '_raw_gex_bc_matrix'),
    ('Shape ATAC:', 'Peaks', '_raw_atac_bc_matrix'),
)

for sample in samples:
    path = base_path + sample + outs_path
    print('Loading ' + path)
    adata = sc.read_10x_h5(path + '/raw_feature_bc_matrix.h5', gex_only=False)
    print(adata.shape)

    # Drop barcodes without any count and features not detected in any barcode
    sc.pp.filter_cells(adata, min_counts=1)
    sc.pp.filter_genes(adata, min_cells=1)
    print(adata.shape,'\n\n')

    out_prefix = path + '/' + sample

    # Save combined
    sc.write(out_prefix + '_raw_feature_bc_matrix', adata)

    # Split and save GEX and ATAC
    for shape_label, feature_type, file_suffix in modalities:
        in_modality = adata.var.feature_types.isin([feature_type])
        print(shape_label, adata[:, in_modality].shape)
        sc.write(out_prefix + file_suffix, adata[:, in_modality])

    # Free the memory held by the count matrix before handing over to R
    del adata
    gc.collect()

    run_dropletUtils(sample=sample, base_path=base_path, outs_path=outs_path)


Loading /storage/scRNA-seq/scMultiome_Mouse-Islets_NVF_E14-E16_P23044/cr_arc_rev9/cr_count/E14_5/outs
reading /storage/scRNA-seq/scMultiome_Mouse-Islets_NVF_E14-E16_P23044/cr_arc_rev9/cr_count/E14_5/outs/raw_feature_bc_matrix.h5
 (0:00:11)
(735798, 242829)
filtered out 355 cells that have less than 1 counts


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


filtered out 6622 genes that are detected in less than 1 cells


  utils.warn_names_duplicates("var")


(735443, 236207) 


Shape GEX: (735443, 24633)
Shape ATAC: (735443, 211574)
[1] "Loading /storage/scRNA-seq/scMultiome_Mouse-Islets_NVF_E14-E16_P23044/cr_arc_rev9/cr_count/E14_5/outs/E14_5_raw_gex_bc_matrix.h5ad"
[1] "Construct SingleCellExperiment object as input for DropUtils"
[1] "Computing barcode ranks"
[1] "Run DropUtils"
[1] "Save output"
Loading /storage/scRNA-seq/scMultiome_Mouse-Islets_NVF_E14-E16_P23044/cr_arc_rev9/cr_count/E15_5/outs
reading /storage/scRNA-seq/scMultiome_Mouse-Islets_NVF_E14-E16_P23044/cr_arc_rev9/cr_count/E15_5/outs/raw_feature_bc_matrix.h5
 (0:00:06)
(735587, 221966)
filtered out 768 cells that have less than 1 counts


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


filtered out 6471 genes that are detected in less than 1 cells


  utils.warn_names_duplicates("var")


(734819, 215495) 


Shape GEX: (734819, 24784)
Shape ATAC: (734819, 190711)
[1] "Loading /storage/scRNA-seq/scMultiome_Mouse-Islets_NVF_E14-E16_P23044/cr_arc_rev9/cr_count/E15_5/outs/E15_5_raw_gex_bc_matrix.h5ad"
[1] "Construct SingleCellExperiment object as input for DropUtils"
[1] "Computing barcode ranks"
[1] "Run DropUtils"
[1] "Save output"
Loading /storage/scRNA-seq/scMultiome_Mouse-Islets_NVF_E14-E16_P23044/cr_arc_rev9/cr_count/NVF_E15-5_Rep2/outs
reading /storage/scRNA-seq/scMultiome_Mouse-Islets_NVF_E14-E16_P23044/cr_arc_rev9/cr_count/NVF_E15-5_Rep2/outs/raw_feature_bc_matrix.h5
 (0:00:06)
(734646, 223466)
filtered out 2532 cells that have less than 1 counts


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


filtered out 6224 genes that are detected in less than 1 cells


  utils.warn_names_duplicates("var")


(732114, 217242) 


Shape GEX: (732114, 25031)
Shape ATAC: (732114, 192211)
[1] "Loading /storage/scRNA-seq/scMultiome_Mouse-Islets_NVF_E14-E16_P23044/cr_arc_rev9/cr_count/NVF_E15-5_Rep2/outs/NVF_E15-5_Rep2_raw_gex_bc_matrix.h5ad"
[1] "Construct SingleCellExperiment object as input for DropUtils"
[1] "Computing barcode ranks"
[1] "Run DropUtils"
[1] "Save output"
Loading /storage/scRNA-seq/scMultiome_Mouse-Islets_NVF_E14-E16_P23044/cr_arc_rev9/cr_count/NVF_E16-5_Rep1/outs
reading /storage/scRNA-seq/scMultiome_Mouse-Islets_NVF_E14-E16_P23044/cr_arc_rev9/cr_count/NVF_E16-5_Rep1/outs/raw_feature_bc_matrix.h5
 (0:00:07)
(735492, 212894)
filtered out 1419 cells that have less than 1 counts


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


filtered out 6284 genes that are detected in less than 1 cells


  utils.warn_names_duplicates("var")


(734073, 206610) 


Shape GEX: (734073, 24971)
Shape ATAC: (734073, 181639)
[1] "Loading /storage/scRNA-seq/scMultiome_Mouse-Islets_NVF_E14-E16_P23044/cr_arc_rev9/cr_count/NVF_E16-5_Rep1/outs/NVF_E16-5_Rep1_raw_gex_bc_matrix.h5ad"
[1] "Construct SingleCellExperiment object as input for DropUtils"
[1] "Computing barcode ranks"
[1] "Run DropUtils"
[1] "Save output"
