In [1]:
library(reticulate) 
library(Seurat)

Attaching SeuratObject



In [2]:
#### user specified: where the data set lives

data_folder    <- "D:/analyze_Pearson_residuals/"
data_subfolder <- "33k_PBMC"

# data_folder already carries its trailing "/", so the two parts are
# concatenated without any separator to form the data directory.
data_path <- paste ( data_folder, data_subfolder, sep = "" )

In [3]:
# ---- pickle file names -------------------------------------------------

# output data
residual_variance_pkl <- "Sg_complementary_samples.pkl"

# input data
gene_array_pkl <- "gene_array.pkl"
cell_array_pkl <- "cell_array.pkl"
counts_pkl     <- "counts_scipy_csc.pkl"

selected_genes_array_pkl <- "gene_array_complementary_samples.pkl"
df_selected_cells_pkl    <- "df_cells_complementary_samples.pkl"


# ---- full paths: every file sits directly inside data_path -------------

# path: output data
residual_variance_dsn <- paste ( data_path, residual_variance_pkl, sep = "/" )

# paths: input data
gene_array_dsn <- paste ( data_path, gene_array_pkl, sep = "/" )
cell_array_dsn <- paste ( data_path, cell_array_pkl, sep = "/" )
counts_dsn     <- paste ( data_path, counts_pkl, sep = "/" )

selected_genes_array_dsn <- paste ( data_path, selected_genes_array_pkl, sep = "/" )
df_selected_cells_dsn    <- paste ( data_path, df_selected_cells_pkl, sep = "/" )

In [4]:
# Read the pickled gene identifiers, then show their count and first entries.
gene_array <- py_load_object ( filename = gene_array_dsn )
print ( dim ( gene_array ) )
print ( head ( gene_array ) )

[1] 12324
[1] "ENSG00000254709" "ENSG00000163736" "ENSG00000087086" "ENSG00000163220"
[5] "ENSG00000090382" "ENSG00000132465"


In [5]:
# Read the pickled cell identifiers, then show their count and first entries.
cell_array <- py_load_object ( filename = cell_array_dsn )
print ( dim ( cell_array ) )
print ( head ( cell_array ) )

[1] 33148
[1] "AAACATTGACGACT-1" "AAACATTGACGGGA-1" "AAACATTGCCGCTT-1" "AAACATTGCTCATT-1"
[5] "AAACCGTGCCCTCA-1" "AAACCGTGCCTTTA-1"


In [6]:
# Read the pickled count matrix.
df_counts <- py_load_object ( filename = counts_dsn )

# Label rows with the gene identifiers and columns with the cell identifiers;
# the two identifier arrays are not needed afterwards and are dropped.
rownames ( df_counts ) <- gene_array
colnames ( df_counts ) <- cell_array
rm ( list = c ( "gene_array", "cell_array" ) )

# Quick look: dimensions and the top-left corner of the matrix.
print ( dim ( df_counts ) )
print ( head ( df_counts [, 1:5] ) )

[1] 12324 33148
6 x 5 sparse Matrix of class "dgCMatrix"
                AAACATTGACGACT-1 AAACATTGACGGGA-1 AAACATTGCCGCTT-1
ENSG00000254709                .                .                .
ENSG00000163736                .                .                .
ENSG00000087086                3                6               39
ENSG00000163220                .                .                .
ENSG00000090382                1                .                4
ENSG00000132465                .                .                .
                AAACATTGCTCATT-1 AAACCGTGCCCTCA-1
ENSG00000254709                .                .
ENSG00000163736                .                .
ENSG00000087086                1                3
ENSG00000163220                .                .
ENSG00000090382                .                .
ENSG00000132465                .                .


In [7]:
# Genes retained for the complementary-samples analysis.
selected_genes_array <- py_load_object ( filename = selected_genes_array_dsn )

# Per-cell selection table: one column per sample, rows named by cell.
df_selected_cells <- py_load_object ( filename = df_selected_cells_dsn )
print ( dim ( df_selected_cells ) )
print ( head ( df_selected_cells ) )

# Cell names in the row order of the selection table.
rownames_selected_cells <- rownames ( df_selected_cells )

[1] 33148     2
                     0     1
AAACATTGACGACT-1 FALSE  TRUE
AAACATTGACGGGA-1  TRUE FALSE
AAACATTGCCGCTT-1  TRUE FALSE
AAACATTGCTCATT-1  TRUE FALSE
AAACCGTGCCCTCA-1 FALSE  TRUE
AAACCGTGCCTTTA-1  TRUE FALSE


In [8]:
# Pick the count-matrix columns by cell name, in the row order of the
# selection table; the full matrix is released afterwards to free memory.
df_counts_selected_cells <- df_counts [, rownames_selected_cells]
rm ( list = "df_counts" )

print ( dim ( df_counts_selected_cells ) )
print ( head ( df_counts_selected_cells [, 1:5] ) )

[1] 12324 33148
6 x 5 sparse Matrix of class "dgCMatrix"
                AAACATTGACGACT-1 AAACATTGACGGGA-1 AAACATTGCCGCTT-1
ENSG00000254709                .                .                .
ENSG00000163736                .                .                .
ENSG00000087086                3                6               39
ENSG00000163220                .                .                .
ENSG00000090382                1                .                4
ENSG00000132465                .                .                .
                AAACATTGCTCATT-1 AAACCGTGCCCTCA-1
ENSG00000254709                .                .
ENSG00000163736                .                .
ENSG00000087086                1                3
ENSG00000163220                .                .
ENSG00000090382                .                .
ENSG00000132465                .                .


In [9]:
# Keep only the rows whose gene identifier is among the selected genes;
# the intermediate matrix is released afterwards to free memory.
df_counts_selected <- df_counts_selected_cells [
  rownames ( df_counts_selected_cells ) %in% selected_genes_array,
]
rm ( list = "df_counts_selected_cells" )

print ( dim ( df_counts_selected ) )
print ( head ( df_counts_selected [, 1:5] ) )

[1] 10908 33148
6 x 5 sparse Matrix of class "dgCMatrix"
                AAACATTGACGACT-1 AAACATTGACGGGA-1 AAACATTGCCGCTT-1
ENSG00000254709                .                .                .
ENSG00000163736                .                .                .
ENSG00000087086                3                6               39
ENSG00000163220                .                .                .
ENSG00000090382                1                .                4
ENSG00000132465                .                .                .
                AAACATTGCTCATT-1 AAACCGTGCCCTCA-1
ENSG00000254709                .                .
ENSG00000163736                .                .
ENSG00000087086                1                3
ENSG00000163220                .                .
ENSG00000090382                .                .
ENSG00000132465                .                .


In [10]:

# For each of the two complementary cell samples: subset the count matrix to
# that sample's cells, run SCTransform on it, and collect the per-gene
# residual variance stored in the fitted SCT model.
#
# Result: list_df_residual_variance[[k]] is a data frame with a
# "gene_vector" column (gene identifiers) and a column named "k-1"
# (residual variances for sample k).

# output data frames
list_df_residual_variance <- list()

# The loop variable is deliberately not called `sample`, which would mask
# base::sample for the rest of the session.
for ( sample_idx in 1:2 )
{
  print ( paste ( 'sample:', sample_idx ) )

  # Column sample_idx of df_selected_cells is the selection mask of this
  # sample. Index the cell-name vector with it directly: going through a
  # one-column data.frame would turn the names into a factor on R < 4.0.
  cells_select <- rownames_selected_cells[ df_selected_cells[[ sample_idx ]] ]

  df_counts_select_sample <- df_counts_selected[, cells_select ]
  print ( dim ( df_counts_select_sample ) )

  seurat_object <- CreateSeuratObject(counts = df_counts_select_sample , project = data_subfolder )
  seurat_object <- SCTransform(seurat_object, vst.flavor="v2", method = "glmGamPoi", variable.features.n=10 )

  # A single SCTransform call is expected to store exactly one model; take it
  # by position so the lookup does not depend on the model's name.
  feature_attributes <- seurat_object@assays$SCT@SCTModel.list[[ 1 ]]@feature.attributes

  # Gene names and residual variances are both read from the same data frame,
  # so they are aligned row by row by construction (rather than relying on
  # the SCT data matrix having the same genes in the same order).
  df_residual_variance <- data.frame ( gene_vector = rownames ( feature_attributes ),
                                       stringsAsFactors = FALSE )
  str_sample <- as.character ( sample_idx - 1 )
  df_residual_variance[[ str_sample ]] <- feature_attributes$residual_variance

  print ( '--------------------------------------------------------'  )

  list_df_residual_variance[[ sample_idx ]] <- df_residual_variance
}

[1] "sample: 1"
[1] 10908 16593


vst.flavor='v2' set, setting model to use fixed slope and exclude poisson genes.

Calculating cell attributes from input UMI matrix: log_umi

Total Step 1 genes: 10908

Total overdispersed genes: 10308

Excluding 600 genes from Step 1 because they are not overdispersed.

Variance stabilizing transformation of count matrix of size 10908 by 16593

Model formula is y ~ log_umi

Get Negative Binomial regression parameters per gene

Using 2000 genes, 5000 cells





Setting estimate of  266 genes to inf as theta_mm/theta_mle < 1e-3

# of step1 poisson genes (variance < mean): 0

# of low mean genes (mean < 0.001): 0

Total # of Step1 poisson genes (theta=Inf; variance < mean): 266

Total # of poisson genes (theta=Inf; variance < mean): 600

Calling offset model for all 600 poisson genes

Found 296 outliers - those will be ignored in fitting/regularization step


Ignoring theta inf genes

Replacing fit params for 600 poisson genes by theta=Inf

Setting min_variance based on median UMI:  0.04

Second step: Get residuals using fitted parameters for 10908 genes





Computing corrected count matrix for 10908 genes





Calculating gene attributes

Wall clock passed: Time difference of 53.13371 secs

Determine variable features

Place corrected count matrix in counts slot

Centering data matrix

Set default assay to SCT



[1] "--------------------------------------------------------"
[1] "sample: 2"
[1] 10908 16555


vst.flavor='v2' set, setting model to use fixed slope and exclude poisson genes.

Calculating cell attributes from input UMI matrix: log_umi

Total Step 1 genes: 10908

Total overdispersed genes: 10300

Excluding 608 genes from Step 1 because they are not overdispersed.

Variance stabilizing transformation of count matrix of size 10908 by 16555

Model formula is y ~ log_umi

Get Negative Binomial regression parameters per gene

Using 2000 genes, 5000 cells





Setting estimate of  306 genes to inf as theta_mm/theta_mle < 1e-3

# of step1 poisson genes (variance < mean): 0

# of low mean genes (mean < 0.001): 0

Total # of Step1 poisson genes (theta=Inf; variance < mean): 306

Total # of poisson genes (theta=Inf; variance < mean): 608

Calling offset model for all 608 poisson genes

Found 384 outliers - those will be ignored in fitting/regularization step


Ignoring theta inf genes

Replacing fit params for 608 poisson genes by theta=Inf

Setting min_variance based on median UMI:  0.04

Second step: Get residuals using fitted parameters for 10908 genes





Computing corrected count matrix for 10908 genes





Calculating gene attributes

Wall clock passed: Time difference of 52.56205 secs

Determine variable features

Place corrected count matrix in counts slot

Centering data matrix

Set default assay to SCT



[1] "--------------------------------------------------------"


In [11]:
# Join the two per-sample tables on the gene identifier, giving one row per
# gene present in both, with one residual-variance column per sample.
df_residual_variance_return <- merge (
  x  = list_df_residual_variance[[1]],
  y  = list_df_residual_variance[[2]],
  by = "gene_vector"
)
print ( head ( df_residual_variance_return ) )

      gene_vector          0          1
1 ENSG00000000419 0.87870455 1.14293142
2 ENSG00000000457 0.36943663 0.46791485
3 ENSG00000000460 0.09660799 0.09763996
4 ENSG00000000938 1.49990399 1.52813510
5 ENSG00000000971 0.15858405 0.17405765
6 ENSG00000001036 0.73151767 1.07396048


In [12]:
# Write the merged residual-variance table to the output pickle file.
py_save_object ( object = df_residual_variance_return, filename = residual_variance_dsn )