In [1]:
library(reticulate) 
library(Seurat)

Attaching SeuratObject



In [2]:
#### User-specified settings
# NOTE: data_folder must end with "/" -- the two parts are concatenated
# without inserting a separator.
data_folder    <- "D:/analyze_Pearson_residuals/"
data_subfolder <- "10k_heart"

# Folder of the data set: <data_folder><data_subfolder>
data_path <- paste ( data_folder, data_subfolder, sep = "" )

In [3]:
# ---- file names ----------------------------------------------------------

# output
residual_variance_pkl <- "Sg_complementary_samples.pkl"

# inputs: full data set
gene_array_pkl <- "gene_array.pkl"
cell_array_pkl <- "cell_array.pkl"
counts_pkl     <- "counts_scipy_csc.pkl"

# inputs: gene selection and per-sample cell selection
selected_genes_array_pkl <- "gene_array_complementary_samples.pkl"
df_selected_cells_pkl    <- "df_cells_complementary_samples.pkl"


# ---- full paths: <data_path>/<file name> ---------------------------------

# output
residual_variance_dsn <- paste ( data_path, residual_variance_pkl, sep = "/" )

# inputs
gene_array_dsn <- paste ( data_path, gene_array_pkl, sep = "/" )
cell_array_dsn <- paste ( data_path, cell_array_pkl, sep = "/" )
counts_dsn     <- paste ( data_path, counts_pkl, sep = "/" )

selected_genes_array_dsn <- paste ( data_path, selected_genes_array_pkl, sep = "/" )
df_selected_cells_dsn    <- paste ( data_path, df_selected_cells_pkl, sep = "/" )

In [4]:
# Unpickle the gene identifiers and show their count plus the first entries.
gene_array <- py_load_object ( filename = gene_array_dsn )
print ( dim ( gene_array ) )
print ( head ( gene_array, n = 6L ) )

[1] 14472
[1] "ENSMUSG00000052305" "ENSMUSG00000069919" "ENSMUSG00000073940"
[4] "ENSMUSG00000069917" "ENSMUSG00000038791" "ENSMUSG00000041616"


In [5]:
# Unpickle the cell barcodes and show their count plus the first entries.
cell_array <- py_load_object ( filename = cell_array_dsn )
print ( dim ( cell_array ) )
print ( head ( cell_array, n = 6L ) )

[1] 7713
[1] "AAACCCAAGCGAGTCA-1" "AAACCCAAGGTCATTC-1" "AAACCCACACTGTGTA-1"
[4] "AAACCCACATATCTGG-1" "AAACCCACATCAACCA-1" "AAACCCACATGTGCCG-1"


In [6]:
# Unpickle the count matrix and label it: genes on rows, cells on columns.
df_counts <- py_load_object ( filename = counts_dsn )

colnames ( df_counts ) <- cell_array
rownames ( df_counts ) <- gene_array

# The label vectors are no longer needed once attached to the matrix.
rm ( list = c ( "gene_array", "cell_array" ) )

print ( dim ( df_counts ) )
print ( head ( df_counts [ , 1:5 ] ) )

[1] 14472  7713
6 x 5 sparse Matrix of class "dgCMatrix"
                   AAACCCAAGCGAGTCA-1 AAACCCAAGGTCATTC-1 AAACCCACACTGTGTA-1
ENSMUSG00000052305                823                  3                  .
ENSMUSG00000069919                268                  .                  .
ENSMUSG00000073940                 33                  .                  .
ENSMUSG00000069917                165                  .                  .
ENSMUSG00000038791                  .                  1                  .
ENSMUSG00000041616                  .                  .                  .
                   AAACCCACATATCTGG-1 AAACCCACATCAACCA-1
ENSMUSG00000052305                  1                  4
ENSMUSG00000069919                  .                  .
ENSMUSG00000073940                  1                  .
ENSMUSG00000069917                  .                  .
ENSMUSG00000038791                  .                  1
ENSMUSG00000041616                  4                  .


In [7]:
# Genes to keep for the analysis.
selected_genes_array <- py_load_object ( filename = selected_genes_array_dsn )

# One row per cell, one column per sample; each column is used further down
# as a mask that picks the cells belonging to that sample.
df_selected_cells <- py_load_object ( filename = df_selected_cells_dsn )
print ( dim ( df_selected_cells ) )
print ( head ( df_selected_cells, n = 6L ) )

# Cell barcodes, in the row order of df_selected_cells.
rownames_selected_cells <- row.names ( df_selected_cells )

[1] 7713    2
                       0     1
AAACCCAAGCGAGTCA-1 FALSE  TRUE
AAACCCAAGGTCATTC-1  TRUE FALSE
AAACCCACACTGTGTA-1  TRUE FALSE
AAACCCACATATCTGG-1  TRUE FALSE
AAACCCACATCAACCA-1 FALSE  TRUE
AAACCCACATGTGCCG-1  TRUE FALSE


In [8]:
# Pick the count-matrix columns by cell barcode, in the row order of
# df_selected_cells, then release the full matrix.
df_counts_selected_cells <- df_counts [ , rownames_selected_cells ]
rm ( list = "df_counts" )

print ( dim ( df_counts_selected_cells ) )
print ( head ( df_counts_selected_cells [ , 1:5 ] ) )

[1] 14472  7713
6 x 5 sparse Matrix of class "dgCMatrix"
                   AAACCCAAGCGAGTCA-1 AAACCCAAGGTCATTC-1 AAACCCACACTGTGTA-1
ENSMUSG00000052305                823                  3                  .
ENSMUSG00000069919                268                  .                  .
ENSMUSG00000073940                 33                  .                  .
ENSMUSG00000069917                165                  .                  .
ENSMUSG00000038791                  .                  1                  .
ENSMUSG00000041616                  .                  .                  .
                   AAACCCACATATCTGG-1 AAACCCACATCAACCA-1
ENSMUSG00000052305                  1                  4
ENSMUSG00000069919                  .                  .
ENSMUSG00000073940                  1                  .
ENSMUSG00000069917                  .                  .
ENSMUSG00000038791                  .                  1
ENSMUSG00000041616                  4                  .


In [9]:
# Keep only the rows (genes) listed in selected_genes_array; the original row
# order of the matrix is preserved. The intermediate matrix is then released.
df_counts_selected <- df_counts_selected_cells [ is.element ( rownames ( df_counts_selected_cells ), selected_genes_array ), ]
rm ( list = "df_counts_selected_cells" )

print ( dim ( df_counts_selected ) )
print ( head ( df_counts_selected [ , 1:5 ] ) )

[1] 12995  7713
6 x 5 sparse Matrix of class "dgCMatrix"
                   AAACCCAAGCGAGTCA-1 AAACCCAAGGTCATTC-1 AAACCCACACTGTGTA-1
ENSMUSG00000052305                823                  3                  .
ENSMUSG00000069919                268                  .                  .
ENSMUSG00000073940                 33                  .                  .
ENSMUSG00000069917                165                  .                  .
ENSMUSG00000038791                  .                  1                  .
ENSMUSG00000041616                  .                  .                  .
                   AAACCCACATATCTGG-1 AAACCCACATCAACCA-1
ENSMUSG00000052305                  1                  4
ENSMUSG00000069919                  .                  .
ENSMUSG00000073940                  1                  .
ENSMUSG00000069917                  .                  .
ENSMUSG00000038791                  .                  1
ENSMUSG00000041616                  4                  .


In [10]:
# output data frames
list_df_residual_variance = list()

for ( sample in (1:2) )
{
  print ( paste ( 'sample:', sample ) )
  df_sample = df_selected_cells[ sample ]
  
  colnames(df_sample)  <-'select'
  vect_select = df_sample$select
  
  df_sample_new = data.frame ( rownames_selected_cells) 
  cells_select = df_sample_new[ vect_select, ]

  df_counts_select_sample = df_counts_selected[, cells_select ]  
  print ( paste ( "dim ( df_counts_select_sample ):",  dim ( df_counts_select_sample ) ) )

  seurat_object <-  CreateSeuratObject(counts = df_counts_select_sample , project = data_subfolder )
  print ( dim ( seurat_object ) )
  print ( paste ( "dim ( seurat_object ):",  dim ( seurat_object ) ) )
  
  seurat_object <- SCTransform(seurat_object, vst.flavor="v2", method = "glmGamPoi", variable.features.n=10 )

  gene_list = seurat_object@assays$SCT@data@Dimnames[1]
  gene_vector = unlist ( gene_list )

  residual_variance = seurat_object@assays$SCT@SCTModel.list$model1@feature.attributes$residual_variance

  df_residual_variance <- data.frame( gene_vector )
  str_sample = as.character ( sample - 1 )
  df_residual_variance[[ str_sample ]] = residual_variance
  
  print ( paste ( "head ( df_residual_variance ):", head ( df_residual_variance ) ) )  
  
  print ( paste ( "dim ( df_residual_variance ):",   dim ( df_residual_variance ) ) )  
  print ( '--------------------------------------------------------'  ) 
 
  list_df_residual_variance[[ sample ]]  = df_residual_variance 
} 

[1] "sample: 1"
[1] "dim ( df_counts_select_sample ): 12995"
[2] "dim ( df_counts_select_sample ): 3919" 
[1] 12995  3919
[1] "dim ( seurat_object ): 12995" "dim ( seurat_object ): 3919" 


vst.flavor='v2' set, setting model to use fixed slope and exclude poisson genes.

Calculating cell attributes from input UMI matrix: log_umi

Total Step 1 genes: 12995

Total overdispersed genes: 12963

Excluding 32 genes from Step 1 because they are not overdispersed.

Variance stabilizing transformation of count matrix of size 12995 by 3919

Model formula is y ~ log_umi

Get Negative Binomial regression parameters per gene

Using 2000 genes, 3919 cells





Setting estimate of  14 genes to inf as theta_mm/theta_mle < 1e-3

# of step1 poisson genes (variance < mean): 0

# of low mean genes (mean < 0.001): 0

Total # of Step1 poisson genes (theta=Inf; variance < mean): 14

Total # of poisson genes (theta=Inf; variance < mean): 32

Calling offset model for all 32 poisson genes

Found 45 outliers - those will be ignored in fitting/regularization step


Ignoring theta inf genes

Replacing fit params for 32 poisson genes by theta=Inf

Setting min_variance based on median UMI:  0.04

Second step: Get residuals using fitted parameters for 12995 genes





Computing corrected count matrix for 12995 genes





Calculating gene attributes

Wall clock passed: Time difference of 26.63454 secs

Determine variable features

Place corrected count matrix in counts slot

Centering data matrix

Set default assay to SCT



[1] "head ( df_residual_variance ): c(\"ENSMUSG00000052305\", \"ENSMUSG00000069919\", \"ENSMUSG00000073940\", \"ENSMUSG00000069917\", \"ENSMUSG00000038791\", \"ENSMUSG00000041616\")"
[2] "head ( df_residual_variance ): c(485.863754157298, 475.510626155345, 385.034146465051, 469.77741059485, 8.97783521463509, 72.5389830494749)"                                     
[1] "dim ( df_residual_variance ): 12995" "dim ( df_residual_variance ): 2"    
[1] "--------------------------------------------------------"
[1] "sample: 2"
[1] "dim ( df_counts_select_sample ): 12995"
[2] "dim ( df_counts_select_sample ): 3794" 
[1] 12995  3794
[1] "dim ( seurat_object ): 12995" "dim ( seurat_object ): 3794" 


vst.flavor='v2' set, setting model to use fixed slope and exclude poisson genes.

Calculating cell attributes from input UMI matrix: log_umi

Total Step 1 genes: 12995

Total overdispersed genes: 12971

Excluding 24 genes from Step 1 because they are not overdispersed.

Variance stabilizing transformation of count matrix of size 12995 by 3794

Model formula is y ~ log_umi

Get Negative Binomial regression parameters per gene

Using 2000 genes, 3794 cells





Setting estimate of  13 genes to inf as theta_mm/theta_mle < 1e-3

# of step1 poisson genes (variance < mean): 0

# of low mean genes (mean < 0.001): 0

Total # of Step1 poisson genes (theta=Inf; variance < mean): 13

Total # of poisson genes (theta=Inf; variance < mean): 24

Calling offset model for all 24 poisson genes

Found 37 outliers - those will be ignored in fitting/regularization step


Ignoring theta inf genes

Replacing fit params for 24 poisson genes by theta=Inf

Setting min_variance based on median UMI:  0.04

Second step: Get residuals using fitted parameters for 12995 genes





Computing corrected count matrix for 12995 genes





Calculating gene attributes

Wall clock passed: Time difference of 24.32034 secs

Determine variable features

Place corrected count matrix in counts slot

Centering data matrix

Set default assay to SCT



[1] "head ( df_residual_variance ): c(\"ENSMUSG00000052305\", \"ENSMUSG00000069919\", \"ENSMUSG00000073940\", \"ENSMUSG00000069917\", \"ENSMUSG00000038791\", \"ENSMUSG00000041616\")"
[2] "head ( df_residual_variance ): c(453.855440237947, 450.552117601969, 361.050870506398, 443.871645368363, 5.16101094610831, 65.2703612964483)"                                    
[1] "dim ( df_residual_variance ): 12995" "dim ( df_residual_variance ): 2"    
[1] "--------------------------------------------------------"


In [11]:
# Join the two per-sample tables on the gene identifier. merge() defaults to an
# inner join, so only genes present in both samples are kept.
df_residual_variance_return <- merge (
  x  = list_df_residual_variance[[ 1 ]],
  y  = list_df_residual_variance[[ 2 ]],
  by = "gene_vector"
)
print ( head ( df_residual_variance_return, n = 6L ) )

         gene_vector          0          1
1 ENSMUSG00000000001  0.6435774  0.6745212
2 ENSMUSG00000000028  0.9566533  0.9116582
3 ENSMUSG00000000031 36.8947528 41.7027355
4 ENSMUSG00000000037  0.2617622  0.3161400
5 ENSMUSG00000000056  1.0293356  0.9308244
6 ENSMUSG00000000058  1.7151442  1.8838529


In [12]:
# Pickle the merged residual-variance table to the output path.
py_save_object ( object = df_residual_variance_return, filename = residual_variance_dsn )