In [1]:
library(reticulate) 
library(Seurat)

Attaching SeuratObject



In [2]:
#### user specified

# Root folder of all data sets and the name of the data set analyzed here.
# The two are concatenated without a separator, so the slash between them
# has to come from the end of data_folder.
data_folder <- "D:/analyze_Pearson_residuals/"
data_subfolder <- "retinal"

# Folder that holds the pickle files of the chosen data set.
data_path <- paste ( data_folder, data_subfolder, sep = "" )

In [3]:
# File names of the pickle files; all of them live directly in data_path.

# output data
residual_variance_pkl <- "Sg_complementary_samples.pkl"

# input data
gene_array_pkl <- "gene_array.pkl"
cell_array_pkl <- "cell_array.pkl"
counts_pkl <- "counts_scipy_csc.pkl"

selected_genes_array_pkl <- "gene_array_complementary_samples.pkl"
df_selected_cells_pkl <- "df_cells_complementary_samples.pkl"


# Full paths: file.path() joins its arguments with '/', which is exactly the
# separator the file names need after data_path.

# path: output data
residual_variance_dsn <- file.path ( data_path, residual_variance_pkl )


# paths: input data
gene_array_dsn <- file.path ( data_path, gene_array_pkl )
cell_array_dsn <- file.path ( data_path, cell_array_pkl )
counts_dsn <- file.path ( data_path, counts_pkl )

selected_genes_array_dsn <- file.path ( data_path, selected_genes_array_pkl )
df_selected_cells_dsn <- file.path ( data_path, df_selected_cells_pkl )

In [4]:
# Read the pickled gene array through reticulate, then show its size and first entries.
gene_array <- py_load_object ( filename = gene_array_dsn )
print ( dim ( gene_array ) )
print ( head ( gene_array ) )

[1] 13552
[1] "CARTPT"  "RGS5"    "RHO"     "APOE"    "GLUL"    "MT-RNR2"


In [5]:
# Read the pickled cell array through reticulate, then show its size and first entries.
cell_array <- py_load_object ( filename = cell_array_dsn )
print ( dim ( cell_array ) )
print ( head ( cell_array ) )

[1] 24769
[1] "r4_TGCCACATGGGC" "r4_ATCGGCTACCGA" "r4_TAGATATCTTAT" "r4_CCTGGATTGTAC"
[5] "r4_CGGTAATAGGAA" "r4_CAACCGAATATA"


In [6]:
# Read the pickled count matrix and label it: genes on the rows, cells on the columns.
df_counts <- py_load_object ( filename = counts_dsn )

colnames ( df_counts ) <- cell_array
rownames ( df_counts ) <- gene_array
# The name arrays are attached to the matrix now, so the standalone copies can go.
rm ( cell_array, gene_array )

print ( dim ( df_counts ) )
print ( head ( df_counts [ , 1:5 ] ) )

[1] 13552 24769
6 x 5 sparse Matrix of class "dgCMatrix"
        r4_TGCCACATGGGC r4_ATCGGCTACCGA r4_TAGATATCTTAT r4_CCTGGATTGTAC
CARTPT                .              45             166               1
RGS5                  .               .               .               .
RHO                 496               5               4               2
APOE                  .               .               .               .
GLUL                  7               4               1               3
MT-RNR2              69              49              51              77
        r4_CGGTAATAGGAA
CARTPT              102
RGS5                  1
RHO                   3
APOE                  .
GLUL                  5
MT-RNR2              72


In [7]:
# Genes to keep for the analysis.
selected_genes_array <- py_load_object ( filename = selected_genes_array_dsn )

# Per-cell sample membership table; its row names are the cell names.
df_selected_cells <- py_load_object ( filename = df_selected_cells_dsn )
rownames_selected_cells <- rownames ( df_selected_cells )

print ( dim ( df_selected_cells ) )
print ( head ( df_selected_cells ) )

[1] 24769     2
                    0     1
r4_TGCCACATGGGC FALSE  TRUE
r4_ATCGGCTACCGA  TRUE FALSE
r4_TAGATATCTTAT  TRUE FALSE
r4_CCTGGATTGTAC  TRUE FALSE
r4_CGGTAATAGGAA FALSE  TRUE
r4_CAACCGAATATA  TRUE FALSE


In [8]:
# Pick the count columns by cell name, in the row order of df_selected_cells.
df_counts_selected_cells <- df_counts [ , rownames_selected_cells ]
# Free the full matrix; only the column-selected copy is used from here on.
rm ( df_counts )
print ( dim ( df_counts_selected_cells ) )
print ( head ( df_counts_selected_cells [ , 1:5 ] ) )

[1] 13552 24769
6 x 5 sparse Matrix of class "dgCMatrix"
        r4_TGCCACATGGGC r4_ATCGGCTACCGA r4_TAGATATCTTAT r4_CCTGGATTGTAC
CARTPT                .              45             166               1
RGS5                  .               .               .               .
RHO                 496               5               4               2
APOE                  .               .               .               .
GLUL                  7               4               1               3
MT-RNR2              69              49              51              77
        r4_CGGTAATAGGAA
CARTPT              102
RGS5                  1
RHO                   3
APOE                  .
GLUL                  5
MT-RNR2              72


In [9]:
# Keep only the rows whose gene name occurs in selected_genes_array.
df_counts_selected <- df_counts_selected_cells [ is.element ( rownames ( df_counts_selected_cells ), selected_genes_array ), ]
# The cell-selected intermediate is no longer needed.
rm ( df_counts_selected_cells )
print ( dim ( df_counts_selected ) )
print ( head ( df_counts_selected [ , 1:5 ] ) )

[1] 12082 24769
6 x 5 sparse Matrix of class "dgCMatrix"
        r4_TGCCACATGGGC r4_ATCGGCTACCGA r4_TAGATATCTTAT r4_CCTGGATTGTAC
CARTPT                .              45             166               1
RGS5                  .               .               .               .
RHO                 496               5               4               2
APOE                  .               .               .               .
GLUL                  7               4               1               3
MT-RNR2              69              49              51              77
        r4_CGGTAATAGGAA
CARTPT              102
RGS5                  1
RHO                   3
APOE                  .
GLUL                  5
MT-RNR2              72


In [10]:

# Run SCTransform separately on each of the two cell samples and collect the
# per-gene residual variance reported by each fit.
#
# Reads (from earlier cells):
#   df_selected_cells        one selection column per sample, one row per cell
#   rownames_selected_cells  cell names in the row order of df_selected_cells
#   df_counts_selected       count matrix, genes on rows, cells on columns
#   data_subfolder           used as the Seurat project name
# Writes:
#   list_df_residual_variance[[ sample ]]: data frame with column 'gene_vector'
#   and a column named "0" (sample 1) or "1" (sample 2) with the residual variance.

# output data frames
list_df_residual_variance = list()

for ( sample in (1:2) )
{
  print ( paste ( 'sample:', sample ) )

  # Selection flags of this sample, applied directly to the cell names
  # (no intermediate data frame, so the names stay plain character strings).
  vect_select = df_selected_cells[[ sample ]]
  cells_select = rownames_selected_cells[ vect_select ]

  df_counts_select_sample = df_counts_selected[, cells_select ]  
  print ( dim ( df_counts_select_sample ) )

  seurat_object <-  CreateSeuratObject(counts = df_counts_select_sample , project = data_subfolder )  
  seurat_object <- SCTransform(seurat_object, vst.flavor="v2", method = "glmGamPoi", variable.features.n=10 )

  # Gene names and residual variances are both read from the model's
  # feature.attributes table, so every value is labelled by its own row
  # instead of relying on a second object having the same gene order.
  feature_attributes = seurat_object@assays$SCT@SCTModel.list$model1@feature.attributes
  gene_vector = rownames ( feature_attributes )
  residual_variance = feature_attributes$residual_variance

  df_residual_variance <- data.frame( gene_vector )
  # Column label is the zero-based sample index.
  str_sample = as.character ( sample - 1 )
  df_residual_variance[[ str_sample ]] = residual_variance
  
  print ( '--------------------------------------------------------'  ) 
 
  list_df_residual_variance[[ sample ]]  = df_residual_variance 
} 

[1] "sample: 1"
[1] 12082 12448


vst.flavor='v2' set, setting model to use fixed slope and exclude poisson genes.

Calculating cell attributes from input UMI matrix: log_umi

Total Step 1 genes: 12082

Total overdispersed genes: 12062

Excluding 20 genes from Step 1 because they are not overdispersed.

Variance stabilizing transformation of count matrix of size 12082 by 12448

Model formula is y ~ log_umi

Get Negative Binomial regression parameters per gene

Using 2000 genes, 5000 cells





Setting estimate of  34 genes to inf as theta_mm/theta_mle < 1e-3

# of step1 poisson genes (variance < mean): 0

# of low mean genes (mean < 0.001): 0

Total # of Step1 poisson genes (theta=Inf; variance < mean): 34

Total # of poisson genes (theta=Inf; variance < mean): 20

Calling offset model for all 20 poisson genes

Found 51 outliers - those will be ignored in fitting/regularization step


Ignoring theta inf genes

Replacing fit params for 20 poisson genes by theta=Inf

Setting min_variance based on median UMI:  0.04

Second step: Get residuals using fitted parameters for 12082 genes





Computing corrected count matrix for 12082 genes





Calculating gene attributes

Wall clock passed: Time difference of 48.45075 secs

Determine variable features

Place corrected count matrix in counts slot

Centering data matrix

Set default assay to SCT



[1] "--------------------------------------------------------"
[1] "sample: 2"
[1] 12082 12321


vst.flavor='v2' set, setting model to use fixed slope and exclude poisson genes.

Calculating cell attributes from input UMI matrix: log_umi

Total Step 1 genes: 12082

Total overdispersed genes: 12063

Excluding 19 genes from Step 1 because they are not overdispersed.

Variance stabilizing transformation of count matrix of size 12082 by 12321

Model formula is y ~ log_umi

Get Negative Binomial regression parameters per gene

Using 2000 genes, 5000 cells





Setting estimate of  25 genes to inf as theta_mm/theta_mle < 1e-3

# of step1 poisson genes (variance < mean): 0

# of low mean genes (mean < 0.001): 0

Total # of Step1 poisson genes (theta=Inf; variance < mean): 25

Total # of poisson genes (theta=Inf; variance < mean): 19

Calling offset model for all 19 poisson genes

Found 42 outliers - those will be ignored in fitting/regularization step


Ignoring theta inf genes

Replacing fit params for 19 poisson genes by theta=Inf

Setting min_variance based on median UMI:  0.04

Second step: Get residuals using fitted parameters for 12082 genes





Computing corrected count matrix for 12082 genes





Calculating gene attributes

Wall clock passed: Time difference of 46.05532 secs

Determine variable features

Place corrected count matrix in counts slot

Centering data matrix

Set default assay to SCT



[1] "--------------------------------------------------------"


In [11]:
# Join the two per-sample tables on the gene name column.
df_residual_variance_return <- merge (
  x  = list_df_residual_variance[[ 1 ]],
  y  = list_df_residual_variance[[ 2 ]],
  by = "gene_vector"
)
print ( head ( df_residual_variance_return ) )

    gene_vector         0         1
1 0610007N19RIK 0.2410941 0.2212500
2 0610007P14RIK 0.8831853 0.8252444
3 0610009B22RIK 1.0836001 1.0768527
4 0610009D07RIK 0.8824066 0.8213193
5 0610009E02RIK 0.1887797 0.2348251
6 0610009L18RIK 0.3827116 0.3493258


In [12]:
# Pickle the merged residual-variance table to the output path via reticulate.
py_save_object ( object = df_residual_variance_return, filename = residual_variance_dsn )