In [1]:
import pandas as pd
import numpy  as np

import pickle
  
from scipy.sparse import issparse 
 
from pathlib import Path


# Show at most 5 columns when a DataFrame is printed.
pd.options.display.max_columns = 5

In [2]:
#### user specified
data_folder = "D:/analyze_Pearson_residuals/"

data_subfolder = "lupus"

# Join with pathlib instead of string concatenation so the result is correct
# whether or not data_folder ends with a path separator.
data_path = Path ( data_folder ) / data_subfolder

In [3]:
# File names, all located inside data_path.
counts_scipy_csc_pkl = "counts_scipy_csc.pkl"    # input: pickled count matrix
gene_array_pkl = "gene_array.pkl"                # input: pickled gene identifiers
results_pkl = "residual_variance_scanpy.pkl"     # output: pickled results table

# Full paths to the input files.
counts_scipy_csc_dsn = data_path.joinpath(counts_scipy_csc_pkl)
gene_array_dsn = data_path.joinpath(gene_array_pkl)

# Full path to the output file.
results_dsn = data_path.joinpath(results_pkl)

In [4]:
# https://github.com/scverse/scanpy/tree/master/scanpy/experimental/pp
# from   _normalization.py  rows 26-64

def _pearson_residuals(X, theta, clip, copy=False):    # removed check_values from parm list

    X = X.copy() if copy else X

    # check theta
    if theta <= 0:
        # TODO: would "underdispersion" with negative theta make sense?
        # then only theta=0 were undefined..
        raise ValueError('Pearson residuals require theta > 0')
    # prepare clipping
    if clip is None:
        n = X.shape[0]
        clip = np.sqrt(n)
    if clip < 0:
        raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")

# vfk 2022 08 16		
    # if check_values and not check_nonnegative_integers(X):
        # warn(
            # "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.",
            # UserWarning,
        # )

    if issparse(X):
        sums_genes = np.sum(X, axis=0)
        sums_cells = np.sum(X, axis=1)
        sum_total = np.sum(sums_genes).squeeze()
    else:
        sums_genes = np.sum(X, axis=0, keepdims=True)
        sums_cells = np.sum(X, axis=1, keepdims=True)
        sum_total = np.sum(sums_genes)

    mu = np.array(sums_cells @ sums_genes / sum_total)
    diff = np.array(X - mu)
    residuals = diff / np.sqrt(mu + mu**2 / theta)

    # clip
    residuals = np.clip(residuals, a_min=-clip, a_max=clip)

    return residuals	
	
	  
	
# Percentiles passed to DataFrame.describe() when summarising the results.
pctl_list = [
    0.01, 0.05, 0.10, 0.25, 0.50, 0.75,
    0.90, 0.95, 0.96, 0.97, 0.98, 0.99,
]

In [5]:
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# pickles that were written by a trusted step of this project.

# Load the pickled count matrix; the context manager closes the file even if
# unpickling fails.
with open( counts_scipy_csc_dsn, 'rb' ) as f:
    scipy_csc_mat = pickle.load( f )
print ( ' scipy_csc_mat.shape: ' ,  scipy_csc_mat.shape )
print ( '\n scipy_csc_mat:\n' , scipy_csc_mat)

# Swap rows and columns and store the values as float64.
scipy_csr_mat = scipy_csc_mat.transpose().astype ( 'float64'  )
print ( '\n\n scipy_csr_mat.shape: ' ,  scipy_csr_mat.shape )
print ( '\n scipy_csr_mat:\n' , scipy_csr_mat)

# The untransposed matrix is no longer needed; drop the reference.
del scipy_csc_mat


# Load the pickled gene identifiers (used later as the results index).
with open( gene_array_dsn, 'rb' ) as f:
    gene_array = pickle.load( f )
print ( '\n\n  gene_array.shape:  ' ,  gene_array.shape )
print ( '\n gene_array: ' , gene_array)

 scipy_csc_mat.shape:  (10619, 26820)

 scipy_csc_mat:
   (2, 0)	32
  (15, 0)	217
  (16, 0)	89
  (33, 0)	50
  (35, 0)	27
  (41, 0)	26
  (42, 0)	19
  (43, 0)	34
  (44, 0)	2
  (45, 0)	5
  (47, 0)	19
  (50, 0)	22
  (53, 0)	20
  (54, 0)	18
  (56, 0)	22
  (62, 0)	18
  (67, 0)	19
  (69, 0)	22
  (75, 0)	19
  (76, 0)	1
  (78, 0)	21
  (79, 0)	20
  (80, 0)	2
  (86, 0)	11
  (90, 0)	20
  :	:
  (6993, 26819)	1
  (7045, 26819)	1
  (7137, 26819)	1
  (7214, 26819)	1
  (7364, 26819)	1
  (7423, 26819)	1
  (7462, 26819)	1
  (7584, 26819)	1
  (7622, 26819)	1
  (7699, 26819)	1
  (7718, 26819)	1
  (7844, 26819)	1
  (7960, 26819)	1
  (8173, 26819)	1
  (8321, 26819)	1
  (8360, 26819)	1
  (8440, 26819)	1
  (8482, 26819)	1
  (8965, 26819)	1
  (8997, 26819)	1
  (9354, 26819)	1
  (9377, 26819)	1
  (9389, 26819)	1
  (9629, 26819)	1
  (10259, 26819)	1


 scipy_csr_mat.shape:  (26820, 10619)

 scipy_csr_mat:
   (0, 2)	32.0
  (0, 15)	217.0
  (0, 16)	89.0
  (0, 33)	50.0
  (0, 35)	27.0
  (0, 41)	26.0
  (0, 42)	19.0
  (

In [6]:
# Column-wise (per gene) variance of the residuals with theta = 100.
# A clip of 1e9 is large enough to act as "no clipping".
residuals = _pearson_residuals(X=scipy_csr_mat, theta=100, clip=1e9)
residual_variance_unclipped = residuals.var(axis=0)

# Same computation with the default clip (None -> sqrt of the number of rows).
residuals = _pearson_residuals(X=scipy_csr_mat, theta=100, clip=None)
residual_variance_clipped = residuals.var(axis=0)

In [7]:
# One row per gene, with the clipped and unclipped residual variances side by side.
df_results = pd.DataFrame(
    { 'CLIPPED': residual_variance_clipped, 'UNCLIPPED': residual_variance_unclipped },
    index=gene_array,
)

# Print the table, then its summary statistics at the configured percentiles.
print('\n df_results', df_results, sep='\n')
print('\n\n df_results.describe', df_results.describe(percentiles=pctl_list), sep='\n')


 df_results
                   CLIPPED   UNCLIPPED
ENSG00000244734  79.810010  546.699020
ENSG00000188536  64.410016  179.990128
ENSG00000167996  42.471471   42.471471
ENSG00000206172  56.161057  102.908438
ENSG00000087086  51.001116   51.001116
...                    ...         ...
ENSG00000136492   0.708888    0.708888
ENSG00000253593   0.722437    0.722437
ENSG00000172379   0.717648    0.717648
ENSG00000226004   0.690486    0.690486
ENSG00000123643   0.742950    0.742950

[10619 rows x 2 columns]


 df_results.describe
            CLIPPED     UNCLIPPED
count  10619.000000  10619.000000
mean       1.386865      1.521137
std        1.674506      8.540505
min        0.612223      0.612223
1%         0.771905      0.771905
5%         0.923907      0.923907
10%        1.022985      1.022985
25%        1.140900      1.140900
50%        1.247285      1.247285
75%        1.369829      1.369829
90%        1.587221      1.587221
95%        1.861001      1.864821
96%        1.999839      2.0

In [8]:
# Write the results table to the output pickle file.
df_results.to_pickle(path=results_dsn)