In [1]:
import pandas as pd
import numpy  as np

import pickle
  
from scipy.sparse import issparse 
 
from pathlib import Path


# Show at most five columns when a DataFrame is printed.
pd.set_option("display.max_columns", 5)

In [2]:
#### user specified
# Root folder and the sub-folder of the data set analysed in this notebook.
data_folder = "D:/analyze_Pearson_residuals/"

data_subfolder = "33k_PBMC"

# Join with the path operator instead of concatenating strings, so the result
# is the same whether or not `data_folder` ends with a separator.
data_path = Path(data_folder) / data_subfolder

In [3]:
# File names inside `data_path`.
counts_scipy_csc_pkl = "counts_scipy_csc.pkl"      # input data
gene_array_pkl = "gene_array.pkl"                  # input data
results_pkl = "residual_variance_scanpy.pkl"       # output data

# Full paths: the two inputs first, then the output.
counts_scipy_csc_dsn = data_path / counts_scipy_csc_pkl
gene_array_dsn = data_path / gene_array_pkl
results_dsn = data_path / results_pkl

In [4]:
# https://github.com/scverse/scanpy/tree/master/scanpy/experimental/pp
# from   _normalization.py  rows 26-64

def _pearson_residuals(X, theta, clip, copy=False):    # removed check_values from parm list

    X = X.copy() if copy else X

    # check theta
    if theta <= 0:
        # TODO: would "underdispersion" with negative theta make sense?
        # then only theta=0 were undefined..
        raise ValueError('Pearson residuals require theta > 0')
    # prepare clipping
    if clip is None:
        n = X.shape[0]
        clip = np.sqrt(n)
    if clip < 0:
        raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")

# vfk 2022 08 16		
    # if check_values and not check_nonnegative_integers(X):
        # warn(
            # "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.",
            # UserWarning,
        # )

    if issparse(X):
        sums_genes = np.sum(X, axis=0)
        sums_cells = np.sum(X, axis=1)
        sum_total = np.sum(sums_genes).squeeze()
    else:
        sums_genes = np.sum(X, axis=0, keepdims=True)
        sums_cells = np.sum(X, axis=1, keepdims=True)
        sum_total = np.sum(sums_genes)

    mu = np.array(sums_cells @ sums_genes / sum_total)
    diff = np.array(X - mu)
    residuals = diff / np.sqrt(mu + mu**2 / theta)

    # clip
    residuals = np.clip(residuals, a_min=-clip, a_max=clip)

    return residuals	
	
	  
	
# Percentiles passed to DataFrame.describe when summarising the results.
pctl_list = [
    0.01, 0.05, 0.10, 0.25, 0.50, 0.75,
    0.90, 0.95, 0.96, 0.97, 0.98, 0.99,
]

In [5]:
# Load the pickled count matrix. The context manager closes the file even if
# unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open( counts_scipy_csc_dsn, 'rb' ) as f:
    scipy_csc_mat = pickle.load( f )
print ( ' scipy_csc_mat.shape: ' ,  scipy_csc_mat.shape )
print ( '\n scipy_csc_mat:\n' , scipy_csc_mat)

# Swap the two axes and convert the values to float64 for the residual
# computation.
scipy_csr_mat = scipy_csc_mat.transpose().astype ( 'float64'  )
print ( '\n\n scipy_csr_mat.shape: ' ,  scipy_csr_mat.shape )
print ( '\n scipy_csr_mat:\n' , scipy_csr_mat)

# The untransposed matrix is not needed any more; drop the reference.
del scipy_csc_mat


# Load the pickled gene array; it is used later as the index of the results.
with open( gene_array_dsn, 'rb' ) as f:
    gene_array = pickle.load( f )
print ( '\n\n  gene_array.shape:  ' ,  gene_array.shape )
print ( '\n gene_array: ' , gene_array)

 scipy_csc_mat.shape:  (12324, 33148)

 scipy_csc_mat:
   (2, 0)	3
  (4, 0)	1
  (6, 0)	9
  (10, 0)	89
  (23, 0)	27
  (29, 0)	8
  (31, 0)	24
  (43, 0)	1
  (45, 0)	4
  (50, 0)	1
  (53, 0)	1
  (54, 0)	1
  (62, 0)	34
  (63, 0)	2
  (64, 0)	32
  (66, 0)	22
  (78, 0)	7
  (79, 0)	15
  (80, 0)	8
  (82, 0)	13
  (85, 0)	24
  (86, 0)	29
  (87, 0)	3
  (88, 0)	17
  (91, 0)	20
  :	:
  (8737, 33147)	1
  (8874, 33147)	1
  (8889, 33147)	1
  (9020, 33147)	1
  (9039, 33147)	1
  (9142, 33147)	1
  (9144, 33147)	1
  (9357, 33147)	1
  (9403, 33147)	1
  (9434, 33147)	1
  (9525, 33147)	1
  (9561, 33147)	1
  (9593, 33147)	1
  (9600, 33147)	1
  (9718, 33147)	1
  (9760, 33147)	1
  (9969, 33147)	1
  (9976, 33147)	1
  (10085, 33147)	1
  (10101, 33147)	1
  (10172, 33147)	1
  (10316, 33147)	1
  (10339, 33147)	1
  (11122, 33147)	1
  (11840, 33147)	1


 scipy_csr_mat.shape:  (33148, 12324)

 scipy_csr_mat:
   (0, 2)	3.0
  (0, 4)	1.0
  (0, 6)	9.0
  (0, 10)	89.0
  (0, 23)	27.0
  (0, 29)	8.0
  (0, 31)	24.0
  (0, 43)	1.0
  

In [6]:
# Per-column variance of the residuals with a clipping bound of 1e9.
residuals = _pearson_residuals(scipy_csr_mat, theta=100, clip=1e9)
residual_variance_unclipped = residuals.var(axis=0)

# Same computation with clip=None, i.e. the function's default bound of
# sqrt(number of rows).
residuals = _pearson_residuals(scipy_csr_mat, theta=100, clip=None)
residual_variance_clipped = residuals.var(axis=0)

In [7]:
# Table indexed by `gene_array` with one column per clipping variant.
df_results = pd.DataFrame(
    data={
        'CLIPPED': residual_variance_clipped,
        'UNCLIPPED': residual_variance_unclipped,
    },
    index=gene_array,
)

print('\n df_results')
print(df_results)

# Summary statistics at the percentiles listed in `pctl_list`.
print('\n\n df_results.describe')
print(df_results.describe(percentiles=pctl_list))


 df_results
                   CLIPPED   UNCLIPPED
ENSG00000254709  16.208453  575.773651
ENSG00000163736  63.909458   85.236894
ENSG00000087086  32.692607   32.692607
ENSG00000163220  33.945418   33.945418
ENSG00000090382  30.883432   30.883432
...                    ...         ...
ENSG00000171241   0.599552    0.599552
ENSG00000149573   0.722545    0.722545
ENSG00000122025   0.651422    0.651422
ENSG00000237980   0.682376    0.682376
ENSG00000158488   0.666108    0.666108

[12324 rows x 2 columns]


 df_results.describe
            CLIPPED     UNCLIPPED
count  12324.000000  12324.000000
mean       1.377470      1.466819
std        1.141436      5.383959
min        0.599552      0.599552
1%         0.850694      0.850694
5%         0.960949      0.960949
10%        1.001811      1.001811
25%        1.069765      1.069765
50%        1.204892      1.204892
75%        1.459306      1.459306
90%        1.822806      1.839138
95%        2.057806      2.299378
96%        2.155080      2.4

In [8]:
# Write the results table to `results_dsn` as a pickle file.
df_results.to_pickle(results_dsn)