In [1]:
import pandas as pd
import numpy  as np

import pickle
  
from scipy.sparse import issparse 
 
from pathlib import Path


# Limit printed DataFrames to at most five columns.
pd.options.display.max_columns = 5

In [2]:
#### user specified
# Root folder of the analysis and the dataset subfolder inside it.
data_folder = r"D:/analyze_Pearson_residuals/"

data_subfolder = "10k_heart"

# Join with pathlib instead of concatenating strings, so the resulting path
# is correct whether or not data_folder ends with a path separator.
data_path = Path(data_folder) / data_subfolder

In [3]:
# File names inside data_path.
# Inputs: the pickled count matrix and the pickled gene array.
counts_scipy_csc_pkl = "counts_scipy_csc.pkl"
gene_array_pkl = "gene_array.pkl"
# Output: the pickled residual-variance results.
results_pkl = "residual_variance_scanpy.pkl"

# Full paths ("dsn") derived from data_path: inputs first, then the output.
counts_scipy_csc_dsn = data_path.joinpath(counts_scipy_csc_pkl)
gene_array_dsn = data_path.joinpath(gene_array_pkl)
results_dsn = data_path.joinpath(results_pkl)

In [4]:
# https://github.com/scverse/scanpy/tree/master/scanpy/experimental/pp
# from   _normalization.py  rows 26-64

def _pearson_residuals(X, theta, clip, copy=False):    # removed check_values from parm list

    X = X.copy() if copy else X

    # check theta
    if theta <= 0:
        # TODO: would "underdispersion" with negative theta make sense?
        # then only theta=0 were undefined..
        raise ValueError('Pearson residuals require theta > 0')
    # prepare clipping
    if clip is None:
        n = X.shape[0]
        clip = np.sqrt(n)
    if clip < 0:
        raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")

# vfk 2022 08 16		
    # if check_values and not check_nonnegative_integers(X):
        # warn(
            # "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.",
            # UserWarning,
        # )

    if issparse(X):
        sums_genes = np.sum(X, axis=0)
        sums_cells = np.sum(X, axis=1)
        sum_total = np.sum(sums_genes).squeeze()
    else:
        sums_genes = np.sum(X, axis=0, keepdims=True)
        sums_cells = np.sum(X, axis=1, keepdims=True)
        sum_total = np.sum(sums_genes)

    mu = np.array(sums_cells @ sums_genes / sum_total)
    diff = np.array(X - mu)
    residuals = diff / np.sqrt(mu + mu**2 / theta)

    # clip
    residuals = np.clip(residuals, a_min=-clip, a_max=clip)

    return residuals	
	
	  
	
# Percentile cut points, expressed as fractions, reported by
# DataFrame.describe() in the results summary.
pctl_list = [
    0.01, 0.05, 0.10, 0.25, 0.50, 0.75,
    0.90, 0.95, 0.96, 0.97, 0.98, 0.99,
]

In [5]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.

# Load the pickled count matrix. The `with` block guarantees the file handle
# is closed even if unpickling raises.
with open(counts_scipy_csc_dsn, 'rb') as f:
    scipy_csc_mat = pickle.load(f)
print(' scipy_csc_mat.shape: ', scipy_csc_mat.shape)
print('\n scipy_csc_mat:\n', scipy_csc_mat)

# Swap rows and columns and cast the values to float64; this transposed
# matrix is what the residual computation receives.
scipy_csr_mat = scipy_csc_mat.transpose().astype('float64')
print('\n\n scipy_csr_mat.shape: ', scipy_csr_mat.shape)
print('\n scipy_csr_mat:\n', scipy_csr_mat)

# The original orientation is no longer needed; drop the reference.
del scipy_csc_mat


# Load the pickled gene array (used later as the index of the results table).
with open(gene_array_dsn, 'rb') as f:
    gene_array = pickle.load(f)
print('\n\n  gene_array.shape:  ', gene_array.shape)
print('\n gene_array: ', gene_array)

 scipy_csc_mat.shape:  (14472, 7713)

 scipy_csc_mat:
   (0, 0)	823
  (1, 0)	268
  (2, 0)	33
  (3, 0)	165
  (9, 0)	17
  (10, 0)	12
  (11, 0)	12
  (12, 0)	1
  (13, 0)	5
  (19, 0)	1
  (27, 0)	7
  (28, 0)	3
  (32, 0)	4
  (35, 0)	1
  (42, 0)	6
  (46, 0)	6
  (59, 0)	1
  (84, 0)	1
  (90, 0)	2
  (101, 0)	1
  (121, 0)	3
  (124, 0)	1
  (144, 0)	10
  (145, 0)	1
  (171, 0)	1
  :	:
  (6837, 7712)	1
  (6873, 7712)	1
  (7345, 7712)	1
  (7414, 7712)	1
  (7638, 7712)	1
  (7811, 7712)	1
  (7934, 7712)	2
  (8178, 7712)	1
  (8184, 7712)	1
  (8217, 7712)	1
  (8418, 7712)	1
  (8575, 7712)	1
  (9443, 7712)	1
  (9908, 7712)	1
  (9975, 7712)	1
  (10174, 7712)	1
  (10263, 7712)	1
  (10321, 7712)	1
  (10939, 7712)	1
  (11032, 7712)	1
  (12208, 7712)	1
  (12625, 7712)	1
  (12646, 7712)	1
  (13007, 7712)	1
  (13401, 7712)	1


 scipy_csr_mat.shape:  (7713, 14472)

 scipy_csr_mat:
   (0, 0)	823.0
  (0, 1)	268.0
  (0, 2)	33.0
  (0, 3)	165.0
  (0, 9)	17.0
  (0, 10)	12.0
  (0, 11)	12.0
  (0, 12)	1.0
  (0, 13)	5.0
  (0

In [6]:
# Per-column residual variance with a clip bound of 1e9, which is presumably
# far above any residual magnitude, i.e. effectively no clipping.
residuals = _pearson_residuals(scipy_csr_mat, theta=100, clip=1e9)
residual_variance_unclipped = residuals.var(axis=0)

# Same computation with clip=None, for which _pearson_residuals falls back to
# sqrt(number of rows of its input) as the clip bound.
residuals = _pearson_residuals(scipy_csr_mat, theta=100, clip=None)
residual_variance_clipped = residuals.var(axis=0)

In [7]:
# One row per entry of gene_array, one column per clipping variant.
df_results = pd.DataFrame(
    data={
        'CLIPPED': residual_variance_clipped,
        'UNCLIPPED': residual_variance_unclipped,
    },
    index=gene_array,
)

# Print the table, then its summary statistics at the chosen percentiles.
print('\n df_results', df_results, sep='\n')
print('\n\n df_results.describe', df_results.describe(percentiles=pctl_list), sep='\n')


 df_results
                       CLIPPED    UNCLIPPED
ENSMUSG00000052305  987.369677  2134.180098
ENSMUSG00000069919  668.658665   959.468398
ENSMUSG00000073940  400.280374   922.209307
ENSMUSG00000069917  590.400652   733.295277
ENSMUSG00000038791   12.462472   956.768047
...                        ...          ...
ENSMUSG00000020905    0.611010     0.611010
ENSMUSG00000027313    0.579941     0.579941
ENSMUSG00000104861    0.551858     0.551858
ENSMUSG00000041144    0.649768     0.649768
ENSMUSG00000000154    0.534641     0.534641

[14472 rows x 2 columns]


 df_results.describe
            CLIPPED     UNCLIPPED
count  14472.000000  14472.000000
mean       2.228946      2.728223
std       12.353792     25.080889
min        0.442913      0.442913
1%         0.688436      0.688436
5%         0.814043      0.814043
10%        0.883573      0.883573
25%        1.015036      1.015036
50%        1.260418      1.260418
75%        1.760157      1.760395
90%        3.029099      3.085666
95

In [8]:
# Persist the results table as a pickle file at results_dsn.
pd.to_pickle(df_results, results_dsn)