In [1]:
import pandas as pd
import numpy  as np

import pickle
  
from scipy.sparse import issparse 
 
from pathlib import Path


# Limit printed DataFrames to at most 5 columns.
pd.options.display.max_columns = 5

In [2]:
#### user specified
# Base folder of the analysis.
data_folder = "D:/analyze_Pearson_residuals/"

# Sub-folder of data_folder used for this run's input and output pickles.
data_subfolder = "10k_brain"

# Join with pathlib instead of string concatenation so the result is the same
# whether or not data_folder ends with a path separator.
data_path = Path ( data_folder ) / data_subfolder

In [3]:
# File names (all located directly inside data_path).
# input data
counts_scipy_csc_pkl = "counts_scipy_csc.pkl"
gene_array_pkl = "gene_array.pkl"

# output data
results_pkl = "residual_variance_scanpy.pkl"


# paths: input data
counts_scipy_csc_dsn = data_path.joinpath ( counts_scipy_csc_pkl )
gene_array_dsn = data_path.joinpath ( gene_array_pkl )

# path: output data
results_dsn = data_path.joinpath ( results_pkl )

In [4]:
# https://github.com/scverse/scanpy/tree/master/scanpy/experimental/pp
# from   _normalization.py  rows 26-64

def _pearson_residuals(X, theta, clip, copy=False):    # removed check_values from parm list

    X = X.copy() if copy else X

    # check theta
    if theta <= 0:
        # TODO: would "underdispersion" with negative theta make sense?
        # then only theta=0 were undefined..
        raise ValueError('Pearson residuals require theta > 0')
    # prepare clipping
    if clip is None:
        n = X.shape[0]
        clip = np.sqrt(n)
    if clip < 0:
        raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.")

# vfk 2022 08 16		
    # if check_values and not check_nonnegative_integers(X):
        # warn(
            # "`normalize_pearson_residuals()` expects raw count data, but non-integers were found.",
            # UserWarning,
        # )

    if issparse(X):
        sums_genes = np.sum(X, axis=0)
        sums_cells = np.sum(X, axis=1)
        sum_total = np.sum(sums_genes).squeeze()
    else:
        sums_genes = np.sum(X, axis=0, keepdims=True)
        sums_cells = np.sum(X, axis=1, keepdims=True)
        sum_total = np.sum(sums_genes)

    mu = np.array(sums_cells @ sums_genes / sum_total)
    diff = np.array(X - mu)
    residuals = diff / np.sqrt(mu + mu**2 / theta)

    # clip
    residuals = np.clip(residuals, a_min=-clip, a_max=clip)

    return residuals	
	
	  
	
# Percentiles reported by DataFrame.describe() in the results summary.
pctl_list = [
    0.01, 0.05, 0.10, 0.25,
    0.50, 0.75, 0.90, 0.95,
    0.96, 0.97, 0.98, 0.99,
]

In [5]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load pickle files that come from a trusted source.

# Load the pickled count matrix. The with-block closes the file even when
# unpickling raises.
with open( counts_scipy_csc_dsn, 'rb' ) as f:
    scipy_csc_mat = pickle.load( f )
print ( ' scipy_csc_mat.shape: ' ,  scipy_csc_mat.shape )
print ( '\n scipy_csc_mat:\n' , scipy_csc_mat)

# Swap the two axes and convert the counts to float64.
# Assumes the pickle holds a scipy CSC matrix (as the file name says), whose
# transpose is a CSR matrix - TODO confirm.
scipy_csr_mat = scipy_csc_mat.transpose().astype ( 'float64'  )
print ( '\n\n scipy_csr_mat.shape: ' ,  scipy_csr_mat.shape )
print ( '\n scipy_csr_mat:\n' , scipy_csr_mat)

# The original-orientation matrix is not used again in this cell; drop the
# name so its memory can be reclaimed.
del scipy_csc_mat


# Load the gene identifiers (used later as the index of the results table).
with open( gene_array_dsn, 'rb' ) as f:
    gene_array = pickle.load( f )
print ( '\n\n  gene_array.shape:  ' ,  gene_array.shape )
print ( '\n gene_array: ' , gene_array)

 scipy_csc_mat.shape:  (13817, 11843)

 scipy_csc_mat:
   (4, 0)	20
  (6, 0)	2
  (9, 0)	2
  (12, 0)	8
  (13, 0)	21
  (15, 0)	1
  (20, 0)	1
  (22, 0)	15
  (24, 0)	2
  (25, 0)	1
  (40, 0)	3
  (46, 0)	4
  (47, 0)	3
  (60, 0)	2
  (63, 0)	11
  (70, 0)	3
  (71, 0)	2
  (76, 0)	1
  (88, 0)	4
  (97, 0)	1
  (102, 0)	1
  (110, 0)	1
  (112, 0)	7
  (115, 0)	2
  (116, 0)	15
  :	:
  (12660, 11842)	1
  (12663, 11842)	1
  (12675, 11842)	1
  (12717, 11842)	1
  (12737, 11842)	1
  (12740, 11842)	1
  (12741, 11842)	1
  (12782, 11842)	1
  (12830, 11842)	1
  (12834, 11842)	1
  (12906, 11842)	1
  (12931, 11842)	1
  (12952, 11842)	1
  (12954, 11842)	1
  (12956, 11842)	1
  (12978, 11842)	1
  (13011, 11842)	1
  (13015, 11842)	1
  (13024, 11842)	1
  (13044, 11842)	1
  (13075, 11842)	1
  (13081, 11842)	1
  (13265, 11842)	1
  (13516, 11842)	1
  (13537, 11842)	1


 scipy_csr_mat.shape:  (11843, 13817)

 scipy_csr_mat:
   (0, 4)	20.0
  (0, 6)	2.0
  (0, 9)	2.0
  (0, 12)	8.0
  (0, 13)	21.0
  (0, 15)	1.0
  (0, 20)	1.0
 

In [6]:
# Per-column variance of the Pearson residuals (theta = 100), computed twice.

# Pass 1: clip = 1e9, intended as "no clipping in practice".
residuals = _pearson_residuals ( scipy_csr_mat, theta=100, clip=1e9 )
residual_variance_unclipped = residuals.var ( axis=0 )

# Pass 2: clip = None, so _pearson_residuals uses its default clipping bound.
residuals = _pearson_residuals ( scipy_csr_mat, theta=100, clip=None )
residual_variance_clipped = residuals.var ( axis=0 )

In [7]:
# One row per entry of gene_array, one column per clipping variant.
df_results = pd.DataFrame (
    data = {
        'CLIPPED': residual_variance_clipped,
        'UNCLIPPED': residual_variance_unclipped,
    },
    index = gene_array,
)

# Full table (pandas abbreviates it when printing), then summary statistics
# at the percentiles in pctl_list.
print ( '\n df_results', df_results, sep='\n' )
print ( '\n\n df_results.describe', df_results.describe ( percentiles=pctl_list ), sep='\n' )


 df_results
                       CLIPPED    UNCLIPPED
ENSMUSG00000052305  343.479659  2286.218409
ENSMUSG00000073940  192.945229   983.121451
ENSMUSG00000069919  261.035675   653.893380
ENSMUSG00000069917  243.073028   481.069479
ENSMUSG00000019874  124.047541   131.850845
...                        ...          ...
ENSMUSG00000027075    1.112728     1.112728
ENSMUSG00000026389    1.134343     1.134343
ENSMUSG00000063021    0.973234     0.973234
ENSMUSG00000032204    0.894035     0.894035
ENSMUSG00000043629    0.949970     0.949970

[13817 rows x 2 columns]


 df_results.describe
            CLIPPED     UNCLIPPED
count  13817.000000  13817.000000
mean       1.536604      1.850436
std        4.886614     22.587183
min        0.646741      0.646741
1%         0.856301      0.856301
5%         0.918027      0.918027
10%        0.951255      0.951255
25%        1.030824      1.030824
50%        1.165219      1.165219
75%        1.360385      1.360385
90%        1.817532      1.817532
95

In [8]:
# Persist the per-gene residual variances to the output pickle.
df_results.to_pickle ( path = results_dsn )