In [1]:
import pandas as pd
import numpy  as np

import pickle 

from pathlib import Path

import time


# Show at most five columns when a data frame is printed; the rest are elided with "...".
pd.options.display.max_columns = 5

In [2]:
# Seed NumPy's global random generator so repeated runs start from the same state.
np.random.seed ( seed=12345 )

In [3]:
#### user specified

# Root folder; it is also added to sys.path below so that nru_DE can be imported.
data_folder = "D:/analyze_Pearson_residuals/"

# Sub-folder of data_folder that holds the pickles read and written by this notebook.
data_subfolder = "retinal"

# Join with the path operator so the result is correct whether or not
# data_folder ends with a separator.
data_path = Path ( data_folder ) / data_subfolder


import sys

# Re-running this cell must not pile up duplicate sys.path entries.
if data_folder not in sys.path:
  sys.path.append( data_folder )

# Import the one name this notebook calls instead of `*`, so its origin is visible.
# NOTE(review): if other nru_DE names are needed elsewhere, add them here explicitly.
from  nru_DE import nru

In [4]:
# ---- file names: output data ----
results_pkl = "Ag_complementary_samples.pkl"
gene_array_pkl = "gene_array_complementary_samples.pkl"   # for scanpy and SCTransform
df_cells_pkl = "df_cells_complementary_samples.pkl"       # for scanpy and SCTransform

# ---- file names: input data ----
nru_dict_pkl = "nru_dict.pkl"
counts_pkl = "counts_sparse.pkl"   # sparse pandas data frame: cells in columns; genes in rows

# ---- full paths: output data (all located directly under data_path) ----
results_dsn, gene_array_dsn, df_cells_dsn = (
  data_path.joinpath ( file_name )
  for file_name in ( results_pkl, gene_array_pkl, df_cells_pkl )
)

# ---- full paths: input data ----
nru_dict_dsn, counts_dsn = (
  data_path.joinpath ( file_name )
  for file_name in ( nru_dict_pkl, counts_pkl )
)

In [5]:
# Percentiles passed to the .describe() calls in the cells below.
pctl_list = [ 0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99 ]

In [6]:
# nz_min_complemetary: smallest per-sample nonzero-cell count a gene must reach to be kept;
#                      also passed to nru as nz_min. (Spelling kept: other cells use this name.)
# n_samples:           number of cell samples looped over below.
nz_min_complemetary, n_samples = 50, 2

In [7]:
# Load the pickled dictionary at nru_dict_dsn. The context manager closes the
# file even when unpickling raises, which the manual open/close did not.
# NOTE(review): pickle.load can execute arbitrary code - only load files you trust.
with open( nru_dict_dsn, 'rb' ) as f:
  nru_dict = pickle.load(f)

# Boolean frame: one row per cell barcode, one column per cell sample.
df_selected_cells = nru_dict['df_selected_cells']
print ( ' df_selected_cells:  ' )
print ( '\n', df_selected_cells )

# Keep only the three per-gene statistics columns used in this notebook.
df_gene_stats = nru_dict['df_gene_stats'] [[ 'nz_cells', 'M_g', 'A_g']]
print ( '\n\n df_gene_stats:  ' )
print ( '\n', df_gene_stats )

 df_selected_cells:  

                     0      1   ...     38     39
r4_TGCCACATGGGC  False   True  ...   True  False
r4_ATCGGCTACCGA   True  False  ...  False   True
r4_TAGATATCTTAT   True  False  ...  False   True
r4_CCTGGATTGTAC   True  False  ...  False   True
r4_CGGTAATAGGAA  False   True  ...   True  False
...                ...    ...  ...    ...    ...
p1_TCAAAAGCCGGG  False   True  ...  False   True
p1_ATTAAGTTCCAA  False   True  ...   True  False
p1_CTGTCTGAGACC   True  False  ...   True  False
p1_TAACGCGCTCCT   True  False  ...  False   True
p1_ATTCTTGTTCTT   True  False  ...  False   True

[24769 rows x 40 columns]


 df_gene_stats:  

                nz_cells        M_g        A_g
CARTPT              812  50.106662  42.740254
RGS5                417  21.950167  17.483289
RHO               21812  15.911165  15.539177
APOE               2255  16.349967  14.865133
GLUL               4792  15.349501  14.769975
...                 ...        ...        ...
RTN4R            

In [8]:
# Read the pickled count matrix and report its contents, type and sparse density.
df_counts_sparse = pd.read_pickle ( counts_dsn )

print ( ' df_counts_sparse:', df_counts_sparse, sep='\n' )
print ( '\n type( df_counts_sparse ) ', type ( df_counts_sparse ) )
print ( '\n df_counts_sparse.sparse.density: ', df_counts_sparse.sparse.density )

 df_counts_sparse:
               r4_TGCCACATGGGC  r4_ATCGGCTACCGA  ...  p1_TAACGCGCTCCT  \
KITL                         0                2  ...                0   
TMTC3                       10                0  ...                0   
CEP290                      24                8  ...                0   
4930430F08RIK                1                1  ...                0   
1700017N19RIK                0                1  ...                0   
...                        ...              ...  ...              ...   
GM23791                      0                0  ...                0   
GM26101                      0                0  ...                0   
4930513O06RIK                0                0  ...                0   
VSIG1                        0                0  ...                0   
GM25207                      0                0  ...                0   

               p1_ATTCTTGTTCTT  
KITL                         0  
TMTC3                        0  
CEP29

In [9]:
# Keep only the rows (genes) listed in df_gene_stats, in that order; all columns are retained.
df_counts_sparse_gene_sel = df_counts_sparse.loc [ df_gene_stats.index, : ]

print ( ' df_counts_sparse_gene_sel:', df_counts_sparse_gene_sel, sep='\n' )
print ( '\n type( df_counts_sparse_gene_sel ) ', type ( df_counts_sparse_gene_sel ) )
print ( '\n df_counts_sparse_gene_sel.sparse.density: ', df_counts_sparse_gene_sel.sparse.density )

# The full matrix is not referenced again in this notebook; drop the name to release it.
del df_counts_sparse

 df_counts_sparse_gene_sel:
               r4_TGCCACATGGGC  r4_ATCGGCTACCGA  ...  p1_TAACGCGCTCCT  \
CARTPT                       0               45  ...                3   
RGS5                         0                0  ...                0   
RHO                        496                5  ...               32   
APOE                         0                0  ...                0   
GLUL                         7                4  ...                0   
...                        ...              ...  ...              ...   
RTN4R                        0                0  ...                0   
GM25749                      0                0  ...                0   
KIF26B                       0                1  ...                0   
A330033J07RIK                0                0  ...                0   
RHOV                         0                0  ...                0   

               p1_ATTCTTGTTCTT  
CARTPT                       0  
RGS5                         

In [10]:
# For every gene, count the cells with a nonzero UMI count in each cell sample,
# then record the smallest of those counts in column 'min_nz'.
# (The former placeholder assignment of df_sample_nz_totals was dead code:
#  it was overwritten by pd.concat before ever being read.)
df_sample_nz_totals_list = []


print ( 'for each gene, compute number of nonzero cells in each UMI count sub-matrix' )
for sample in  range(n_samples):
  print ( '\n sample: ', sample )

  # Barcodes whose flag is True in this sample's column of df_selected_cells.
  barcode_list = df_selected_cells.loc [ df_selected_cells[sample] ].index.tolist()

  # Sub-matrix of the selected cells; 1 where the count is positive, 0 otherwise.
  df_counts_sparse_sample = df_counts_sparse_gene_sel[ barcode_list ]
  df_counts_GT_0 = ( df_counts_sparse_sample > 0 ).astype( int )

  # Row sums give the number of nonzero cells per gene; the column is named after the sample.
  df_sample_nz_sum = df_counts_GT_0.sum ( axis=1 ).to_frame ( name = sample )
  df_sample_nz_totals_list.append ( df_sample_nz_sum )

df_sample_nz_totals = pd.concat ( df_sample_nz_totals_list , axis=1 )
df_sample_nz_totals['min_nz'] = df_sample_nz_totals.min ( axis=1 )

print ( '\n df_sample_nz_totals' )
print ( df_sample_nz_totals  )
print ( '\n\n df_sample_nz_totals.describe' )
print ( df_sample_nz_totals.describe ( percentiles=pctl_list ) )

for each gene, compute number of nonzero cells in each UMI count sub-matrix

 sample:  0

 sample:  1

 df_sample_nz_totals
                   0      1  min_nz
CARTPT           418    394     394
RGS5             200    217     200
RHO            10997  10815   10815
APOE            1131   1124    1124
GLUL            2418   2374    2374
...              ...    ...     ...
RTN4R             22     33      22
GM25749           31     29      29
KIF26B            33     18      18
A330033J07RIK     30     31      30
RHOV              26     25      25

[13552 rows x 3 columns]


 df_sample_nz_totals.describe
                  0             1        min_nz
count  13552.000000  13552.000000  13552.000000
mean     721.704545    711.034681    703.598657
std     1024.410501   1010.731617   1008.875949
min       17.000000     14.000000     14.000000
1%        26.000000     25.000000     24.000000
5%        36.000000     36.000000     32.000000
10%       51.000000     50.000000     47.000000
25

In [11]:
# Genes whose smallest per-sample nonzero-cell count reaches the threshold.
gene_select_array = (
  df_sample_nz_totals
  .loc [ lambda totals: totals['min_nz'] >= nz_min_complemetary ]
  .index
  .values
)

# Count matrix restricted to those genes (all cells retained).
df_counts_sparse_all_samples = df_counts_sparse_gene_sel.loc [ gene_select_array, : ]
print ( '\n df_counts_sparse_all_samples', df_counts_sparse_all_samples, sep='\n' )

# The wider gene selection is not referenced again in this notebook.
del df_counts_sparse_gene_sel


 df_counts_sparse_all_samples
         r4_TGCCACATGGGC  r4_ATCGGCTACCGA  ...  p1_TAACGCGCTCCT  \
CARTPT                 0               45  ...                3   
RGS5                   0                0  ...                0   
RHO                  496                5  ...               32   
APOE                   0                0  ...                0   
GLUL                   7                4  ...                0   
...                  ...              ...  ...              ...   
GM26906                0                1  ...                0   
LHFPL5                 0                2  ...                0   
SHB                    0                0  ...                0   
IL34                   0                4  ...                0   
MRAP2                  0                0  ...                0   

         p1_ATTCTTGTTCTT  
CARTPT                 0  
RGS5                   0  
RHO                   35  
APOE                   0  
GLUL                   0  
..

In [12]:
#### verify that all cells have nonzero sequencing depth
# Column sums over the selected genes give one total per cell. The summary is
# printed for all cells and again for cells with a positive total, so the two
# can be compared by eye.
ser_cell_totals = df_counts_sparse_all_samples.sum ( axis=0 )
print ( '\n\n ser_cell_totals.describe', ser_cell_totals.describe ( percentiles=pctl_list ), sep='\n' )

ser_cell_totals_GT_0 = ser_cell_totals.loc [ ser_cell_totals > 0 ]
print ( '\n\n ser_cell_totals_GT_0.describe', ser_cell_totals_GT_0.describe ( percentiles=pctl_list ), sep='\n' )



 ser_cell_totals.describe
count    24769.000000
mean      1326.385078
std       1726.776725
min        289.000000
1%         304.000000
5%         355.000000
10%        394.000000
25%        494.000000
50%        811.000000
75%       1475.000000
90%       2713.000000
95%       3921.000000
99%       8201.280000
max      50434.000000
dtype: float64


 ser_cell_totals_GT_0.describe
count    24769.000000
mean      1326.385078
std       1726.776725
min        289.000000
1%         304.000000
5%         355.000000
10%        394.000000
25%        494.000000
50%        811.000000
75%       1475.000000
90%       2713.000000
95%       3921.000000
99%       8201.280000
max      50434.000000
dtype: float64


In [13]:
# Keep only the columns (cells) whose total count over the selected genes is positive.
df_counts_sparse_all_samples_SD_positive = df_counts_sparse_all_samples.loc [ :, ser_cell_totals_GT_0.index ]

print ( ' df_counts_sparse_all_samples_SD_positive:', df_counts_sparse_all_samples_SD_positive, sep='\n' )
print ( '\n type( df_counts_sparse_all_samples_SD_positive ) ', type ( df_counts_sparse_all_samples_SD_positive ) )
print ( '\n df_counts_sparse_all_samples_SD_positive.sparse.density: ', df_counts_sparse_all_samples_SD_positive.sparse.density )

 df_counts_sparse_all_samples_SD_positive:
         r4_TGCCACATGGGC  r4_ATCGGCTACCGA  ...  p1_TAACGCGCTCCT  \
CARTPT                 0               45  ...                3   
RGS5                   0                0  ...                0   
RHO                  496                5  ...               32   
APOE                   0                0  ...                0   
GLUL                   7                4  ...                0   
...                  ...              ...  ...              ...   
GM26906                0                1  ...                0   
LHFPL5                 0                2  ...                0   
SHB                    0                0  ...                0   
IL34                   0                4  ...                0   
MRAP2                  0                0  ...                0   

         p1_ATTCTTGTTCTT  
CARTPT                 0  
RGS5                   0  
RHO                   35  
APOE                   0  
GLUL             

In [14]:
# One-column flag frame: every cell (column) of the positive-depth matrix is marked True.
df_cells_SD_positive = pd.DataFrame (
  True,
  index = df_counts_sparse_all_samples_SD_positive.columns,
  columns = ['SD_positive'],
)


df_complementary_selected_cells_list = []

for sample in  range(n_samples):
  print ( '\n sample: ', sample )

  # Place the sample's membership flag next to the positive-depth flag;
  # barcodes missing from df_cells_SD_positive get NaN, which is turned into False.
  df_cell_sample = pd.concat ( [ df_selected_cells[[sample]], df_cells_SD_positive ], axis=1 )
  sd_flag = df_cell_sample['SD_positive'].fillna(False)
  df_cell_sample = df_cell_sample.assign( SD_positive = sd_flag )

  # A cell is kept for this sample only when both flags hold.
  in_both = df_cell_sample.all ( axis=1 )
  df_complementary_selected_cells_list.append ( in_both.to_frame ( name = sample ) )

df_complementary_selected_cells = pd.concat ( df_complementary_selected_cells_list, axis=1 )

print ( '\n df_complementary_selected_cells', df_complementary_selected_cells, sep='\n' )
print ( '\n\n df_complementary_selected_cells.sum', df_complementary_selected_cells.sum (), sep='\n' )


 sample:  0

 sample:  1

 df_complementary_selected_cells
                     0      1
r4_TGCCACATGGGC  False   True
r4_ATCGGCTACCGA   True  False
r4_TAGATATCTTAT   True  False
r4_CCTGGATTGTAC   True  False
r4_CGGTAATAGGAA  False   True
...                ...    ...
p1_TCAAAAGCCGGG  False   True
p1_ATTAAGTTCCAA  False   True
p1_CTGTCTGAGACC   True  False
p1_TAACGCGCTCCT   True  False
p1_ATTCTTGTTCTT   True  False

[24769 rows x 2 columns]


 df_complementary_selected_cells.sum
0    12448
1    12321
dtype: int64


In [15]:
# Run nru on each complementary cell sample and collect the per-gene 'A_g'
# column of its 'df_gene_stats' output, one column per sample.
print ( 'calculations for samples' )

df_gene_stats_half_cell_samples_list = []

for sample in  range(n_samples):
  print ( '\n sample: ', sample )

  # Barcodes flagged True for this sample, then the matching count sub-matrix.
  barcode_list = df_complementary_selected_cells.loc [ df_complementary_selected_cells[sample] ].index.tolist()
  df_counts_sparse_sample_analy = df_counts_sparse_all_samples_SD_positive[ barcode_list ]

  # perf_counter is a monotonic clock, so the interval cannot be distorted by
  # system clock adjustments the way time.time() differences can.
  start_time = time.perf_counter()
  # Stored under its own name so the dictionary loaded from nru_dict.pkl
  # is not overwritten by a per-sample result.
  nru_dict_sample = nru ( df_counts_sparse_sample_analy , nz_min=nz_min_complemetary , n_genes=5 )
  elapsed = time.perf_counter() - start_time
  print ( 'function nru elapsed time: ',  f"{ elapsed:.1f}", ' seconds'  )

  # Name the column after the sample and order genes by decreasing A_g
  # (chained instead of an in-place sort).
  df_gene_stats_half_cell_sample = (
    nru_dict_sample['df_gene_stats'] [['A_g']]
    .rename ( columns={'A_g': sample} )
    .sort_values ( [sample], ascending=False )
  )

  df_gene_stats_half_cell_samples_list.append ( df_gene_stats_half_cell_sample )


# Align the per-sample columns on the gene index.
df_gene_stats_half_cell_samples = pd.concat ( df_gene_stats_half_cell_samples_list, axis=1 )
print ( '\n df_gene_stats_half_cell_samples' )
print ( df_gene_stats_half_cell_samples  )
print ( '\n\n df_gene_stats_half_cell_samples.describe' )
print ( df_gene_stats_half_cell_samples.describe ( percentiles=pctl_list ) )

calculations for samples

 sample:  0
counts_sparse_selected_genes.shape:  (12082, 12448)
counts_sparse_selected_csr.shape:  (12082, 12448)
calculating sum of squares of Pearson residuals using all cells
calculating sum of squares of Pearson residuals using cell sample  0
calculating sum of squares of Pearson residuals using cell sample  1
calculating sum of squares of Pearson residuals using cell sample  2
calculating sum of squares of Pearson residuals using cell sample  3
calculating sum of squares of Pearson residuals using cell sample  4
calculating sum of squares of Pearson residuals using cell sample  5
calculating sum of squares of Pearson residuals using cell sample  6
calculating sum of squares of Pearson residuals using cell sample  7
calculating sum of squares of Pearson residuals using cell sample  8
calculating sum of squares of Pearson residuals using cell sample  9
calculating sum of squares of Pearson residuals using cell sample  10
calculating sum of squares of Pearso

In [16]:
# Persist the results: per-sample A_g table, per-sample cell flags, and the selected gene array.
df_gene_stats_half_cell_samples.to_pickle ( results_dsn )
df_complementary_selected_cells.to_pickle ( df_cells_dsn )

# The context manager flushes and closes the file even if pickle.dump raises.
with open( gene_array_dsn, 'wb' ) as f:
  pickle.dump ( gene_select_array, f )