In [1]:
import pandas as pd
import numpy  as np

import pickle 

from pathlib import Path

import time


# Limit printed DataFrames to five columns so wide frames stay readable.
pd.options.display.max_columns = 5

In [2]:
# Seed NumPy's legacy global generator so any random draws are repeatable.
np.random.seed(seed=12345)

In [3]:
#### user specified

# Root folder of the analysis; it is also added to sys.path below so that
# the local nru_DE module can be imported from it.
data_folder = "D:/analyze_Pearson_residuals/"

# Dataset-specific subfolder under data_folder.
data_subfolder = "10k_heart"

# Join with the Path "/" operator rather than string concatenation, so the
# result is correct whether or not data_folder ends with a separator.
data_path = Path ( data_folder ) / data_subfolder


# Make modules stored in data_folder importable.
import sys
sys.path.append( data_folder )

# NOTE(review): the star import hides which names come from nru_DE
# (presumably including nru(), which a later cell calls). Left unchanged
# in case other names from the module are relied on.
from  nru_DE import *

In [4]:
# ---- file names ----

# outputs
results_pkl = "Ag_complementary_samples.pkl"
gene_array_pkl = "gene_array_complementary_samples.pkl"   # for scanpy and SCTransform
df_cells_pkl = "df_cells_complementary_samples.pkl"       # for scanpy and SCTransform

# inputs
nru_dict_pkl = "nru_dict.pkl"
counts_pkl = "counts_sparse.pkl"   # sparse pandas data frame: cells in columns; genes in rows


# ---- full paths, all located directly under data_path ----

# inputs
nru_dict_dsn = data_path.joinpath ( nru_dict_pkl )
counts_dsn = data_path.joinpath ( counts_pkl )

# outputs
results_dsn = data_path.joinpath ( results_pkl )
gene_array_dsn = data_path.joinpath ( gene_array_pkl )
df_cells_dsn = data_path.joinpath ( df_cells_pkl )

In [5]:
# Percentiles passed to the describe() summaries printed in later cells.
pctl_list = [
    0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99,
]

In [6]:
# Number of cell samples processed; later loops use range(n_samples) to
# pick columns of df_selected_cells.
n_samples = 2

# Minimum number of nonzero cells a gene must have in every sample to be kept.
# (The spelling "complemetary" is retained because later cells use this name.)
nz_min_complemetary = 50

In [7]:
# Load the pickled results dictionary and pull out the two tables used below.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
# The context manager closes the file even if unpickling raises.
with open( nru_dict_dsn, 'rb' ) as f:
    nru_dict = pickle.load(f)

# Boolean cell-selection table: rows are cell barcodes, columns are samples.
df_selected_cells = nru_dict['df_selected_cells']
print ( ' df_selected_cells:  ' )
print ( '\n', df_selected_cells )

# Keep only the per-gene statistics columns needed in this notebook.
df_gene_stats = nru_dict['df_gene_stats'] [[ 'nz_cells', 'M_g', 'A_g']]
print ( '\n\n df_gene_stats:  ' )
print ( '\n', df_gene_stats )

 df_selected_cells:  

                        0      1   ...     38     39
AAACCCAAGCGAGTCA-1  False   True  ...  False   True
AAACCCAAGGTCATTC-1   True  False  ...  False   True
AAACCCACACTGTGTA-1   True  False  ...   True  False
AAACCCACATATCTGG-1   True  False  ...   True  False
AAACCCACATCAACCA-1  False   True  ...   True  False
...                   ...    ...  ...    ...    ...
TTTGGTTTCTGTACAG-1  False   True  ...  False   True
TTTGTTGAGCTTTCCC-1  False   True  ...  False   True
TTTGTTGGTGCTCTCT-1  False   True  ...   True  False
TTTGTTGTCCCAGGAC-1   True  False  ...   True  False
TTTGTTGTCTTCGACC-1   True  False  ...   True  False

[7713 rows x 40 columns]


 df_gene_stats:  

                     nz_cells          M_g          A_g
ENSMUSG00000052305      6202  5632.169642  5431.424034
ENSMUSG00000069919      2897  1465.445393  1391.789121
ENSMUSG00000073940      2925  1444.191949  1383.160300
ENSMUSG00000069917      2625  1017.600302   971.366465
ENSMUSG00000038791       802 

In [8]:
# Read the pickled UMI count table, then report its type and sparse density.
df_counts_sparse = pd.read_pickle ( counts_dsn )

print ( ' df_counts_sparse:' )
print ( df_counts_sparse )

counts_type = type ( df_counts_sparse )
print ( '\n type( df_counts_sparse ) ', counts_type )

counts_density = df_counts_sparse.sparse.density
print ( '\n df_counts_sparse.sparse.density: ', counts_density )

 df_counts_sparse:
                    AAACCCAAGCGAGTCA-1  AAACCCAAGGTCATTC-1  ...  \
ENSEMBL_ID                                                  ...   
ENSMUSG00000051951                   0                   0  ...   
ENSMUSG00000102343                   0                   0  ...   
ENSMUSG00000025900                   0                   0  ...   
ENSMUSG00000025902                   0                   0  ...   
ENSMUSG00000033845                   0                   1  ...   
...                                ...                 ...  ...   
ENSMUSG00000079800                   0                   0  ...   
ENSMUSG00000079808                   0                   0  ...   
ENSMUSG00000095041                   0                   0  ...   
ENSMUSG00000063897                   0                   0  ...   
ENSMUSG00000095742                   0                   0  ...   

                    TTTGTTGTCCCAGGAC-1  TTTGTTGTCTTCGACC-1  
ENSEMBL_ID                                      

In [9]:
# Restrict the count table to the genes listed in df_gene_stats, in that order.
selected_gene_ids = df_gene_stats.index
df_counts_sparse_gene_sel = df_counts_sparse.loc [ selected_gene_ids ]

print ( ' df_counts_sparse_gene_sel:' )
print ( df_counts_sparse_gene_sel )

gene_sel_type = type ( df_counts_sparse_gene_sel )
print ( '\n type( df_counts_sparse_gene_sel ) ', gene_sel_type )

gene_sel_density = df_counts_sparse_gene_sel.sparse.density
print ( '\n df_counts_sparse_gene_sel.sparse.density: ', gene_sel_density )

# Release the full table; re-running this cell afterwards requires
# re-running the cell that loads df_counts_sparse.
del df_counts_sparse

 df_counts_sparse_gene_sel:
                    AAACCCAAGCGAGTCA-1  AAACCCAAGGTCATTC-1  ...  \
ENSMUSG00000052305                 823                   3  ...   
ENSMUSG00000069919                 268                   0  ...   
ENSMUSG00000073940                  33                   0  ...   
ENSMUSG00000069917                 165                   0  ...   
ENSMUSG00000038791                   0                   1  ...   
...                                ...                 ...  ...   
ENSMUSG00000020905                   0                   0  ...   
ENSMUSG00000027313                   0                   0  ...   
ENSMUSG00000104861                   0                   0  ...   
ENSMUSG00000041144                   0                   0  ...   
ENSMUSG00000000154                   0                   0  ...   

                    TTTGTTGTCCCAGGAC-1  TTTGTTGTCTTCGACC-1  
ENSMUSG00000052305                   3                2913  
ENSMUSG00000069919                   1       

In [10]:
# For every sample, count per gene how many of that sample's cells have a
# nonzero UMI count, then record the smallest count across samples.
# (The original pre-created an empty df_sample_nz_totals frame that was
# immediately overwritten by the concat below; that dead assignment is gone.)
df_sample_nz_totals_list = []


print ( 'for each gene, compute number of nonzero cells in each UMI count sub-matrix' )
for sample in  range(n_samples):
  print ( '\n sample: ', sample )

  # barcodes of the cells flagged True for this sample
  barcode_list = df_selected_cells.loc[ df_selected_cells[sample] ].index.tolist()

  # count sub-matrix: selected genes x this sample's cells
  df_counts_sparse_sample = df_counts_sparse_gene_sel[ barcode_list ]

  # 1 where the count is positive, 0 otherwise; the row sum is the number of nonzero cells
  df_counts_GT_0 = ( df_counts_sparse_sample > 0 ).astype( int )
  df_sample_nz_sum = df_counts_GT_0.sum ( axis=1 ).to_frame ( name = sample )
  df_sample_nz_totals_list.append ( df_sample_nz_sum )

# one column per sample, plus the per-gene minimum over the samples
df_sample_nz_totals = pd.concat ( df_sample_nz_totals_list , axis=1 )
df_sample_nz_totals['min_nz'] = df_sample_nz_totals.min ( axis=1 )

print ( '\n df_sample_nz_totals' )
print ( df_sample_nz_totals  )
print ( '\n\n df_sample_nz_totals.describe' )
print ( df_sample_nz_totals.describe ( percentiles=pctl_list ) )

for each gene, compute number of nonzero cells in each UMI count sub-matrix

 sample:  0

 sample:  1

 df_sample_nz_totals
                       0     1  min_nz
ENSMUSG00000052305  3132  3070    3070
ENSMUSG00000069919  1450  1447    1447
ENSMUSG00000073940  1450  1475    1450
ENSMUSG00000069917  1319  1306    1306
ENSMUSG00000038791   399   403     399
...                  ...   ...     ...
ENSMUSG00000020905    27    27      27
ENSMUSG00000027313    26    24      24
ENSMUSG00000104861    25    30      25
ENSMUSG00000041144    31    25      25
ENSMUSG00000000154    27    31      27

[14472 rows x 3 columns]


 df_sample_nz_totals.describe
                  0             1        min_nz
count  14472.000000  14472.000000  14472.000000
mean     658.298231    651.538557    643.293740
std      653.925177    637.100205    638.595519
min       16.000000     16.000000     16.000000
1%        25.000000     26.000000     24.000000
5%        36.000000     37.000000     33.000000
10%       51.1

In [11]:
# Keep the genes whose smallest per-sample nonzero-cell count reaches the threshold.
passes_nz_min = df_sample_nz_totals['min_nz'] >= nz_min_complemetary
gene_select_array = df_sample_nz_totals.loc[ passes_nz_min ].index.values

df_counts_sparse_all_samples = df_counts_sparse_gene_sel.loc [ gene_select_array ]

print ( '\n df_counts_sparse_all_samples' )
print ( df_counts_sparse_all_samples  )

# Release the unfiltered gene subset; re-running this cell afterwards
# requires re-running the cell that builds df_counts_sparse_gene_sel.
del df_counts_sparse_gene_sel


 df_counts_sparse_all_samples
                    AAACCCAAGCGAGTCA-1  AAACCCAAGGTCATTC-1  ...  \
ENSMUSG00000052305                 823                   3  ...   
ENSMUSG00000069919                 268                   0  ...   
ENSMUSG00000073940                  33                   0  ...   
ENSMUSG00000069917                 165                   0  ...   
ENSMUSG00000038791                   0                   1  ...   
...                                ...                 ...  ...   
ENSMUSG00000028527                   0                   0  ...   
ENSMUSG00000045790                   0                   0  ...   
ENSMUSG00000048503                   0                   0  ...   
ENSMUSG00000027777                   0                   0  ...   
ENSMUSG00000037455                   0                   0  ...   

                    TTTGTTGTCCCAGGAC-1  TTTGTTGTCTTCGACC-1  
ENSMUSG00000052305                   3                2913  
ENSMUSG00000069919                   1    

In [12]:
#### verify that all cells have nonzero sequencing depth
ser_cell_totals = df_counts_sparse_all_samples.sum ( axis=0 ) 
print ( '\n\n ser_cell_totals.describe' )
print ( ser_cell_totals.describe ( percentiles=pctl_list ) )

ser_cell_totals_GT_0 = ser_cell_totals[ ser_cell_totals > 0 ]
print ( '\n\n ser_cell_totals_GT_0.describe' )
print ( ser_cell_totals_GT_0.describe ( percentiles=pctl_list ) )



 ser_cell_totals.describe
count      7713.000000
mean      10890.337871
std       11943.514807
min         478.000000
1%          544.120000
5%          733.600000
10%        1117.400000
25%        2475.000000
50%        6360.000000
75%       15712.000000
90%       27499.600000
95%       34961.000000
99%       50428.800000
max      136905.000000
dtype: float64


 ser_cell_totals_GT_0.describe
count      7713.000000
mean      10890.337871
std       11943.514807
min         478.000000
1%          544.120000
5%          733.600000
10%        1117.400000
25%        2475.000000
50%        6360.000000
75%       15712.000000
90%       27499.600000
95%       34961.000000
99%       50428.800000
max      136905.000000
dtype: float64


In [13]:
# Keep only the columns (cells) whose total count is positive.
positive_depth_cells = ser_cell_totals_GT_0.index
df_counts_sparse_all_samples_SD_positive = df_counts_sparse_all_samples[ positive_depth_cells ]

print ( ' df_counts_sparse_all_samples_SD_positive:' )
print ( df_counts_sparse_all_samples_SD_positive )

sd_positive_type = type ( df_counts_sparse_all_samples_SD_positive )
print ( '\n type( df_counts_sparse_all_samples_SD_positive ) ', sd_positive_type )

sd_positive_density = df_counts_sparse_all_samples_SD_positive.sparse.density
print ( '\n df_counts_sparse_all_samples_SD_positive.sparse.density: ', sd_positive_density )

 df_counts_sparse_all_samples_SD_positive:
                    AAACCCAAGCGAGTCA-1  AAACCCAAGGTCATTC-1  ...  \
ENSMUSG00000052305                 823                   3  ...   
ENSMUSG00000069919                 268                   0  ...   
ENSMUSG00000073940                  33                   0  ...   
ENSMUSG00000069917                 165                   0  ...   
ENSMUSG00000038791                   0                   1  ...   
...                                ...                 ...  ...   
ENSMUSG00000028527                   0                   0  ...   
ENSMUSG00000045790                   0                   0  ...   
ENSMUSG00000048503                   0                   0  ...   
ENSMUSG00000027777                   0                   0  ...   
ENSMUSG00000037455                   0                   0  ...   

                    TTTGTTGTCCCAGGAC-1  TTTGTTGTCTTCGACC-1  
ENSMUSG00000052305                   3                2913  
ENSMUSG00000069919            

In [14]:
# One-column table marking every cell that kept a positive sequencing depth.
df_cells_SD_positive = pd.DataFrame (
    data=True,
    index=df_counts_sparse_all_samples_SD_positive.columns,
    columns=['SD_positive'],
)


df_complementary_selected_cells_list = []

for sample in  range(n_samples):
  print ( '\n sample: ', sample )

  # sample membership next to the depth flag; cells missing from the depth
  # table come out of the concat as NaN and are turned into False
  df_cell_sample = pd.concat ( [ df_selected_cells[[sample]], df_cells_SD_positive ], axis=1 )
  sd_flag = df_cell_sample['SD_positive'].fillna(False)
  df_cell_sample = df_cell_sample.assign( SD_positive = sd_flag )

  # a cell is kept for this sample only when both columns are True
  keep_cell = df_cell_sample.all ( axis=1 )
  df_complementary_selected_cells_list.append ( keep_cell.to_frame ( name = sample ) )

df_complementary_selected_cells = pd.concat ( df_complementary_selected_cells_list, axis=1 )

print ( '\n df_complementary_selected_cells' )
print ( df_complementary_selected_cells  )

cells_per_sample = df_complementary_selected_cells.sum ()
print ( '\n\n df_complementary_selected_cells.sum' )
print ( cells_per_sample )


 sample:  0

 sample:  1

 df_complementary_selected_cells
                        0      1
AAACCCAAGCGAGTCA-1  False   True
AAACCCAAGGTCATTC-1   True  False
AAACCCACACTGTGTA-1   True  False
AAACCCACATATCTGG-1   True  False
AAACCCACATCAACCA-1  False   True
...                   ...    ...
TTTGGTTTCTGTACAG-1  False   True
TTTGTTGAGCTTTCCC-1  False   True
TTTGTTGGTGCTCTCT-1  False   True
TTTGTTGTCCCAGGAC-1   True  False
TTTGTTGTCTTCGACC-1   True  False

[7713 rows x 2 columns]


 df_complementary_selected_cells.sum
0    3919
1    3794
dtype: int64


In [15]:
# Run nru() on each sample's count sub-matrix and collect the resulting
# A_g column per sample, one output column per sample.
print ( 'calculations for samples' )

df_gene_stats_half_cell_samples_list = []

for sample in  range(n_samples):
  print ( '\n sample: ', sample )

  # barcodes of the cells selected for this sample
  df_cell_sample = df_complementary_selected_cells[[sample]]
  df_cell_select = df_cell_sample [ df_cell_sample[sample] ]
  barcode_list = df_cell_select.index.values.tolist()
  df_counts_sparse_sample_analy = df_counts_sparse_all_samples_SD_positive[ barcode_list ]

  # perf_counter is a monotonic, high-resolution clock meant for measuring
  # elapsed time; time.time() follows the wall clock, which can be adjusted.
  start_time = time.perf_counter()
  # NOTE(review): this overwrites the nru_dict loaded from disk in an earlier
  # cell; the name is kept so the notebook's global state is unchanged.
  # NOTE(review): the meaning of n_genes=5 is defined by nru() - confirm there.
  nru_dict = nru ( df_counts_sparse_sample_analy , nz_min=nz_min_complemetary , n_genes=5 )
  end_time = time.perf_counter()
  elapsed = end_time - start_time
  print ( 'function nru elapsed time: ',  f"{ elapsed:.1f}", ' seconds'  )

  # A_g for this sample, renamed to the sample number and sorted descending
  # (plain assignment instead of inplace=True; the result is the same)
  df_gene_stats_half_cell_sample = nru_dict['df_gene_stats'] [['A_g']].rename ( columns={'A_g': sample} )
  df_gene_stats_half_cell_sample = df_gene_stats_half_cell_sample.sort_values ( [sample], ascending=False )

  df_gene_stats_half_cell_samples_list.append ( df_gene_stats_half_cell_sample )


df_gene_stats_half_cell_samples = pd.concat ( df_gene_stats_half_cell_samples_list, axis=1 )
print ( '\n df_gene_stats_half_cell_samples' )
print ( df_gene_stats_half_cell_samples  )
print ( '\n\n df_gene_stats_half_cell_samples.describe' )
print ( df_gene_stats_half_cell_samples.describe ( percentiles=pctl_list ) )

calculations for samples

 sample:  0
counts_sparse_selected_genes.shape:  (12995, 3919)
counts_sparse_selected_csr.shape:  (12995, 3919)
calculating sum of squares of Pearson residuals using all cells
calculating sum of squares of Pearson residuals using cell sample  0
calculating sum of squares of Pearson residuals using cell sample  1
calculating sum of squares of Pearson residuals using cell sample  2
calculating sum of squares of Pearson residuals using cell sample  3
calculating sum of squares of Pearson residuals using cell sample  4
calculating sum of squares of Pearson residuals using cell sample  5
calculating sum of squares of Pearson residuals using cell sample  6
calculating sum of squares of Pearson residuals using cell sample  7
calculating sum of squares of Pearson residuals using cell sample  8
calculating sum of squares of Pearson residuals using cell sample  9
calculating sum of squares of Pearson residuals using cell sample  10
calculating sum of squares of Pearson 

In [16]:
# Persist the results: per-sample A_g table, per-sample cell selection,
# and the array of retained gene ids.
df_gene_stats_half_cell_samples.to_pickle ( results_dsn )
df_complementary_selected_cells.to_pickle ( df_cells_dsn )

# The context manager flushes and closes the file even if pickling raises.
with open( gene_array_dsn, 'wb' ) as f:
    pickle.dump ( gene_select_array, f )