In [1]:
import pandas as pd
import numpy  as np

import pickle 

from pathlib import Path

import time


# Show at most 5 columns when a DataFrame is printed; wider frames are truncated.
pd.options.display.max_columns = 5

In [2]:
# Seed NumPy's global random number generator so that reruns are reproducible.
np.random.seed ( seed=12345 )

In [3]:
#### user specified

# Root folder of the analysis; it is also added to the module search path below.
data_folder = "D:/analyze_Pearson_residuals/"

# Sub-folder of data_folder holding the files for this data set.
data_subfolder = "lupus"

# Join with the Path "/" operator rather than string concatenation, so the
# result is correct whether or not data_folder ends with a path separator.
data_path = Path ( data_folder ) / data_subfolder


# Make data_folder importable (presumably where nru_DE.py lives).
import sys
sys.path.append( data_folder )

# NOTE(review): the wildcard import hides which names nru_DE provides;
# the nru() function called later in this notebook presumably comes from it.
from  nru_DE import *

In [4]:
# File names of the pickles written by this notebook.
results_pkl    = "Ag_complementary_samples.pkl"
gene_array_pkl = "gene_array_complementary_samples.pkl"   # for scanpy and SCTransform
df_cells_pkl   = "df_cells_complementary_samples.pkl"     # for scanpy and SCTransform

# File names of the pickles read by this notebook.
nru_dict_pkl = "nru_dict.pkl"
counts_pkl   = "counts_sparse.pkl"   # sparse pandas data frame: cells in columns; genes in rows

# Full paths of the output files, all located under data_path.
results_dsn, gene_array_dsn, df_cells_dsn = (
    data_path.joinpath ( file_name )
    for file_name in ( results_pkl, gene_array_pkl, df_cells_pkl )
)

# Full paths of the input files, also under data_path.
nru_dict_dsn, counts_dsn = (
    data_path.joinpath ( file_name )
    for file_name in ( nru_dict_pkl, counts_pkl )
)

In [5]:
# Percentiles passed to .describe() when summarising frames and series.
pctl_list = [ 0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99 ]

In [6]:
# nz_min_complemetary: lower bound on a gene's nonzero-cell count; it is compared
#                      with the per-gene minimum over the samples and is also
#                      passed to nru() as nz_min.
# n_samples:           number of cell samples (columns 0 .. n_samples-1 of
#                      df_selected_cells) that are processed.
nz_min_complemetary, n_samples = 50, 2

In [7]:
# Load the dictionary of previously computed results from its pickle file.
# The "with" block closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open( nru_dict_dsn, 'rb' ) as f:
  nru_dict = pickle.load(f)

# Cell-selection table; the printed output shows barcodes as rows and
# numbered samples as True/False columns.
df_selected_cells = nru_dict['df_selected_cells']
print ( ' df_selected_cells:  ' )
print ( '\n', df_selected_cells )

# Per-gene statistics, keeping only the three columns listed here.
df_gene_stats = nru_dict['df_gene_stats'] [[ 'nz_cells', 'M_g', 'A_g']]
print ( '\n\n df_gene_stats:  ' )
print ( '\n', df_gene_stats )

 df_selected_cells:  

                      0      1   ...     38     39
AAACATACAATGCC-1  False   True  ...  False   True
AAACATACATTTCC-1   True  False  ...   True  False
AAACATACCAGAAA-1   True  False  ...   True  False
AAACATACCAGCTA-1   True  False  ...  False   True
AAACATACCATGCA-1  False   True  ...   True  False
...                 ...    ...  ...    ...    ...
TTTGCATGCTAAGC-1   True  False  ...  False   True
TTTGCATGGGACGA-1   True  False  ...   True  False
TTTGCATGGTGAGG-1  False   True  ...   True  False
TTTGCATGGTTTGG-1   True  False  ...  False   True
TTTGCATGTCTTAC-1  False   True  ...  False   True

[26820 rows x 40 columns]


 df_gene_stats:  

                  nz_cells         M_g         A_g
ENSG00000244734      2432  618.223412  583.983476
ENSG00000188536       614  187.010513  170.301545
ENSG00000167996     26522   95.555335   94.267171
ENSG00000206172       380  105.145843   93.234056
ENSG00000087086     23321   83.811940   82.854512
...                   ...  

In [8]:
# Read the pickled UMI counts (sparse pandas data frame: cells in columns; genes in rows).
df_counts_sparse = pd.read_pickle ( counts_dsn )

# Show the (truncated) frame, then its Python type and its sparse density.
print ( ' df_counts_sparse:', df_counts_sparse, sep='\n' )
print (
    '\n type( df_counts_sparse ) ',
    type ( df_counts_sparse ),
)
print (
    '\n df_counts_sparse.sparse.density: ',
    df_counts_sparse.sparse.density,
)

 df_counts_sparse:
barcode          AAACATACAATGCC-1  AAACATACATTTCC-1  ...  TTTGCATGGTTTGG-1  \
ENSEMBL_ID                                           ...                     
ENSG00000239945                 0                 0  ...                 0   
ENSG00000237683                 0                 0  ...                 0   
ENSG00000228463                 0                 0  ...                 0   
ENSG00000228327                 0                 0  ...                 0   
ENSG00000237491                 0                 0  ...                 0   
...                           ...               ...  ...               ...   
ENSG00000160298                 0                 0  ...                 0   
ENSG00000160299                 0                 0  ...                 0   
ENSG00000160305                 0                 0  ...                 0   
ENSG00000160307                 0                 0  ...                 0   
ENSG00000160310                 0            

In [9]:
# Keep only the rows (genes) listed in df_gene_stats, in that order.
df_counts_sparse_gene_sel = df_counts_sparse.loc [ df_gene_stats.index, : ]

# Show the (truncated) selection, then its Python type and its sparse density.
print ( ' df_counts_sparse_gene_sel:', df_counts_sparse_gene_sel, sep='\n' )
print (
    '\n type( df_counts_sparse_gene_sel ) ',
    type ( df_counts_sparse_gene_sel ),
)
print (
    '\n df_counts_sparse_gene_sel.sparse.density: ',
    df_counts_sparse_gene_sel.sparse.density,
)

# Drop the reference to the full matrix; later cells use the gene selection only.
del df_counts_sparse

 df_counts_sparse_gene_sel:
barcode          AAACATACAATGCC-1  AAACATACATTTCC-1  ...  TTTGCATGGTTTGG-1  \
ENSG00000244734                 0                 0  ...                 0   
ENSG00000188536                 0                 0  ...                 0   
ENSG00000167996                32               388  ...                 8   
ENSG00000206172                 0                 0  ...                 0   
ENSG00000087086                 0               279  ...                 4   
...                           ...               ...  ...               ...   
ENSG00000136492                 0                 0  ...                 0   
ENSG00000253593                 0                 0  ...                 0   
ENSG00000172379                 0                 0  ...                 0   
ENSG00000226004                 0                 0  ...                 0   
ENSG00000123643                 0                 0  ...                 0   

barcode          TTTGCATGTCTTAC-1  

In [10]:
print ( 'for each gene, compute number of nonzero cells in each UMI count sub-matrix' )

# One single-column frame per sample: for every gene, the number of cells of
# that sample whose count is greater than zero.
df_sample_nz_totals_list = []

for sample in  range(n_samples):
  print ( '\n sample: ', sample )

  # Barcodes of the cells flagged True for this sample in df_selected_cells.
  barcode_list = df_selected_cells.index[ df_selected_cells[sample].to_numpy() ].tolist()

  # Count sub-matrix of this sample; turn it into 0/1 indicators and sum
  # across cells (axis=1) to get the nonzero-cell count of each gene.
  df_counts_sparse_sample = df_counts_sparse_gene_sel[ barcode_list ]
  df_counts_GT_0 = ( df_counts_sparse_sample > 0 ).astype( int )
  df_sample_nz_sum = df_counts_GT_0.sum ( axis=1 ).to_frame ( name = sample )
  df_sample_nz_totals_list.append ( df_sample_nz_sum )

# Combine the per-sample columns side by side.  The table is built here in one
# step; an empty frame created beforehand would only be overwritten.
df_sample_nz_totals = pd.concat ( df_sample_nz_totals_list , axis=1 )

# Smallest nonzero-cell count of each gene over the samples.
df_sample_nz_totals['min_nz'] = df_sample_nz_totals.min ( axis=1 )

print ( '\n df_sample_nz_totals' )
print ( df_sample_nz_totals  )
print ( '\n\n df_sample_nz_totals.describe' )
print ( df_sample_nz_totals.describe ( percentiles=pctl_list ) )

for each gene, compute number of nonzero cells in each UMI count sub-matrix

 sample:  0

 sample:  1

 df_sample_nz_totals
                     0      1  min_nz
ENSG00000244734   1190   1242    1190
ENSG00000188536    297    317     297
ENSG00000167996  13323  13199   13199
ENSG00000206172    180    200     180
ENSG00000087086  11766  11555   11555
...                ...    ...     ...
ENSG00000136492     32     39      32
ENSG00000253593     25     25      25
ENSG00000172379     38     44      38
ENSG00000226004     40     33      33
ENSG00000123643     27     27      27

[10619 rows x 3 columns]


 df_sample_nz_totals.describe
                  0             1        min_nz
count  10619.000000  10619.000000  10619.000000
mean     715.678783    709.416141    701.075996
std     1496.410309   1481.705783   1479.980696
min       14.000000     17.000000     14.000000
1%        25.000000     25.000000     23.000000
5%        33.000000     33.000000     30.000000
10%       43.000000     43

In [11]:
# Gene IDs whose smallest per-sample nonzero-cell count reaches the threshold.
gene_select_array = df_sample_nz_totals.index[
    df_sample_nz_totals['min_nz'].ge ( nz_min_complemetary ).to_numpy()
].values

# Restrict the count matrix to those genes and show the (truncated) result.
df_counts_sparse_all_samples = df_counts_sparse_gene_sel.loc [ gene_select_array, : ]
print ( '\n df_counts_sparse_all_samples', df_counts_sparse_all_samples, sep='\n' )

# Drop the reference to the larger gene selection.
del df_counts_sparse_gene_sel


 df_counts_sparse_all_samples
barcode          AAACATACAATGCC-1  AAACATACATTTCC-1  ...  TTTGCATGGTTTGG-1  \
ENSG00000244734                 0                 0  ...                 0   
ENSG00000188536                 0                 0  ...                 0   
ENSG00000167996                32               388  ...                 8   
ENSG00000206172                 0                 0  ...                 0   
ENSG00000087086                 0               279  ...                 4   
...                           ...               ...  ...               ...   
ENSG00000113494                 0                 0  ...                 0   
ENSG00000196550                 0                 0  ...                 0   
ENSG00000241399                 0                 0  ...                 0   
ENSG00000246100                 0                 0  ...                 0   
ENSG00000158186                 0                 0  ...                 0   

barcode          TTTGCATGTCTTAC-

In [12]:
#### verify that all cells have nonzero sequencing depth

# Column sums: total count of each cell over the selected genes.
ser_cell_totals = df_counts_sparse_all_samples.sum ( axis=0 )
print (
    '\n\n ser_cell_totals.describe',
    ser_cell_totals.describe ( percentiles=pctl_list ),
    sep='\n',
)

# Cells with a positive total; if the two summaries agree, no cell was dropped.
ser_cell_totals_GT_0 = ser_cell_totals.loc [ ser_cell_totals.gt ( 0 ) ]
print (
    '\n\n ser_cell_totals_GT_0.describe',
    ser_cell_totals_GT_0.describe ( percentiles=pctl_list ),
    sep='\n',
)



 ser_cell_totals.describe
count    26820.000000
mean      1493.266219
std        747.881154
min        516.000000
1%         611.190000
5%         705.000000
10%        790.000000
25%        967.000000
50%       1246.000000
75%       1817.000000
90%       2636.000000
95%       3067.000000
99%       3787.810000
max       7104.000000
dtype: float64


 ser_cell_totals_GT_0.describe
count    26820.000000
mean      1493.266219
std        747.881154
min        516.000000
1%         611.190000
5%         705.000000
10%        790.000000
25%        967.000000
50%       1246.000000
75%       1817.000000
90%       2636.000000
95%       3067.000000
99%       3787.810000
max       7104.000000
dtype: float64


In [13]:
# Keep only the columns (cells) whose total count is positive.
df_counts_sparse_all_samples_SD_positive = df_counts_sparse_all_samples.loc [ :, ser_cell_totals_GT_0.index ]

# Show the (truncated) frame, then its Python type and its sparse density.
print (
    ' df_counts_sparse_all_samples_SD_positive:',
    df_counts_sparse_all_samples_SD_positive,
    sep='\n',
)
print (
    '\n type( df_counts_sparse_all_samples_SD_positive ) ',
    type ( df_counts_sparse_all_samples_SD_positive ),
)
print (
    '\n df_counts_sparse_all_samples_SD_positive.sparse.density: ',
    df_counts_sparse_all_samples_SD_positive.sparse.density,
)

 df_counts_sparse_all_samples_SD_positive:
barcode          AAACATACAATGCC-1  AAACATACATTTCC-1  ...  TTTGCATGGTTTGG-1  \
ENSG00000244734                 0                 0  ...                 0   
ENSG00000188536                 0                 0  ...                 0   
ENSG00000167996                32               388  ...                 8   
ENSG00000206172                 0                 0  ...                 0   
ENSG00000087086                 0               279  ...                 4   
...                           ...               ...  ...               ...   
ENSG00000113494                 0                 0  ...                 0   
ENSG00000196550                 0                 0  ...                 0   
ENSG00000241399                 0                 0  ...                 0   
ENSG00000246100                 0                 0  ...                 0   
ENSG00000158186                 0                 0  ...                 0   

barcode          TTT

In [14]:
# One row per cell kept in the positive-sequencing-depth count matrix, flagged True.
df_cells_SD_positive = pd.DataFrame ( index = df_counts_sparse_all_samples_SD_positive.columns, data=True, columns=['SD_positive'] )

# Align that flag to the barcodes of df_selected_cells; barcodes that are not
# in the count matrix get False.  Reindexing with fill_value keeps the column
# boolean, whereas concat + fillna(False) goes through an object column with
# NaN, and a NaN in the sample column would be skipped by .all(axis=1) and so
# count as selected.
ser_SD_positive = (
    df_cells_SD_positive['SD_positive']
    .reindex ( df_selected_cells.index, fill_value=False )
    .astype ( bool )
)

df_complementary_selected_cells_list = []

for sample in  range(n_samples):
  print ( '\n sample: ', sample )
  # A cell belongs to the sample only if it was selected for it AND has
  # positive sequencing depth.
  ser_sample_selected = df_selected_cells[sample].astype ( bool ) & ser_SD_positive
  df_complementary_selected_cells_list.append ( ser_sample_selected.to_frame ( name = sample ) )

# Boolean table: one row per barcode, one column per sample.
df_complementary_selected_cells = pd.concat ( df_complementary_selected_cells_list, axis=1 )

print ( '\n df_complementary_selected_cells' )
print ( df_complementary_selected_cells  )
print ( '\n\n df_complementary_selected_cells.sum' )
print ( df_complementary_selected_cells.sum () )


 sample:  0

 sample:  1

 df_complementary_selected_cells
                      0      1
AAACATACAATGCC-1  False   True
AAACATACATTTCC-1   True  False
AAACATACCAGAAA-1   True  False
AAACATACCAGCTA-1   True  False
AAACATACCATGCA-1  False   True
...                 ...    ...
TTTGCATGCTAAGC-1   True  False
TTTGCATGGGACGA-1   True  False
TTTGCATGGTGAGG-1  False   True
TTTGCATGGTTTGG-1   True  False
TTTGCATGTCTTAC-1  False   True

[26820 rows x 2 columns]


 df_complementary_selected_cells.sum
0    13478
1    13342
dtype: int64


In [15]:
print ( 'calculations for samples' )

# One single-column frame of A_g values per sample, column named by the sample number.
df_gene_stats_half_cell_samples_list = []

for sample in  range(n_samples):
  print ( '\n sample: ', sample )

  # Count sub-matrix restricted to the cells flagged True for this sample.
  barcode_list = df_complementary_selected_cells.index[ df_complementary_selected_cells[sample].to_numpy() ].tolist()
  df_counts_sparse_sample_analy = df_counts_sparse_all_samples_SD_positive[ barcode_list ]

  # Time the call with the monotonic perf_counter clock, which is unaffected
  # by system clock adjustments.
  # NOTE(review): the meaning of n_genes=5 is defined by nru() and is not visible here.
  start_time = time.perf_counter()
  # Stored under its own name so the nru_dict loaded from the input pickle is
  # not overwritten by a per-sample result.
  nru_dict_sample = nru ( df_counts_sparse_sample_analy , nz_min=nz_min_complemetary , n_genes=5 )
  elapsed = time.perf_counter() - start_time
  print ( 'function nru elapsed time: ',  f"{ elapsed:.1f}", ' seconds'  )

  # Keep A_g only, name the column after the sample, and sort descending.
  df_gene_stats_half_cell_sample = (
      nru_dict_sample['df_gene_stats'] [['A_g']]
      .rename ( columns={'A_g': sample} )
      .sort_values ( [sample], ascending=False )
  )

  df_gene_stats_half_cell_samples_list.append ( df_gene_stats_half_cell_sample )


# Align the per-sample columns on gene ID, side by side.
df_gene_stats_half_cell_samples = pd.concat ( df_gene_stats_half_cell_samples_list, axis=1 )
print ( '\n df_gene_stats_half_cell_samples' )
print ( df_gene_stats_half_cell_samples  )
print ( '\n\n df_gene_stats_half_cell_samples.describe' )
print ( df_gene_stats_half_cell_samples.describe ( percentiles=pctl_list ) )

calculations for samples

 sample:  0
counts_sparse_selected_genes.shape:  (9091, 13478)
counts_sparse_selected_csr.shape:  (9091, 13478)
calculating sum of squares of Pearson residuals using all cells
calculating sum of squares of Pearson residuals using cell sample  0
calculating sum of squares of Pearson residuals using cell sample  1
calculating sum of squares of Pearson residuals using cell sample  2
calculating sum of squares of Pearson residuals using cell sample  3
calculating sum of squares of Pearson residuals using cell sample  4
calculating sum of squares of Pearson residuals using cell sample  5
calculating sum of squares of Pearson residuals using cell sample  6
calculating sum of squares of Pearson residuals using cell sample  7
calculating sum of squares of Pearson residuals using cell sample  8
calculating sum of squares of Pearson residuals using cell sample  9
calculating sum of squares of Pearson residuals using cell sample  10
calculating sum of squares of Pearson 

In [16]:
# Persist the per-sample A_g table and the boolean cell-selection table.
df_gene_stats_half_cell_samples.to_pickle ( results_dsn )
df_complementary_selected_cells.to_pickle ( df_cells_dsn )

# Save the array of selected gene IDs.  The "with" block closes (and flushes)
# the file even if pickle.dump raises.
with open( gene_array_dsn, 'wb' ) as f:
  pickle.dump ( gene_select_array, f )