In [1]:
import pandas as pd
import numpy  as np

import pickle
 
from scipy import sparse

from pathlib import Path

# Show at most 5 columns when a data frame is printed; wider frames are
# abbreviated with "..." in the middle.
pd.options.display.max_columns = 5

In [2]:
#### user specified
# Root folder that holds the analysis data, and the sub-folder of the data set
# processed by this notebook.
data_folder = "D:/analyze_Pearson_residuals/"

data_subfolder = "lupus"

# Join with the path operator instead of string concatenation so the result is
# correct whether or not data_folder ends with a path separator.
data_path = Path ( data_folder ) / data_subfolder

In [3]:
# Each file name is paired with its full path under data_path.

# --- input data ---
# count matrix
counts_in_pkl = "sce_counts_nz_GE_1.pkl"
counts_in_dsn = data_path / counts_in_pkl

# per-cell table
cells_pkl = "sce_row_data_nz_GE_1.pkl"
cells_dsn = data_path / cells_pkl

# per-gene table
genes_pkl = "sce_column_data_nz_GE_1.pkl"
genes_dsn = data_path / genes_pkl


# --- output data ---
# sparse Pandas data frame: cells in columns; genes in rows
counts_pkl = "counts_sparse.pkl"
counts_dsn = data_path / counts_pkl

# cluster label per cell
clusters_pkl = "clusters.pkl"
clusters_dsn = data_path / clusters_pkl

In [4]:
# Percentiles passed to every describe() call in this notebook; the upper tail
# (95% to 99.9%) is sampled more finely than the rest of the distribution.
pctl_list = [
    0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90,
    0.95, 0.96, 0.97, 0.98, 0.99, 0.995, 0.999,
]

In [5]:
# Load the per-cell table, index it by cell barcode, and rename the
# 'cluster' column to 'Cluster'.
df_clusters_in = (
    pd.read_pickle ( cells_dsn )
      .set_index ( 'barcode' )
      .rename ( columns={'cluster': 'Cluster'} )
)
print ( ' df_clusters_in \n', df_clusters_in )

# Number of cells assigned to each cluster label.
print ( '\n\n df_clusters_in.value_counts \n',
        df_clusters_in['Cluster'].value_counts(),
        sep='\n' )

 df_clusters_in 
                           Cluster
barcode                          
AAACATACAATGCC-1      CD4 T cells
AAACATACATTTCC-1  CD14+ Monocytes
AAACATACCAGAAA-1  CD14+ Monocytes
AAACATACCAGCTA-1  CD14+ Monocytes
AAACATACCATGCA-1      CD4 T cells
...                           ...
TTTGCATGCTAAGC-1      CD4 T cells
TTTGCATGGGACGA-1      CD4 T cells
TTTGCATGGTGAGG-1      CD4 T cells
TTTGCATGGTTTGG-1      CD4 T cells
TTTGCATGTCTTAC-1      CD4 T cells

[26820 rows x 1 columns]


 df_clusters_in.value_counts 

CD4 T cells          11771
CD14+ Monocytes       5517
B cells               2735
CD8 T cells           2557
NK cells              2284
FCGR3A+ Monocytes     1489
Megakaryocytes         247
Dendritic cells        220
Name: Cluster, dtype: int64


In [6]:
# Per-gene table; its ENSEMBL_ID column is used later to label the rows of the
# count matrix.
df_genes = pd.read_pickle ( genes_dsn )
print ( ' df_genes \n', df_genes, sep='\n ' )

 df_genes 

             ENSEMBL_ID    gene_symbol
0      ENSG00000239945   RP11-34P13.8
1      ENSG00000237683     AL627309.1
2      ENSG00000228463     AP006222.2
3      ENSG00000228327  RP11-206L10.2
4      ENSG00000237491  RP11-206L10.9
...                ...            ...
18496  ENSG00000160298       C21orf58
18497  ENSG00000160299           PCNT
18498  ENSG00000160305          DIP2A
18499  ENSG00000160307          S100B
18500  ENSG00000160310          PRMT2

[18501 rows x 2 columns]


In [7]:
# Load the pickled sparse count matrix.  The context manager guarantees that
# the file handle is closed even if unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code; only load files from
# a trusted source.
with open( counts_in_dsn, 'rb' ) as f:
    arr_sparse_counts  = pickle.load (f)
print (  ' arr_sparse_counts:' )
print (  arr_sparse_counts )

# Wrap the sparse matrix in a sparse data frame: rows are labeled with the
# ENSEMBL gene ids, columns with the cell barcodes; values are cast to int.
df_counts_in_sparse = pd.DataFrame.sparse.from_spmatrix( arr_sparse_counts, index=df_genes['ENSEMBL_ID'], columns=df_clusters_in.index   ).astype(int)
print (  '\n\n df_counts_in_sparse:\n', df_counts_in_sparse)
print (  '\n type( df_counts_in_sparse )' , type ( df_counts_in_sparse ))
# Fraction of entries that are explicitly stored (non-fill values).
print (  '\n df_counts_in_sparse.sparse.density:  ', df_counts_in_sparse.sparse.density)

 arr_sparse_counts:
  (29, 0)	1.0
  (42, 0)	1.0
  (65, 0)	1.0
  (85, 0)	1.0
  (90, 0)	3.0
  (99, 0)	1.0
  (106, 0)	1.0
  (108, 0)	1.0
  (153, 0)	1.0
  (160, 0)	1.0
  (203, 0)	1.0
  (240, 0)	1.0
  (270, 0)	1.0
  (278, 0)	15.0
  (281, 0)	1.0
  (301, 0)	1.0
  (305, 0)	3.0
  (324, 0)	1.0
  (327, 0)	5.0
  (329, 0)	6.0
  (341, 0)	1.0
  (342, 0)	1.0
  (362, 0)	1.0
  (364, 0)	1.0
  (384, 0)	1.0
  :	:
  (17262, 26819)	3.0
  (17319, 26819)	12.0
  (17378, 26819)	1.0
  (17422, 26819)	1.0
  (17473, 26819)	1.0
  (17485, 26819)	1.0
  (17498, 26819)	5.0
  (17534, 26819)	10.0
  (17662, 26819)	1.0
  (17669, 26819)	6.0
  (17720, 26819)	6.0
  (17818, 26819)	3.0
  (17877, 26819)	2.0
  (17910, 26819)	1.0
  (18006, 26819)	1.0
  (18051, 26819)	1.0
  (18082, 26819)	1.0
  (18138, 26819)	11.0
  (18160, 26819)	1.0
  (18247, 26819)	1.0
  (18264, 26819)	1.0
  (18365, 26819)	1.0
  (18377, 26819)	1.0
  (18425, 26819)	1.0
  (18439, 26819)	1.0


 df_counts_in_sparse:
 barcode          AAACATACAATGCC-1  AAACATACATTTCC-1

In [8]:
#  exclude genes with no nonzero cells
# Indicator frame: 1 where a gene has a positive count in a cell, else 0.
df_counts_GT_0 = ( df_counts_in_sparse > 0 ).astype(int)
# Row sums of the indicator = number of cells with a positive count, per gene.
ser_gene_nonzero_counts = df_counts_GT_0.sum ( axis='columns' )
print ( ' ser_gene_nonzero_counts.describe:',
        ser_gene_nonzero_counts.describe( percentiles = pctl_list ),
        sep='\n' )

# Keep only the genes (rows) detected in at least one cell.
df_counts_sel = df_counts_in_sparse.loc[ ser_gene_nonzero_counts.gt(0) ]
print (  '\n\n df_counts_sel: \n', df_counts_sel)
print (  '\n type( df_counts_sel )' , type ( df_counts_sel ))
print (  '\n df_counts_sel.sparse.density:  ', df_counts_sel.sparse.density)

 ser_gene_nonzero_counts.describe:
count    18501.000000
mean       822.537052
std       2362.001021
min          1.000000
1%           1.000000
5%           1.000000
10%          2.000000
25%          7.000000
50%        100.000000
75%        577.000000
90%       1993.000000
95%       3844.000000
96%       4655.000000
97%       5697.000000
98%       7683.000000
99%      11305.000000
99.5%    19164.000000
99.9%    25921.000000
max      26820.000000
dtype: float64


 df_counts_sel: 
 barcode          AAACATACAATGCC-1  AAACATACATTTCC-1  ...  TTTGCATGGTTTGG-1  \
ENSEMBL_ID                                           ...                     
ENSG00000239945                 0                 0  ...                 0   
ENSG00000237683                 0                 0  ...                 0   
ENSG00000228463                 0                 0  ...                 0   
ENSG00000228327                 0                 0  ...                 0   
ENSG00000237491                 0           

In [9]:
# verify that all cells have nonzero total count
# Column sums = total count per cell over the selected genes.
ser_cell_totals = df_counts_sel.sum ( axis='index' )
print ( ' ser_cell_totals.describe:',
        ser_cell_totals.describe( percentiles = pctl_list ),
        sep='\n' )

# Restrict to cells whose total is positive; if every cell qualifies, the two
# summaries printed by this cell are identical.
ser_cell_totals_GT_0 = ser_cell_totals [ ser_cell_totals.gt(0) ]
print ( '\n ser_cell_totals_GT_0.describe:',
        ser_cell_totals_GT_0.describe( percentiles = pctl_list ),
        sep='\n' )

 ser_cell_totals.describe:
count    26820.000000
mean      1501.296719
std        750.727091
min        563.000000
1%         617.000000
5%         710.000000
10%        795.000000
25%        973.000000
50%       1253.000000
75%       1827.000000
90%       2647.100000
95%       3080.000000
96%       3192.240000
97%       3338.000000
98%       3531.000000
99%       3797.000000
99.5%     4133.430000
99.9%     4751.448000
max       7168.000000
dtype: float64

 ser_cell_totals_GT_0.describe:
count    26820.000000
mean      1501.296719
std        750.727091
min        563.000000
1%         617.000000
5%         710.000000
10%        795.000000
25%        973.000000
50%       1253.000000
75%       1827.000000
90%       2647.100000
95%       3080.000000
96%       3192.240000
97%       3338.000000
98%       3531.000000
99%       3797.000000
99.5%     4133.430000
99.9%     4751.448000
max       7168.000000
dtype: float64


In [10]:
# Final count matrix: keep only the columns (cells) with a positive total count.
df_counts_sparse = df_counts_sel.loc[ :, ser_cell_totals_GT_0.index ]
print ( ' df_counts_sparse:', df_counts_sparse, sep='\n' )
print (  '\n type( df_counts_sparse ) ', type ( df_counts_sparse ))
print (  '\n df_counts_sparse.sparse.density: ', df_counts_sparse.sparse.density)

# Recompute the per-cell totals on the final matrix and summarize them.
ser_cell_totals = df_counts_sparse.sum ( axis='index' )
print ( '\n ser_cell_totals:', ser_cell_totals, sep='\n' )
print ( '\n ser_cell_totals.describe:',
        ser_cell_totals.describe( percentiles = pctl_list ),
        sep='\n' )

 df_counts_sparse:
barcode          AAACATACAATGCC-1  AAACATACATTTCC-1  ...  TTTGCATGGTTTGG-1  \
ENSEMBL_ID                                           ...                     
ENSG00000239945                 0                 0  ...                 0   
ENSG00000237683                 0                 0  ...                 0   
ENSG00000228463                 0                 0  ...                 0   
ENSG00000228327                 0                 0  ...                 0   
ENSG00000237491                 0                 0  ...                 0   
...                           ...               ...  ...               ...   
ENSG00000160298                 0                 0  ...                 0   
ENSG00000160299                 0                 0  ...                 0   
ENSG00000160305                 0                 0  ...                 0   
ENSG00000160307                 0                 0  ...                 0   
ENSG00000160310                 0            

In [11]:
# Cluster labels for exactly the cells kept in the final count matrix, in the
# same order as its columns.
df_clusters = df_clusters_in.loc [ list ( df_counts_sparse.columns ) ]
print ( ' df_clusters \n', df_clusters )

# Number of retained cells per cluster label.
print ( '\n\n df_clusters.value_counts \n',
        df_clusters['Cluster'].value_counts(),
        sep='\n' )

 df_clusters 
                           Cluster
barcode                          
AAACATACAATGCC-1      CD4 T cells
AAACATACATTTCC-1  CD14+ Monocytes
AAACATACCAGAAA-1  CD14+ Monocytes
AAACATACCAGCTA-1  CD14+ Monocytes
AAACATACCATGCA-1      CD4 T cells
...                           ...
TTTGCATGCTAAGC-1      CD4 T cells
TTTGCATGGGACGA-1      CD4 T cells
TTTGCATGGTGAGG-1      CD4 T cells
TTTGCATGGTTTGG-1      CD4 T cells
TTTGCATGTCTTAC-1      CD4 T cells

[26820 rows x 1 columns]


 df_clusters.value_counts 

CD4 T cells          11771
CD14+ Monocytes       5517
B cells               2735
CD8 T cells           2557
NK cells              2284
FCGR3A+ Monocytes     1489
Megakaryocytes         247
Dendritic cells        220
Name: Cluster, dtype: int64


In [12]:
# Persist the results: the filtered sparse count matrix (genes in rows, cells
# in columns) and the matching per-cell cluster labels.
df_counts_sparse.to_pickle ( path=counts_dsn )
df_clusters.to_pickle ( path=clusters_dsn )