In [1]:
import pandas as pd
import numpy  as np 
 
from pathlib import Path

from scipy.io import mmread


# Limit printed data frames to 5 columns; wider frames are elided with "...".
pd.options.display.max_columns = 5

In [2]:
#### user specified
# Root folder holding the data sets, and the data set processed here.
data_folder = r"D:/analyze_Pearson_residuals/"

data_subfolder = "33k_PBMC"

# Join with pathlib instead of string concatenation so the result is the
# same whether or not data_folder ends with a path separator.
data_path = Path(data_folder) / data_subfolder

# Folder with the cluster assignments, below the data set folder.
clusters_path = data_path / "kmeans" / "10_clusters"

# String form of clusters_path, kept with forward slashes as before.
clusters_folder = clusters_path.as_posix()

In [3]:
# ---- file names ----

# pickle outputs
counts_pkl = "counts_sparse.pkl"  # sparse Pandas data frame: cells in columns; genes in rows
clusters_pkl = "clusters.pkl"

# inputs
matrix_mtx = "matrix.mtx"
genes_tsv = "genes.tsv"
cells_tsv = "barcodes.tsv"
clusters_csv = "clusters.csv"


# ---- full paths ----

# inputs: matrix, genes and barcodes sit in data_path; cluster labels in clusters_path
matrix_dsn = data_path.joinpath(matrix_mtx)
genes_dsn = data_path.joinpath(genes_tsv)
cells_dsn = data_path.joinpath(cells_tsv)
clusters_in_dsn = clusters_path.joinpath(clusters_csv)

# pkl outputs: both are written into data_path
counts_dsn = data_path.joinpath(counts_pkl)
clusters_out_dsn = data_path.joinpath(clusters_pkl)

In [4]:
# Percentile cut points handed to Series.describe() when summarising cell totals.
pctl_list = [
    0.01, 0.05, 0.10, 0.25, 0.50, 0.75,
    0.90, 0.95, 0.96, 0.97, 0.98, 0.99,
    0.995, 0.999,
]

In [5]:
# Load the cluster assignments and index them by the 'Barcode' column.
df_clusters = pd.read_csv(clusters_in_dsn).set_index(['Barcode'])
print(' df_clusters \n', df_clusters)

# Number of barcodes assigned to each value of 'Cluster'.
print('\n\n df_clusters.value_counts \n')
print(df_clusters['Cluster'].value_counts())

 df_clusters 
                   Cluster
Barcode                  
AAACATTGACGACT-1        1
AAACATTGACGGGA-1        8
AAACATTGCCGCTT-1        2
AAACATTGCTCATT-1        3
AAACCGTGCCCTCA-1        5
...                   ...
TTTGCATGCACTAG-6        3
TTTGCATGCCGAAT-6        1
TTTGCATGGAGGTG-6        7
TTTGCATGGATAAG-6        3
TTTGCATGGGCATT-6        4

[33148 rows x 1 columns]


 df_clusters.value_counts 

1     7406
4     7047
2     6307
5     4164
3     3660
7     2288
8     1959
6      135
10     107
9       75
Name: Cluster, dtype: int64


In [6]:
# Gene table: only the first two tab-separated columns of the file are read,
# and they are named here because no header row is taken from the file.
df_genes = pd.read_csv(
    genes_dsn,
    sep='\t',
    usecols=[0, 1],
    names=['ENSEMBL_ID', 'gene_symbol'],
)
print(' df_genes \n\n', df_genes)

 df_genes 

             ENSEMBL_ID   gene_symbol
0      ENSG00000243485    MIR1302-10
1      ENSG00000237613       FAM138A
2      ENSG00000186092         OR4F5
3      ENSG00000238009  RP11-34P13.7
4      ENSG00000239945  RP11-34P13.8
...                ...           ...
32733  ENSG00000215635    AC145205.1
32734  ENSG00000268590         BAGE5
32735  ENSG00000251180    CU459201.1
32736  ENSG00000215616    AC002321.2
32737  ENSG00000215611    AC002321.1

[32738 rows x 2 columns]


In [7]:
# Read the barcodes; they become the column labels of the sparse Pandas
# data frame containing UMI counts.
df_cells = pd.read_csv(cells_dsn, sep='\t', names=['barcodes'])
print(' df_cells \n\n', df_cells)

# Plain Python list of the barcode strings, in file order.
barcode_list = df_cells['barcodes'].tolist()
print('\n\n len(barcode_list): ', len(barcode_list))

 df_cells 

                barcodes
0      AAACATTGACGACT-1
1      AAACATTGACGGGA-1
2      AAACATTGCCGCTT-1
3      AAACATTGCTCATT-1
4      AAACCGTGCCCTCA-1
...                 ...
33143  TTTGCATGCACTAG-6
33144  TTTGCATGCCGAAT-6
33145  TTTGCATGGAGGTG-6
33146  TTTGCATGGATAAG-6
33147  TTTGCATGGGCATT-6

[33148 rows x 1 columns]


 len(barcode_list):  33148


In [8]:
# Read the Matrix Market file, cast the entries to int and convert to CSR.
X = mmread(matrix_dsn).astype(int).tocsr()

# For every row, count the entries that are strictly positive.
X_GT_0 = X > 0
row_nonzero_count = X_GT_0.sum(axis=1)


####  exclude genes with no nonzero counts
nonzero_row = row_nonzero_count >= 1
print(' nonzero_row.sum: \n\n', nonzero_row.sum())

 nonzero_row.sum: 

 20678


In [9]:
# Positions of the rows flagged in nonzero_row.
index_nonzero_rows = np.where(nonzero_row)[0]
print(' index_nonzero_rows:', index_nonzero_rows)

# Keep the gene annotations at exactly those row positions.
df_genes_sel = df_genes.iloc[index_nonzero_rows]
print('\n df_genes_sel \n\n', df_genes_sel)

 index_nonzero_rows: [    4     5     8 ... 32727 32731 32732]

 df_genes_sel 

             ENSEMBL_ID    gene_symbol
4      ENSG00000239945   RP11-34P13.8
5      ENSG00000237683     AL627309.1
8      ENSG00000228463     AP006222.2
19     ENSG00000235373  RP11-206L10.3
22     ENSG00000228327  RP11-206L10.2
...                ...            ...
32722  ENSG00000212884     AC011841.1
32726  ENSG00000215615     AL354822.1
32727  ENSG00000215764        KIR2DL2
32731  ENSG00000215700          PNRC2
32732  ENSG00000215699         SRSF10

[20678 rows x 2 columns]


In [10]:
# Restrict the count matrix to the selected rows, keeping every column.
X_sel = X[index_nonzero_rows, :]

# Wrap the sparse matrix in a sparse data frame: rows are labelled with the
# ENSEMBL IDs of the selected genes, columns with the barcodes.
df_counts_sel = pd.DataFrame.sparse.from_spmatrix(
    X_sel,
    index=df_genes_sel['ENSEMBL_ID'],
    columns=barcode_list,
)
print(' df_counts_sel:', df_counts_sel)
print('\n type( df_counts_sel )', type(df_counts_sel))
print('\n df_counts_sel.sparse.density:  ', df_counts_sel.sparse.density)

 df_counts_sel:                  AAACATTGACGACT-1  AAACATTGACGGGA-1  ...  TTTGCATGGATAAG-6  \
ENSEMBL_ID                                           ...                     
ENSG00000239945                 0                 0  ...                 0   
ENSG00000237683                 0                 0  ...                 0   
ENSG00000228463                 0                 0  ...                 0   
ENSG00000235373                 0                 0  ...                 0   
ENSG00000228327                 0                 0  ...                 0   
...                           ...               ...  ...               ...   
ENSG00000212884                 0                 0  ...                 0   
ENSG00000215615                 0                 0  ...                 0   
ENSG00000215764                 0                 0  ...                 0   
ENSG00000215700                 1                 0  ...                 1   
ENSG00000215699                 0               

In [11]:
# verify that all cells have nonzero total count
# Column sums of the count frame, i.e. one total per barcode.
ser_cell_totals = df_counts_sel.sum(axis=0)
print(' ser_cell_totals.describe:')
print(ser_cell_totals.describe(percentiles=pctl_list))

# Same summary restricted to totals above zero; an identical 'count' in both
# summaries means no barcode was dropped.
ser_cell_totals_GT_0 = ser_cell_totals[ser_cell_totals > 0]
print('\n ser_cell_totals_GT_0.describe:')
print(ser_cell_totals_GT_0.describe(percentiles=pctl_list))

 ser_cell_totals.describe:
count    33148.000000
mean      2057.881682
std        968.832213
min        480.000000
1%         571.000000
5%         818.000000
10%       1062.000000
25%       1463.000000
50%       1891.000000
75%       2438.000000
90%       3216.000000
95%       3838.000000
96%       4021.000000
97%       4258.000000
98%       4640.060000
99%       5311.770000
99.5%     6061.710000
99.9%     8466.679000
max      15680.000000
dtype: float64

 ser_cell_totals_GT_0.describe:
count    33148.000000
mean      2057.881682
std        968.832213
min        480.000000
1%         571.000000
5%         818.000000
10%       1062.000000
25%       1463.000000
50%       1891.000000
75%       2438.000000
90%       3216.000000
95%       3838.000000
96%       4021.000000
97%       4258.000000
98%       4640.060000
99%       5311.770000
99.5%     6061.710000
99.9%     8466.679000
max      15680.000000
dtype: float64


In [12]:
# Keep only the columns (barcodes) whose total count is above zero.
df_counts_sparse = df_counts_sel[ser_cell_totals_GT_0.index]
print(' df_counts_sparse:')
print(df_counts_sparse)
print('\n type( df_counts_sparse ) ', type(df_counts_sparse))
print('\n df_counts_sparse.sparse.density: ', df_counts_sparse.sparse.density)

# Recompute the per-column totals on the filtered frame and summarise them.
ser_cell_totals = df_counts_sparse.sum(axis=0)
print('\n ser_cell_totals:')
print(ser_cell_totals)
print('\n ser_cell_totals.describe:')
print(ser_cell_totals.describe(percentiles=pctl_list))

 df_counts_sparse:
                 AAACATTGACGACT-1  AAACATTGACGGGA-1  ...  TTTGCATGGATAAG-6  \
ENSEMBL_ID                                           ...                     
ENSG00000239945                 0                 0  ...                 0   
ENSG00000237683                 0                 0  ...                 0   
ENSG00000228463                 0                 0  ...                 0   
ENSG00000235373                 0                 0  ...                 0   
ENSG00000228327                 0                 0  ...                 0   
...                           ...               ...  ...               ...   
ENSG00000212884                 0                 0  ...                 0   
ENSG00000215615                 0                 0  ...                 0   
ENSG00000215764                 0                 0  ...                 0   
ENSG00000215700                 1                 0  ...                 1   
ENSG00000215699                 0            

In [13]:
# Persist both results as pickle files at the output paths defined earlier.
df_counts_sparse.to_pickle(counts_dsn)
df_clusters.to_pickle(clusters_out_dsn)