In [1]:
# prep_10k_heart Jupyter notebook


import pandas as pd
import numpy  as np 
 
from pathlib import Path

from scipy.io import mmread


# Cap printed DataFrames at five columns so wide frames are elided with "...".
pd.options.display.max_columns = 5

In [2]:
#### user specified
# Root folder that holds the analysis data sets; edit to match the local machine.
data_folder = r"D:/analyze_Pearson_residuals/"

# Name of this data set's folder underneath data_folder.
data_subfolder = "10k_heart"

# Join the components with pathlib instead of string concatenation, so the
# result is correct whether or not data_folder ends with a path separator.
data_path = Path ( data_folder ) / data_subfolder

# Location of the cluster assignments: <data_path>/clustering/kmeans_10_clusters
clusters_path = data_path / "clustering" / "kmeans_10_clusters"

# String form of clusters_path (forward slashes), kept for any code that
# expects the folder as a plain string.
clusters_folder = clusters_path.as_posix()

In [3]:
# ---- file names ----

# inputs: count matrix, gene table, barcode list, cluster assignments
matrix_mtx   = "matrix.mtx"
genes_tsv    = "features.tsv"
cells_tsv    = "barcodes.tsv"
clusters_csv = "clusters.csv"

# pkl outputs
counts_pkl   = "counts_sparse.pkl" # sparse Pandas data frame: cells in columns; genes in rows
clusters_pkl = "clusters.pkl"


# ---- full paths ----

# inputs (the cluster file is read from clusters_path; the rest from data_path)
matrix_dsn      = data_path.joinpath ( matrix_mtx )
genes_dsn       = data_path.joinpath ( genes_tsv )
cells_dsn       = data_path.joinpath ( cells_tsv )
clusters_in_dsn = clusters_path.joinpath ( clusters_csv )

# pkl outputs (both written under data_path)
counts_dsn       = data_path.joinpath ( counts_pkl )
clusters_out_dsn = data_path.joinpath ( clusters_pkl )

In [4]:
# Percentiles passed to Series.describe() when summarizing per-cell totals;
# the upper tail is sampled more densely than the lower one.
pctl_list = [
    .01, .05, .10, .25,
    .5,
    .75, .90, .95,
    .96, .97, .98, .99,
    .995, .999,
]

In [5]:
# Read the cluster assignments and index them by the 'Barcode' column.
df_clusters = pd.read_csv ( clusters_in_dsn ).set_index ( ['Barcode'] )
print ( ' df_clusters \n', df_clusters )

# Number of rows (barcodes) assigned to each value of 'Cluster'.
cluster_sizes = df_clusters['Cluster'].value_counts()
print ( '\n\n df_clusters.value_counts \n' )
print ( cluster_sizes )

 df_clusters 
                     Cluster
Barcode                    
AAACCCAAGCGAGTCA-1        3
AAACCCAAGGTCATTC-1        6
AAACCCACACTGTGTA-1        5
AAACCCACATATCTGG-1        5
AAACCCACATCAACCA-1        9
...                     ...
TTTGGTTTCTGTACAG-1        1
TTTGTTGAGCTTTCCC-1        3
TTTGTTGGTGCTCTCT-1        1
TTTGTTGTCCCAGGAC-1        7
TTTGTTGTCTTCGACC-1        3

[7713 rows x 1 columns]


 df_clusters.value_counts 

1     1899
2     1542
3     1456
4     1147
5      477
6      420
7      279
8      261
9      227
10       5
Name: Cluster, dtype: int64


In [6]:
# Read the first two tab-separated columns of the features file and label
# them as Ensembl ID and gene symbol; any further columns are ignored.
gene_columns = ['ENSEMBL_ID','gene_symbol']
df_genes = pd.read_csv (
    genes_dsn,
    sep = '\t',
    names = gene_columns,
    usecols = [0,1],
)
print (  ' df_genes \n\n', df_genes)

 df_genes 

                ENSEMBL_ID     gene_symbol
0      ENSMUSG00000051951            Xkr4
1      ENSMUSG00000089699          Gm1992
2      ENSMUSG00000102343         Gm37381
3      ENSMUSG00000025900             Rp1
4      ENSMUSG00000025902           Sox17
...                   ...             ...
31048  ENSMUSG00000079808      AC168977.1
31049  ENSMUSG00000095041      AC149090.1
31050  ENSMUSG00000063897  CAAA01118383.1
31051  ENSMUSG00000096730        Vmn2r122
31052  ENSMUSG00000095742  CAAA01147332.1

[31053 rows x 2 columns]


In [7]:
# Read the barcodes file (one barcode per line, no header row). The barcodes
# later become the column labels of the sparse Pandas data frame of UMI counts.
df_cells = pd.read_csv (
    cells_dsn,
    sep = '\t',
    names = ['barcodes'],
)
print (  ' df_cells \n\n', df_cells )

# Plain Python list of barcode strings, in file order.
barcode_list = df_cells['barcodes'].tolist()
print (  ' len(barcode_list): \n\n', len(barcode_list))

 df_cells 

                 barcodes
0     AAACCCAAGCGAGTCA-1
1     AAACCCAAGGTCATTC-1
2     AAACCCACACTGTGTA-1
3     AAACCCACATATCTGG-1
4     AAACCCACATCAACCA-1
...                  ...
7708  TTTGGTTTCTGTACAG-1
7709  TTTGTTGAGCTTTCCC-1
7710  TTTGTTGGTGCTCTCT-1
7711  TTTGTTGTCCCAGGAC-1
7712  TTTGTTGTCTTCGACC-1

[7713 rows x 1 columns]
 len(barcode_list): 

 7713


In [8]:
# Load the Matrix Market file, cast the values to int and convert to CSR,
# which supports the row selection done in a later cell.
X = mmread( matrix_dsn ).astype( int ).tocsr()

# Per row: how many entries are strictly greater than zero.
X_GT_0 = X > 0
row_nonzero_count = X_GT_0.sum ( axis = 1 )


####  exclude genes with no nonzero counts
# Boolean mask over rows: True where the row has at least one positive entry.
nonzero_row = row_nonzero_count >= 1
print (  ' nonzero_row.sum: \n\n', nonzero_row.sum())

 nonzero_row.sum: 

 22621


In [9]:
# Positions of the rows flagged True in nonzero_row. The first element of
# np.where's result holds the row coordinates.
row_positions = np.where( nonzero_row )
index_nonzero_rows = row_positions[0]
print (  ' index_nonzero_rows:', index_nonzero_rows )

# Keep the gene annotations at those positions; the original integer index
# labels are retained.
df_genes_sel = df_genes.iloc [ index_nonzero_rows, : ]
print (  '\n df_genes_sel \n\n', df_genes_sel)

 index_nonzero_rows: [    0     2     3 ... 31049 31050 31052]

 df_genes_sel 

                ENSEMBL_ID     gene_symbol
0      ENSMUSG00000051951            Xkr4
2      ENSMUSG00000102343         Gm37381
3      ENSMUSG00000025900             Rp1
4      ENSMUSG00000025902           Sox17
6      ENSMUSG00000033845          Mrpl15
...                   ...             ...
31041  ENSMUSG00000079800      AC125149.3
31048  ENSMUSG00000079808      AC168977.1
31049  ENSMUSG00000095041      AC149090.1
31050  ENSMUSG00000063897  CAAA01118383.1
31052  ENSMUSG00000095742  CAAA01147332.1

[22621 rows x 2 columns]


In [10]:
# Restrict the count matrix to the selected rows (all columns kept).
X_sel = X[ index_nonzero_rows ]

# Wrap the sparse matrix in a sparse data frame:
# rows labelled by Ensembl ID, columns labelled by barcode.
df_counts_sel = pd.DataFrame.sparse.from_spmatrix(
    X_sel,
    index = df_genes_sel['ENSEMBL_ID'],
    columns = barcode_list,
)

# Report the frame, its type, and the fraction of entries that are stored
# (non-fill) values.
print (  ' df_counts_sel:', df_counts_sel)
print (  '\n type( df_counts_sel )' , type ( df_counts_sel ))
print (  '\n df_counts_sel.sparse.density:  ', df_counts_sel.sparse.density)

 df_counts_sel:                     AAACCCAAGCGAGTCA-1  AAACCCAAGGTCATTC-1  ...  \
ENSEMBL_ID                                                  ...   
ENSMUSG00000051951                   0                   0  ...   
ENSMUSG00000102343                   0                   0  ...   
ENSMUSG00000025900                   0                   0  ...   
ENSMUSG00000025902                   0                   0  ...   
ENSMUSG00000033845                   0                   1  ...   
...                                ...                 ...  ...   
ENSMUSG00000079800                   0                   0  ...   
ENSMUSG00000079808                   0                   0  ...   
ENSMUSG00000095041                   0                   0  ...   
ENSMUSG00000063897                   0                   0  ...   
ENSMUSG00000095742                   0                   0  ...   

                    TTTGTTGTCCCAGGAC-1  TTTGTTGTCTTCGACC-1  
ENSEMBL_ID                                         

In [11]:
# verify that all cells have nonzero total count
# Column sums of the counts frame: one total per barcode.
ser_cell_totals = df_counts_sel.sum ( axis=0 )
totals_summary = ser_cell_totals.describe( percentiles = pctl_list )
print (  ' ser_cell_totals.describe:')
print ( totals_summary )

# Same summary restricted to barcodes whose total is strictly positive; if
# the two summaries match, no barcode has a zero total.
has_counts = ser_cell_totals.gt ( 0 )
ser_cell_totals_GT_0 = ser_cell_totals[ has_counts ]
print (  '\n\n ser_cell_totals_GT_0.describe:' )
print ( ser_cell_totals_GT_0.describe( percentiles = pctl_list ) )

 ser_cell_totals.describe:
count      7713.000000
mean      10932.245170
std       12011.192077
min         500.000000
1%          546.120000
5%          736.600000
10%        1124.200000
25%        2485.000000
50%        6379.000000
75%       15747.000000
90%       27575.400000
95%       35078.800000
96%       37300.800000
97%       39981.840000
98%       43651.840000
99%       50539.680000
99.5%     57875.000000
99.9%     88427.008000
max      139767.000000
dtype: float64


 ser_cell_totals_GT_0.describe:
count      7713.000000
mean      10932.245170
std       12011.192077
min         500.000000
1%          546.120000
5%          736.600000
10%        1124.200000
25%        2485.000000
50%        6379.000000
75%       15747.000000
90%       27575.400000
95%       35078.800000
96%       37300.800000
97%       39981.840000
98%       43651.840000
99%       50539.680000
99.5%     57875.000000
99.9%     88427.008000
max      139767.000000
dtype: float64


In [12]:
# Keep only the columns (barcodes) whose total count is greater than zero.
kept_barcodes = ser_cell_totals_GT_0.index
df_counts_sparse = df_counts_sel[ kept_barcodes ]

print (  ' df_counts_sparse:' )
print (  df_counts_sparse )
print (  '\n type( df_counts_sparse ) ', type ( df_counts_sparse ))
print (  '\n df_counts_sparse.sparse.density: ', df_counts_sparse.sparse.density)

# Recompute the per-barcode totals on the filtered frame and summarize them.
ser_cell_totals = df_counts_sparse.sum ( axis=0 )
totals_summary = ser_cell_totals.describe( percentiles = pctl_list )

print (  '\n\n ser_cell_totals:' )
print (  ser_cell_totals )
print (  '\n ser_cell_totals.describe:')
print ( totals_summary )

 df_counts_sparse:
                    AAACCCAAGCGAGTCA-1  AAACCCAAGGTCATTC-1  ...  \
ENSEMBL_ID                                                  ...   
ENSMUSG00000051951                   0                   0  ...   
ENSMUSG00000102343                   0                   0  ...   
ENSMUSG00000025900                   0                   0  ...   
ENSMUSG00000025902                   0                   0  ...   
ENSMUSG00000033845                   0                   1  ...   
...                                ...                 ...  ...   
ENSMUSG00000079800                   0                   0  ...   
ENSMUSG00000079808                   0                   0  ...   
ENSMUSG00000095041                   0                   0  ...   
ENSMUSG00000063897                   0                   0  ...   
ENSMUSG00000095742                   0                   0  ...   

                    TTTGTTGTCCCAGGAC-1  TTTGTTGTCTTCGACC-1  
ENSEMBL_ID                                      

In [13]:
# Persist the filtered sparse counts frame and the cluster assignments as
# pickles, counts first, at the output paths defined earlier in the notebook.
for frame, destination in (
    ( df_counts_sparse, counts_dsn ),
    ( df_clusters, clusters_out_dsn ),
):
    frame.to_pickle ( destination )