In [1]:
import pandas as pd
import numpy  as np 
 
from scipy import sparse

from pathlib import Path

# Limit printed data frames to 5 columns; wider frames are elided with "...".
pd.options.display.max_columns = 5

In [2]:
#### user specified
# Root folder holding the analysis data (machine-specific; edit for your setup).
data_folder = "D:/analyze_Pearson_residuals/"

# Sub-folder of data_folder for this data set.
data_subfolder = "retinal"

# Join with the Path "/" operator instead of string concatenation, so the
# result is correct whether or not data_folder ends with a separator.
data_path = Path(data_folder) / data_subfolder

In [3]:
# input file names
counts_in_txt = "GSE63472_P14Retina_merged_digital_expression.txt"
clusters_in_txt = "retina_clusteridentities.txt"

# output file names
counts_pkl = "counts_sparse.pkl"  # sparse Pandas data frame: cells in columns; genes in rows
clusters_pkl = "clusters.pkl"


# full paths: input data (inside data_path)
counts_in_dsn = data_path.joinpath(counts_in_txt)
clusters_in_dsn = data_path.joinpath(clusters_in_txt)

# full paths: output data (inside data_path)
counts_dsn = data_path.joinpath(counts_pkl)
clusters_dsn = data_path.joinpath(clusters_pkl)

In [4]:
# from   https://github.com/berenslab/rna-seq-tsne/blob/master/rnaseqTools.py
# changed dtype to int  2022 06 04
# return counts, instead of counts.T  2022 12 01
 
 
def sparseload(filename, sep=',', dtype=int, chunksize=1000, index_col=0, droplastcolumns=0):
    """Read a delimited table into a SciPy CSR matrix, one chunk of rows at a time.

    The file is parsed with ``pd.read_csv`` in chunks of ``chunksize`` rows; each
    chunk is cast to ``dtype``, converted to a sparse block, and the blocks are
    stacked vertically. One '.' is printed per chunk as a progress indicator,
    followed by ' done'.

    Parameters
    ----------
    filename : path or str
        Table to read; passed straight to ``pd.read_csv``.
    sep : str
        Field separator.
    dtype : type
        Type the values of every chunk are cast to before sparsification.
    chunksize : int
        Number of rows parsed per chunk.
    index_col : int
        Column holding the row labels.
    droplastcolumns : int
        Number of trailing columns to drop from the result (0 keeps all).
        Values larger than the number of columns drop every column.

    Returns
    -------
    (counts, genes, cells)
        ``counts`` is a CSR matrix in the orientation of the file (file rows are
        matrix rows), ``genes`` is an array of the row labels and ``cells`` an
        array of the column labels.
    """
    genes = []
    sparseblocks = []
    cells = None

    # read_csv opens the path itself, so no separate open() is needed. Using the
    # chunk reader as a context manager closes the file even if a chunk fails
    # to convert.
    with pd.read_csv(filename, chunksize=chunksize, sep=sep, index_col=index_col) as reader:
        for chunk in reader:
            print('.', end='', flush=True)
            if cells is None:
                # Column labels are the same for every chunk; keep the first set.
                cells = np.array(chunk.columns)
            genes.extend(chunk.index)
            sparseblocks.append(sparse.csr_matrix(chunk.values.astype(dtype)))

    # Request CSR explicitly: the default output format of the stacking helpers
    # is not guaranteed, and the column slice below needs a sliceable format.
    counts = sparse.vstack(sparseblocks, format='csr')
    print(' done')

    if droplastcolumns > 0:
        # Clamp at 0 so an oversized request cannot turn into a negative slice end.
        end = max(cells.size - droplastcolumns, 0)
        cells = cells[:end]
        counts = counts[:, :end]

    return (counts, np.array(genes), cells)
 

 
# Percentiles passed to Series.describe() in the summary printouts.
pctl_list = [
    0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90,
    0.95, 0.96, 0.97, 0.98, 0.99, 0.995, 0.999,
]

In [5]:
# Read the tab-delimited cluster assignments; column names are supplied here
# and the barcode column becomes the index.
cluster_column_names = ['Barcode', 'Cluster']
df_clusters_in = pd.read_table(
    clusters_in_dsn,
    delimiter='\t',
    names=cluster_column_names,
).set_index(['Barcode'])

print(' df_clusters_in \n', df_clusters_in)
print('\n\n df_clusters_in.value_counts \n')
print(df_clusters_in['Cluster'].value_counts())

# Barcodes of all cells that have a cluster assignment.
cells_clustered_list = df_clusters_in.index.tolist()

 df_clusters_in 
                  Cluster
Barcode                 
r1_GGCCGCAGTCCG        2
r1_CTTGTGCGGGAA        2
r1_GCGCAACTGCTC        2
r1_GATTGGGAGGCA        2
r1_GTGCCGCCTCTC       25
...                  ...
p1_TCAAAAGCCGGG       24
p1_ATTAAGTTCCAA       34
p1_CTGTCTGAGACC        2
p1_TAACGCGCTCCT       24
p1_ATTCTTGTTCTT       24

[44808 rows x 1 columns]


 df_clusters_in.value_counts 

24    29400
26     2217
25     1868
34     1624
33      849
27      664
30      636
29      591
31      512
28      496
2       432
20      389
17      375
9       350
7       326
32      320
3       289
12      274
22      274
23      264
16      262
21      254
37      252
1       252
11      214
6       211
10      191
8       159
19      127
14      111
36       85
18       83
5        77
15       73
4        73
39       67
38       63
35       54
13       50
Name: Cluster, dtype: int64


In [6]:
# Load the tab-separated expression table as a sparse matrix plus its labels.
counts, genes, cells = sparseload(counts_in_dsn, sep='\t')

# Wrap the matrix in a sparse data frame: file row labels as index,
# file column labels as columns.
df_counts_in_sparse = pd.DataFrame.sparse.from_spmatrix(
    counts,
    index=genes,
    columns=cells,
)
print(' df_counts_in_sparse:\n', df_counts_in_sparse)
print('\n type( df_counts_in_sparse )', type(df_counts_in_sparse))
print('\n df_counts_in_sparse.sparse.density:  ', df_counts_in_sparse.sparse.density)

......................... done
 df_counts_in_sparse:
                r1_GGCCGCAGTCCG  r1_CTTGTGCGGGAA  ...  p1_TAACGCGCTCCT  \
KITL                         0                0  ...                0   
TMTC3                        3                0  ...                0   
CEP290                       1                3  ...                0   
4930430F08RIK                2                1  ...                0   
1700017N19RIK                0                0  ...                0   
...                        ...              ...  ...              ...   
VSIG1                        0                0  ...                0   
GM16390                      0                0  ...                0   
GM25207                      0                0  ...                0   
1110059M19RIK                0                0  ...                0   
GM20861                      0                0  ...                0   

               p1_ATTCTTGTTCTT  
KITL                         0  
TMT

In [7]:
# Replicate tags to keep; a cell is selected when its name contains any of them.
replicates_selected = ['p1', 'r4', 'r5', 'r6']


def _in_selected_replicate(cell_name):
    """Return True when cell_name contains one of the selected replicate tags."""
    for tag in replicates_selected:
        if tag in cell_name:
            return True
    return False


cell_list = list(cells)
cell_list_select = [cell for cell in cell_list if _in_selected_replicate(cell)]

df_counts_select_replicates = df_counts_in_sparse[cell_list_select]
print(' df_counts_select_replicates:\n', df_counts_select_replicates)
print('\n type( df_counts_select_replicates )', type(df_counts_select_replicates))
print('\n df_counts_select_replicates.sparse.density:  ', df_counts_select_replicates.sparse.density)

# The full input frame is no longer needed; drop the reference to free memory.
del df_counts_in_sparse

 df_counts_select_replicates:
                r4_AACATTGAATTC  r4_TGCCACATGGGC  ...  p1_TAACGCGCTCCT  \
KITL                         0                0  ...                0   
TMTC3                        3               10  ...                0   
CEP290                      18               24  ...                0   
4930430F08RIK                8                1  ...                0   
1700017N19RIK                0                0  ...                0   
...                        ...              ...  ...              ...   
VSIG1                        0                0  ...                0   
GM16390                      0                0  ...                0   
GM25207                      0                0  ...                0   
1110059M19RIK                0                0  ...                0   
GM20861                      0                0  ...                0   

               p1_ATTCTTGTTCTT  
KITL                         0  
TMTC3                     

In [8]:
# Keep only the selected cells that also have a cluster assignment.
cells_select_list = df_counts_select_replicates.columns.values.tolist()

# Membership is tested against a set: scanning the list for every selected
# cell is quadratic in the number of cells, while a set lookup is constant time.
# Iterating cells_select_list keeps the original column order.
cells_clustered_set = set(cells_clustered_list)
cells_select_clustered = [cell for cell in cells_select_list if cell in cells_clustered_set]
print ( 'len ( cells_select_clustered ):  ',  len ( cells_select_clustered ) )

df_counts_select_clustered = df_counts_select_replicates [ cells_select_clustered ]
print (  '\n\n df_counts_select_clustered:\n', df_counts_select_clustered)

# The replicate-level frame is no longer needed; drop the reference to free memory.
del ( df_counts_select_replicates )

len ( cells_select_clustered ):   24769


 df_counts_select_clustered:
                r4_TGCCACATGGGC  r4_ATCGGCTACCGA  ...  p1_TAACGCGCTCCT  \
KITL                         0                2  ...                0   
TMTC3                       10                0  ...                0   
CEP290                      24                8  ...                0   
4930430F08RIK                1                1  ...                0   
1700017N19RIK                0                1  ...                0   
...                        ...              ...  ...              ...   
VSIG1                        0                0  ...                0   
GM16390                      0                0  ...                0   
GM25207                      0                0  ...                0   
1110059M19RIK                0                0  ...                0   
GM20861                      0                0  ...                0   

               p1_ATTCTTGTTCTT  
KITL              

In [9]:
# Drop genes that are not detected in any of the selected cells.
# The boolean "count > 0" frame is cast to 0/1 so that the row sum is the
# number of cells in which each gene is nonzero.
df_counts_GT_0 = (df_counts_select_clustered > 0).astype(int)
ser_gene_nonzero_counts = df_counts_GT_0.sum(axis=1)
print(' ser_gene_nonzero_counts.describe:')
print(ser_gene_nonzero_counts.describe(percentiles=pctl_list))

gene_detected_mask = ser_gene_nonzero_counts > 0
df_counts_sel = df_counts_select_clustered.loc[gene_detected_mask]
print('\n\n df_counts_sel: \n', df_counts_sel)
print('\n type( df_counts_sel )', type(df_counts_sel))
print('\n df_counts_sel.sparse.density:  ', df_counts_sel.sparse.density)

 ser_gene_nonzero_counts.describe:
count    24658.000000
mean       791.160110
std       1666.723234
min          0.000000
1%           0.000000
5%           0.000000
10%          1.000000
25%          3.000000
50%         95.000000
75%        886.000000
90%       2271.300000
95%       3675.000000
96%       4211.160000
97%       4875.000000
98%       6118.000000
99%       7969.160000
99.5%    10384.750000
99.9%    16738.552000
max      24143.000000
dtype: float64


 df_counts_sel: 
                r4_TGCCACATGGGC  r4_ATCGGCTACCGA  ...  p1_TAACGCGCTCCT  \
KITL                         0                2  ...                0   
TMTC3                       10                0  ...                0   
CEP290                      24                8  ...                0   
4930430F08RIK                1                1  ...                0   
1700017N19RIK                0                1  ...                0   
...                        ...              ...  ...              ...   
G

In [10]:
# Check that every cell has a nonzero total count: the two summaries printed
# below are identical when no cell has a zero total.
ser_cell_totals = df_counts_sel.sum(axis=0)
print(' ser_cell_totals.describe:')
print(ser_cell_totals.describe(percentiles=pctl_list))

cell_total_positive_mask = ser_cell_totals > 0
ser_cell_totals_GT_0 = ser_cell_totals.loc[cell_total_positive_mask]
print('\n ser_cell_totals_GT_0.describe:')
print(ser_cell_totals_GT_0.describe(percentiles=pctl_list))

 ser_cell_totals.describe:
count    24769.000000
mean      1336.238766
std       1744.835097
min        294.000000
1%         306.000000
5%         356.000000
10%        395.000000
25%        496.000000
50%        817.000000
75%       1486.000000
90%       2735.000000
95%       3959.200000
96%       4408.280000
97%       5112.680000
98%       6163.800000
99%       8296.880000
99.5%    10738.280000
99.9%    19615.912000
max      50598.000000
dtype: float64

 ser_cell_totals_GT_0.describe:
count    24769.000000
mean      1336.238766
std       1744.835097
min        294.000000
1%         306.000000
5%         356.000000
10%        395.000000
25%        496.000000
50%        817.000000
75%       1486.000000
90%       2735.000000
95%       3959.200000
96%       4408.280000
97%       5112.680000
98%       6163.800000
99%       8296.880000
99.5%    10738.280000
99.9%    19615.912000
max      50598.000000
dtype: float64


In [11]:
# Final counts frame: only the cells whose total count is greater than zero.
cells_with_counts = ser_cell_totals_GT_0.index
df_counts_sparse = df_counts_sel[cells_with_counts]
print(' df_counts_sparse:')
print(df_counts_sparse)
print('\n type( df_counts_sparse ) ', type(df_counts_sparse))
print('\n df_counts_sparse.sparse.density: ', df_counts_sparse.sparse.density)

# Recompute the per-cell totals on the final frame (rebinds ser_cell_totals).
ser_cell_totals = df_counts_sparse.sum(axis=0)
print('\n ser_cell_totals:')
print(ser_cell_totals)
print('\n ser_cell_totals.describe:')
print(ser_cell_totals.describe(percentiles=pctl_list))

 df_counts_sparse:
               r4_TGCCACATGGGC  r4_ATCGGCTACCGA  ...  p1_TAACGCGCTCCT  \
KITL                         0                2  ...                0   
TMTC3                       10                0  ...                0   
CEP290                      24                8  ...                0   
4930430F08RIK                1                1  ...                0   
1700017N19RIK                0                1  ...                0   
...                        ...              ...  ...              ...   
GM23791                      0                0  ...                0   
GM26101                      0                0  ...                0   
4930513O06RIK                0                0  ...                0   
VSIG1                        0                0  ...                0   
GM25207                      0                0  ...                0   

               p1_ATTCTTGTTCTT  
KITL                         0  
TMTC3                        0  
CEP29

In [12]:
# Cluster labels for exactly the cells in the final counts frame, in column order.
cells_final = df_counts_sparse.columns.values.tolist()
df_clusters = df_clusters_in.loc[cells_final]
print(' df_clusters \n', df_clusters)
print('\n\n df_clusters.value_counts \n')
print(df_clusters['Cluster'].value_counts())

 df_clusters 
                  Cluster
Barcode                 
r4_TGCCACATGGGC       24
r4_ATCGGCTACCGA        2
r4_TAGATATCTTAT        2
r4_CCTGGATTGTAC        2
r4_CGGTAATAGGAA        2
...                  ...
p1_TCAAAAGCCGGG       24
p1_ATTAAGTTCCAA       34
p1_CTGTCTGAGACC        2
p1_TAACGCGCTCCT       24
p1_ATTCTTGTTCTT       24

[24769 rows x 1 columns]


 df_clusters.value_counts 

24    15709
26     1321
25     1159
34      836
33      507
27      364
30      350
29      337
28      293
31      283
20      257
2       238
17      237
9       231
7       202
32      196
3       186
16      177
23      173
12      170
22      160
37      152
21      143
11      133
1       125
10      118
6       118
8       107
19       88
14       76
5        51
15       48
18       43
4        41
39       39
35       32
13       30
38       25
36       14
Name: Cluster, dtype: int64


In [13]:
# Write the final counts frame and the matching cluster labels as pickle files.
for frame_to_save, pickle_path in (
    (df_counts_sparse, counts_dsn),
    (df_clusters, clusters_dsn),
):
    frame_to_save.to_pickle(pickle_path)