In [87]:
import gc

import numpy as np
import scipy
from scipy import sparse as ss
import pandas as pd
import datatable as dt

def read_raw(count_file, cell_file, gene_file):
    df = dt.fread(count_file, skip_to_line=2, header=False).to_pandas()
    print(df.columns)
    cell_df = pd.read_csv(cell_file, header=None, names=['cell'], index_col=0, squeeze=True)
    gene_df = pd.read_csv(gene_file, header=None, names=['gene'], index_col=0, squeeze=True)
    df.columns = ['gene', 'cell', 'count']
    gene_num, cell_num, total_count = df.iloc[0]
    df.drop(index=0, inplace=True)
    X = ss.csr_matrix((df.count, (df.cell, df.gene)), shape=(cell_num, gene_num))

    df = pd.DataFrame.sparse.from_spmatrix(X, index=cell_df, columns=gene_df)
    return df

In [2]:
data_dir = '/home/tiankang/wusuowei/data/single_cell/babel/'
snare_data_dir = data_dir + 'snareseq_GSE126074/'

In [3]:
count_file = 'GSE126074_AdBrainCortex_SNAREseq_cDNA.counts.mtx.gz'
cell_file = 'GSE126074_AdBrainCortex_SNAREseq_cDNA.barcodes.tsv.gz'
gene_file = 'GSE126074_AdBrainCortex_SNAREseq_cDNA.genes.tsv.gz'

In [7]:
df = dt.fread(snare_data_dir + count_file, skip_to_line=2, header=False).to_pandas()

Index(['C0', 'C1', 'C2'], dtype='object')


In [14]:
df.columns = ['gene', 'cell', 'counts']

In [10]:
gene_num, cell_num, total_count = df.iloc[0]
df.drop(index=0, inplace=True)
df

Unnamed: 0,gene,cell,count
1,3,1,1
2,13,1,1
3,17,1,1
4,45,1,1
5,60,1,1
...,...,...,...
9805809,22603,10309,1
9805810,23102,10309,1
9805811,23118,10309,1
9805812,23140,10309,1


In [19]:
df.cell -= 1
df.gene -= 1

In [23]:
X = ss.csr_matrix((df.counts, (df.cell, df.gene)), shape=(cell_num, gene_num))

In [36]:
cell_num, gene_num

(10309, 33160)

In [40]:
cell_df = pd.read_csv(snare_data_dir + cell_file, header=None, names=['cell'], squeeze=True)
gene_df = pd.read_csv(snare_data_dir + gene_file, header=None, names=['gene'], squeeze=True)
gene_df

0        0610005C13Rik
1        0610007P14Rik
2        0610009B22Rik
3        0610009E02Rik
4        0610009L18Rik
             ...      
33155       Vmn2r-ps83
33156          Vmn2r50
33157           Vpreb2
33158          n-R5s32
33159          n-R5s50
Name: gene, Length: 33160, dtype: object

In [41]:
df = pd.DataFrame.sparse.from_spmatrix(X, index=cell_df, columns=gene_df)

In [73]:
df

gene,0610005C13Rik,0610007P14Rik,0610009B22Rik,0610009E02Rik,0610009L18Rik,0610009O20Rik,0610010F05Rik,0610030E20Rik,0610031O16Rik,0610037L13Rik,...,Vmn1r68,Vmn1r82,Vmn2r-ps23,Vmn2r-ps24,Vmn2r-ps45,Vmn2r-ps83,Vmn2r50,Vpreb2,n-R5s32,n-R5s50
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
09A_AAACCAACGCCT,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
09A_AAACGTAGACAC,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
09A_AAAGAATCCGTT,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
09A_AAAGGGATTCCT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
09A_AAAGTAAAAGGA,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
09L_TTTGAACTAAGG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
09L_TTTGCATGCCCA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
09L_TTTTACCGATAT,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
09L_TTTTACGTCTTG,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
df.dtypes

gene
0610005C13Rik    Sparse[int64, 0]
0610007P14Rik    Sparse[int64, 0]
0610009B22Rik    Sparse[int64, 0]
0610009E02Rik    Sparse[int64, 0]
0610009L18Rik    Sparse[int64, 0]
                       ...       
Vmn2r-ps83       Sparse[int32, 0]
Vmn2r50          Sparse[int32, 0]
Vpreb2           Sparse[int32, 0]
n-R5s32          Sparse[int32, 0]
n-R5s50          Sparse[int64, 0]
Length: 33160, dtype: object

In [96]:
df = df.astype('Sparse[int16]')

In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10309 entries, 09A_AAACCAACGCCT to 09L_TTTTCTATTAAG
Columns: 33160 entries, 0610005C13Rik to n-R5s50
dtypes: Sparse[int16, 0](33160)
memory usage: 56.5+ MB


In [103]:
df.sparse.to_coo().tocsr().T

<33160x10309 sparse matrix of type '<class 'numpy.int16'>'
	with 9805813 stored elements in Compressed Sparse Column format>

In [99]:
(df+1).info()

<class 'pandas.core.frame.DataFrame'>
Index: 10309 entries, 09A_AAACCAACGCCT to 09L_TTTTCTATTAAG
Columns: 33160 entries, 0610005C13Rik to n-R5s50
dtypes: Sparse[int16, 1](33160)
memory usage: 56.5+ MB


In [64]:
dummy_df = pd.DataFrame([[1,2,3],[3,4,5]])

In [66]:
dummy_df.iloc[[True,False]]

Unnamed: 0,0,1,2
0,1,2,3


In [38]:
gene_num, cell_num, total_count = X.iloc[0]

In [39]:
gene_num

33160

In [None]:
df = pd.DataFrame.sparse.from_spmatrix(sparse.csr_matrix(X.count))

In [35]:
X.columns = ['gene', 'cell', 'count']

In [36]:
X

Unnamed: 0,gene,cell,count
0,3,1,1
1,13,1,1
2,17,1,1
3,45,1,1
4,60,1,1
...,...,...,...
9805808,22603,10309,1
9805809,23102,10309,1
9805810,23118,10309,1
9805811,23140,10309,1


In [16]:
gene = X.iloc[1:, 0]
cell = X.iloc[1:, 1]
count = X.iloc[1:, 2]

In [17]:
gene

1              3
2             13
3             17
4             45
5             60
           ...  
9805809    22603
9805810    23102
9805811    23118
9805812    23140
9805813    26520
Name: %%MatrixMarket, Length: 9805813, dtype: int32

In [13]:
pd.read_csv(snare_data_dir + 'GSE126074_AdBrainCortex_SNAREseq_cDNA.barcodes.tsv.gz')

Unnamed: 0,09A_CAGCCCCGCCTT
0,09A_CGCCTACCATGA
1,09A_GATGCGCGGCTA
2,09A_GGTCCGAGTCCT
3,09A_TCTCCCGGCACC
4,09A_ACCAAATCTTGT
...,...
10303,09L_TACTAGTTCAAG
10304,09L_ATGACGGGCCCC
10305,09L_GAAACACCTCAT
10306,09L_AACGGTTTATCC
