In [29]:
from glob import glob

import anndata
import pandas as pd
from tqdm import tqdm

In [38]:
# Load every sample found under GSE213516_RAW/ into its own AnnData object.
# Each sample is a triplet of sibling files that share a common prefix:
#   <prefix>_matrix.mtx.gz, <prefix>_barcodes.tsv.gz, <prefix>_features.tsv.gz
adatas = []

# glob() yields paths in arbitrary, filesystem-dependent order. Sorting makes
# the position of each sample in `adatas` reproducible across runs and
# machines, which matters for anything derived from that position later on.
for fname in tqdm(sorted(glob('GSE213516_RAW/*_matrix.mtx.gz'))):
    # Common path prefix of the three files belonging to this sample.
    prefix = fname.replace('_matrix.mtx.gz', '')

    # Transpose so that rows line up with the barcodes table and columns with
    # the features table that are attached below.
    adata = anndata.read_mtx(fname).T

    # One barcode per line, no header row.
    obs = (
        pd.read_csv(
            prefix + '_barcodes.tsv.gz',
            names = ['barcode']
        )
        .set_index('barcode')
    )

    # Tab-separated, no header row; indexed by gene name.
    var = (
        pd.read_csv(
            prefix + '_features.tsv.gz',
            sep = '\t',
            names = ['gene_id', 'gene_name', 'feature_type']
        )
        .set_index('gene_name')
    )

    adata.obs = obs
    adata.var = var

    # Index labels may repeat; de-duplicate them before the objects are
    # combined.
    adata.var_names_make_unique()
    adata.obs_names_make_unique()

    # Tag every row with its sample of origin. Note that the value keeps the
    # directory part of the path (e.g. 'GSE213516_RAW/<sample>').
    adata.obs['sample_name'] = prefix

    adatas.append(adata)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [05:49<00:00, 20.58s/it]


In [42]:
# Stack all per-sample objects into a single AnnData.
alladata = anndata.concat(
    adatas,
    # Suffix each row label with '-<position in adatas>'.
    index_unique='-',
    # Keep only the annotation columns that are identical in every input.
    merge='same',
)

In [43]:
# Show the summary of the combined object (dimensions, obs and var columns).
alladata

AnnData object with n_obs × n_vars = 139761 × 33538
    obs: 'sample_name'
    var: 'gene_id', 'feature_type'

In [44]:
# Inspect the column-level (gene) annotation table of the combined object.
alladata.var

Unnamed: 0_level_0,gene_id,feature_type
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1
MIR1302-2HG,ENSG00000243485,Gene Expression
FAM138A,ENSG00000237613,Gene Expression
OR4F5,ENSG00000186092,Gene Expression
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
...,...,...
AC233755.2,ENSG00000277856,Gene Expression
AC233755.1,ENSG00000275063,Gene Expression
AC240274.1,ENSG00000271254,Gene Expression
AC213203.1,ENSG00000277475,Gene Expression


In [45]:
# Inspect the row-level (barcode) annotation table; the index carries the
# '-<n>' suffix added by index_unique='-' during concatenation.
alladata.obs

Unnamed: 0_level_0,sample_name
barcode,Unnamed: 1_level_1
AAACCTGAGACGCTTT-1-0,GSE213516_RAW/GSM6588511_F30
AAACCTGAGAGACGAA-1-0,GSE213516_RAW/GSM6588511_F30
AAACCTGAGCTACCTA-1-0,GSE213516_RAW/GSM6588511_F30
AAACCTGAGGATGGAA-1-0,GSE213516_RAW/GSM6588511_F30
AAACCTGCAAACTGCT-1-0,GSE213516_RAW/GSM6588511_F30
...,...
TTTGTCATCCTACAGA-1-16,GSE213516_RAW/GSM6588527_M74
TTTGTCATCCTCAACC-1-16,GSE213516_RAW/GSM6588527_M74
TTTGTCATCCTTGGTC-1-16,GSE213516_RAW/GSM6588527_M74
TTTGTCATCTGTGCAA-1-16,GSE213516_RAW/GSM6588527_M74


In [46]:
# Check the storage format, shape and dtype of the data matrix.
alladata.X

<139761x33538 sparse matrix of type '<class 'numpy.float32'>'
	with 226886859 stored elements in Compressed Sparse Row format>

In [47]:
# Peek at the first 100 explicitly stored values of the sparse matrix as a
# quick sanity check of what the entries look like.
alladata.X.data[:100]

array([ 1.,  1.,  1.,  1., 11.,  1.,  1.,  1.,  1.,  2.,  2., 29.,  1.,
        1.,  1.,  1.,  3.,  1., 14.,  1.,  1.,  1.,  1.,  1.,  5.,  1.,
        2.,  1.,  1.,  1.,  1.,  1.,  1.,  3.,  1.,  2.,  1.,  2.,  1.,
        2.,  1.,  1.,  1.,  1., 42.,  1.,  1.,  2.,  1.,  1.,  1.,  1.,
        1.,  2.,  1.,  1.,  1.,  1.,  2.,  1.,  1.,  6.,  1.,  1.,  1.,
        1.,  1.,  1.,  3.,  1.,  5.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  2.,  2.,  1.,  9.,  1.,  1.,  2.,  1.,  1.,  1.,  4., 85.,
        1.,  1.,  1.,  1.,  1.,  2.,  1.,  3.,  1.], dtype=float32)

In [48]:
# Persist the combined object to disk in the working directory.
alladata.write('GSE213516.h5ad')

In [49]:
ls -lh

total 1.8G
-rwxrwxrwx 1 vale vale  12K Aug  8 22:21 [0m[01;32m'230808 Read data.ipynb'[0m*
-rwxrwxrwx 2 vale vale 1.8G Aug  8 22:22  [01;32mGSE213516.h5ad[0m*
drwxrwxrwx 1 vale vale 4.0K Aug  8 21:45  [34;42mGSE213516_RAW[0m/
