### Get the dataset from Adams et al in this publication

We'll download the raw data and try getting the raw counts for patients

https://advances.sciencemag.org/content/6/28/eaba1983

It seems that GEO has raw counts and all patient data necessary

https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE136831

In [1]:
from os.path import join
import scanpy as sc
import pandas as pd

In [2]:
d = '../data/adams'
mtx_path = join(d, 'GSE136831_RawCounts_Sparse.mtx.gz')
var_path = join(d, 'GSE136831_AllCells.GeneIDs.txt.gz')
obs_path = join(d, 'GSE136831_AllCells.cellBarcodes.txt.gz')
metadata_path = join(d, 'GSE136831_AllCells.Samples.CellType.MetadataTable.txt.gz')


In [None]:
print('reading mtx...')
ad = sc.read_mtx(mtx_path, dtype='float32')

reading mtx...


In [15]:
ad

AnnData object with n_obs × n_vars = 45947 × 312928

In [41]:
print('reading obs+var...')
obs = pd.read_csv(obs_path, header=None, sep='\t')
var = pd.read_csv(var_path, sep='\t')

metadata = pd.read_csv(metadata_path, sep='\t', index_col=0)

reading obs+var...


In [23]:
# var
var.columns = ['ensembl', 'symbol']
var.index = list(var.ensembl)
# var
ad.obs = var
ad.obs

# ad.obs
# ad.var
obs.index = list(obs[0])
ad.var.index = obs.index

Unnamed: 0,ensembl,symbol
ENSG00000000003,ENSG00000000003,TSPAN6
ENSG00000000005,ENSG00000000005,TNMD
ENSG00000000419,ENSG00000000419,DPM1
ENSG00000000457,ENSG00000000457,SCYL3
ENSG00000000460,ENSG00000000460,C1orf112
...,...,...
ENSG00000227029,ENSG00000227029,ENSG00000227029
ENSG00000239708,ENSG00000239708,RN7SL782P
ENSG00000274532,ENSG00000274532,ENSG00000274532
ENSG00000277705,ENSG00000277705,ENSG00000277705


In [52]:
assert metadata.index.equals(ad.var.index)
for c in metadata:
    ad.var[c] = metadata[c]

Unnamed: 0,nUMI,nGene,CellType_Category,Manuscript_Identity,Subclass_Cell_Identity,Disease_Identity,Subject_Identity,Library_Identity
001C_AAACCTGCATCGGGTC,5477,2150,Myeloid,ncMonocyte,Monocyte_Non-Classical,Control,001C,001C
001C_AAACCTGTCAACACCA,20311,4726,Myeloid,Macrophage_Alveolar,Macrophage_Alveolar,Control,001C,001C
001C_AAACCTGTCACAGTAC,1390,881,Lymphoid,NK,NK,Control,001C,001C
001C_AAACCTGTCTGTCTAT,3968,1943,Myeloid,cMonocyte,Monocyte,Control,001C,001C
001C_AAACGGGAGACTAAGT,3036,1716,Endothelial,Lymphatic,Lymphatic-Endothelial,Control,001C,001C
...,...,...,...,...,...,...,...,...
8CO_TTGTAGGCATCACGTA,7617,2796,Lymphoid,T_Cytotoxic,T_Cytotoxic_A,COPD,8CO,8CO
8CO_TTGTAGGTCCAGATCA,3904,1811,Myeloid,cMonocyte,Monocyte,COPD,8CO,8CO
8CO_TTTATGCGTAGGCTGA,5195,2098,Endothelial,VE_Venous,Vascular-Endothelial_A,COPD,8CO,8CO
8CO_TTTGGTTTCTGCAAGT,24743,4819,Myeloid,Macrophage_Alveolar,Macrophage_Alveolar,COPD,8CO,8CO


In [None]:
for identity in set(ad.var.CellType_Category):
    print(identity)
    sel = ad[:,ad.var.CellType_Category == identity]
    sel = sel.copy().transpose()
    print(sel.shape)
    sel.write('../data/adams/adams_input_scvi_%s.h5ad' % identity, compression='lzf')

Lymphoid


... storing 'CellType_Category' as categorical
... storing 'Manuscript_Identity' as categorical


(55543, 45947)


... storing 'Subclass_Cell_Identity' as categorical
... storing 'Disease_Identity' as categorical
... storing 'Subject_Identity' as categorical
... storing 'Library_Identity' as categorical


Myeloid
(215938, 45947)


... storing 'CellType_Category' as categorical
... storing 'Manuscript_Identity' as categorical
... storing 'Subclass_Cell_Identity' as categorical
... storing 'Disease_Identity' as categorical
... storing 'Subject_Identity' as categorical
... storing 'Library_Identity' as categorical


### Macrophages

In [None]:
identity = 'Macrophage'
sel = ad[:,ad.var.Manuscript_Identity.str.contains(identity)]
sel = sel.copy().transpose()
print(sel.shape)
sel.write('../data/adams/adams_input_scvi_%s.h5ad' % identity, compression='lzf')

### All

In [None]:
ad_all = ad.copy().transpose()
print(ad_all.shape)
ad_all.write('../data/adams/adams_input_scvi_all.h5ad', compression='lzf')

In [68]:
!ls -ltrh ../data/adams/

total 11G
-rw-r--r--. 1 ignacio.ibarra OG-ICB-User 2.1G May  8 20:29 GSE136831_RawCounts_Sparse.mtx.gz
-rw-r--r--. 1 ignacio.ibarra OG-ICB-User 317K May  8 23:03 GSE136831_AllCells.GeneIDs.txt.gz
-rw-r--r--. 1 ignacio.ibarra OG-ICB-User 4.2M May  8 23:03 GSE136831_AllCells.Samples.CellType.MetadataTable.txt.gz
-rw-r--r--. 1 ignacio.ibarra OG-ICB-User 1.4M May  8 23:03 GSE136831_AllCells.cellBarcodes.txt.gz
-rw-r--r--. 1 ignacio.ibarra OG-ICB-User 329M Oct 18 09:38 adams_input_scvi_Lymphoid.h5ad
-rw-r--r--. 1 ignacio.ibarra OG-ICB-User 2.4G Oct 18 09:39 adams_input_scvi_Myeloid.h5ad
-rw-r--r--. 1 ignacio.ibarra OG-ICB-User  95M Oct 18 09:39 adams_input_scvi_Stromal.h5ad
-rw-r--r--. 1 ignacio.ibarra OG-ICB-User  80M Oct 18 09:40 adams_input_scvi_Endothelial.h5ad
-rw-r--r--. 1 ignacio.ibarra OG-ICB-User  85M Oct 18 09:40 adams_input_scvi_Multiplet.h5ad
-rw-r--r--. 1 ignacio.ibarra OG-ICB-User 321M Oct 18 09:40 adams_input_scvi_Epithelial.h5ad
-rw-r--r--. 1 ignacio.ibarra OG-ICB-User 3.3G 