Integrating microarray-based spatial transcriptomics and single-cell RNA-seq reveals tissue architecture in pancreatic ductal adenocarcinomas

ShortName: moncada2020integrating

Steps of processing the data from raw to Anndata:

In [1]:
# 1, Download the raw data from GSE111672

In [47]:
import os
import scanpy as sc
import pandas as pd
import anndata as ad
from glob import glob

In [48]:
# data path of the ST data
data_path = './data/ST_PancreaticCancer'

In [49]:
# get count_file
count_file_list = glob(os.path.join(data_path, '*.tsv.gz'))
count_file_list

['./data/ST_PancreaticCancer/GSM4100723_PDAC-B-st2.tsv.gz',
 './data/ST_PancreaticCancer/GSM3405530_PDAC-A-indrop6.tsv.gz',
 './data/ST_PancreaticCancer/GSM3405534_PDAC-B-ST1.tsv.gz',
 './data/ST_PancreaticCancer/GSM4100726_PDAC-E-st1.tsv.gz',
 './data/ST_PancreaticCancer/GSM3405529_PDAC-A-indrop5.tsv.gz',
 './data/ST_PancreaticCancer/GSM4100728_PDAC-G-st1.tsv.gz',
 './data/ST_PancreaticCancer/GSM4100725_PDAC-D-st1.tsv.gz',
 './data/ST_PancreaticCancer/GSM3405527_PDAC-A-indrop3.tsv.gz',
 './data/ST_PancreaticCancer/GSM4100722_PDAC-A-st3.tsv.gz',
 './data/ST_PancreaticCancer/GSM4100717_PDAC-C-indrop1.tsv.gz',
 './data/ST_PancreaticCancer/GSM3405531_PDAC-B-indrop1.tsv.gz',
 './data/ST_PancreaticCancer/GSM3036909.tsv.gz',
 './data/ST_PancreaticCancer/GSM4100720_PDAC-C-indrop4.tsv.gz',
 './data/ST_PancreaticCancer/GSM3405533_PDAC-B-indrop3.tsv.gz',
 './data/ST_PancreaticCancer/GSM3405532_PDAC-B-indrop2.tsv.gz',
 './data/ST_PancreaticCancer/GSM4100718_PDAC-C-indrop2.tsv.gz',
 './data/ST_Pan

In [147]:
# read tsv using pandas
count_file = 'GSM4100724_PDAC-B-st3.tsv.gz'
count = pd.read_csv(f'{data_path}/{count_file}', sep='\t')

In [148]:
count = count.fillna(0)
count

Unnamed: 0,Genes,10x13,10x14,10x15,10x16,10x17,10x19,10x20,10x21,10x22,...,9x17,9x18,9x19,9x20,9x21,9x22,9x23,9x24,9x25,9x26
0,A1BG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A1CF,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A2M,0,2,3,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,A2ML1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,A3GALT2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19733,ZYG11A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19734,ZYG11B,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19735,ZYX,3,11,4,0,0,0,0,0,0,...,1,1,0,0,0,0,1,0,0,0
19736,ZZEF1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [149]:
# get cell_ID
cell_id_array = count.columns[1:]
cell_id_array

Index(['10x13', '10x14', '10x15', '10x16', '10x17', '10x19', '10x20', '10x21',
       '10x22', '10x23',
       ...
       '9x17', '9x18', '9x19', '9x20', '9x21', '9x22', '9x23', '9x24', '9x25',
       '9x26'],
      dtype='object', length=316)

In [150]:
# get geneID
gene_id_array = count.Genes
gene_id_array

0           A1BG
1           A1CF
2            A2M
3          A2ML1
4        A3GALT2
          ...   
19733     ZYG11A
19734     ZYG11B
19735        ZYX
19736      ZZEF1
19737       ZZZ3
Name: Genes, Length: 19738, dtype: object

In [151]:
# get count_matrix
count_X = count.values[:,1:].T.astype('int')
count_X

array([[ 0,  0,  0, ...,  3,  0,  0],
       [ 0,  0,  2, ..., 11,  1,  3],
       [ 0,  0,  3, ...,  4,  0,  1],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]])

In [152]:
# adata
adata = ad.AnnData(count_X)
adata.var_names = gene_id_array
adata.obs_names = cell_id_array

In [153]:
adata

AnnData object with n_obs × n_vars = 316 × 19738

In [154]:
# get spot coordinate
spot_coordinate = pd.DataFrame()
spot_coordinate['coord_ID'] = cell_id_array
spot_coordinate['x'] = spot_coordinate['coord_ID'].apply(lambda x: x.split('x')[0])
spot_coordinate['y'] = spot_coordinate['coord_ID'].apply(lambda x: x.split('x')[1])
spot_coordinate = spot_coordinate.set_index('coord_ID')
spot_coordinate


Unnamed: 0_level_0,x,y
coord_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
10x13,10,13
10x14,10,14
10x15,10,15
10x16,10,16
10x17,10,17
...,...,...
9x22,9,22
9x23,9,23
9x24,9,24
9x25,9,25


In [155]:
adata.obsm['spatial'] = spot_coordinate.loc[adata.obs_names].loc[:,['x','y']]
adata

AnnData object with n_obs × n_vars = 316 × 19738
    obsm: 'spatial'

In [156]:
adata.write_h5ad(f'{data_path}/h5ad/GSM4100724_PDAC-B-st3.h5ad')