Visualization and analysis of gene expression in tissue sections by spatial transcriptomics

ShortName: stahl2016visualization

Steps for processing the data from the raw count matrix into an AnnData object:

In [1]:
# 1. Download the raw data from https://www.spatialresearch.org/resources-published-datasets/

In [278]:
import scanpy as sc
import pandas as pd
import anndata as ad

In [279]:
# Directory holding the spatial transcriptomics (ST) input files; the .h5ad output is written here too
data_path = './data/BreastCancer'

In [280]:
# File name of the tab-separated count matrix, located inside data_path
count_file = 'Layer4_BC_count_matrix-1.tsv'

In [281]:
# Load the tab-separated count matrix into a DataFrame
count_path = f'{data_path}/{count_file}'
count = pd.read_csv(count_path, sep='\t')

In [282]:
count
# The 'Unnamed: 0' column holds each spot's coordinates encoded as '<x>x<y>'

Unnamed: 0.1,Unnamed: 0,MAPKAPK2,SLC39A8,NDUFA7,GJC3,ARPC5L,TMEM109,COL5A1,UBXN11,SHFM1,...,G6PC2,TBX5,TMEM246,PILRB,EFCAB10,STX19,WSCD1,HLA-G,GPR174,SIRT4
0,18.89x8.958,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,17.902x8.962,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,13.029x9.008,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16.824x8.995,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13.948x9.022,0,0,0,0,1,0,1,0,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,20.967x26.926,2,0,0,1,0,1,0,0,2,...,0,0,0,0,0,0,0,0,0,0
258,24.127x26.972,3,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
259,23.062x27.052,1,0,0,0,0,0,3,0,4,...,0,0,0,0,0,0,0,0,0,0
260,21.924x27.865,1,0,0,1,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [283]:
# Label every row with a sequential ID ('BC_0', 'BC_1', ...) and use it as the row index
cell_id_array = [f'BC_{spot_idx}' for spot_idx in range(len(count))]
count = count.assign(cell_ID=cell_id_array).set_index('cell_ID')

In [284]:
# Inspect the generated IDs
cell_id_array

['BC_0',
 'BC_1',
 'BC_2',
 'BC_3',
 'BC_4',
 'BC_5',
 'BC_6',
 'BC_7',
 'BC_8',
 'BC_9',
 'BC_10',
 'BC_11',
 'BC_12',
 'BC_13',
 'BC_14',
 'BC_15',
 'BC_16',
 'BC_17',
 'BC_18',
 'BC_19',
 'BC_20',
 'BC_21',
 'BC_22',
 'BC_23',
 'BC_24',
 'BC_25',
 'BC_26',
 'BC_27',
 'BC_28',
 'BC_29',
 'BC_30',
 'BC_31',
 'BC_32',
 'BC_33',
 'BC_34',
 'BC_35',
 'BC_36',
 'BC_37',
 'BC_38',
 'BC_39',
 'BC_40',
 'BC_41',
 'BC_42',
 'BC_43',
 'BC_44',
 'BC_45',
 'BC_46',
 'BC_47',
 'BC_48',
 'BC_49',
 'BC_50',
 'BC_51',
 'BC_52',
 'BC_53',
 'BC_54',
 'BC_55',
 'BC_56',
 'BC_57',
 'BC_58',
 'BC_59',
 'BC_60',
 'BC_61',
 'BC_62',
 'BC_63',
 'BC_64',
 'BC_65',
 'BC_66',
 'BC_67',
 'BC_68',
 'BC_69',
 'BC_70',
 'BC_71',
 'BC_72',
 'BC_73',
 'BC_74',
 'BC_75',
 'BC_76',
 'BC_77',
 'BC_78',
 'BC_79',
 'BC_80',
 'BC_81',
 'BC_82',
 'BC_83',
 'BC_84',
 'BC_85',
 'BC_86',
 'BC_87',
 'BC_88',
 'BC_89',
 'BC_90',
 'BC_91',
 'BC_92',
 'BC_93',
 'BC_94',
 'BC_95',
 'BC_96',
 'BC_97',
 'BC_98',
 'BC_99',
 'BC_100',

In [285]:
# Collect all column labels of the count table; the first label ('Unnamed: 0')
# is the coordinate column, not a gene
gene_id_array = count.columns

In [286]:
# Inspect the column labels as read from the count table
gene_id_array

Index(['Unnamed: 0', 'MAPKAPK2', 'SLC39A8', 'NDUFA7', 'GJC3', 'ARPC5L',
       'TMEM109', 'COL5A1', 'UBXN11', 'SHFM1',
       ...
       'G6PC2', 'TBX5', 'TMEM246', 'PILRB', 'EFCAB10', 'STX19', 'WSCD1',
       'HLA-G', 'GPR174', 'SIRT4'],
      dtype='object', length=14809)

In [287]:
# Remove the non-gene "Unnamed: 0" label by name instead of by position.
# A positional [1:] slice is not idempotent: running this cell a second time
# would silently discard the first real gene. errors='ignore' makes a re-run a no-op.
gene_id_array = gene_id_array.drop('Unnamed: 0', errors='ignore')

In [288]:
# Inspect the labels that remain after the removal step
gene_id_array

Index(['MAPKAPK2', 'SLC39A8', 'NDUFA7', 'GJC3', 'ARPC5L', 'TMEM109', 'COL5A1',
       'UBXN11', 'SHFM1', 'PAK4',
       ...
       'G6PC2', 'TBX5', 'TMEM246', 'PILRB', 'EFCAB10', 'STX19', 'WSCD1',
       'HLA-G', 'GPR174', 'SIRT4'],
      dtype='object', length=14808)

In [289]:
# Build the integer count matrix from the gene columns only.
# Selecting by label (rather than slicing off the first column by position)
# guarantees the matrix columns line up with gene_id_array and avoids first
# materialising the whole mixed-type table as an object array.
count_X = count[gene_id_array].to_numpy(dtype='int')

In [290]:
# Wrap the count matrix in an AnnData object, then label its observations
# with the generated IDs and its variables with the gene labels
adata = ad.AnnData(X=count_X)
adata.obs_names = cell_id_array
adata.var_names = gene_id_array

In [291]:
# Show the AnnData summary
adata

AnnData object with n_obs × n_vars = 262 × 14808

In [292]:
# Spot coordinates are encoded in 'Unnamed: 0' as '<x>x<y>' (e.g. '18.89x8.958').
# Split them with the vectorised string accessor and store both parts as floats;
# keeping the raw split results would leave the coordinates as text, not numbers.
spot_xy = count['Unnamed: 0'].str.split('x', expand=True)
count['x'] = spot_xy[0].astype(float)
count['y'] = spot_xy[1].astype(float)


In [293]:
# Store the coordinates as a numeric (n_obs, 2) array, row-aligned to adata.obs_names.
# astype(float) parses the values if they are held as strings, and a single .loc
# call selects rows and columns together instead of copying the full table first.
adata.obsm['spatial'] = count.loc[adata.obs_names, ['x', 'y']].astype(float).to_numpy()
adata

AnnData object with n_obs × n_vars = 262 × 14808
    obsm: 'spatial'

In [294]:
# Save the assembled AnnData object as an .h5ad file inside data_path
h5ad_path = f'{data_path}/Layer4_BC.h5ad'
adata.write_h5ad(h5ad_path)