Spatial maps of prostate cancer transcriptomes reveal an unexplored landscape of heterogeneity

ShortName: berglund2018spatial

Steps for processing the raw data into an AnnData object:

In [1]:
# 1. Download the raw data from https://www.spatialresearch.org/

In [33]:
import scanpy as sc
import pandas as pd
import anndata as ad

In [34]:
# Directory containing the ST data files (relative to the notebook's working directory)
data_path = './data/prostate-twelve'

In [35]:
# File name of the count matrix to process (located inside data_path)
count_file = 'Patient3_D1.tsv'

In [36]:
# Load the tab-separated count table into a pandas DataFrame
count_path = f'{data_path}/{count_file}'
count = pd.read_csv(count_path, sep='\t')

In [37]:

# Display the loaded count table: one row per gene, one column per spot
count

Unnamed: 0,gene,23.1856078693x21.9329401597,23.1536210815x22.9098859532,20.9224211214x7.02217695608,20.9462830579x8.89623319645,20.9286734588x7.9887500626,20.0966052897x20.9276711279,20.0861670205x21.9316216982,17.8906218507x20.0120916928,17.9476472043x20.9689026574,...,12.9784613468x14.1143026647,13.005456092x15.0615896526,12.9620509533x12.0569368472,12.9618902624x13.0500423475,12.9636658306x10.0114279727,12.9658451181x11.0242353939,18.8844858483x6.01853955335,18.8598244413x7.10513937245,18.8959360718x8.04734923072,24.0873893339x12.0461799378
0,5_8S_rRNA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7SK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A1BG-AS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A2M,3.0,1.0,6.0,0.0,1.0,0.0,0.0,1.0,3.0,...,2.0,10.0,1.0,10.0,10.0,1.0,3.0,4.0,1.0,0.0
4,A2M-AS1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,3.0,1.0,1.0,2.0,0.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16647,ZYX,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0
16648,ZZEF1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16649,ZZZ3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16650,bP-2171C21.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# get cell_ID: the first column is 'gene'; every remaining column header is a
# spot ID of the form '<x>x<y>'
cell_id_array = count.columns[1:]
cell_id_array

Index(['23.1856078693x21.9329401597', '23.1536210815x22.9098859532',
       '20.9224211214x7.02217695608', '20.9462830579x8.89623319645',
       '20.9286734588x7.9887500626', '20.0966052897x20.9276711279',
       '20.0861670205x21.9316216982', '17.8906218507x20.0120916928',
       '17.9476472043x20.9689026574', '24.0902366175x21.0129592736',
       ...
       '12.9784613468x14.1143026647', '13.005456092x15.0615896526',
       '12.9620509533x12.0569368472', '12.9618902624x13.0500423475',
       '12.9636658306x10.0114279727', '12.9658451181x11.0242353939',
       '18.8844858483x6.01853955335', '18.8598244413x7.10513937245',
       '18.8959360718x8.04734923072', '24.0873893339x12.0461799378'],
      dtype='object', length=244)

In [39]:
# Gene identifiers come from the 'gene' column (one entry per row of the table)
gene_id_array = count['gene']

In [40]:
# Inspect the extracted gene names
gene_id_array

0           5_8S_rRNA
1                 7SK
2            A1BG-AS1
3                 A2M
4             A2M-AS1
             ...     
16647             ZYX
16648           ZZEF1
16649            ZZZ3
16650    bP-2171C21.4
16651          snoU13
Name: gene, Length: 16652, dtype: object

In [None]:
# Build the expression matrix: drop the leading 'gene' column, then transpose
# so that rows are spots and columns are genes, stored as floats
gene_by_spot = count.values[:, 1:]
count_X = gene_by_spot.T.astype('float')

In [None]:
# Wrap the spot-by-gene matrix in an AnnData object and label both axes:
# observations (rows) are spots, variables (columns) are genes
adata = ad.AnnData(X=count_X)
adata.obs_names = cell_id_array
adata.var_names = gene_id_array

In [None]:
# Inspect the observation (spot) names now attached to adata
adata.obs_names

Index(['8.79018373843x9.86403173195', '30.028782445x25.0004565341',
       '23.1319641246x21.9247835954', '30.0324472997x19.9390634491',
       '20.8494646991x8.86096705525', '20.8542118779x7.77317160819',
       '30.0001824406x16.9256114589', '30.0601936955x20.8760559214',
       '20.0162669994x20.8243039092', '10.7341388378x7.84235752839',
       ...
       '12.9538079555x18.9240619832', '12.9193649791x16.953932826',
       '12.9338253827x11.9422939622', '12.8917183138x12.9107360181',
       '12.8837265646x9.91596170841', '12.8643695108x10.9288243073',
       '18.8286904063x7.82204982683', '18.8284643501x8.90744539265',
       '15.9258743319x23.9563025192', '30.0358009089x17.877721667'],
      dtype='object', length=506)

In [None]:
# get spot coordinate
# Each spot ID has the form '<x>x<y>'. Split once on the separator and convert
# both halves to float so the coordinates are numeric rather than strings
# (str.split alone would leave object-dtype text columns).
coord_parts = pd.Series(list(cell_id_array), dtype='object').str.split('x', n=1, expand=True)
spot_coordinate = pd.DataFrame(
    {
        'x': coord_parts[0].astype(float).to_numpy(),
        'y': coord_parts[1].astype(float).to_numpy(),
    },
    index=pd.Index(cell_id_array, name='coord_ID'),
)
spot_coordinate

Unnamed: 0_level_0,x,y
coord_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
8.79018373843x9.86403173195,8.79018373843,9.86403173195
30.028782445x25.0004565341,30.028782445,25.0004565341
23.1319641246x21.9247835954,23.1319641246,21.9247835954
30.0324472997x19.9390634491,30.0324472997,19.9390634491
20.8494646991x8.86096705525,20.8494646991,8.86096705525
...,...,...
12.8643695108x10.9288243073,12.8643695108,10.9288243073
18.8286904063x7.82204982683,18.8286904063,7.82204982683
18.8284643501x8.90744539265,18.8284643501,8.90744539265
15.9258743319x23.9563025192,15.9258743319,23.9563025192


In [None]:
# Store the spot coordinates as a float array of shape (n_obs, 2), ordered to
# match adata.obs_names. The explicit float conversion guarantees numeric
# coordinates even if spot_coordinate holds them as text; a plain ndarray is
# the usual container for obsm['spatial'].
adata.obsm['spatial'] = (
    spot_coordinate.loc[adata.obs_names, ['x', 'y']].astype(float).to_numpy()
)
adata

AnnData object with n_obs × n_vars = 506 × 17630
    obsm: 'spatial'

In [None]:
# Save the AnnData object next to the raw data. The output name is derived
# from count_file (extension stripped) so that processing a different sample
# cannot silently overwrite another sample's .h5ad file.
sample_name = count_file.rsplit('.', 1)[0]
adata.write_h5ad(f'{data_path}/prostate_{sample_name}.h5ad')