Integrating spatial gene expression and breast tumour morphology via deep learning

ShortName: he2020integrating

Steps for processing the raw data into AnnData:

In [1]:
# 1. Download the raw data from https://data.mendeley.com/datasets/29ntw7sh4r/5

In [166]:
import os

import anndata as ad
import pandas as pd
import scanpy as sc

In [167]:
# Root folder of the downloaded ST data. Every input file in this notebook is
# read relative to it, and the converted .h5ad files are written to its
# "h5ad" subfolder.
data_path = './data/BreastCancer2021'

In [168]:
# read counts
def read_count(count_file):
    """Load one tab-separated count table located under ``data_path``.

    Parameters
    ----------
    count_file : str
        File name of the count matrix, relative to the module-level
        ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The table as parsed by ``pd.read_csv`` with a tab separator; no
        column is renamed or promoted to the index here.
    """
    # progress message, one line per file
    print('reading ' + count_file)
    full_path = f'{data_path}/{count_file}'
    return pd.read_csv(full_path, sep='\t')

In [169]:
# read spot_coordinates
def read_spot_coordinates(spot_coordinates_file):
    """Load a comma-separated spot-coordinate table indexed by ``coord_ID``.

    Parameters
    ----------
    spot_coordinates_file : str
        File name of the coordinate table, relative to the module-level
        ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed table with its ``'Unnamed: 0'`` column renamed to
        ``'coord_ID'`` and used as the index.
    """
    # progress message, one line per file
    print('reading ' + spot_coordinates_file)
    table = pd.read_csv(f'{data_path}/{spot_coordinates_file}')
    return (
        table
        .rename({'Unnamed: 0': 'coord_ID'}, axis='columns')
        .set_index('coord_ID')
    )

In [170]:
# read tumor_annotation
def read_tumor_annotation(tumor_annotation_file):
    """Load a tumor-annotation table and re-label its shifted columns.

    The tab-separated file is parsed and every column's content is moved one
    label to the right (``xcoord`` -> ``cell_ID``, ``ycoord`` -> ``xcoord``,
    ``lab`` -> ``ycoord``, ``tumor`` -> ``lab``, ``Unnamed: 4`` -> ``tumor``).
    Presumably the header in the raw file is offset by one column relative to
    the data -- verify against the raw files.

    Parameters
    ----------
    tumor_annotation_file : str
        File name of the annotation table, relative to the module-level
        ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The re-labelled table indexed by ``coord_ID``, a string of the form
        ``'<xcoord>x<ycoord>'``.
    """
    # progress message, one line per file
    print('reading ' + tumor_annotation_file)
    annotation = pd.read_csv(f'{data_path}/{tumor_annotation_file}', sep='\t')

    # Order matters: each target must be filled before its source column is
    # overwritten by the next step.
    relabel_steps = (
        ('cell_ID', 'xcoord'),
        ('xcoord', 'ycoord'),
        ('ycoord', 'lab'),
        ('lab', 'tumor'),
        ('tumor', 'Unnamed: 4'),
    )
    for target, source in relabel_steps:
        annotation[target] = annotation[source]
    annotation = annotation.drop(columns=['Unnamed: 4'])

    def _coord_id(row):
        # same key format as the spot IDs of the count table
        return 'x'.join((str(row['xcoord']), str(row['ycoord'])))

    annotation['coord_ID'] = annotation.apply(_coord_id, axis=1)
    return annotation.set_index('coord_ID')
# Unnamed: 0 x y

In [171]:
# Read the sample sheet. As the rendered table below shows, each row describes
# one sample (type, patient, replicate) and names its count matrix, histology
# image, spot-coordinate file and tumor-annotation file.
meta_path = 'metadata.csv'
metadata = pd.read_csv(f'{data_path}/{meta_path}')
metadata  # bare expression: displays the table in the notebook

Unnamed: 0,type,patient,replicate,count_matrix,histology_image,spot_coordinates,tumor_annotation
0,HER2_luminal,BC23287,C1,BC23287_C1_stdata.tsv.gz,HE_BT23287_C1.jpg,spots_BT23287_C1.csv.gz,BC23287_C1_Coords.tsv.gz
1,HER2_luminal,BC23287,C2,BC23287_C2_stdata.tsv.gz,HE_BT23287_C2.jpg,spots_BT23287_C2.csv.gz,BC23287_C2_Coords.tsv.gz
2,HER2_luminal,BC23287,D1,BC23287_D1_stdata.tsv.gz,HE_BT23287_D1.jpg,spots_BT23287_D1.csv.gz,BC23287_D1_Coords.tsv.gz
3,HER2_luminal,BC23450,D2,BC23450_D2_stdata.tsv.gz,HE_BT23450_D2.jpg,spots_BT23450_D2.csv.gz,BC23450_D2_Coords.tsv.gz
4,HER2_luminal,BC23450,E1,BC23450_E1_stdata.tsv.gz,HE_BT23450_E1.jpg,spots_BT23450_E1.csv.gz,BC23450_E1_Coords.tsv.gz
...,...,...,...,...,...,...,...
63,TNBC,BC23377,C2,BC23377_C2_stdata.tsv.gz,HE_BT23377_C2.jpg,spots_BT23377_C2.csv.gz,BC23377_C2_Coords.tsv.gz
64,TNBC,BC23377,D1,BC23377_D1_stdata.tsv.gz,HE_BT23377_D1.jpg,spots_BT23377_D1.csv.gz,BC23377_D1_Coords.tsv.gz
65,TNBC,BC23803,D2,BC23803_D2_stdata.tsv.gz,HE_BC23803_D2.jpg,spots_BC23803_D2.csv.gz,BC23803_D2_Coords.tsv.gz
66,TNBC,BC23803,E1,BC23803_E1_stdata.tsv.gz,HE_BC23803_E1.jpg,spots_BC23803_E1.csv.gz,BC23803_E1_Coords.tsv.gz


In [None]:
# Create the output folder up front so that writing the first sample does not
# fail when the folder is missing.
output_dir = f'{data_path}/h5ad'
os.makedirs(output_dir, exist_ok=True)

# Convert every sample listed in the metadata table into its own .h5ad file.
for _, metadata_row in metadata.iterrows():
    count_file = metadata_row['count_matrix']
    spot_coordinates_file = metadata_row['spot_coordinates']
    tumor_annotation_file = metadata_row['tumor_annotation']
    type_patient = metadata_row['type']

    # load the three per-sample tables
    count = read_count(count_file)
    spot_coordinates = read_spot_coordinates(spot_coordinates_file)
    tumor_annotation = read_tumor_annotation(tumor_annotation_file)

    # The first column of the count table holds the spot IDs ("<x>x<y>");
    # every remaining column label is a gene ID.
    count = count.rename({'Unnamed: 0': 'coord_ID'}, axis='columns')
    cell_id_array = count['coord_ID']
    gene_id_array = count.columns[1:]

    # dense spots-by-genes matrix, without the ID column
    count_X = count.iloc[:, 1:].to_numpy(dtype='float')

    # adata
    adata = ad.AnnData(count_X)
    adata.var_names = gene_id_array
    adata.obs_names = cell_id_array

    # Attach per-spot annotations. Left merges keep every spot of the count
    # matrix, in its original order, even when an annotation is missing.
    adata.obs = adata.obs.merge(tumor_annotation[['lab', 'cell_ID', 'tumor']], how='left', on='coord_ID')
    adata.obs = adata.obs.merge(spot_coordinates, how='left', on='coord_ID')
    adata.obs = adata.obs.rename({'X': 'xcoord', 'Y': 'ycoord'}, axis='columns')

    # Spot coordinates parsed from the IDs. str.split yields strings, so they
    # are converted to floats and stored as a plain (n_obs, 2) array instead of
    # a DataFrame of strings.
    coord_parts = [coord_id.split('x') for coord_id in cell_id_array]
    adata.obsm['spatial'] = pd.DataFrame(
        {
            'x': [parts[0] for parts in coord_parts],
            'y': [parts[1] for parts in coord_parts],
        }
    ).astype(float).to_numpy()

    # NOTE(review): the 'staahl2016visualization' prefix does not match this
    # notebook's ShortName (he2020integrating). It is kept unchanged so that
    # existing output file names stay stable -- confirm which one is intended.
    sample_name = '_'.join(count_file.split('_')[:-1])
    write_file_name = 'staahl2016visualization' + '_' + type_patient + '_' + sample_name + '.h5ad'
    adata.write_h5ad(f'{output_dir}/{write_file_name}')
