Slide-seq: A scalable technology for measuring genome-wide expression at high spatial resolution

ShortName: rodriques2019slide

Steps of processing the data from raw to Anndata:

In [1]:
# 1, Download the raw data from https://portals.broadinstitute.org/single_cell/study/slide-seq-study

In [None]:
# Note: the Broad Institute Single Cell Portal hosts raw data from many single-cell and spatial transcriptomics studies.
# You must log in with an account to access and download the datasets.

In [2]:
# This dataset contains more than 100 pucks; we process one of them, puck_180819_11, as an example.

In [None]:
# All pucks of this Slide-seq dataset, grouped by tissue:

In [71]:
# Puck IDs of the Slide-seq dataset, grouped by tissue / condition label.
# Each value is a list of puck ID strings. The lists as written contain
# repeated IDs (e.g. '180413_7' appears three times), and the same ID can be
# listed under more than one label (e.g. '180819_3').
# Entries that are commented out are the ones marked "nodata".
slideseq_puck_tissue_map = {
    'coronal hippocampus':['180413_7','180413_7','180413_7'],
    'coronal cerebellum':['180430_1','180430_6','180430_1','180430_5','180430_6','180430_1',
                         '180430_5','180430_6','180430_1','180430_1','180430_5','180430_6',
                         '180430_6'],
    'kidney':['180528_23','180528_23'],
    'liver':['180803_8','180803_8'],
#     'coronal olfactory bulb':['180430_3','180430_3'], nodata
    'sagittal cerebellum':['180819_9','180819_10','180819_11','180819_12','180819_9','180819_10',
                          '180819_11','180819_12','180819_24','180819_26','180819_30','180821_8',
                          '180821_9','180821_12','180819_12',],
    'sagittal hippocampus':['180528_20','180528_22','180531_13','180531_16','180531_17','180531_18',
                           '180531_19','180531_22','180531_23','180602_15','180602_16','180602_17',
                           '180602_18','180602_20','180602_22','180602_23','180602_24',
                           '180611_1','180611_2','180611_6','180620_4','180531_17','180602_20',
                           '180531_13','180531_22','180602_17','180602_20','180611_6','180611_10',
                           '180611_11','180611_12','180611_13','180611_14','180611_16','180611_3',
                           '180611_4','180611_5','180611_7','180611_8','180611_9','180615_1','180615_10',
                           '180615_11','180615_12','180615_14','180615_16','180615_17','180615_18','180615_20',
                           '180615_21','180615_22','180615_3','180615_4','180615_5','180615_6','180615_7',
                           '180615_8','180618_12','180618_13','180618_14','180618_15','180618_16',
                           '180618_18','180618_20','180618_21','180618_24','180618_3','180618_4','180618_7',
                           '180620_1','180620_3','180620_5'],
    'coronal cortex':['180819_3'],
    'sagittal cortex':['180819_19','180819_6','180819_19','180819_6','180819_5','180819_6','180819_7',
                      '180819_19','180821_3','180819_19','180819_6','180819_5','180819_6'],
    'coronal cortex 2h':['180819_1','180819_2','180819_3','180819_4'],
    # NOTE(review): '180728_15' is the only ID in this map with the 180728 prefix - confirm it is not a typo.
    'sagittal cortex 2h':['180819_13','180819_14','180728_15'],
    'sagittal cortex 3d':['180819_16','180819_18','180819_19','180821_3'],
    'sagittal cortex 2w':['180819_5','180819_6','180819_7','180819_8'],
#     'coronal human cerebellum':['180821_27','180821_27','180821_28'], nodata
    
}

In [72]:
# De-duplicate the puck IDs of every tissue and collect them in one flat list.
# dict.fromkeys keeps the first occurrence of each ID in its original position,
# so the stored lists are identical on every run; list(set(...)) would yield an
# arbitrary order that can change between kernel sessions.
all_used_pucks = []
for key, cur_li in slideseq_puck_tissue_map.items():
    cur_li_unique = list(dict.fromkeys(cur_li))
    # Replacing the value of an existing key does not resize the dict,
    # so this assignment is safe while iterating over it.
    slideseq_puck_tissue_map[key] = cur_li_unique
    print('{0}:{1}'.format(key,len(cur_li_unique)))
    # A puck that is listed under several tissues is appended once per tissue.
    all_used_pucks.extend(cur_li_unique)

coronal hippocampus:1
coronal cerebellum:3
kidney:1
liver:1
sagittal cerebellum:10
sagittal hippocampus:65
coronal cortex:1
sagittal cortex:5
coronal cortex 2h:4
sagittal cortex 2h:3
sagittal cortex 3d:4
sagittal cortex 2w:4


In [2]:
import scanpy as sc
import pandas as pd
import anndata as ad
# Folder holding the downloaded puck and the names of the two CSV files in it.
data_path = './data/Slide-seq_Mouse_Cerebellum/'
count_file = 'MappedDGEForR.csv'
meta_file = 'BeadLocationsForR.csv'

# Resolve both file locations first, then load each table with pandas:
# the expression counts and the per-bead location table.
count_csv = f'{data_path}/{count_file}'
meta_csv = f'{data_path}/{meta_file}'
count0 = pd.read_csv(count_csv)
meta = pd.read_csv(meta_csv)

In [3]:
# Preview the raw count table: the 'Row' column holds the gene names and
# every other column is one bead barcode.
count0

Unnamed: 0,Row,CGTAGGGGGAGCCG,GGTTCCAATTTCAC,GTCGGTCTGGGAGG,AAAAAAAAAAAAAA,CCAGTGGTTTTTTT,CGACTACTTTTTTT,GTACAAATTTTTTT,ACACCATACCCCCC,CTCAACTACCCCCC,...,GTATGATTTATCGC,AGCGAGATTTATTA,GAATCGATGCGCTA,GAGGACTTGCGCGC,GCCAAACTCGCGCG,GTTCACGTCGCGAN,GTGCTCGTCGCGCG,GAGAGCCTCGCGTA,CTAAAAGTCGCGCG,CTTACGATATATAG
0,0610005C13Rik,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0610007P14Rik,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0610009B22Rik,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0610009E02Rik,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0610009L18Rik,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19125,n-R5s33,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19126,n-R5s45,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19127,n-R5s54,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19128,n-R5s93,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:

# Build the count matrix with one row per bead and one column per gene.
# The leading 'Row' (gene name) column is dropped *before* converting to NumPy:
# calling .values on the whole frame mixes strings with numbers and
# materialises a very large object-dtype array.
count = count0.iloc[:, 1:].to_numpy().T

In [4]:
# extract cell id: the bead barcodes are all column labels after the leading 'Row' column
cell_id_array = count0.columns[1:]
cell_id_array

Index(['CGTAGGGGGAGCCG', 'GGTTCCAATTTCAC', 'GTCGGTCTGGGAGG', 'AAAAAAAAAAAAAA',
       'CCAGTGGTTTTTTT', 'CGACTACTTTTTTT', 'GTACAAATTTTTTT', 'ACACCATACCCCCC',
       'CTCAACTACCCCCC', 'CTGGCTTAGGGGGG',
       ...
       'GTATGATTTATCGC', 'AGCGAGATTTATTA', 'GAATCGATGCGCTA', 'GAGGACTTGCGCGC',
       'GCCAAACTCGCGCG', 'GTTCACGTCGCGAN', 'GTGCTCGTCGCGCG', 'GAGAGCCTCGCGTA',
       'CTAAAAGTCGCGCG', 'CTTACGATATATAG'],
      dtype='object', length=29275)

In [5]:
# extract geneID: the 'Row' column holds one gene name per row of the count table
# (note: this is a pandas Series, not a NumPy array, despite the variable name)
gene_id_array = count0['Row']
gene_id_array

0        0610005C13Rik
1        0610007P14Rik
2        0610009B22Rik
3        0610009E02Rik
4        0610009L18Rik
             ...      
19125          n-R5s33
19126          n-R5s45
19127          n-R5s54
19128          n-R5s93
19129          n-R5s98
Name: Row, Length: 19130, dtype: object

In [7]:
# get count_matrix: cast the counts to floating point before building the AnnData object
count_X = count.astype('float')

In [8]:
# Assemble the AnnData object from the float count matrix
# (observations = beads, variables = genes).
adata = ad.AnnData(count_X)
# Label both axes in one step: barcodes name the observations,
# gene names name the variables.
adata.obs_names, adata.var_names = cell_id_array, gene_id_array

  adata = ad.AnnData(count_X)


In [9]:
# Re-read the bead-location table (it was already loaded together with the counts)
# and preview it: one row per barcode with its x / y coordinates.
meta = pd.read_csv(f'{data_path}/{meta_file}')
meta

Unnamed: 0,barcodes,xcoord,ycoord
0,CGTAGGGGGAGCCG,1705.258427,3916.898876
1,GGTTCCAATTTCAC,1627.118812,3745.940594
2,GTCGGTCTGGGAGG,1032.081395,2442.790698
3,AAAAAAAAAAAAAA,3688.486486,5990.315315
4,CCAGTGGTTTTTTT,3727.415094,5245.886792
...,...,...,...
29270,GTTCACGTCGCGAN,1059.189189,1525.945946
29271,GTGCTCGTCGCGCG,2732.390533,3646.325444
29272,GAGAGCCTCGCGTA,1958.486486,2828.774775
29273,CTAAAAGTCGCGCG,1253.877358,1249.839623


In [10]:
# set obs index: key the bead-location table by barcode so it can be looked up
# with adata.obs_names.
# The guard keeps this cell re-runnable: once 'barcodes' has become the index it
# is no longer a column, and a second set_index('barcodes') would raise KeyError.
if meta.index.name != 'barcodes':
    meta = meta.set_index('barcodes')

In [11]:
# Attach the bead coordinates, reordered to match adata.obs_names.
# They are stored as a plain two-column NumPy array (x, y) rather than a
# DataFrame, because scanpy-style tools index obsm['spatial'] positionally
# (e.g. obsm['spatial'][:, 0]), which does not work on a DataFrame.
adata.obsm['spatial'] = meta.loc[adata.obs_names, ['xcoord', 'ycoord']].to_numpy()

In [12]:
# Show the summary of the assembled AnnData object (dimensions and stored slots).
adata

AnnData object with n_obs × n_vars = 29275 × 19130
    obsm: 'spatial'

In [None]:
# Save the AnnData object of this puck as an .h5ad file in the data folder.
adata.write_h5ad(f'{data_path}/puck_180819_11.h5ad')

In [3]:
# The data has now been transformed from the raw files to AnnData; next it must be processed so that it can be accepted into SODB.

In [None]:
# Run ShortName.ipynb in Anndata2SODB path