Spatial genomics enables multi-modal study of clonal heterogeneity in tissues

ShortName: zhao2022spatial

Steps for processing the data from raw files to AnnData:

In [1]:
# 1, Download the raw data from https://singlecell.broadinstitute.org/single_cell/study/SCP1278/spatial-genomics-enables-multi-modal-study-of-clonal-heterogeneity-in-tissues

In [3]:
import os
import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad

In [4]:
# NOTE(review): this is an absolute path into a user's home folder and names a
# t-CyCIF directory; `path` is not referenced by any other cell visible in this
# notebook (later cells read from `raw_path` and write to `h5ad_path`).
# Presumably a leftover from another notebook — confirm before removing.
path = '/home/yzy/projects/SODB/SODB_code/SODB/revision/Raw2Anndata/SpatialProteomics/t-CyCIF/data'


In [1]:
from soview import *
import shutil
import os
import gc

scanpy==1.8.2 anndata==0.8.0 umap==0.5.2 numpy==1.20.3 scipy==1.7.3 pandas==1.3.5 scikit-learn==1.0.2 statsmodels==0.13.1 python-igraph==0.9.8 pynndescent==0.5.5
squidpy==1.1.2


In [2]:
import pandas as pd

In [3]:
# Directory the downloaded input files are read from (count tables, bead
# locations and the bin / gene annotation tables are all opened relative to it).
# The path is relative, so it is resolved against the kernel's working directory.
raw_path = 'zhiyuanyuan/PUBDT/st/slideDNAseq/original/data/ft_local/slide-DNA-seq'
# Directory the converted files are written to, one `<sample_name>.h5ad` per sample.
h5ad_path = 'zhiyuanyuan/PUBDT/st/slideDNAseq/original/h5ad'



In [4]:
def get_adata(var_dict,count_file,delimiter,molecule):
    """Build a dense AnnData (beads x features) from a sparse triplet count file.

    Parameters
    ----------
    var_dict : dict
        Maps the 1-based feature index used in the count file to the feature
        name. Keys are expected to be exactly ``1..len(var_dict)``.
    count_file : str
        File name, relative to the module-level ``raw_path``, of a delimited
        table whose first three columns are (index, index, count).
    delimiter : str
        Column separator handed to ``pandas.read_csv``.
    molecule : {'dna', 'rna'}
        Selects the column layout. For ``'dna'`` column 0 is the bead index
        and column 1 the feature index; for ``'rna'`` the two are swapped.

    Returns
    -------
    anndata.AnnData
        ``X`` is a dense array of shape ``(max bead index, len(var_dict))``,
        ``var_names`` are taken from ``var_dict`` and ``obs_names`` are the
        strings ``'0', '1', ...`` (0-based row positions).

    Raises
    ------
    ValueError
        If ``molecule`` is neither ``'dna'`` nor ``'rna'``.
    """
    # Resolve the column layout first so an unsupported molecule fails with a
    # clear message before any file is read.
    if molecule == 'dna':
        cell_col, var_col = 0, 1
    elif molecule == 'rna':
        cell_col, var_col = 1, 0
    else:
        raise ValueError(f"molecule must be 'dna' or 'rna', got {molecule!r}")

    var_len = len(var_dict)
    count_pd = pd.read_csv(f'{raw_path}/{count_file}',delimiter=delimiter)
    count_sparse = count_pd.values

    # Cast the two index columns to integers: if any column of the table is
    # parsed as float, `.values` yields a float array, which cannot be used
    # for array sizes or indices.
    cell_idx = count_sparse[:, cell_col].astype(np.int64)
    var_idx = count_sparse[:, var_col].astype(np.int64)
    counts = count_sparse[:, 2]

    # The number of rows is the largest bead index present in the count file.
    cell_len = int(cell_idx.max())

    # NOTE: the matrix is dense, so memory grows with beads x features.
    count_mat = np.zeros(shape=(cell_len,var_len))
    # File indices are 1-based; shift to 0-based positions and fill all
    # entries in one vectorised assignment instead of a Python-level loop.
    # Assumes each (bead, feature) pair occurs at most once in the file —
    # TODO confirm; NumPy does not promise which value is kept for repeats.
    count_mat[cell_idx - 1, var_idx - 1] = counts

    adata = ad.AnnData(count_mat)
    adata.var_names = [var_dict[i+1] for i in range(var_len)]
    adata.obs_names = list(np.arange(cell_len).astype('str'))
    return adata

In [5]:
# Print "<sample> <data kind>" for every entry of `raw_path` whose name has
# exactly three dot-separated parts; all other entries are ignored.
for entry in os.listdir(raw_path):
    parts = entry.split('.')
    if len(parts) == 3:
        print(parts[0], parts[1])
        

mouse_cerebellum_1_dna_200114_14 bead_locations
mouse_liver_met_1_dna_191114_06 sparse_counts_1Mb
human_colon_cancer_dna_4x_201027_12 bead_locations
mouse_cerebellum_1_dna_200114_14 sparse_counts_1Mb
mouse_cerebellum_1_dna_190201_21 sparse_counts_1Mb
mouse_liver_met_2_dna_200114_10 bead_locations
human_colon_cancer_dna_4x_201027_12 sparse_counts_1Mb
mouse_cerebellum_dna_4x_201027_17 bead_locations
mouse_liver_met_1_dna_191114_05 bead_locations
human_colon_cancer_3_dna_191204_19 sparse_counts_1Mb
mouse_liver_met_1_dna_191114_06 bead_locations
mouse_cerebellum_1_dna_190201_21 bead_locations
mouse_cerebellum_dna_4x_201027_17 sparse_counts_1Mb
human_colon_cancer_3_scwgs_210402_02 counts_1Mb
human_colon_cancer_4_rna_200102_06 bead_locations
mouse_liver_met_2_rna_201002_04 bead_locations
human_colon_cancer_3_scwgs_200227_01 counts_1Mb
human_colon_cancer_3_dna_191204_19 bead_locations
human_colon_cancer_4_dna_200114_13 bead_locations
human_colon_cancer_4_rna_200102_06 sparse_expression
human_

In [6]:
# Feature annotation tables, all located in `raw_path`. Judging by the file
# names these are 1 Mb genome-bin tables (hg19 / mm10) for the DNA samples and
# gene lists for the RNA samples; they are loaded into index -> name
# dictionaries in the cells below.
human_dna_var_file = f'{raw_path}/hg19_1Mb_bins.txt'
human_rna_var_file = f'{raw_path}/human_genes.txt'
mouse_dna_var_file = f'{raw_path}/mm10_1Mb_bins.txt'
mouse_rna_var_file = f'{raw_path}/mouse_genes.txt'

In [7]:
# Read the human bin table (tab-separated) and label every bin as
# "<chr>_<bin_start>_<bin_end>", keyed by its `bin_ind` value.
pd_human_dna = pd.read_csv(human_dna_var_file, delimiter='\t')
human_dna_var_dict = {
    bin_ind: f'{chrom}_{start}_{end}'
    for bin_ind, chrom, start, end in zip(
        pd_human_dna['bin_ind'].values,
        pd_human_dna['chr'].values,
        pd_human_dna['bin_start'].values,
        pd_human_dna['bin_end'].values,
    )
}

In [8]:
# Read the mouse bin table (tab-separated) and label every bin as
# "<chr>_<bin_start>_<bin_end>", keyed by its `bin_ind` value.
pd_mouse_dna = pd.read_csv(mouse_dna_var_file, delimiter='\t')
mouse_dna_var_dict = {
    bin_ind: f'{chrom}_{start}_{end}'
    for bin_ind, chrom, start, end in zip(
        pd_mouse_dna['bin_ind'].values,
        pd_mouse_dna['chr'].values,
        pd_mouse_dna['bin_start'].values,
        pd_mouse_dna['bin_end'].values,
    )
}

In [9]:
# Number the entries of the first column of the human gene table 1, 2, 3, ...
# in file order and map each number to its gene name.
# NOTE(review): read_csv consumes the first line as a header here — confirm
# the gene list file really starts with a header row.
pd_human_rna = pd.read_csv(human_rna_var_file, delimiter='\t')
human_gene_names = pd_human_rna.values[:, 0]
human_rna_var_dict = {
    gene_id: gene_name
    for gene_id, gene_name in zip(
        np.arange(1, len(human_gene_names) + 1), human_gene_names
    )
}

In [10]:
# Number the entries of the first column of the mouse gene table 1, 2, 3, ...
# in file order and map each number to its gene name.
# NOTE(review): read_csv consumes the first line as a header here — confirm
# the gene list file really starts with a header row.
pd_mouse_rna = pd.read_csv(mouse_rna_var_file, delimiter='\t')
mouse_gene_names = pd_mouse_rna.values[:, 0]
mouse_rna_var_dict = {
    gene_id: gene_name
    for gene_id, gene_name in zip(
        np.arange(1, len(mouse_gene_names) + 1), mouse_gene_names
    )
}

In [11]:
# One [bead-location CSV, sparse count file] pair per sample, in processing
# order. Both file names share the sample prefix, so each sample is written
# once together with the suffix of its count file.
sample_file_list = [
    [f'{sample}.bead_locations.csv', f'{sample}.{counts_suffix}.txt']
    for sample, counts_suffix in (
        ('human_colon_cancer_3_dna_191204_19', 'sparse_counts_1Mb'),
        ('human_colon_cancer_4_dna_200114_13', 'sparse_counts_1Mb'),
        ('human_colon_cancer_4_rna_200102_06', 'sparse_expression'),
        ('human_colon_cancer_dna_4x_201027_12', 'sparse_counts_1Mb'),
        ('mouse_cerebellum_1_dna_190201_21', 'sparse_counts_1Mb'),
        ('mouse_cerebellum_1_dna_200114_14', 'sparse_counts_1Mb'),
        ('mouse_cerebellum_dna_4x_201027_17', 'sparse_counts_1Mb'),
        ('mouse_liver_met_1_dna_191114_05', 'sparse_counts_1Mb'),
        ('mouse_liver_met_1_dna_191114_06', 'sparse_counts_1Mb'),
        ('mouse_liver_met_2_dna_200114_10', 'sparse_counts_1Mb'),
        ('mouse_liver_met_2_rna_201002_04', 'sparse_expression'),
    )
]

In [12]:
# Convert every [bead-location CSV, count file] pair in `sample_file_list`
# into `<h5ad_path>/<sample_name>.h5ad`.

# (species token, molecule token, feature-name dict, count-file delimiter).
# Each row is tested against the count-file name; if several rows match, the
# last matching one is used.
file_kind_settings = (
    ('human', 'dna', human_dna_var_dict, '\t'),
    ('mouse', 'dna', mouse_dna_var_dict, '\t'),
    ('human', 'rna', human_rna_var_dict, ','),
    ('mouse', 'rna', mouse_rna_var_dict, ','),
)

# Make sure the output folder exists before the first file is written.
os.makedirs(h5ad_path, exist_ok=True)

for coord_file, count_file in sample_file_list:
    # The sample name is everything before the first '.' of the file name.
    sample_name = coord_file.split('.')[0]

    # Pick the settings from the file name. Start from None on every
    # iteration so that an unrecognised name raises instead of silently
    # reusing the settings chosen for the previous sample.
    settings = None
    for species, kind, candidate_dict, candidate_delimiter in file_kind_settings:
        if kind in count_file and species in count_file:
            settings = (candidate_dict, candidate_delimiter, kind)
    if settings is None:
        raise ValueError(
            f'cannot infer species/molecule from count file name: {count_file!r}'
        )
    var_dict, delimiter, molecule = settings

    adata = get_adata(var_dict,count_file,delimiter,molecule)

    # Columns 1 and 2 of the bead-location table are stored as the spatial
    # coordinates (presumably x and y, with column 0 a bead identifier —
    # confirm against the raw files).
    spatial_mat = pd.read_csv(f'{raw_path}/{coord_file}').values[:,1:3]
    adata.obsm['spatial'] = spatial_mat.astype('float')
    print(sample_name,adata.shape)

    adata.write_h5ad(f'{h5ad_path}/{sample_name}.h5ad')

    # Release the dense matrix before the next sample is loaded.
    del adata
    gc.collect()

  adata = ad.AnnData(count_mat)


human_colon_cancer_3_dna_191204_19 (41181, 3114)
human_colon_cancer_4_dna_200114_13 (40766, 3114)
human_colon_cancer_4_rna_200102_06 (34263, 26001)
human_colon_cancer_dna_4x_201027_12 (17787, 3114)
mouse_cerebellum_1_dna_190201_21 (34468, 2738)
mouse_cerebellum_1_dna_200114_14 (31382, 2738)
mouse_cerebellum_dna_4x_201027_17 (16736, 2738)
mouse_liver_met_1_dna_191114_05 (40685, 2738)
mouse_liver_met_1_dna_191114_06 (39089, 2738)
mouse_liver_met_2_dna_200114_10 (38580, 2738)
mouse_liver_met_2_rna_201002_04 (31290, 21902)
