In [1]:
import os
import scanpy as sc
import pandas as pd
import numpy as np
import sklearn.neighbors
import scipy.sparse as sp
from sklearn.decomposition import PCA
import anndata

In [3]:
#### preprocess
input_dir = 'D:/bio/splatter/data/file/origin/'
sample=['batch1', 'batch2']

adatas = sc.read_h5ad(os.path.join(input_dir + "2batch_6celltype.h5ad"))
import scipy.sparse
adatas.X = scipy.sparse.csr_matrix(adatas.X)

groups = adatas.obs.groupby("batch").indices
adata_list = [adatas[i] for i in groups.values()]


Batch_list = []
for i in np.arange(len(sample)):
    adata=adata_list[i]
#     adata.X = adata.layers['count']
    adata.obs_names = [x + '_' + sample[i] for x in adata.obs_names]
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=5000)
    adata = adata[:, adata.var['highly_variable']]
    Batch_list.append(adata)
    
adata_concat = anndata.concat(Batch_list, label="slice_name", keys=sample)
vf=np.array(adata_concat.var.index)
for i in np.arange(len(sample)):
    adata1=adata_concat[adata_concat.obs['slice_name']==sample[i],:]
    cells=adata1.obs_names
    mat=pd.DataFrame(adata1.X.todense(),index=cells,columns=vf)
    coord=pd.DataFrame(adata1.obsm['spatial'],index=cells,columns=['x','y'])
    meta=adata1.obs[['celltype','slice_name']]
    meta.columns=['celltype','batch']
    meta.index=cells
    mat.to_csv(input_dir+sample[i]+"_mat.csv")
    meta.to_csv(input_dir+sample[i]+"_meta.csv")
    coord.to_csv(input_dir+sample[i]+"_coord.csv")



In [4]:
def Cal_Spatial_Net(adata, rad_cutoff=None, k_cutoff=None, model='Radius', verbose=True):
    """\
    Construct the spatial neighbor networks.

    Parameters
    ----------
    adata
        AnnData object of scanpy package.
    rad_cutoff
        radius cutoff when model='Radius'
    k_cutoff
        The number of nearest neighbors when model='KNN'
    model
        The network construction model. When model=='Radius', the spot is connected to spots whose distance is less than rad_cutoff. When model=='KNN', the spot is connected to its first k_cutoff nearest neighbors.
    
    Returns
    -------
    The spatial networks are saved in adata.uns['Spatial_Net']
    """

    assert(model in ['Radius', 'KNN'])
    if verbose:
        print('------Calculating spatial graph...')
    coor = pd.DataFrame(adata.obsm['spatial'])
    coor.index = adata.obs.index
#     coor.columns = ['imagerow', 'imagecol']

    if model == 'Radius':
        nbrs = sklearn.neighbors.NearestNeighbors(radius=rad_cutoff).fit(coor)
        distances, indices = nbrs.radius_neighbors(coor, return_distance=True)
        KNN_list = []
        for it in range(indices.shape[0]):
            KNN_list.append(pd.DataFrame(zip([it]*indices[it].shape[0], indices[it], distances[it])))
    
    if model == 'KNN':
        nbrs = sklearn.neighbors.NearestNeighbors(n_neighbors=k_cutoff+1).fit(coor)
        distances, indices = nbrs.kneighbors(coor)
        KNN_list = []
        for it in range(indices.shape[0]):
            KNN_list.append(pd.DataFrame(zip([it]*indices.shape[1],indices[it,:], distances[it,:])))

    KNN_df = pd.concat(KNN_list)
    KNN_df.columns = ['Cell1', 'Cell2', 'Distance']

    Spatial_Net = KNN_df.copy()
    Spatial_Net = Spatial_Net.loc[Spatial_Net['Distance']>0,]
    id_cell_trans = dict(zip(range(coor.shape[0]), np.array(coor.index), ))
    Spatial_Net['Cell1'] = Spatial_Net['Cell1'].map(id_cell_trans)
    Spatial_Net['Cell2'] = Spatial_Net['Cell2'].map(id_cell_trans)
    if verbose:
        print('The graph contains %d edges, %d cells.' %(Spatial_Net.shape[0], adata.n_obs))
        print('%.4f neighbors per cell on average.' %(Spatial_Net.shape[0]/adata.n_obs))

    adata.uns['Spatial_Net'] = Spatial_Net

In [7]:
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
input_dir = 'D:/bio/splatter/data/file/origin/'
sample=['batch1', 'batch2']

adatas = sc.read_h5ad(os.path.join(input_dir + "2batch_6celltype.h5ad"))
import scipy.sparse
adatas.X = scipy.sparse.csr_matrix(adatas.X)

groups = adatas.obs.groupby("batch").indices
adata_list = [adatas[i] for i in groups.values()]

rad = 5
knn = 4
for i in range(len(adata_list)):
    adata = adata_list[i]
    adata.obs_names = [x + '_' + sample[i] for x in adata.obs_names]
    print(adata)
#     Cal_Spatial_Net(adata, rad_cutoff=rad, k_cutoff=None, model='Radius', verbose=True)
    Cal_Spatial_Net(adata, rad_cutoff=None, k_cutoff=knn, model='KNN', verbose=True)
    if 'Spatial_Net' not in adata.uns.keys():
        raise ValueError("Spatial_Net is not existed! Run Cal_Spatial_Net first!")

    Spatial_Net = adata.uns['Spatial_Net']
    G_df = Spatial_Net.copy()
#     np.savetxt(input_dir+sample+"_edge_Radius_"+str(rad)+".csv",G_df.values[:,:2],fmt='%s')
    np.savetxt(input_dir+sample[i]+"_edge_KNN_"+str(knn)+".csv",G_df.values[:,:2],fmt='%s')
    
#     features = pd.DataFrame(adata.X.todense(), index=adata.obs.index, columns=adata.var.index)
#     features.to_csv(input_dir+sample+"_features.csv")
#     coord = pd.DataFrame(adata.obsm['spatial'], index=adata.obs.index, columns=['x', 'y'])
#     coord.to_csv(input_dir+sample+"_coord.csv")
#     labels = adata.obs[['batch', 'celltype']]
#     labels.to_csv(input_dir+sample+"_label.csv")
#     adata.obs.to_csv(input_dir+sample+"_meta.csv")
    


View of AnnData object with n_obs × n_vars = 324 × 500
    obs: 'Unnamed: 0', 'x', 'y', 'celltype', 'batch'
    obsm: 'spatial'
------Calculating spatial graph...
The graph contains 1296 edges, 324 cells.
4.0000 neighbors per cell on average.
View of AnnData object with n_obs × n_vars = 324 × 500
    obs: 'Unnamed: 0', 'x', 'y', 'celltype', 'batch'
    obsm: 'spatial'
------Calculating spatial graph...
The graph contains 1296 edges, 324 cells.
4.0000 neighbors per cell on average.


  adata.uns['Spatial_Net'] = Spatial_Net
  adata.uns['Spatial_Net'] = Spatial_Net


In [None]:
adata = sc.read_h5ad('G:/dataset/10-MERFISH/MouseMOp--SPACEL/merfish_mouse_brain/merfish_mouse_brain_slice0.h5ad')
adata

In [None]:
adata.obsm['spatial']

In [None]:
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
sample_names = [str(i) for i in range(7)]
input_dir = 'G:/dataset/10-MERFISH/MouseMOp--SPACEL/merfish_mouse_brain/'
knn = 6

for sample in sample_names:
    print(sample)
    adata = sc.read_h5ad(input_dir + 'merfish_mouse_brain_slice' + sample + '.h5ad')
    adata.obs['batch'] = 'L'+sample
    adata.obsm['spatial'] = adata.obsm['spatial'][['X','Y']].values
    print(adata)
#     Cal_Spatial_Net(adata, rad_cutoff=rad, k_cutoff=None, model='Radius', verbose=True)
    Cal_Spatial_Net(adata, rad_cutoff=None, k_cutoff=knn, model='KNN', verbose=True)
    if 'Spatial_Net' not in adata.uns.keys():
        raise ValueError("Spatial_Net is not existed! Run Cal_Spatial_Net first!")

    Spatial_Net = adata.uns['Spatial_Net']
    G_df = Spatial_Net.copy()
#     np.savetxt(input_dir+sample+"_edge_Radius_"+str(rad)+".csv",G_df.values[:,:2],fmt='%s')
    np.savetxt(input_dir+sample+"_edge_KNN_"+str(knn)+".csv",G_df.values[:,:2],fmt='%s')
    
    features = pd.DataFrame(adata.X, index=adata.obs.index, columns=adata.var.index)
    features.to_csv(input_dir+sample+"_features.csv")
    coord = pd.DataFrame(adata.obsm['spatial'], index=adata.obs.index, columns=['x', 'y'])
    coord.to_csv(input_dir+sample+"_coord.csv")
    labels = adata.obs[['batch', 'class_label']]
    labels.columns = ['batch', 'celltype']
    labels.to_csv(input_dir+sample+"_label.csv")
#     adata.obs.to_csv(input_dir+sample+"_meta.csv")

In [25]:
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
sample_names = ['E9.5_E1S1', 'E10.5_E2S1', 'E11.5_E1S1']
input_dir = 'G:/dataset/11-MOSTA/input/'
knn = 6

for sample in sample_names:
    print(sample)
    adata = sc.read_h5ad(input_dir + sample + '.h5ad')
    adata.obs['batch'] = sample
    adata.obs.rename(index=lambda s: s + '_'+sample, inplace=True)
    print(adata)
    print(adata.obs_names)
#     Cal_Spatial_Net(adata, rad_cutoff=rad, k_cutoff=None, model='Radius', verbose=True)
    Cal_Spatial_Net(adata, rad_cutoff=None, k_cutoff=knn, model='KNN', verbose=True)
    if 'Spatial_Net' not in adata.uns.keys():
        raise ValueError("Spatial_Net is not existed! Run Cal_Spatial_Net first!")

    Spatial_Net = adata.uns['Spatial_Net']
    G_df = Spatial_Net.copy()
#     np.savetxt(input_dir+sample+"_edge_Radius_"+str(rad)+".csv",G_df.values[:,:2],fmt='%s')
    np.savetxt(input_dir+sample+"_edge_KNN_"+str(knn)+".csv",G_df.values[:,:2],fmt='%s')
    
    features = pd.DataFrame(adata.X.todense(), index=adata.obs.index, columns=adata.var.index)
    features.to_csv(input_dir+sample+"_features.csv")
    coord = pd.DataFrame(adata.obsm['spatial'], index=adata.obs.index, columns=['x', 'y'])
    coord.to_csv(input_dir+sample+"_coord.csv")
    labels = adata.obs[['batch', 'annotation']]
    labels.columns = ['batch', 'celltype']
    labels.to_csv(input_dir+sample+"_label.csv")
#     adata.obs.to_csv(input_dir+sample+"_meta.csv")

E9.5_E1S1
AnnData object with n_obs × n_vars = 5913 × 25568
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'annotation', 'Regulon - 2310011J03Rik', 'Regulon - 5730507C01Rik', 'Regulon - Alx1', 'Regulon - Alx3', 'Regulon - Alx4', 'Regulon - Ar', 'Regulon - Arid3a', 'Regulon - Arid3c', 'Regulon - Arnt2', 'Regulon - Arx', 'Regulon - Ascl1', 'Regulon - Atf1', 'Regulon - Atf4', 'Regulon - Atf5', 'Regulon - Atf6', 'Regulon - Atf7', 'Regulon - Bach1', 'Regulon - Bach2', 'Regulon - Barhl1', 'Regulon - Barx1', 'Regulon - Batf', 'Regulon - Bcl11a', 'Regulon - Bcl3', 'Regulon - Bcl6', 'Regulon - Bcl6b', 'Regulon - Bclaf1', 'Regulon - Bdp1', 'Regulon - Bhlha15', 'Regulon - Bhlhe22', 'Regulon - Bhlhe23', 'Regulon - Bhlhe41', 'Regulon - Bmyc', 'Regulon - Boll', 'Regulon - Bptf', 'Regulon - Brca1', 'Regulon - Brf1', 'Regulon - Brf2', 'Regulon - Bsx', 'Regulon - Cdx1', 'Regulon - Cdx2', 'Regulon - Cebpa', 'Regulon - Cebpz', 'Regulon - Chd1', 'Regulon - C

The graph contains 35478 edges, 5913 cells.
6.0000 neighbors per cell on average.
E10.5_E2S1
AnnData object with n_obs × n_vars = 8494 × 22385
    obs: 'SCT_snn_res.0.8', 'annotation', 'lineage', 'nCount_RNA', 'nCount_SCT', 'nFeature_RNA', 'nFeature_SCT', 'orig.ident', 'percent.mt', 'primaryanno', 'refindedanno', 'seurat_clusters', 'simpleanno', 'annotation3', 'Regulon - 5730507C01Rik', 'Regulon - AU041133', 'Regulon - Acaa1b', 'Regulon - Adnp', 'Regulon - Alx1', 'Regulon - Alx3', 'Regulon - Alx4', 'Regulon - Arnt2', 'Regulon - Ascl1', 'Regulon - Atf1', 'Regulon - Atf2', 'Regulon - Atf3', 'Regulon - Atf6b', 'Regulon - Atf7', 'Regulon - Atoh1', 'Regulon - Atoh7', 'Regulon - B230307C23Rik', 'Regulon - Bach2', 'Regulon - Barhl1', 'Regulon - Barhl2', 'Regulon - Barx1', 'Regulon - Batf', 'Regulon - Bbx', 'Regulon - Bcl3', 'Regulon - Bcl6', 'Regulon - Bcl6b', 'Regulon - Bclaf1', 'Regulon - Bhlha15', 'Regulon - Bhlhe23', 'Regulon - Bhlhe41', 'Regulon - Bmyc', 'Regulon - Borcs8', 'Regulon - Br

The graph contains 50964 edges, 8494 cells.
6.0000 neighbors per cell on average.
E11.5_E1S1
AnnData object with n_obs × n_vars = 30124 × 26854
    obs: 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'annotation', 'Regulon - A1cf', 'Regulon - Acaa1a', 'Regulon - Adnp', 'Regulon - Alx1', 'Regulon - Alx3', 'Regulon - Alx4', 'Regulon - Arid3a', 'Regulon - Arid3c', 'Regulon - Arid5b', 'Regulon - Arx', 'Regulon - Ascl2', 'Regulon - Atf2', 'Regulon - Atf5', 'Regulon - Atf6', 'Regulon - Atf6b', 'Regulon - Atoh1', 'Regulon - Barhl1', 'Regulon - Barhl2', 'Regulon - Barx2', 'Regulon - Bcl3', 'Regulon - Bclaf1', 'Regulon - Bhlhe22', 'Regulon - Bmyc', 'Regulon - Brf1', 'Regulon - Brf2', 'Regulon - Bsx', 'Regulon - Cdx2', 'Regulon - Cdx4', 'Regulon - Cebpa', 'Regulon - Cebpb', 'Regulon - Cebpd', 'Regulon - Cebpe', 'Regulon - Cebpg', 'Regulon - Chd1', 'Regulon - Cic', 'Regulon - Clock', 'Regulon - Cpeb1', 'Regulon - Creb1', 'Regulon - Creb3', 'Regulon - Creb3l1

The graph contains 180744 edges, 30124 cells.
6.0000 neighbors per cell on average.


In [10]:
knn=10
rad=10
input_dir = 'G:/dataset/11-MOSTA/input/SPIRAL_input/'
sample="E10.5_E2S1"
feat=pd.read_csv(input_dir+sample+"_mat.csv",header=0,index_col=0,sep=',')
coord=pd.read_csv(input_dir+sample+"_coord.csv",header=0,index_col=0,sep=',')
coord.columns=['x','y']
adata = sc.AnnData(feat)
adata.var_names_make_unique()
adata.X=sp.csr_matrix(adata.X)
adata.obsm["spatial"] = coord.loc[:,['x','y']].to_numpy()
Cal_Spatial_Net(adata, rad_cutoff=rad, k_cutoff=knn, model='KNN', verbose=True)
if 'highly_variable' in adata.var.columns:
    adata_Vars =  adata[:, adata.var['highly_variable']]
else:
    adata_Vars = adata
features = pd.DataFrame(adata_Vars.X.toarray()[:, ], index=adata_Vars.obs.index, columns=adata_Vars.var.index)
cells = np.array(features.index)
cells_id_tran = dict(zip(cells, range(cells.shape[0])))
if 'Spatial_Net' not in adata.uns.keys():
    raise ValueError("Spatial_Net is not existed! Run Cal_Spatial_Net first!")

Spatial_Net = adata.uns['Spatial_Net']
G_df = Spatial_Net.copy()
np.savetxt(input_dir+sample+"_edge_KNN_"+str(knn)+".csv",G_df.values[:,:2],fmt='%s')

------Calculating spatial graph...
The graph contains 84940 edges, 8494 cells.
10.0000 neighbors per cell on average.


# sample

In [None]:
rad=150
dirs="/data02/tguo/space_batch_effect/human_DLPFC_10x/"
sample_name=[151507,151508,151509,151510,151669,151670,151671,151672,151673,151674,151675,151676]
IDX=np.array([0,8])
# IDX=np.arange(len(sample_name))
flags=str(sample_name[IDX[0]])
for i in np.arange(1,len(IDX)):
    flags=flags+'-'+str(sample_name[IDX[i]])
for i in IDX:
    sample1=sample_name[i]
    features=pd.read_csv(dirs+"gtt_input_scanpy/"+flags+'_'+str(sample1)+"_features.txt",header=0,index_col=0,sep=',')
    meta=pd.read_csv(dirs+"gtt_input_scanpy/"+flags+'_'+str(sample1)+"_label.txt",header=0,index_col=0,sep=',')
    coord=pd.read_csv(dirs+"gtt_input_scanpy/"+flags+'_'+str(sample1)+"_positions.txt",header=0,index_col=0,sep=',')
    # meta=meta.iloc[:meta.shape[0]-1,:]
    adata = sc.AnnData(features)
    adata.var_names_make_unique()
    adata.X=sp.csr_matrix(adata.X)
    adata.obsm["spatial"] = coord.loc[:,['x','y']].to_numpy()
    Cal_Spatial_Net(adata, rad_cutoff=rad, k_cutoff=None, model='Radius', verbose=True)
    if 'highly_variable' in adata.var.columns:
        adata_Vars =  adata[:, adata.var['highly_variable']]
    else:
        adata_Vars = adata
    features = pd.DataFrame(adata_Vars.X.toarray()[:, ], index=adata_Vars.obs.index, columns=adata_Vars.var.index)
     = np.array(features.index)
    cells_id_tran = dict(zip(cells, range(cells.shape[0])))
    if 'Spatial_Net' not in adata.uns.keys():
        raise ValueError("Spatial_Net is not existed! Run Cal_Spatial_Net first!")

    Spatial_Net = adata.uns['Spatial_Net']
    G_df = Spatial_Net.copy()
    np.savetxt(dirs+"gtt_input_scanpy/"+flags+'_'+str(sample1)+"_edge_Radius_"+str(rad)+".csv",G_df.values[:,:2],fmt='%s')

    # G_df['Cell1'] = G_df['Cell1'].map(cells_id_tran)
    # G_df['Cell2'] = G_df['Cell2'].map(cells_id_tran)
    # adj = sp.coo_matrix((np.ones(G_df.shape[0]), (G_df['Cell1'], G_df['Cell2'])), shape=(adata.n_obs, adata.n_obs))
    # adj+=adj.T.multiply(adj.T>adj)-adj.multiply(adj.T>adj)
    # features=torch.FloatTensor(features.values)

# preprocess

In [5]:
sample=['E9.5_E1S1', 'E10.5_E2S1', 'E11.5_E1S1']
Batch_list = []
for i in np.arange(len(sample)):
    adata=sc.read_h5ad("G:/dataset/11-MOSTA/input/"+sample[i]+".h5ad")
    adata.X = adata.layers['count']
    adata.obs_names = [x + '_' + sample[i] for x in adata.obs_names]
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=5000)
    adata = adata[:, adata.var['highly_variable']]
    Batch_list.append(adata)
    
adata_concat = anndata.concat(Batch_list, label="slice_name", keys=sample)
vf=np.array(adata_concat.var.index)
for i in np.arange(len(sample)):
    adata1=adata_concat[adata_concat.obs['slice_name']==sample[i],:]
    cells=adata1.obs_names
    mat=pd.DataFrame(adata1.X.todense(),index=cells,columns=vf)
    coord=pd.DataFrame(adata1.obsm['spatial'],index=cells,columns=['x','y'])
    meta=adata1.obs[['annotation','slice_name']]
    meta.columns=['celltype','batch']
    meta.index=cells
    mat.to_csv("G:/dataset/11-MOSTA/input/SPIRAL_input/"+sample[i]+"_mat.csv")
    meta.to_csv("G:/dataset/11-MOSTA/input/SPIRAL_input/"+sample[i]+"_meta.csv")
    coord.to_csv("G:/dataset/11-MOSTA/input/SPIRAL_input/"+sample[i]+"_coord.csv")

  concat_annot[label] = label_col
