First, we load the packages required to run the modified code.

In [None]:
import os 
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
from get_data import run
import scanpy as sc
from sklearn import metrics
from scipy.spatial import distance
from utils import *
from torch_geometric.nn import HypergraphConv
from MVCLST import train
import anndata
from scipy.stats import zscore
from utils_copy import clustering
from sklearn.metrics import silhouette_score
from augment import *
from scipy.spatial.distance import pdist, squareform

Next, we choose the data to process. Here we use slice 151673 from the DLPFC dataset, set the output folder where the results are saved, and fix the values that must be chosen in advance, such as the number of clusters (spatial domains).

In [None]:
# Configure every sample to process. After the loop finishes, the variables of
# the LAST sample (data_path, data_name, save_path, n_domains, data) remain in
# scope and are the ones used by the following cells.
sample_list = ['151673']
for sample in sample_list:
    data_path = "./data/151673"
    data_name = sample

    # Numeric suffix appended to the output folder name. It has to be set
    # BEFORE save_path is built: assigning it at the end of the loop body
    # raised NameError on a fresh kernel and, with several samples, made each
    # sample use the suffix chosen for the previous one.
    i = 6 if sample == "151671" else 0

    save_path = data_path+"/"+sample+"/"+'chebyshev'+str(i)  # output folder
    save_path_figure = Path(os.path.join(save_path, "Figure", data_name))
    save_path_figure.mkdir(parents=True, exist_ok=True)

    # Number of spatial domains requested from the clustering step.
    # NOTE(review): the four slices listed below get 5 domains; every other
    # slice falls back to 6 -- confirm 6 (rather than 7) is intended for them.
    if data_name in ['151669','151670','151671','151672']:
        n_domains = 5
    else:
        n_domains = 6

    data = run(save_path = save_path,
        platform = "Visium",
        pca_n_comps = 128,
        pre_epochs = 800,
        vit_type='vit_b',#'vit'
        )

Here we read the labels and the data: the metadata table is loaded with pandas, and `_get_adata` reads the data we need and converts it into the appropriate format.

In [None]:
# Tab-separated metadata table of the selected sample.
df_meta = pd.read_csv(f"{data_path}/{data_name}/metadata.tsv", sep="\t")
# Read the sample through the helper on the `run` instance created above.
adata = data._get_adata(data_path, data_name)

Augment the data.

In [None]:
# Augmentation step for the first view. Later cells read
# adata.obsm["augment_gene_data"] and the weight matrices from adata.obsm --
# presumably populated by this call; verify against get_data.
adata = data._get_augment(
    adata,
    adjacent_weight=1,
    neighbour_k=6,
)

Here we prepare and filter the data; we refer to this step as data preprocessing.

In [None]:
# Snapshot taken BEFORE adata.X is overwritten and genes are filtered; it is
# reused further down to build the second view.
adata1 = adata.copy()

# Replace the expression matrix by the augmented one, then filter genes.
adata.X = adata.obsm["augment_gene_data"].astype(float)
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.highly_variable_genes(adata, n_top_genes=3000, flavor="seurat_v3")

# normalize -> log1p -> scale, applied to the returned matrix
# (inplace=False) rather than to adata itself.
adata_X = sc.pp.scale(
    sc.pp.log1p(
        sc.pp.normalize_total(
            adata,
            target_sum=1,
            exclude_highly_expressed=True,
            inplace=False,
        )["X"]
    )
)

Here we reduce the dimensionality of the data; the reduced data is used in the subsequent pre-clustering.

In [None]:
# Two PCA projections of the same scaled matrix, computed in this order:
#   inputs1 (128 components)  -> used for the pre-clustering
#   inputs  (1000 components) -> converted to the model input X later on
inputs1, inputs = (
    sc.pp.pca(adata_X, n_comps=n_components) for n_components in (128, 1000)
)

Pre-clustering. Here we process the pre-clustering results: we convert them into a graph structure and use it to mask the neighbor relationships.

In [None]:
# Pre-cluster on the 128-component projection and turn the labels into an
# adjacency matrix.
cluster_label, _ = cluster(adata, inputs1, df_meta, n_domains)
cluster_adj = create_adjacency_matrix(cluster_label)

# Neighbour structure derived from the "weights_matrix_all" weights.
adj_augment = sim2adj(adata.obsm["weights_matrix_all"], 6)

# Neighbour structure derived from the "weights_matrix_nomd" weights, masked
# by the pre-clustering adjacency, and stored on the copy that becomes the
# second view.
adj_pure = cluster_adj * sim2adj(adata.obsm["weights_matrix_nomd"], 6)
adata1.obsm["weights_matrix_all"] = adj_pure

Here we determine the neighbor relationships of the second view, which are obtained by masking the neighbor relationships with the pre-clustering result, and then augment the original data using these neighbor relationships to generate the expression data of the second view.

In [None]:
# Second view: locate adjacent spots using the masked weights stored under
# "weights_matrix_all" on adata1.
adata1 = find_adjacent_spot(
    adata1,
    use_data="raw",
    neighbour_k=6,
    weights="weights_matrix_all",
    verbose=False,
)

# Augment the expression data of the second view.
adata1 = augment_gene_data(adata1, use_data="raw", adjacent_weight=1)

# Use the augmented matrix as the expression matrix of the second view.
adata1.X = adata1.obsm["augment_gene_data"].astype(float)

Preprocess the second-view data, reduce its dimensionality, and convert it, together with the first-view data, into tensors to prepare the model inputs.

In [None]:
# Second view: normalize -> log1p -> scale on the returned matrix, then PCA.
adata1_X = sc.pp.scale(
    sc.pp.log1p(
        sc.pp.normalize_total(
            adata1,
            target_sum=1,
            exclude_highly_expressed=True,
            inplace=False,
        )["X"]
    )
)
inputs2 = sc.pp.pca(adata1_X, n_comps=1000)

# Feature matrices of both views as float tensors (built from copies of the
# PCA outputs).
X = torch.tensor(inputs.copy(), dtype=torch.float)
X2 = torch.tensor(inputs2.copy(), dtype=torch.float)

# Rebuild adj_pure for training: here the mask is applied to the raw
# "weights_matrix_nomd" weights first and sim2adj is applied afterwards.
adj_pure = sim2adj(cluster_adj * adata.obsm["weights_matrix_nomd"], 6)

# Both neighbour structures as float tensors.
adj_augment = torch.tensor(adj_augment, dtype=torch.float)
adj_pure = torch.tensor(adj_pure, dtype=torch.float)
print("done")

Feed the prepared data into the model for training: the adata object, the augmented gene expression data and neighbor structure of the first view, the augmented gene expression data and neighbor structure of the second view, and the number of clusters. The label data is passed only to monitor feature extraction during training.

In [None]:
# Train the model on both views; the returned features are used for the
# final clustering in the next cell.
best_features = train(
    adata,
    X, X2,                  # feature tensors of the two views
    adj_pure, adj_augment,  # neighbour-structure tensors
    df_meta,
    n_domains,
)

We then cluster the features extracted by the trained model and generate the spatial domain partition images.

In [None]:
# Final clustering on the learned features; only the second return value is
# kept (as ARI).
_, ARI = cluster(adata, best_features, df_meta, n_domains, refined=True)

# Plot the spatial domains, show the AnnData summary and save the result as a
# gzip-compressed .h5ad file inside the output folder.
data.plot_domains(adata, data_name)
print(adata)
adata.write(
    os.path.join(save_path, data_name + ".h5ad"),
    compression="gzip",
)