In [1]:
import os
import pandas as pd
import numpy as np
import anndata as ad
import h5py
import json
from SVExp import *

In [None]:
### MODE: 1. Data generating; 2. Tools validating.
# Select which half of the workflow this cell runs. Any other value raises a
# ValueError instead of silently doing nothing.
MODE = 2

if MODE == 1:
    ### Path and settings
    dataset_name = 'your_data'
    data_dir = './' + dataset_name + '/'  # Real data path
    out_dir = './simulation_SVExp/'
    sim_dir = out_dir + str(dataset_name) + '/'  # Simulated data path
    species = 'human'
    target_domain = [['1'], ['2'], ['3'], ['4']]  # ROI
    # Correlation corresponding to ROI.
    # 0: negative linear; 1: positive linear; 2: non-linear; 3: mixed
    target_correlation = [0, 1, 2, 3]
    simulate_nums = [100, 100, 100, 100]  # SVI numbers you want to generate for each patterns
    l_range = [[3, 3.3]]  # alpha_1 and alpha_2's range
    noise_level = [0]  # Noise level of Simulated data, from 0 to 1

    ### Input your ST data
    print('reading data...')
    # gene expression data (tab-separated despite the .csv extension)
    df_count = pd.read_csv(data_dir + 'count.csv', sep='\t', index_col=0)
    # Index labels are cast to str so the three tables share one label type.
    df_count.index = [str(x) for x in df_count.index]
    # spot location data (tab-separated despite the .csv extension)
    loc = pd.read_csv(data_dir + 'loc.csv', sep='\t', index_col=0)
    loc.index = [str(x) for x in loc.index]
    # Reorder/restrict locations to the rows of the count table; raises KeyError
    # if a count-table label is missing from loc.
    loc = loc.loc[df_count.index]
    # spot labels for ROI selecting
    df_domain = pd.read_excel(data_dir + 'domain.xlsx', header=0, index_col=0)
    df_domain.index = [str(x) for x in df_domain.index]

    ### Generate synthetic data
    # exist_ok=True replaces the racy "check, then create" pattern and makes the
    # cell safe to re-run.
    os.makedirs(sim_dir, exist_ok=True)
    # sim_dir is passed last — presumably the output directory for the simulated
    # data; confirm against SVExp.data_generate.
    data_generate(species, df_count, loc, df_domain, target_domain, target_correlation,
                  l_range, noise_level, simulate_nums, sim_dir)

elif MODE == 2:
    dataset_name = 'your_data'
    tool_res_dir = 'your_path'
    lr_file = tool_res_dir + '/statistics.csv'  # The tool's statistics on spots for each SVI
    cluster_file = tool_res_dir + '/raw_cluster.csv'  # The tool's SVIs clustering results
    domain_file = './' + dataset_name + '/domain.xlsx'  # spot labels the same as MODE 1
    out_dir = './simulation_SVExp/' + dataset_name + '/'
    # out_dir already ends with '/', so no extra separator is added here
    # (the previous form produced a '//' in the path).
    ground_truth_file = out_dir + 'ground_truth.csv'  # ground truth clusters (MODE 1's output)

    # Dictionary corresponding to MODE 1's setting of ROIs and Correlations
    roi_with_c = {'linear': 'Healthy_1', 'negative-linear': 'IDC_8',
                  'non-linear': 'IDC_4', 'mixed': 'IDC_2'}

    ### Validation
    lrs = pd.read_csv(lr_file, index_col=0)
    # The clustering file must contain a column named 'g', used as the index.
    clusters = pd.read_csv(cluster_file, index_col='g').sort_index()
    domains = pd.read_excel(domain_file, header=0, index_col=0)
    # Third data column (position 2, not counting the index column) is taken as
    # the per-spot label — NOTE(review): confirm this matches your domain.xlsx layout.
    domains = np.array(domains.iloc[:, 2].tolist())
    ground_truth = pd.read_csv(ground_truth_file, sep='\t', index_col=0).sort_index()
    validate(lrs, clusters, roi_with_c, domains, ground_truth, out_dir)
    # Reads si_score.csv from out_dir — presumably written by validate(); confirm in SVExp.
    plot_validation(out_dir + 'si_score.csv', out_dir)

else:
    # Fail loudly on an unsupported mode rather than running nothing.
    raise ValueError('MODE must be 1 (data generating) or 2 (tools validating), got ' + repr(MODE))

  0%|                                                   | 0/100 [09:57<?, ?it/s]