In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import anndata as ad
import scanpy.external as sce
from sklearn import preprocessing
import pickle5 as pickle
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
from sklearn import preprocessing
import sklearn

# Very small positive constant. It is not referenced in the visible cells of
# this notebook -- presumably a guard against log(0) / division by zero;
# TODO confirm where it is used.
eps=1e-100
import seaborn as sns


### concat four sections

In [None]:
# Read the four per-section AnnData files into X1..X4.
# NOTE(review): X2 is loaded from section4 and X4 from section2 (file order
# 1, 4, 3, 2) -- kept exactly as in the original; confirm this is intended.
section_dir = './source_data/single_cell_embedding'
X1, X2, X3, X4 = (
    sc.read_h5ad(f'{section_dir}/section{section_id}.h5ad')
    for section_id in (1, 4, 3, 2)
)

In [None]:
# Display the four loaded objects as the cell output (quick sanity check).
(X1, X2, X3, X4)

In [None]:
# Stack the four sections into one AnnData object. `label='domain'` adds a
# 'domain' annotation, which the UMAP cell below uses as a colour key.
sections = [X1, X2, X3, X4]
ad_concat = ad.concat(sections, label='domain')

### plot UMAP of integrated embeddings

In [None]:
# For every integration method: load its saved AnnData, attach its precomputed
# UMAP coordinates to the concatenated sections, and draw two UMAPs (coloured
# by section of origin and by the unified cell-type label).
#
# The loop variable is deliberately NOT called `method_name`: the metric cells
# further down iterate over a list with that name, and rebinding it to a single
# string here would break them whenever this cell is re-run after the
# configuration cell.
umap_methods = ['FuseMap_celltype','harmony','scanorama','scvi','seurat','SIMBA_sc','MultiVI','SIMBA_mc','stabmap','Mario']
for umap_method in umap_methods:
    ad_merged = sc.read_h5ad(f'source_data/single_cell_embedding/adata_{umap_method}.h5ad')

    if ad_concat.shape[0] == ad_merged.shape[0]:
        # Same number of cells: ad_concat is used directly, so the assignment
        # below writes X_umap into ad_concat itself (overwritten on each pass).
        # NOTE(review): row order is assumed to match ad_merged -- it is not
        # checked here; confirm.
        plt_ad = ad_concat
    else:
        # Fewer/more cells: keep only the cells present in ad_merged, in its
        # order. Take an explicit copy because assigning into `.obsm` of a view
        # would otherwise trigger an implicit copy with a warning.
        plt_ad = ad_concat[ad_merged.obs.index, :].copy()

    plt_ad.obsm['X_umap'] = ad_merged.obsm['X_umap']

    # Small panel: section of origin.
    fig, ax = plt.subplots(figsize=(3,3))
    sc.pl.umap(plt_ad, color='domain', ax=ax, title=f'{umap_method} sampleID')

    # Larger panel: unified cell-type annotation.
    fig, ax = plt.subplots(figsize=(6,6))
    sc.pl.umap(plt_ad, color='gtTaxonomyRank4', ax=ax, title=f'{umap_method} unified cell type label')
    

### Import the metric functions from `utils`

In [None]:
import sys
# Put the current directory first on the module search path so that `utils`
# is looked up there before anywhere else.
sys.path.insert(0,'./')
# Star import: the helpers called in later cells (map_sample,
# ave_sw_sample_all, nc_sample_all, batch_entropy_sample_all, sas_sample_all,
# aswb_sample_all, gc_sample_all, save_obj, adjust_box_widths) are not defined
# in this notebook, so they presumably come from utils -- TODO confirm and
# consider replacing the star import with explicit names.
from utils import *


Bio-conservation metrics: MAP, ASW and NC

### metric 1: MAP

In [None]:
# Configuration shared by the metric cells below.
SAMPLE_cell = 100_000               # upper bound on the number of cells kept per run
random_seed_list = list(range(10))  # seeds 0..9; every metric is computed once per seed
method_name = [                     # suffixes of the adata_<name>.h5ad files to evaluate
    'FuseMap_celltype',
    'harmony',
    'scanorama',
    'scvi',
    'seurat',
    'SIMBA_sc',
    'MultiVI',
    'SIMBA_mc',
    'stabmap',
    'Mario',
]


In [None]:
# For every (seed, method) pair: load the method's AnnData, keep a seeded
# random subsample of at most SAMPLE_cell cells, score it with map_sample and
# save the result under the MAP_ prefix.
for random_seed in random_seed_list:
    for key in method_name:
        print(f'-------key: {key} --------------')

        embedding_path = f'source_data/single_cell_embedding/adata_{key}.h5ad'
        adata = sc.read_h5ad(embedding_path)

        # Re-seed before shuffling so the subsample is reproducible per seed.
        np.random.seed(random_seed)
        kept_rows = np.random.permutation(adata.shape[0])[:SAMPLE_cell]
        adata = adata[kept_rows, :]

        map_scores = map_sample(adata)
        output_stem = f'source_data/single_cell_embedding/metrics_value/MAP_{key}_{random_seed}'
        save_obj(map_scores, output_stem)


### metric 2: ASW

In [None]:
# For every (seed, method) pair: load the method's AnnData, keep a seeded
# random subsample of at most SAMPLE_cell cells, score it with
# ave_sw_sample_all and save the result under the ASW_ prefix.
for random_seed in random_seed_list:
    for key in method_name:
        print(f'-------key: {key} --------------')

        embedding_path = f'source_data/single_cell_embedding/adata_{key}.h5ad'
        adata = sc.read_h5ad(embedding_path)

        # Re-seed before shuffling so the subsample is reproducible per seed.
        np.random.seed(random_seed)
        kept_rows = np.random.permutation(adata.shape[0])[:SAMPLE_cell]
        adata = adata[kept_rows, :]

        asw_scores = ave_sw_sample_all(adata)
        output_stem = f'source_data/single_cell_embedding/metrics_value/ASW_{key}_{random_seed}'
        save_obj(asw_scores, output_stem)


### metric 3: NC

In [None]:
# For every (seed, method) pair: load the method's AnnData, keep a seeded
# random subsample of at most SAMPLE_cell cells, score it with nc_sample_all
# and save the result under the NC_ prefix.
for random_seed in random_seed_list:
    for key in method_name:
        print(f'-------key: {key} --------------')

        embedding_path = f'source_data/single_cell_embedding/adata_{key}.h5ad'
        adata = sc.read_h5ad(embedding_path)

        # Re-seed before shuffling so the subsample is reproducible per seed.
        np.random.seed(random_seed)
        kept_rows = np.random.permutation(adata.shape[0])[:SAMPLE_cell]
        adata = adata[kept_rows, :]

        nc_scores = nc_sample_all(adata)
        output_stem = f'source_data/single_cell_embedding/metrics_value/NC_{key}_{random_seed}'
        save_obj(nc_scores, output_stem)

Batch-effect metrics: BE, SAS, ASWB and GC

### metric 1: BE

In [None]:
# For every (seed, method) pair: load the method's AnnData, keep a seeded
# random subsample of at most SAMPLE_cell cells, score it with
# batch_entropy_sample_all and save the result under the BE_ prefix.
for random_seed in random_seed_list:
    for key in method_name:
        print(f'-------key: {key} --------------')

        embedding_path = f'source_data/single_cell_embedding/adata_{key}.h5ad'
        adata = sc.read_h5ad(embedding_path)

        # Re-seed before shuffling so the subsample is reproducible per seed.
        np.random.seed(random_seed)
        kept_rows = np.random.permutation(adata.shape[0])[:SAMPLE_cell]
        adata = adata[kept_rows, :]

        be_scores = batch_entropy_sample_all(adata)
        output_stem = f'source_data/single_cell_embedding/metrics_value/BE_{key}_{random_seed}'
        save_obj(be_scores, output_stem)

### metric 2: SAS

In [None]:
# For every (seed, method) pair: load the method's AnnData, keep a seeded
# random subsample of at most SAMPLE_cell cells, score it with sas_sample_all
# (which also receives the seed) and save the result under the SAS_ prefix.
for random_seed in random_seed_list:
    for key in method_name:
        print(f'-------key: {key} --------------')

        embedding_path = f'source_data/single_cell_embedding/adata_{key}.h5ad'
        adata = sc.read_h5ad(embedding_path)

        # Re-seed before shuffling so the subsample is reproducible per seed.
        np.random.seed(random_seed)
        kept_rows = np.random.permutation(adata.shape[0])[:SAMPLE_cell]
        adata = adata[kept_rows, :]

        sas_scores = sas_sample_all(adata, random_seed=random_seed)
        output_stem = f'source_data/single_cell_embedding/metrics_value/SAS_{key}_{random_seed}'
        save_obj(sas_scores, output_stem)


### metric 3: ASWB

In [None]:
# For every (seed, method) pair: load the method's AnnData, keep a seeded
# random subsample of at most SAMPLE_cell cells, score it with aswb_sample_all
# and save the result under the ASWB_ prefix.
for random_seed in random_seed_list:
    for key in method_name:
        print(f'-------key: {key} --------------')

        embedding_path = f'source_data/single_cell_embedding/adata_{key}.h5ad'
        adata = sc.read_h5ad(embedding_path)

        # Re-seed before shuffling so the subsample is reproducible per seed.
        np.random.seed(random_seed)
        kept_rows = np.random.permutation(adata.shape[0])[:SAMPLE_cell]
        adata = adata[kept_rows, :]

        aswb_scores = aswb_sample_all(adata)
        output_stem = f'source_data/single_cell_embedding/metrics_value/ASWB_{key}_{random_seed}'
        save_obj(aswb_scores, output_stem)

### metric 4: GC

In [None]:
# For every (seed, method) pair: load the method's AnnData, keep a seeded
# random subsample of at most SAMPLE_cell cells, score it with gc_sample_all
# and save the result under the GC_ prefix.
for random_seed in random_seed_list:
    for key in method_name:
        print(f'-------key: {key} --------------')

        embedding_path = f'source_data/single_cell_embedding/adata_{key}.h5ad'
        adata = sc.read_h5ad(embedding_path)

        # Re-seed before shuffling so the subsample is reproducible per seed.
        np.random.seed(random_seed)
        kept_rows = np.random.permutation(adata.shape[0])[:SAMPLE_cell]
        adata = adata[kept_rows, :]

        gc_scores = gc_sample_all(adata)
        output_stem = f'source_data/single_cell_embedding/metrics_value/GC_{key}_{random_seed}'
        save_obj(gc_scores, output_stem)

# summarize and plot figures

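Before running the cells below, decompress `source_data/single_cell_embedding/metrics_value/data.zip`; the next cell reads the per-seed metric pickles from `source_data/single_cell_embedding/metrics_value/data/`.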

In [None]:
# Import retained so the name stays available to any cell that expects it.
# NOTE(review): the original cell fitted a MinMaxScaler on each metric's values
# but never applied it, so the table below holds the raw means -- exactly what
# the original produced. Confirm whether scaling was actually intended.
from sklearn.preprocessing import MinMaxScaler

# Collect one row per (seed, metric, method): the mean of the pickled metric
# values. The result is a long-format DataFrame with the columns
# metric / method / value / seed, bound to `value_all_dict`.
#
# The method names used by the pre-computed pickles differ from the
# adata_<name>.h5ad suffixes used by the metric cells, so they are kept under
# their own name instead of overwriting the global `method_name` list (which
# would break the metric cells if they were re-run after this one).
summary_seeds = [0,1,2,3,4,5,6,7,8,9]
summary_metrics = ['MAP','ASW','NC','BE','SAS','ASWB','GC']
summary_methods = ['FuseMap','Harmony','scVI','SeuratV3','StabMap','Scanorama','SIMBA_sc','MultiVI','SIMBA_mc','Mario']
metrics_dir = 'source_data/single_cell_embedding/metrics_value/data'

summary_rows = []
for random_seed in summary_seeds:
    for metric_key in summary_metrics:
        # Means for all methods of this metric, coerced to one numeric array
        # (same dtype handling as the original per-metric array).
        metric_means = []
        for key in summary_methods:
            # SECURITY: unpickling executes arbitrary code if the file is
            # malicious; only load pickles from a trusted data.zip.
            with open(f'{metrics_dir}/{metric_key}_{key}_{random_seed}.pkl', "rb") as openfile:
                value = pickle.load(openfile)
            metric_means.append(np.mean(value))
        metric_means = np.array(metric_means)

        for mean_value, method in zip(metric_means, summary_methods):
            summary_rows.append({
                'metric': metric_key,
                'method': method,
                'value': mean_value,
                'seed': random_seed,
            })

# Build the frame once from the collected records; column order matches the
# original dict-of-lists construction.
value_all_dict = pd.DataFrame(summary_rows, columns=['metric', 'method', 'value', 'seed'])


In [None]:
def plot_barplot(value_all_dict, key, ylim=(-0.03, 1)):
    """Show a bar plot with overlaid points for one metric.

    Rows of ``value_all_dict`` whose ``metric`` equals ``key`` are drawn with
    ``sns.barplot`` (one bar per ``method``) and the individual rows are
    overlaid with ``sns.stripplot``.

    Parameters
    ----------
    value_all_dict : pandas.DataFrame
        Long-format table with at least the columns ``metric``, ``method``
        and ``value``.
    key : str
        Metric to plot; also used as the figure title.
    ylim : sequence of two floats, optional
        y-axis limits. Defaults to ``(-0.03, 1)``, the range that was
        previously hard-coded; pass a different range for metrics whose
        values fall outside it.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    df_new = value_all_dict.loc[value_all_dict['metric']==key,:]

    line_width = 0.5

    # Transparent figure background. Note: sns.set / sns.set_style change
    # global plotting defaults, so they also affect later figures.
    sns.set(rc={'figure.facecolor':(0,0,0,0)})

    fig = plt.figure(figsize=(4.5,4), dpi= 300)
    sns.set_style('ticks')

    # Individual rows as small outlined points ...
    sns.stripplot(x=df_new['method'], y=df_new['value'],palette='Paired',
                  linewidth=line_width,size=3,edgecolor='k')

    # ... plus one bar per method on the same axes.
    g=sns.barplot(data=df_new, x="method", y="value",palette='Paired')

    # adjust_box_widths is not defined in this notebook -- presumably provided
    # by the `utils` star import; TODO confirm what it rescales.
    adjust_box_widths(fig, 1.5)

    plt.ylim(ylim)
    plt.yticks(fontsize=5)
    plt.xticks(fontsize=5)

    # Thin frame and tick marks, all using the same line width.
    for axis in ['top', 'bottom', 'left', 'right']:
        g.spines[axis].set_linewidth(line_width)

    g.xaxis.set_tick_params(width=line_width,direction='inout')
    g.yaxis.set_tick_params(width=line_width,direction='inout')

    # Replace any automatically generated legend with an empty, frameless one.
    plt.legend([],[], frameon=False)
    plt.title(key)
    plt.show()

In [None]:
# One figure per metric, in the order the metrics were computed above.
metrics_to_plot = ['MAP','ASW','NC','BE','SAS','ASWB','GC']
for metric_key in metrics_to_plot:
    plot_barplot(value_all_dict, metric_key)