In [1]:
import os
# Switch the kernel's working directory to the scArches project folder before the
# remaining imports run. Presumably this lets `import scarches` resolve against the
# local checkout -- TODO confirm.
# NOTE(review): hard-coded absolute path; this cell only works on a machine where
# this exact directory exists.
os.chdir('/home/mohsen/projects/scarches/')

In [2]:
import scanpy as sc
import scarches as sca
import seaborn as sns
import scIB as scib
import pandas as pd
from matplotlib import pyplot as plt

In [3]:
def compute_metrics(latent_adata, adata, data_name, method, rqr=None, batch_key='study', label_key='cell_type'):
    """Score one latent representation and return its metrics as a table.

    The scIB metric suite (NMI, ARI, silhouette, PC regression, graph
    connectivity, isolated labels) is run on ``adata`` / ``latent_adata``,
    the result is transposed and narrowed to eight metric columns, and the
    scArches entropy-of-batch-mixing and kNN-purity scores (15 neighbours
    each) are appended together with bookkeeping columns.

    Parameters
    ----------
    latent_adata
        Object holding the latent representation in ``.X``. It is modified:
        ``.obsm['X_pca']`` is overwritten with ``.X``.
    adata
        Object holding the original data; also used to count the distinct
        values of ``batch_key``.
    data_name
        Value written to the ``data`` column.
    method
        Value written to the ``method`` column.
    rqr
        Optional number; when given, ``rqr / n_batches`` is stored in the
        ``rqr`` column, otherwise ``None`` is stored.
    batch_key, label_key
        Names of the ``.obs`` columns with the batch and cell-type labels.

    Returns
    -------
    The selected scIB metric columns plus ``EBM``, ``KNN``, ``method``,
    ``data`` and ``rqr``.
    """
    # Expose the latent space under the 'X_pca' key; presumably this is where
    # scib.metrics.metrics looks for the embedding -- TODO confirm.
    latent_adata.obsm['X_pca'] = latent_adata.X
    print(adata.shape, latent_adata.shape)
    batch_count = len(adata.obs[batch_key].unique().tolist())

    metric_switches = dict(
        nmi_=True,
        ari_=True,
        silhouette_=True,
        pcr_=True,
        graph_conn_=True,
        isolated_labels_=True,
        hvg_score_=False,
    )
    reported_columns = [
        'NMI_cluster/label',
        'ARI_cluster/label',
        'ASW_label',
        'ASW_label/batch',
        'PCR_batch',
        'isolated_label_F1',
        'isolated_label_silhouette',
        'graph_conn',
    ]

    scib_result = scib.metrics.metrics(adata, latent_adata, batch_key, label_key, **metric_switches)
    # Transpose so that the metric names become columns, then keep the reported subset.
    scores = scib_result.T[reported_columns]

    batch_mixing = sca.metrics.entropy_batch_mixing(latent_adata, batch_key, n_neighbors=15)
    purity = sca.metrics.knn_purity(latent_adata, label_key, n_neighbors=15)

    appended = {
        'EBM': batch_mixing,
        'KNN': purity,
        'method': method,
        'data': data_name,
    }
    for column_name, column_value in appended.items():
        scores[column_name] = column_value

    # The ratio is stored relative to the number of batches found in `adata`.
    if rqr is not None:
        scores['rqr'] = rqr / batch_count
    else:
        scores['rqr'] = None

    return scores

In [4]:
def compute_rqr_metrics(dataset):
    """Score every stored integration result for one dataset.

    The normalized expression data for ``dataset`` is loaded, then for each of
    the methods scvi, scanvi, CVAE, trVAE and DCA and for each ratio configured
    for the dataset, the saved latent representation is read from disk and
    passed to ``compute_metrics`` (the ratio is forwarded as ``rqr``).
    For 'pancreas' and 'mouse_brain' the saved full-integration results of
    Harmony, Liger, Scanorama, Seurat, mnnCorrect and Conos are scored as
    well, with ``rqr=None``.

    Parameters
    ----------
    dataset : str
        One of 'mouse_brain', 'pancreas' or 'pbmc'.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of the tables returned by ``compute_metrics``,
        in the order the results were processed.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names. Previously any
        unknown name silently fell through to the PBMC data file.
    """
    # NOTE(review): all paths below are absolute and machine-specific.
    if dataset == 'mouse_brain':
        adata = sc.read(f'/home/mohsen/data/{dataset}/mouse_brain_subsampled_normalized_hvg.h5ad')
        batch_key, label_key = 'study', 'cell_type'
        ratios = [1, 2, 3]
    elif dataset == 'pancreas':
        adata = sc.read(f'/home/mohsen/data/{dataset}/{dataset}_normalized.h5ad')
        batch_key, label_key = 'study', 'cell_type'
        ratios = [1, 2, 3, 4]
    elif dataset == 'pbmc':
        adata = sc.read('/home/mohsen/data/PBMC/Immune_ALL_human_wo_villani_rqr_normalized_hvg.h5ad')
        batch_key, label_key = 'condition', 'final_annotation'
        ratios = [1, 4, 8]
    else:
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected 'mouse_brain', 'pancreas' or 'pbmc'"
        )

    # Collect the per-run tables and concatenate once at the end instead of
    # re-copying the accumulated frame on every iteration.
    frames = []

    for method in ['scvi', 'scanvi', 'CVAE', 'trVAE', 'DCA']:
        for ratio in ratios:
            print(f'**** Running for {method} with ratio = {ratio} ****')
            if method in ['CVAE', 'trVAE', 'DCA']:
                latent_adata = sc.read(f'/home/mohsen/data/scArches/ref_query_ratio/{method}/{dataset}/{ratio}/all.h5ad')
            else:
                # scvi / scanvi results are stored per figure and test number.
                if dataset == 'pbmc':
                    # Ratios 1, 4, 8 map onto test folders 1, 2, 3.
                    test_num = 1 + ratio // 4
                    figure_num = 6
                else:
                    # Bug fix: this branch used to assign `test_number`, leaving
                    # `test_num` unbound when the path below was formatted.
                    test_num = ratio
                    figure_num = 3
                latent_adata = sc.read(f'/home/mohsen/data/scArches/scVI_scANVI/figure_{figure_num}/{method}/test_{test_num}_first_cond/full_data.h5ad')

                # These files label cells under 'batch' / 'celltype'; copy them to
                # the column names used for this dataset.
                latent_adata.obs[batch_key] = latent_adata.obs['batch'].values
                latent_adata.obs[label_key] = latent_adata.obs['celltype'].values

            frames.append(compute_metrics(latent_adata, adata, dataset, method, ratio, batch_key, label_key))

    if dataset in ['pancreas', 'mouse_brain']:
        for method in ['Harmony', 'Liger', 'Scanorama', 'Seurat', 'mnnCorrect', 'Conos']:
            print(f'**** Running for {method} ****')
            latent_adata = sc.read(f'/home/mohsen/data/scArches/Full Integration Results/{dataset}/{method}/result_adata.h5ad')
            frames.append(compute_metrics(latent_adata, adata, dataset, method, None, batch_key, label_key))

    return pd.concat(frames, axis=0) if frames else None

In [5]:
# Score every method on the PBMC dataset and keep the combined table in `scores`.
scores = compute_rqr_metrics('pbmc')
# Bare expression on the last line so the notebook renders the table as the cell output.
scores

**** Running for scvi with ratio = 1 ****
(20522, 4000) (20522, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
**** Running for scvi with ratio = 4 ****
(20522, 4000) (20522, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
**** Running for scvi with ratio = 8 ****
(20522, 4000) (20522, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
**** Running for scanvi with ratio = 1 ****
(20522, 4000) (20522, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
**** Running for scanvi with ratio = 4 ****
(20522, 4000) (20522, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
**** Running for scanvi with ratio = 8 ****
(20522, 4000) (20522, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regre

Unnamed: 0,NMI_cluster/label,ARI_cluster/label,ASW_label,ASW_label/batch,PCR_batch,isolated_label_F1,isolated_label_silhouette,graph_conn,EBM,KNN,method,data,rqr
0,0.765077,0.668575,0.54912,0.757866,0.080864,0.68926,0.525876,0.938998,0.310513,0.816735,scvi,pbmc,0.111111
0,0.798299,0.698489,0.572445,0.879352,0.639314,0.656553,0.51682,0.984147,0.404976,0.835023,scvi,pbmc,0.444444
0,0.770797,0.640171,0.571546,0.890192,0.774754,0.68224,0.514952,0.98511,0.415126,0.855069,scvi,pbmc,0.888889
0,0.801401,0.732055,0.576094,0.802844,0.061964,0.708961,0.544731,0.877319,0.334333,0.805186,scanvi,pbmc,0.111111
0,0.843345,0.735626,0.590659,0.87035,0.584863,0.696211,0.521875,0.921111,0.424256,0.85628,scanvi,pbmc,0.444444
0,0.892182,0.896625,0.636724,0.881006,0.658837,0.741548,0.519281,0.982858,0.404439,0.907112,scanvi,pbmc,0.888889
0,0.806924,0.737635,0.574449,0.843846,0.294633,0.760093,0.541645,0.98344,0.289249,0.820619,CVAE,pbmc,0.111111
0,0.84296,0.758593,0.604889,0.851519,0.42022,0.856468,0.519832,0.97799,0.244509,0.872473,CVAE,pbmc,0.444444
0,0.834331,0.736581,0.59829,0.867313,0.630909,0.747576,0.515062,0.972913,0.308172,0.874469,CVAE,pbmc,0.888889
0,0.793353,0.682984,0.576799,0.869146,0.56287,0.678618,0.548028,0.965646,0.329721,0.788575,trVAE,pbmc,0.111111


In [None]:
# Run the same benchmark on the mouse brain dataset.
# NOTE(review): this rebinds `scores`, so the PBMC table produced by the earlier cell
# is no longer reachable under that name once this cell has run. The cell has no
# execution count, i.e. no recorded run in this notebook.
scores = compute_rqr_metrics('mouse_brain')