In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from evaluation.utils import entropy_batch_mixing, knn_purity
import scanpy as sc
import seaborn as sns
import scIB as scib
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Experiment grid consumed by the metrics loop further down.
# `models`, `datasets` and `versions` are interpolated into the result paths;
# `ratios` selects the `label_ratio_<n>` sub-directory that is read.
models = ['scanvi']
datasets = ['pbmc', 'pancreas', 'brain']
versions = ['first', 'deep']
ratios = list(range(5, 0, -1))  # 5, 4, 3, 2, 1

In [3]:
def compute_metrics(latent_adata, adata, model, dataset, rqr=None, batch_key='study', label_key='cell_type'):
    """Score a latent embedding with the scIB metrics plus EBM and kNN purity.

    Parameters
    ----------
    latent_adata
        Object holding the latent representation in ``.X``. It is modified in
        place: ``.obsm['X_pca']`` is overwritten with ``.X`` before scoring.
    adata
        Reference object; only ``.shape`` and ``.obs[batch_key]`` are read
        here, the rest is handed to ``scib.metrics.metrics``.
    model, dataset
        Labels written verbatim to the ``method`` and ``data`` columns.
    rqr
        Optional number. When given, it is divided by the number of distinct
        ``batch_key`` values in ``adata.obs`` and rounded to two decimals.
        When ``None``, the ``rqr`` column is filled with ``None``.
    batch_key, label_key
        Column names in ``.obs`` that identify the batch and the cell label.

    Returns
    -------
    pandas.DataFrame
        The selected scIB metric columns followed by ``EBM``, ``KNN``,
        ``method``, ``data``, ``rqr``, ``reference_time`` and ``query_time``
        (the two time columns are always set to 0.0 by this function).
        NOTE(review): presumably a single row per call - confirm against the
        shape returned by ``scib.metrics.metrics``.
    """
    # NOTE(review): presumably scIB reads the embedding from obsm['X_pca'] - confirm.
    latent_adata.obsm['X_pca'] = latent_adata.X
    print(adata.shape, latent_adata.shape)
    n_batches = len(adata.obs[batch_key].unique())

    scores = scib.metrics.metrics(adata, latent_adata, batch_key, label_key,
                                  nmi_=True, ari_=True, silhouette_=True, pcr_=True, graph_conn_=True,
                                  isolated_labels_=True, hvg_score_=False)
    kept_columns = ['NMI_cluster/label', 'ARI_cluster/label', 'ASW_label', 'ASW_label/batch',
                    'PCR_batch', 'isolated_label_F1', 'isolated_label_silhouette', 'graph_conn']
    # Take an explicit copy so the column assignments below act on an
    # independent frame and cannot trigger SettingWithCopyWarning.
    scores = scores.T[kept_columns].copy()

    scores['EBM'] = entropy_batch_mixing(latent_adata, batch_key, n_neighbors=15)
    scores['KNN'] = knn_purity(latent_adata, label_key, n_neighbors=15)
    scores['method'] = model
    scores['data'] = dataset

    if rqr is None:
        # Do not call .round() on an all-None object column: depending on the
        # pandas version that is either a no-op or an error.
        scores['rqr'] = None
    else:
        scores['rqr'] = rqr / n_batches
        scores['rqr'] = scores['rqr'].round(2)

    scores['reference_time'] = 0.0
    scores['query_time'] = 0.0

    return scores

## Calculating metrics for all ratios:

In [4]:
# Per-dataset inputs: reference file name, batch column, label column, and the
# test number that appears in the results path (`test_<number>_<version>_cond`).
dataset_settings = {
    'pbmc': ('Immune_ALL_human_wo_villani_rqr_normalized_hvg.h5ad', 'condition', 'final_annotation', 4),
    'brain': ('mouse_brain_subsampled_normalized_hvg.h5ad', 'study', 'cell_type', 4),
    'pancreas': ('pancreas_normalized.h5ad', 'study', 'cell_type', 5),
}

for dataset in datasets:
    # Fail loudly on an unconfigured dataset instead of silently reusing the
    # previous iteration's adata / keys.
    if dataset not in dataset_settings:
        raise ValueError(f'No input configuration for dataset {dataset!r}')
    adata_file, batch_key, label_key, number = dataset_settings[dataset]
    adata = sc.read(os.path.expanduser(f'~/Documents/benchmarking_datasets/{adata_file}'))

    for version in versions:
        # Collect the per-run frames and concatenate once per (dataset, version).
        score_frames = []
        for model in models:
            for ratio in ratios:
                # Ratio 5 is not evaluated for pbmc and brain.
                if ratio == 5 and dataset in ['pbmc', 'brain']:
                    continue
                latent_adata = sc.read(os.path.expanduser(
                    f'~/Documents/benchmarking_results/rqr/{model}/{dataset}/test_{number}_{version}_cond/label_ratio_{ratio}/full_data.h5ad'))
                # The stored latents carry 'batch' / 'celltype'; mirror them under the
                # dataset-specific key names that are passed to compute_metrics.
                latent_adata.obs[batch_key] = latent_adata.obs['batch'].values
                latent_adata.obs[label_key] = latent_adata.obs['celltype'].values
                df = compute_metrics(latent_adata, adata, model, dataset, ratio, batch_key, label_key)
                score_frames.append(df)
        scores = pd.concat(score_frames, axis=0)
        # NOTE(review): the file name has no separator between the dataset and
        # 'scanvi' and hard-codes the model name; kept as-is because other code
        # may read these exact paths - confirm before renaming.
        scores.to_csv(os.path.expanduser(f'~/Documents/benchmarking_results/rqr/{dataset}scanvi_full_{version}.csv'), index=False)

(20522, 4000) (20522, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 9


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.4228828399988768
KNN-P: 0.870865674721432
(20522, 4000) (20522, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 9


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.43742429721098847
KNN-P: 0.8491819803137189
(20522, 4000) (20522, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 9


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.4431728963181458
KNN-P: 0.8499092800155041
(20522, 4000) (20522, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 9


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.4318912511333321
KNN-P: 0.8518755065672919
(20522, 4000) (20522, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 9


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.42761284997826793
KNN-P: 0.8582755198351506
(20522, 4000) (20522, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 9


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.42827242172384805
KNN-P: 0.849036856358669
(20522, 4000) (20522, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 9


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.43173799576669636
KNN-P: 0.8460761961031289
(20522, 4000) (20522, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 9


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.43133624556507705
KNN-P: 0.84282191286477
(15681, 1000) (15681, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 5


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.36498147012740617
KNN-P: 0.8919676737352664
(15681, 1000) (15681, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 5


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.36573219383621064
KNN-P: 0.8972523396304708
(15681, 1000) (15681, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 5


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.3645342228956999
KNN-P: 0.8923765676695226
(15681, 1000) (15681, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 5


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.35479940378209207
KNN-P: 0.8863009277484648
(15681, 1000) (15681, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 5


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.34825879230396994
KNN-P: 0.8681515625938019
(15681, 1000) (15681, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 5


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.36636516285773063
KNN-P: 0.8937503734172124
(15681, 1000) (15681, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 5


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.36025599130719316
KNN-P: 0.8917366790443784
(15681, 1000) (15681, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 5


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.3682171001348845
KNN-P: 0.8857793108864918
(15681, 1000) (15681, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 5


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.3704848465704946
KNN-P: 0.8864864878670782
(15681, 1000) (15681, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 5


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.3709824269627778
KNN-P: 0.8659899955091055
(332129, 4000) (332129, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 4


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.251441385356224
KNN-P: 0.9294329034079072
(332129, 4000) (332129, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 4


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.24633826987381607
KNN-P: 0.9288045081640574
(332129, 4000) (332129, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 4


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.2642384350570412
KNN-P: 0.9182674354424274
(332129, 4000) (332129, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 4


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.25862006702130536
KNN-P: 0.9138306391394118
(332129, 4000) (332129, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


Calculating EBM with n_cat = 4
EBM: 0.2519411678722951
KNN-P: 0.931830400981864
(332129, 4000) (332129, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


Calculating EBM with n_cat = 4
EBM: 0.23334664731163443
KNN-P: 0.9336512572865298
(332129, 4000) (332129, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 4


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.2447574959131403
KNN-P: 0.9212542358884537
(332129, 4000) (332129, 10)
clustering...
NMI...
ARI...
silhouette score...
PC regression...
isolated labels...
Graph connectivity...
Calculating EBM with n_cat = 4


  _,labs = connected_components(adata_post_sub.uns['neighbors']['connectivities'], connection='strong')


EBM: 0.23506846124182626
KNN-P: 0.9140000326615235
