In [1]:
import shutil
import scanpy as sc
from tqdm import tqdm

from benchmodels import *
from benchdb import *
from benchutils import *
from benchplots import *

  from .autonotebook import tqdm as notebook_tqdm
Seed set to 0


In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# data

## allen-brain

In [None]:
# brain: one AnnData per region file under ../data, keyed by region name
dss = {
    name: load_adata(f"../data/{name}.h5ad")
    for name in tqdm(["ALM", "MTG", "VISp"])
}

# label key handed to setref / annotate / eval_ in the eval loop
key = 'labels34'

100%|██████████| 3/3 [00:20<00:00,  6.89s/it]


## scIB pancreas

In [2]:
# pancreas: one AnnData per ../data/<name>.h5ad file, keyed by dataset name
dss = {
    name: load_adata(f"../data/{name}.h5ad")
    for name in tqdm(
        [
            "pancreas_celseq",
            "pancreas_celseq2",
            "pancreas_fluidigmc1",
            "pancreas_inDrop1",
            "pancreas_inDrop2",
            "pancreas_inDrop3",
            "pancreas_inDrop4",
            "pancreas_smarter",
            "pancreas_smartseq2",
        ]
    )
}

# label key handed to setref / annotate / eval_ in the eval loop
key = 'celltype'

100%|██████████| 9/9 [00:06<00:00,  1.30it/s]


## PBMC

In [None]:
# PBMC: one AnnData per ../data/<name>.h5ad file, keyed by dataset name
dss = {
    name: load_adata(f"../data/{name}.h5ad")
    for name in tqdm(
        [
            "pbmc_10Xv2",
            "pbmc_10Xv3",
            "pbmc_CEL-Seq",
            "pbmc_Drop-Seq",
            "pbmc_inDrop",
            "pbmc_Seq-Well",
            "pbmc_Smart-Seq2",
        ]
    )
}

# label key handed to setref / annotate / eval_ in the eval loop
key = 'labels'

100%|██████████| 7/7 [00:39<00:00,  5.63s/it]


## mag

In [None]:
# mag: one AnnData per ../data/<name>.h5ad file, keyed by dataset name
dss = {
    name: load_adata(f"../data/{name}.h5ad")
    for name in tqdm(["mag_young", "mag_old"])
}

# label key handed to setref / annotate / eval_ in the eval loop
key = 'celltype'

100%|██████████| 2/2 [00:09<00:00,  4.70s/it]


## cellbench

In [2]:
# cellbench: one AnnData per ../data/<name>.h5ad file, keyed by dataset name
dss = {
    name: load_adata(f"../data/{name}.h5ad")
    for name in tqdm(["cellbench_10x_5cl", "cellbench_CelSeq2_5cl"])
}

# label key handed to setref / annotate / eval_ in the eval loop
key = 'ground_truth'

# Rewrite every label as the string "label_<value>" so downstream code only
# ever sees string labels.
for d in dss:
    obs = dss[d].obs
    obs[key] = "label_" + obs[key].astype(str)


100%|██████████| 2/2 [00:01<00:00,  1.31it/s]


# eval

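Example of how to benchmark every pairwise query/reference combination across the datasets loaded above. Each dataset cell overwrites `dss` and `key`, so the cell that was run last determines which datasets are benchmarked.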

In [3]:
# Results store read from bench.json; the eval loop adds entries to it and
# saves it back to the same path.
db = load_benchdb("bench.json")

# Model classes to benchmark; each is instantiated with no arguments by the
# eval loop.
models = [
    RefCM,
    Clustifyr,
    CIPR,
    SingleRcluster,
    CellTypist,
    SVM,
    Seurat,
    SingleR,
    SCMAPCell,
    SCMAPCluster,
    # SCANVI, SCALEX
]

# Extra options, grouped under one top-level key per model.
kwargs = {
    "scmapcell": {
        'w_agree': 1,
        'threshold': 0.0,
    },
    "scmapcluster": {
        'threshold': 0.0,
    },
    "RefCM": {
        "discovery_threshold": 0.0,
    },
}

In [4]:
# Benchmark every model on every ordered (query, reference) pair of distinct
# datasets in `dss`: set the reference, annotate the query, evaluate, print a
# one-line summary, and record the result in `db`.
for model in models:
    # One instance per model class, reused for all reference/query pairs.
    m = model()
    
    for rid in dss:
                
        ref = dss[rid]
        # CellTypist is prepared with target_sum=10_000; all other models pass None.
        # NOTE(review): prep_adata runs on the same shared object once per model
        # (and again below for each query) -- presumably idempotent; confirm.
        prep_adata(ref, target_sum=10_000 if m.id_ == 'CellTypist' else None)

    
        # Remove the SCALEX/q, SCALEX/q_r and SCALEX/r directories before setting
        # the reference; ignore_errors=True means missing directories are fine.
        # Presumably these hold leftovers from a previous run -- confirm.
        if m.id_ in ['scANVI', 'SCALEX']:
            shutil.rmtree("SCALEX/q", ignore_errors=True)
            shutil.rmtree("SCALEX/q_r", ignore_errors=True)
            shutil.rmtree("SCALEX/r", ignore_errors=True)
    
        # NOTE(review): `kwargs` is keyed by model id but is unpacked whole, so
        # every top-level key arrives as a keyword argument. Confirm the models
        # expect this rather than only their own entry (e.g. kwargs.get(m.id_, {})).
        with suppress_all_console():
            m.setref(ref, key, **kwargs)
        
        for qid in dss:
            
            # A dataset is never annotated against itself.
            if qid == rid:
                continue
            
            q = dss[qid]
            prep_adata(q, target_sum=10_000 if m.id_ == 'CellTypist' else None)
            
            
            with suppress_all_console():
                a = m.annotate(q, key, **kwargs)
                
            # a.cacc is read right after this call -- presumably eval_ computes it; confirm.
            a.eval_(q, key)
            
            print(f"{m.id_:<15} : {qid:>25} | {rid:<25} : cacc {a.cacc:.3f} | {m.time}")

            add_bench(db, m.id_, qid, rid, a)
        
        # Persist after each reference dataset, so results recorded for earlier
        # references are already on disk if a later one fails.
        save_benchdb(db, "bench.json")

RefCM           :     cellbench_CelSeq2_5cl | cellbench_10x_5cl         : cacc 1.000 | time:       0.289s | setref:          N/A | annot:       0.289s
RefCM           :         cellbench_10x_5cl | cellbench_CelSeq2_5cl     : cacc 1.000 | time:       0.302s | setref:          N/A | annot:       0.302s
clustifyr       :     cellbench_CelSeq2_5cl | cellbench_10x_5cl         : cacc 1.000 | time:       0.477s | setref:       0.404s | annot:       0.073s
clustifyr       :         cellbench_10x_5cl | cellbench_CelSeq2_5cl     : cacc 1.000 | time:       0.502s | setref:       0.029s | annot:       0.472s
CIPR            :     cellbench_CelSeq2_5cl | cellbench_10x_5cl         : cacc 0.325 | time:       1.628s | setref:       0.207s | annot:       1.421s
CIPR            :         cellbench_10x_5cl | cellbench_CelSeq2_5cl     : cacc 0.322 | time:       8.078s | setref:       0.047s | annot:       8.031s
SingleRcluster  :     cellbench_CelSeq2_5cl | cellbench_10x_5cl         : cacc 1.000 | time:  