In [1]:
# Silence FutureWarning process-wide. The filter is installed before the heavy
# imports below so that it is already active while they are being imported.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import multigrate as mtg
import scanpy as sc

# Print the versions of scanpy and its main dependencies for reproducibility.
sc.logging.print_versions()

scanpy==1.4.6 anndata==0.7.3 umap==0.4.6 numpy==1.19.4 scipy==1.5.4 pandas==1.1.4 scikit-learn==0.23.2 statsmodels==0.12.1 python-igraph==0.8.3 louvain==0.6.1


# Metrics

## Gayoso 2020

In [2]:
# Gayoso 2020 expression data; its obs table supplies the 'cell_types' and
# 'batch_indices' annotations copied onto the embedding further down.
rna_path = '../data/gayoso-2020/expressions.h5ad'
rna = sc.read(rna_path)
rna

AnnData object with n_obs × n_vars = 30293 × 4000
    obs: 'batch_indices', 'n_genes', 'percent_mito', 'leiden_subclusters', 'cell_types', 'tissue', 'batch', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts'
    var: 'n_cells', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'batch_colors', 'cell_types_colors', 'neighbors', 'pca', 'tissue_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'count'
    obsp: 'connectivities', 'distances'

In [3]:
# Seurat-integrated result for Gayoso 2020; the embedding is stored in .X
# (cells x 20 latent dimensions, see the printed shape).
latent_path = '../data/integrated/seurat/gayoso-seurat-20.h5ad'
latent = sc.read(latent_path)
latent

AnnData object with n_obs × n_vars = 30293 × 20
    obs: 'cell_type', 'nCount_ADT', 'nFeature_ADT', 'RNA.weight', 'ADT.weight'
    obsp: 'connectivities'

In [4]:
# pandas aligns an assigned Series on the obs index, so a cell of `latent` that
# is missing from `rna.obs` would silently receive NaN instead of raising.
# Fail loudly before the metrics are computed on incomplete annotations.
assert latent.obs_names.isin(rna.obs_names).all(), (
    'latent has cells that are absent from rna.obs; their annotations would be NaN'
)

# Make the embedding held in .X addressable by key through obsm['latent'].
latent.obsm['latent'] = latent.X
# Replace / add the annotations with the ones stored on the expression object.
# Note the source column here is 'cell_types' (plural) in this dataset.
latent.obs['cell_type'] = rna.obs['cell_types'].astype('category')
latent.obs['batch_indices'] = rna.obs['batch_indices'].astype('category')
latent

AnnData object with n_obs × n_vars = 30293 × 20
    obs: 'cell_type', 'nCount_ADT', 'nFeature_ADT', 'RNA.weight', 'ADT.weight', 'batch_indices'
    obsm: 'latent'
    obsp: 'connectivities'

In [5]:
# Score the Seurat embedding of Gayoso 2020. With isolated_label_f1 and
# pcr_batch switched off, the printed log shows the steps that actually run:
# clustering, ASW label/batch, graph connectivity, ASW label, NMI, ARI and
# isolated label silhouette.
metrics = mtg.metrics.metrics(
    None,  # first positional argument left empty - presumably the unintegrated AnnData; confirm
    latent,
    embed='latent',
    method='seurat',
    label_key='cell_type',
    batch_key='batch_indices',
    pcr_batch=False,
    isolated_label_f1=False,
    save='gayoso-seurat.csv',  # presumably the CSV the scores are written to; confirm
)
metrics

Clustering...
ASW label/batch...
Graph connectivity...
ASW label...
NMI cluster/label...
ARI cluster/label...
Isolated label silhouette...


Unnamed: 0,score
ASW_label/batch,0.736559
graph_conn,0.90062
ASW_label,0.556477
NMI_cluster/label,0.630685
ARI_cluster/label,0.422286
isolated_label_silhouette,0.615715


## Kotliarov 2020

In [6]:
# Kotliarov 2020 expression data; its obs table supplies the 'cell_type' and
# 'batch' annotations copied onto the embedding further down.
rna_path = '../data/kotliarov-2020/expressions.h5ad'
rna = sc.read(rna_path)
rna

AnnData object with n_obs × n_vars = 52117 × 3999
    obs: 'batch', 'cluster_level2', 'cluster_level3', 'sample', 'cell_type', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt'
    var: 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'batch_colors', 'cell_type_colors', 'cluster_level2_colors', 'cluster_level3_colors', 'neighbors', 'pca', 'sample_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'count'
    obsp: 'connectivities', 'distances'

In [7]:
# Seurat-integrated result for Kotliarov 2020; the embedding is stored in .X
# (cells x 20 latent dimensions, see the printed shape).
latent_path = '../data/integrated/seurat/kotliarov-seurat.h5ad'
latent = sc.read(latent_path)
latent

AnnData object with n_obs × n_vars = 52117 × 20
    obs: 'cell_type', 'nCount_ADT', 'nFeature_ADT', 'RNA.weight', 'ADT.weight'
    obsp: 'connectivities'

In [8]:
# pandas aligns an assigned Series on the obs index, so a cell of `latent` that
# is missing from `rna.obs` would silently receive NaN instead of raising.
# Fail loudly before the metrics are computed on incomplete annotations.
assert latent.obs_names.isin(rna.obs_names).all(), (
    'latent has cells that are absent from rna.obs; their annotations would be NaN'
)

# Make the embedding held in .X addressable by key through obsm['latent'].
latent.obsm['latent'] = latent.X
# Replace / add the annotations with the ones stored on the expression object.
latent.obs['cell_type'] = rna.obs['cell_type'].astype('category')
latent.obs['batch'] = rna.obs['batch'].astype('category')
latent

AnnData object with n_obs × n_vars = 52117 × 20
    obs: 'cell_type', 'nCount_ADT', 'nFeature_ADT', 'RNA.weight', 'ADT.weight', 'batch'
    obsm: 'latent'
    obsp: 'connectivities'

In [9]:
# Score the Seurat embedding of Kotliarov 2020. With isolated_label_f1 and
# pcr_batch switched off, the printed log shows the steps that actually run:
# clustering, ASW label/batch, graph connectivity, ASW label, NMI, ARI and
# isolated label silhouette.
metrics = mtg.metrics.metrics(
    None,  # first positional argument left empty - presumably the unintegrated AnnData; confirm
    latent,
    embed='latent',
    method='seurat',
    label_key='cell_type',
    batch_key='batch',
    pcr_batch=False,
    isolated_label_f1=False,
    save='kotliarov-seurat.csv',  # presumably the CSV the scores are written to; confirm
)
metrics

Clustering...
ASW label/batch...
Graph connectivity...
ASW label...
NMI cluster/label...
ARI cluster/label...
Isolated label silhouette...


Unnamed: 0,score
ASW_label/batch,0.95959
graph_conn,0.987425
ASW_label,0.604222
NMI_cluster/label,0.779305
ARI_cluster/label,0.73439
isolated_label_silhouette,0.582749


## Hao 2020

In [10]:
# Hao 2020 expression data; its obs table supplies the 'cell_type' and
# 'donor' annotations copied onto the embedding further down.
rna_path = '../data/hao-2020/expressions.h5ad'
rna = sc.read(rna_path)
rna

AnnData object with n_obs × n_vars = 161764 × 4000
    obs: 'nCount_ADT', 'nFeature_ADT', 'nCount_RNA', 'nFeature_RNA', 'orig.ident', 'lane', 'donor', 'time', 'celltype.l1', 'celltype.l2', 'celltype.l3', 'Phase', 'cell_type'
    var: 'features', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'celltype.l1_colors', 'celltype.l2_colors', 'celltype.l3_colors', 'neighbors'
    obsm: 'X_apca', 'X_aumap', 'X_pca', 'X_spca', 'X_umap', 'X_wnn.umap'
    varm: 'PCs', 'SPCA'
    layers: 'count'
    obsp: 'distances'

In [11]:
# Seurat-integrated result for Hao 2020; the embedding is stored in .X
# (cells x 20 latent dimensions, see the printed shape).
latent_path = '../data/integrated/seurat/hao-seurat.h5ad'
latent = sc.read(latent_path)
latent

AnnData object with n_obs × n_vars = 161764 × 20
    obs: 'cell_type', 'nCount_ADT', 'nFeature_ADT', 'RNA.weight', 'ADT.weight'
    obsp: 'connectivities'

In [12]:
# pandas aligns an assigned Series on the obs index, so a cell of `latent` that
# is missing from `rna.obs` would silently receive NaN instead of raising.
# Fail loudly before the metrics are computed on incomplete annotations.
assert latent.obs_names.isin(rna.obs_names).all(), (
    'latent has cells that are absent from rna.obs; their annotations would be NaN'
)

# Make the embedding held in .X addressable by key through obsm['latent'].
latent.obsm['latent'] = latent.X
# Replace / add the annotations with the ones stored on the expression object;
# 'donor' is the column used as the batch key for this dataset.
latent.obs['cell_type'] = rna.obs['cell_type'].astype('category')
latent.obs['donor'] = rna.obs['donor'].astype('category')
latent

AnnData object with n_obs × n_vars = 161764 × 20
    obs: 'cell_type', 'nCount_ADT', 'nFeature_ADT', 'RNA.weight', 'ADT.weight', 'donor'
    obsm: 'latent'
    obsp: 'connectivities'

In [13]:
# Score the Seurat embedding of Hao 2020, using 'donor' as the batch key. With
# isolated_label_f1 and pcr_batch switched off, the printed log shows the steps
# that actually run: clustering, ASW label/batch, graph connectivity, ASW
# label, NMI, ARI and isolated label silhouette.
metrics = mtg.metrics.metrics(
    None,  # first positional argument left empty - presumably the unintegrated AnnData; confirm
    latent,
    embed='latent',
    method='seurat',
    label_key='cell_type',
    batch_key='donor',
    pcr_batch=False,
    isolated_label_f1=False,
    save='hao-seurat.csv',  # presumably the CSV the scores are written to; confirm
)
metrics

Clustering...
ASW label/batch...
Graph connectivity...
ASW label...
NMI cluster/label...
ARI cluster/label...
Isolated label silhouette...


Unnamed: 0,score
ASW_label/batch,0.875531
graph_conn,0.868425
ASW_label,0.634199
NMI_cluster/label,0.699621
ARI_cluster/label,0.579198
isolated_label_silhouette,0.587845


## 10xpbmc10k 2020

In [14]:
# 10x PBMC 10k expression data; its obs table only carries 'cell_type', which
# is copied onto the embedding further down.
rna_path = '../data/10xpbmc10k-2020/expressions.h5ad'
rna = sc.read(rna_path)
rna

AnnData object with n_obs × n_vars = 10000 × 4000
    obs: 'cell_type'
    layers: 'count'

In [15]:
# Seurat-integrated result for 10x PBMC 10k; the embedding is stored in .X
# (cells x 20 latent dimensions, see the printed shape).
latent_path = '../data/integrated/seurat/10xpbmc10k-seurat.h5ad'
latent = sc.read(latent_path)
latent

AnnData object with n_obs × n_vars = 10000 × 20
    obs: 'cell_type', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_SCT', 'nFeature_SCT', 'SCT.weight', 'ATAC.weight'
    obsp: 'connectivities'

In [16]:
# pandas aligns an assigned Series on the obs index, so a cell of `latent` that
# is missing from `rna.obs` would silently receive NaN instead of raising.
# Fail loudly before the metrics are computed on incomplete annotations.
assert latent.obs_names.isin(rna.obs_names).all(), (
    'latent has cells that are absent from rna.obs; their annotations would be NaN'
)

# Make the embedding held in .X addressable by key through obsm['latent'].
latent.obsm['latent'] = latent.X
# Overwrite the labels with the ones stored on the expression object. No batch
# column is attached for this dataset.
latent.obs['cell_type'] = rna.obs['cell_type'].astype('category')
latent

AnnData object with n_obs × n_vars = 10000 × 20
    obs: 'cell_type', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_SCT', 'nFeature_SCT', 'SCT.weight', 'ATAC.weight'
    obsm: 'latent'
    obsp: 'connectivities'

In [17]:
# Score the Seurat embedding of 10x PBMC 10k. No batch key is given for this
# dataset, and asw_batch, isolated_label_asw, isolated_label_f1 and pcr_batch
# are all switched off; the printed log shows only clustering, graph
# connectivity, ASW label, NMI and ARI being run.
metrics = mtg.metrics.metrics(
    None,  # first positional argument left empty - presumably the unintegrated AnnData; confirm
    latent,
    embed='latent',
    method='seurat',
    label_key='cell_type',
    batch_key=None,
    asw_batch=False,
    pcr_batch=False,
    isolated_label_asw=False,
    isolated_label_f1=False,
    save='10xpbmc10k-seurat.csv',  # presumably the CSV the scores are written to; confirm
)
metrics

Clustering...
Graph connectivity...
ASW label...
NMI cluster/label...
ARI cluster/label...


Unnamed: 0,score
graph_conn,0.993777
ASW_label,0.605867
NMI_cluster/label,0.804092
ARI_cluster/label,0.665098


## Chen 2019

In [18]:
# Chen 2019 expression data; its obs table supplies the 'cell_type'
# annotation copied onto the embedding further down.
rna_path = '../data/chen-2019/expressions.h5ad'
rna = sc.read(rna_path)
rna

AnnData object with n_obs × n_vars = 4793 × 4000
    obs: 'Batch', 'Barcode', 'cell_type', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt'
    var: 'gene_symbols', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
    uns: 'Batch_colors', 'cell_type_colors', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    layers: 'count'
    obsp: 'connectivities', 'distances'

In [19]:
# Seurat-integrated result for Chen 2019; the embedding is stored in .X
# (cells x 20 latent dimensions, see the printed shape).
latent_path = '../data/integrated/seurat/chen-seurat.h5ad'
latent = sc.read(latent_path)
latent

AnnData object with n_obs × n_vars = 4793 × 20
    obs: 'cell_type', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_SCT', 'nFeature_SCT', 'SCT.weight', 'ATAC.weight'
    obsp: 'connectivities'

In [20]:
# pandas aligns an assigned Series on the obs index, so a cell of `latent` that
# is missing from `rna.obs` would silently receive NaN instead of raising.
# Fail loudly before the metrics are computed on incomplete annotations.
assert latent.obs_names.isin(rna.obs_names).all(), (
    'latent has cells that are absent from rna.obs; their annotations would be NaN'
)

# Make the embedding held in .X addressable by key through obsm['latent'].
latent.obsm['latent'] = latent.X
# Overwrite the labels with the ones stored on the expression object. No batch
# column is attached for this dataset.
latent.obs['cell_type'] = rna.obs['cell_type'].astype('category')
latent

AnnData object with n_obs × n_vars = 4793 × 20
    obs: 'cell_type', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_SCT', 'nFeature_SCT', 'SCT.weight', 'ATAC.weight'
    obsm: 'latent'
    obsp: 'connectivities'

In [21]:
# Score the Seurat embedding of Chen 2019. No batch key is given for this
# dataset, and asw_batch, isolated_label_asw, isolated_label_f1 and pcr_batch
# are all switched off; the printed log shows only clustering, graph
# connectivity, ASW label, NMI and ARI being run.
metrics = mtg.metrics.metrics(
    None,  # first positional argument left empty - presumably the unintegrated AnnData; confirm
    latent,
    embed='latent',
    method='seurat',
    label_key='cell_type',
    batch_key=None,
    asw_batch=False,
    pcr_batch=False,
    isolated_label_asw=False,
    isolated_label_f1=False,
    save='chen-seurat.csv',  # presumably the CSV the scores are written to; confirm
)
metrics

Clustering...
Graph connectivity...
ASW label...
NMI cluster/label...
ARI cluster/label...


Unnamed: 0,score
graph_conn,0.930371
ASW_label,0.519594
NMI_cluster/label,0.561031
ARI_cluster/label,0.424486


## Cao 2018

In [22]:
# Cao 2018 expression data; its obs table supplies the 'cell_type' and
# 'replicate' annotations copied onto the embedding further down.
rna_path = '../data/cao-2018/expressions.h5ad'
rna = sc.read(rna_path)
rna

AnnData object with n_obs × n_vars = 7362 × 4000
    obs: 'source', 'replicate', 'experiment', 'tsne_1', 'tsne_2', 'cell_type', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt'
    var: 'gene_type', 'gene_short_name', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
    uns: 'cell_type_colors', 'neighbors', 'replicate_colors', 'umap'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    layers: 'count'
    obsp: 'connectivities', 'distances'

In [23]:
# Seurat-integrated result for Cao 2018; the embedding is stored in .X
# (cells x 20 latent dimensions, see the printed shape).
latent_path = '../data/integrated/seurat/cao-seurat.h5ad'
latent = sc.read(latent_path)
latent

AnnData object with n_obs × n_vars = 7362 × 20
    obs: 'cell_type', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_SCT', 'nFeature_SCT', 'SCT.weight', 'ATAC.weight'
    obsp: 'connectivities'

In [24]:
# pandas aligns an assigned Series on the obs index, so a cell of `latent` that
# is missing from `rna.obs` would silently receive NaN instead of raising.
# Fail loudly before the metrics are computed on incomplete annotations.
assert latent.obs_names.isin(rna.obs_names).all(), (
    'latent has cells that are absent from rna.obs; their annotations would be NaN'
)

# Make the embedding held in .X addressable by key through obsm['latent'].
latent.obsm['latent'] = latent.X
# Replace / add the annotations with the ones stored on the expression object;
# 'replicate' is the column used as the batch key for this dataset.
latent.obs['cell_type'] = rna.obs['cell_type'].astype('category')
latent.obs['replicate'] = rna.obs['replicate'].astype('category')
latent

AnnData object with n_obs × n_vars = 7362 × 20
    obs: 'cell_type', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_SCT', 'nFeature_SCT', 'SCT.weight', 'ATAC.weight', 'replicate'
    obsm: 'latent'
    obsp: 'connectivities'

In [25]:
# Score the Seurat embedding of Cao 2018, using 'replicate' as the batch key.
# With isolated_label_f1 and pcr_batch switched off, the printed log shows the
# steps that actually run: clustering, ASW label/batch, graph connectivity,
# ASW label, NMI, ARI and isolated label silhouette.
metrics = mtg.metrics.metrics(
    None,  # first positional argument left empty - presumably the unintegrated AnnData; confirm
    latent,
    embed='latent',
    method='seurat',
    label_key='cell_type',
    batch_key='replicate',
    pcr_batch=False,
    isolated_label_f1=False,
    save='cao-seurat.csv',  # presumably the CSV the scores are written to; confirm
)
metrics

Clustering...
ASW label/batch...
Graph connectivity...
ASW label...
NMI cluster/label...
ARI cluster/label...
Isolated label silhouette...


Unnamed: 0,score
ASW_label/batch,0.863298
graph_conn,0.692209
ASW_label,0.561667
NMI_cluster/label,0.623704
ARI_cluster/label,0.542221
isolated_label_silhouette,0.565712
