In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import mudata as mu
from muon import atac as ac

from cfgen.paths import DATA_DIR

from cfgen.eval.compute_evaluation_metrics import compute_evaluation_metrics

Here we show how to compute evaluation metrics as presented in the paper for the multimodal example. For this, you need the real single-cell anndata and a generated one.

**Utils functions**

In [2]:
def add_to_dict(d, metrics):
    for metric in metrics:
        if metric not in d:
            d[metric] = [metrics[metric]]
        else:
            d[metric]+=[metrics[metric]]
    return d

Start dictionary to collect metrics 

In [3]:
results_atac = {}
results_rna = {}

Read the real data 

In [4]:
adata_real = mu.read(DATA_DIR / "processed/atac/pbmc/pbmc10k_multiome_test.h5mu")

  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)


## Preprocess the real data 

RNA 

In [5]:
adata_real_rna = adata_real["rna"]

# Bring back counts 
adata_real_rna.X = adata_real_rna.layers["X_counts"].copy()
# Compute HVG (don't subset)
sc.pp.highly_variable_genes(adata_real_rna,
                            flavor="seurat_v3",
                            n_top_genes=2000,
                            layer="X_counts",
                            subset=False)
vars_rna = adata_real_rna.var.copy()

# Pick 30 pcs
sc.pp.normalize_total(adata_real_rna, target_sum=1e4)
sc.pp.log1p(adata_real_rna)
sc.tl.pca(adata_real_rna, n_comps=30)

ATAC 

In [6]:
adata_real_atac = adata_real["atac"]
# Bring back counts 
adata_real_atac.obs["cell_type"] = adata_real_rna.obs["cell_type"]  # Harmonize annotation
adata_real_atac.X = adata_real_atac.layers["X_counts"].copy()
ac.pp.tfidf(adata_real_atac, scale_factor=1e4)
# Compute highly variable peaks 
sc.pp.highly_variable_genes(adata_real_atac, n_top_genes=10000, subset=False)
vars_atac = adata_real_atac.var.copy()
sc.tl.pca(adata_real_rna, n_comps=30)

Initialize unique cell types 

In [7]:
celltype_unique = np.unique(adata_real_rna.obs["cell_type"])  # unique cell type 
adata_real_rna = adata_real_rna[:, adata_real_rna.var.highly_variable]
adata_real_atac = adata_real_atac[:, adata_real_atac.var.highly_variable]

# Collect generated data 

RNA generated 

In [8]:
adata_generated_path_rna = DATA_DIR / "generated/pbmc10k_multimodal/generated_cells_0_rna.h5ad"
adata_generated_rna = sc.read_h5ad(adata_generated_path_rna)
adata_generated_rna.var = vars_rna
adata_generated_rna = adata_generated_rna[:, adata_generated_rna.var.highly_variable]
adata_generated_rna.obsm["X_pca"] = adata_generated_rna.X.toarray().dot(adata_real_rna.varm["PCs"])

  adata_generated_rna.obsm["X_pca"] = adata_generated_rna.X.toarray().dot(adata_real_rna.varm["PCs"])


ATAC generated 

In [9]:
adata_generated_path_atac = DATA_DIR / "generated/pbmc10k_multimodal/generated_cells_0_atac.h5ad"
adata_generated_atac = sc.read_h5ad(adata_generated_path_atac)
adata_generated_atac.var = vars_atac
ac.pp.tfidf(adata_generated_atac, scale_factor=1e4)
adata_generated_atac = adata_generated_atac[:, adata_generated_atac.var.highly_variable]
adata_generated_atac.obsm["X_pca"] = adata_generated_atac.X.toarray().dot(adata_real_atac.varm["PCs"])

  adata_generated_atac.obsm["X_pca"] = adata_generated_atac.X.toarray().dot(adata_real_atac.varm["PCs"])


## Compute metrics

In [10]:
for ct in celltype_unique:
    adata_real_ct_atac = adata_real_atac[adata_real_atac.obs["cell_type"]==ct]
    adata_real_ct_rna = adata_real_rna[adata_real_rna.obs["cell_type"]==ct]
    adata_generated_rna_ct = adata_generated_rna[adata_generated_rna.obs["cell_type"]==ct]
    adata_generated_atac_ct = adata_generated_atac[adata_generated_atac.obs["cell_type"]==ct]
    results_rna_ct = compute_evaluation_metrics(adata_real_ct_rna, 
                                                            adata_generated_rna_ct, 
                                                            "cell_type",
                                                            "celldreamer_rna",
                                                            nn=10, 
                                                            original_space=True, 
                                                            knn_pca=None, 
                                                            knn_data=None)

    results_atac_ct = compute_evaluation_metrics(adata_real_ct_atac, 
                                                                adata_generated_atac_ct,
                                                                "cell_type", 
                                                                "celldreamer_atac",
                                                                nn=10, 
                                                                original_space=True,
                                                                knn_pca=None, 
                                                                knn_data=None)

    results_rna_ct["ct"] = ct
    results_atac_ct["ct"] = ct
    results_rna = add_to_dict(results_rna, results_rna_ct)
    results_atac = add_to_dict(results_atac, results_atac_ct)

Evaluating for celldreamer_rna
Real (79, 2000)
Generated (83, 2000)
Evaluating for celldreamer_atac
Real (79, 10000)
Generated (83, 10000)
Evaluating for celldreamer_rna
Real (97, 2000)
Generated (83, 2000)
Evaluating for celldreamer_atac
Real (97, 10000)
Generated (83, 10000)
Evaluating for celldreamer_rna
Real (21, 2000)
Generated (22, 2000)
Evaluating for celldreamer_atac
Real (21, 10000)
Generated (22, 10000)
Evaluating for celldreamer_rna
Real (366, 2000)
Generated (358, 2000)
Evaluating for celldreamer_atac
Real (366, 10000)
Generated (358, 10000)
Evaluating for celldreamer_rna
Real (84, 2000)
Generated (93, 2000)
Evaluating for celldreamer_atac
Real (84, 10000)
Generated (93, 10000)
Evaluating for celldreamer_rna
Real (151, 2000)
Generated (158, 2000)
Evaluating for celldreamer_atac
Real (151, 10000)
Generated (158, 10000)
Evaluating for celldreamer_rna
Real (105, 2000)
Generated (106, 2000)
Evaluating for celldreamer_atac
Real (105, 10000)
Generated (106, 10000)
Evaluating for 

## Print metrics 

In [11]:
results_rna_df = pd.DataFrame(results_rna)
results_atac_df = pd.DataFrame(results_atac)

Cell type metrics

In [12]:
results_rna_df.groupby("ct").mean()

Unnamed: 0_level_0,1-Wasserstein_PCA,2-Wasserstein_PCA,Linear_MMD_PCA,RBF_MMD_PCA
ct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CD56 (bright) NK cells,13.901363,13.964614,164.205963,0.949198
CD56 (dim) NK cells,14.048255,14.101911,170.092728,0.944428
MAIT T cells,14.268748,14.293383,160.173111,1.135228
classical monocytes,13.617093,13.694747,153.147125,0.717228
effector CD8 T cells,14.366707,14.430833,166.116928,0.862776
intermediate monocytes,13.538783,13.618487,152.128418,0.761678
memory B cells,13.775281,13.825853,162.347321,0.887582
memory CD4 T cells,14.195701,14.251613,172.447113,0.847939
myeloid DC,13.217821,13.347065,129.638779,0.776392
naive B cells,13.994611,14.031411,165.519226,0.992349


In [13]:
results_atac_df.groupby("ct").mean()

Unnamed: 0_level_0,1-Wasserstein_PCA,2-Wasserstein_PCA,Linear_MMD_PCA,RBF_MMD_PCA
ct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CD56 (bright) NK cells,17.549646,18.153809,141.890717,1.115963
CD56 (dim) NK cells,17.073234,17.71109,166.354004,1.132415
MAIT T cells,17.802936,18.763143,95.825935,1.255773
classical monocytes,18.084646,19.047541,136.628464,0.757537
effector CD8 T cells,23.598734,25.774977,227.464279,0.998872
intermediate monocytes,19.016929,20.085134,143.168152,0.789143
memory B cells,16.921557,17.679772,123.306808,0.84078
memory CD4 T cells,17.051328,18.060486,98.650558,0.826709
myeloid DC,18.982511,20.158217,18.45129,0.793022
naive B cells,15.08401,15.625309,89.797333,0.860073
