# Read real and generated datasets 

In [89]:
import scanpy as sc
from celldreamer.eval.compute_evaluation_metrics import process_labels, compute_evaluation_metrics
from scipy import sparse
import pandas as pd

In [90]:
def add_to_dict(d, metrics):
    """Append every value in ``metrics`` to the matching list in ``d``.

    Args:
        d: Accumulator mapping a metric name to a list of values. It is
            modified in place; missing keys are created on first use.
        metrics: Mapping of metric name to the value from one evaluation.

    Returns:
        The same ``d`` object, after the update.
    """
    for name, value in metrics.items():
        if name in d:
            # Key already seen: extend the existing list in place.
            d[name] += [value]
        else:
            # First occurrence: start a new single-element list.
            d[name] = [value]
    return d

In [105]:
# Load the real (reference) AnnData that the generated cells are evaluated against.
# NOTE(review): absolute, user-specific path — consider deriving it from a configurable project root.
adata_real_path = "/home/icb/alessandro.palma/environment/celldreamer/project_folder/datasets/processed_full_genome/hlca_core/hlca_core_test.h5ad"
adata_real = sc.read_h5ad(adata_real_path)

In [106]:
# Number of observations (rows) in the real dataset.
n_obs = adata_real.n_obs

In [107]:
# Accumulators: metric name -> list holding one value per generated replicate.
results_celldreamer = {}
results_scvi = {}
results_scdiff = {}

# Evaluate the three generated replicates (file suffixes 0, 1, 2) of each model.
for i in range(3):
    # NOTE(review): the generated-data paths below point at pbmc3k / tabula_muris while
    # adata_real is read from hlca_core — confirm every path refers to the same dataset.

    # The celldreamer and scVI loaders were commented out although their results are
    # used further down, which raises NameError on a fresh kernel; they are restored here.
    adata_generated_path_celldreamer = f"/home/icb/alessandro.palma/environment/celldreamer/project_folder/datasets/generated/pbmc3k/generated_cells_{i}.h5ad"
    adata_generated_celldreamer = sc.read_h5ad(adata_generated_path_celldreamer)

    adata_generated_path_scvi = f"/home/icb/alessandro.palma/environment/celldreamer/project_folder/baseline_experiments/scvi/pbmc3k/generated/pbmc3k_{i}.h5ad"
    adata_generated_scvi = sc.read_h5ad(adata_generated_path_scvi)
    # Store the expression matrix as CSR before label processing.
    adata_generated_scvi.X = sparse.csr_matrix(adata_generated_scvi.X)
    adata_generated_scvi = process_labels(adata_real, adata_generated_scvi, "cell_type", categorical_obs=True)

    adata_generated_path_scDiffusion = f"/home/icb/alessandro.palma/environment/celldreamer/project_folder/baseline_experiments/scDiffusion/generated/tabula_muris/generated_cells_{i}.h5ad"
    # Keep only the last n_obs generated cells (presumably to match the size of the real
    # dataset — confirm). The explicit .copy() turns the slice view into a standalone
    # object so that assigning .X below does not write to a view.
    adata_generated_scDiffusion = sc.read_h5ad(adata_generated_path_scDiffusion)[-n_obs:].copy()
    adata_generated_scDiffusion.X = sparse.csr_matrix(adata_generated_scDiffusion.X)
    adata_generated_scDiffusion = process_labels(adata_real, adata_generated_scDiffusion, "cell_type", categorical_obs=False)

    # Same metric configuration for every model so the results are comparable.
    results_celldreamer_i = compute_evaluation_metrics(adata_real, adata_generated_celldreamer, "cell_type", conditional=True, nn=10)
    results_scvi_i = compute_evaluation_metrics(adata_real, adata_generated_scvi, "cell_type", conditional=True, nn=10)
    results_scdiff_i = compute_evaluation_metrics(adata_real, adata_generated_scDiffusion, "cell_type", conditional=True, nn=10)

    # Append this replicate's metrics to the per-model accumulators.
    results_celldreamer = add_to_dict(results_celldreamer, results_celldreamer_i)
    results_scvi = add_to_dict(results_scvi, results_scvi_i)
    results_scdiff = add_to_dict(results_scdiff, results_scdiff_i)

KeyboardInterrupt: 

In [113]:
# Re-load one scDiffusion replicate for manual inspection.
# NOTE(review): relies on `i` still being defined by the evaluation loop — confirm on a fresh run.
a = sc.read_h5ad(f"/home/icb/alessandro.palma/environment/celldreamer/project_folder/baseline_experiments/scDiffusion/generated/tabula_muris/generated_cells_{i}.h5ad")

In [115]:
# Display the expression matrix repr (shape, dtype, storage format, stored entries).
a.X

<110560x19734 sparse matrix of type '<class 'numpy.float32'>'
	with 456611850 stored elements in Compressed Sparse Row format>

In [94]:
# One DataFrame per model, built from its accumulated metric lists
# (one column per metric name).
results_celldreamer_df, results_scvi_df, results_scdiff_df = (
    pd.DataFrame(model_results)
    for model_results in (results_celldreamer, results_scvi, results_scdiff)
)

In [95]:
# Column-wise mean: average of each metric column over the rows of the table.
results_celldreamer_df.mean(axis=0)

1-Wasserstein_PCA       18.212159
2-Wasserstein_PCA       18.359532
Linear_MMD_PCA         208.269852
Poly_MMD_PCA         41245.282552
1-Wasserstein           32.323120
2-Wasserstein           32.412764
Linear_MMD               8.713562
Poly_MMD             10913.864583
KNN identity             0.106061
KNN identity PCA         0.000000
precision                0.326389
recall                   0.199495
density                  1.390467
coverage                 0.987374
precision_PCA            0.779672
recall_PCA               0.001894
density_PCA              0.396717
coverage_PCA             0.314394
KNN category             0.727904
KNN category PCA         0.730429
dtype: float64

In [96]:
# Column-wise mean: average of each metric column over the rows of the table.
results_scvi_df.mean(axis=0)

1-Wasserstein_PCA       19.005442
2-Wasserstein_PCA       19.137676
Linear_MMD_PCA         244.383372
Poly_MMD_PCA         56094.671875
1-Wasserstein           32.041432
2-Wasserstein           32.104638
Linear_MMD               2.021483
Poly_MMD              2165.619792
KNN identity             0.635732
KNN identity PCA         0.000000
precision                0.250631
recall                   0.532828
density                  0.351073
coverage                 0.943813
precision_PCA            0.653409
recall_PCA               0.000000
density_PCA              0.236932
coverage_PCA             0.218434
KNN category             0.797980
KNN category PCA         0.775253
dtype: float64

In [97]:
# Column-wise mean: average of each metric column over the rows of the table.
results_scdiff_df.mean(axis=0)

1-Wasserstein_PCA        22.238991
2-Wasserstein_PCA        22.503291
Linear_MMD_PCA          484.615346
Poly_MMD_PCA         326801.156250
1-Wasserstein            27.431624
2-Wasserstein            27.589853
Linear_MMD              229.644613
Poly_MMD             469771.395833
KNN identity              0.000000
KNN identity PCA          0.000000
precision                 0.893939
recall                    0.247475
density                   2.682449
coverage                  0.823232
precision_PCA             0.179924
recall_PCA                0.001894
density_PCA               0.035480
coverage_PCA              0.104798
KNN category              0.766414
KNN category PCA          0.798611
dtype: float64

In [141]:
# Load the training split of the classifier experiment (pbmc_covid, per the file name).
adata_train = sc.read_h5ad("/home/icb/alessandro.palma/environment/celldreamer/project_folder/datasets/processed/classifier_experiment/pbmc_covid_train.h5ad")

In [143]:
# Load the test split of the classifier experiment (pbmc_covid, per the file name).
adata_test = sc.read_h5ad("/home/icb/alessandro.palma/environment/celldreamer/project_folder/datasets/processed/classifier_experiment/pbmc_covid_test.h5ad")

In [147]:
# Row-wise maximum of the "X_counts" layer of the training split.
# `.toarray()` replaces the `.A` shorthand, which is deprecated on SciPy sparse
# matrices and absent from newer releases.
adata_train.layers["X_counts"].toarray().max(axis=1)

array([30., 40., 99., ..., 11., 13., 14.], dtype=float32)

In [148]:
# Row-wise maximum of the "X_counts" layer of the test split.
# `.toarray()` replaces the `.A` shorthand, which is deprecated on SciPy sparse
# matrices and absent from newer releases.
adata_test.layers["X_counts"].toarray().max(axis=1)

array([119.,  41.,  27., ...,  10.,  24.,   8.], dtype=float32)

In [151]:
# Row-wise maximum of the main matrix (.X) of the training split.
# `.toarray()` replaces the `.A` shorthand, which is deprecated on SciPy sparse
# matrices and absent from newer releases.
adata_train.X.toarray().max(axis=1)

array([3.4277358, 3.7100825, 4.6003613, ..., 2.4603353, 2.6023011,
       2.7006817], dtype=float32)

In [152]:
# Row-wise maximum of the main matrix (.X) of the test split.
# `.toarray()` replaces the `.A` shorthand, which is deprecated on SciPy sparse
# matrices and absent from newer releases.
# NOTE(review): the recorded output shows maxima close to the raw-count maxima of the
# test split, whereas the train .X maxima are around 3–5 — the two splits look like
# they were preprocessed differently; confirm.
adata_test.X.toarray().max(axis=1)

array([118.87081  ,  40.58785  ,  26.829372 , ...,   9.832416 ,
        23.13024  ,   7.9390764], dtype=float32)

In [154]:
# Load the augmented dataset of the classifier experiment (file name: pbmc_covid_augmented_prop).
adata_aug = sc.read_h5ad("/home/icb/alessandro.palma/environment/celldreamer/project_folder/datasets/processed/classifier_experiment/augmented/pbmc_covid_augmented_prop.h5ad")