In [1]:
import scanpy as sc
import numpy as np

In [2]:
# Load the "shuffled" UCE AnnData export; the bare name on the last line
# displays its summary.
# NOTE(review): the path contains a doubled "/" ("export_data//..."). POSIX
# treats it as a single separator, so this is presumably just a typo - confirm.
ad_shuffled = sc.read("export_data//tabula_sapiens_subset_shuffled_uce_adata.h5ad")
ad_shuffled

AnnData object with n_obs × n_vars = 100000 × 19112
    obs: 'donor', 'idx', 'n_genes'
    var: 'ensembl_id', 'gene_symbol', 'genome', 'mt', 'ercc', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'mean', 'std', 'n_cells'
    obsm: 'X_uce'

In [3]:
# Reorder the rows of ad_shuffled by ascending obs["idx"] and materialise the
# result as an independent copy.
# NOTE(review): presumably "idx" records each cell's position before shuffling,
# so this restores the original row order - confirm against the export script.
ad_shuffled_unshuffle = ad_shuffled[np.argsort(ad_shuffled.obs["idx"].values)].copy()
ad_shuffled_unshuffle

AnnData object with n_obs × n_vars = 100000 × 19112
    obs: 'donor', 'idx', 'n_genes'
    var: 'ensembl_id', 'gene_symbol', 'genome', 'mt', 'ercc', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'mean', 'std', 'n_cells'
    obsm: 'X_uce'

In [4]:
# Display the "X_uce" array stored in obsm of the shuffled object.
ad_shuffled.obsm["X_uce"]

array([[-0.03719243, -0.03832808, -0.02143534, ...,  0.01128549,
        -0.00344243, -0.00220032],
       [-0.09075368, -0.01524959,  0.00078456, ..., -0.012646  ,
         0.01236705, -0.00080781],
       [-0.00200172, -0.00326886,  0.00899855, ...,  0.00251592,
        -0.00091363, -0.01873168],
       ...,
       [ 0.02946423,  0.0008341 ,  0.00860818, ..., -0.01432771,
        -0.00768381,  0.00337972],
       [-0.00669338,  0.00927865, -0.00934948, ...,  0.0053122 ,
        -0.00956197,  0.02493194],
       [-0.05000715, -0.03622312,  0.02516385, ...,  0.01570737,
        -0.01682933, -0.03734508]], shape=(100000, 1280), dtype=float32)

In [5]:
def get_cosine_similarity(A, B, eps=1e-8):
    """Compute the cosine similarity between corresponding rows of two matrices.

    Entry ``i`` of the result is
    ``dot(A[i], B[i]) / (||A[i]|| * ||B[i]|| + eps)``, evaluated in float32.

    Parameters
    ----------
    A, B : array-like, 2-D
        Matrices whose rows are paired one-to-one by position. Anything
        ``np.asarray`` can convert (ndarray, nested lists, ...) is accepted.
        Both are expected to have the same shape.
    eps : float, default 1e-8
        Added to the product of the two norms so that an all-zero row yields
        a similarity of 0 instead of a division by zero.

    Returns
    -------
    numpy.ndarray
        1-D array with one similarity value per row pair.

    Raises
    ------
    ValueError
        If either input is not 2-D.
    """
    # Written by GPT 5.1 Thinking 12/1/25
    A = np.asarray(A, dtype=np.float32)
    B = np.asarray(B, dtype=np.float32)
    if A.ndim != 2 or B.ndim != 2:
        raise ValueError(
            f"A and B must both be 2-D, got shapes {A.shape} and {B.shape}"
        )

    # Dot product for each row pair -> shape (n_rows,)
    dot = np.einsum('ij,ij->i', A, B)  # or (A * B).sum(axis=1)

    # L2 norm of every row -> shape (n_rows,)
    normA = np.linalg.norm(A, axis=1)
    normB = np.linalg.norm(B, axis=1)

    # Cosine similarity for each row pair -> shape (n_rows,)
    return dot / (normA * normB + eps)

In [6]:
# Row-wise cosine similarity between the idx-sorted copy and the still-shuffled
# object.
# NOTE(review): since the two differ only in row order, row i presumably pairs
# two unrelated cells, giving a random-pair baseline - confirm.
cs_random_pair = get_cosine_similarity(ad_shuffled_unshuffle.obsm["X_uce"], ad_shuffled.obsm["X_uce"])
cs_random_pair

array([0.26070076, 0.11646733, 0.30022547, ..., 0.30008838, 0.22064386,
       0.29229802], shape=(100000,), dtype=float32)

In [7]:
# Mean and standard deviation of the cs_random_pair similarities.
cs_random_pair.mean(), cs_random_pair.std()

(np.float32(0.3345018), np.float32(0.1645341))

In [8]:
# Load the "unshuffled" UCE AnnData export; the bare name on the last line
# displays its summary.
ad_unshuffled = sc.read("export_data/tabula_sapiens_subset_unshuffled_uce_adata.h5ad")
ad_unshuffled

AnnData object with n_obs × n_vars = 100000 × 19112
    obs: 'donor', 'idx', 'n_genes'
    var: 'ensembl_id', 'gene_symbol', 'genome', 'mt', 'ercc', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'mean', 'std', 'n_cells'
    obsm: 'X_uce'

In [9]:
# Row-wise cosine similarity between the idx-sorted embeddings and the
# embeddings loaded from the unshuffled file.
# NOTE(review): rows are paired purely by position, so this assumes
# ad_unshuffled is already in the same "idx" order as ad_shuffled_unshuffle -
# confirm, e.g. by comparing the two obs["idx"] columns.
cs_different_seed = get_cosine_similarity(ad_shuffled_unshuffle.obsm["X_uce"], ad_unshuffled.obsm["X_uce"])
cs_different_seed

array([0.90474707, 0.9359961 , 0.9480248 , ..., 0.9843436 , 0.97018653,
       0.9881486 ], shape=(100000,), dtype=float32)

In [10]:
# Mean and standard deviation of the cs_different_seed similarities.
cs_different_seed.mean(), cs_different_seed.std()

(np.float32(0.95744807), np.float32(0.03155395))

# Check for nearest cell types: cosine similarity between random cells of the same type

In [11]:
# Load the AnnData file "uce_tabula_33_8_umap.h5ad"; the bare name on the last
# line displays its summary.
full_ad = sc.read("export_data/uce_tabula_33_8_umap.h5ad")
full_ad

AnnData object with n_obs × n_vars = 1194952 × 1280
    obs: 'donor', 'tissue', 'anatomical_position', 'method', 'cdna_plate', 'library_plate', 'notes', 'cdna_well', 'old_index', 'assay', 'sample_id', 'sample', 'replicate', '10X_run', '10X_barcode', 'ambient_removal', 'donor_method', 'donor_assay', 'donor_tissue', 'donor_tissue_assay', 'cell_ontology_class', 'cell_ontology_id', 'compartment', 'broad_cell_class', 'free_annotation', 'manually_annotated', 'published_2022', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ercc', 'pct_counts_ercc', '_scvi_batch', '_scvi_labels', 'scvi_leiden_donorassay_full', 'age', 'sex', 'ethnicity', 'n_genes'
    uns: 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [12]:
# Number of cells per "cell_ontology_class" label.
cell_type_vcs = full_ad.obs["cell_ontology_class"].value_counts()
# Keep only the labels with at least 5000 cells.
# NOTE(review): the 5000 cut-off is a bare literal; its rationale is not stated
# in this notebook - consider naming it as a constant.
check_cell_types = list(cell_type_vcs[cell_type_vcs >= 5000].index)
check_cell_types

['b cell',
 'cd4-positive, alpha-beta t cell',
 'fibroblast',
 'cd8-positive, alpha-beta t cell',
 'macrophage',
 'neutrophil',
 'stromal cell of ovary',
 'basal cell',
 'endothelial cell',
 'plasma cell',
 'bladder urothelial cell',
 'monocyte',
 'mesenchymal stem cell',
 'classical monocyte',
 'smooth muscle cell',
 'capillary endothelial cell',
 'luminal epithelial cell of mammary gland',
 'erythrocyte',
 'natural killer cell',
 'type ii pneumocyte',
 'enterocyte of epithelium proper of duodenum',
 'stratified squamous epithelial cell',
 'acinar cell of salivary gland',
 'pericyte',
 'conjunctival epithelial cell',
 'luminal cell of prostate epithelium',
 'cardiac endothelial cell',
 'naive thymus-derived cd4-positive, alpha-beta t cell',
 'kidney epithelial cell',
 'hepatocyte',
 't cell',
 'enterocyte of epithelium proper of ileum',
 'skeletal muscle satellite stem cell',
 'mast cell',
 'blood vessel smooth muscle cell',
 'regular atrial cardiac myocyte',
 'mononuclear phagocyte',

In [13]:
from tqdm.auto import tqdm

# Fixed seed so the random within-type pairings, and therefore the per-type
# means computed below, are reproducible after a kernel restart.
rng = np.random.default_rng(0)

# Maps each cell type to the mean cosine similarity between its cells and a
# random reordering of those same cells.
cts_to_cs = {}
for ct in tqdm(check_cell_types):
    # Restrict to the cells annotated with this cell type.
    ad_ct = full_ad[full_ad.obs["cell_ontology_class"] == ct]
    # .toarray() densifies .X (assumes .X is stored sparse - TODO confirm).
    X = ad_ct.X.toarray()
    # Random permutation of the row indices: row i of X gets compared with a
    # randomly chosen row of the same cell type.
    rd_idx = rng.permutation(X.shape[0])
    X_shuffled = X[rd_idx]
    cts_to_cs[ct] = get_cosine_similarity(X, X_shuffled).mean()

  0%|          | 0/42 [00:00<?, ?it/s]

In [14]:
# Mean of the per-cell-type values stored in cts_to_cs (each cell type counts
# once, regardless of how many cells it has).
np.array(list(cts_to_cs.values())).mean()

np.float32(0.7410902)

In [15]:
# Standard deviation of the per-cell-type values stored in cts_to_cs.
np.array(list(cts_to_cs.values())).std()

np.float32(0.08838029)

# Check scGPT

In [17]:
# Copy full_scGPT_3000.h5ad into export_data/, where a later cell reads it.
# NOTE(review): the source is an absolute, machine-specific path, so this cell
# only works on a machine where that path exists.
!cp /lfs/local/0/yanay/scGPT_code/full_scGPT_3000.h5ad export_data/full_scGPT_3000.h5ad

In [18]:
# Load the two scGPT AnnData files that are compared below: the subsample
# export and the copy of full_scGPT_3000.h5ad in export_data/.
subsample_ad = sc.read("export_data/scgpt_tabula_subsample_3000hvg_scgpt.h5ad")
full_scgpt_ad = sc.read("export_data/full_scGPT_3000.h5ad")

In [19]:
# Select from the full object the observations named in subsample_ad.obs_names.
# NOTE(review): presumably the selection follows the order of those names, so
# rows line up with subsample_ad - confirm.
full_scgpt_ad_sub = full_scgpt_ad[subsample_ad.obs_names]

In [20]:
# "X_scGPT" embedding of the selected cells, converted with .toarray()
# (which implies it is stored as a sparse matrix in this file).
full_scgpt_x = full_scgpt_ad_sub.obsm["X_scGPT"].toarray()

In [21]:
# "X_scGPT" embedding from the subsample file, used as stored (no .toarray()).
subsample_scgpt_x = subsample_ad.obsm["X_scGPT"]

In [22]:
# Row-wise cosine similarity between the two scGPT embeddings; rows are paired
# by position.
cs_scgpt = get_cosine_similarity(full_scgpt_x, subsample_scgpt_x)

In [23]:
# Mean and standard deviation of the cs_scgpt similarities.
cs_scgpt.mean(), cs_scgpt.std()

(np.float32(0.7975327), np.float32(0.04251705))