# 3D UMAP Export for Web Viewer

Export helper to move a parameterized AnnData UMAP run into the web viewer format.

Workflow:
1. Configure run + export paths.
2. Load the selected UMAP and optional full AnnData for metadata/genes.
3. Export viewer assets and sanity-check file sizes.

## Environment

Lightweight setup so the notebook works whether it's launched from the repo root or the `notebooks/` directory.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import sys
import gc

import scanpy as sc
import anndata as ad

HERE = Path(__file__).resolve().parent if '__file__' in globals() else Path.cwd()

def find_project_root(start: Path) -> Path:
    for candidate in [start, *start.parents]:
        if (candidate / "pyproject.toml").exists():
            return candidate
    return start


PROJECT_ROOT = find_project_root(HERE)
SRC_DIR = PROJECT_ROOT / "src"
if SRC_DIR.exists() and str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))
    
sc.settings.verbosity = 3

## Configuration

Set file locations and export options. Update `EXPERIMENT_FILE` to the run you want to export.

In [3]:
# File locations
DATASET_NAME = "suo"
DATA_ROOT = PROJECT_ROOT / "data"
RAW_DIR = DATA_ROOT / "raw"
EXPERIMENT_DIR = DATA_ROOT / "experiments"
EXPORT_DIR = Path("/Users/kemalinecik/git_nosync/_/cellucid/assets/exports") / DATASET_NAME

# Inputs/outputs
EXPERIMENT_RAW_FILE = EXPERIMENT_DIR / Path("suo_v02_02_2_tardis_30_latent_subset-unreserved.h5ad")
COMPLETE_ADATA_FILE = RAW_DIR / "suo_subset.h5ad"

# Export options
UMAP_KEY = "X_umap"            # 3D layout to send to the viewer
LATENT_KEY = None              # e.g., "X_pca"; None uses adata.X
VAR_GENE_ID_COLUMN = "hgnc"
GENE_IDENTIFIERS = None        # e.g., ["Gene1", "Gene2"] or None for all
CENTROID_OUTLIER_Q = 0.90
CENTROID_MIN_POINTS = 10

In [None]:
n_neighbors = 30
min_dist = 0.5
dist_str = f"{min_dist:.2f}".replace('.', '_')
EXPERIMENT_FILE = EXPERIMENT_DIR / Path(f"suo_v02_02_2_tardis_30_latent_subset-unreserved|k__{n_neighbors}|min_dist_{dist_str}.h5ad")

if True:
    RANDOM_SEED = 0
    adata = ad.read_h5ad(EXPERIMENT_RAW_FILE)
    del adata.uns, adata.obsm, adata.obsp
    adata = adata[adata.obs['LVL0'] == "Haematopoeitic_lineage"].copy()
    
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, random_state=RANDOM_SEED)
    sc.tl.umap(adata, n_components=3, min_dist=min_dist, random_state=RANDOM_SEED)
    
    adata.write_h5ad(EXPERIMENT_FILE)
    del adata
    gc.collect()

computing neighbors
    using data matrix X directly
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:01:04)
computing UMAP


## Load UMAP run

Load the selected AnnData object and confirm the file exists.

In [None]:
if EXPERIMENT_FILE is None or not EXPERIMENT_FILE.exists():
    raise FileNotFoundError(
        f"UMAP file not found. Set EXPERIMENT_FILE to a valid .h5ad under {EXPERIMENT_DIR}"
    )

adata = ad.read_h5ad(EXPERIMENT_FILE)
if UMAP_KEY not in adata.obsm:
    raise KeyError(f"UMAP_KEY {UMAP_KEY!r} not found in adata.obsm")

umap_coords = adata.obsm[UMAP_KEY]
adata

AnnData object with n_obs × n_vars = 841922 × 24
    obs: 'sample_ID', 'organ', 'age', 'cell_type', 'sex', 'sex_inferred', 'concatenated_integration_covariates', 'integration_donor', 'integration_biological_unit', 'integration_sample_status', 'integration_library_platform_coarse', 'n_genes', 'LVL3', 'LVL2', 'LVL1', 'LVL0', '_scvi_batch', '_scvi_labels'
    uns: 'neighbors', 'umap'
    obsm: 'X_umap'
    obsp: 'connectivities', 'distances'

## Quick UMAP stats

Lightweight sanity check on the loaded coordinates.

In [None]:
umap_stats = {
    "mean": umap_coords.mean(axis=0).tolist(),
    "std": umap_coords.std(axis=0).tolist(),
    "n_cells": adata.n_obs,
}
umap_stats

{'mean': [1.0719739198684692, 9.685429573059082, 1.9216009378433228],
 'std': [5.597991466522217, 3.9112212657928467, 5.851239204406738],
 'n_cells': 841922}

## Load full annotated data

Load the complete AnnData to supply metadata (`obs`) and gene expression matrices aligned to the UMAP run.

In [None]:
if not COMPLETE_ADATA_FILE.exists():
    raise FileNotFoundError(f"Complete AnnData file not found at {COMPLETE_ADATA_FILE}")

adata_complete = ad.read_h5ad(COMPLETE_ADATA_FILE)
# adata_complete = adata_complete[:, adata_complete.var["highly_variable"] == 1]
adata_complete = adata_complete[adata.obs.index].copy()

# Normalize counts to 1e4 per cell and log-transform for export
sc.pp.normalize_total(adata_complete, target_sum=1e4)
sc.pp.log1p(adata_complete)

adata_complete

AnnData object with n_obs × n_vars = 841922 × 12288
    obs: 'handle_anndata', 'study', 'sample_ID', 'organ', 'age', 'cell_type', 'integration_donor', 'integration_biological_unit', 'integration_sample_status', 'integration_library_platform_coarse', 'concatenated_integration_covariates'
    var: 'hgnc'
    uns: 'rank_genes_groups', 'log1p'

In [None]:
adata_complete.X[0].A

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        3.29690132]])

## Export for web viewer

Selects the latent space (uses `LATENT_KEY` if provided, else `adata.X`) and writes `points.bin` plus obs/var manifests for the HTML viewer in `index.html`.

In [None]:
from cellucid.prepare_data import export_data_for_web

In [None]:
export_data_for_web(
    X_umap=umap_coords,
    latent_space=adata.X,
    obs=adata_complete.obs,
    var=adata_complete.var,
    gene_expression=adata_complete.X,
    connectivities=adata.obsp['connectivities'],
    var_gene_id_column=VAR_GENE_ID_COLUMN,
    gene_identifiers=GENE_IDENTIFIERS,
    centroid_outlier_quantile=CENTROID_OUTLIER_Q,
    centroid_min_points=CENTROID_MIN_POINTS,
    force=False,
    var_quantization = 8,
    obs_continuous_quantization = 8,
    obs_categorical_dtype = "auto",
    compression = 6,
    # Dataset info
    out_dir=EXPORT_DIR,
    dataset_name=DATASET_NAME,
    dataset_description="Mapping the developing human immune system across organs",
    source_name="CellxGene",
    source_url="https://cellxgene.cziscience.com/collections/b1a879f6-5638-48d3-8f64-f6592c1b1561"
)

Export Settings:
  Output directory: /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo
  Compression: gzip level 6
  Var (gene) quantization: 8-bit
  Obs continuous quantization: 8-bit
  Obs categorical dtype: auto
✓ Wrote positions to /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/points.bin.gz (gzip)
✓ Wrote obs manifest (11 fields: 1 continuous, 10 categorical) to /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/obs_manifest.json with binaries in obs/


Exporting genes: 100%|██████████| 12288/12288 [04:13<00:00, 48.38it/s]


✓ Wrote var manifest (12288 genes, 8-bit quantized, gzip level 6) to /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/var_manifest.json
  Extracting unique edges from 841,922 cells...
  Found 29,570,137 unique edges, max 402 neighbors/cell
  Sorting edges for optimal compression...
✓ Wrote connectivity (29,570,137 edges, max 402 neighbors/cell, uint32) to /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/connectivity
✓ Wrote dataset identity to /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/dataset_identity.json


## Validate export artifacts

Spot-check file sizes (MB), manifest stats, and total obs/var directory sizes.

In [None]:
import json
from pathlib import Path

BYTES_IN_MB = 1024 * 1024

def size_mb(path: Path) -> float:
    return round(path.stat().st_size / BYTES_IN_MB, 3) if path.exists() else 0

def dir_stats(path: Path) -> dict:
    if not path.exists():
        return {"size_mb": 0, "files": 0}
    total_bytes = 0
    file_count = 0
    for p in path.rglob("*"):
        if p.is_file():
            file_count += 1
            total_bytes += p.stat().st_size
    return {"size_mb": round(total_bytes / BYTES_IN_MB, 3), "files": file_count}

points_path = EXPORT_DIR / "points.bin"
obs_manifest_path = EXPORT_DIR / "obs_manifest.json"
var_manifest_path = EXPORT_DIR / "var_manifest.json"
obs_dir = EXPORT_DIR / "obs"
var_dir = EXPORT_DIR / "var"

obs_manifest = json.loads(obs_manifest_path.read_text()) if obs_manifest_path.exists() else None
var_manifest = json.loads(var_manifest_path.read_text()) if var_manifest_path.exists() else None

{
    "paths": {
        "export_dir": EXPORT_DIR,
        "points": points_path,
        "obs_manifest": obs_manifest_path,
        "var_manifest": var_manifest_path,
        "obs_dir": obs_dir,
        "var_dir": var_dir,
    },
    "sizes_mb": {
        "points": size_mb(points_path),
        "obs_manifest": size_mb(obs_manifest_path),
        "var_manifest": size_mb(var_manifest_path),
    },
    "dir_sizes_mb": {
        "obs": dir_stats(obs_dir),
        "var": dir_stats(var_dir),
    },
    "manifest_stats": {
        "obs": None if obs_manifest is None else {
            "n_points": obs_manifest.get("n_points"),
            "fields": len(obs_manifest.get("fields", [])),
            "centroid_outlier_quantile": obs_manifest.get("centroid_outlier_quantile"),
        },
        "var": None if var_manifest is None else {
            "n_points": var_manifest.get("n_points"),
            "fields": len(var_manifest.get("fields", [])),
            "var_gene_id_column": var_manifest.get("var_gene_id_column"),
        },
    },
}

{'paths': {'export_dir': PosixPath('/Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo'),
  'points': PosixPath('/Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/points.bin'),
  'obs_manifest': PosixPath('/Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/obs_manifest.json'),
  'var_manifest': PosixPath('/Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/var_manifest.json'),
  'obs_dir': PosixPath('/Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/obs'),
  'var_dir': PosixPath('/Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/var')},
 'sizes_mb': {'points': 0, 'obs_manifest': 0.028, 'var_manifest': 0.44},
 'dir_sizes_mb': {'obs': {'size_mb': 8.357, 'files': 21},
  'var': {'size_mb': 758.984, 'files': 12288}},
 'manifest_stats': {'obs': {'n_points': 841922,
   'fields': 0,
   'centroid_outlier_quantile': 0.9},
  'var': {'n_points': 841922, 'fields': 12288, 'var_gene_id_column': 'hgnc'}}}

Done. Serve `index.html` from the repo root to view the exported data.