# 3D UMAP Export for Web Viewer

Export helper to move a parameterized AnnData UMAP run into the web viewer format.

Workflow:
1. Configure run + export paths.
2. Load the selected UMAP and optional full AnnData for metadata/genes.
3. Export viewer assets and sanity-check file sizes.

## Environment

Lightweight setup so the notebook works whether it's launched from the repo root or the `notebooks/` directory.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import sys

import scanpy as sc
import anndata as ad

HERE = Path(__file__).resolve().parent if '__file__' in globals() else Path.cwd()


def find_project_root(start: Path) -> Path:
    for candidate in [start, *start.parents]:
        if (candidate / "pyproject.toml").exists():
            return candidate
    return start


PROJECT_ROOT = find_project_root(HERE)
SRC_DIR = PROJECT_ROOT / "src"
if SRC_DIR.exists() and str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

from cellucid.prepare_data import export_data_for_web


## Configuration

Set file locations and export options. Update `EXPERIMENT_FILE` to the run you want to export.

In [3]:
# File locations
DATA_ROOT = PROJECT_ROOT / "data"
RAW_DIR = DATA_ROOT / "raw"
EXPERIMENT_DIR = DATA_ROOT / "experiments" / "umap_parameter_sweep"
EXPORT_DIR = PROJECT_ROOT / "exports"

# Inputs/outputs
EXPERIMENT_FILE = EXPERIMENT_DIR / Path("adata|k__90|min_dist_0_10.h5ad")
COMPLETE_ADATA_FILE = RAW_DIR / "adata_unified_20250925_001_complete.h5ad"

# Export options
UMAP_KEY = "X_umap"            # 3D layout to send to the viewer
LATENT_KEY = None              # e.g., "X_pca"; None uses adata.X
VAR_GENE_ID_COLUMN = "converted_id"
GENE_IDENTIFIERS = None        # e.g., ["Gene1", "Gene2"] or None for all
CENTROID_OUTLIER_Q = 0.90
CENTROID_MIN_POINTS = 10


## Load UMAP run

Load the selected AnnData object and confirm the file exists.

In [4]:
if EXPERIMENT_FILE is None or not EXPERIMENT_FILE.exists():
    raise FileNotFoundError(
        f"UMAP file not found. Set EXPERIMENT_FILE to a valid .h5ad under {EXPERIMENT_DIR}"
    )

adata = ad.read_h5ad(EXPERIMENT_FILE)
if UMAP_KEY not in adata.obsm:
    raise KeyError(f"UMAP_KEY {UMAP_KEY!r} not found in adata.obsm")

umap_coords = adata.obsm[UMAP_KEY]
adata

AnnData object with n_obs × n_vars = 162259 × 17
    obs: 'handle_anndata', 'lab', 'protocol', 'sample', 'stage', 'cell_type_authors', 'cell_type_lv1', 'protocol|lab|stage|sample', 'prep_n_counts', 'prep_n_genes_by_counts', 'prep_pct_counts_mt', 'prep_pct_counts_ribo', 'prep_pct_counts_bcr', 'prep_pct_counts_tcr', 'prep_score_mt', 'prep_score_ribo', 'prep_score_bcr', 'prep_score_tcr', 'prep_score_cc', 'prep_score_cc_s', 'prep_score_cc_g2m', 'prep_score_scanpy_cc_s', 'prep_score_scanpy_cc_g2m', 'prep_scanpy_cc_phase', 'prep_scanpy_sc_cycling_strength', 'qc_fail', 'qc_fail_reasons', 'cell_type_lv1||protocol|lab|stage|sample'
    uns: 'neighbors', 'umap'
    obsm: 'X_umap'
    obsp: 'connectivities', 'distances'

## Quick UMAP stats

Lightweight sanity check on the loaded coordinates.

In [5]:
umap_stats = {
    "mean": umap_coords.mean(axis=0).tolist(),
    "std": umap_coords.std(axis=0).tolist(),
    "n_cells": adata.n_obs,
}
umap_stats

{'mean': [8.495719909667969, 4.791225433349609, 5.743679046630859],
 'std': [2.0218584537506104, 2.4178414344787598, 1.4710114002227783],
 'n_cells': 162259}

## Load full annotated data

Load the complete AnnData to supply metadata (`obs`) and gene expression matrices aligned to the UMAP run.

In [6]:
if not COMPLETE_ADATA_FILE.exists():
    raise FileNotFoundError(f"Complete AnnData file not found at {COMPLETE_ADATA_FILE}")

adata_complete = ad.read_h5ad(COMPLETE_ADATA_FILE)
# adata_complete = adata_complete[:, adata_complete.var["highly_variable"] == 1]
adata_complete = adata_complete[adata.obs.index].copy()

# Normalize counts to 1e4 per cell and log-transform for export
sc.pp.normalize_total(adata_complete, target_sum=1e4)
sc.pp.log1p(adata_complete)


In [7]:
adata_complete.X[0].A

array([[0.        , 1.04850999, 0.        , ..., 0.        , 0.        ,
        0.        ]])

## Export for web viewer

Selects the latent space (uses `LATENT_KEY` if provided, else `adata.X`) and writes `points.bin` plus obs/var manifests for the HTML viewer in `index.html`.

In [8]:
if LATENT_KEY:
    if LATENT_KEY not in adata.obsm:
        raise KeyError(f"LATENT_KEY {LATENT_KEY!r} not found in adata.obsm")
    latent_space = adata.obsm[LATENT_KEY]
else:
    latent_space = adata.X

export_data_for_web(
    X_umap=umap_coords,
    latent_space=latent_space,
    obs=adata_complete.obs,
    var=adata_complete.var,
    gene_expression=adata_complete.X,
    connectivities=adata.obsp['connectivities'],
    var_gene_id_column=VAR_GENE_ID_COLUMN,
    gene_identifiers=GENE_IDENTIFIERS,
    out_dir=EXPORT_DIR,
    centroid_outlier_quantile=CENTROID_OUTLIER_Q,
    centroid_min_points=CENTROID_MIN_POINTS,
    force=False,
    var_quantization = 8,
    obs_continuous_quantization = 8,
    obs_categorical_dtype = "auto",
    compression = 6,
)

Export Settings:
  Output directory: /Users/kemalinecik/git_nosync/cellucid/exports
  Compression: gzip level 6
  Var (gene) quantization: 8-bit
  Obs continuous quantization: 8-bit
  Obs categorical dtype: auto
✓ Wrote positions to /Users/kemalinecik/git_nosync/cellucid/exports/points.bin.gz (gzip)
  ⚠ outlier quantiles 'cell_type_authors_outliers': 1 NaN values (0.0%) → mapped to missing marker
  ⚠ outlier quantiles 'cell_type_lv1||protocol|lab|stage|sample_outliers': 136 NaN values (0.1%) → mapped to missing marker
✓ Wrote obs manifest (28 fields) to /Users/kemalinecik/git_nosync/cellucid/exports/obs_manifest.json with binaries in obs/


Exporting genes: 100%|██████████| 44910/44910 [03:54<00:00, 191.64it/s]


✓ Wrote var manifest (44910 genes, 8-bit quantized, gzip level 6) to /Users/kemalinecik/git_nosync/cellucid/exports/var_manifest.json
✓ Wrote connectivity manifest (10,537,096 unique edges, ~129.0 neighbors/cell) to /Users/kemalinecik/git_nosync/cellucid/exports/connectivity_manifest.json


## Validate export artifacts

Spot-check file sizes (MB), manifest stats, and total obs/var directory sizes.

In [9]:
import json
from pathlib import Path

BYTES_IN_MB = 1024 * 1024

def size_mb(path: Path) -> float:
    return round(path.stat().st_size / BYTES_IN_MB, 3) if path.exists() else 0

def dir_stats(path: Path) -> dict:
    if not path.exists():
        return {"size_mb": 0, "files": 0}
    total_bytes = 0
    file_count = 0
    for p in path.rglob("*"):
        if p.is_file():
            file_count += 1
            total_bytes += p.stat().st_size
    return {"size_mb": round(total_bytes / BYTES_IN_MB, 3), "files": file_count}

points_path = EXPORT_DIR / "points.bin"
obs_manifest_path = EXPORT_DIR / "obs_manifest.json"
var_manifest_path = EXPORT_DIR / "var_manifest.json"
obs_dir = EXPORT_DIR / "obs"
var_dir = EXPORT_DIR / "var"

obs_manifest = json.loads(obs_manifest_path.read_text()) if obs_manifest_path.exists() else None
var_manifest = json.loads(var_manifest_path.read_text()) if var_manifest_path.exists() else None

{
    "paths": {
        "export_dir": EXPORT_DIR,
        "points": points_path,
        "obs_manifest": obs_manifest_path,
        "var_manifest": var_manifest_path,
        "obs_dir": obs_dir,
        "var_dir": var_dir,
    },
    "sizes_mb": {
        "points": size_mb(points_path),
        "obs_manifest": size_mb(obs_manifest_path),
        "var_manifest": size_mb(var_manifest_path),
    },
    "dir_sizes_mb": {
        "obs": dir_stats(obs_dir),
        "var": dir_stats(var_dir),
    },
    "manifest_stats": {
        "obs": None if obs_manifest is None else {
            "n_points": obs_manifest.get("n_points"),
            "fields": len(obs_manifest.get("fields", [])),
            "centroid_outlier_quantile": obs_manifest.get("centroid_outlier_quantile"),
        },
        "var": None if var_manifest is None else {
            "n_points": var_manifest.get("n_points"),
            "fields": len(var_manifest.get("fields", [])),
            "var_gene_id_column": var_manifest.get("var_gene_id_column"),
        },
    },
}

{'paths': {'export_dir': PosixPath('/Users/kemalinecik/git_nosync/cellucid/exports'),
  'points': PosixPath('/Users/kemalinecik/git_nosync/cellucid/exports/points.bin'),
  'obs_manifest': PosixPath('/Users/kemalinecik/git_nosync/cellucid/exports/obs_manifest.json'),
  'var_manifest': PosixPath('/Users/kemalinecik/git_nosync/cellucid/exports/var_manifest.json'),
  'obs_dir': PosixPath('/Users/kemalinecik/git_nosync/cellucid/exports/obs'),
  'var_dir': PosixPath('/Users/kemalinecik/git_nosync/cellucid/exports/var')},
 'sizes_mb': {'points': 0, 'obs_manifest': 0.126, 'var_manifest': 8.543},
 'dir_sizes_mb': {'obs': {'size_mb': 3.481, 'files': 40},
  'var': {'size_mb': 785.407, 'files': 44910}},
 'manifest_stats': {'obs': {'n_points': 162259,
   'fields': 28,
   'centroid_outlier_quantile': 0.9},
  'var': {'n_points': 162259,
   'fields': 44910,
   'var_gene_id_column': 'converted_id'}}}

Done. Serve `index.html` from the repo root to view the exported data.