# Multi-Dimensional UMAP Export for Web Viewer

Export helper to compute and export 1D, 2D, and 3D UMAP embeddings for the web viewer.

Workflow:
1. Configure run + export paths.
2. Compute UMAP embeddings for multiple dimensions (1D, 2D, 3D).
3. Load the embeddings and optional full AnnData for metadata/genes.
4. Export viewer assets with multi-dimensional support.
5. Sanity-check file sizes and embeddings metadata.

## Environment

Lightweight setup so the notebook works whether it's launched from the repo root or the `notebooks/` directory.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import sys
import gc
import pickle
import pandas as pd
import scanpy as sc
import anndata as ad

# Starting directory for the project-root search: the folder containing this
# file when `__file__` is defined, otherwise the current working directory
# (e.g. in an interactive session where `__file__` is not set).
HERE = Path(__file__).resolve().parent if '__file__' in globals() else Path.cwd()

def find_project_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` that holds a ``pyproject.toml``.

    ``start`` itself is checked first, then each of its parents in order.
    When no directory in that chain contains the marker file, ``start`` is
    returned unchanged.
    """
    lineage = (start, *start.parents)
    return next(
        (folder for folder in lineage if (folder / "pyproject.toml").exists()),
        start,
    )


# Repository root: nearest ancestor of HERE containing pyproject.toml, or HERE
# itself when no such ancestor exists (see find_project_root).
PROJECT_ROOT = find_project_root(HERE)
SRC_DIR = PROJECT_ROOT / "src"
# Make the project's `src/` directory importable; the membership check keeps
# re-running this cell from appending duplicate sys.path entries.
if SRC_DIR.exists() and str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

# Raise scanpy's logging verbosity so progress messages are printed.
sc.settings.verbosity = 3

## Configuration

Set file locations and export options. Update `EXPERIMENT_FILE` to the run you want to export.

In [3]:
# File locations
DATASET_NAME = "suo"
DATA_ROOT = PROJECT_ROOT / "data"
RAW_DIR = DATA_ROOT / "raw"
EXPERIMENT_DIR = DATA_ROOT / "experiments" 
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
EXPORT_DIR = Path("/Users/kemalinecik/git_nosync/_/cellucid/assets/exports") / DATASET_NAME

# Inputs/outputs
# AnnData file read by the "Load UMAP run" step; expected to carry the X_umap_* embeddings in .obsm.
EXPERIMENT_FILE = EXPERIMENT_DIR / Path("suo_developmental_complete_with_3d_umap.h5ad")
# Complete AnnData: source for the optional UMAP recomputation and for the "Load full annotated data" step.
COMPLETE_ADATA_FILE = RAW_DIR / "suo_developmental_complete.h5ad"
# Pickled object assigned to `adata.var` right after the experiment file is loaded.
COMPLETE_ADATA_VAR = RAW_DIR / "dataset_complete_Suo_varnames.pickle"

# Export options
# UMAP_KEY is only consulted as a legacy fallback when none of X_umap_1d/2d/3d is present in .obsm.
UMAP_KEY = "X_umap"            # 3D layout to send to the viewer
# NOTE(review): LATENT_KEY is not read by the export call in this notebook, which passes
# adata.obsm["scanvi"] directly — confirm whether this option should be wired in or removed.
LATENT_KEY = None              # e.g., "X_pca"; None uses adata.X
# The remaining options are forwarded unchanged to export_data_for_web.
VAR_GENE_ID_COLUMN = None
GENE_IDENTIFIERS = None        # e.g., ["Gene1", "Gene2"] or None for all
CENTROID_OUTLIER_Q = 0.90
CENTROID_MIN_POINTS = 10

In [4]:
import numpy as np

# UMAP parameters
# n_neighbors is passed to sc.pp.neighbors when the neighbor graph is (re)computed.
n_neighbors = 15
# min_dist is forwarded to compute_umap_embedding for every dimensionality.
min_dist = 0.5
# Shared random_state for both the neighbor graph and the UMAP runs.
RANDOM_SEED = 0


def compute_umap_embedding(adata_source, n_components: int, min_dist: float, random_state: int) -> np.ndarray:
    """
    Compute a UMAP embedding of the requested dimensionality without mutating the source AnnData.

    A minimal temporary AnnData is assembled from the source's neighbor graph,
    its neighbor metadata and, when available, the representation the graph
    was built on. ``sc.tl.umap`` runs on that temporary object and only the
    resulting coordinate array is returned.

    Parameters
    ----------
    adata_source : AnnData
        Source AnnData with precomputed neighbors: ``.uns['neighbors']`` plus
        ``.obsp['connectivities']`` and ``.obsp['distances']``.
    n_components : int
        Number of UMAP dimensions (1, 2, 3, or 4)
    min_dist : float
        UMAP min_dist parameter
    random_state : int
        Random seed for reproducibility

    Returns
    -------
    np.ndarray
        UMAP embedding of shape (n_cells, n_components)

    Raises
    ------
    KeyError
        If ``.uns['neighbors']`` or either neighbor-graph matrix is missing
        from ``adata_source``.
    """
    # Validate prerequisites up front so a missing neighbor graph produces an
    # actionable message instead of a bare KeyError from deep inside the body.
    if 'neighbors' not in adata_source.uns:
        raise KeyError(
            "adata_source.uns['neighbors'] is missing; run sc.pp.neighbors before computing UMAP"
        )
    missing_graphs = [
        key for key in ('connectivities', 'distances') if key not in adata_source.obsp
    ]
    if missing_graphs:
        raise KeyError(
            f"adata_source.obsp is missing {missing_graphs}; run sc.pp.neighbors before computing UMAP"
        )

    neighbors_meta = adata_source.uns['neighbors']

    # Representation the neighbor graph was built on (needed by UMAP for init)
    use_rep = neighbors_meta.get('params', {}).get('use_rep', None)

    # Minimal AnnData: same obs index, no obs columns, and the neighbor graph.
    adata_temp = ad.AnnData(
        obs=adata_source.obs[[]],  # Empty obs dataframe with correct index
        obsp={
            'connectivities': adata_source.obsp['connectivities'],
            'distances': adata_source.obsp['distances']
        }
    )

    # Carry the representation over only when it is recorded and actually present.
    # NOTE(review): when use_rep is recorded but absent from .obsm it is skipped
    # silently here — confirm how sc.tl.umap behaves in that situation.
    if use_rep is not None and use_rep in adata_source.obsm:
        adata_temp.obsm[use_rep] = adata_source.obsm[use_rep]

    # Shallow copy of the neighbors metadata that scanpy reads during sc.tl.umap.
    adata_temp.uns['neighbors'] = neighbors_meta.copy()

    # All writes performed by scanpy land on the temporary object, not the source.
    sc.tl.umap(adata_temp, n_components=n_components, min_dist=min_dist, random_state=random_state)

    # Return an independent copy of the coordinates; the temporary AnnData is
    # released when this function returns.
    return adata_temp.obsm['X_umap'].copy()

# Set to True to rebuild the neighbor graph and all UMAP embeddings from
# COMPLETE_ADATA_FILE and overwrite EXPERIMENT_FILE. Off by default, in which
# case nothing in this block runs and the saved EXPERIMENT_FILE is reused.
RECOMPUTE_UMAP = False

# Dimensionalities to embed. 4D is reserved for future development — add 4
# here when ready.
UMAP_DIMENSIONS = (1, 2, 3)

if RECOMPUTE_UMAP:
    # Load and subset data
    adata = ad.read_h5ad(COMPLETE_ADATA_FILE)
    adata = adata[adata.obs['LVL0'] == "Haematopoeitic_lineage"].copy()

    # Compute neighbors once (this is shared across all UMAP dimensions)
    print("Computing neighbors...")
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, random_state=RANDOM_SEED, use_rep="scanvi")

    # One embedding per dimensionality, stored under X_umap_<n>d
    for n_dims in UMAP_DIMENSIONS:
        print(f"\nComputing {n_dims}D UMAP...")
        adata.obsm[f'X_umap_{n_dims}d'] = compute_umap_embedding(
            adata, n_components=n_dims, min_dist=min_dist, random_state=RANDOM_SEED
        )

    # Also store default X_umap as 3D for backward compatibility
    adata.obsm['X_umap'] = adata.obsm['X_umap_3d'].copy()

    print("\nUMAP embeddings computed:")
    for n_dims in UMAP_DIMENSIONS:
        obsm_key = f'X_umap_{n_dims}d'
        print(f"  {obsm_key}: {adata.obsm[obsm_key].shape}")

    # Save the experiment file
    adata.write_h5ad(EXPERIMENT_FILE)
    print(f"\nSaved to {EXPERIMENT_FILE}")

    # Release the large AnnData before the following cells load their own copy.
    del adata
    gc.collect()

## Load UMAP run

Load the selected AnnData object and confirm the file exists.

In [5]:
# Fail early with an actionable message when the experiment file is unset or absent.
if EXPERIMENT_FILE is None or not EXPERIMENT_FILE.exists():
    raise FileNotFoundError(
        f"UMAP file not found. Set EXPERIMENT_FILE to a valid .h5ad under {EXPERIMENT_DIR}"
    )

adata = ad.read_h5ad(EXPERIMENT_FILE)
# SECURITY NOTE: unpickling can execute arbitrary code — only load a pickle
# from a trusted source.
adata.var = pd.read_pickle(COMPLETE_ADATA_VAR)

# Check for multi-dimensional UMAP embeddings
umap_keys = {
    '1d': 'X_umap_1d',
    '2d': 'X_umap_2d',
    '3d': 'X_umap_3d',
    # '4d': 'X_umap_4d',  # Reserved for future
}

# Maps a dimension label ('1d', '2d', '3d') to the coordinate array found in .obsm.
available_umaps = {}
for dim, key in umap_keys.items():
    if key in adata.obsm:
        available_umaps[dim] = adata.obsm[key]
        print(f"✓ Found {key}: shape {adata.obsm[key].shape}")
    else:
        print(f"✗ Missing {key}")

if not available_umaps:
    # Fallback to legacy X_umap
    if UMAP_KEY not in adata.obsm:
        raise KeyError("No UMAP embeddings found in adata.obsm")
    print(f"Using legacy {UMAP_KEY}")
    legacy_coords = adata.obsm[UMAP_KEY]
    # File the legacy layout under its actual width so that, for example, a 2D
    # X_umap is not mislabelled as 3D downstream.
    available_umaps[f"{legacy_coords.shape[1]}d"] = legacy_coords

# For backward compatibility, also set umap_coords to 3D (or, failing that,
# the first embedding that is available).
if '3d' in available_umaps:
    umap_coords = available_umaps['3d']
else:
    umap_coords = next(iter(available_umaps.values()))

print(f"\nAvailable dimensions: {list(available_umaps.keys())}")
adata

✓ Found X_umap_1d: shape (561947, 1)
✓ Found X_umap_2d: shape (561947, 2)
✓ Found X_umap_3d: shape (561947, 3)

Available dimensions: ['1d', '2d', '3d']


AnnData object with n_obs × n_vars = 561947 × 8192
    obs: 'sample_ID', 'organ', 'age', 'cell_type', 'sex', 'sex_inferred', 'concatenated_integration_covariates', 'integration_donor', 'integration_biological_unit', 'integration_sample_status', 'integration_library_platform_coarse', 'n_genes', '_scvi_batch', '_scvi_labels', 'LVL3', 'LVL2', 'LVL1', 'LVL0'
    var: 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
    uns: '_scvi_manager_uuid', '_scvi_uuid', 'neighbors', 'rank_genes_groups'
    obsm: 'Unintegrated', 'X_pca', 'X_umap', 'X_umap_1d', 'X_umap_2d', 'X_umap_3d', 'harmony', 'invae', 'scanvi', 'scvi', 'tardis_1', 'tardis_2'
    obsp: 'connectivities', 'distances'

## Quick UMAP stats

Lightweight sanity check on all loaded UMAP embeddings (1D, 2D, 3D).

In [6]:
# Per-axis summary statistics for every available embedding, keyed by
# dimension label.
umap_stats = {
    dim: {
        "shape": coords.shape,
        "mean": coords.mean(axis=0).tolist(),
        "std": coords.std(axis=0).tolist(),
        "min": coords.min(axis=0).tolist(),
        "max": coords.max(axis=0).tolist(),
    }
    for dim, coords in available_umaps.items()
}

# Human-readable digest: shape plus mean/std rounded to three decimals.
print(f"UMAP stats for {adata.n_obs} cells:")
for dim, stats in umap_stats.items():
    print(f"\n{dim.upper()}:")
    print(f"  Shape: {stats['shape']}")
    print(f"  Mean: {[f'{x:.3f}' for x in stats['mean']]}")
    print(f"  Std:  {[f'{x:.3f}' for x in stats['std']]}")

# Last expression of the cell: displays the full-precision statistics.
umap_stats

UMAP stats for 561947 cells:

1D:
  Shape: (561947, 1)
  Mean: ['5.295']
  Std:  ['16.212']

2D:
  Shape: (561947, 2)
  Mean: ['5.165', '1.608']
  Std:  ['7.449', '6.884']

3D:
  Shape: (561947, 3)
  Mean: ['5.214', '1.725', '1.519']
  Std:  ['4.778', '4.356', '4.828']


{'1d': {'shape': (561947, 1),
  'mean': [5.295407295227051],
  'std': [16.212360382080078],
  'min': [-27.436325073242188],
  'max': [38.51380157470703]},
 '2d': {'shape': (561947, 2),
  'mean': [5.165143966674805, 1.6078412532806396],
  'std': [7.448610305786133, 6.88424825668335],
  'min': [-9.85355281829834, -11.497267723083496],
  'max': [17.377145767211914, 17.219484329223633]},
 '3d': {'shape': (561947, 3),
  'mean': [5.213593006134033, 1.7251689434051514, 1.5194395780563354],
  'std': [4.778425216674805, 4.356046676635742, 4.828305721282959],
  'min': [-6.086704730987549, -7.9113993644714355, -6.787397384643555],
  'max': [14.03857707977295, 12.185367584228516, 13.62077808380127]}}

## Load full annotated data

Load the complete AnnData to supply metadata (`obs`) and gene expression matrices aligned to the UMAP run.

In [7]:
# Stop early when the complete AnnData file is not where the configuration says it is.
if not COMPLETE_ADATA_FILE.exists():
    raise FileNotFoundError(f"Complete AnnData file not found at {COMPLETE_ADATA_FILE}")

adata_complete = ad.read_h5ad(COMPLETE_ADATA_FILE)
# adata_complete = adata_complete[:, adata_complete.var["highly_variable"] == 1]
# Keep only the cells of the UMAP run, selected by its obs index; .copy()
# materializes the selection so the in-place steps below operate on real data.
adata_complete = adata_complete[adata.obs.index].copy()

# Normalize counts to 1e4 per cell and log-transform for export
# NOTE(review): the export call later in this notebook passes `adata.obs`, `adata.var`
# and `adata.X`, not `adata_complete` — confirm whether this normalized matrix
# is meant to be the exported gene expression.
sc.pp.normalize_total(adata_complete, target_sum=1e4)
sc.pp.log1p(adata_complete)

# Last expression of the cell: displays the AnnData summary.
adata_complete

normalizing counts per cell
    finished (0:00:00)


AnnData object with n_obs × n_vars = 561947 × 8192
    obs: 'sample_ID', 'organ', 'age', 'cell_type', 'sex', 'sex_inferred', 'concatenated_integration_covariates', 'integration_donor', 'integration_biological_unit', 'integration_sample_status', 'integration_library_platform_coarse', 'n_genes', '_scvi_batch', '_scvi_labels', 'LVL3', 'LVL2', 'LVL1', 'LVL0'
    uns: '_scvi_manager_uuid', '_scvi_uuid', 'rank_genes_groups', 'log1p'
    obsm: 'Unintegrated', 'X_pca', 'harmony', 'invae', 'scanvi', 'scvi', 'tardis_1', 'tardis_2'

In [8]:
# Densify the first cell's expression row for a quick visual check.
# `.toarray()` replaces the `.A` shorthand, which SciPy deprecated and later
# removed for sparse matrices.
adata_complete.X[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

## Export for web viewer

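Passes the `scanvi` latent space (`adata.obsm["scanvi"]`; the `LATENT_KEY` option is not currently consulted) together with the available UMAP embeddings, and writes one `points_<n>d.bin.gz` file per dimension plus the obs/var manifests, connectivity, and dataset identity for the HTML viewer in `index.html`.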

In [9]:
from cellucid.prepare_data import export_data_for_web

In [10]:
# Write all viewer assets for this dataset into EXPORT_DIR.
# NOTE(review): obs/var/gene expression come from `adata` (the UMAP run), not from
# the normalized `adata_complete` prepared earlier — confirm this is intended.
export_data_for_web(
    # Multi-dimensional UMAP embeddings
    # (.get returns None for a dimensionality that was not found in .obsm)
    X_umap_1d=available_umaps.get('1d'),
    X_umap_2d=available_umaps.get('2d'),
    X_umap_3d=available_umaps.get('3d'),
    # X_umap_4d is reserved for future development

    # Other data
    # NOTE(review): the latent space is hard-coded to "scanvi"; the LATENT_KEY
    # configuration option is not consulted here.
    latent_space=adata.obsm["scanvi"],
    obs=adata.obs,
    var=adata.var,
    gene_expression=adata.X,
    connectivities=adata.obsp['connectivities'],
    var_gene_id_column=VAR_GENE_ID_COLUMN,
    gene_identifiers=GENE_IDENTIFIERS,
    centroid_outlier_quantile=CENTROID_OUTLIER_Q,
    centroid_min_points=CENTROID_MIN_POINTS,
    # presumably controls whether existing outputs are overwritten — verify
    # against export_data_for_web
    force=False,
    # Quantization bit widths and compression level, as echoed by the exporter's
    # "Export Settings" report.
    var_quantization=8,
    obs_continuous_quantization=8,
    obs_categorical_dtype="auto",
    compression=6,
    # Dataset info
    out_dir=EXPORT_DIR,
    dataset_name=DATASET_NAME,
    dataset_description="Mapping the developing human immune system across organs",
    source_name="E-MTAB-11341",
    source_url="https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-11341"
)

Export Settings:
  Output directory: /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo
  Compression: gzip level 6
  Var (gene) quantization: 8-bit
  Obs continuous quantization: 8-bit
  Obs categorical dtype: auto
  Available dimensions: [1, 2, 3]
  Default dimension: 3D
  Coordinate normalization (per-dimension, aspect-ratio preserved):
    1D: range 65.95 → [-1, 1]
    2D: range 28.72 → [-1, 1]
    3D: range 20.41 → [-1, 1]
✓ Wrote 1D positions (561,947 cells × 1 dims) to /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/points_1d.bin.gz (gzip)
✓ Wrote 2D positions (561,947 cells × 2 dims) to /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/points_2d.bin.gz (gzip)
✓ Wrote 3D positions (561,947 cells × 3 dims) to /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/points_3d.bin.gz (gzip)
✓ Wrote obs manifest (18 fields: 3 continuous, 15 categorical) to /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/obs_manifest.json with binaries in

Exporting genes: 100%|██████████| 8192/8192 [01:27<00:00, 93.40it/s] 


✓ Wrote var manifest (8192 genes, 8-bit quantized, gzip level 6) to /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/var_manifest.json
  Extracting unique edges from 561,947 cells...
  Found 6,277,388 unique edges, max 170 neighbors/cell
  Sorting edges for optimal compression...
✓ Wrote connectivity (6,277,388 edges, max 170 neighbors/cell, uint32) to /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/connectivity
✓ Wrote dataset identity to /Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/dataset_identity.json


## Validate export artifacts

Spot-check file sizes (MB), manifest stats, and total obs/var directory sizes.

In [11]:
import json
from pathlib import Path

BYTES_IN_MB = 1024 ** 2


def size_mb(path: Path) -> float:
    """Return the size of ``path`` in MB rounded to 3 decimals, or 0 if it does not exist."""
    if not path.exists():
        return 0
    return round(path.stat().st_size / BYTES_IN_MB, 3)


def dir_stats(path: Path) -> dict:
    """Summarize the regular files found recursively under ``path``.

    Returns a dict with ``size_mb`` (combined size in MB, rounded to 3
    decimals) and ``files`` (number of files). Both are 0 when ``path`` does
    not exist.
    """
    if not path.exists():
        return {"size_mb": 0, "files": 0}
    file_sizes = [entry.stat().st_size for entry in path.rglob("*") if entry.is_file()]
    return {
        "size_mb": round(sum(file_sizes) / BYTES_IN_MB, 3),
        "files": len(file_sizes),
    }

# Multi-dimensional point files
# (the legacy single-file name is listed too, so its absence is reported explicitly)
points_files = {
    'points_1d': EXPORT_DIR / "points_1d.bin.gz",
    'points_2d': EXPORT_DIR / "points_2d.bin.gz",
    'points_3d': EXPORT_DIR / "points_3d.bin.gz",
    'points (legacy)': EXPORT_DIR / "points.bin.gz",
}

# Locations of the manifests and per-field binary directories inside the export folder.
obs_manifest_path = EXPORT_DIR / "obs_manifest.json"
var_manifest_path = EXPORT_DIR / "var_manifest.json"
dataset_identity_path = EXPORT_DIR / "dataset_identity.json"
obs_dir = EXPORT_DIR / "obs"
var_dir = EXPORT_DIR / "var"

# Parse each JSON file when present; a missing file is represented as None.
obs_manifest = json.loads(obs_manifest_path.read_text()) if obs_manifest_path.exists() else None
var_manifest = json.loads(var_manifest_path.read_text()) if var_manifest_path.exists() else None
dataset_identity = json.loads(dataset_identity_path.read_text()) if dataset_identity_path.exists() else None

# Check which point files exist
point_sizes = {}
for name, path in points_files.items():
    if path.exists():
        point_sizes[name] = size_mb(path)
        print(f"✓ {name}: {point_sizes[name]} MB")
    else:
        print(f"✗ {name}: not found")

# Show embeddings metadata
if dataset_identity and 'embeddings' in dataset_identity:
    embeddings_meta = dataset_identity['embeddings']
    print(f"\nEmbeddings metadata:")
    print(f"  Available dimensions: {embeddings_meta.get('available_dimensions')}")
    print(f"  Default dimension: {embeddings_meta.get('default_dimension')}D")

# Last expression of the cell: a summary dict displayed as the cell output.
{
    "paths": {
        "export_dir": EXPORT_DIR,
        "obs_manifest": obs_manifest_path,
        "var_manifest": var_manifest_path,
        "dataset_identity": dataset_identity_path,
        "obs_dir": obs_dir,
        "var_dir": var_dir,
    },
    "sizes_mb": {
        # Only point files that exist on disk are included here.
        **point_sizes,
        "obs_manifest": size_mb(obs_manifest_path),
        "var_manifest": size_mb(var_manifest_path),
        "dataset_identity": size_mb(dataset_identity_path),
    },
    "dir_sizes_mb": {
        "obs": dir_stats(obs_dir),
        "var": dir_stats(var_dir),
    },
    "manifest_stats": {
        # NOTE(review): the recorded run displays 'fields': 0 for obs although the
        # export log reports 18 obs fields — the manifest may not expose a
        # top-level "fields" list; confirm the manifest schema.
        "obs": None if obs_manifest is None else {
            "n_points": obs_manifest.get("n_points"),
            "fields": len(obs_manifest.get("fields", [])),
            "centroid_outlier_quantile": obs_manifest.get("centroid_outlier_quantile"),
        },
        "var": None if var_manifest is None else {
            "n_points": var_manifest.get("n_points"),
            "fields": len(var_manifest.get("fields", [])),
            "var_gene_id_column": var_manifest.get("var_gene_id_column"),
        },
        "embeddings": None if dataset_identity is None else dataset_identity.get("embeddings"),
    },
}

✓ points_1d: 1.925 MB
✓ points_2d: 3.893 MB
✓ points_3d: 5.899 MB
✗ points (legacy): not found

Embeddings metadata:
  Available dimensions: [1, 2, 3]
  Default dimension: 3D


{'paths': {'export_dir': PosixPath('/Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo'),
  'obs_manifest': PosixPath('/Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/obs_manifest.json'),
  'var_manifest': PosixPath('/Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/var_manifest.json'),
  'dataset_identity': PosixPath('/Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/dataset_identity.json'),
  'obs_dir': PosixPath('/Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/obs'),
  'var_dir': PosixPath('/Users/kemalinecik/git_nosync/_/cellucid/assets/exports/suo/var')},
 'sizes_mb': {'points_1d': 1.925,
  'points_2d': 3.893,
  'points_3d': 5.899,
  'obs_manifest': 0.088,
  'var_manifest': 0.264,
  'dataset_identity': 0.002},
 'dir_sizes_mb': {'obs': {'size_mb': 9.421, 'files': 33},
  'var': {'size_mb': 70.238, 'files': 8192}},
 'manifest_stats': {'obs': {'n_points': 561947,
   'fields': 0,
   'centroid_outlier_quantile': 0.9},
  'var': {'n_poi

Done. Serve `index.html` from the repo root to view the exported data.