# QC: Synthetic Data Generator Validation

This notebook validates the synthetic data generator (`starfinder.testdata`) by checking:
1. Dataset structure and file existence
2. Two-base encoding correctness
3. Spot positions and visualization
4. Channel assignments
5. 3D visualization with napari

In [None]:
# Setup - imports
import sys
sys.path.insert(0, "../src/python")

import json
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from starfinder.testdata import get_preset_config
from starfinder.testdata.synthetic import (
    encode_barcode_to_colors,
    TEST_CODEBOOK,
    COLOR_TO_CHANNEL,
)
from starfinder.io import load_multipage_tiff, load_image_stacks

# Mini dataset location; the path is relative, so it resolves against the
# current working directory of the notebook kernel.
MINI_DATASET_PATH = Path("fixtures/synthetic/mini")

# Report the absolute location and whether it is present before any check runs.
print(
    f"Dataset path: {MINI_DATASET_PATH.absolute()}",
    f"Dataset exists: {MINI_DATASET_PATH.exists()}",
    sep="\n",
)

## 1. Verify Dataset Structure

Check that all expected files and directories exist in the mini dataset.

In [None]:
# Check all files/dirs exist
def check_dataset_structure(dataset_path: Path):
    """Report which expected dataset entries are present on disk.

    The expected layout is derived from the "mini" preset: two top-level
    files, then one directory per FOV, one directory per round inside each
    FOV, and one TIFF per channel inside each round.

    Args:
        dataset_path: Root directory of the dataset to inspect.

    Returns:
        True when every expected file and directory exists, else False.
        A PASS/FAIL line per entry and an overall verdict are printed.
    """
    layout = get_preset_config("mini")

    # Relative names in report order; a trailing "/" marks a directory entry.
    expected_names = ["codebook.csv", "ground_truth.json"]
    for fov_number in range(1, layout.n_fovs + 1):
        fov_id = f"FOV_{fov_number:03d}"
        expected_names.append(f"{fov_id}/")
        for round_idx in range(1, layout.n_rounds + 1):
            expected_names.append(f"{fov_id}/round{round_idx}/")
            expected_names.extend(
                f"{fov_id}/round{round_idx}/ch{ch:02d}.tif"
                for ch in range(layout.n_channels)
            )

    # Resolve each name against the dataset root and record whether it exists.
    checks = [(name, (dataset_path / name).exists()) for name in expected_names]

    separator = "=" * 50
    print("Dataset Structure Check:")
    print(separator)
    for name, exists in checks:
        status = "PASS" if exists else "FAIL"
        print(f"  [{status}] {name}")
    print(separator)

    all_passed = all(exists for _, exists in checks)
    print(f"Overall: {'ALL CHECKS PASSED' if all_passed else 'SOME CHECKS FAILED'}")
    return all_passed


check_dataset_structure(MINI_DATASET_PATH)

## 2. Validate Two-Base Encoding

Verify that the two-base encoding works correctly for all genes in TEST_CODEBOOK.

In [None]:
# Print encoding for all genes in TEST_CODEBOOK, verify CACGC -> 4422
print("Two-Base Encoding Validation:")
print("=" * 60)
print(f"{'Gene':<10} {'Barcode':<10} {'Reversed':<10} {'Color Seq':<12} {'Expected'}")
print("-" * 60)

# Known expected encoding for CACGC
# CACGC reversed = CGCAC
# CG -> 4, GC -> 4, CA -> 2, AC -> 2 => "4422"
expected_encodings = {
    "CACGC": "4422",  # Explicitly stated in task
}

# Becomes False as soon as any barcode with a known expectation encodes wrongly.
all_valid = True
for gene, barcode in TEST_CODEBOOK:
    color_seq = encode_barcode_to_colors(barcode)
    reversed_barcode = barcode[::-1]

    # "--" marks barcodes that have no reference encoding to compare against.
    expected = expected_encodings.get(barcode, "--")
    if expected == "--":
        match = ""
    elif color_seq == expected:
        match = "MATCH"
    else:
        match = "MISMATCH"
        all_valid = False

    print(f"{gene:<10} {barcode:<10} {reversed_barcode:<10} {color_seq:<12} {expected:<8} {match}")

# Encode the reference barcode once and reuse it for both summary lines.
cacgc_colors = encode_barcode_to_colors("CACGC")

print("=" * 60)
print(f"\nVerification: CACGC -> {cacgc_colors} (expected: 4422)")
# The verdict covers the direct CACGC check (which holds even if CACGC is not
# in TEST_CODEBOOK) and every mismatch flagged in the table above.
print(f"Result: {'PASS' if all_valid and cacgc_colors == '4422' else 'FAIL'}")

## 3. Verify Spot Positions

Load ground_truth.json and overlay spots on max projection.

In [None]:
# Load ground_truth.json, overlay spots on max projection using matplotlib

# Load ground truth
with open(MINI_DATASET_PATH / "ground_truth.json", encoding="utf-8") as f:
    ground_truth = json.load(f)

print(f"Ground truth version: {ground_truth['version']}")
print(f"Image shape (Z, Y, X): {ground_truth['image_shape']}")
print(f"Number of rounds: {ground_truth['n_rounds']}")
print(f"Number of channels: {ground_truth['n_channels']}")
print(f"Number of FOVs: {len(ground_truth['fovs'])}")

# Get FOV_001 data
fov_data = ground_truth["fovs"]["FOV_001"]
spots = fov_data["spots"]
print(f"\nNumber of spots in FOV_001: {len(spots)}")

# Load round1 images and create max projection.
# The channel count is read from the ground truth rather than hard-coded, so
# the projection follows the dataset if its channel count changes.
round1_dir = MINI_DATASET_PATH / "FOV_001" / "round1"
max_proj = None
for ch in range(ground_truth["n_channels"]):
    img = load_multipage_tiff(round1_dir / f"ch{ch:02d}.tif", convert_uint8=False)
    ch_max = img.max(axis=0)  # collapse the first (Z) axis
    if max_proj is None:
        max_proj = ch_max.astype(np.float32)
    else:
        # Pixel-wise maximum across the channels seen so far
        max_proj = np.maximum(max_proj, ch_max)

# Plot with spot overlays
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.imshow(max_proj, cmap="gray")
ax.set_title("FOV_001 Round1 Max Projection with Ground Truth Spots")

# Overlay spots: one tab10 color per codebook gene, cycling if there are more
# genes than colors.
colors = plt.cm.tab10.colors
gene_to_color = {gene: colors[i % len(colors)] for i, (gene, _) in enumerate(TEST_CODEBOOK)}

if spots:
    # Positions are (z, y, x); the 2D overlay only needs y and x. All spots
    # are drawn with a single scatter call instead of one call per spot.
    spot_ys = [spot["position"][1] for spot in spots]
    spot_xs = [spot["position"][2] for spot in spots]
    spot_face_colors = [gene_to_color[spot["gene"]] for spot in spots]
    ax.scatter(
        spot_xs,
        spot_ys,
        c=spot_face_colors,
        s=50,
        marker="o",
        alpha=0.7,
        edgecolors="white",
        linewidths=0.5,
    )

# Add legend: empty scatters act as legend handles, one per codebook gene
for gene, color in gene_to_color.items():
    ax.scatter([], [], c=[color], label=gene, s=50)
ax.legend(loc="upper right", title="Genes")

ax.set_xlabel("X (pixels)")
ax.set_ylabel("Y (pixels)")
plt.tight_layout()
plt.show()

# Print first few spots
print("\nFirst 5 spots:")
for spot in spots[:5]:
    print(f"  {spot['gene']}: pos={spot['position']}, barcode={spot['barcode']}, color_seq={spot['color_seq']}")

## 4. Verify Spots Appear in Correct Channels

Check that, in round 1, each spot is brightest in the channel given by the first color of its `color_seq`.

In [None]:
# Check that spots appear in expected channels based on color_seq

def verify_spot_in_channel(img_stack, z, y, x, expected_intensity_above_bg=50):
    """Measure the peak intensity around a spot and compare it to background.

    The peak is the maximum over a window of +/-1 along the first axis and
    +/-2 along the second and third axes around ``(z, y, x)``, clipped to the
    array bounds. Background is the average of the means of the four 10x10
    corner patches, each taken over the full first axis.

    Args:
        img_stack: Array indexed as (Z, Y, X).
        z: Integer spot coordinate along the first axis.
        y: Integer spot coordinate along the second axis.
        x: Integer spot coordinate along the third axis.
        expected_intensity_above_bg: Margin by which the peak must exceed the
            background estimate to count as a detected spot.

    Returns:
        Tuple ``(max_intensity, is_above_bg, bg_estimate)``.

    Raises:
        ValueError: If the window around ``(z, y, x)`` lies entirely outside
            the volume, so there is nothing to measure.
    """
    # Window half-widths along (Z, Y, X)
    half_widths = (1, 2, 2)

    window = []
    for center, half, extent in zip((z, y, x), half_widths, img_stack.shape):
        start = max(0, center - half)
        # Clamp the stop at 0 as well: a negative stop would be interpreted by
        # NumPy as "count from the end" and silently select an unrelated region.
        stop = max(0, min(extent, center + half + 1))
        window.append(slice(start, stop))

    local_region = img_stack[tuple(window)]
    if local_region.size == 0:
        raise ValueError(
            f"Spot position ({z}, {y}, {x}) lies outside the image volume "
            f"of shape {img_stack.shape}"
        )
    max_intensity = local_region.max()

    # Estimate background from image corners
    bg_estimate = np.mean([
        img_stack[:, :10, :10].mean(),
        img_stack[:, :10, -10:].mean(),
        img_stack[:, -10:, :10].mean(),
        img_stack[:, -10:, -10:].mean(),
    ])

    is_above_bg = max_intensity > (bg_estimate + expected_intensity_above_bg)
    return max_intensity, is_above_bg, bg_estimate

# Load all round1 channels
round1_dir = MINI_DATASET_PATH / "FOV_001" / "round1"
channel_stacks = []
for ch in range(4):
    img = load_multipage_tiff(round1_dir / f"ch{ch:02d}.tif", convert_uint8=False)
    channel_stacks.append(img)

print("Spot Channel Verification (Round 1):")
print("=" * 80)
print(f"{'Spot':<6} {'Gene':<8} {'Position':<15} {'Expected Ch':<12} {'Intensities (ch0-3)':<30} {'Pass'}")
print("-" * 80)

n_passed = 0
n_total = len(spots)

for i, spot in enumerate(spots):
    z, y, x = spot["position"]
    gene = spot["gene"]
    color_seq = spot["color_seq"]

    # Round 1 corresponds to first color in sequence (index 0)
    expected_color = color_seq[0]
    expected_channel = COLOR_TO_CHANNEL[expected_color]

    # Peak intensity around the spot in every loaded channel; only the first
    # element of the helper's result is needed for the comparison.
    intensities = [
        verify_spot_in_channel(stack, z, y, x)[0] for stack in channel_stacks
    ]

    # Check if expected channel has highest intensity
    max_ch = np.argmax(intensities)
    passed = max_ch == expected_channel
    if passed:
        n_passed += 1

    pos_str = f"({z},{y},{x})"
    int_str = ", ".join([f"{int(v):3d}" for v in intensities])
    status = "PASS" if passed else f"FAIL (got ch{max_ch})"

    print(f"{i:<6} {gene:<8} {pos_str:<15} ch{expected_channel:<11} [{int_str}]  {status}")

print("=" * 80)
if n_total:
    print(f"Summary: {n_passed}/{n_total} spots appear in correct channel ({100*n_passed/n_total:.1f}%)")
else:
    # No spots in this FOV: skip the percentage, which would divide by zero.
    print("Summary: no spots to verify")

## 5. napari 3D Visualization with Spots

Visualize the volume in 3D with spot markers using napari.

In [None]:
# napari with viewer.add_points() for spots
import napari

# Read the four round1 channel files into one stack; the channel ends up on
# the last axis (Z, Y, X, C).
round1_dir = MINI_DATASET_PATH / "FOV_001" / "round1"
channel_order = [f"ch{i:02d}" for i in range(4)]
volume = load_image_stacks(round1_dir, channel_order=channel_order)
print(f"Loaded volume shape (Z, Y, X, C): {volume.shape}")

# Per-spot gene names and (Z, Y, X) coordinates, in ground-truth order
spot_genes = [spot["gene"] for spot in spots]
spot_positions = np.array([spot["position"] for spot in spots])

# Fixed display color per gene; genes missing from the table are drawn white
unique_genes = list(set(spot_genes))
gene_colors = {
    "GeneA": "red",
    "GeneB": "green",
    "GeneC": "blue",
    "GeneD": "yellow",
    "GeneE": "magenta",
    "GeneF": "cyan",
    "GeneG": "orange",
    "GeneH": "pink",
}
spot_colors = [gene_colors.get(g, "white") for g in spot_genes]

# Open the viewer window
viewer = napari.Viewer()

# One additive image layer per channel so the channels overlay each other
channel_names = ["Ch0 (Color 1)", "Ch1 (Color 2)", "Ch2 (Color 3)", "Ch3 (Color 4)"]
channel_colormaps = ["red", "green", "blue", "magenta"]
for ch, (layer_name, layer_cmap) in enumerate(zip(channel_names, channel_colormaps)):
    viewer.add_image(
        volume[:, :, :, ch],
        name=layer_name,
        colormap=layer_cmap,
        blending="additive",
        visible=True,
    )

# Ground-truth spot markers, colored by gene
spot_marker_style = {
    "name": "Ground Truth Spots",
    "size": 8,
    "face_color": spot_colors,
    "edge_color": "white",
    "edge_width": 0.5,
    "symbol": "disc",
}
viewer.add_points(spot_positions, **spot_marker_style)

# Gene name labels (optional, can be slow for many spots): a second points
# layer with zero-size markers that only carries the text.
gene_label_text = {
    "string": spot_genes,
    "size": 8,
    "color": "white",
    "anchor": "upper_left",
}
viewer.add_points(
    spot_positions,
    name="Gene Labels",
    size=0,
    text=gene_label_text,
)

print("\nnapari viewer opened with:")
print(f"  - 4 channel image layers")
print(f"  - {len(spots)} ground truth spot markers")
print("\nUse the layer controls to toggle visibility and adjust contrast.")

## Summary Checklist

| Check | Description | Status |
|-------|-------------|--------|
| 1 | Dataset structure (all files/dirs exist) | Run cell 2 |
| 2 | Two-base encoding (CACGC -> 4422) | Run cell 3 |
| 3 | Spot positions overlay on max projection | Run cell 4 |
| 4 | Spots appear in correct channels | Run cell 5 |
| 5 | 3D visualization in napari | Run cell 6 |

### Expected Results

- **Structure**: All 16 TIFF files (4 rounds x 4 channels) should exist for FOV_001
- **Encoding**: `CACGC` should encode to `4422` using the two-base color-space encoding
- **Spots**: 20 spots should be visible in the max projection, colored by gene
- **Channels**: In round 1, each spot should have its highest intensity in the channel mapped from the first color of its color_seq
- **3D View**: napari should show all 4 channels with spot markers at correct Z positions