# Notebook E — 05_create_training_data.ipynb (Prepare Dataset)

This notebook loads annotated point clouds (or labeled folders) and generates a training dataset (features + labels) for the classifier.

## Concepts
- **Features**: Geometric and color features extracted from points (XYZ, Normals, Colors, Height).
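- **Labels**: Ground-truth class for each point, either read from the classification field of a `.las`/`.laz` file or assigned from the name of the folder that contains the file (folder names must match the keys of `CLASS_MAP`).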

In [None]:
# Cell E0 — Install dependencies
!pip install numpy open3d trimesh laspy scipy onnxruntime numba py-vox-io

In [None]:
# Cell E1 — Mount Drive & Setup
# Mounts Google Drive, defines the project directories, puts the source
# checkouts on sys.path, and imports (then reloads) the project modules.
from google.colab import drive
import importlib
import os
import sys

# Imported outside the try-block below so that a missing numpy is not
# misreported as a missing project checkout.
import numpy as np

drive.mount('/content/drive')

BASE = "/content/drive/MyDrive/voxel_engine"
SRC_REPO = "/content/spec-kit/src"
SRC_LEGACY = "/content/voxel_engine_src"

INPUT_DIR = os.path.join(BASE, "input")
TRAIN_DIR = os.path.join(BASE, "training_data")

os.makedirs(TRAIN_DIR, exist_ok=True)

# Ensure src is in path (repo checkout first, then the legacy location)
for _src_dir in (SRC_REPO, SRC_LEGACY):
    if os.path.exists(_src_dir) and _src_dir not in sys.path:
        sys.path.append(_src_dir)
        print(f"Added {_src_dir} to sys.path")

# Reload modules to ensure we have latest version.
# importlib.reload replaces imp.reload: the imp module was removed in
# Python 3.12, and importing it would itself raise an ImportError here.
try:
    import loader
    import classify
    import utils

    for _module in (loader, classify, utils):
        importlib.reload(_module)
except ImportError:
    print("Error importing modules. Make sure you have cloned the repo or run previous notebooks.")
    print("Try running: !git clone https://github.com/yamatotakeru616/spec-kit.git /content/spec-kit")
    # Bare raise keeps the original traceback intact.
    raise

print("Ready to process data in:", INPUT_DIR)

In [None]:
# Cell E2 — Define Feature Extraction Helper

def process_file(path, label_override=None):
    """Load one point-cloud file and return ``(features, labels)`` for it.

    Args:
        path: Path of the point-cloud file to load.
        label_override: If not None, every point of the file is given this
            label (stored as ``uint8``) and any labels in the file are
            ignored. If None, labels are taken from the file when the loader
            provides them.

    Returns:
        A ``(features, labels)`` tuple:

        * ``(None, None)`` if the file could not be loaded.
        * ``(features, None)`` if no usable labels are available: none in
          the file and no override, or a label count that does not match
          the number of feature rows.
        * ``(features, labels)`` otherwise.

        ``features`` is whatever ``classify.compute_features`` returns
        (presumably one row per point; callers stack the results with
        ``np.vstack`` - confirm against ``classify``).
    """
    print(f"Processing {os.path.basename(path)}...")

    # Prefer the label-aware loader; older loader versions only return
    # points and colors, in which case labels can only come from
    # label_override (folder mode).
    try:
        if hasattr(loader, 'load_annotated_pointcloud'):
            pts, cols, file_labels = loader.load_annotated_pointcloud(path)
        else:
            # Fallback if old loader
            pts, cols = loader.load_pointcloud(path)
            file_labels = None
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the
        # whole dataset build, so report it and let the caller skip it.
        print(f"Failed to load {path}: {e}")
        return None, None

    # Compute features
    # compute_features(points, colors=None, normals=None)
    feats = classify.compute_features(pts, colors=cols)

    # Determine labels: an explicit override wins over labels from the file.
    if label_override is not None:
        # Create full array of this label
        target_labels = np.full(pts.shape[0], label_override, dtype=np.uint8)
    elif file_labels is not None:
        target_labels = file_labels
    else:
        print(f"Warning: No labels found for {path} and no override provided. Skipping labels.")
        return feats, None

    # Features and labels are later concatenated independently across files,
    # so a per-file length mismatch would silently misalign the dataset.
    if feats is not None and len(target_labels) != len(feats):
        print(
            f"Warning: {len(target_labels)} labels but {len(feats)} feature rows "
            f"for {path}. Skipping labels."
        )
        return feats, None

    return feats, target_labels

In [None]:
# Cell E3 — Process Dataset (Option A: Folders as Classes)
# Each class lives in a sub-folder of input/ whose name is exactly a key of
# CLASS_MAP; every point of every file in that folder gets the mapped ID.
# Structure:
# input/
#   ground/
#     file1.laz
#   building/
#     file2.ply

all_feats = []
all_labels = []

# Define your class mapping here if using folders
CLASS_MAP = {
    "ground": 1,
    "vegetation": 2,
    "building": 3,
    "vehicle": 4,
}

scan_mode = "folder"  # or "file"

if scan_mode == "folder":
    for folder_name, label_id in CLASS_MAP.items():
        folder_path = os.path.join(INPUT_DIR, folder_name)
        if not os.path.isdir(folder_path):
            # Report the skip so a misnamed folder is not silently ignored.
            print(f"Skipping class '{folder_name}': no folder at {folder_path}")
            continue

        # os.listdir returns entries in arbitrary order; sort so the row
        # order of the resulting dataset is reproducible between runs.
        files = sorted(
            f for f in os.listdir(folder_path)
            if f.lower().endswith(('.las', '.laz', '.ply', '.obj'))
        )
        print(f"Found {len(files)} files for class '{folder_name}' (ID: {label_id})")

        for f in files:
            fp = os.path.join(folder_path, f)
            f_feats, f_lbls = process_file(fp, label_override=label_id)
            # Keep a file only when both features and labels are available.
            if f_feats is not None and f_lbls is not None:
                all_feats.append(f_feats)
                all_labels.append(f_lbls)

# Cell E4 — Process Dataset (Option B: Single file with embedded labels)
# If using labeled LAS/LAZ files placed directly in input/
elif scan_mode == "file":
    if os.path.isdir(INPUT_DIR):
        files = sorted(
            f for f in os.listdir(INPUT_DIR)
            if f.lower().endswith(('.las', '.laz'))
        )
    else:
        # Only TRAIN_DIR is created during setup, so INPUT_DIR may be absent.
        print(f"Input directory not found: {INPUT_DIR}")
        files = []

    for f in files:
        fp = os.path.join(INPUT_DIR, f)
        f_feats, f_lbls = process_file(fp)
        if f_feats is not None and f_lbls is not None:
            all_feats.append(f_feats)
            all_labels.append(f_lbls)

else:
    print(f"Unknown scan_mode {scan_mode!r}; expected 'folder' or 'file'.")

# Merge the per-file arrays into a single feature matrix X and label vector y.
if all_feats:
    X = np.vstack(all_feats)
    y = np.concatenate(all_labels)
    print(f"\nTotal Training Data: {X.shape[0]} points")
    print(f"Features: {X.shape[1]}")
    print(f"Classes: {np.unique(y)}")
else:
    print("No training data found.")

In [None]:
# Cell E5 — Save Training Data
# Writes X and y into one compressed .npz archive inside TRAIN_DIR.
# Nothing is written when no file contributed features.
if all_feats:
    save_path = os.path.join(TRAIN_DIR, "training_data.npz")
    np.savez_compressed(
        save_path,
        X=X,
        y=y,
    )
    print(f"Saved dataset to {save_path}")

    # Optional: Save a small sample for verification
    # sample_idx = np.random.choice(len(X), min(10000, len(X)), replace=False)
    # np.savez(os.path.join(TRAIN_DIR, "sample_data.npz"), X=X[sample_idx], y=y[sample_idx])