## YOLO26 Nano Training for MIAP Person Detection

This notebook performs the following steps:
1.  **Setup**: Installs necessary libraries and connects to Google Drive and Google Cloud for data access.
2.  **Data Preparation**: 
    - Downloads annotations for the MIAP dataset (More Inclusive Annotations for People, the person subset of Google's Open Images dataset) from a GCS bucket.
    - Downloads the corresponding images from the public Open Images dataset bucket.
    - Drops bounding boxes whose width or height is below `MIN_BOX_PIXEL_SIZE` (7 pixels, measured after scaling the image to the training resolution) and discards images left with no boxes.
    - Converts the dataset into the YOLO format required by Ultralytics.
3.  **Training**: Trains a `yolo26n` (nano) model on the prepared dataset.
4.  **Export**: 
    - Exports the trained model to ONNX format, including preprocessing and NMS post-processing.
    - Creates a Float32 version.
    - Creates an INT8 quantized version using static calibration.
5.  **Save & Verify**: Saves the final models to Google Drive and runs a quick verification with ONNX Runtime.

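To keep the Colab session from idling out, simulate interaction: open the browser's developer console (Cmd+Option+I on macOS) and paste the following snippet: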
```js
/**
 * Simulates user activity in the Colab tab by clicking the connect button,
 * or the comments toolbar button when the connect button cannot be found.
 */
function KeepAlive() {
    console.log("Interaction simulated");

    // The real button sits inside the custom element's shadow root.
    const connectHost = document.querySelector("#top-toolbar > colab-connect-button");
    const connectButton = connectHost?.shadowRoot?.querySelector("#connect");

    if (!connectButton) {
        // Fallback: click the comments toolbar button instead.
        document.querySelector("colab-toolbar-button#comments")?.click();
        return;
    }
    connectButton.click();
}

// Fire once every 60000 ms (one minute).
setInterval(KeepAlive, 60000);
```
Alternatively, use this version, which waits a random 1–3 minutes between simulated clicks:
```js
/**
 * Simulates user activity in the Colab tab, then re-schedules itself after a
 * random delay between one and three minutes.
 */
function KeepAlive() {
    // Delay bounds in milliseconds (1 minute .. 3 minutes, inclusive).
    const minDelayMs = 1 * 60 * 1000;
    const maxDelayMs = 3 * 60 * 1000;
    const spanMs = maxDelayMs - minDelayMs + 1;
    const randomDelay = minDelayMs + Math.floor(Math.random() * spanMs);

    console.log(`Interaction simulated. Next check in: ${Math.round(randomDelay/1000)}s`);

    // Prefer the connect button inside the shadow DOM; otherwise fall back to
    // the comments toolbar button so some activity is still registered.
    const connectButton = document.querySelector("#top-toolbar > colab-connect-button")
        ?.shadowRoot?.querySelector("#connect");
    const clickTarget = connectButton ?? document.querySelector("colab-toolbar-button#comments");
    clickTarget?.click();

    // Queue the next run using the delay drawn above.
    setTimeout(KeepAlive, randomDelay);
}

// Kick off the first cycle immediately.
KeepAlive();
```
On macOS, also run `caffeinate` in a terminal so the machine does not go to sleep while the notebook is running.

In [None]:
# %pip install ultralytics onnx onnxruntime onnxsim pandas gcsfs tqdm -q
!pip install uv

# Use uv pip install with the --system flag to install into the current environment
!uv pip install ultralytics onnx onnxruntime onnxsim pandas gcsfs tqdm --system

### 1. Setup and Authentication

Mount Google Drive to save the final models and authenticate with Google Cloud to access the dataset annotations.

In [None]:
from google.colab import drive, auth
import os

# Mount Google Drive at /content/drive so files written under it persist
# outside the Colab runtime.
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Authenticate this runtime's user with Google Cloud (needed before reading
# the annotation CSV from GCS further down).
print("Authenticating with Google Cloud...")
auth.authenticate_user()

# Define a directory in your Google Drive to save the models.
# GDRIVE_SAVE_DIR is reused by the training, export and save cells below.
GDRIVE_SAVE_DIR = '/content/drive/MyDrive/miap_yolov26_models'
os.makedirs(GDRIVE_SAVE_DIR, exist_ok=True)
print(f"Models will be saved to: {GDRIVE_SAVE_DIR}")

In [None]:
import onnx
import numpy as np
from onnx import helper, numpy_helper, TensorProto
import onnxruntime as ort
from onnxruntime.quantization import quantize_static, CalibrationDataReader, QuantFormat, QuantType, CalibrationMethod

def embed_uint8_preprocess_into_onnx(in_onnx, out_onnx, imgsz, input_scale=1.0/255.0):
    """
    Prepend a uint8 -> float32 preprocessing stage to an ONNX model.

    The model's first graph input is replaced by a new input named
    ``images_uint8`` of shape ``[1, 3, Height, Width]`` (dynamic H/W). Three
    nodes are inserted ahead of the existing graph: Cast to float32, Mul by
    ``input_scale``, and a linear Resize to ``[1, 3, imgsz, imgsz]`` whose
    output takes over the name of the original input.

    Args:
        in_onnx: Path of the ONNX model to read.
        out_onnx: Path the modified model is written to.
        imgsz: Square spatial size the Resize node produces.
        input_scale: Factor applied to the cast pixel values.

    Returns:
        ``out_onnx``, after the model has passed ``onnx.checker`` and been saved.
    """
    model = onnx.load(in_onnx)
    graph = model.graph

    # Tensor names: the Resize output reuses the original input name so the
    # untouched downstream nodes keep consuming the same tensor.
    float_input_name = graph.input[0].name
    uint8_input_name = "images_uint8"
    cast_tensor = f"{float_input_name}__cast_f32"
    scaled_tensor = f"{float_input_name}__scaled"

    # Constants consumed by the preprocessing nodes. ROI and scales are empty
    # tensors because the Resize target is given through "sizes".
    graph.initializer.extend([
        numpy_helper.from_array(np.array([input_scale], dtype=np.float32), name="Preprocess_Scale_Val"),
        numpy_helper.from_array(np.array([], dtype=np.float32), name="Preprocess_ROI"),
        numpy_helper.from_array(np.array([], dtype=np.float32), name="Preprocess_Scales"),
        numpy_helper.from_array(np.array([1, 3, imgsz, imgsz], dtype=np.int64), name="Preprocess_Sizes"),
    ])

    preprocess_nodes = [
        helper.make_node(
            "Cast",
            inputs=[uint8_input_name],
            outputs=[cast_tensor],
            to=TensorProto.FLOAT,
            name="Preprocess_Cast",
        ),
        helper.make_node(
            "Mul",
            inputs=[cast_tensor, "Preprocess_Scale_Val"],
            outputs=[scaled_tensor],
            name="Preprocess_Scale",
        ),
        helper.make_node(
            "Resize",
            inputs=[scaled_tensor, "Preprocess_ROI", "Preprocess_Scales", "Preprocess_Sizes"],
            outputs=[float_input_name],
            mode="linear",
            name="Preprocess_Resize",
        ),
    ]

    # Swap the float input for the uint8 one; symbolic H/W accept any image size.
    uint8_input = helper.make_tensor_value_info(
        uint8_input_name, TensorProto.UINT8, [1, 3, "Height", "Width"]
    )
    del graph.input[0]
    graph.input.insert(0, uint8_input)

    # Rebuild the node list with the preprocessing stage in front.
    model_nodes = list(graph.node)
    del graph.node[:]
    graph.node.extend(preprocess_nodes)
    graph.node.extend(model_nodes)

    onnx.checker.check_model(model)
    onnx.save(model, out_onnx)
    print(f"Preprocessed ONNX saved to {out_onnx}")
    return out_onnx

def finalize_onnx_for_deployment(onnx_path, imgsz):
    """
    Rewrite the ONNX model at ``onnx_path`` in place so it emits one ``detections`` tensor.

    Steps applied when an output of static shape ``[1, N, 6]`` is found
    (the layout expected from an export with integrated NMS):

    1. Reshape it to ``[N, 6]`` and split into boxes ``[N, 4]``, score ``[N, 1]``
       and class id ``[N, 1]``.
    2. Multiply the boxes by ``[W/imgsz, H/imgsz, W/imgsz, H/imgsz]`` where H and W
       are dims 2 and 3 of the first graph input at run time, i.e. map the
       coordinates back to the resolution of the image that was fed in.
    3. Append a float32 ``batch_idx`` column that is always 0.
    4. Concatenate to ``[N, 7]``: ``[x1, y1, x2, y2, score, class_id, batch_idx]``
       and make that the only graph output.

    When no such output exists, a warning is printed and the existing outputs
    are only renamed to ``detections``, ``detections_1``, ...

    Args:
        onnx_path: Path of the model; it is loaded from and saved back to this path.
        imgsz: Spatial size the detector runs at, used as the rescale divisor.

    Returns:
        None.

    Raises:
        onnx.checker.ValidationError: If the rewritten graph fails the ONNX checker.
    """
    model = onnx.load(onnx_path)
    graph = model.graph
    input_name = graph.input[0].name

    # Look for an output with static shape [1, N, 6]. ``dim_value`` reads as 0
    # for symbolic or unset dimensions, so no exception handling is needed.
    target_out_name = None
    for out in graph.output:
        dims = [d.dim_value for d in out.type.tensor_type.shape.dim]
        # Ultralytics with nms=True usually gives [1, 300, 6] or similar
        if len(dims) == 3 and dims[0] == 1 and dims[2] == 6:
            target_out_name = out.name
            break

    if not target_out_name:
        print(f"Warning: Could not find output with shape [1, N, 6] in {onnx_path}. Is nms=True used?")
        # Fallback: expose the existing outputs under the guideline names.
        # Renaming only the graph.output entry would leave it pointing at a
        # tensor no node produces, so route each one through an Identity node.
        for i, out in enumerate(graph.output):
            new_name = "detections" if i == 0 else f"detections_{i}"
            if out.name == new_name:
                continue
            graph.node.append(helper.make_node(
                "Identity",
                inputs=[out.name],
                outputs=[new_name],
                name=f"Guideline_RenameOutput_{i}",
            ))
            out.name = new_name
        onnx.checker.check_model(model)
        onnx.save(model, onnx_path)
        return

    # GUIDELINE IMPLEMENTATION:

    # 1. Reshape [1, N, 6] -> [N, 6]
    n6_name = "Guideline_Reshaped_N6"
    n6_shape_const = "Guideline_N6_Shape_Const"
    graph.initializer.append(numpy_helper.from_array(np.array([-1, 6], dtype=np.int64), name=n6_shape_const))
    graph.node.append(helper.make_node("Reshape", inputs=[target_out_name, n6_shape_const], outputs=[n6_name], name="Guideline_ReshapeN6"))

    # 2. Split [N, 6] into [N, 4], [N, 1], [N, 1]
    # The split sizes are passed as an input tensor, which is the opset 13+ form of Split.
    b_raw, s_raw, c_raw = "Guideline_B_Raw", "Guideline_S_Raw", "Guideline_C_Raw"
    split_const = "Guideline_Split_Const"
    graph.initializer.append(numpy_helper.from_array(np.array([4, 1, 1], dtype=np.int64), name=split_const))
    graph.node.append(helper.make_node("Split", inputs=[n6_name, split_const], outputs=[b_raw, s_raw, c_raw], axis=1, name="Guideline_SplitN6"))

    # 3. Generate batch_idx [N, 1] (all zeros, Float32), shaped like the score column
    s_shape = "Guideline_S_Shape"
    b_idx_2d = "Guideline_BatchIdx_2d"
    graph.node.append(helper.make_node("Shape", inputs=[s_raw], outputs=[s_shape], name="Guideline_GetSShape"))
    graph.node.append(helper.make_node(
        "ConstantOfShape",
        inputs=[s_shape],
        outputs=[b_idx_2d],
        value=helper.make_tensor("val", TensorProto.FLOAT, [1], [0.0]),
        name="Guideline_CreateBatchIdx"
    ))

    # 4. Rescale boxes [N, 4] by comparing input resolution to training imgsz
    b_scaled = "Guideline_B_Rescaled"
    in_shape = "Guideline_InputShape"
    graph.node.append(helper.make_node("Shape", inputs=[input_name], outputs=[in_shape], name="Guideline_GetInShape"))

    # Scalar indices 2 and 3 pick H and W out of the [N, C, H, W] input shape.
    h_idx = "Guideline_HIdx"
    w_idx = "Guideline_WIdx"
    graph.initializer.extend([
        numpy_helper.from_array(np.array(2, dtype=np.int64), name=h_idx),
        numpy_helper.from_array(np.array(3, dtype=np.int64), name=w_idx),
    ])
    h_val, w_val = "Guideline_HVal", "Guideline_WVal"
    graph.node.append(helper.make_node("Gather", inputs=[in_shape, h_idx], outputs=[h_val], axis=0, name="Guideline_GatherH"))
    graph.node.append(helper.make_node("Gather", inputs=[in_shape, w_idx], outputs=[w_val], axis=0, name="Guideline_GatherW"))

    h_f32, w_f32 = "Guideline_Hf32", "Guideline_Wf32"
    graph.node.append(helper.make_node("Cast", inputs=[h_val], outputs=[h_f32], to=TensorProto.FLOAT, name="Guideline_CastH"))
    graph.node.append(helper.make_node("Cast", inputs=[w_val], outputs=[w_f32], to=TensorProto.FLOAT, name="Guideline_CastW"))

    # The divisor has shape [1], so each quotient broadcasts from a scalar to
    # shape [1] and the four of them can be concatenated into a [4] vector.
    imgsz_name = "Guideline_TrainImgsz"
    graph.initializer.append(numpy_helper.from_array(np.array([float(imgsz)], dtype=np.float32), name=imgsz_name))
    h_scale, w_scale = "Guideline_HScale", "Guideline_WScale"
    graph.node.append(helper.make_node("Div", inputs=[h_f32, imgsz_name], outputs=[h_scale], name="Guideline_DivH"))
    graph.node.append(helper.make_node("Div", inputs=[w_f32, imgsz_name], outputs=[w_scale], name="Guideline_DivW"))

    scales = "Guideline_BoxScales"
    # Boxes are [x1, y1, x2, y2]. Scale by [W, H, W, H]
    graph.node.append(helper.make_node("Concat", inputs=[w_scale, h_scale, w_scale, h_scale], outputs=[scales], axis=0, name="Guideline_ConcatScales"))
    graph.node.append(helper.make_node("Mul", inputs=[b_raw, scales], outputs=[b_scaled], name="Guideline_RescaleBoxes"))

    # 5. Final Concat [N, 7]
    # Format: [x1, y1, x2, y2, score, class_id, batch_idx]
    final_output_name = "detections"
    graph.node.append(helper.make_node(
        "Concat",
        inputs=[b_scaled, s_raw, c_raw, b_idx_2d],
        outputs=[final_output_name],
        axis=1,
        name="Guideline_FinalConcat"
    ))

    # Update outputs to be just the consolidated detection tensor
    del graph.output[:]
    graph.output.append(helper.make_tensor_value_info(final_output_name, TensorProto.FLOAT, ["N", 7]))

    onnx.checker.check_model(model)
    onnx.save(model, onnx_path)
    print(f"Guideline-aligned ONNX saved to {onnx_path}")


### 2. Data Preparation

Download the `vertex_miap_import.csv` file, which contains GCS paths to the images and their corresponding bounding box annotations.

In [None]:
import pandas as pd
import os
import gcsfs
from PIL import Image
from tqdm.notebook import tqdm
import numpy as np
import random

# CONFIGURATION
GCS_BUCKET = 'colin-miap-madness'
CSV_FILENAME = 'vertex_miap_import.csv'
GCS_CSV_PATH = f'gs://{GCS_BUCKET}/{CSV_FILENAME}'
IMAGE_SIZE = 320  # square size used for box filtering here and as imgsz for training/export
MIN_BOX_PIXEL_SIZE = 7  # boxes narrower or shorter than this (after scaling to IMAGE_SIZE) are dropped
VAL_SPLIT = 0.07  # 7% for validation
n_samples = 30000  # processing stops once this many training images have been kept

# Initialize GCS FileSystem
fs = gcsfs.GCSFileSystem()

# Local dataset layout: images/{train,val}, labels/{train,val}, plus a
# download cache that images are moved out of once they are accepted.
DATASET_ROOT = '/content/datasets/miap_single_class'
CACHE_DIR = os.path.join(DATASET_ROOT, 'image_cache')
for d in [os.path.join(DATASET_ROOT, 'images', 'train'), 
          os.path.join(DATASET_ROOT, 'images', 'val'),
          os.path.join(DATASET_ROOT, 'labels', 'train'), 
          os.path.join(DATASET_ROOT, 'labels', 'val'),
          CACHE_DIR]:
    os.makedirs(d, exist_ok=True)

print('Reading annotations CSV from GCS...')
# The CSV has no header row; c1..c4 are columns that the code below never reads.
col_names = ['ml_use', 'gcs_path', 'label', 'x_min', 'y_min', 'c1', 'c2', 'x_max', 'y_max', 'c3', 'c4']
df = pd.read_csv(GCS_CSV_PATH, header=None, names=col_names)
print(f'Found {len(df)} annotations.')

# One (gcs_path, rows) pair per image, shuffled with a fixed seed so the
# image order is the same on every run.
grouped = list(df.groupby('gcs_path'))
random.seed(42)
random.shuffle(grouped)

# --- Native Batch Download Optimization ---
# Prefetch 1.5x n_samples images so that filtering still leaves enough of them.
prefetch_count = int(n_samples * 1.5)
unique_gcs_paths = [path for path, _ in grouped[:prefetch_count]]
print(f"Preparing batch download for {len(unique_gcs_paths)} images...")

# Pair every not-yet-cached remote object with its destination in CACHE_DIR.
rpaths, lpaths = [], []
for remote_path in unique_gcs_paths:
    cache_target = os.path.join(CACHE_DIR, remote_path.split('/')[-1])
    if os.path.exists(cache_target):
        continue
    rpaths.append(remote_path)
    lpaths.append(cache_target)

if rpaths:
    print(f"Downloading {len(rpaths)} new images using GCS batch mode...")
    # Passing parallel lists lets gcsfs fetch the whole set in one call
    # instead of one request per Python-level iteration.
    fs.get(rpaths, lpaths)
    print("Batch download complete.")

# --- Processing Loop ---
# Walks the shuffled images, writes YOLO label files for boxes that are large
# enough, and moves accepted images from the cache into images/{train,val}.
images_processed = {'train': 0, 'val': 0}
images_dropped = 0

print(f'Processing and filtering images (min_box={MIN_BOX_PIXEL_SIZE}px)...')
for gcs_path, group in tqdm(grouped):
    # Every int(1/VAL_SPLIT)-th kept image goes to validation. The counter only
    # advances when an image is kept, so dropped images do not skew the ratio.
    split = 'val' if (images_processed['train'] + images_processed['val']) % int(1/VAL_SPLIT) == 0 else 'train'

    image_id = gcs_path.split('/')[-1]
    cached_path = os.path.join(CACHE_DIR, image_id)

    if not os.path.exists(cached_path):
        # Fallback for any outliers not in the original batch.
        # Best effort: skip the image on failure, but catch Exception rather
        # than everything so Ctrl-C / kernel interrupt still stops the loop.
        try:
            fs.get(gcs_path, cached_path)
        except Exception:
            continue

    local_image_path = os.path.join(DATASET_ROOT, 'images', split, image_id)
    # Swap only the final extension for .txt; a plain str.replace('.jpg', ...)
    # leaves other extensions untouched and rewrites '.jpg' anywhere in the name.
    label_name = os.path.splitext(image_id)[0] + '.txt'
    local_label_path = os.path.join(DATASET_ROOT, 'labels', split, label_name)

    try:
        with Image.open(cached_path) as img:
            img_w, img_h = img.size
    except Exception:
        # Unreadable image: discard the cached file and move on.
        if os.path.exists(cached_path):
            os.remove(cached_path)
        continue

    yolo_labels = []
    # Factor that fits the image inside IMAGE_SIZE x IMAGE_SIZE while keeping
    # its aspect ratio; box sizes are judged at that scale.
    scale = min(IMAGE_SIZE / img_w, IMAGE_SIZE / img_h)

    for _, row in group.iterrows():
        # Coordinates are multiplied by the image size below, i.e. treated as
        # fractions of the image width/height.
        x_min, y_min, x_max, y_max = row['x_min'], row['y_min'], row['x_max'], row['y_max']
        bw_px = (x_max - x_min) * img_w * scale
        bh_px = (y_max - y_min) * img_h * scale
        if bw_px < MIN_BOX_PIXEL_SIZE or bh_px < MIN_BOX_PIXEL_SIZE:
            continue

        # YOLO label line: class 0, box centre, box width/height.
        cx, cy = (x_min + x_max) / 2.0, (y_min + y_max) / 2.0
        w, h = x_max - x_min, y_max - y_min
        yolo_labels.append(f'0 {cx:.6f} {cy:.6f} {w:.6f} {h:.6f}')

    if yolo_labels:
        if not os.path.exists(local_image_path):
            os.rename(cached_path, local_image_path)
        with open(local_label_path, 'w') as f:
            f.write('\n'.join(yolo_labels))
        images_processed[split] += 1
    else:
        images_dropped += 1
        if os.path.exists(cached_path):
            os.remove(cached_path)

    if images_processed["train"] >= n_samples:
        break

print('\n--- Data Preparation Summary ---')
print(f"Training images: {images_processed['train']}")
print(f"Validation images: {images_processed['val']}")
print(f'Images dropped (no valid boxes): {images_dropped}')

#### Create Dataset YAML File

In [None]:
import yaml
# Describe the dataset layout (root, split folders, class names) in a YAML file.
dataset_yaml_path = os.path.join(DATASET_ROOT, 'data.yaml')
yaml_content = dict(
    path=os.path.abspath(DATASET_ROOT),
    train='images/train',
    val='images/val',
    names={0: 'person'},
)
with open(dataset_yaml_path, 'w') as f:
    yaml.dump(yaml_content, f)
print(f'Dataset YAML created at: {dataset_yaml_path}')


### 3. Model Training

In [None]:
from ultralytics import YOLO
import shutil
import os

MODEL_VARIANT = 'yolo26n.pt'

# A last.pt on Drive means an earlier run was interrupted: resume from it.
# Otherwise start from the stock MODEL_VARIANT weights.
last_ckpt_drive = os.path.join(GDRIVE_SAVE_DIR, 'last.pt')
resume_arg = os.path.exists(last_ckpt_drive)
if resume_arg:
    print(f"Found existing checkpoint on Drive. Resuming from: {last_ckpt_drive}")
model = YOLO(last_ckpt_drive if resume_arg else MODEL_VARIANT)

# --- NEW: Callback to sync weights to Drive during training ---
def on_train_epoch_end(trainer):
    """Copy whichever of best.pt / last.pt exist in the run's weights folder to Google Drive.

    Args:
        trainer: Object exposing ``save_dir``; checkpoints are looked up under
            ``<save_dir>/weights``.

    NOTE(review): this copies whatever is on disk when the callback fires, so the
    files may be from the previous epoch if the trainer writes checkpoints after
    this event - confirm against the trainer's callback order.
    """
    weights_dir = os.path.join(trainer.save_dir, 'weights')
    # best.pt is what gets exported later; last.pt is what a resumed run starts from.
    for ckpt_name in ('best.pt', 'last.pt'):
        local_ckpt = os.path.join(weights_dir, ckpt_name)
        if os.path.exists(local_ckpt):
            shutil.copy2(local_ckpt, os.path.join(GDRIVE_SAVE_DIR, ckpt_name))

# Register the callback before starting training
model.add_callback("on_train_epoch_end", on_train_epoch_end)
# --------------------------------------------------------------

results = model.train(
    data=dataset_yaml_path, 
    imgsz=IMAGE_SIZE, 
    epochs=30, 
    batch=32, 
    name='miap_person_detector',
    project='runs',
    resume=resume_arg
)
print('\nTraining complete!')

# The per-epoch callback runs before Ultralytics writes that epoch's
# checkpoint, so the copy on Drive can be one epoch behind and never includes
# the weights written at the very end of training. Copy the final best.pt
# explicitly, because the export step loads it from Drive.
# last.pt is deliberately left alone: it is only used to resume an interrupted
# run, and a finished checkpoint is presumably not resumable - TODO confirm.
best_pt_drive = os.path.join(GDRIVE_SAVE_DIR, 'best.pt')
finished_trainer = getattr(model, 'trainer', None)
if finished_trainer is not None:
    final_best_local = os.path.join(finished_trainer.save_dir, 'weights', 'best.pt')
    if os.path.exists(final_best_local):
        shutil.copy2(final_best_local, best_pt_drive)
print(f'Best weights available on Drive: {best_pt_drive}')

### 4. Export to ONNX

In [None]:
import shutil
from ultralytics import YOLO

# Load the best weights from Google Drive for Export/Quantization
best_weights_path = os.path.join(GDRIVE_SAVE_DIR, 'best.pt')
print(f"Loading weights from Drive: {best_weights_path}")
model = YOLO(best_weights_path)

print('\n1. Exporting Raw FP32 ONNX model with integrated NMS...')
# nms=True adds the NMS node. opset=17 is recommended by guidelines.
# max_det=100 limits the number of detections to reduce overhead.
# export() returns the path of the written .onnx file, which the next steps build on.
fp32_raw_path = model.export(format='onnx', imgsz=IMAGE_SIZE, opset=17, simplify=True, nms=True, max_det=100)

print('\n2. Embedding Preprocessing (uint8 -> float32 -> resize) into model...')
# Written to a sibling file with a "_pre_u8" suffix; the raw export is left as is.
fp32_pre_path = fp32_raw_path.replace('.onnx', '_pre_u8.onnx')
embed_uint8_preprocess_into_onnx(fp32_raw_path, fp32_pre_path, IMAGE_SIZE)

print('\n3. Finalizing ONNX for deployment (aligning with GUIDELINES)...')
# This function rescales coordinates to uint8 input size and consolidates to [N, 7]
# It rewrites fp32_pre_path in place, so that path now holds the final FP32 model.
finalize_onnx_for_deployment(fp32_pre_path, IMAGE_SIZE)
print(f'FP32 ONNX (guideline aligned) saved to: {fp32_pre_path}')


### 5. Advanced Static Quantization (Improved)

YOLO models can be sensitive to static quantization. We use ONNX Runtime's advanced quantization tools directly, using **Entropy (KL Divergence)** calibration and a larger calibration set.

In [None]:
import glob

class MIAPCalibrationDataReader(CalibrationDataReader):
    """Feeds resized uint8 training images to ONNX Runtime's static quantizer.

    Each call to ``get_next`` yields one ``{input_name: array}`` feed where the
    array has shape ``[1, 3, imgsz, imgsz]`` and dtype uint8, until the selected
    images are exhausted.
    """

    def __init__(self, image_dir, imgsz, max_images=1000, model_path=None):
        """
        Args:
            image_dir: Directory searched (non-recursively) for ``*.jpg`` files.
            imgsz: Square size every calibration image is resized to.
            max_images: Upper bound on the number of images used, taken after
                shuffling the file list.
            model_path: ONNX model whose first input name keys the feeds.
                Defaults to the notebook-level ``fp32_pre_path``.
        """
        self.image_paths = glob.glob(os.path.join(image_dir, '*.jpg'))
        random.shuffle(self.image_paths)
        self.image_paths = self.image_paths[:max_images]
        self.imgsz = imgsz
        self.index = 0

        # Get input name from model
        if model_path is None:
            model_path = fp32_pre_path
        session = ort.InferenceSession(model_path, providers=['CPUExecutionProvider'])
        self.input_name = session.get_inputs()[0].name

    def get_next(self):
        """Return the next calibration feed, or None once all images were served."""
        if self.index >= len(self.image_paths):
            return None

        # Load and resize to exact imgsz in uint8 to match the new uint8 input.
        # The context manager closes the file handle right after decoding.
        with Image.open(self.image_paths[self.index]) as src:
            img = src.convert('RGB').resize((self.imgsz, self.imgsz), Image.BILINEAR)
        # HWC -> CHW, then add the leading batch dimension.
        input_data = np.array(img).transpose(2, 0, 1)[None, ...].astype(np.uint8)

        self.index += 1
        return {self.input_name: input_data}

print('Starting Static Quantization...')
# Calibrate on a representative subset of the training data (e.g. 1000-2000 images is usually plenty)
# but we can go higher if desired. 2000 is a good balance.
dr = MIAPCalibrationDataReader(os.path.join(DATASET_ROOT, 'images', 'train'), IMAGE_SIZE, max_images=2000)

# The INT8 model is written next to the FP32 one with an "_int8" suffix.
int8_path = fp32_pre_path.replace('.onnx', '_int8.onnx')

# QDQ format with unsigned 8-bit activations and signed 8-bit, per-channel weights.
quantize_static(
    model_input=fp32_pre_path, 
    model_output=int8_path, 
    calibration_data_reader=dr, 
    quant_format=QuantFormat.QDQ,
    activation_type=QuantType.QUInt8, 
    weight_type=QuantType.QInt8, 
    per_channel=True, 
    reduce_range=False, # Often better for accuracy on non-Intel hardware
    calibrate_method=CalibrationMethod.Entropy # KL Divergence - better for fine gradients
)

print(f'\nINT8 ONNX model saved to: {int8_path}')


### 6. Save & Inspect ONNX Models

In [None]:
import os
import shutil
import numpy as np
import onnxruntime as ort

def inspect_onnx_model(model_path):
    """Print a model's inputs/outputs and run it once on random data.

    The dummy input has shape ``[1, 3, S, S]`` where S is the notebook-level
    ``IMAGE_SIZE`` when defined, else 320. Its dtype follows the model's first
    input: uint8 if the reported type mentions "uint8", float32 otherwise.

    Args:
        model_path: Path of the ONNX model to load with ONNX Runtime.
    """
    print(f'\n--- Inspecting: {os.path.basename(model_path)} ---')
    # Using CPU provider for inspection
    sess = ort.InferenceSession(model_path, providers=['CPUExecutionProvider'])
    input_nodes = sess.get_inputs()
    print(f'Inputs: {[(n.name, n.shape, n.type) for n in input_nodes]}')
    print(f'Outputs: {[(n.name, n.shape, n.type) for n in sess.get_outputs()]}')

    # Use global IMAGE_SIZE or default to 320 if not found
    img_sz = globals().get('IMAGE_SIZE', 320)

    # Test with dummy input - use case-insensitive check for uint8
    if "uint8" in input_nodes[0].type.lower():
        # randint's upper bound is exclusive: 256 is needed to cover 0..255.
        dummy_input = np.random.randint(0, 256, size=(1, 3, img_sz, img_sz), dtype=np.uint8)
    else:
        dummy_input = np.random.rand(1, 3, img_sz, img_sz).astype(np.float32)

    outputs = sess.run(None, {input_nodes[0].name: dummy_input})
    print(f'Output shapes: {[o.shape for o in outputs]}')

# Define final paths in Drive
final_fp32 = os.path.join(GDRIVE_SAVE_DIR, os.path.basename(fp32_pre_path))
final_int8 = os.path.join(GDRIVE_SAVE_DIR, os.path.basename(int8_path))

# The exported files can already live in GDRIVE_SAVE_DIR (the export step
# derives their paths from weights loaded from that folder). In that case
# source and destination are the same file and shutil.copy2 raises
# SameFileError, so treat it as "already saved" instead of failing the cell.
for precision, exported_path, drive_path in (
    ("FP32", fp32_pre_path, final_fp32),
    ("INT8", int8_path, final_int8),
):
    print(f"Saving {precision} ONNX model to: {drive_path}")
    try:
        shutil.copy2(exported_path, drive_path)
    except shutil.SameFileError:
        print(f"{precision} ONNX model is already at its final location; nothing to copy.")

print(f'\nAll models saved to Drive: {GDRIVE_SAVE_DIR}')
inspect_onnx_model(final_fp32)
inspect_onnx_model(final_int8)
