# Notebook F — 06_train_model.ipynb (Train Classifier)

This notebook loads the `training_data.npz` created in step 05, trains a Machine Learning model (Random Forest), and exports it to ONNX format for use in the pipeline.

## Output
- `models/model.onnx` (inside the `voxel_engine` folder on Google Drive): The trained model, exported in ONNX format.

In [None]:
# Cell F0 — Install dependencies
import sys
import subprocess

# Run pip through the interpreter backing this kernel (sys.executable) so the
# packages land in the same environment the cells below import from.
# check_call raises CalledProcessError if pip exits with a non-zero status.
subprocess.check_call([
    sys.executable, '-m', 'pip', 'install',
    'numpy', 'scipy', 'scikit-learn', 'onnxruntime', 'skl2onnx',
    'onnxconverter-common',  # was misspelled 'onxconverter-common', which pip cannot resolve
])

In [None]:
# Cell F1 — Load Training Data
from google.colab import drive
import os
import numpy as np

# Mount Google Drive so the project folder is reachable under /content/drive.
drive.mount('/content/drive')

# Project layout on Drive: training data is read from TRAIN_DIR, the exported
# model is written to MODEL_DIR (created here if it does not exist yet).
BASE = "/content/drive/MyDrive/voxel_engine"
TRAIN_DIR = os.path.join(BASE, "training_data")
MODEL_DIR = os.path.join(BASE, "models")
os.makedirs(MODEL_DIR, exist_ok=True)

data_path = os.path.join(TRAIN_DIR, "training_data.npz")

# Fail early with an actionable message when the previous notebook has not been run.
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Training data not found at {data_path}. Please run 05_create_training_data.ipynb first.")

# np.load on an .npz returns a lazy archive object that holds an open file
# handle; read both arrays inside the context manager so the handle is closed
# as soon as they are in memory.
with np.load(data_path) as data:
    X = data['X']  # feature array; X.shape[1] is used later as the feature count
    y = data['y']  # class labels passed to the classifier

print(f"Loaded training data: {X.shape} samples")
print(f"Classes: {np.unique(y)}")

In [None]:
# Cell F2 — Train Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree forest with tree depth capped at 20, using every available
# core (n_jobs=-1) and a fixed seed.
print("Training Random Forest...")
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    n_jobs=-1,
    random_state=42,
)
clf.fit(X_train, y_train)

# Score the held-out split: overall accuracy first, then the per-class report.
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")
print(classification_report(y_test, y_pred))

In [None]:
# Cell F3 — Export to ONNX
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

print("Converting to ONNX...")

# Describe the model input: one float tensor named 'float_input' of shape
# [None, n_features], with the feature count taken from the loaded data.
n_features = X.shape[1]
input_spec = FloatTensorType([None, n_features])
initial_type = [('float_input', input_spec)]

# Convert the fitted classifier from Cell F2 into an ONNX model.
onnx_model = convert_sklearn(clf, initial_types=initial_type)

# Serialize the converted model into the models directory.
output_path = os.path.join(MODEL_DIR, "model.onnx")
serialized_model = onnx_model.SerializeToString()
with open(output_path, "wb") as model_file:
    model_file.write(serialized_model)

print(f"Model saved to {output_path}")