# 02 - Kaggle Training: Place Recognition on NCLT

Train a place recognition model on the NCLT dataset. The notebook uses MinkLoc3D when MinkowskiEngine is installed and falls back to a PointNet backbone otherwise.

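**Requirements**: Attach the following Kaggle datasets to this notebook (the first is required; the second is optional — without it, sensor features are disabled):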
1. [NCLT Preprocessed](https://www.kaggle.com/datasets/creatorofuniverses/nclt-iprofi-hack-23) — LiDAR point clouds, images, poses
2. [NCLT Sensors Addon](https://www.kaggle.com/datasets/YOUR_USERNAME/nclt-sensors-addon) — IMU, GPS, odometry, gyro, ground truth

## 1. Setup

In [None]:
# Clone the project repo (adjust URL)
# !git clone https://github.com/YOUR_USERNAME/nclt-slam-project.git
# %cd nclt-slam-project

# Install dependencies
!pip install -q torch torchvision open3d pyyaml tqdm scipy scikit-learn matplotlib
# MinkowskiEngine requires special installation:
# !pip install -q MinkowskiEngine -f https://nvidia.com/MinkowskiEngine/cu118/torch2.0.0/index.html

import sys
sys.path.insert(0, '.')

## 2. Link Kaggle Data

In [None]:
import os
from pathlib import Path

# --- Primary dataset: LiDAR point clouds + poses ---
# Resolve where the preprocessed NCLT data lives and expose it as `data_path`.
# On Kaggle the attached dataset is also symlinked to ./data/NCLT_preprocessed
# so code that expects the local layout keeps working.
KAGGLE_DATA = Path('/kaggle/input/nclt-iprofi-hack-23/NCLT_preprocessed')
LOCAL_DATA = Path('./data/NCLT_preprocessed')

if KAGGLE_DATA.exists():
    print(f'Running on Kaggle. LiDAR data at: {KAGGLE_DATA}')
    LOCAL_DATA.parent.mkdir(parents=True, exist_ok=True)
    # Path.exists() follows symlinks, so a dangling link left over from an
    # earlier session reports False and os.symlink would then fail with
    # FileExistsError. Drop the stale link before re-creating it.
    if LOCAL_DATA.is_symlink() and not LOCAL_DATA.exists():
        LOCAL_DATA.unlink()
    if not LOCAL_DATA.exists():
        os.symlink(str(KAGGLE_DATA), str(LOCAL_DATA))
    data_path = KAGGLE_DATA
elif LOCAL_DATA.exists():
    print(f'Running locally. LiDAR data at: {LOCAL_DATA}')
    data_path = LOCAL_DATA
else:
    # The LiDAR data is mandatory: stop here rather than fail later with a
    # less helpful error.
    raise FileNotFoundError(
        'Dataset not found. On Kaggle, attach the NCLT dataset. '
        'Locally, run: python scripts/download_nclt_sample.py'
    )

# --- Sensors addon dataset: IMU, GPS, odometry, gyro, ground truth ---
# Optional dataset: `sensors_path` stays None when it is not available, and
# later cells check for None before using sensor data.
KAGGLE_SENSORS = Path('/kaggle/input/nclt-sensors-addon')
LOCAL_SENSORS = Path('./data/nclt_sensors_addon')

sensors_path = None
if KAGGLE_SENSORS.exists():
    print(f'Sensors addon found at: {KAGGLE_SENSORS}')
    LOCAL_SENSORS.parent.mkdir(parents=True, exist_ok=True)
    # Path.exists() follows symlinks, so a dangling link left over from an
    # earlier session reports False and os.symlink would then fail with
    # FileExistsError. Drop the stale link before re-creating it.
    if LOCAL_SENSORS.is_symlink() and not LOCAL_SENSORS.exists():
        LOCAL_SENSORS.unlink()
    if not LOCAL_SENSORS.exists():
        os.symlink(str(KAGGLE_SENSORS), str(LOCAL_SENSORS))
    sensors_path = KAGGLE_SENSORS
elif LOCAL_SENSORS.exists():
    print(f'Local sensors data at: {LOCAL_SENSORS}')
    sensors_path = LOCAL_SENSORS
else:
    print('Sensors addon not found - sensor features will be disabled.')
    print('Attach nclt-sensors-addon on Kaggle or download locally.')

# Session folders sit directly under data_path (there is no sessions/ subdir).
# Collect the sub-directories first, then keep those whose name starts with '20'.
lidar_dirs = [entry for entry in data_path.iterdir() if entry.is_dir()]
sessions = sorted(
    entry.name for entry in lidar_dirs if entry.name.startswith('20')
)
print(f'\nLiDAR sessions ({len(sessions)}): {sessions}')

# The sensors addon is optional; list its non-hidden session folders only
# when it was found.
if sensors_path is not None:
    sensor_sessions = sorted(
        entry.name
        for entry in sensors_path.iterdir()
        if entry.is_dir() and not entry.name.startswith('.')
    )
    print(f'Sensor sessions ({len(sensor_sessions)}): {sensor_sessions}')

## 3. Verify Data

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from src.utils.point_cloud import load_velodyne_bin
from src.utils.io_utils import load_config

# Load the dataset configuration.
config = load_config('configs/dataset_config.yaml')

# Use the first discovered session as the sample, or a default session name
# when no session directories were listed.
if sessions:
    sample_session = sessions[0]
else:
    sample_session = '2012-01-08'
velodyne_dir = data_path / sample_session / 'velodyne_data'

# Handle the "nothing to show" cases first; the happy path comes last.
if not velodyne_dir.exists():
    print(f'Velodyne directory not found: {velodyne_dir}')
else:
    bin_files = sorted(velodyne_dir.glob('*.bin'))
    if not bin_files:
        print('No .bin files found')
    else:
        # Load the first scan; columns 0/1/2 are reported as X/Y/Z below.
        sample_pc = load_velodyne_bin(bin_files[0])
        print(f'Sample point cloud: {sample_pc.shape}')
        print(f'X range: [{sample_pc[:,0].min():.1f}, {sample_pc[:,0].max():.1f}]')
        print(f'Y range: [{sample_pc[:,1].min():.1f}, {sample_pc[:,1].max():.1f}]')
        print(f'Z range: [{sample_pc[:,2].min():.1f}, {sample_pc[:,2].max():.1f}]')

        # Top-down scatter of the scan, coloured by the third column.
        fig, ax = plt.subplots(figsize=(10, 10))
        ax.scatter(
            sample_pc[:, 0],
            sample_pc[:, 1],
            c=sample_pc[:, 2],
            cmap='viridis',
            s=0.1,
        )
        ax.set_title(f'Sample Point Cloud - {sample_session}')
        ax.set_xlabel('X (m)')
        ax.set_ylabel('Y (m)')
        ax.set_aspect('equal')
        plt.show()

## 3b. Sensor Data Overview (Optional)

If the sensors addon dataset is attached, preview IMU and GPS data.

In [None]:
# Preview which sensor streams exist for the sample session. Runs only when
# the optional sensors addon was located (sensors_path is not None).
if sensors_path is not None:
    from src.datasets.sensor_loader import SessionSensorManager
    from src.utils.io_utils import load_config as _lc

    # Sensor settings are read from the `nclt.sensors` section of the dataset
    # config; missing sections fall back to an empty dict.
    _cfg = _lc('configs/dataset_config.yaml')
    _sensor_cfg = _cfg.get('nclt', {}).get('sensors', {})

    session_name = sample_session
    session_sensor_dir = sensors_path / session_name

    if session_sensor_dir.exists():
        manager = SessionSensorManager(session_sensor_dir, _sensor_cfg)

        # Each attribute is None when that sensor stream is unavailable.
        print(f'Session: {session_name}')
        print(f'  IMU:          {manager.imu is not None}')
        print(f'  GPS:          {manager.gps is not None}')
        print(f'  Odometry:     {manager.odometry is not None}')
        print(f'  KVH gyro:     {manager.kvh is not None}')
        print(f'  Ground truth: {manager.ground_truth is not None}')

        # Quick IMU preview
        if manager.imu is not None:
            imu_data = manager.imu.load()
            # An empty recording has no first/last timestamp; indexing it
            # would raise IndexError, so report a zero duration instead.
            if len(imu_data.timestamps) > 0:
                # Dividing by 1e6 presumably converts microseconds to
                # seconds - TODO confirm the timestamp unit.
                duration_s = (imu_data.timestamps[-1] - imu_data.timestamps[0]) / 1e6
            else:
                duration_s = 0.0
            print(f'\n  IMU: {len(imu_data)} samples, {duration_s:.1f} s')

        # Quick GPS preview
        if manager.gps is not None:
            gps_data = manager.gps.load()
            print(f'  GPS: {len(gps_data)} readings')
    else:
        print(f'No sensor data for session {session_name}')
else:
    print('Sensors addon not available - skipping sensor overview.')

## 4. Load Training Config

In [None]:
# Read the training YAML and keep its 'training' section at hand; later cells
# pull hyper-parameters from it with training.get(...).
train_config = load_config('configs/train_config.yaml')
training = train_config['training']

# Echo every setting, one per line.
print('Training configuration:')
for key in training:
    value = training[key]
    print(f'  {key}: {value}')

## 5. Training Loop

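Set up the place recognition model, the triplet loss, and the optimizer. The training loop itself is still a placeholder (see the TODO cell at the end of this section).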

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Seed PyTorch's random number generators before anything draws from them
# (e.g. when the model is constructed), so Restart & Run All is repeatable.
# `seed` is an optional key in the training config; 42 is used when absent.
seed = training.get('seed', 42)
torch.manual_seed(seed)
print(f'Random seed: {seed}')

# Prefer the GPU when one is visible to PyTorch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')

# Choose model based on MinkowskiEngine availability
try:
    import MinkowskiEngine as ME
    model_type = 'minkloc3d'
    print('MinkowskiEngine available - using MinkLoc3D')
except ImportError:
    model_type = 'pointnet'
    print('MinkowskiEngine not available - falling back to PointNet')

from src.models.place_recognition import PlaceRecognitionWrapper, TripletLoss

# Hyper-parameters come from the training config, with defaults for any
# key that is missing.
feature_dim = training.get('feature_dim', 256)
model = PlaceRecognitionWrapper(
    model_type=model_type,
    feature_dim=feature_dim,
).to(device)

loss_fn = TripletLoss(margin=training.get('margin', 0.2))
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=training.get('learning_rate', 1e-3),
    weight_decay=training.get('weight_decay', 1e-4),
)

# Summarise the model that was built.
print(f'Model: {model_type}, feature_dim={feature_dim}')
total_params = sum(p.numel() for p in model.parameters())
print(f'Total parameters: {total_params:,}')

In [None]:
# TODO: Implement actual training loop
# This is a placeholder - fill in after data loading is verified.
#
# The commented sketch below relies on names created by earlier cells:
# `config`, `training`, `model`, `loss_fn`, `optimizer`, `device`, `DataLoader`.
# It presumably expects `pairs_collate_fn` to yield batches with 'anchor',
# 'positive' and 'negatives' tensors - TODO confirm against the dataset code.

# from src.datasets.nclt_pairs import NCLTPairsDataset, pairs_collate_fn
# from src.datasets.transforms import build_transforms
#
# aug_config = {
#     'point_cloud': config['nclt']['point_cloud'],
#     'augmentation': training.get('augmentation', {}),
# }
# train_transform = build_transforms(aug_config, is_train=True)
#
# train_dataset = NCLTPairsDataset(
#     config_path='configs/dataset_config.yaml',
#     split='train',
#     transform=train_transform,
# )
#
# train_loader = DataLoader(
#     train_dataset,
#     batch_size=training.get('batch_size', 32),
#     shuffle=True,
#     num_workers=2,
#     collate_fn=pairs_collate_fn,
# )
#
# epochs = training.get('epochs', 80)
# for epoch in range(epochs):
#     model.train()
#     epoch_loss = 0.0
#     for batch in train_loader:
#         anchor_desc = model(batch['anchor'].to(device))
#         positive_desc = model(batch['positive'].to(device))
#         negative_desc = model(batch['negatives'].to(device))
#         loss = loss_fn(anchor_desc, positive_desc, negative_desc)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         epoch_loss += loss.item()
#     print(f'Epoch {epoch}: loss={epoch_loss/len(train_loader):.4f}')

# Until the loop above is enabled, this cell only reports that it is a stub.
print('Training loop placeholder - uncomment when data is ready')

## 6. Evaluation

In [None]:
# Evaluation sketch (disabled until training is implemented).
# NOTE(review): `val_loader` is not defined anywhere in this notebook, not
# even in the commented training sketch - it must be created before this
# code is enabled. The sketch also relies on `model`, `device`, `torch`, `np`
# and `plt` from earlier cells.
#
# from src.evaluation.metrics import recall_at_k
# from src.evaluation.visualization import plot_recall_at_k
#
# # Extract descriptors for validation set
# model.eval()
# descriptors = []
# positions = []
#
# with torch.no_grad():
#     for batch in val_loader:
#         desc = model(batch['anchor'].to(device))
#         descriptors.append(desc.cpu().numpy())
#         positions.append(batch['anchor_pose'][:, :3, 3].numpy())
#
# descriptors = np.concatenate(descriptors)
# positions = np.concatenate(positions)
#
# # Compute recall@K
# # The first half of the descriptors and the second half are passed as two
# # separate sets to recall_at_k.
# n = len(descriptors)
# mid = n // 2
# recall = recall_at_k(
#     descriptors[:mid], descriptors[mid:],
#     positions[:mid], positions[mid:],
#     k_values=[1, 5, 10],
# )
# print(f'Recall: {recall}')
# plot_recall_at_k(recall)
# plt.show()

# Until the code above is enabled, this cell only reports that it is a stub.
print('Evaluation placeholder - uncomment after training')

## 7. Save Checkpoints

In [None]:
# Save to Kaggle output
import os

# Write checkpoints to Kaggle's working directory when it exists, otherwise
# to a local ./checkpoints folder.
kaggle_working = Path('/kaggle/working')
if kaggle_working.exists():
    output_dir = kaggle_working
else:
    output_dir = Path('./checkpoints')
output_dir.mkdir(parents=True, exist_ok=True)

# Saving stays disabled until a trained model exists:
# torch.save({
#     'model_state_dict': model.state_dict(),
#     'optimizer_state_dict': optimizer.state_dict(),
#     'config': training,
# }, output_dir / 'best_model.pth')
# print(f'Model saved to {output_dir / "best_model.pth"}')

print(f'Checkpoint directory: {output_dir}')
print('Uncomment save code after training is implemented')