In [1]:
import gc
import numpy as np
import polars as pl
import tensorflow as tf
from pathlib import Path
from tensorflow import shape, minimum
from tensorflow.keras import backend as k
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import pad_sequences, Sequence, to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import (
    Dense, Input, Conv1D, MaxPooling1D, GlobalAveragePooling1D, GlobalMaxPooling1D, Concatenate,
    BatchNormalization, GRU, Dropout, add, Activation, Multiply, Reshape,
    LayerNormalization, Add, Bidirectional, LSTM, UpSampling1D, Lambda, GaussianNoise, MultiHeadAttention
)
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score

from src.nn_blocks import tof_block, residual_se_cnn_block, TransformerBlock, tof_block_2, features_processing, unet_se_cnn

NUM_CLASSES = 18


# --- Gated Model 1: Based on CNN-RNN Hybrid ---
def create_gated_cnn_rnn(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # IMU branch
    x1 = residual_se_cnn_block(imu, 64, 3, drop=0.2, wd=wd) # Output: (None, 64, 64)
    x1 = residual_se_cnn_block(x1, 128, 5, drop=0.2, wd=wd) # Output: (None, 32, 128)
    x1 = Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(wd)))(x1) # Output: (None, 32, 256)
    
    # Standard ToF branch
    x2 = tof_block_2(tof, wd) # Output: (None, 32, 128)

    # --- FIX: Project x2 to match x1's feature dimension before processing ---
    x2_projected = Dense(256, activation='relu')(x2)

    # Now both inputs to features_processing have shape (None, 32, 256)
    x = features_processing(x1, x2_projected)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Gated Model 2: Based on UNet_Style ---
def create_gated_unet(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # IMU branch
    x1 = unet_se_cnn(imu, unet_depth=4, base_filters=64, kernel_size=5, drop=0.3) # Output: (None, 128, 64)
    
    # Standard ToF branch
    x2 = tof_block_2(tof, wd) # Output: (None, 32, 128)

    # We will use a simpler approach for this model.
    x1_pooled = GlobalAveragePooling1D()(x1)
    x2_pooled = GlobalAveragePooling1D()(x2)
    x = Concatenate()([x1_pooled, x2_pooled])
    
    x = Dropout(0.4)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Gated Model 3: Based on CNN_Transformer ---
def create_gated_cnn_transformer(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # IMU branch
    x1 = residual_se_cnn_block(imu, 64, 3, drop=0.2, wd=wd) # Output: (None, 64, 64)
    x1 = residual_se_cnn_block(x1, 128, 5, drop=0.2, wd=wd) # Output: (None, 32, 128)
    x1 = TransformerBlock(embed_dim=128, num_heads=4, ff_dim=128, rate=0.3)(x1) # Output: (None, 32, 128)
    x1 = residual_se_cnn_block(x1, 64, 3, drop=0.2, wd=wd) # Output: (None, 16, 64)
    x1 = residual_se_cnn_block(x1, 128, 5, drop=0.2, wd=wd) # Output: (None, 8, 128)    
    x1 = TransformerBlock(embed_dim=128, num_heads=4, ff_dim=128, rate=0.3)(x1) # Output: (None, 8, 128)
    
    # Standard ToF branch
    x2 = tof_block_2(tof, wd) # Output: (None, 32, 128)
    x2 = tf.keras.layers.MaxPooling1D(4)(x2)

    x = features_processing(x1, x2)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

def best_unet_1(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    x1 = unet_se_cnn(imu, 3, base_filters=128, kernel_size=3)
    x2 = tof_block(tof, wd)

    x = features_processing(x1, x2)
    x = tf.keras.layers.Dropout(0.3)(x) 
    main_out = tf.keras.layers.Dense(18, activation="softmax", name="main_output")(x)
    gate_out = tf.keras.layers.Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return tf.keras.models.Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

def best_unet_2(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    x1 = unet_se_cnn(imu, 3, base_filters=128, kernel_size=3)
    x2 = tof_block_2(tof, wd)

    x = features_processing(x1, x2)
    x = tf.keras.layers.Dropout(0.3)(x) 
    main_out = tf.keras.layers.Dense(18, activation="softmax", name="main_output")(x)
    gate_out = tf.keras.layers.Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return tf.keras.models.Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

2025-08-23 20:59:08.848228: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755979148.870711 4081569 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755979148.876568 4081569 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755979148.892881 4081569 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755979148.892910 4081569 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755979148.892912 4081569 computation_placer.cc:177] computation placer alr

In [2]:
# =====================================================================================
# 5 NEW ADVANCED MODEL ARCHITECTURES
# =====================================================================================

from src.nn_blocks import match_time_steps, wave_block, res_se_cnn_decoder_block

# --- Advanced Model 2: Stacked Transformer Tower ---
def create_advanced_model_2_transformer_tower(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # Strong CNN backbone to create rich features for the Transformer
    x1 = residual_se_cnn_block(imu, 64, 3, drop=0.2, wd=wd)
    x1 = residual_se_cnn_block(x1, 128, 5, drop=0.2, wd=wd) # Output shape: (None, 32, 128)
    
    # Stacked Transformer Tower
    # Each block attends to the output of the previous one, building deeper context.
    x1 = TransformerBlock(embed_dim=128, num_heads=4, ff_dim=256, rate=0.3)(x1)
    x1 = TransformerBlock(embed_dim=128, num_heads=4, ff_dim=256, rate=0.3)(x1)
    x1 = TransformerBlock(embed_dim=128, num_heads=4, ff_dim=256, rate=0.3)(x1)
    x1 = TransformerBlock(embed_dim=128, num_heads=4, ff_dim=256, rate=0.3)(x1)
    
    # Standard ToF branch
    x2 = tof_block_2(tof, wd) # Output shape: (None, 32, 128)

    # Merge and classify
    x = features_processing(x1, x2)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Advanced Model 3: Hybrid UNet + WaveNet ---
def create_advanced_model_3_unet_wave(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    x1_unet = unet_se_cnn(imu, unet_depth=3, base_filters=64, kernel_size=5)
    x1_wave = wave_block(imu, 64, 3, n=5, dropout_rate=0.3) # n=5 -> dilations up to 16
    
    x1_unet_matched, x1_wave_matched = match_time_steps(x1_unet, x1_wave)
    x1 = Concatenate()([x1_unet_matched, x1_wave_matched])
    
    x2 = tof_block(tof, wd)

    x = features_processing(x1, x2)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

def create_wave_net(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    x1 = wave_block(imu, 128, 3, n=4, dropout_rate=0.3) 
    x2 = tof_block(tof, wd)

    x = features_processing(x1, x2)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Advanced Model 4: Triple Stacked Block Design ---
def cnn_gru_block(x, filters, kernel_size, wd=1e-4):
    # A self-contained block combining CNN and GRU
    x_cnn = residual_se_cnn_block(x, filters, kernel_size, wd=wd)
    x_gru = Bidirectional(GRU(filters // 2, return_sequences=True))(x_cnn)
    return x_gru

def cnn_gru_block(x, filters, kernel_size, wd=1e-4):
    """
    A simplified and robust block that first applies a CNN, then a GRU.
    """
    # 1. CNN part for feature extraction and downsampling
    x = residual_se_cnn_block(x, filters, kernel_size, wd=wd)
    
    # 2. GRU part for sequence processing
    x = Bidirectional(GRU(filters, return_sequences=True, kernel_regularizer=l2(wd)))(x)
    
    return x

def create_advanced_model_4_stacked_blocks(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # Apply the hybrid block three times
    x1 = cnn_gru_block(imu, 64, 3)  # Output: (None, 64, 128)
    x1 = cnn_gru_block(x1, 128, 5) # Output: (None, 32, 256)
    
    # The final block will not return sequences to simplify the final merge
    x1 = Bidirectional(GRU(128, return_sequences=False))(x1) # Output: (None, 256)
    
    # Standard ToF branch, but we need to aggregate it to match x1
    x2 = tof_block_2(tof, wd) # Output: (None, 32, 128)
    x2 = GlobalAveragePooling1D()(x2) # Output: (None, 128)

    # Merge the two aggregated feature vectors
    x = Concatenate()([x1, x2]) # Output: (None, 256 + 128) = (None, 384)
    
    # Final classifier MLP
    x = Dropout(0.4)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Advanced Model 5: UNet with BiLSTM Bottleneck ---
def unet_se_cnn_bilstm(x, unet_depth=3, base_filters=64, kernel_size=3, drop=0.3):
    filters = base_filters
    skips = []
    for _ in range(unet_depth):
        x = residual_se_cnn_block(x, filters, kernel_size, drop=drop)
        skips.append(x)
        filters *= 2
    
    # --- BiLSTM Bottleneck ---
    # Process the most compressed representation sequentially
    x = Bidirectional(LSTM(filters // 2, return_sequences=True))(x)
    
    for skip in reversed(skips):
        filters //= 2
        x = res_se_cnn_decoder_block(x, filters, kernel_size, drop=drop, skip_connection=skip)
    return x

def create_advanced_model_1_deep_unet(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # --- IMU Branches ---
    x1_unet = unet_se_cnn(imu, unet_depth=4, base_filters=128, kernel_size=5, drop=0.3)
    x1_conv_k3 = residual_se_cnn_block(imu, 64, 3)
    x1_conv_k7 = residual_se_cnn_block(imu, 64, 7)
    
    # --- FIX: Aggregate each branch BEFORE merging ---
    # This creates a fixed-size vector from each branch, avoiding shape conflicts.
    p1 = GlobalAveragePooling1D()(x1_unet)
    p2 = GlobalAveragePooling1D()(x1_conv_k3)
    p3 = GlobalAveragePooling1D()(x1_conv_k7)
    
    # --- ToF Branch ---
    x2 = tof_block_2(tof, wd)
    p4 = GlobalAveragePooling1D()(x2)

    # Concatenate the aggregated feature vectors
    x = Concatenate()([p1, p2, p3, p4])
    
    # --- Final Classifier MLP ---
    x = Dropout(0.4)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(256, activation='relu')(x)
    
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

def create_advanced_model_5_unet_bilstm(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # Use the UNet with the BiLSTM bottleneck
    x1 = unet_se_cnn_bilstm(imu, unet_depth=3, base_filters=128, kernel_size=3)
    
    # Standard ToF branch
    x2 = tof_block_2(tof, wd)

    # --- FIX: Use the robust aggregation strategy instead of features_processing ---
    x1_pooled = GlobalAveragePooling1D()(x1)
    x2_pooled = GlobalAveragePooling1D()(x2)
    x = Concatenate()([x1_pooled, x2_pooled])

    # --- Final Classifier MLP ---
    x = Dropout(0.4)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(256, activation='relu')(x)
    
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

In [3]:
# =====================================================================================
# 3 NEW ADVANCED MODEL ARCHITECTURES
# =====================================================================================
from src.nn_blocks import attention_layer

def create_advanced_model_A_dual_unet(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # Branch 1: A deep U-Net for the IMU data
    x1_raw = unet_se_cnn(imu, unet_depth=4, base_filters=128, kernel_size=5, drop=0.3)
    
    # Branch 2: A parallel, slightly lighter U-Net for the ToF/Thermal data
    x2_raw = unet_se_cnn(tof, unet_depth=3, base_filters=64, kernel_size=5, drop=0.3)

    # --- FIX: Project both branches to a common feature dimension (e.g., 128) ---
    # This ensures the input to features_processing is consistent.
    x1 = Conv1D(128, 1, padding='same', activation='relu', name='imu_projection')(x1_raw)
    x2 = Conv1D(128, 1, padding='same', activation='relu', name='tof_projection')(x2_raw)
    
    # Now both x1 and x2 have shape (None, 128, 128)
    # They can be safely passed to the features_processing block.
    x = features_processing(x1, x2)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Advanced Model B: Cross-Attention Fusion ---
# Hypothesis: Instead of just concatenating the IMU and ToF branches, we can create
# richer features by allowing them to "talk to each other." The IMU branch will learn
# what to pay attention to in the ToF data, and vice-versa.
def create_advanced_model_B_cross_attention(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # 1. Create strong, downsampled feature representations for both branches
    # Output Shape for both: (None, 32, 128)
    x1 = residual_se_cnn_block(imu, 64, 3, drop=0.2, wd=wd)
    x1 = residual_se_cnn_block(x1, 128, 5, drop=0.2, wd=wd)
    
    x2 = tof_block_2(tof, wd)

    # 2. Cross-Attention Fusion
    # The IMU branch queries the ToF branch for relevant context
    imu_attends_tof = tf.keras.layers.Attention()([x1, x2])
    # The ToF branch queries the IMU branch for relevant context
    tof_attends_imu = tf.keras.layers.Attention()([x2, x1])
    
    # 3. Create an enriched representation by concatenating all perspectives
    # The final tensor contains the original features plus the context-aware features.
    # Shape: (None, 32, 128 + 128 + 128 + 128) = (None, 32, 512)
    x = Concatenate()([x1, imu_attends_tof, x2, tof_attends_imu])
    
    # 4. Final Processing
    # We use a powerful sequence processor on this ultra-rich tensor
    x = Bidirectional(GRU(256, return_sequences=True, kernel_regularizer=l2(wd)))(x)
    x = attention_layer(x)
    
    x = Dropout(0.4)(x)
    x = Dense(256, activation='relu')(x)
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Advanced Model C: Stacked Hybrid Blocks ---
# Hypothesis: A single block of (CNN -> RNN) is good. Repeatedly stacking this
# hybrid block will allow the model to learn progressively more abstract and

# powerful spatio-temporal features.
def cnn_lstm_block(x, filters, kernel_size, drop=0.2, wd=1e-4):
    # A self-contained, reusable block
    x = residual_se_cnn_block(x, filters, kernel_size, drop=drop, wd=wd)
    x = Bidirectional(LSTM(filters, return_sequences=True, kernel_regularizer=l2(wd)))(x)
    return x

def create_advanced_model_C_stacked_hybrid(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # --- IMU Branch: Stacked Hybrid Blocks ---
    # Each block refines the output of the previous one
    # Input: (128, D) -> Block1: (64, 128) -> Block2: (32, 256)
    x1 = cnn_lstm_block(imu, 64, 3)
    x1 = cnn_lstm_block(x1, 128, 5)
    
    # --- ToF Branch ---
    # Output: (32, 128)
    x2 = tof_block_2(tof, wd)
    # Project ToF features to match the final IMU feature dimension (256)
    x2_projected = Dense(256, activation='relu')(x2)

    # Now both inputs have shape (None, 32, 256) and can be processed
    x = features_processing(x1, x2_projected)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

In [4]:
# =====================================================================================
# 3 NEW ADVANCED MODEL ARCHITECTURES
# =====================================================================================

# --- Advanced Model A: BERT-Fusion (Keras Implementation) ---
# Hypothesis: Using a Transformer (BERT) as a late-stage fusion layer for features
# from three separate, specialized branches will create the most powerful representation.
# This is a direct translation of the PyTorch model's core idea.
def create_advanced_model_A_bert_fusion(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof_and_thm = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)
    
    # We need to split ToF and Thermal for separate processing
    # Assuming thm_cols are the first 5 in the tof_and_thm tensor
    thm = tf.keras.layers.Lambda(lambda t: t[:, :, :5])(tof_and_thm)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, 5:])(tof_and_thm)

    # 1. Create three separate feature extraction branches
    # IMU Branch
    x_imu = residual_se_cnn_block(imu, 128, 3)
    x_imu = residual_se_cnn_block(x_imu, 256, 5) # Shape: (None, 32, 256)
    
    # Thermal Branch
    x_thm = residual_se_cnn_block(thm, 64, 3)
    x_thm = residual_se_cnn_block(x_thm, 128, 5)
    x_thm = Conv1D(256, 1, padding='same', activation='relu')(x_thm) # Project to 256 features
    
    # ToF Branch
    x_tof = residual_se_cnn_block(tof, 128, 3)
    x_tof = residual_se_cnn_block(x_tof, 256, 5) # Shape: (None, 32, 256)
    
    # 2. Concatenate along the feature axis and feed into a Transformer
    # Shape: (None, 32, 256+256+256) -> (None, 32, 768)
    x = Concatenate()([x_imu, x_thm, x_tof])
    
    # Transformer (BERT-like) layers for deep fusion
    x = TransformerBlock(embed_dim=768, num_heads=8, ff_dim=1024, rate=0.2)(x)
    x = TransformerBlock(embed_dim=768, num_heads=8, ff_dim=1024, rate=0.2)(x)
    
    # 3. Use Global Pooling to aggregate the time dimension
    x = GlobalAveragePooling1D()(x)
    
    # 4. Final Classifier MLP
    x = Dropout(0.4)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Advanced Model B: Hyper-UNet ---
# Hypothesis: Since U-Nets are the top performers, an even deeper and wider U-Net
# with more filters and a deeper encoder/decoder structure will capture more complex features.
def create_advanced_model_B_hyper_unet(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # Branch 1: A very deep and wide U-Net for IMU data
    # unet_depth=5 creates a very deep model, base_filters=128 makes it wide.
    x1 = unet_se_cnn(imu, unet_depth=5, base_filters=128, kernel_size=5, drop=0.3)
    
    # Branch 2: A standard ToF block
    x2 = tof_block_2(tof, wd)

    # Project both branches to a common, large feature dimension before merging
    x1_proj = Conv1D(128, 1, padding='same', activation='relu')(x1)
    x2_proj = Conv1D(128, 1, padding='same', activation='relu')(x2)
    
    # Use the standard features_processing block to merge and classify
    x = features_processing(x1_proj, x2_proj)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Advanced Model C: Parallel UNet-Transformer Hybrid ---
# Hypothesis: The IMU signal contains both local patterns (best for U-Net) and global
# context (best for Transformer). Processing the IMU with both backbones in parallel
# and fusing their outputs will create the ultimate feature representation.
def create_advanced_model_C_parallel_hybrid(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # --- IMU Branch is now two parallel streams ---
    
    # Stream 1: U-Net for multi-resolution analysis
    imu_unet = unet_se_cnn(imu, unet_depth=4, base_filters=128, kernel_size=5)
    
    # Stream 2: CNN -> Transformer Tower for global context
    imu_cnn = residual_se_cnn_block(imu, 64, 3)
    imu_cnn = residual_se_cnn_block(imu_cnn, 128, 5) # Shape: (None, 32, 128)
    imu_transformer = TransformerBlock(embed_dim=128, num_heads=4, ff_dim=256)(imu_cnn)
    
    # --- Fusion of IMU streams ---
    imu_unet_matched, imu_transformer_matched = match_time_steps(imu_unet, imu_transformer)
    x1 = Concatenate()([imu_unet_matched, imu_transformer_matched]) # Shape: (None, 32, 256)
    
    # --- ToF Branch ---
    x2 = tof_block_2(tof, wd) # Shape: (None, 32, 128)

    # --- FIX: Project both branches to a common, predictable feature dimension ---
    # Let's project both to 256 features, so the merged result is 512.
    x1_proj = Conv1D(256, 1, padding='same', activation='relu', name='imu_projection')(x1)
    x2_proj = Conv1D(256, 1, padding='same', activation='relu', name='tof_projection')(x2)
    
    # Now both x1_proj and x2_proj have shape (None, 32, 256)
    # They can be safely passed to the features_processing block.
    x = features_processing(x1_proj, x2_proj)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

In [5]:
# import polars as pl
# df = pl.read_parquet('output/imu_physics_feats.parquet')
# df.columns

In [6]:
from tensorflow.keras import Layer, Sequential

def ImuFeatureExtractorLayer(imu_input):
    """A Keras layer to perform on-the-fly feature engineering."""
    acc = imu_input[:, :, :3]  # Assuming raw acc_x, y, z are the first 3 features
    gyro = imu_input[:, :, 3:6] # Assuming raw rot_w,x,y,z -> angular velocity are next
    
    acc_mag = tf.norm(acc, axis=-1, keepdims=True)
    gyro_mag = tf.norm(gyro, axis=-1, keepdims=True)
    
    # Jerk (diff) requires padding to maintain time dimension
    jerk = tf.pad(acc[:, 1:, :] - acc[:, :-1, :], [[0, 0], [1, 0], [0, 0]])
    
    # Squared values
    acc_pow = tf.square(acc)
    
    # Concatenate all derived features
    return Concatenate()([acc, gyro, acc_mag, gyro_mag, jerk, acc_pow])

def create_new_model_1_in_model_fe(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    # IMPORTANT: This model expects the RAW acc/rot features, not the engineered ones.
    # You will need to adjust your data pipeline to feed the raw features.
    imu_raw = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # 1. On-the-fly feature engineering branch
    x1 = ImuFeatureExtractorLayer(imu_raw)
    
    # 2. Standard CNN backbone to process these rich features
    x1 = residual_se_cnn_block(x1, 128, 5)
    x1 = residual_se_cnn_block(x1, 256, 7)
    
    # 3. Standard ToF branch
    x2 = tof_block_2(tof, wd)

    # 4. Merge and classify
    x = features_processing(x1, x2)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# =====================================================================================
# 3 NEW ADVANCED PANNs-BASED MODEL ARCHITECTURES
# =====================================================================================

def create_panns_model_A_rnn_head(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # --- IMU Branch: Parallel CNNs (PANNs Backbone) ---
    # Each branch downsamples time to 32 and outputs 128 features
    k3 = residual_se_cnn_block(imu, 128, 3)
    k5 = residual_se_cnn_block(imu, 128, 5)
    k7 = residual_se_cnn_block(imu, 128, 7)
    
    # Concatenate the multi-scale features
    # Shape: (None, 32, 128 + 128 + 128) = (None, 32, 384)
    x1 = Concatenate()([k3, k5, k7])
    
    # --- ToF Branch ---
    x2 = tof_block(tof, wd) # Shape: (None, 32, 128)

    # --- Merge and Process with RNN Head ---
    # Project ToF features to match the IMU feature dimension for a cleaner merge
    x2_proj = Conv1D(384, 1, padding='same', activation='relu')(x2)
    
    # Concatenate the full feature set
    x = Concatenate()([x1, x2_proj]) # Shape: (None, 32, 384 + 384) = (None, 32, 768)
    
    # Add a powerful RNN head to learn sequential patterns from the rich features
    x = Bidirectional(GRU(384, return_sequences=True, kernel_regularizer=l2(wd)))(x)
    x = attention_layer(x) # Use attention to summarize the sequence
    
    # --- Final Classifier MLP ---
    x = Dropout(0.4)(x)
    x = Dense(256, activation='relu')(x)
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

def pann_rnn_head_feat_processing(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # --- IMU Branch: Parallel CNNs (PANNs Backbone) ---
    # Each branch downsamples time to 32 and outputs 128 features
    k3 = residual_se_cnn_block(imu, 128, 3)
    k5 = residual_se_cnn_block(imu, 128, 5)
    k7 = residual_se_cnn_block(imu, 128, 7)
    
    # Shape: (None, 32, 128 + 128 + 128) = (None, 32, 384)
    x1 = Concatenate()([k3, k5, k7])
    x2 = tof_block(tof, wd) # Shape: (None, 32, 128)
    x2_proj = Conv1D(384, 1, padding='same', activation='relu')(x2)
    
    x = features_processing(x1, x2_proj)
    x = Dropout(0.4)(x)
    x = Dense(256, activation='relu')(x)
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

In [7]:
def ImuFeatureExtractorLayer(imu_input):
    """A Keras layer to perform on-the-fly feature engineering."""
    acc = tf.keras.layers.Lambda(lambda t: t[:, :, :3])(imu_input)
    gyro = tf.keras.layers.Lambda(lambda t: t[:, :, 3:6])(imu_input)
    
    # --- THIS IS THE FIX ---
    # Wrap each raw TensorFlow function in a Keras Lambda layer.
    
    acc_mag = tf.keras.layers.Lambda(lambda t: tf.norm(t, axis=-1, keepdims=True))(acc)
    gyro_mag = tf.keras.layers.Lambda(lambda t: tf.norm(t, axis=-1, keepdims=True))(gyro)
    
    # The tf.pad function also needs to be wrapped.
    jerk = tf.keras.layers.Lambda(
        lambda t: tf.pad(t[:, 1:, :] - t[:, :-1, :], [[0, 0], [1, 0], [0, 0]])
    )(acc)
    
    # tf.square is a simple operation, but for consistency, we can wrap it too.
    acc_pow = tf.keras.layers.Lambda(tf.square)(acc)
    
    # Concatenate all the resulting KerasTensors
    return Concatenate()([acc, gyro, acc_mag, gyro_mag, jerk, acc_pow])

def create_new_model_1_in_model_fe(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu_raw = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # 1. On-the-fly feature engineering branch
    x1 = ImuFeatureExtractorLayer(imu_raw)
    
    # 2. Standard CNN backbone to process these rich features
    x1 = residual_se_cnn_block(x1, 128, 5)
    x1 = residual_se_cnn_block(x1, 256, 7)
    
    # 3. Standard ToF branch
    x2 = tof_block_2(tof, wd)

    # --- FIX: Aggregate each branch BEFORE merging ---
    # This creates fixed-size vectors and avoids all shape conflicts.
    x1_pooled = GlobalAveragePooling1D()(x1)
    x2_pooled = GlobalAveragePooling1D()(x2)
    
    # Concatenate the aggregated feature vectors
    x = Concatenate()([x1_pooled, x2_pooled])
    
    # --- Final Classifier MLP ---
    x = Dropout(0.4)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(256, activation='relu')(x)
    
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

In [8]:
def generate_tof_features_for_inference(df: pl.DataFrame) -> pl.DataFrame:
    decay_weights = np.power(0.9, np.arange(64))
    x_coords, y_coords = np.meshgrid(np.arange(8), np.arange(8))
    feature_expressions = []
    for i in range(1, 6):
        pixel_cols = [f"tof_{i}_v{p}" for p in range(64)]
        if not all(col in df.columns for col in pixel_cols): continue
        list_expr = pl.concat_list([pl.when(pl.col(c) == -1).then(None).otherwise(pl.col(c)) for c in pixel_cols]).alias(f"tof_{i}_list")
        feature_expressions.extend([list_expr.list.mean().alias(f'tof_{i}_mean'), list_expr.list.std().alias(f'tof_{i}_std'),
                                    list_expr.list.min().alias(f'tof_{i}_min'), list_expr.list.max().alias(f'tof_{i}_max'),
                                    list_expr.list.median().alias(f'tof_{i}_median'), list_expr.list.diff().list.mean().alias(f'tof_{i}_diff_mean'),
                                    list_expr.list.drop_nulls().list.len().alias(f'tof_{i}_active_pixels'),
                                    # list_expr.list.drop_nulls().map_elements(pl_skew, return_dtype=pl.Float64).alias(f'tof_{i}_skew'),
                                    # list_expr.list.drop_nulls().map_elements(pl_kurtosis, return_dtype=pl.Float64).alias(f'tof_{i}_kurtosis')
                                    ])
        tof_data_exprs = [pl.when(pl.col(c) == -1).then(None).otherwise(pl.col(c)) for c in pixel_cols]
        feature_expressions.append(pl.sum_horizontal([(expr * weight).fill_null(0) for expr, weight in zip(tof_data_exprs, decay_weights)]).alias(f'tof_{i}_mean_decay'))
        weights_exprs = [(1 / (expr + 1e-6)).fill_null(0) for expr in tof_data_exprs]
        total_weight_expr = pl.sum_horizontal(weights_exprs)
        centroid_x_expr = pl.when(total_weight_expr > 1e-9).then(pl.sum_horizontal([(w * c) for w, c in zip(weights_exprs, x_coords.ravel())]) / total_weight_expr).otherwise(None)
        centroid_y_expr = pl.when(total_weight_expr > 1e-9).then(pl.sum_horizontal([(w * c) for w, c in zip(weights_exprs, y_coords.ravel())]) / total_weight_expr).otherwise(None)
        feature_expressions.extend([centroid_x_expr.alias(f'tof_{i}_centroid_x'), centroid_y_expr.alias(f'tof_{i}_centroid_y')])
    if feature_expressions:
        df = df.with_columns(feature_expressions)
    return df

In [9]:
# =====================================================================================
# CONFIGURATION
# =====================================================================================
LR_INIT = 5e-4
WD = 3e-3
NUM_CLASSES = 18
BATCH_SIZE = 64
N_SPLITS = 5 
MAX_PAD_LEN = 128
FEATURE_DIR = Path('output')
RAW_DIR = Path('input/cmi-detect-behavior-with-sensor-data')
RANDOM_STATE = 42
SAMPLING_RATE_HZ = 200 # Use the correct sampling rate

from src.merge_feats_dynamic import merge_feature_sets
from src.functions import create_sequence_dataset, generate_gate_targets, train_model
from src.imu_physics_feats import calculate_angular_velocity
from src.nn_blocks import GatedMixupGenerator

# =====================================================================================
# TRAINING LOGIC
# =====================================================================================


final_df = pl.read_parquet(FEATURE_DIR / "cleaned_base_train_data.parquet")
demographics_df = pl.read_csv(RAW_DIR / "train_demographics.csv")
final_df = final_df.join(demographics_df, on='subject', how='left')
meta_cols = ['sequence_id', 'sequence_counter', 'subject', 'gesture']


le = LabelEncoder()
gesture_encoded = le.fit_transform(final_df.get_column('gesture'))
final_df = final_df.with_columns(pl.Series("gesture_int", gesture_encoded))  
print(f"  Final merged DataFrame created with shape: {final_df.shape}")

all_columns = final_df.columns
thm_cols = [c for c in all_columns if c.startswith('thm')]
tof_cols = [c for c in all_columns if c.startswith('tof')]
imu_cols = [c for c in all_columns if c.startswith('acc_') or c.startswith('rot_')]
# 2. Create the final, ordered list of all features.
all_feature_cols = imu_cols + thm_cols + tof_cols
imu_dim = len(imu_cols)
print(f"  Training with {len(all_feature_cols)} total features ({imu_dim} IMU, {len(tof_cols)} ToF/Thm).")    

# 3. Reorder the DataFrame to match the required structure for the model.
metadata_to_keep = ['sequence_id', 'sequence_counter', 'gesture', 'gesture_int', 'subject']
final_df = final_df.select(metadata_to_keep + all_feature_cols)
print("  DataFrame columns have been reordered for the model.")

# --- Step 3: Prepare for Cross-Validation ---
cv_info = final_df.group_by("sequence_id").agg(pl.first("gesture_int")).sort("sequence_id")
all_sequence_ids = cv_info.get_column("sequence_id").to_numpy()
y_for_split = cv_info.get_column("gesture_int").to_numpy()

input_shape = (MAX_PAD_LEN, len(all_feature_cols)) 

model_results = {}

# Your new list of experiments
model_builders = [
    ("fe_nn_model", lambda: create_new_model_1_in_model_fe(input_shape, imu_dim)),
]

print("  Loading clean base DataFrame...")
final_df = pl.read_parquet(FEATURE_DIR / "cleaned_base_train_data.parquet")
demographics_df = pl.read_csv(RAW_DIR / "train_demographics.csv")
final_df = final_df.join(demographics_df, on='subject', how='left')

le = LabelEncoder()
gesture_encoded = le.fit_transform(final_df.get_column('gesture'))
final_df = final_df.with_columns(pl.Series("gesture_int", gesture_encoded))  

# --- Step 2: Engineer ToF/Thm Features ---
# We must create the engineered ToF/Thm features as the model expects them.
print("  Engineering ToF/Thermal features...")
final_df = generate_tof_features_for_inference(final_df) # Assuming this function is defined

# --- Step 3: Define the specific feature columns for THIS model ---
# This model requires RAW IMU features and ENGINEERED ToF/Thm features.

# Raw IMU features for the on-the-fly feature extractor layer
# IMPORTANT: The ImuFeatureExtractorLayer expects acc_x,y,z then angular_vel_x,y,z.
# We must calculate angular velocity here.

grouped = final_df.partition_by("sequence_id", maintain_order=True)
angular_vel_dfs = []
for group in grouped:
    rot_df = group.select(["rot_x", "rot_y", "rot_z", "rot_w"])
    angular_vel = calculate_angular_velocity(rot_df, SAMPLING_RATE_HZ)
    av_df = pl.DataFrame(angular_vel, schema=["angular_vel_x", "angular_vel_y", "angular_vel_z"])
    av_df = av_df.with_columns(pl.Series("sequence_counter", group["sequence_counter"]))
    av_df = av_df.with_columns(pl.Series("sequence_id", group["sequence_id"]))
    angular_vel_dfs.append(av_df)
angular_vel_df = pl.concat(angular_vel_dfs)
final_df = final_df.join(angular_vel_df, on=['sequence_id', 'sequence_counter'], how='left').fill_null(0)

raw_imu_cols = [
    'acc_x', 'acc_y', 'acc_z',
    'angular_vel_x', 'angular_vel_y', 'angular_vel_z'
]

# Engineered ToF/Thm features
engineered_tof_cols = [c for c in final_df.columns if c.startswith(('tof_', 'thm_')) and '_v' not in c]

# Create the final, ordered list of all features for the model input
all_feature_cols = raw_imu_cols + engineered_tof_cols
imu_dim = len(raw_imu_cols) # The split point is the number of RAW features
print(f"  Training with {len(all_feature_cols)} total features ({imu_dim} raw IMU).")    

# Reorder the DataFrame to match the required structure
metadata_to_keep = ['sequence_id', 'sequence_counter', 'gesture', 'gesture_int', 'subject']
final_df = final_df.select(metadata_to_keep + all_feature_cols)
print("  DataFrame columns have been reordered for the model.")

# --- Step 4: Prepare for Cross-Validation ---
cv_info = final_df.group_by("sequence_id").agg(pl.first("gesture_int")).sort("sequence_id")
all_sequence_ids = cv_info.get_column("sequence_id").to_numpy()
y_for_split = cv_info.get_column("gesture_int").to_numpy()

input_shape = (MAX_PAD_LEN, len(all_feature_cols)) 
model_results = {}
model_builders = [("In_Model_FE", lambda: create_new_model_1_in_model_fe(input_shape, imu_dim))]

for model_name, model_builder in model_builders:
    print("\n" + "="*60)
    print(f"▶ Training and Evaluating Model: {model_name}")
    print("="*60)

    kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    fold_accuracies = []
    all_preds = []
    all_labels = []
    best_epochs = []

    for fold_idx, (train_indices, val_indices) in enumerate(kf.split(all_sequence_ids, y_for_split)):
        print(f"\n=== Fold {fold_idx + 1}/{N_SPLITS} for {model_name} ===")
        train_ids = all_sequence_ids[train_indices]
        val_ids = all_sequence_ids[val_indices]

        train_df = final_df.filter(pl.col('sequence_id').is_in(train_ids))
        val_df = final_df.filter(pl.col('sequence_id').is_in(val_ids))
        
        scaler = StandardScaler()
        train_features_scaled = scaler.fit_transform(train_df.select(all_feature_cols))
        val_features_scaled = scaler.transform(val_df.select(all_feature_cols))
        
        X_train_scaled_features = pl.DataFrame(train_features_scaled, schema=all_feature_cols)
        X_val_scaled_features = pl.DataFrame(val_features_scaled, schema=all_feature_cols)

        meta_cols_to_keep = ['sequence_id', 'sequence_counter', 'gesture_int']
        train_df_final = train_df.select(meta_cols_to_keep).with_columns(X_train_scaled_features)
        val_df_final = val_df.select(meta_cols_to_keep).with_columns(X_val_scaled_features)

        # The gate target needs to be based on the raw ToF columns
        train_gate_target_df = generate_gate_targets(train_df, [c for c in train_df.columns if c.startswith('tof_v')])
        val_gate_target_df = generate_gate_targets(val_df, [c for c in val_df.columns if c.startswith('tof_v')])

        X_train, y_train, train_gate_target = create_sequence_dataset(train_df_final, all_feature_cols, train_gate_target_df)
        X_val, y_val, val_gate_target = create_sequence_dataset(val_df_final, all_feature_cols, val_gate_target_df)

        X_train_padded = pad_sequences(X_train, maxlen=MAX_PAD_LEN, padding='post', truncating='post', dtype='float32')
        X_val_padded = pad_sequences(X_val, maxlen=MAX_PAD_LEN, padding='post', truncating='post', dtype='float32')
        
        y_train_cat = to_categorical(y_train, num_classes=NUM_CLASSES)
        y_val_cat = to_categorical(y_val, num_classes=NUM_CLASSES)
        
        train_dataset = GatedMixupGenerator(
            X=X_train_padded, y=y_train_cat, gate_targets=train_gate_target,
            batch_size=BATCH_SIZE, imu_dim=imu_dim, alpha=0.2, masking_prob=0.25
        )
        val_dataset = tf.data.Dataset.from_tensor_slices((
            X_val_padded, {'main_output': y_val_cat, 'tof_gate': val_gate_target[:, np.newaxis]}
        )).batch(BATCH_SIZE).cache().prefetch(tf.data.AUTOTUNE)

        del X_train, y_train, X_val, y_val, X_train_padded, X_val_padded
        gc.collect()
        
        model = model_builder()
        history = train_model(model, train_dataset, val_dataset, 150, LR_INIT, WD)
    
        monitor_metric = 'val_main_output_accuracy' if isinstance(model.output, dict) else 'val_accuracy'
        best_epoch = np.argmax(history.history[monitor_metric]) + 1
        best_epochs.append(best_epoch)
        print(f"--- Fold {fold_idx + 1} Best Epoch: {best_epoch} ---")

        # --- EVALUATION ---
        val_preds = model.predict(val_dataset)
        main_output_preds = val_preds['main_output']
        
        y_pred_fold = np.argmax(main_output_preds, axis=1)
        y_true_fold = np.argmax(y_val_cat, axis=1)
        fold_acc = accuracy_score(y_true_fold, y_pred_fold)
        fold_accuracies.append(fold_acc)
        print(f"Fold {fold_idx + 1} Accuracy: {fold_acc:.4f}")
        all_preds.append(y_pred_fold)
        all_labels.append(y_true_fold)

        del train_dataset, model, val_dataset
        gc.collect()

    # --- FINAL OOF REPORT for this model architecture ---
    print(f"\n=== OOF Summary for {model_name} ===")
    print(f"Per-fold Accuracies: {[round(a, 4) for a in fold_accuracies]}")
    print(f"Mean Accuracy: {np.mean(fold_accuracies):.4f} ± {np.std(fold_accuracies):.4f}")
    
    # --- NEW: Report on the best epochs found ---
    avg_best_epoch = int(np.mean(best_epochs))
    print(f"Best epochs per fold: {best_epochs}")
    print(f"Average best epoch: {avg_best_epoch}")
    
    # Store the results for this model
    model_results[model_name] = {
        'mean_accuracy': np.mean(fold_accuracies),
        'avg_best_epoch': avg_best_epoch
    }

    y_all_pred = np.concatenate(all_preds)
    y_all_true = np.concatenate(all_labels)
    print("\n=== Overall Classification Report ===")
    print(classification_report(y_all_true, y_all_pred, target_names=le.classes_, digits=4))

# --- FINAL SUMMARY ACROSS ALL MODELS ---
print("\n" + "="*60)
print("▶ FINAL MODEL EXPERIMENT SUMMARY")
print("="*60)
for model_name, results in model_results.items():
    print(f"  - {model_name}: Mean Accuracy = {results['mean_accuracy']:.4f}, Avg Best Epoch = {results['avg_best_epoch']}")

  Final merged DataFrame created with shape: (574945, 356)
  Training with 332 total features (7 IMU, 320 ToF/Thm).
  DataFrame columns have been reordered for the model.
  Loading clean base DataFrame...
  Engineering ToF/Thermal features...
  Training with 61 total features (6 raw IMU).
  DataFrame columns have been reordered for the model.

▶ Training and Evaluating Model: In_Model_FE

=== Fold 1/5 for In_Model_FE ===


I0000 00:00:1755979285.256308 4081569 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4714 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1060, pci bus id: 0000:01:00.0, compute capability: 6.1
2025-08-23 21:01:25.268325: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 50939392 exceeds 10% of free system memory.
2025-08-23 21:01:25.542469: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 50939392 exceeds 10% of free system memory.


LR Scheduler: 102 steps per epoch, 15300 total decay steps.
Epoch 1/150


  self._warn_if_super_not_called()
I0000 00:00:1755979295.032878 4081826 service.cc:152] XLA service 0x7b02a8001b80 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1755979295.032926 4081826 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce GTX 1060, Compute Capability 6.1
2025-08-23 21:01:35.363622: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1755979296.914054 4081826 cuda_dnn.cc:529] Loaded cuDNN version 90300
2025-08-23 21:01:39.569495: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 12.11GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-08-23 21:01:40.185664: W external/local_xla/xla/tsl/framework/bfc_allocator.c

[1m  7/102[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 23ms/step - loss: 3.1167 - main_output_accuracy: 0.0843 - main_output_loss: 2.9572 - tof_gate_loss: 0.3371

I0000 00:00:1755979311.577409 4081826 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2025-08-23 21:01:54.441442: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 11.61GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-08-23 21:01:54.909895: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 9.92GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-08-23 21:01:57.128786: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 8.14GiB with freed_by_count=0. The caller indicates that this is 

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step - loss: 2.8338 - main_output_accuracy: 0.1512 - main_output_loss: 2.7241 - tof_gate_loss: 0.0928

2025-08-23 21:02:08.711933: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 50939392 exceeds 10% of free system memory.
2025-08-23 21:02:11.618407: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 10.03GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 219ms/step - loss: 2.8323 - main_output_accuracy: 0.1517 - main_output_loss: 2.7227 - tof_gate_loss: 0.0923 - val_loss: 2.4670 - val_main_output_accuracy: 0.3237 - val_main_output_loss: 2.3576 - val_tof_gate_loss: 0.1002
Epoch 2/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - loss: 2.3609 - main_output_accuracy: 0.3267 - main_output_loss: 2.2705 - tof_gate_loss: 0.0057 - val_loss: 2.0356 - val_main_output_accuracy: 0.4543 - val_main_output_loss: 1.9445 - val_tof_gate_loss: 0.0286
Epoch 3/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - loss: 2.1989 - main_output_accuracy: 0.3979 - main_output_loss: 2.1111 - tof_gate_loss: 0.0041 - val_loss: 1.8666 - val_main_output_accuracy: 0.5089 - val_main_output_loss: 1.7819 - val_tof_gate_loss: 0.0090
Epoch 4/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - loss: 2.0631 - main_output

2025-08-23 21:09:18.754180: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 50908160 exceeds 10% of free system memory.
2025-08-23 21:09:18.819515: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 50908160 exceeds 10% of free system memory.


LR Scheduler: 102 steps per epoch, 15300 total decay steps.
Epoch 1/150


  self._warn_if_super_not_called()


[1m 34/102[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1s[0m 27ms/step - loss: 3.0499 - main_output_accuracy: 0.0925 - main_output_loss: 2.8797 - tof_gate_loss: 0.3906

2025-08-23 21:09:43.281884: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 11.67GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-08-23 21:09:43.829406: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 9.95GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-08-23 21:09:45.959230: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 8.14GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 231ms/step - loss: 2.8605 - main_output_accuracy: 0.1375 - main_output_loss: 2.7318 - tof_gate_loss: 0.1845 - val_loss: 2.4369 - val_main_output_accuracy: 0.3196 - val_main_output_loss: 2.3296 - val_tof_gate_loss: 0.0808
Epoch 2/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - loss: 2.4057 - main_output_accuracy: 0.3076 - main_output_loss: 2.3151 - tof_gate_loss: 0.0082 - val_loss: 2.1308 - val_main_output_accuracy: 0.4166 - val_main_output_loss: 2.0395 - val_tof_gate_loss: 0.0222
Epoch 3/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - loss: 2.1825 - main_output_accuracy: 0.3890 - main_output_loss: 2.0960 - tof_gate_loss: 0.0044 - val_loss: 1.8820 - val_main_output_accuracy: 0.5098 - val_main_output_loss: 1.7923 - val_tof_gate_loss: 0.0116
Epoch 4/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - loss: 2.0704 - main_output

  self._warn_if_super_not_called()


[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 162ms/step - loss: 2.8394 - main_output_accuracy: 0.1490 - main_output_loss: 2.7294 - tof_gate_loss: 0.0941 - val_loss: 2.4632 - val_main_output_accuracy: 0.3331 - val_main_output_loss: 2.3539 - val_tof_gate_loss: 0.0971
Epoch 2/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - loss: 2.3428 - main_output_accuracy: 0.3159 - main_output_loss: 2.2520 - tof_gate_loss: 0.0077 - val_loss: 2.1048 - val_main_output_accuracy: 0.4227 - val_main_output_loss: 2.0093 - val_tof_gate_loss: 0.0255
Epoch 3/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - loss: 2.2082 - main_output_accuracy: 0.3997 - main_output_loss: 2.1202 - tof_gate_loss: 0.0054 - val_loss: 1.9080 - val_main_output_accuracy: 0.4865 - val_main_output_loss: 1.8186 - val_tof_gate_loss: 0.0115
Epoch 4/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - loss: 2.0608 - main_output

  self._warn_if_super_not_called()


[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 171ms/step - loss: 2.8399 - main_output_accuracy: 0.1535 - main_output_loss: 2.7194 - tof_gate_loss: 0.1440 - val_loss: 2.4965 - val_main_output_accuracy: 0.2926 - val_main_output_loss: 2.3860 - val_tof_gate_loss: 0.1068
Epoch 2/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - loss: 2.3549 - main_output_accuracy: 0.3210 - main_output_loss: 2.2643 - tof_gate_loss: 0.0077 - val_loss: 2.1227 - val_main_output_accuracy: 0.4104 - val_main_output_loss: 2.0311 - val_tof_gate_loss: 0.0314
Epoch 3/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - loss: 2.1474 - main_output_accuracy: 0.4077 - main_output_loss: 2.0600 - tof_gate_loss: 0.0042 - val_loss: 1.9262 - val_main_output_accuracy: 0.4982 - val_main_output_loss: 1.8419 - val_tof_gate_loss: 0.0208
Epoch 4/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - loss: 2.0993 - main_output

  self._warn_if_super_not_called()


[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 165ms/step - loss: 2.8320 - main_output_accuracy: 0.1479 - main_output_loss: 2.7237 - tof_gate_loss: 0.0855 - val_loss: 2.4908 - val_main_output_accuracy: 0.3294 - val_main_output_loss: 2.3811 - val_tof_gate_loss: 0.1050
Epoch 2/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - loss: 2.3632 - main_output_accuracy: 0.3040 - main_output_loss: 2.2735 - tof_gate_loss: 0.0052 - val_loss: 2.1559 - val_main_output_accuracy: 0.3951 - val_main_output_loss: 2.0659 - val_tof_gate_loss: 0.0178
Epoch 3/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - loss: 2.2195 - main_output_accuracy: 0.3937 - main_output_loss: 2.1324 - tof_gate_loss: 0.0038 - val_loss: 1.9058 - val_main_output_accuracy: 0.4767 - val_main_output_loss: 1.8186 - val_tof_gate_loss: 0.0077
Epoch 4/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - loss: 2.0987 - main_output

In [None]:
import traceback
# =====================================================================================
# ARCHITECTURE SANITY CHECK
# =====================================================================================

# --- Step 1: Get a sample batch and define shapes ---
# (This part of your code is correct)
# Make sure your train_dataset is created before this block
try:
    sample_batch = next(iter(train_dataset))
    sample_input = sample_batch[0]
    input_shape = sample_input.shape[1:]
    imu_dim = len(imu_cols) # Assuming imu_cols is defined
    print(f"Sample input shape for testing: {sample_input.shape}\n")
except Exception as e:
    print(f"Could not get a sample from the dataset. Error: {e}")
    # Exit if we can't get a sample to test with
    exit()

# --- Step 2: Create a list of all model-building functions ---
# (This part of your code is correct)
# model_builders = [
#     ("CNN_Baseline", lambda: create_model_1_cnn_baseline(input_shape)),
#     ("GRU_Baseline", lambda: create_model_2_gru_baseline(input_shape)),
#     ("CNN_RNN_Hybrid", lambda: create_model_3_cnn_rnn_hybrid(input_shape)),
#     ("WaveNet_Style", lambda: create_model_4_wavenet_style(input_shape)),
#     ("UNet_Style", lambda: create_model_5_unet_style(input_shape)),
#     ("Transformer", lambda: create_model_6_transformer(input_shape)),
#     ("CNN_Transformer", lambda: create_model_7_cnn_transformer(input_shape)),
#     # For your two-branch model, you'll need the full IMU+ToF dataset
#     # ("Two_Branch", lambda: create_model_8_two_branch(input_shape, imu_dim)),
# ]

model_builders = [
    ("wave_net", lambda: create_wave_net(input_shape, imu_dim)),
    ("unet_wave", lambda: create_advanced_model_3_unet_wave(input_shape, imu_dim)),
    ("Advanced_Dual_UNet", lambda: create_advanced_model_A_dual_unet(input_shape, imu_dim)),
    ("Hyper UNet", lambda: create_advanced_model_B_hyper_unet(input_shape, imu_dim)),
]

# --- Step 3: Loop through the models, build them, and test with the sample ---
print("--- Testing all model architectures with a sample batch ---")
for model_name, model_builder in model_builders:
    print("\n" + "="*60)
    print(f"▶ Testing Model: {model_name}")
    print("="*60)
    
    try:
        # 1. Build the model using the builder function
        model = model_builder()
        
        # Optional: Print the model summary to check its structure
        print(f"Model Summary for {model_name}:")
        model.summary()
        
        # 2. Pass the sample input through the model
        print(f"\nPerforming forward pass for {model_name}...")
        output = model(sample_input)
        
        # 3. Print the output shape to verify it's correct
        print(f"✅ SUCCESS: Model '{model_name}' ran successfully.")
        # For multi-output models, output might be a list/dict. For single, it's a tensor.
        if isinstance(output, dict):
            for key, value in output.items():
                print(f"   Output '{key}' shape: {value.shape}")
        elif isinstance(output, list):
            for i, value in enumerate(output):
                print(f"   Output {i} shape: {value.shape}")
        else:
            print(f"   Output shape: {output.shape}")

    except Exception as e:
        print(f"❌ ERROR: Model '{model_name}' failed to build or run.")
        traceback.print_exc() # This will print the full error traceback
        
    # Clean up the created model to save memory
    del model
    gc.collect()

print("\n--- Model architecture testing complete. ---")

Could not get a sample from the dataset. Error: name 'train_dataset' is not defined
--- Testing all model architectures with a sample batch ---

▶ Testing Model: wave_net
Model Summary for wave_net:



Performing forward pass for wave_net...
❌ ERROR: Model 'wave_net' failed to build or run.

▶ Testing Model: unet_wave


Traceback (most recent call last):
  File "/tmp/ipykernel_4081569/3681017911.py", line 58, in <module>
    output = model(sample_input)
                   ^^^^^^^^^^^^
NameError: name 'sample_input' is not defined


Model Summary for unet_wave:



Performing forward pass for unet_wave...
❌ ERROR: Model 'unet_wave' failed to build or run.

▶ Testing Model: Advanced_Dual_UNet


Traceback (most recent call last):
  File "/tmp/ipykernel_4081569/3681017911.py", line 58, in <module>
    output = model(sample_input)
                   ^^^^^^^^^^^^
NameError: name 'sample_input' is not defined


Model Summary for Advanced_Dual_UNet:



Performing forward pass for Advanced_Dual_UNet...
❌ ERROR: Model 'Advanced_Dual_UNet' failed to build or run.


Traceback (most recent call last):
  File "/tmp/ipykernel_4081569/3681017911.py", line 58, in <module>
    output = model(sample_input)
                   ^^^^^^^^^^^^
NameError: name 'sample_input' is not defined



▶ Testing Model: Hyper UNet
Model Summary for Hyper UNet:



Performing forward pass for Hyper UNet...
❌ ERROR: Model 'Hyper UNet' failed to build or run.

--- Model architecture testing complete. ---


Traceback (most recent call last):
  File "/tmp/ipykernel_4081569/3681017911.py", line 58, in <module>
    output = model(sample_input)
                   ^^^^^^^^^^^^
NameError: name 'sample_input' is not defined


: 