In [1]:
import gc
import numpy as np
import polars as pl
import tensorflow as tf
from pathlib import Path
from tensorflow import shape, minimum
from tensorflow.keras import backend as k
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import pad_sequences, Sequence, to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import (
    Dense, Input, Conv1D, MaxPooling1D, GlobalAveragePooling1D, GlobalMaxPooling1D, Concatenate,
    BatchNormalization, GRU, Dropout, add, Activation, Multiply, Reshape,
    LayerNormalization, Add, Bidirectional, LSTM, UpSampling1D, Lambda, GaussianNoise, MultiHeadAttention
)
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score

from src.nn_blocks import tof_block, residual_se_cnn_block, TransformerBlock, tof_block_2, features_processing, unet_se_cnn

NUM_CLASSES = 18


# --- Gated Model 1: Based on CNN-RNN Hybrid ---
def create_gated_cnn_rnn(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # IMU branch
    x1 = residual_se_cnn_block(imu, 64, 3, drop=0.2, wd=wd) # Output: (None, 64, 64)
    x1 = residual_se_cnn_block(x1, 128, 5, drop=0.2, wd=wd) # Output: (None, 32, 128)
    x1 = Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(wd)))(x1) # Output: (None, 32, 256)
    
    # Standard ToF branch
    x2 = tof_block_2(tof, wd) # Output: (None, 32, 128)

    # --- FIX: Project x2 to match x1's feature dimension before processing ---
    x2_projected = Dense(256, activation='relu')(x2)

    # Now both inputs to features_processing have shape (None, 32, 256)
    x = features_processing(x1, x2_projected)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Gated Model 2: Based on UNet_Style ---
def create_gated_unet(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # IMU branch
    x1 = unet_se_cnn(imu, unet_depth=4, base_filters=64, kernel_size=5, drop=0.3) # Output: (None, 128, 64)
    
    # Standard ToF branch
    x2 = tof_block_2(tof, wd) # Output: (None, 32, 128)

    # We will use a simpler approach for this model.
    x1_pooled = GlobalAveragePooling1D()(x1)
    x2_pooled = GlobalAveragePooling1D()(x2)
    x = Concatenate()([x1_pooled, x2_pooled])
    
    x = Dropout(0.4)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Gated Model 3: Based on CNN_Transformer ---
def create_gated_cnn_transformer(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # IMU branch
    x1 = residual_se_cnn_block(imu, 64, 3, drop=0.2, wd=wd) # Output: (None, 64, 64)
    x1 = residual_se_cnn_block(x1, 128, 5, drop=0.2, wd=wd) # Output: (None, 32, 128)
    x1 = TransformerBlock(embed_dim=128, num_heads=4, ff_dim=128, rate=0.3)(x1) # Output: (None, 32, 128)
    x1 = residual_se_cnn_block(x1, 64, 3, drop=0.2, wd=wd) # Output: (None, 16, 64)
    x1 = residual_se_cnn_block(x1, 128, 5, drop=0.2, wd=wd) # Output: (None, 8, 128)    
    x1 = TransformerBlock(embed_dim=128, num_heads=4, ff_dim=128, rate=0.3)(x1) # Output: (None, 8, 128)
    
    # Standard ToF branch
    x2 = tof_block_2(tof, wd) # Output: (None, 32, 128)
    x2 = tf.keras.layers.MaxPooling1D(4)(x2)

    x = features_processing(x1, x2)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

def best_unet_1(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    x1 = unet_se_cnn(imu, 3, base_filters=128, kernel_size=3)
    x2 = tof_block(tof, wd)

    x = features_processing(x1, x2)
    x = tf.keras.layers.Dropout(0.3)(x) 
    main_out = tf.keras.layers.Dense(18, activation="softmax", name="main_output")(x)
    gate_out = tf.keras.layers.Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return tf.keras.models.Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

def best_unet_2(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    x1 = unet_se_cnn(imu, 3, base_filters=128, kernel_size=3)
    x2 = tof_block_2(tof, wd)

    x = features_processing(x1, x2)
    x = tf.keras.layers.Dropout(0.3)(x) 
    main_out = tf.keras.layers.Dense(18, activation="softmax", name="main_output")(x)
    gate_out = tf.keras.layers.Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return tf.keras.models.Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

2025-08-23 21:45:42.812359: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755981942.831419 4113464 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755981942.837283 4113464 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755981942.853367 4113464 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755981942.853399 4113464 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755981942.853401 4113464 computation_placer.cc:177] computation placer alr

In [2]:
# =====================================================================================
# 5 NEW ADVANCED MODEL ARCHITECTURES
# =====================================================================================

from src.nn_blocks import match_time_steps, wave_block, res_se_cnn_decoder_block

# --- Advanced Model 2: Stacked Transformer Tower ---
def create_advanced_model_2_transformer_tower(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # Strong CNN backbone to create rich features for the Transformer
    x1 = residual_se_cnn_block(imu, 64, 3, drop=0.2, wd=wd)
    x1 = residual_se_cnn_block(x1, 128, 5, drop=0.2, wd=wd) # Output shape: (None, 32, 128)
    
    # Stacked Transformer Tower
    # Each block attends to the output of the previous one, building deeper context.
    x1 = TransformerBlock(embed_dim=128, num_heads=4, ff_dim=256, rate=0.3)(x1)
    x1 = TransformerBlock(embed_dim=128, num_heads=4, ff_dim=256, rate=0.3)(x1)
    x1 = TransformerBlock(embed_dim=128, num_heads=4, ff_dim=256, rate=0.3)(x1)
    x1 = TransformerBlock(embed_dim=128, num_heads=4, ff_dim=256, rate=0.3)(x1)
    
    # Standard ToF branch
    x2 = tof_block_2(tof, wd) # Output shape: (None, 32, 128)

    # Merge and classify
    x = features_processing(x1, x2)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Advanced Model 3: Hybrid UNet + WaveNet ---
def create_advanced_model_3_unet_wave(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    x1_unet = unet_se_cnn(imu, unet_depth=3, base_filters=64, kernel_size=5)
    x1_wave = wave_block(imu, 64, 3, n=5, dropout_rate=0.3) # n=5 -> dilations up to 16
    
    x1_unet_matched, x1_wave_matched = match_time_steps(x1_unet, x1_wave)
    x1 = Concatenate()([x1_unet_matched, x1_wave_matched])
    
    x2 = tof_block(tof, wd)

    x = features_processing(x1, x2)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

def create_wave_net(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    x1 = wave_block(imu, 128, 3, n=4, dropout_rate=0.3) 
    x2 = tof_block(tof, wd)

    x = features_processing(x1, x2)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Advanced Model 4: Triple Stacked Block Design ---
def cnn_gru_block(x, filters, kernel_size, wd=1e-4):
    # A self-contained block combining CNN and GRU
    x_cnn = residual_se_cnn_block(x, filters, kernel_size, wd=wd)
    x_gru = Bidirectional(GRU(filters // 2, return_sequences=True))(x_cnn)
    return x_gru

def cnn_gru_block(x, filters, kernel_size, wd=1e-4):
    """
    A simplified and robust block that first applies a CNN, then a GRU.
    """
    # 1. CNN part for feature extraction and downsampling
    x = residual_se_cnn_block(x, filters, kernel_size, wd=wd)
    
    # 2. GRU part for sequence processing
    x = Bidirectional(GRU(filters, return_sequences=True, kernel_regularizer=l2(wd)))(x)
    
    return x

def create_advanced_model_4_stacked_blocks(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # Apply the hybrid block three times
    x1 = cnn_gru_block(imu, 64, 3)  # Output: (None, 64, 128)
    x1 = cnn_gru_block(x1, 128, 5) # Output: (None, 32, 256)
    
    # The final block will not return sequences to simplify the final merge
    x1 = Bidirectional(GRU(128, return_sequences=False))(x1) # Output: (None, 256)
    
    # Standard ToF branch, but we need to aggregate it to match x1
    x2 = tof_block_2(tof, wd) # Output: (None, 32, 128)
    x2 = GlobalAveragePooling1D()(x2) # Output: (None, 128)

    # Merge the two aggregated feature vectors
    x = Concatenate()([x1, x2]) # Output: (None, 256 + 128) = (None, 384)
    
    # Final classifier MLP
    x = Dropout(0.4)(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Advanced Model 5: UNet with BiLSTM Bottleneck ---
def unet_se_cnn_bilstm(x, unet_depth=3, base_filters=64, kernel_size=3, drop=0.3):
    filters = base_filters
    skips = []
    for _ in range(unet_depth):
        x = residual_se_cnn_block(x, filters, kernel_size, drop=drop)
        skips.append(x)
        filters *= 2
    
    # --- BiLSTM Bottleneck ---
    # Process the most compressed representation sequentially
    x = Bidirectional(LSTM(filters // 2, return_sequences=True))(x)
    
    for skip in reversed(skips):
        filters //= 2
        x = res_se_cnn_decoder_block(x, filters, kernel_size, drop=drop, skip_connection=skip)
    return x

def create_advanced_model_1_deep_unet(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # --- IMU Branches ---
    x1_unet = unet_se_cnn(imu, unet_depth=4, base_filters=128, kernel_size=5, drop=0.3)
    x1_conv_k3 = residual_se_cnn_block(imu, 64, 3)
    x1_conv_k7 = residual_se_cnn_block(imu, 64, 7)
    
    # --- FIX: Aggregate each branch BEFORE merging ---
    # This creates a fixed-size vector from each branch, avoiding shape conflicts.
    p1 = GlobalAveragePooling1D()(x1_unet)
    p2 = GlobalAveragePooling1D()(x1_conv_k3)
    p3 = GlobalAveragePooling1D()(x1_conv_k7)
    
    # --- ToF Branch ---
    x2 = tof_block_2(tof, wd)
    p4 = GlobalAveragePooling1D()(x2)

    # Concatenate the aggregated feature vectors
    x = Concatenate()([p1, p2, p3, p4])
    
    # --- Final Classifier MLP ---
    x = Dropout(0.4)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(256, activation='relu')(x)
    
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

def create_advanced_model_5_unet_bilstm(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # Use the UNet with the BiLSTM bottleneck
    x1 = unet_se_cnn_bilstm(imu, unet_depth=3, base_filters=128, kernel_size=3)
    
    # Standard ToF branch
    x2 = tof_block_2(tof, wd)

    # --- FIX: Use the robust aggregation strategy instead of features_processing ---
    x1_pooled = GlobalAveragePooling1D()(x1)
    x2_pooled = GlobalAveragePooling1D()(x2)
    x = Concatenate()([x1_pooled, x2_pooled])

    # --- Final Classifier MLP ---
    x = Dropout(0.4)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(256, activation='relu')(x)
    
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

In [3]:
# =====================================================================================
# 3 NEW ADVANCED MODEL ARCHITECTURES
# =====================================================================================
from src.nn_blocks import attention_layer

def create_advanced_model_A_dual_unet(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # Branch 1: A deep U-Net for the IMU data
    x1_raw = unet_se_cnn(imu, unet_depth=4, base_filters=128, kernel_size=5, drop=0.3)
    
    # Branch 2: A parallel, slightly lighter U-Net for the ToF/Thermal data
    x2_raw = unet_se_cnn(tof, unet_depth=3, base_filters=64, kernel_size=5, drop=0.3)

    # --- FIX: Project both branches to a common feature dimension (e.g., 128) ---
    # This ensures the input to features_processing is consistent.
    x1 = Conv1D(128, 1, padding='same', activation='relu', name='imu_projection')(x1_raw)
    x2 = Conv1D(128, 1, padding='same', activation='relu', name='tof_projection')(x2_raw)
    
    # Now both x1 and x2 have shape (None, 128, 128)
    # They can be safely passed to the features_processing block.
    x = features_processing(x1, x2)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Advanced Model B: Cross-Attention Fusion ---
# Hypothesis: Instead of just concatenating the IMU and ToF branches, we can create
# richer features by allowing them to "talk to each other." The IMU branch will learn
# what to pay attention to in the ToF data, and vice-versa.
def create_advanced_model_B_cross_attention(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # 1. Create strong, downsampled feature representations for both branches
    # Output Shape for both: (None, 32, 128)
    x1 = residual_se_cnn_block(imu, 64, 3, drop=0.2, wd=wd)
    x1 = residual_se_cnn_block(x1, 128, 5, drop=0.2, wd=wd)
    
    x2 = tof_block_2(tof, wd)

    # 2. Cross-Attention Fusion
    # The IMU branch queries the ToF branch for relevant context
    imu_attends_tof = tf.keras.layers.Attention()([x1, x2])
    # The ToF branch queries the IMU branch for relevant context
    tof_attends_imu = tf.keras.layers.Attention()([x2, x1])
    
    # 3. Create an enriched representation by concatenating all perspectives
    # The final tensor contains the original features plus the context-aware features.
    # Shape: (None, 32, 128 + 128 + 128 + 128) = (None, 32, 512)
    x = Concatenate()([x1, imu_attends_tof, x2, tof_attends_imu])
    
    # 4. Final Processing
    # We use a powerful sequence processor on this ultra-rich tensor
    x = Bidirectional(GRU(256, return_sequences=True, kernel_regularizer=l2(wd)))(x)
    x = attention_layer(x)
    
    x = Dropout(0.4)(x)
    x = Dense(256, activation='relu')(x)
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Advanced Model C: Stacked Hybrid Blocks ---
# Hypothesis: A single block of (CNN -> RNN) is good. Repeatedly stacking this
# hybrid block will allow the model to learn progressively more abstract and

# powerful spatio-temporal features.
def cnn_lstm_block(x, filters, kernel_size, drop=0.2, wd=1e-4):
    # A self-contained, reusable block
    x = residual_se_cnn_block(x, filters, kernel_size, drop=drop, wd=wd)
    x = Bidirectional(LSTM(filters, return_sequences=True, kernel_regularizer=l2(wd)))(x)
    return x

def create_advanced_model_C_stacked_hybrid(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # --- IMU Branch: Stacked Hybrid Blocks ---
    # Each block refines the output of the previous one
    # Input: (128, D) -> Block1: (64, 128) -> Block2: (32, 256)
    x1 = cnn_lstm_block(imu, 64, 3)
    x1 = cnn_lstm_block(x1, 128, 5)
    
    # --- ToF Branch ---
    # Output: (32, 128)
    x2 = tof_block_2(tof, wd)
    # Project ToF features to match the final IMU feature dimension (256)
    x2_projected = Dense(256, activation='relu')(x2)

    # Now both inputs have shape (None, 32, 256) and can be processed
    x = features_processing(x1, x2_projected)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

In [4]:
# =====================================================================================
# 3 NEW ADVANCED MODEL ARCHITECTURES
# =====================================================================================

# --- Advanced Model A: BERT-Fusion (Keras Implementation) ---
# Hypothesis: Using a Transformer (BERT) as a late-stage fusion layer for features
# from three separate, specialized branches will create the most powerful representation.
# This is a direct translation of the PyTorch model's core idea.
def create_advanced_model_A_bert_fusion(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof_and_thm = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)
    
    # We need to split ToF and Thermal for separate processing
    # Assuming thm_cols are the first 5 in the tof_and_thm tensor
    thm = tf.keras.layers.Lambda(lambda t: t[:, :, :5])(tof_and_thm)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, 5:])(tof_and_thm)

    # 1. Create three separate feature extraction branches
    # IMU Branch
    x_imu = residual_se_cnn_block(imu, 128, 3)
    x_imu = residual_se_cnn_block(x_imu, 256, 5) # Shape: (None, 32, 256)
    
    # Thermal Branch
    x_thm = residual_se_cnn_block(thm, 64, 3)
    x_thm = residual_se_cnn_block(x_thm, 128, 5)
    x_thm = Conv1D(256, 1, padding='same', activation='relu')(x_thm) # Project to 256 features
    
    # ToF Branch
    x_tof = residual_se_cnn_block(tof, 128, 3)
    x_tof = residual_se_cnn_block(x_tof, 256, 5) # Shape: (None, 32, 256)
    
    # 2. Concatenate along the feature axis and feed into a Transformer
    # Shape: (None, 32, 256+256+256) -> (None, 32, 768)
    x = Concatenate()([x_imu, x_thm, x_tof])
    
    # Transformer (BERT-like) layers for deep fusion
    x = TransformerBlock(embed_dim=768, num_heads=8, ff_dim=1024, rate=0.2)(x)
    x = TransformerBlock(embed_dim=768, num_heads=8, ff_dim=1024, rate=0.2)(x)
    
    # 3. Use Global Pooling to aggregate the time dimension
    x = GlobalAveragePooling1D()(x)
    
    # 4. Final Classifier MLP
    x = Dropout(0.4)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Advanced Model B: Hyper-UNet ---
# Hypothesis: Since U-Nets are the top performers, an even deeper and wider U-Net
# with more filters and a deeper encoder/decoder structure will capture more complex features.
def create_advanced_model_B_hyper_unet(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # Branch 1: A very deep and wide U-Net for IMU data
    # unet_depth=5 creates a very deep model, base_filters=128 makes it wide.
    x1 = unet_se_cnn(imu, unet_depth=5, base_filters=128, kernel_size=5, drop=0.3)
    
    # Branch 2: A standard ToF block
    x2 = tof_block_2(tof, wd)

    # Project both branches to a common, large feature dimension before merging
    x1_proj = Conv1D(128, 1, padding='same', activation='relu')(x1)
    x2_proj = Conv1D(128, 1, padding='same', activation='relu')(x2)
    
    # Use the standard features_processing block to merge and classify
    x = features_processing(x1_proj, x2_proj)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# --- Advanced Model C: Parallel UNet-Transformer Hybrid ---
# Hypothesis: The IMU signal contains both local patterns (best for U-Net) and global
# context (best for Transformer). Processing the IMU with both backbones in parallel
# and fusing their outputs will create the ultimate feature representation.
def create_advanced_model_C_parallel_hybrid(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # --- IMU Branch is now two parallel streams ---
    
    # Stream 1: U-Net for multi-resolution analysis
    imu_unet = unet_se_cnn(imu, unet_depth=4, base_filters=128, kernel_size=5)
    
    # Stream 2: CNN -> Transformer Tower for global context
    imu_cnn = residual_se_cnn_block(imu, 64, 3)
    imu_cnn = residual_se_cnn_block(imu_cnn, 128, 5) # Shape: (None, 32, 128)
    imu_transformer = TransformerBlock(embed_dim=128, num_heads=4, ff_dim=256)(imu_cnn)
    
    # --- Fusion of IMU streams ---
    imu_unet_matched, imu_transformer_matched = match_time_steps(imu_unet, imu_transformer)
    x1 = Concatenate()([imu_unet_matched, imu_transformer_matched]) # Shape: (None, 32, 256)
    
    # --- ToF Branch ---
    x2 = tof_block_2(tof, wd) # Shape: (None, 32, 128)

    # --- FIX: Project both branches to a common, predictable feature dimension ---
    # Let's project both to 256 features, so the merged result is 512.
    x1_proj = Conv1D(256, 1, padding='same', activation='relu', name='imu_projection')(x1)
    x2_proj = Conv1D(256, 1, padding='same', activation='relu', name='tof_projection')(x2)
    
    # Now both x1_proj and x2_proj have shape (None, 32, 256)
    # They can be safely passed to the features_processing block.
    x = features_processing(x1_proj, x2_proj)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

In [5]:
# import polars as pl
# df = pl.read_parquet('output/imu_physics_feats.parquet')
# df.columns

In [6]:
from tensorflow.keras import Layer, Sequential

def ImuFeatureExtractorLayer(imu_input):
    """A Keras layer to perform on-the-fly feature engineering."""
    acc = imu_input[:, :, :3]  # Assuming raw acc_x, y, z are the first 3 features
    gyro = imu_input[:, :, 3:6] # Assuming raw rot_w,x,y,z -> angular velocity are next
    
    acc_mag = tf.norm(acc, axis=-1, keepdims=True)
    gyro_mag = tf.norm(gyro, axis=-1, keepdims=True)
    
    # Jerk (diff) requires padding to maintain time dimension
    jerk = tf.pad(acc[:, 1:, :] - acc[:, :-1, :], [[0, 0], [1, 0], [0, 0]])
    
    # Squared values
    acc_pow = tf.square(acc)
    
    # Concatenate all derived features
    return Concatenate()([acc, gyro, acc_mag, gyro_mag, jerk, acc_pow])

def create_new_model_1_in_model_fe(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    # IMPORTANT: This model expects the RAW acc/rot features, not the engineered ones.
    # You will need to adjust your data pipeline to feed the raw features.
    imu_raw = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # 1. On-the-fly feature engineering branch
    x1 = ImuFeatureExtractorLayer(imu_raw)
    
    # 2. Standard CNN backbone to process these rich features
    x1 = residual_se_cnn_block(x1, 128, 5)
    x1 = residual_se_cnn_block(x1, 256, 7)
    
    # 3. Standard ToF branch
    x2 = tof_block_2(tof, wd)

    # 4. Merge and classify
    x = features_processing(x1, x2)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# =====================================================================================
# 3 NEW ADVANCED PANNs-BASED MODEL ARCHITECTURES
# =====================================================================================

def create_panns_model_A_rnn_head(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # --- IMU Branch: Parallel CNNs (PANNs Backbone) ---
    # Each branch downsamples time to 32 and outputs 128 features
    k3 = residual_se_cnn_block(imu, 128, 3)
    k5 = residual_se_cnn_block(imu, 128, 5)
    k7 = residual_se_cnn_block(imu, 128, 7)
    
    # Concatenate the multi-scale features
    # Shape: (None, 32, 128 + 128 + 128) = (None, 32, 384)
    x1 = Concatenate()([k3, k5, k7])
    
    # --- ToF Branch ---
    x2 = tof_block(tof, wd) # Shape: (None, 32, 128)

    # --- Merge and Process with RNN Head ---
    # Project ToF features to match the IMU feature dimension for a cleaner merge
    x2_proj = Conv1D(384, 1, padding='same', activation='relu')(x2)
    
    # Concatenate the full feature set
    x = Concatenate()([x1, x2_proj]) # Shape: (None, 32, 384 + 384) = (None, 32, 768)
    
    # Add a powerful RNN head to learn sequential patterns from the rich features
    x = Bidirectional(GRU(384, return_sequences=True, kernel_regularizer=l2(wd)))(x)
    x = attention_layer(x) # Use attention to summarize the sequence
    
    # --- Final Classifier MLP ---
    x = Dropout(0.4)(x)
    x = Dense(256, activation='relu')(x)
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

def pann_rnn_head_feat_processing(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # --- IMU Branch: Parallel CNNs (PANNs Backbone) ---
    # Each branch downsamples time to 32 and outputs 128 features
    k3 = residual_se_cnn_block(imu, 128, 3)
    k5 = residual_se_cnn_block(imu, 128, 5)
    k7 = residual_se_cnn_block(imu, 128, 7)
    
    # Shape: (None, 32, 128 + 128 + 128) = (None, 32, 384)
    x1 = Concatenate()([k3, k5, k7])
    x2 = tof_block(tof, wd) # Shape: (None, 32, 128)
    x2_proj = Conv1D(384, 1, padding='same', activation='relu')(x2)
    
    x = features_processing(x1, x2_proj)
    x = Dropout(0.4)(x)
    x = Dense(256, activation='relu')(x)
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

In [7]:
def ImuFeatureExtractorLayer(imu_input):
    """A Keras layer to perform on-the-fly feature engineering."""
    acc = tf.keras.layers.Lambda(lambda t: t[:, :, :3])(imu_input)
    gyro = tf.keras.layers.Lambda(lambda t: t[:, :, 3:6])(imu_input)
    
    # --- THIS IS THE FIX ---
    # Wrap each raw TensorFlow function in a Keras Lambda layer.
    
    acc_mag = tf.keras.layers.Lambda(lambda t: tf.norm(t, axis=-1, keepdims=True))(acc)
    gyro_mag = tf.keras.layers.Lambda(lambda t: tf.norm(t, axis=-1, keepdims=True))(gyro)
    
    # The tf.pad function also needs to be wrapped.
    jerk = tf.keras.layers.Lambda(
        lambda t: tf.pad(t[:, 1:, :] - t[:, :-1, :], [[0, 0], [1, 0], [0, 0]])
    )(acc)
    
    # tf.square is a simple operation, but for consistency, we can wrap it too.
    acc_pow = tf.keras.layers.Lambda(tf.square)(acc)
    
    # Concatenate all the resulting KerasTensors
    return Concatenate()([acc, gyro, acc_mag, gyro_mag, jerk, acc_pow])

def create_new_model_1_in_model_fe(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu_raw = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # 1. On-the-fly feature engineering branch
    x1 = ImuFeatureExtractorLayer(imu_raw)
    
    # 2. Standard CNN backbone to process these rich features
    x1 = residual_se_cnn_block(x1, 128, 5)
    x1 = residual_se_cnn_block(x1, 256, 7)
    
    # 3. Standard ToF branch
    x2 = tof_block_2(tof, wd)

    # --- FIX: Aggregate each branch BEFORE merging ---
    # This creates fixed-size vectors and avoids all shape conflicts.
    x1_pooled = GlobalAveragePooling1D()(x1)
    x2_pooled = GlobalAveragePooling1D()(x2)
    
    # Concatenate the aggregated feature vectors
    x = Concatenate()([x1_pooled, x2_pooled])
    
    # --- Final Classifier MLP ---
    x = Dropout(0.4)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(256, activation='relu')(x)
    
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

In [None]:
def generate_tof_features_for_inference(df: pl.DataFrame) -> pl.DataFrame:
    decay_weights = np.power(0.9, np.arange(64))
    x_coords, y_coords = np.meshgrid(np.arange(8), np.arange(8))
    feature_expressions = []
    for i in range(1, 6):
        pixel_cols = [f"tof_{i}_v{p}" for p in range(64)]
        if not all(col in df.columns for col in pixel_cols): continue
        list_expr = pl.concat_list([pl.when(pl.col(c) == -1).then(None).otherwise(pl.col(c)) for c in pixel_cols]).alias(f"tof_{i}_list")
        feature_expressions.extend([list_expr.list.mean().alias(f'tof_{i}_mean'), list_expr.list.std().alias(f'tof_{i}_std'),
                                    list_expr.list.min().alias(f'tof_{i}_min'), list_expr.list.max().alias(f'tof_{i}_max'),
                                    list_expr.list.median().alias(f'tof_{i}_median'), list_expr.list.diff().list.mean().alias(f'tof_{i}_diff_mean'),
                                    list_expr.list.drop_nulls().list.len().alias(f'tof_{i}_active_pixels'),
                                    # list_expr.list.drop_nulls().map_elements(pl_skew, return_dtype=pl.Float64).alias(f'tof_{i}_skew'),
                                    # list_expr.list.drop_nulls().map_elements(pl_kurtosis, return_dtype=pl.Float64).alias(f'tof_{i}_kurtosis')
                                    ])
        tof_data_exprs = [pl.when(pl.col(c) == -1).then(None).otherwise(pl.col(c)) for c in pixel_cols]
        feature_expressions.append(pl.sum_horizontal([(expr * weight).fill_null(0) for expr, weight in zip(tof_data_exprs, decay_weights)]).alias(f'tof_{i}_mean_decay'))
        weights_exprs = [(1 / (expr + 1e-6)).fill_null(0) for expr in tof_data_exprs]
        total_weight_expr = pl.sum_horizontal(weights_exprs)
        centroid_x_expr = pl.when(total_weight_expr > 1e-9).then(pl.sum_horizontal([(w * c) for w, c in zip(weights_exprs, x_coords.ravel())]) / total_weight_expr).otherwise(None)
        centroid_y_expr = pl.when(total_weight_expr > 1e-9).then(pl.sum_horizontal([(w * c) for w, c in zip(weights_exprs, y_coords.ravel())]) / total_weight_expr).otherwise(None)
        feature_expressions.extend([centroid_x_expr.alias(f'tof_{i}_centroid_x'), centroid_y_expr.alias(f'tof_{i}_centroid_y')])
    if feature_expressions:
        df = df.with_columns(feature_expressions)
    return df

: 

In [None]:
# =====================================================================================
# CONFIGURATION
# =====================================================================================
LR_INIT = 5e-4
WD = 3e-3
NUM_CLASSES = 18
BATCH_SIZE = 64
N_SPLITS = 5 
MAX_PAD_LEN = 128
FEATURE_DIR = Path('output')
RAW_DIR = Path('input/cmi-detect-behavior-with-sensor-data')
RANDOM_STATE = 42
SAMPLING_RATE_HZ = 200 # Use the correct sampling rate

from src.merge_feats_dynamic import merge_feature_sets
from src.functions import create_sequence_dataset, generate_gate_targets, train_model
from src.imu_physics_feats import calculate_angular_velocity
from src.nn_blocks import GatedMixupGenerator

# =====================================================================================
# TRAINING LOGIC
# =====================================================================================


def create_end_to_end_model(input_shape, imu_dim, wd=1e-4):
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof_thm = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    # --- IMU Branch: Learns from raw acc and rot ---
    # A deep CNN tower to extract hierarchical features from the raw IMU signals.
    x1 = residual_se_cnn_block(imu, 64, 3, drop=0.2, wd=wd)
    x1 = residual_se_cnn_block(x1, 128, 5, drop=0.2, wd=wd)
    x1 = residual_se_cnn_block(x1, 256, 7, drop=0.2, wd=wd) # Shape: (None, 16, 256)
    
    # --- ToF/Thm Branch: Learns from raw pixels and temperatures ---
    # A lighter CNN tower to handle the high dimensionality of the raw ToF/Thm data.
    x2 = residual_se_cnn_block(tof_thm, 64, 3, drop=0.2, wd=wd)
    x2 = residual_se_cnn_block(x2, 128, 5, drop=0.2, wd=wd) # Shape: (None, 32, 128)
    # Project to match the IMU branch's feature dimension
    x2 = Conv1D(256, 1, padding='same', activation='relu')(x2) # Shape: (None, 32, 256)

    # --- Merge and Classify ---
    # Use the standard features_processing block to merge and learn context
    x = features_processing(x1, x2)
    x = Dropout(0.3)(x) 
    main_out = Dense(NUM_CLASSES, activation="softmax", name="main_output")(x)
    gate_out = Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

# =====================================================================================
# TRAINING LOGIC
# =====================================================================================

# --- Step 1: Load and Clean RAW Data ---
print("  Loading and cleaning raw data from train.csv...")
RAW_DIR = Path('input/cmi-detect-behavior-with-sensor-data')
df = pl.read_csv(RAW_DIR / "train.csv")
demographics_df = pl.read_csv(RAW_DIR / "train_demographics.csv")
df = df.join(demographics_df, on='subject', how='left')

# Define all columns that contain sensor readings
sensor_cols = [c for c in df.columns if c.startswith(('acc_', 'rot_', 'thm_', 'tof_'))]

# Clean the raw data by replacing -1 and imputing nulls.
df = df.with_columns(
    [pl.when(pl.col(c) == -1).then(None).otherwise(pl.col(c)).alias(c) for c in sensor_cols]
).with_columns(
    pl.col(sensor_cols).forward_fill().backward_fill().fill_null(0).over("sequence_id")
)

le = LabelEncoder()
gesture_encoded = le.fit_transform(df.get_column('gesture'))
final_df = df.with_columns(pl.Series("gesture_int", gesture_encoded))  
print(f"  Final DataFrame created with shape: {final_df.shape}")

# --- Step 2: Define Feature Columns from RAW Data ---
raw_imu_cols = [c for c in final_df.columns if c.startswith(('acc_', 'rot_'))]
raw_tof_thm_cols = [c for c in final_df.columns if c.startswith(('thm_', 'tof_'))]
all_feature_cols = raw_imu_cols + raw_tof_thm_cols
imu_dim = len(raw_imu_cols)
print(f"  Training with {len(all_feature_cols)} total raw features ({imu_dim} IMU).")    

# Reorder the DataFrame to match the required structure for the model.
metadata_to_keep = ['sequence_id', 'sequence_counter', 'gesture', 'gesture_int', 'subject']
final_df = final_df.select(metadata_to_keep + all_feature_cols)
print("  DataFrame columns have been reordered for the model.")

# --- Step 3: Prepare for Cross-Validation ---
cv_info = final_df.group_by("sequence_id").agg(pl.first("gesture_int")).sort("sequence_id")
all_sequence_ids = cv_info.get_column("sequence_id").to_numpy()
y_for_split = cv_info.get_column("gesture_int").to_numpy()

input_shape = (MAX_PAD_LEN, len(all_feature_cols)) 
model_results = {}
model_builders = [("End_To_End_CNN", lambda: create_end_to_end_model(input_shape, imu_dim))]

for model_name, model_builder in model_builders:
    print("\n" + "="*60)
    print(f"▶ Training and Evaluating Model: {model_name}")
    print("="*60)

    kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    fold_accuracies = []
    all_preds = []
    all_labels = []
    best_epochs = []

    for fold_idx, (train_indices, val_indices) in enumerate(kf.split(all_sequence_ids, y_for_split)):
        print(f"\n=== Fold {fold_idx + 1}/{N_SPLITS} for {model_name} ===")
        train_ids = all_sequence_ids[train_indices]
        val_ids = all_sequence_ids[val_indices]

        train_df = final_df.filter(pl.col('sequence_id').is_in(train_ids))
        val_df = final_df.filter(pl.col('sequence_id').is_in(val_ids))
        
        scaler = StandardScaler()
        train_features_scaled = scaler.fit_transform(train_df.select(all_feature_cols))
        val_features_scaled = scaler.transform(val_df.select(all_feature_cols))
        
        X_train_scaled_features = pl.DataFrame(train_features_scaled, schema=all_feature_cols)
        X_val_scaled_features = pl.DataFrame(val_features_scaled, schema=all_feature_cols)

        meta_cols_to_keep = ['sequence_id', 'sequence_counter', 'gesture_int']
        train_df_final = train_df.select(meta_cols_to_keep).with_columns(X_train_scaled_features)
        val_df_final = val_df.select(meta_cols_to_keep).with_columns(X_val_scaled_features)

        # Gate target is based on raw ToF columns
        tof_v_cols = [c for c in train_df.columns if c.startswith('tof_v')]
        train_gate_target_df = generate_gate_targets(train_df, tof_v_cols)
        val_gate_target_df = generate_gate_targets(val_df, tof_v_cols)

        X_train, y_train, train_gate_target = create_sequence_dataset(train_df_final, all_feature_cols, train_gate_target_df)
        X_val, y_val, val_gate_target = create_sequence_dataset(val_df_final, all_feature_cols, val_gate_target_df)

        X_train_padded = pad_sequences(X_train, maxlen=MAX_PAD_LEN, padding='post', truncating='post', dtype='float32')
        X_val_padded = pad_sequences(X_val, maxlen=MAX_PAD_LEN, padding='post', truncating='post', dtype='float32')
        
        y_train_cat = to_categorical(y_train, num_classes=NUM_CLASSES)
        y_val_cat = to_categorical(y_val, num_classes=NUM_CLASSES)
        
        train_dataset = GatedMixupGenerator(
            X=X_train_padded, y=y_train_cat, gate_targets=train_gate_target,
            batch_size=BATCH_SIZE, imu_dim=imu_dim, alpha=0.2, masking_prob=0.25
        )
        val_dataset = tf.data.Dataset.from_tensor_slices((
            X_val_padded, {'main_output': y_val_cat, 'tof_gate': val_gate_target[:, np.newaxis]}
        )).batch(BATCH_SIZE).cache().prefetch(tf.data.AUTOTUNE)

        del X_train, y_train, X_val, y_val, X_train_padded, X_val_padded
        gc.collect()
        
        model = model_builder()
        history = train_model(model, train_dataset, val_dataset, 150, LR_INIT, WD)
        
        monitor_metric = 'val_main_output_accuracy'
        best_epoch = np.argmax(history.history[monitor_metric]) + 1
        best_epochs.append(best_epoch)
        print(f"--- Fold {fold_idx + 1} Best Epoch: {best_epoch} ---")

        val_preds = model.predict(val_dataset)
        main_output_preds = val_preds['main_output']
        
        y_pred_fold = np.argmax(main_output_preds, axis=1)
        y_true_fold = np.argmax(y_val_cat, axis=1)
        fold_acc = accuracy_score(y_true_fold, y_pred_fold)
        fold_accuracies.append(fold_acc)
        print(f"Fold {fold_idx + 1} Accuracy: {fold_acc:.4f}")
        all_preds.append(y_pred_fold)
        all_labels.append(y_true_fold)

        del train_dataset, model, val_dataset
        gc.collect()

    # --- FINAL OOF REPORT for this model architecture ---
    print(f"\n=== OOF Summary for {model_name} ===")
    print(f"Per-fold Accuracies: {[round(a, 4) for a in fold_accuracies]}")
    print(f"Mean Accuracy: {np.mean(fold_accuracies):.4f} ± {np.std(fold_accuracies):.4f}")
    
    # --- NEW: Report on the best epochs found ---
    avg_best_epoch = int(np.mean(best_epochs))
    print(f"Best epochs per fold: {best_epochs}")
    print(f"Average best epoch: {avg_best_epoch}")
    
    # Store the results for this model
    model_results[model_name] = {
        'mean_accuracy': np.mean(fold_accuracies),
        'avg_best_epoch': avg_best_epoch
    }

    y_all_pred = np.concatenate(all_preds)
    y_all_true = np.concatenate(all_labels)
    print("\n=== Overall Classification Report ===")
    print(classification_report(y_all_true, y_all_pred, target_names=le.classes_, digits=4))

# --- FINAL SUMMARY ACROSS ALL MODELS ---
print("\n" + "="*60)
print("▶ FINAL MODEL EXPERIMENT SUMMARY")
print("="*60)
for model_name, results in model_results.items():
    print(f"  - {model_name}: Mean Accuracy = {results['mean_accuracy']:.4f}, Avg Best Epoch = {results['avg_best_epoch']}")

  Loading and cleaning raw data from train.csv...
  Final DataFrame created with shape: (574945, 349)
  Training with 332 total raw features (7 IMU).
  DataFrame columns have been reordered for the model.

▶ Training and Evaluating Model: End_To_End_CNN

=== Fold 1/5 for End_To_End_CNN ===


I0000 00:00:1755982007.169106 4113464 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4714 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1060, pci bus id: 0000:01:00.0, compute capability: 6.1
2025-08-23 21:46:47.185972: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 277243904 exceeds 10% of free system memory.
2025-08-23 21:46:48.677978: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 277243904 exceeds 10% of free system memory.


LR Scheduler: 102 steps per epoch, 15300 total decay steps.
Epoch 1/150


  self._warn_if_super_not_called()
I0000 00:00:1755982033.014432 4113620 cuda_dnn.cc:529] Loaded cuDNN version 90300
2025-08-23 21:47:14.138921: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 5.17GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


[1m 34/102[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m10s[0m 149ms/step - loss: 4.1007 - main_output_accuracy: 0.0589 - main_output_loss: 3.4868 - tof_gate_loss: 1.0795

2025-08-23 21:47:24.504699: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.85GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step - loss: 3.8539 - main_output_accuracy: 0.0900 - main_output_loss: 3.2815 - tof_gate_loss: 0.8719

2025-08-23 21:47:36.792014: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 277243904 exceeds 10% of free system memory.


[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 205ms/step - loss: 3.8513 - main_output_accuracy: 0.0904 - main_output_loss: 3.2794 - tof_gate_loss: 0.8695 - val_loss: 3.0205 - val_main_output_accuracy: 0.2551 - val_main_output_loss: 2.5546 - val_tof_gate_loss: 0.3690
Epoch 2/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 159ms/step - loss: 3.1322 - main_output_accuracy: 0.2071 - main_output_loss: 2.6931 - tof_gate_loss: 0.2318 - val_loss: 2.6234 - val_main_output_accuracy: 0.3096 - val_main_output_loss: 2.2088 - val_tof_gate_loss: 0.1430
Epoch 3/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 150ms/step - loss: 2.8625 - main_output_accuracy: 0.2739 - main_output_loss: 2.4557 - tof_gate_loss: 0.1091 - val_loss: 2.3845 - val_main_output_accuracy: 0.3691 - val_main_output_loss: 1.9894 - val_tof_gate_loss: 0.0836
Epoch 4/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 162ms/step - loss: 2.6621 - main_

2025-08-23 22:03:10.731939: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 277073920 exceeds 10% of free system memory.
2025-08-23 22:03:12.539651: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 277073920 exceeds 10% of free system memory.


LR Scheduler: 102 steps per epoch, 15300 total decay steps.
Epoch 1/150


  self._warn_if_super_not_called()


[1m 12/102[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m12s[0m 140ms/step - loss: 4.0874 - main_output_accuracy: 0.0686 - main_output_loss: 3.4652 - tof_gate_loss: 1.1210

2025-08-23 22:03:39.447708: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.89GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.


[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 225ms/step - loss: 3.8109 - main_output_accuracy: 0.1027 - main_output_loss: 3.2438 - tof_gate_loss: 0.8491 - val_loss: 3.0040 - val_main_output_accuracy: 0.2479 - val_main_output_loss: 2.5380 - val_tof_gate_loss: 0.3553
Epoch 2/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 172ms/step - loss: 3.1416 - main_output_accuracy: 0.2034 - main_output_loss: 2.7032 - tof_gate_loss: 0.2289 - val_loss: 2.5581 - val_main_output_accuracy: 0.3294 - val_main_output_loss: 2.1456 - val_tof_gate_loss: 0.1236
Epoch 3/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 164ms/step - loss: 2.8722 - main_output_accuracy: 0.2601 - main_output_loss: 2.4651 - tof_gate_loss: 0.1116 - val_loss: 2.3364 - val_main_output_accuracy: 0.3840 - val_main_output_loss: 1.9419 - val_tof_gate_loss: 0.0642
Epoch 4/150
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 162ms/step - loss: 2.7173 - main_

In [None]:
import torch.nn as nn
import torch
import math

device = 'cuda'

class ImuFeatureExtractorPT(nn.Module):
    def __init__(self,fs=100.,**kwargs):
        super().__init__()
        self.fs=fs
        k=15
        self.lpf=nn.Conv1d(6,6,k,padding=k//2,groups=6,bias=False)
        nn.init.kaiming_uniform_(self.lpf.weight,a=math.sqrt(5))
        self.lpf_acc=nn.Conv1d(3,3,k,padding=k//2,groups=3,bias=False)
        self.lpf_gyro=nn.Conv1d(3,3,k,padding=k//2,groups=3,bias=False)

        def forward(self,imu):
            acc,gyro=imu[:,:3,:],imu[:,3:6,:]
            acc_mag,gyro_mag=torch.norm(acc,dim=1,keepdim=True),torch.norm(gyro,dim=1,keepdim=True)
            jerk,gyro_delta=F.pad(acc[:,:,1:]-acc[:,:,:-1],(1,0)),F.pad(gyro[:,:,1:]-gyro[:,:,:-1],(1,0))
            acc_pow,gyro_pow=acc**2,gyro**2
            acc_lpf,acc_hpf=self.lpf_acc(acc),acc-self.lpf_acc(acc)
            gyro_lpf,gyro_hpf=self.lpf_gyro(gyro),gyro-self.lpf_gyro(gyro)
            return torch.cat([acc,gyro,acc_mag,gyro_mag,jerk,gyro_delta,acc_pow,gyro_pow,acc_lpf,acc_hpf,gyro_lpf,gyro_hpf],dim=1)
        
class SEBlockPT(nn.Module):
    def __init__(self,c,r=8):
        super().__init__()
        self.squeeze=nn.AdaptiveAvgPool1d(1)
        self.excitation=nn.Sequential(nn.Linear(c,c//r,bias=False),nn.ReLU(inplace=True),nn.Linear(c//r,c,bias=False),nn.Sigmoid())
        def forward(self,x):b,c,_=x.size()
        y=self.squeeze(x).view(b,c)
        y=self.excitation(y).view(b,c,1)
        return x*y.expand_as(x)
        
class ResidualSECNNBlockPT(nn.Module):
    def __init__(self,i,o,k,p=2,d=0.3):
        super().__init__()
        self.c1,self.b1=nn.Conv1d(i,o,k,padding=k//2,bias=False),nn.BatchNorm1d(o)
        self.c2,self.b2=nn.Conv1d(o,o,k,padding=k//2,bias=False),nn.BatchNorm1d(o)
        self.se=SEBlockPT(o)
        self.s=nn.Sequential(nn.Conv1d(i,o,1,bias=False),nn.BatchNorm1d(o))if i!=o else nn.Identity()
        self.p,self.d=nn.MaxPool1d(p),nn.Dropout(d)
        def forward(self,x):o=F.relu(self.b1(self.c1(x)))
        o=self.b2(self.c2(o))
        o=self.se(o)+self.s(x)
        return self.d(self.p(F.relu(o)))
    
class PublicTwoBranchModel(nn.Module):
    def __init__(self,p,i,t,n,d=[0.3]*4+[0.4,0.5,0.3],f=True,**kwargs):
        super().__init__()
        imu_dim=32 if f else i
        self.imu_fe=ImuFeatureExtractorPT(**kwargs)if f else nn.Identity()
        self.fn=7
        fk=torch.tensor(firwin(33,1.0,fs=10.0,pass_zero=False),dtype=torch.float32).view(1,1,-1).repeat(self.fn,1,1)
        self.register_buffer("fk",fk)
        self.ib1,self.ib2=ResidualSECNNBlockPT(imu_dim,64,3,d=d[0]),ResidualSECNNBlockPT(64,128,5,d=d[1])
        self.tc1,self.tb1,self.tp1,self.td1=nn.Conv1d(t,64,3,padding=1,bias=False),nn.BatchNorm1d(64),nn.MaxPool1d(2),nn.Dropout(d[2])
        self.tc2,self.tb2,self.tp2,self.td2=nn.Conv1d(64,128,3,padding=1,bias=False),nn.BatchNorm1d(128),nn.MaxPool1d(2),nn.Dropout(d[3])
        self.bilstm,self.ld=nn.LSTM(256,128,bidirectional=True,batch_first=True),nn.Dropout(d[4])
        self.att=type('A',(nn.Module,),{'__init__':lambda s,h:super(type(s),s).__init__()or setattr(s,'a',nn.Linear(h,1)),'forward':lambda s,x:torch.sum(x*F.softmax(torch.tanh(s.a(x)).squeeze(-1),dim=1).unsqueeze(-1),dim=1)})(256)
        self.d1,self.bnd1,self.dr1=nn.Linear(256,256,bias=False),nn.BatchNorm1d(256),nn.Dropout(d[5])
        self.d2,self.bnd2,self.dr2=nn.Linear(256,128,bias=False),nn.BatchNorm1d(128),nn.Dropout(d[6])
        self.clf=nn.Linear(128,n)
        self.m=torch.tensor([0,0,0,0,0,0,9e-3,1.08, -2.6e-3,3.7e-3,-5.3e-3,-2.8e-3,1.3e-3,-1.5e-4,0.63,0.62,0.60,0.62,0.63,0.65,7.4e-3,-3.4e-3,-7.5e-3,-2.6e-2,2.9e-2,-3.1e-2,-2e-3,-4.7e-3,-4.7e-3,-2.6e-2,1.5e-2,1e-2],dtype=torch.float32).view(1,-1,1).to(device)
        self.s=torch.tensor([1,1,1,1,1,1,0.2,0.85,0.31,0.26,0.29,0.23,0.3,0.32,1.02,0.88,0.86,1.09,1.02,0.9,0.46,0.2,0.2,1.22,0.95,0.66,0.29,0.34,0.81,0.65,1.1,1.55],dtype=torch.float32).view(1,-1,1).to(device)+1e-8

    def forward(self,x):
        ir,t=x[:,:,:self.fn].transpose(1,2),x[:,:,self.fn:].transpose(1,2)
        ife=self.imu_fe(ir)
        fl=F.conv1d(ife[:,:self.fn,:],self.fk,padding=self.fk.shape[-1]//2,groups=self.fn)
        imu=(torch.cat([fl,ife[:,self.fn:,:]],dim=1)-self.m)/self.s
        x1=self.ib2(self.ib1(imu))
        x2=self.td2(self.tp2(F.relu(self.tb2(self.tc2(self.td1(self.tp1(F.relu(self.tb1(self.tc1(t))))))))))
        lo,_=self.bilstm(torch.cat([x1,x2],dim=1).transpose(1,2))
        a=self.att(self.ld(lo))
        x=self.dr1(F.relu(self.bnd1(self.d1(a))))
        x=self.dr2(F.relu(self.bnd2(self.d2(x))))
        return self.clf(x)
        
    def pad_sequences_torch(s,m,p='post',v=0.0):
        r=[]
        [r.append(i[:m]if len(i)>=m else np.concatenate([i,np.full((m-len(i),i.shape[1]),v,dtype=np.float32)]if p=='post'else[np.full((m-len(i),i.shape[1]),v,dtype=np.float32),i]))for i in s]
        return np.array(r,dtype=np.float32)

In [None]:
class SEBlockKeras(tf.keras.layers.Layer):
    def __init__(self,r=8,**kwargs):
        super().__init__(**kwargs)
        self.r=r

    def build(self,s):
        c=s[-1]
        self.gap=GlobalAveragePooling1D()
        self.d1=Dense(c//self.r,activation='relu')
        self.d2=Dense(c,activation='sigmoid')
        self.rs=Reshape((1,c))
        super().build(s)
    
    def call(self,i):return Multiply()([i,self.rs(self.d2(self.d1(self.gap(i))))])

    def get_config(self):
        c=super().get_config()
        c.update({'r':self.r})
        return c

class ResidualSECNNBlockKeras(tf.keras.layers.Layer):
    def __init__(self,f,k,p=2,d=0.3,wd=1e-4,**kwargs):
        super().__init__(**kwargs)
        self.f,self.k,self.p,self.d,self.wd=f,k,p,d,wd
        self.c1=Conv1D(f,k,padding='same',use_bias=False,kernel_regularizer=l2(wd))
        self.b1=BatchNormalization()
        self.c2=Conv1D(f,k,padding='same',use_bias=False,kernel_regularizer=l2(wd))
        self.b2=BatchNormalization()
        self.se=SEBlockKeras()
        self.pool=MaxPooling1D(p)
        self.drop=Dropout(d)

    def build(self,s):
        self.s_conv=Conv1D(self.f,1,padding='same',use_bias=False,kernel_regularizer=l2(self.wd))if s[-1]!=self.f else None
        super().build(s)

    def call(self,i):
        x=Activation('relu')(self.b1(self.c1(i)))
        x=self.b2(self.c2(x))
        s=self.s_conv(i)if self.s_conv else i
        x=self.se(x)
        x=add([x,s])
        return self.drop(self.pool(Activation('relu')(x)))
    def get_config(self):
        c=super().get_config()
        c.update({'f':self.f,'k':self.k,'p':self.p,'d':self.d,'wd':self.wd})
        return c
    
class TransformerEncoderKeras(tf.keras.layers.Layer):
    def __init__(self,hs,nh,ffd,d=0.0,**kwargs):
        super().__init__(**kwargs)
        self.hs,self.nh,self.ffd,self.d=hs,nh,ffd,d
        self.mha=MultiHeadAttention(key_dim=hs,num_heads=nh,dropout=d)
        self.d1=Dropout(d)
        self.ln1=LayerNormalization(epsilon=1e-6)
        self.d_ff1=Dense(ffd,activation="relu")
        self.d_ff2=Dense(ffd)
        self.d2=Dropout(d)
        self.ln2=LayerNormalization(epsilon=1e-6)

    def build(self,s):
        self.out_d=Dense(s[-1])
        super().build(s)
    
    def call(self,i):
        x=self.ln1(i)
        x=self.mha(x,x)
        x=self.d1(x)
        res=x+i
        x=self.ln2(res)
        x=self.d_ff1(x)
        x=self.d_ff2(x)
        x=self.d2(x)
        x=self.out_d(x)
        return x+res
    
    def get_config(self):
        c=super().get_config()
        c.update({'hs':self.hs,'nh':self.nh,'ffd':self.ffd,'d':self.d})
        return c

def build_transformer_model(pad_len,imu_dim,tof_dim,n_classes,wd=1e-4):
    inp=Input(shape=(pad_len,imu_dim+tof_dim))
    x=GaussianNoise(0.01)(inp)
    imu,tof=Lambda(lambda t:t[:,:,:imu_dim])(x),Lambda(lambda t:t[:,:,imu_dim:])(x)
    x1=ResidualSECNNBlockKeras(64,3,d=0.2,wd=wd)(imu)
    x1=ResidualSECNNBlockKeras(128,5,d=0.2,wd=wd)(x1)
    x2_base=ResidualSECNNBlockKeras(64,3,d=0.25,wd=wd)(tof)
    x2_base=ResidualSECNNBlockKeras(128,3,d=0.25,wd=wd)(x2_base)
    gate=Dense(1,'sigmoid',name='tof_gate')(GlobalAveragePooling1D()(tof))
    x2=Multiply()([x2_base,gate])
    x=Concatenate()([x1,x2])
    x=Bidirectional(LSTM(256,return_sequences=True,kernel_regularizer=l2(wd)))(x)
    x=TransformerEncoderKeras(128,4,x.shape[-1],d=0.3)(x)
    x=GlobalAveragePooling1D()(x)
    for u,d in[(512,0.5),(128,0.3)]:x=Dropout(d)(Activation('relu')(BatchNormalization()(Dense(u,use_bias=False,kernel_regularizer=l2(wd))(x))))
    return Model(inputs=inp,outputs=[Dense(n_classes,'softmax',name='main_output')(x),gate])

model = build_transformer_model(pad_len, imu_dim, tof_dim, len(le.classes_), wd=WD)

In [None]:
BATCH_SIZE = 64
PAD_PERCENTILE = 95      
LR_INIT = 5e-4 
WD = 3e-3      
MIXUP_ALPHA = 0.4       
EPOCHS = 200   
PATIENCE = 60  
N_SPLITS = 5   
MASKING_PROB = 0.25      
GATE_LOSS_WEIGHT = 0.2  

if TRAIN:
    print("▶ 学習モード開始")
    # (このセクションの大部分は変更ありません)
    df = pd.read_csv(RAW_DIR/"train.csv").merge(pd.read_csv(RAW_DIR/"train_demographics.csv"), on='subject')
    le = LabelEncoder()
    df['gesture_int'] = le.fit_transform(df['gesture'])
    
    print("  特徴量生成中...")
    all_sequences = []
    # (特徴量生成のループ... 変更なし)
    for _, seq_df in tqdm(df.groupby('sequence_id'), desc="特徴量生成"):
        s = seq_df.copy().fillna(method='ffill').fillna(method='bfill').fillna(0)
        s[['linear_acc_x','linear_acc_y','linear_acc_z']] = remove_gravity_from_acc(s)
        s[['angular_vel_x','angular_vel_y','angular_vel_z']] = calculate_angular_velocity_from_quat(s)
        s['linear_acc_mag'] = np.linalg.norm(s[['linear_acc_x','linear_acc_y','linear_acc_z']].values, axis=1)
        s['linear_acc_mag_jerk'] = s['linear_acc_mag'].diff().fillna(0)
        s['angular_distance'] = calculate_angular_distance(s)
        for i in range(1, 6):
            if f'tof_{i}_v0' in s.columns:
                pixel_cols = [f"tof_{i}_v{p}" for p in range(64)]
                tof_data = s[pixel_cols].replace(-1, np.nan)
                s[f"tof_{i}_mean"], s[f"tof_{i}_std"] = tof_data.mean(axis=1), tof_data.std(axis=1)
        all_sequences.append(s)
    
    df = pd.concat(all_sequences).fillna(0)
    
    imu_cols = [c for c in df.columns if any(s in c for s in ['acc_','rot_','angular_']) and 'uncalib' not in c]
    tof_cols = [c for c in df.columns if 'tof_' in c or 'thm_' in c]
    final_feature_cols = imu_cols + tof_cols
    imu_dim, tof_dim = len(imu_cols), len(tof_cols)
    print(f"  特徴量数: IMU={imu_dim}, TOF/THM={tof_dim}, Total={len(final_feature_cols)}")
    
    X_list, y_list, groups, lens = [], [], [], []
    for _, seq_df in df.groupby('sequence_id'):
        X_list.append(seq_df[final_feature_cols].values.astype('float32'))
        y_list.append(seq_df['gesture_int'].iloc[0])
        groups.append(seq_df['subject'].iloc[0])
        lens.append(len(seq_df))
    
    scaler = StandardScaler().fit(np.concatenate(X_list, axis=0))
    pad_len = int(np.percentile(lens, PAD_PERCENTILE))
    X = pad_sequences([scaler.transform(x) for x in X_list], maxlen=pad_len, padding='post', dtype='float32')
    y_cat, y_stratify = to_categorical(y_list), np.array(y_list)
    
    oof_A = np.zeros_like(y_cat)
    sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(sgkf.split(X, y_stratify, groups)):
        print(f"\n===== FOLD {fold+1}/{N_SPLITS} =====")
        X_tr, X_val, y_tr, y_val = X[train_idx], X[val_idx], y_cat[train_idx], y_cat[val_idx]
        model = build_transformer_model(pad_len, imu_dim, tof_dim, len(le.classes_), wd=WD)
        model.compile(optimizer=AdamTF(CosineDecay(LR_INIT, len(X_tr)//BATCH_SIZE*EPOCHS)),
                      loss={'main_output': 'categorical_crossentropy', 'tof_gate': 'binary_crossentropy'},
                      loss_weights={'main_output': 1.0, 'tof_gate': GATE_LOSS_WEIGHT},
                      metrics={'main_output': 'accuracy'})
        cw = dict(enumerate(compute_class_weight('balanced', classes=np.unique(y_list), y=y_tr.argmax(1))))
        train_gen = GatedMixupGenerator(X_tr, y_tr, BATCH_SIZE, imu_dim, w=cw, a=MIXUP_ALPHA, m=MASKING_PROB)
        val_gen = GatedMixupGenerator(X_val, y_val, BATCH_SIZE, imu_dim)
        cb = EarlyStopping(patience=PATIENCE, restore_best_weights=True, verbose=1, monitor='val_main_output_accuracy', mode='max')
        model.fit(train_gen, epochs=EPOCHS, validation_data=val_gen, callbacks=[cb], verbose=1)
        model.export(EXPORT_DIR / f"model_A_fold_{fold}")
        
        oof_A[val_idx], _ = model.predict(X_val, batch_size=BATCH_SIZE)

    if CompetitionMetric is not None:
        oof_score = CompetitionMetric().calculate_hierarchical_f1(pd.DataFrame({'gesture':le.classes_[y_cat.argmax(1)]}), pd.DataFrame({'gesture':le.classes_[oof_A.argmax(1)]}))
        print(f"\n\n=================================================\n  CVスコア (OOF H-F1 Score): {oof_score:.4f}\n=================================================\n")
    
    joblib.dump(scaler, EXPORT_DIR/"scaler.pkl")
    np.save(EXPORT_DIR/"final_feature_cols.npy", np.array(final_feature_cols))
    np.save(EXPORT_DIR/"gesture_classes.npy", le.classes_)
    np.save(EXPORT_DIR/"sequence_maxlen.npy", pad_len)
    print("✔ 学習と成果物の保存が完了しました。")

In [None]:
def predict_final(sequence:pl.DataFrame,demographics:pl.DataFrame)->str:
    import numpy as np_local
    from tensorflow.keras.utils import pad_sequences as pad_sequences_local
    from scipy.stats import rankdata as rankdata_local
    import torch as torch_local
    scaler_B=joblib.load(PUBLIC_TF_MODEL_DIR/"scaler.pkl")
    final_feature_cols_B=np_local.load(PUBLIC_TF_MODEL_DIR/"feature_cols.npy",allow_pickle=True).tolist()
    pad_len_B=int(np_local.load(PUBLIC_TF_MODEL_DIR/"sequence_maxlen.npy"))
    scaler_C=joblib.load(PUBLIC_PT_MODEL_DIR/"scaler.pkl")
    final_feature_cols_C=np_local.load(PUBLIC_PT_MODEL_DIR/"feature_cols.npy",allow_pickle=True).tolist()
    pad_len_C=int(np_local.load(PUBLIC_PT_MODEL_DIR/"sequence_maxlen.npy"))
    df=sequence.to_pandas().fillna(method='ffill').fillna(method='bfill').fillna(0)
    df_A=df.copy()
    df_A[['linear_acc_x','linear_acc_y','linear_acc_z']]=remove_gravity_from_acc(df_A)
    df_A[['angular_vel_x','angular_vel_y','angular_vel_z']]=calculate_angular_velocity_from_quat(df_A)
    df_A['linear_acc_mag']=np_local.linalg.norm(df_A[['linear_acc_x','linear_acc_y','linear_acc_z']].values,axis=1)
    df_A['linear_acc_mag_jerk']=df_A['linear_acc_mag'].diff().fillna(0)
    df_A['angular_distance']=calculate_angular_distance(df_A)

    for i in range(1,6):
        if f'tof_{i}_v0' in df_A.columns:
            p=[f"tof_{i}_v{p}"for p in range(64)]
            t=df_A[p].replace(-1,np_local.nan)
            df_A[f"tof_{i}_mean"],df_A[f"tof_{i}_std"]=t.mean(axis=1),t.std(axis=1)

    df_A=df_A.fillna(0)
    mat_A=scaler_A.transform(df_A[final_feature_cols_A].values.astype(np_local.float32))
    pad_A=pad_sequences_local([mat_A],maxlen=pad_len_A,padding='post',dtype='float32')
    pred_A=np_local.mean([m.predict(pad_A,verbose=0)[0]for m in models_A],axis=0)
    df_B=df.copy()
    df_B['acc_mag']=np_local.sqrt(df_B['acc_x']**2+df_B['acc_y']**2+df_B['acc_z']**2)
    df_B['rot_angle']=2*np_local.arccos(df_B['rot_w'].clip(-1,1))
    df_B['acc_mag_jerk']=df_B['acc_mag'].diff().fillna(0)
    df_B['rot_angle_vel']=df_B['rot_angle'].diff().fillna(0)
    df_B[['linear_acc_x','linear_acc_y','linear_acc_z']]=remove_gravity_from_acc(df_B)
    df_B['linear_acc_mag']=np_local.linalg.norm(df_B[['linear_acc_x','linear_acc_y','linear_acc_z']].values,axis=1)
    df_B['linear_acc_mag_jerk']=df_B['linear_acc_mag'].diff().fillna(0)
    df_B[['angular_vel_x','angular_vel_y','angular_vel_z']]=calculate_angular_velocity_from_quat(df_B)
    df_B['angular_distance']=calculate_angular_distance(df_B)
    df_B=df_B.fillna(0)
    mat_B=scaler_B.transform(df_B[final_feature_cols_B].values.astype(np_local.float32))
    pad_B=pad_sequences_local([mat_B],maxlen=pad_len_B,padding='post',dtype='float32')
    pred_B=model_B.predict(pad_B,verbose=0)[0]
    df_C=df.copy().fillna(0)
    mat_C=scaler_C.transform(df_C[final_feature_cols_C].values.astype(np_local.float32))
    pad_C=pad_sequences_torch([mat_C],maxlen=pad_len_C,padding='post')
    
    with torch.no_grad():pt_input=torch.from_numpy(pad_C).to(device)
        preds_C=[m(pt_input)for m in pt_models]
        pred_C=torch.mean(torch.stack(preds_C),dim=0).cpu().numpy()[0]
        final_rank=(rankdata_local(pred_A)+rankdata_local(pred_B)+rankdata_local(pred_C))/3.0
        return str(gesture_classes[np_local.argmax(final_rank)])