In [1]:
import os
import polars as pl
import numpy as np
import gc
import polars as pl
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from scipy.spatial.transform import Rotation as R
from pathlib import Path
from pathlib import Path

def remove_gravity_polars(acc_df: pl.DataFrame, rot_df: pl.DataFrame) -> np.ndarray:
    """Removes the gravity component from accelerometer data using quaternion rotations."""
    acc_values = acc_df.select(['acc_x', 'acc_y', 'acc_z']).to_numpy()
    quat_values = rot_df.select(['rot_x', 'rot_y', 'rot_z', 'rot_w']).to_numpy()
    num_samples = acc_values.shape[0]
    linear_accel = np.zeros_like(acc_values)
    gravity_world = np.array([0, 0, 9.81])
    for i in range(num_samples):
        if np.all(np.isnan(quat_values[i])) or np.all(np.isclose(quat_values[i], 0)):
            linear_accel[i] = acc_values[i]
            continue
        try:
            rotation = R.from_quat(quat_values[i])
            gravity_sensor_frame = rotation.apply(gravity_world, inverse=True)
            linear_accel[i] = acc_values[i] - gravity_sensor_frame
        except ValueError:
            linear_accel[i] = acc_values[i]
    return linear_accel

def calculate_angular_velocity(rot_df: pl.DataFrame, sampling_rate_hz: int) -> np.ndarray:
    """Calculates angular velocity from quaternion data."""
    quats = rot_df.select(['rot_x', 'rot_y', 'rot_z', 'rot_w']).to_numpy()
    angular_velocity = np.zeros_like(quats[:, :3])
    dt = 1.0 / sampling_rate_hz
    for i in range(1, len(quats)):
        try:
            q1 = R.from_quat(quats[i - 1])
            q2 = R.from_quat(quats[i])
            q_delta = q2 * q1.inv()
            rot_vec = q_delta.as_rotvec()
            angular_velocity[i] = rot_vec / dt
        except ValueError:
            angular_velocity[i] = 0
    return angular_velocity

def calculate_angular_acceleration(angular_velocity: np.ndarray, sampling_rate_hz: int) -> np.ndarray:
    """Calculates angular acceleration from angular velocity."""
    angular_accel = np.zeros_like(angular_velocity)
    dt = 1.0 / sampling_rate_hz
    angular_accel[1:] = np.diff(angular_velocity, axis=0) / dt
    return angular_accel

def calculate_gravity_orientation(rot_df: pl.DataFrame) -> np.ndarray:
    """Calculates the orientation of each sensor axis with respect to the world gravity vector."""
    quat_values = rot_df.select(['rot_x', 'rot_y', 'rot_z', 'rot_w']).to_numpy()
    num_samples = quat_values.shape[0]
    orientation_angles = np.zeros((num_samples, 3))
    gravity_world = np.array([0, 0, 1.0])
    for i in range(num_samples):
        if np.all(np.isnan(quat_values[i])) or np.all(np.isclose(quat_values[i], 0)):
            continue
        try:
            rotation = R.from_quat(quat_values[i])
            sensor_axes_world = rotation.apply(np.eye(3))
            for j in range(3):
                dot_product = np.dot(sensor_axes_world[j], gravity_world)
                orientation_angles[i, j] = np.arccos(np.clip(dot_product, -1.0, 1.0))
        except ValueError:
            continue
    return orientation_angles

def calculate_angular_distance(rot_df: pl.DataFrame) -> np.ndarray:
    quat_values = rot_df.select(['rot_x', 'rot_y', 'rot_z', 'rot_w']).to_numpy()
    angular_dist = np.zeros(len(quat_values))
    for i in range(len(quat_values) - 1):
        q1, q2 = quat_values[i], quat_values[i+1]
        if np.all(np.isnan(q1)) or np.all(np.isnan(q2)):
            continue
        try:
            r1, r2 = R.from_quat(q1), R.from_quat(q2)
            relative_rotation = r1.inv() * r2
            angular_dist[i] = np.linalg.norm(relative_rotation.as_rotvec())
        except ValueError:
            pass
    return angular_dist    

# =====================================================================================
# MAIN PROCESSING FUNCTION
# =====================================================================================
def add_all_imu_features_polars(df: pl.DataFrame, sampling_rate_hz: int) -> pl.DataFrame:
    """Main function to add all IMU features to the DataFrame."""
    df = df.sort(["sequence_id", "sequence_counter"])
    df = df.with_columns(
        (pl.col("acc_x")**2 + pl.col("acc_y")**2 + pl.col("acc_z")**2).sqrt().alias("acc_mag"),
    )
    df = df.with_columns(
        pl.col("acc_mag").diff().over("sequence_id").fill_null(0).alias("acc_mag_jerk"),
    )

    grouped = df.partition_by("sequence_id", maintain_order=True)
    all_feature_dfs = []
    for group in grouped:
        acc_df = group.select(["acc_x", "acc_y", "acc_z"])
        rot_df = group.select(["rot_x", "rot_y", "rot_z", "rot_w"])
        feature_df_group = pl.DataFrame({
            "sequence_counter": group["sequence_counter"],
            "sequence_id": group["sequence_id"]
        })
        linear_acc = remove_gravity_polars(acc_df, rot_df)
        feature_df_group = feature_df_group.with_columns(
            pl.DataFrame(linear_acc, schema=["linear_acc_x", "linear_acc_y", "linear_acc_z"])
        )
        angular_vel = calculate_angular_velocity(rot_df, sampling_rate_hz)
        feature_df_group = feature_df_group.with_columns(
            pl.DataFrame(angular_vel, schema=["angular_vel_x", "angular_vel_y", "angular_vel_z"])
        )
        angular_accel = calculate_angular_acceleration(angular_vel, sampling_rate_hz)
        feature_df_group = feature_df_group.with_columns(
            pl.DataFrame(angular_accel, schema=["angular_accel_x", "angular_accel_y", "angular_accel_z"])
        )
        angular_dist = calculate_angular_distance(rot_df)
        feature_df_group = feature_df_group.with_columns(pl.DataFrame(angular_dist, schema=["angular_distance"]))
        
        gravity_orientation = calculate_gravity_orientation(rot_df)
        feature_df_group = feature_df_group.with_columns(
            pl.DataFrame(gravity_orientation, schema=["grav_orient_x", "grav_orient_y", "grav_orient_z"])
        )
        all_feature_dfs.append(feature_df_group)

    if all_feature_dfs:
        features_to_add = pl.concat(all_feature_dfs)
        df = df.join(features_to_add, on=["sequence_id", "sequence_counter"], how="left")

    df = df.with_columns([
        (pl.col("linear_acc_x")**2 + pl.col("linear_acc_y")**2 + pl.col("linear_acc_z")**2).sqrt().alias("linear_acc_mag"),
        (pl.col("angular_vel_x")**2 + pl.col("angular_vel_y")**2 + pl.col("angular_vel_z")**2).sqrt().alias("angular_vel_mag"),
        (pl.col("angular_accel_x")**2 + pl.col("angular_accel_y")**2 + pl.col("angular_accel_z")**2).sqrt().alias("angular_accel_mag"),
    ])
    df = df.with_columns([
        pl.col("linear_acc_mag").diff().over("sequence_id").fill_null(0).alias("linear_acc_mag_jerk"),
        pl.col("angular_vel_mag").diff().over("sequence_id").fill_null(0).alias("angular_vel_mag_jerk"),
        pl.col("angular_accel_mag").diff().over("sequence_id").fill_null(0).alias("angular_accel_mag_jerk"),
    ])
    return df

2025-09-07 12:22:46.240411: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757244166.262266 2196047 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757244166.268879 2196047 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1757244166.291738 2196047 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1757244166.291772 2196047 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1757244166.291774 2196047 computation_placer.cc:177] computation placer alr

In [2]:
def process_tof_features(df: pl.DataFrame) -> pl.DataFrame:
    tof_aggregated_cols_template = []
    for i in range(1, 6): 
        tof_aggregated_cols_template.extend([
            f'tof_{i}_mean', f'tof_{i}_std', f'tof_{i}_min', f'tof_{i}_max',
            f'tof_{i}_diff_mean', f'tof_{i}_mean_decay',
            f'tof_{i}_active_pixels', f'tof_{i}_centroid_x', f'tof_{i}_centroid_y'
        ])

    final_feature_cols = tof_aggregated_cols_template
    
    metadata_cols = ['sequence_id', 'sequence_counter', 'subject', 'gesture']

    print(f"  Total {len(final_feature_cols)} ToF statistical features will be engineered.")

    decay_weights = np.power(0.9, np.arange(64))
    x_coords, y_coords = np.meshgrid(np.arange(8), np.arange(8))

    print("  Building and executing feature engineering expressions...")
    feature_expressions = []
    for i in range(1, 6):
        pixel_cols = [f"tof_{i}_v{p}" for p in range(64)]
        list_expr = pl.concat_list([pl.when(pl.col(c) == -1).then(None).otherwise(pl.col(c)) for c in pixel_cols]).alias(f"tof_{i}_list")
        feature_expressions.extend([
            list_expr.list.mean().alias(f'tof_{i}_mean'),
            list_expr.list.std().alias(f'tof_{i}_std'),
            list_expr.list.min().alias(f'tof_{i}_min'),
            list_expr.list.max().alias(f'tof_{i}_max'),
            list_expr.list.diff().list.mean().alias(f'tof_{i}_diff_mean'),
            list_expr.list.drop_nulls().list.len().alias(f'tof_{i}_active_pixels'),
        ])
        tof_data_exprs = [pl.when(pl.col(c) == -1).then(None).otherwise(pl.col(c)) for c in pixel_cols]
        feature_expressions.append(pl.sum_horizontal([(expr * weight).fill_null(0) for expr, weight in zip(tof_data_exprs, decay_weights)]).alias(f'tof_{i}_mean_decay'))
        weights_exprs = [(1 / (expr + 1e-6)).fill_null(0) for expr in tof_data_exprs]
        total_weight_expr = pl.sum_horizontal(weights_exprs)
        centroid_x_expr = pl.when(total_weight_expr > 1e-9).then(pl.sum_horizontal([(w * c) for w, c in zip(weights_exprs, x_coords.ravel())]) / total_weight_expr).otherwise(None)
        centroid_y_expr = pl.when(total_weight_expr > 1e-9).then(pl.sum_horizontal([(w * c) for w, c in zip(weights_exprs, y_coords.ravel())]) / total_weight_expr).otherwise(None)
        feature_expressions.extend([centroid_x_expr.alias(f'tof_{i}_centroid_x'), centroid_y_expr.alias(f'tof_{i}_centroid_y')])

    base_feats = [f'thm_{i}' for i in range (1,6)] + ['acc_x', 'acc_y', 'acc_z', 'rot_w', 'rot_x', 'rot_y', 'rot_z']
    df_featured = df.with_columns(feature_expressions)
    
    float_cols = [c for c in final_feature_cols if c.endswith(('_mean', '_std', '_min', '_max', '_diff_mean', '_mean_decay', '_centroid_x', '_centroid_y'))]
    int_cols = [c for c in final_feature_cols if c.endswith('_active_pixels')]

    float_imputation = pl.col(float_cols).replace([np.inf, -np.inf], None).fill_nan(None).forward_fill().backward_fill().fill_null(0).over("sequence_id")
    int_imputation = pl.col(int_cols).forward_fill().backward_fill().fill_null(0).over("sequence_id")

    final_df_imputed = df_featured.with_columns(float_imputation, int_imputation)

    final_df = final_df_imputed.select(metadata_cols + base_feats + final_feature_cols)
    return final_df

In [3]:
RAW_DIR = Path("input/cmi-detect-behavior-with-sensor-data")
OUTPUT_DIR = Path("output")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
CLEAN_DATA_FILE = OUTPUT_DIR / "cleaned_base_train_data.parquet"
FILTER_PROBLEM_SUBJECTS = True


print("▶ Starting Raw Data Cleaning Script...")

# --- Step 1: Load and Merge Raw Data ---
df = pl.read_csv(RAW_DIR / "train.csv")
# df = df[:10_000]
demographics_df = pl.read_csv(RAW_DIR / "train_demographics.csv")
df = df.join(demographics_df, on='subject', how='left')
print(f"  Initial merged shape: {df.shape}")

# --- Step 2: Filtering ---
df = df.filter(pl.col("sequence_id") != "SEQ_011975")
print(f"  Shape after removing SEQ_011975: {df.shape}")

# Define all 320 raw ToF columns
raw_tof_cols = [f'tof_{i}_v{j}' for i in range(1, 6) for j in range(64)]

# # Calculate the ratio of valid ToF readings for each sequence
# df = df.with_columns(
#     # Step 1: Count the number of valid ToF readings in each ROW.
#     # A reading is valid if it's not null AND not -1.
#     pl.sum_horizontal(
#         pl.col(c).is_not_null() & (pl.col(c) != -1) for c in raw_tof_cols
#     ).alias("valid_tofs_per_row")
# ).with_columns(
#     # Step 2: Calculate the ratio for the entire SEQUENCE.
#     # Sum the valid counts per row and divide by the total possible readings.
#     (
#         pl.col("valid_tofs_per_row").sum().over("sequence_id") / (pl.len().over("sequence_id") * len(raw_tof_cols))
#     ).alias("valid_tof_ratio")
# ).filter(
#     # Step 3: Apply the filter
#     pl.col("valid_tof_ratio") >= 0.2
# ).drop("valid_tofs_per_row", "valid_tof_ratio") # Clean up temporary columns

# print(f"  Shape after valid ToF data ratio filter (>= 0.2): {df.shape}")

if FILTER_PROBLEM_SUBJECTS:
    problem_subjects = ["SUBJ_045235", "SUBJ_019262"]
    df = df.filter(~pl.col("subject").is_in(problem_subjects))
    print(f"  Shape after removing problem subjects: {df.shape}")

# --- Step 3: Value Transformation ---

df = df.with_columns(
    [pl.when(pl.col(c) == -1).then(500).otherwise(pl.col(c)).alias(c) for c in raw_tof_cols]
)
print("  Replaced -1 TOF values with 500.")

# --- Step 4: Ultimate NaN Filling ---
print("\n  Performing final imputation sweep...")
cols_to_impute = [c for c in df.columns if c not in ['row_id', 'sequence_id', 'subject', 'gesture', 'behavior', 'orientation']]

df = df.with_columns(
    pl.col(cols_to_impute)
        .forward_fill()
        .backward_fill()
        .fill_null(0)
        .over("sequence_id")
)
print("  Final imputation complete.")

df = process_tof_features(df)
df = add_all_imu_features_polars(df, 200)

▶ Starting Raw Data Cleaning Script...
  Initial merged shape: (574945, 348)
  Shape after removing SEQ_011975: (574861, 348)
  Shape after removing problem subjects: (562604, 348)
  Replaced -1 TOF values with 500.

  Performing final imputation sweep...
  Final imputation complete.
  Total 45 ToF statistical features will be engineered.
  Building and executing feature engineering expressions...


In [4]:
from tensorflow.keras import backend as k
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.regularizers import l2
from tensorflow.keras import backend as k
from tensorflow import argmax, minimum, shape
from tensorflow.data import AUTOTUNE, Dataset
from tensorflow.keras import Layer, Sequential
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.utils import pad_sequences, Sequence, to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import (
    Dense, Input, Conv1D, MaxPooling1D, GlobalAveragePooling1D, Concatenate,
    BatchNormalization, GRU, Dropout, add, Activation, Multiply, Reshape,
    LayerNormalization, Add, Bidirectional, LSTM, UpSampling1D, Lambda, GaussianNoise,
    Input, GlobalMaxPooling1D
)

def generate_gate_targets(df: pl.DataFrame, tof_cols: list) -> pl.DataFrame:
    gate_df = df.group_by("sequence_id").agg(
        pl.any_horizontal(pl.col(tof_cols).is_not_null().any()).alias("has_tof")
    )
    return gate_df.with_columns(pl.col("has_tof").cast(pl.Float32))

def create_sequence_dataset(df: pl.DataFrame, feature_cols: list, gate_df: pl.DataFrame):
    sequences = []
    labels = []
    gate_targets = [] 

    df_with_gate = df.join(gate_df, on='sequence_id', how='left')

    for seq_id, group in df_with_gate.group_by('sequence_id', maintain_order=True):
        sequences.append(group.select(feature_cols).to_numpy())
        labels.append(group.select('gesture_int').item(0, 0))
        gate_targets.append(group.select('has_tof').item(0, 0))

    return np.array(sequences, dtype=object), np.array(labels), np.array(gate_targets)    

class GatedMixupGenerator(Sequence):
    def __init__(self, X, y, gate_targets, batch_size, imu_dim, class_weight=None, alpha=0.2, masking_prob=0.0):
        self.X, self.y = X, y
        self.gate_targets = gate_targets  
        self.batch = batch_size
        self.imu_dim = imu_dim
        self.class_weight = class_weight
        self.alpha = alpha
        self.masking_prob = masking_prob
        self.indices = np.arange(len(X))
        
    def __len__(self):
        return int(np.ceil(len(self.X) / self.batch))

    def __getitem__(self, i):
        idx = self.indices[i*self.batch:(i+1)*self.batch]
        Xb, yb = self.X[idx].copy(), self.y[idx].copy()
        
        gate_target = self.gate_targets[idx].copy()

        if self.masking_prob > 0:
            for i in range(len(Xb)):
                # If the gate is 1.0 (has ToF) AND we hit the random chance...
                if gate_target[i] == 1.0 and np.random.rand() < self.masking_prob:
                    Xb[i, :, self.imu_dim:] = 0  # Zero out the ToF features
                    gate_target[i] = 0.0         # Set the gate to 0 for this augmented sample

        # The rest of the logic (class weights, mixup) can remain the same
        sample_weights = np.ones(len(Xb), dtype='float32')
        if self.class_weight:
            y_integers = yb.argmax(axis=1)
            sample_weights = np.array([self.class_weight[i] for i in y_integers])

        if self.alpha > 0:
            lam = np.random.beta(self.alpha, self.alpha)
            perm = np.random.permutation(len(Xb))
            X_mix = lam * Xb + (1 - lam) * Xb[perm]
            y_mix = lam * yb + (1 - lam) * yb[perm]
            gate_target_mix = lam * gate_target + (1 - lam) * gate_target[perm]
            sample_weights_mix = lam * sample_weights + (1 - lam) * sample_weights[perm]
            return X_mix, {'main_output': y_mix, 'tof_gate': gate_target_mix[:, np.newaxis]}, sample_weights_mix

        return Xb, {'main_output': yb, 'tof_gate': gate_target[:, np.newaxis]}, sample_weights        

In [5]:
# cv_info = df.group_by("sequence_id").agg(pl.first("gesture_int")).sort("sequence_id")
# all_sequence_ids = cv_info.get_column("sequence_id").to_numpy()
# tr_ix, val_ix = train_test_split(all_sequence_ids, test_size=0.2)

In [6]:
def crop_or_pad(inputs):
    x, skip = inputs
    x_len = shape(x)[1]
    skip_len = shape(skip)[1]
    min_len = minimum(x_len, skip_len)
    return x[:, :min_len, :], skip[:, :min_len, :]

def crop_or_pad_output_shape(input_shapes):
    shape1, shape2 = input_shapes
    min_time_steps = min(shape1[1], shape2[1])
    num_features = shape1[2]
    output_shape = (None, min_time_steps, num_features)
    return [output_shape, output_shape]

def match_time_steps(x, skip):    
    x, skip = Lambda(
        crop_or_pad, 
        output_shape=crop_or_pad_output_shape 
    )([x, skip])
    return x, skip

def se_block(x, reduction=8):
    ch = x.shape[-1]
    se = GlobalAveragePooling1D()(x)
    se = Dense(ch // reduction, activation='relu')(se)
    se = Dense(ch, activation='sigmoid')(se)
    se = Reshape((1, ch))(se)
    return Multiply()([x, se])

def residual_se_cnn_block(x, filters, kernel_size, pool_size=2, drop=0.3, wd=1e-4):
    """
    Output: (B, T, # filters)
    """
    shortcut = x
    for _ in range(2):
        x = Conv1D(filters, kernel_size, padding='same', use_bias=False,
                   kernel_regularizer=l2(wd))(x)
        x = LayerNormalization()(x)
        x = Activation('relu')(x)
    x = se_block(x)
    if shortcut.shape[-1] != filters:
        shortcut = Conv1D(filters, 1, padding='same', use_bias=False,
                          kernel_regularizer=l2(wd))(shortcut)
        shortcut = LayerNormalization()(shortcut)
    x = add([x, shortcut])
    x = Activation('relu')(x)
    x = MaxPooling1D(pool_size)(x)
    x = Dropout(drop)(x)
    return x

def residual_se_cnn_block(x, filters, kernel_size, pool_size=2, drop=0.3, wd=1e-4):
    shortcut = x
    for _ in range(2):
        x = Conv1D(filters, kernel_size, padding='same', use_bias=False,
                   kernel_regularizer=l2(wd))(x)
        x = BatchNormalization()(x)
        x = Activation('relu')(x)
    x = se_block(x)
    if shortcut.shape[-1] != filters:
        shortcut = Conv1D(filters, 1, padding='same', use_bias=False,
                          kernel_regularizer=l2(wd))(shortcut)
        shortcut = BatchNormalization()(shortcut)
    x = add([x, shortcut])
    x = Activation('relu')(x)
    x = MaxPooling1D(pool_size)(x)
    x = Dropout(drop)(x)
    return x

def res_se_cnn_decoder_block(x, filters, kernel_size, drop=0.3, wd=1e-4, skip_connection=None):
    x = UpSampling1D(size=2)(x)
    x = Conv1D(filters, kernel_size, padding='same', use_bias=False,
               kernel_regularizer=l2(wd))(x)
    x = LayerNormalization()(x)
    x = Activation('relu')(x)

    if skip_connection is not None:
        x, skip_connection = match_time_steps(x, skip_connection)
        x = Concatenate()([x, skip_connection])

    x = Conv1D(filters, kernel_size, padding='same', use_bias=False,
               kernel_regularizer=l2(wd))(x)
    x = LayerNormalization()(x)
    x = Activation('relu')(x)

    x = se_block(x)
    x = Dropout(drop)(x)
    return x

def unet_se_cnn(x, unet_depth=3, base_filters=64, kernel_size=3, drop=0.3):
    filters = base_filters
    skips = []
    
    # Encoder
    for _ in range(unet_depth):
        x = residual_se_cnn_block(x, filters, kernel_size, drop=drop)
        skips.append(x)
        filters *= 2
    
    # Bottleneck
    c_shape = x.shape[-1]
    x = Dense(128)(x)
    x = Dense(c_shape)(x)
    
    # Decoder 
    for skip in reversed(skips):
        filters //= 2
        x = res_se_cnn_decoder_block(x, filters, kernel_size, drop=drop, skip_connection=skip)
    
    return x
        
class GatedMixupGenerator(Sequence):
    def __init__(self, X, y, gate_targets, batch_size, imu_dim, class_weight=None, alpha=0.2, masking_prob=0.0):
        self.X, self.y = X, y
        self.gate_targets = gate_targets  
        self.batch = batch_size
        self.imu_dim = imu_dim
        self.class_weight = class_weight
        self.alpha = alpha
        self.masking_prob = masking_prob
        self.indices = np.arange(len(X))
        
    def __len__(self):
        return int(np.ceil(len(self.X) / self.batch))

    def __getitem__(self, i):
        idx = self.indices[i*self.batch:(i+1)*self.batch]
        Xb, yb = self.X[idx].copy(), self.y[idx].copy()
        
        gate_target = self.gate_targets[idx].copy()

        if self.masking_prob > 0:
            for i in range(len(Xb)):
                # If the gate is 1.0 (has ToF) AND we hit the random chance...
                if gate_target[i] == 1.0 and np.random.rand() < self.masking_prob:
                    Xb[i, :, self.imu_dim:] = 0  # Zero out the ToF features
                    gate_target[i] = 0.0         # Set the gate to 0 for this augmented sample

        # The rest of the logic (class weights, mixup) can remain the same
        sample_weights = np.ones(len(Xb), dtype='float32')
        if self.class_weight:
            y_integers = yb.argmax(axis=1)
            sample_weights = np.array([self.class_weight[i] for i in y_integers])

        if self.alpha > 0:
            lam = np.random.beta(self.alpha, self.alpha)
            perm = np.random.permutation(len(Xb))
            X_mix = lam * Xb + (1 - lam) * Xb[perm]
            y_mix = lam * yb + (1 - lam) * yb[perm]
            gate_target_mix = lam * gate_target + (1 - lam) * gate_target[perm]
            sample_weights_mix = lam * sample_weights + (1 - lam) * sample_weights[perm]
            return X_mix, {'main_output': y_mix, 'tof_gate': gate_target_mix[:, np.newaxis]}, sample_weights_mix

        return Xb, {'main_output': yb, 'tof_gate': gate_target[:, np.newaxis]}, sample_weights    

def on_epoch_end(self):
    np.random.shuffle(self.indices)    

def time_sum(x):
    return k.sum(x, axis=1)

def squeeze_last_axis(x):
    return tf.squeeze(x, axis=-1)

def expand_last_axis(x):
    return tf.expand_dims(x, axis=-1)

def tof_block(tof_inputs, wd=1e-4):
    x2_base = Conv1D(64, 3, padding='same', use_bias=False, kernel_regularizer=l2(wd))(tof_inputs)
    x2_base = BatchNormalization()(x2_base); x2_base = Activation('relu')(x2_base)
    x2_base = MaxPooling1D(2)(x2_base); x2_base = Dropout(0.2)(x2_base)
    x2_base = Conv1D(128, 3, padding='same', use_bias=False, kernel_regularizer=l2(wd))(x2_base)
    x2_base = BatchNormalization()(x2_base); x2_base = Activation('relu')(x2_base)

    gate_input = GlobalAveragePooling1D()(tof_inputs)
    gate_input = Dense(16, activation='relu')(gate_input)

    gate = Dense(1, activation='sigmoid', name='tof_gate_dense')(gate_input)
    return Multiply()([x2_base, gate])

def attention_layer(inputs):
    score = Dense(1, activation='tanh')(inputs)
    score = Lambda(squeeze_last_axis)(score)
    weights = Activation('softmax')(score)
    weights = Lambda(expand_last_axis)(weights)
    context = Multiply()([inputs, weights])
    context = Lambda(time_sum)(context)
    return context    

def features_processing(x1, x2, wd=1e-4):
    merged = Concatenate()([x1, x2])
    xa = Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(wd)))(merged)
    xb = Bidirectional(GRU(128, return_sequences=True, kernel_regularizer=l2(wd)))(merged)
    xc = GaussianNoise(0.09)(merged)
    xc = Dense(16, activation='elu')(xc)
    
    x = Concatenate()([xa, xb, xc])
    x = Dropout(0.4)(x)
    x = attention_layer(x)

    for units, drop in [(256, 0.5), (128, 0.3)]:
        x = Dense(units, use_bias=False, kernel_regularizer=l2(wd))(x)
        x = BatchNormalization()(x); x = Activation('relu')(x)
        x = Dropout(drop)(x)

    return x

In [7]:
LR_INIT = 5e-4
WD = 3e-3
NUM_CLASSES = 18
BATCH_SIZE = 64
N_SPLITS = 4 
MAX_PAD_LEN = 128
RANDOM_STATE = 42

meta_cols = ['row_id', 'sequence_id', 'sequence_counter', 'subject']

raw_tof_cols = [f'tof_{i}_v{j}' for i in range(1,6) for j in range(0, 64)]

imu_cols = [
    'acc_x', 'acc_y', 'acc_z', 'rot_w', 'rot_x', 'rot_y', 'rot_z', 
    'linear_acc_x', 'linear_acc_y', 'linear_acc_z', 'linear_acc_mag', 'linear_acc_mag_jerk',
    'angular_vel_x', 'angular_vel_y', 'angular_vel_z', 'angular_distance',
    'angular_accel_x', 'angular_accel_y', 'angular_accel_z'
]

tof_cols = [
    'thm_1', 'thm_2', 'thm_3', 'thm_4', 'thm_5',
    'tof_1_mean', 'tof_1_std', 'tof_1_min', 'tof_1_max', 'tof_1_diff_mean', 'tof_1_mean_decay', 'tof_1_active_pixels', 'tof_1_centroid_x', 'tof_1_centroid_y',
    'tof_2_mean', 'tof_2_std', 'tof_2_min', 'tof_2_max', 'tof_2_diff_mean', 'tof_2_mean_decay', 'tof_2_active_pixels', 'tof_2_centroid_x', 'tof_2_centroid_y',
    'tof_3_mean', 'tof_3_std', 'tof_3_min', 'tof_3_max', 'tof_3_diff_mean', 'tof_3_mean_decay', 'tof_3_active_pixels', 'tof_3_centroid_x', 'tof_3_centroid_y',
    'tof_4_mean', 'tof_4_std', 'tof_4_min', 'tof_4_max', 'tof_4_diff_mean', 'tof_4_mean_decay', 'tof_4_active_pixels', 'tof_4_centroid_x', 'tof_4_centroid_y',
    'tof_5_mean', 'tof_5_std', 'tof_5_min', 'tof_5_max', 'tof_5_diff_mean', 'tof_5_mean_decay', 'tof_5_active_pixels', 'tof_5_centroid_x', 'tof_5_centroid_y'
]
feature_cols = imu_cols + tof_cols
imu_dim = len(imu_cols)

# =====================================================================================
# MODEL DEFINITION
# =====================================================================================
def create_model(dataset, imu_dim, wd=1e-4):
    sample_batch = next(iter(dataset))
    input_shape = sample_batch[0].shape[1:]
    inp = tf.keras.layers.Input(shape=input_shape)
    imu = tf.keras.layers.Lambda(lambda t: t[:, :, :imu_dim])(inp)
    tof = tf.keras.layers.Lambda(lambda t: t[:, :, imu_dim:])(inp)

    x1 = unet_se_cnn(imu, 3, base_filters=128, kernel_size=3)
    x2 = tof_block(tof, wd)

    x = features_processing(x1, x2)
    x = tf.keras.layers.Dropout(0.3)(x) 
    main_out = tf.keras.layers.Dense(18, activation="softmax", name="main_output")(x)
    gate_out = tf.keras.layers.Dense(1, activation="sigmoid", name="tof_gate")(x)
    
    return tf.keras.models.Model(inputs=inp, outputs={"main_output": main_out, "tof_gate": gate_out})

print("\n▶ PHASE: Training on train_val split...")

print("\n  Engineering all features from the clean base data...")
imu_dim = len(imu_cols)

print("  Encoding labels...")
le = LabelEncoder()
gesture_int_col = le.fit_transform(df['gesture'].to_pandas())
df = df.with_columns(pl.Series(name="gesture_int", values=gesture_int_col))

cv_info = df.group_by("sequence_id").agg(pl.first("gesture_int")).sort("sequence_id")
all_sequence_ids = cv_info.get_column("sequence_id").to_numpy()
train_ix, val_ix = train_test_split(all_sequence_ids, test_size=0.2)

train_df, val_df = df.filter(pl.col('sequence_id').is_in(train_ix)), df.filter(pl.col('sequence_id').is_in(val_ix))

print("  Fitting final scaler on all training data...")
scaler = StandardScaler()
scaler.fit(train_df[feature_cols].to_numpy())

print("  Preparing full dataset for training...")
# Apply scaling to the full dataset
train_scaled_features = scaler.transform(train_df[feature_cols])
val_scaled_features = scaler.transform(val_df[feature_cols])

X_train_scaled_features = pl.DataFrame(train_scaled_features, schema=feature_cols)
X_val_scaled_features = pl.DataFrame(val_scaled_features, schema=feature_cols)

# This print syntax is specific to your original code
print(X_train_scaled_features[:, :imu_dim].columns)
print(X_train_scaled_features[:, imu_dim:].columns)

# Add gesture_int and sequence_id for sequence creation
meta_cols_to_keep = ['sequence_id', 'gesture_int']
train_df_final = train_df.select(meta_cols_to_keep).with_columns(X_train_scaled_features)
val_df_final = val_df.select(meta_cols_to_keep).with_columns(X_val_scaled_features)

# Generate gate targets for the full dataset
train_gate_df = generate_gate_targets(train_df_final, tof_cols)
val_gate_df = generate_gate_targets(val_df_final, tof_cols)

# Create sequences from the full dataset
X_train, y_train, train_gate_df = create_sequence_dataset(train_df_final, feature_cols, train_gate_df)
X_val, y_val, val_gate_df = create_sequence_dataset(val_df_final, feature_cols, val_gate_df)

X_train_padded = pad_sequences(X_train, maxlen=MAX_PAD_LEN, padding='post', truncating='pre', dtype='float32')
X_val_padded = pad_sequences(X_val, maxlen=MAX_PAD_LEN, padding='post', truncating='pre', dtype='float32')
y_train_cat = to_categorical(y_train, num_classes=NUM_CLASSES)
y_val_cat = to_categorical(y_val, num_classes=NUM_CLASSES)


del train_df_final, val_df_final, X_train_scaled_features, X_val_scaled_features, X_train, y_train, X_val, y_val
gc.collect()    

FINAL_EPOCHS = 150 

full_train_dataset = GatedMixupGenerator(
    X=X_train_padded, y=y_train_cat, gate_targets=train_gate_df,
    batch_size=BATCH_SIZE, imu_dim=imu_dim, alpha=0.2, masking_prob=0.25
)

val_dataset = tf.data.Dataset.from_tensor_slices((
    X_val_padded, {'main_output': y_val_cat, 'tof_gate': val_gate_df[:, np.newaxis]}
)).batch(BATCH_SIZE).cache().prefetch(tf.data.AUTOTUNE)

early_stopping = EarlyStopping(
    monitor='val_main_output_accuracy',
    mode='max',
    patience=20,
    restore_best_weights=True
)
callbacks = [early_stopping]
model = create_model(full_train_dataset, imu_dim)
optimizer = tf.keras.optimizers.AdamW(learning_rate=LR_INIT, weight_decay=WD)
model.compile(optimizer=optimizer, loss={'main_output': 'categorical_crossentropy', 'tof_gate': 'binary_crossentropy'},
              loss_weights={'main_output': 1.0, 'tof_gate': 0.5}, metrics={"main_output": "accuracy"})
history = model.fit(full_train_dataset, epochs=FINAL_EPOCHS, verbose=1, validation_data=val_dataset, callbacks=callbacks)


▶ PHASE: Training on train_val split...

  Engineering all features from the clean base data...
  Encoding labels...
  Fitting final scaler on all training data...
  Preparing full dataset for training...
['acc_x', 'acc_y', 'acc_z', 'rot_w', 'rot_x', 'rot_y', 'rot_z', 'linear_acc_x', 'linear_acc_y', 'linear_acc_z', 'linear_acc_mag', 'linear_acc_mag_jerk', 'angular_vel_x', 'angular_vel_y', 'angular_vel_z', 'angular_distance', 'angular_accel_x', 'angular_accel_y', 'angular_accel_z']
['thm_1', 'thm_2', 'thm_3', 'thm_4', 'thm_5', 'tof_1_mean', 'tof_1_std', 'tof_1_min', 'tof_1_max', 'tof_1_diff_mean', 'tof_1_mean_decay', 'tof_1_active_pixels', 'tof_1_centroid_x', 'tof_1_centroid_y', 'tof_2_mean', 'tof_2_std', 'tof_2_min', 'tof_2_max', 'tof_2_diff_mean', 'tof_2_mean_decay', 'tof_2_active_pixels', 'tof_2_centroid_x', 'tof_2_centroid_y', 'tof_3_mean', 'tof_3_std', 'tof_3_min', 'tof_3_max', 'tof_3_diff_mean', 'tof_3_mean_decay', 'tof_3_active_pixels', 'tof_3_centroid_x', 'tof_3_centroid_y', 't

I0000 00:00:1757244405.763903 2196047 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4714 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1060, pci bus id: 0000:01:00.0, compute capability: 6.1
2025-09-07 12:26:45.769853: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 56171520 exceeds 10% of free system memory.
2025-09-07 12:26:46.043547: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 56171520 exceeds 10% of free system memory.


Epoch 1/150


  self._warn_if_super_not_called()
I0000 00:00:1757244428.311502 2196860 cuda_dnn.cc:529] Loaded cuDNN version 90300
2025-09-07 12:27:12.005679: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 5.09GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-09-07 12:27:12.473195: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.66GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-09-07 12:27:12.754171: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.54GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that th

[1m 81/100[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m3s[0m 207ms/step - loss: 4.0788 - main_output_accuracy: 0.1136 - main_output_loss: 3.2311 - tof_gate_loss: 0.4702

2025-09-07 12:27:31.226321: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.40GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-09-07 12:27:31.517985: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.30GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-09-07 12:27:31.676522: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:310] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.35GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-09-07 12:27:31.714025: W external/local_xla/xla/ts

[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 220ms/step - loss: 4.0040 - main_output_accuracy: 0.1235 - main_output_loss: 3.1693 - tof_gate_loss: 0.4459

2025-09-07 12:27:35.738267: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 56171520 exceeds 10% of free system memory.


[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 256ms/step - loss: 4.0005 - main_output_accuracy: 0.1240 - main_output_loss: 3.1664 - tof_gate_loss: 0.4448 - val_loss: 3.2743 - val_main_output_accuracy: 0.1698 - val_main_output_loss: 2.5627 - val_tof_gate_loss: 0.2796
Epoch 2/150
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 201ms/step - loss: 3.0296 - main_output_accuracy: 0.2772 - main_output_loss: 2.3627 - tof_gate_loss: 0.2290 - val_loss: 2.3478 - val_main_output_accuracy: 0.3660 - val_main_output_loss: 1.7911 - val_tof_gate_loss: 0.0982
Epoch 3/150
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 208ms/step - loss: 2.7633 - main_output_accuracy: 0.3376 - main_output_loss: 2.1605 - tof_gate_loss: 0.2203 - val_loss: 2.0282 - val_main_output_accuracy: 0.4547 - val_main_output_loss: 1.5347 - val_tof_gate_loss: 0.0899
Epoch 4/150
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 216ms/step - loss: 2.5234 - main_

In [17]:
np.argmax(history.history['val_main_output_accuracy'])+1

np.int64(56)

In [21]:
history.history['val_main_output_accuracy']

[0.16981132328510284,
 0.3660377264022827,
 0.4547169804573059,
 0.5163521766662598,
 0.5842767357826233,
 0.6113207340240479,
 0.6364780068397522,
 0.6402515769004822,
 0.5830188393592834,
 0.6566037535667419,
 0.6754717230796814,
 0.6729559898376465,
 0.6635220050811768,
 0.6767295598983765,
 0.6817610263824463,
 0.6880503296852112,
 0.6811320781707764,
 0.6886792182922363,
 0.7188678979873657,
 0.6974842548370361,
 0.696855366230011,
 0.7081760764122009,
 0.7188678979873657,
 0.6918238997459412,
 0.7270440459251404,
 0.7238993644714355,
 0.7069182395935059,
 0.7144653797149658,
 0.7383647561073303,
 0.696855366230011,
 0.7446540594100952,
 0.7421383857727051,
 0.7163522243499756,
 0.7270440459251404,
 0.7446540594100952,
 0.7163522243499756,
 0.7471697926521301,
 0.7597483992576599,
 0.7647798657417297,
 0.7622641324996948,
 0.7616352438926697,
 0.7610062956809998,
 0.7647798657417297,
 0.7628930807113647,
 0.7528302073478699,
 0.7823899388313293,
 0.7553459405899048,
 0.76729559898

In [20]:
np.max(history.history['val_main_output_accuracy'])

np.float64(0.7987421154975891)