In [1]:
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import joblib
import warnings
import random
from scipy.fft import rfft, rfftfreq
from scipy.stats import skew, kurtosis
from scipy.signal import find_peaks
import polars as pl

In [2]:
# --- Configuration ---
TRAIN = True  # Set to True to train and save models, False for inference
MODEL_DIR = "model"
BASE_PATH = "input/"

warnings.filterwarnings('ignore')
random.seed(42)
np.random.seed(42)

In [3]:
# ---  Feature Engineering ---
def feature_set(df: pd.DataFrame, use_extra: bool = False) -> dict:
    """
    Extracts an enhanced set of features from sensor data for gesture prediction.

    Features include:
    - Basic statistics (mean, std, min, max, median)
    - Advanced statistics (energy, range, quantiles, skew, kurtosis)
    - Peak detection for IMU signals
    - Frequency band energy from FFT
    - Aggregated thermal and ToF features (max/std, rate of change)
    """
    feats = {}
    imu_cols = [c for c in df.columns if any(x in c for x in ['acc_', 'rot_'])]

    # Vector magnitudes and jerk features
    df['acc_mag'] = np.sqrt(df['acc_x']**2 + df['acc_y']**2 + df['acc_z']**2)
    df['rot_mag'] = np.sqrt(df['rot_x']**2 + df['rot_y']**2 + df['rot_z']**2)
    df['acc_jerk_x'] = df['acc_x'].diff().fillna(0)
    df['acc_jerk_y'] = df['acc_y'].diff().fillna(0)
    df['acc_jerk_z'] = df['acc_z'].diff().fillna(0)
    imu_cols.extend(['acc_mag', 'rot_mag', 'acc_jerk_x', 'acc_jerk_y', 'acc_jerk_z'])

    # Feature Extraction for IMU Sensors
    for col in imu_cols:
        arr = df[col].values
        # Basic Statistics
        feats[f'{col}_mean'] = np.mean(arr)
        feats[f'{col}_std'] = np.std(arr)
        feats[f'{col}_min'] = np.min(arr)
        feats[f'{col}_max'] = np.max(arr)
        feats[f'{col}_median'] = np.median(arr)
        feats[f'{col}_range'] = feats[f'{col}_max'] - feats[f'{col}_min']
        # Advanced Statistics
        feats[f'{col}_q25'] = np.quantile(arr, 0.25)
        feats[f'{col}_q75'] = np.quantile(arr, 0.75)
        feats[f'{col}_energy'] = np.sum(arr**2) / len(arr)
        feats[f'{col}_skew'] = skew(arr)
        feats[f'{col}_kurtosis'] = kurtosis(arr)
        # Rolling Window Features
        for window in [5, 10, 15, 20, 25, 50]:
            rolling_mean = df[col].rolling(window=window, min_periods=1).mean()
            rolling_std = df[col].rolling(window=window, min_periods=1).std().fillna(0)
            feats[f'{col}_roll_mean_{window}_mean'] = rolling_mean.mean()
            feats[f'{col}_roll_std_{window}_mean'] = rolling_std.mean()
        # Frequency Domain Features (FFT)
        fft_vals = rfft(arr)
        fft_mags = np.abs(fft_vals)
        if len(fft_mags) > 1:
            feats[f'{col}_fft_mean'] = np.mean(fft_mags)
            feats[f'{col}_fft_max'] = np.max(fft_mags)
            freqs = rfftfreq(len(arr))
            feats[f'{col}_fft_dominant_freq'] = freqs[np.argmax(fft_mags[1:]) + 1] if len(fft_mags) > 1 else 0
            # Energy in Frequency Bands
            bands = [(0, 2), (2, 5), (5, 10)]
            for low, high in bands:
                mask = (freqs >= low) & (freqs < high)
                feats[f'{col}_fft_energy_{low}_{high}'] = np.sum(fft_mags[mask])
        # Peak Detection
        peaks, _ = find_peaks(arr, height=0)
        feats[f'{col}_num_peaks'] = len(peaks) / len(arr)

    # Cross-Sensor Correlations
    for (c1, c2) in [('acc_x', 'acc_y'), ('acc_x', 'acc_z'), ('acc_y', 'acc_z'),
                     ('rot_x', 'rot_y'), ('rot_x', 'rot_z'), ('rot_y', 'rot_z')]:
        if c1 in df.columns and c2 in df.columns:
            feats[f'{c1}_{c2}_corr'] = np.corrcoef(df[c1], df[c2])[0, 1]

    # Extra Sensor Features (Thermal & ToF)
    if use_extra:
        thm_cols = [c for c in df.columns if 'thm' in c]
        tof_cols = [c for c in df.columns if 'tof' in c]
        # Thermal Features
        if thm_cols:
            thm_data = df[thm_cols].values
            feats['thm_max_across_sensors'] = np.nanmax(thm_data, axis=1).mean()
            feats['thm_std_across_sensors'] = np.nanstd(thm_data, axis=1).mean()
            for col in thm_cols:
                feats[f'{col}_mean'] = np.nanmean(df[col])
                feats[f'{col}_max'] = np.nanmax(df[col])
                feats[f'{col}_diff_mean'] = df[col].diff().abs().mean()
        # ToF Features
        if tof_cols:
            tof_data = df[tof_cols].values
            valid_tof = tof_data >= 0
            min_tof_per_time = np.min(tof_data, axis=1, where=valid_tof, initial=10000)
            feats['tof_min_across_sensors'] = np.mean(min_tof_per_time)
            feats['tof_num_valid_sensors'] = np.sum(valid_tof, axis=1).mean()
            for col in tof_cols:
                valid_vals = df[col][df[col] >= 0]
                if valid_vals.size > 0:
                    feats[f'{col}_mean'] = np.mean(valid_vals)
                    feats[f'{col}_min'] = np.min(valid_vals)
                    feats[f'{col}_diff_mean'] = df[col].diff().abs().mean()

    return feats

def build_features(data: pd.DataFrame, demographics: pd.DataFrame, use_extra: bool) -> tuple[pd.DataFrame, list]:
    """Builds feature matrix and returns features with subject groups for CV."""
    features = []
    groups = []
    for seq_id, g in data.groupby('sequence_id'):
        subj = g['subject'].iloc[0]
        groups.append(subj)
        feats = feature_set(g, use_extra=use_extra)
        feats['sequence_id'] = seq_id
        demo_row = demographics[demographics['subject'] == subj]
        if not demo_row.empty:
            demo = demo_row.iloc[0]
            for dcol in ['age', 'sex', 'height_cm', 'handedness']:
                feats[dcol] = demo.get(dcol, np.nan)
        features.append(feats)
    df_feats = pd.DataFrame(features).set_index('sequence_id')
    if 'sex' in df_feats.columns:
        df_feats['sex'] = df_feats['sex'].map({'M': 1, 'F': 0})
    if 'handedness' in df_feats.columns:
        df_feats['handedness'] = df_feats['handedness'].map({'R': 1, 'L': 0, 'A': 2})
    return df_feats, groups

# --- Training Pipeline ---
def train_cv(X: pd.DataFrame, y: pd.Series, groups: list, name: str = "model"):
    """Trains a LightGBM model with GroupKFold CV."""
    print(f"\n--- Training {name} model ---")
    print(f"Training with {X.shape[0]} sequences and {X.shape[1]} features.")
    val_scores = []
    lgbm_params = {
        'objective': 'multiclass', 'metric': 'multi_logloss',
        'n_estimators': 3000, 'learning_rate': 0.015,
        'feature_fraction': 0.7, 'bagging_fraction': 0.7,
        'bagging_freq': 1, 'lambda_l1': 0.3, 'lambda_l2': 0.3,
        'num_leaves': 60, 'min_child_samples': 20,
        'n_jobs': -1, 'seed': 42, 'boosting_type': 'gbdt', 'verbose': -1
    }
    for fold, (tr_idx, val_idx) in enumerate(GroupKFold(n_splits=5).split(X, y, groups)):
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]
        model = lgb.LGBMClassifier(**lgbm_params)
        model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
                  eval_metric='multi_logloss',
                  callbacks=[lgb.early_stopping(150, verbose=False)])
        preds = model.predict(X_val)
        f1 = f1_score(y_val, preds, average='macro')
        val_scores.append(f1)
        print(f"Fold {fold} F1-Macro: {f1:.4f}")
    print(f"Mean CV F1-Macro: {np.mean(val_scores):.4f}")
    model = lgb.LGBMClassifier(**lgbm_params)
    model.fit(X, y)
    joblib.dump(model, os.path.join(MODEL_DIR, f"model_{name}.pkl"))
    print(f"Saved {name} model to {MODEL_DIR}/model_{name}.pkl")
    return model

def train_pipeline():
    """Executes the training pipeline for IMU-only and full-sensor models."""
    print("===== Starting Training Pipeline =====")
    print("Loading Data...")
    train = pd.read_csv(BASE_PATH + "train.csv")
    train_d = pd.read_csv(BASE_PATH + "train_demographics.csv")
    train = train.groupby("sequence_id").apply(lambda g: g.ffill().bfill()).reset_index(drop=True)
    print("Encoding...")
    le_gesture = LabelEncoder()
    train["e_gesture"] = le_gesture.fit_transform(train["gesture"])
    all_seq_ids = train['sequence_id'].unique()
    imu_only_seq_ids = np.random.choice(all_seq_ids, size=int(0.5 * len(all_seq_ids)), replace=False)
    extra_seq_ids = [sid for sid in all_seq_ids if sid not in imu_only_seq_ids]
    print(f"Created {len(imu_only_seq_ids)} IMU-only and {len(extra_seq_ids)} full-sensor sequences.")

    print("Building features for IMU-only model...")
    X_imu, groups_imu = build_features(train[train['sequence_id'].isin(imu_only_seq_ids)], train_d, use_extra=False)
    y_imu = train.groupby('sequence_id')['e_gesture'].first().loc[X_imu.index]
    print("Building features for full-sensor model...")
    X_extra, groups_extra = build_features(train[train['sequence_id'].isin(extra_seq_ids)], train_d, use_extra=True)
    y_extra = train.groupby('sequence_id')['e_gesture'].first().loc[X_extra.index]

    imu_cols = X_imu.columns
    extra_cols = X_extra.columns
    X_imu = X_imu[imu_cols].copy()
    X_extra = X_extra[extra_cols].copy()  # Use all extra columns

    model_imu = train_cv(X_imu, y_imu, groups_imu, "imu")
    model_extra = train_cv(X_extra, y_extra, groups_extra, "extra")
    joblib.dump(le_gesture, os.path.join(MODEL_DIR, "le_gesture.pkl"))
    print("Saved label encoder to le_gesture.pkl")
    print("===== Training Pipeline Complete =====")

# --- Inference Pipeline ---
def has_extra_sensors(df: pd.DataFrame) -> bool:
    """Checks if a sequence has valid thermal/ToF sensor data."""
    thm_cols = [c for c in df.columns if 'thm_' in c]
    tof_cols = [c for c in df.columns if 'tof_' in c]
    if not thm_cols or not tof_cols:
        return False
    has_thm = df[thm_cols].notna().any().any()
    has_tof = (df[tof_cols] != -1).any().any()
    return has_thm and has_tof

def predict(sequence: pl.DataFrame, demographics: pl.DataFrame) -> str:
    """Predicts the gesture for a single sequence."""
    sequence_pd = sequence.to_pandas()
    demographics_pd = demographics.to_pandas()
    try:
        model_imu = joblib.load(os.path.join(MODEL_DIR, "model_imu.pkl"))
        model_extra = joblib.load(os.path.join(MODEL_DIR, "model_extra.pkl"))
        le_gesture = joblib.load(os.path.join(MODEL_DIR, "le_gesture.pkl"))
        use_extra = has_extra_sensors(sequence_pd)
        df_feats, _ = build_features(sequence_pd.assign(sequence_id=0), demographics_pd, use_extra)
        model = model_extra if use_extra else model_imu
        model_cols = model.feature_name_
        df_feats = df_feats.reindex(columns=model_cols, fill_value=0)
        pred = model.predict(df_feats)[0]
        return le_gesture.inverse_transform([pred])[0]
    except Exception as e:
        print(f"Prediction error: {e}")
        le_gesture = joblib.load(os.path.join(MODEL_DIR, "le_gesture.pkl"))
        return le_gesture.classes_[0]

In [4]:
# --- Main Execution Block ---
if __name__ == "__main__":
    if TRAIN:
        os.makedirs(MODEL_DIR, exist_ok=True)
        train_pipeline()
    else:
        print("===== Inference Mode =====")
        print(f"Models will be loaded from '{MODEL_DIR}'.")
        # Kaggle Inference Server Setup
        try:
            import kaggle_evaluation.cmi_inference_server
            inference_server = kaggle_evaluation.cmi_inference_server.CMIInferenceServer(predict)
            if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
                inference_server.serve()
                print("Inference server started for submission.")
            else:
                print("Starting local inference gateway for testing...")
                inference_server.run_local_gateway(
                    data_paths=(
                        os.path.join(BASE_PATH, 'test.csv'),
                        os.path.join(BASE_PATH, 'test_demographics.csv'),
                    )
                )
        except (ImportError, ModuleNotFoundError):
            print("Kaggle evaluation module not found. Skipping inference server setup.")

===== Starting Training Pipeline =====
Loading Data...
Encoding...
Created 4075 IMU-only and 4076 full-sensor sequences.
Building features for IMU-only model...
Building features for full-sensor model...

--- Training imu model ---
Training with 4075 sequences and 370 features.
Fold 0 F1-Macro: 0.5758
Fold 1 F1-Macro: 0.5630
Fold 2 F1-Macro: 0.5816
Fold 3 F1-Macro: 0.5536
Fold 4 F1-Macro: 0.5563
Mean CV F1-Macro: 0.5660
Saved imu model to model/model_imu.pkl

--- Training extra model ---
Training with 4076 sequences and 1349 features.
Fold 0 F1-Macro: 0.5403
Fold 1 F1-Macro: 0.6147
Fold 2 F1-Macro: 0.6419
Fold 3 F1-Macro: 0.6456
Fold 4 F1-Macro: 0.6177
Mean CV F1-Macro: 0.6120
Saved extra model to model/model_extra.pkl
Saved label encoder to le_gesture.pkl
===== Training Pipeline Complete =====
