# Imports and configs

In [52]:
from sklearn.model_selection import KFold, cross_val_score, cross_validate, GroupKFold
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import clone

from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

plt.rcParams['font.family'] = ['MS Gothic', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

import seaborn as sns
import pandas as pd
import numpy as np
import polars as pl
import warnings
import optuna
import joblib
import glob
import gc
import time
from tqdm import tqdm
import os

warnings.filterwarnings("ignore")

In [53]:
class CFG:
    """Notebook-wide settings: input file locations, target column and run options."""

    # Reproducibility / cross-validation
    seed = 42
    n_folds = 5

    # Hyper-parameter search
    n_optuna_trials = 100
    run_optuna = True

    # Column roles
    target = "gesture"
    # presumably columns that exist only in the training data -- confirm against the test schema
    features_train_only = ['sequence_type', 'orientation']

    # Input CSV files
    test_demographic_path = "input/test_demographics.csv"
    test_path = "input/test.csv"
    train_demographic_path = "input/train_demographics.csv"
    train_path = "input/train.csv"

# Data loading

In [54]:
def reduce_mem_usage(dataframe, dataset, use_float16=True):
    """Downcast numeric columns of ``dataframe`` to the smallest dtype that holds them.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    dataset : str
        Label used only in the progress messages.
    use_float16 : bool, default True
        When True, float columns whose value range fits are stored as float16
        (the historical behaviour). float16 keeps only about three significant
        decimal digits, so pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    Only plain NumPy signed-integer, unsigned-integer and float columns are
    touched. Unsigned columns stay unsigned. Columns are never widened, and
    columns that are empty or entirely NaN are left as they are.
    """
    print('Reducing memory usage for:', dataset)
    initial_mem_usage = dataframe.memory_usage().sum() / 1024**2

    # Candidate dtypes per NumPy kind code, ordered from narrowest to widest.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in dataframe.columns:
        col_type = dataframe[col].dtype

        # Skip pandas extension dtypes (category, nullable ints, ...) and every
        # NumPy kind other than signed int / unsigned int / float.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = dataframe[col].min()
        c_max = dataframe[col].max()
        # min()/max() are NaN for empty or all-NaN columns: nothing to size against.
        if np.isnan(c_min) or np.isnan(c_max):
            continue

        if col_type.kind == 'f':
            candidates, limits_of = float_candidates, np.finfo
        else:
            candidates, limits_of = int_candidates[col_type.kind], np.iinfo
            # Plain Python ints compare safely against every iinfo bound.
            c_min, c_max = int(c_min), int(c_max)

        for candidate in candidates:
            # Candidates are ordered by width; stop before widening the column.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: the extreme representable values still fit.
            if limits.min <= c_min and c_max <= limits.max:
                dataframe[col] = dataframe[col].astype(candidate)
                break

    final_mem_usage = dataframe.memory_usage().sum() / 1024**2
    print('--- Memory usage before: {:.2f} MB'.format(initial_mem_usage))
    print('--- Memory usage after: {:.2f} MB'.format(final_mem_usage))
    # Guard the ratio so a zero-byte frame does not divide by zero.
    saved_pct = 100 * (initial_mem_usage - final_mem_usage) / initial_mem_usage if initial_mem_usage > 0 else 0.0
    print('--- Decreased memory usage by {:.1f}%\n'.format(saved_pct))

    return dataframe

In [55]:
# Read the two sensor tables and their demographic tables, each with a fresh 0..n-1 index.
train, train_d, test, test_d = (
    pd.read_csv(csv_path).reset_index(drop=True)
    for csv_path in (
        CFG.train_path,
        CFG.train_demographic_path,
        CFG.test_path,
        CFG.test_demographic_path,
    )
)

In [56]:
# train_reduced = train.copy()
# train_reduced = reduce_mem_usage(train_reduced, "train")
# test_reduced = test.copy()
# test_reduced = reduce_mem_usage(test_reduced, "test")

# X = train_reduced.drop(columns=CFG.target, axis=1)
# y = train_reduced[CFG.target]
# X_test = test_reduced

# X.head()

# Preprocessing

In [57]:
# Fill missing values per sequence
def _fill_sequence_gaps(sequence_rows):
    """Forward-fill, then back-fill, the rows of a single sequence."""
    return sequence_rows.ffill().bfill()

train = train.groupby("sequence_id").apply(_fill_sequence_gaps).reset_index(drop=True)
test = test.groupby("sequence_id").apply(_fill_sequence_gaps).reset_index(drop=True)

# Remove low-variance columns
def get_low_var_cols(df, threshold=0.95):
    """Return the names of columns in ``df`` that are constant or nearly constant.

    A column is reported when it has at most one distinct non-null value, or
    when its most frequent non-null value fills more than ``threshold`` of all
    rows. Dominance is measured against the most frequent value, so the result
    does not depend on which value happens to sit in the first row.

    Parameters
    ----------
    df : pandas.DataFrame
    threshold : float, default 0.95
        Row fraction above which the dominant value marks a column as low-variance.

    Returns
    -------
    list
        Column names, in ``df.columns`` order.
    """
    n_rows = len(df)
    low_var_cols = []
    for col in df.columns:
        # value_counts() drops NaN and sorts descending: entry 0 is the dominant value.
        counts = df[col].value_counts()
        if len(counts) <= 1 or counts.iloc[0] / n_rows > threshold:
            low_var_cols.append(col)
    return low_var_cols

# Columns judged low-variance on train are removed from train, and from test where present.
lowv = get_low_var_cols(train)
train = train.drop(columns=lowv)
test = test.drop(columns=lowv, errors="ignore")

# Encode target: fit the encoder on the gesture names, then store the integer codes.
le_gesture = LabelEncoder().fit(train["gesture"])
train["e_gesture"] = le_gesture.transform(train["gesture"])

# Simulate IMU-only scenarios in 50% of training data.
# The generator is seeded from CFG.seed so the split is the same on every
# Restart & Run All (the previous global np.random.choice call was unseeded).
imu_split_rng = np.random.default_rng(CFG.seed)
all_seq_ids = train['sequence_id'].unique()
imu_only_seq_ids = imu_split_rng.choice(all_seq_ids, size=int(0.5*len(all_seq_ids)), replace=False)

# Membership is tested against a set; testing against the NumPy array rescans it for every id.
imu_only_lookup = set(imu_only_seq_ids)
extra_seq_ids = [sid for sid in all_seq_ids if sid not in imu_only_lookup]

# Set thermal and ToF columns to NaN for IMU-only sequences
thm_cols = [c for c in train.columns if 'thm_' in c]
tof_cols = [c for c in train.columns if 'tof_' in c]
train.loc[train['sequence_id'].isin(imu_only_seq_ids), thm_cols+tof_cols] = np.nan

print(f"Created {len(imu_only_seq_ids)} IMU-only and {len(extra_seq_ids)} full-sensor sequences")


Created 4075 IMU-only and 4076 full-sensor sequences


In [58]:
def feature_set(df, use_extra=False):
    """Summarise the rows of one sequence as a flat dict of scalar features.

    Every column whose name contains 'acc_' or 'rot_' contributes eleven
    statistics (mean, std, min, max, median, energy, first, last, range, mean
    absolute successive difference, zero-crossing rate). The x/y/z axis pairs
    of 'acc' and 'rot' contribute a correlation coefficient when both columns
    are present. With ``use_extra`` each 'thm_' column adds NaN-aware
    mean/std/min/max, and all 'tof_' columns are pooled into the count of
    non-negative readings plus their mean/std/min/max.

    Keys are inserted in a fixed order (IMU statistics, correlations, thermal,
    ToF), which determines the column order of any frame built from the dict.
    """
    feats = {}

    # Per-column IMU statistics.
    for name in df.columns:
        if 'acc_' not in name and 'rot_' not in name:
            continue
        signal = df[name].values
        lowest = np.min(signal)
        highest = np.max(signal)
        feats.update({
            f'{name}_mean': np.mean(signal),
            f'{name}_std': np.std(signal),
            f'{name}_min': lowest,
            f'{name}_max': highest,
            f'{name}_median': np.median(signal),
            f'{name}_energy': np.sum(signal**2) / len(signal),
            f'{name}_first': signal[0],
            f'{name}_last': signal[-1],
            f'{name}_range': highest - lowest,
            # Mean absolute difference between consecutive samples.
            f'{name}_mad': np.mean(np.abs(np.diff(signal))),
            # Share of consecutive sample pairs whose product is negative.
            f'{name}_zcr': ((signal[:-1]*signal[1:]) < 0).mean(),
        })

    # Pairwise axis correlations within each IMU sensor.
    for sensor in ('acc', 'rot'):
        for axis_a, axis_b in (('x', 'y'), ('x', 'z'), ('y', 'z')):
            c1 = f'{sensor}_{axis_a}'
            c2 = f'{sensor}_{axis_b}'
            if c1 in df.columns and c2 in df.columns:
                feats[f'{c1}_{c2}_corr'] = np.corrcoef(df[c1], df[c2])[0, 1]

    if not use_extra:
        return feats

    # Thermal columns: NaN-aware summary statistics.
    for name in df.columns:
        if 'thm_' not in name:
            continue
        readings = df[name].values
        feats[f'{name}_mean'] = np.nanmean(readings)
        feats[f'{name}_std'] = np.nanstd(readings)
        feats[f'{name}_min'] = np.nanmin(readings)
        feats[f'{name}_max'] = np.nanmax(readings)

    # ToF columns: pool every reading and keep only the non-negative ones.
    tof_cols = [c for c in df.columns if 'tof_' in c]
    if not tof_cols:
        return feats

    all_tof = np.stack([df[c].values for c in tof_cols], axis=1)
    valid_mask = (all_tof >= 0)
    valid_vals = all_tof[valid_mask]
    feats['tof_count'] = valid_mask.sum()
    if valid_vals.size > 0:
        feats['tof_valid_mean'] = np.nanmean(valid_vals)
        feats['tof_valid_std'] = np.nanstd(valid_vals)
        feats['tof_valid_min'] = np.nanmin(valid_vals)
        feats['tof_valid_max'] = np.nanmax(valid_vals)
    else:
        feats['tof_valid_mean'] = np.nan
        feats['tof_valid_std'] = np.nan
        feats['tof_valid_min'] = np.nan
        feats['tof_valid_max'] = np.nan

    return feats

In [59]:
def build_features(data, demographics, use_extra=False):
    """Build one feature row per sequence, indexed by ``sequence_id``.

    Parameters
    ----------
    data : pandas.DataFrame
        Sensor rows; must contain 'sequence_id' and 'subject' columns.
    demographics : pandas.DataFrame
        Per-subject table with a 'subject' column. When a subject appears more
        than once its first row is used; unknown subjects get NaN demographics.
    use_extra : bool, default False
        Forwarded to ``feature_set`` to include the thermal/ToF features.

    Returns
    -------
    pandas.DataFrame
        Sensor features, the raw demographic fields and four 0/1 indicator
        columns. Empty (with an empty 'sequence_id' index) when ``data`` has
        no sequences.
    """
    demo_cols = ['age','adult_child','sex','handedness','height_cm','shoulder_to_wrist_cm','elbow_to_wrist_cm']

    # Index the demographics once instead of scanning the frame for every sequence.
    first_per_subject = demographics.drop_duplicates(subset='subject', keep='first')
    demo_by_subject = {row['subject']: row for _, row in first_per_subject.iterrows()}

    features = []
    for seq_id, g in data.groupby('sequence_id'):
        feats = feature_set(g, use_extra=use_extra)
        feats['sequence_id'] = seq_id

        # Add demographics (an empty mapping yields NaN for every field).
        subj = g['subject'].iloc[0]
        demo = demo_by_subject.get(subj, {})
        for dcol in demo_cols:
            feats[dcol] = demo[dcol] if dcol in demo else np.nan

        # Indicator columns compare the coded value against 0 and 1; a missing
        # value matches neither, so both flags of a pair are then 0.
        feats['sex_F'] = int(feats.get('sex',0)==0)
        feats['sex_M'] = int(feats.get('sex',0)==1)
        feats['handed_L'] = int(feats.get('handedness',0)==0)
        feats['handed_R'] = int(feats.get('handedness',0)==1)
        features.append(feats)

    if not features:
        # No sequences: set_index would fail because the column does not exist.
        return pd.DataFrame(index=pd.Index([], name='sequence_id'))

    return pd.DataFrame(features).set_index('sequence_id')

In [60]:
# Build separate feature sets for IMU-only and full-sensor data
# One encoded label per sequence, taken from the sequence's first row.
labels_by_sequence = train.groupby('sequence_id')['e_gesture'].first()

print("Building features for IMU-only sequences...")
imu_only_rows = train[train['sequence_id'].isin(imu_only_seq_ids)]
X_imu = build_features(imu_only_rows, train_d, use_extra=False)
y_imu = labels_by_sequence.loc[X_imu.index]

print("Building features for full-sensor sequences...")
full_sensor_rows = train[train['sequence_id'].isin(extra_seq_ids)]
X_extra = build_features(full_sensor_rows, train_d, use_extra=True)
y_extra = labels_by_sequence.loc[X_extra.index]

print(f"IMU-only features: {X_imu.shape}")
print(f"Full-sensor features: {X_extra.shape}")

Building features for IMU-only sequences...
Building features for full-sensor sequences...
IMU-only features: (4075, 94)
Full-sensor features: (4076, 119)


# Training base model

In [61]:
# Shared settings: one shuffled, seeded K-fold splitter reused by every model below.
kf = KFold(
    n_splits=CFG.n_folds,
    shuffle=True,
    random_state=CFG.seed,
)

In [62]:
def run_cross_validation(model, X, y, model_name="Model", cv_folds=None, groups=None):
    """Cross-validate ``model`` with per-fold F1 reporting, then refit on all data and save it.

    Parameters
    ----------
    model : sklearn-compatible estimator
        Estimator to evaluate. It is fitted in place on every fold; the saved
        model is a fresh clone fitted on the full data.
    X : pandas.DataFrame
        Feature matrix (rows are selected with ``.iloc``).
    y : pandas.Series
        Target values (rows are selected with ``.iloc``).
    model_name : str
        Name used in the log messages and in the saved file name.
    cv_folds : splitter object, optional
        Object exposing ``split``. Defaults to an unshuffled ``KFold`` with
        ``CFG.n_folds`` splits.
    groups : array-like, optional
        Group labels forwarded to ``cv_folds.split`` as its third argument,
        for group-aware splitters. When None, ``split`` is called with
        ``(X, y)`` only.

    Returns
    -------
    dict
        'fold_results' (list of per-fold dicts), 'f1_macro', 'f1_binary' and
        'final_score' (each with 'mean', 'std', 'scores'), and 'time'
        ('mean', 'total').

    Side effects
    ------------
    Writes ``models/model_<model_name.lower()>.pkl``, creating the directory
    when it does not exist.
    """

    if cv_folds is None:
        cv_folds = KFold(n_splits=CFG.n_folds, shuffle=False, random_state=None)

    print(f"=== {model_name} Cross Validation 開始 ===")

    # Per-fold records.
    fold_results = []

    # Forward `groups` only when supplied so splitters whose split() accepts
    # just (X, y) keep working unchanged.
    split_args = (X, y) if groups is None else (X, y, groups)

    # Run each fold.
    for fold, (train_idx, val_idx) in enumerate(cv_folds.split(*split_args)):
        start_time = time.time()

        # Split the data.
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fit and predict.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        # Metrics: macro F1 over all classes, plus a binary F1 that treats
        # encoded label 0 as the positive class; the final score is their mean.
        # NOTE(review): confirm that "label == 0" is the intended positive
        # class for the binary component of the metric.
        f1_macro = f1_score(y_val, y_pred, average='macro')
        f1_binary = f1_score((y_val == 0), (y_pred == 0), average='binary')
        final_score = (f1_macro + f1_binary) / 2

        # Wall-clock time for the fold (fit + predict + scoring).
        fold_time = time.time() - start_time

        fold_results.append({
            'fold': fold,
            'f1_macro': f1_macro,
            'f1_binary': f1_binary,
            'final_score': final_score,
            'time': fold_time
        })

        # Per-fold report.
        print(f"--- Fold {fold} - Final Score: {final_score:.4f} (F1_macro: {f1_macro:.4f}, F1_binary: {f1_binary:.4f}) - Time: {fold_time:.2f} s")

    f1_macro_scores = [record['f1_macro'] for record in fold_results]
    f1_binary_scores = [record['f1_binary'] for record in fold_results]
    final_scores = [record['final_score'] for record in fold_results]
    times = [record['time'] for record in fold_results]

    # Aggregate statistics.
    results = {
        'fold_results': fold_results,
        'f1_macro': {
            'mean': np.mean(f1_macro_scores),
            'std': np.std(f1_macro_scores),
            'scores': f1_macro_scores
        },
        'f1_binary': {
            'mean': np.mean(f1_binary_scores),
            'std': np.std(f1_binary_scores),
            'scores': f1_binary_scores
        },
        'final_score': {
            'mean': np.mean(final_scores),
            'std': np.std(final_scores),
            'scores': final_scores
        },
        'time': {
            'mean': np.mean(times),
            'total': np.sum(times)
        }
    }

    # Overall report.
    print(f"\n{model_name} Overall Results:")
    print(f"  F1 Macro: {results['f1_macro']['mean']:.6f} ± {results['f1_macro']['std']:.6f}")
    print(f"  F1 Binary: {results['f1_binary']['mean']:.6f} ± {results['f1_binary']['std']:.6f}")
    print(f"  Final Score: {results['final_score']['mean']:.6f} ± {results['final_score']['std']:.6f}")
    print(f"  Total Time: {results['time']['total']:.2f} s")
    print()

    # Refit a fresh clone on the full data before saving.
    print(f"Training final {model_name} model on full dataset...")
    final_model = clone(model)
    final_model.fit(X, y)

    # Save the model; create the target directory first so a missing folder
    # cannot discard the finished run.
    model_filename = f"models/model_{model_name.lower()}.pkl"
    os.makedirs(os.path.dirname(model_filename), exist_ok=True)
    joblib.dump(final_model, model_filename)
    print(f"Model saved to: {model_filename}")

    return results

## LightGBM (gbdt)

In [63]:
# Both classifiers are configured identically, so the settings are written once
# and each model is its own estimator instance built from them.
lgbm_params = dict(
    objective='multiclass',
    metric='multi_logloss',
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
    learning_rate=0.05,
    n_estimators=1500,
    num_leaves=50,
    max_depth=-1,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=2,
    lambda_l1=0.1,
    lambda_l2=0.1,
    random_state=CFG.seed,
    verbose=-1,
)

imu_lgbm_model = LGBMClassifier(**lgbm_params)

full_lgbm_model = LGBMClassifier(**lgbm_params)

In [64]:
# Cross-validate, refit and save the model for the IMU-only feature set.
imu_lgbm_results = run_cross_validation(
    imu_lgbm_model,
    X_imu,
    y_imu,
    model_name="IMU-only-LightGBM",
    cv_folds=kf,
)

=== IMU-only-LightGBM Cross Validation 開始 ===
--- Fold 0 - Final Score: 0.5084 (F1_macro: 0.5127, F1_binary: 0.5042) - Time: 57.17 s
--- Fold 1 - Final Score: 0.4787 (F1_macro: 0.4924, F1_binary: 0.4651) - Time: 61.00 s
--- Fold 2 - Final Score: 0.4713 (F1_macro: 0.4932, F1_binary: 0.4493) - Time: 60.17 s
--- Fold 3 - Final Score: 0.4658 (F1_macro: 0.4933, F1_binary: 0.4384) - Time: 61.83 s
--- Fold 4 - Final Score: 0.4848 (F1_macro: 0.4737, F1_binary: 0.4960) - Time: 58.35 s

IMU-only-LightGBM Overall Results:
  F1 Macro: 0.493052 ± 0.012338
  F1 Binary: 0.470590 ± 0.025685
  Final Score: 0.481821 ± 0.014795
  Total Time: 298.53 s

Training final IMU-only-LightGBM model on full dataset...
Model saved to: models/model_imu-only-lightgbm.pkl


In [65]:
# Cross-validate, refit and save the model for the full-sensor feature set.
full_lgbm_results = run_cross_validation(
    full_lgbm_model,
    X_extra,
    y_extra,
    model_name="Full-sensor-LightGBM",
    cv_folds=kf,
)

=== Full-sensor-LightGBM Cross Validation 開始 ===
--- Fold 0 - Final Score: 0.6434 (F1_macro: 0.5606, F1_binary: 0.7261) - Time: 45.76 s
--- Fold 1 - Final Score: 0.6278 (F1_macro: 0.5996, F1_binary: 0.6560) - Time: 43.63 s
--- Fold 2 - Final Score: 0.5996 (F1_macro: 0.5957, F1_binary: 0.6034) - Time: 44.63 s
--- Fold 3 - Final Score: 0.6164 (F1_macro: 0.5929, F1_binary: 0.6400) - Time: 43.71 s
--- Fold 4 - Final Score: 0.6029 (F1_macro: 0.5762, F1_binary: 0.6296) - Time: 45.66 s

Full-sensor-LightGBM Overall Results:
  F1 Macro: 0.585007 ± 0.014573
  F1 Binary: 0.651039 ± 0.041246
  Final Score: 0.618023 ± 0.016172
  Total Time: 223.39 s

Training final Full-sensor-LightGBM model on full dataset...
Model saved to: models/model_full-sensor-lightgbm.pkl


In [66]:
# Persist the fitted gesture label encoder so predictions can be decoded later.
joblib.dump(value=le_gesture, filename="models/le_gesture.pkl")

['models/le_gesture.pkl']

# Submission

In [27]:
def has_extra_sensors(df):
    """Report whether a sequence carries usable thermal ('thm_') or ToF ('tof_') data.

    Returns False when either column family is absent or the frame has no
    rows. Otherwise returns True when fewer than 80% of the thermal values
    are NaN, or when more than 20% of the ToF values are valid readings.
    A ToF value counts as valid when it is non-negative; NaN and -1 are both
    invalid, matching the ``>= 0`` validity mask used by ``feature_set``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single sequence.

    Returns
    -------
    bool
    """
    thm_cols = [c for c in df.columns if 'thm_' in c]
    tof_cols = [c for c in df.columns if 'tof_' in c]

    if not thm_cols or not tof_cols:
        return False

    # Float arrays so that missing values are NaN and np.isnan is applicable.
    thm_vals = df[thm_cols].to_numpy(dtype=float)
    tof_vals = df[tof_cols].to_numpy(dtype=float)

    # No rows: the fractions below would be undefined.
    if thm_vals.size == 0 or tof_vals.size == 0:
        return False

    thm_missing_frac = np.isnan(thm_vals).mean()
    # NaN >= 0 evaluates to False, so NaN is counted as invalid here.
    tof_valid_frac = (tof_vals >= 0).mean()

    return bool(thm_missing_frac < 0.8 or tof_valid_frac > 0.2)

In [28]:
def extract_features_for_test(sequence_df: pd.DataFrame, demo_df: pd.DataFrame):
    """Build the single-row feature frame for one test sequence.

    Parameters
    ----------
    sequence_df : pandas.DataFrame
        Rows of one sequence.
    demo_df : pandas.DataFrame
        Demographics for the sequence's subject; only the first row is read.
        An empty frame yields NaN for every demographic field.

    Returns
    -------
    (pandas.DataFrame, bool)
        The one-row feature frame and whether thermal/ToF features were used.
    """
    # Sensor availability is judged on the raw rows, before any gap filling.
    use_extra = has_extra_sensors(sequence_df)

    # Training data is forward/back-filled per sequence before feature
    # extraction; do the same here, otherwise a single NaN in an IMU column
    # turns that column's features into NaN.
    # NOTE(review): training also drops low-variance columns before feature
    # extraction; that column list is not available here -- confirm whether
    # it needs to be applied at inference too.
    filled_df = sequence_df.ffill().bfill()
    feats = feature_set(filled_df, use_extra=use_extra)

    # Add demographics
    demo_row = demo_df.iloc[0] if len(demo_df) > 0 else {}
    for dcol in ['age','adult_child','sex','handedness','height_cm','shoulder_to_wrist_cm','elbow_to_wrist_cm']:
        feats[dcol] = demo_row.get(dcol, np.nan)

    # Indicator columns compare the coded value against 0 and 1; a missing
    # value matches neither.
    feats['sex_F'] = int(feats.get('sex', 0) == 0)
    feats['sex_M'] = int(feats.get('sex', 0) == 1)
    feats['handed_L'] = int(feats.get('handedness', 0) == 0)
    feats['handed_R'] = int(feats.get('handedness', 0) == 1)

    return pd.DataFrame([feats]), use_extra

In [None]:
# Load the two saved models and the label encoder from disk.
model_imu, model_extra, le_gesture = (
    joblib.load(artifact_path)
    for artifact_path in (
        "models/model_imu-only-lightgbm.pkl",
        "models/model_full-sensor-lightgbm.pkl",
        "models/le_gesture.pkl",
    )
)

def predict(sequence: pl.DataFrame, demographics: pl.DataFrame) -> str:
    """Predict the gesture name for one sequence.

    The full-sensor model is used when ``extract_features_for_test`` reports
    usable thermal/ToF data, the IMU-only model otherwise.

    Parameters
    ----------
    sequence : polars.DataFrame
        Rows of one sequence.
    demographics : polars.DataFrame
        Demographics for the sequence's subject.

    Returns
    -------
    str
        Decoded gesture label. If feature extraction or prediction raises,
        the error is printed and the encoder's first class is returned as a
        best-effort fallback.
    """
    sequence = sequence.to_pandas()
    demographics = demographics.to_pandas()

    try:
        feats, use_extra = extract_features_for_test(sequence, demographics)
        model = model_extra if use_extra else model_imu

        # Align the columns with those the model was fitted on (same names,
        # same order); names that are absent become NaN. Skipped when the
        # model exposes no matching names at all.
        expected_cols = getattr(model, "feature_name_", None)
        if expected_cols and any(col in feats.columns for col in expected_cols):
            feats = feats.reindex(columns=expected_cols)

        pred = model.predict(feats)[0]
        return le_gesture.inverse_transform([pred])[0]

    except Exception as e:
        # Keep the fallback, but report the failure so a systematic error does
        # not pass unnoticed as a column of identical predictions.
        print(f"predict() failed; returning fallback class: {e!r}")
        return le_gesture.classes_[0]


In [49]:
# Inference Server Setup
import kaggle_evaluation.cmi_inference_server

inference_server = kaggle_evaluation.cmi_inference_server.CMIInferenceServer(predict)

# When KAGGLE_IS_COMPETITION_RERUN is set, call serve(); otherwise run the
# local gateway against the local test CSV files.
is_competition_rerun = os.getenv('KAGGLE_IS_COMPETITION_RERUN')
if not is_competition_rerun:
    local_data_paths = (
        'input/test.csv',
        'input/test_demographics.csv',
    )
    inference_server.run_local_gateway(data_paths=local_data_paths)
else:
    inference_server.serve()

In [50]:
# Display the contents of submission.parquet with a fresh 0..n-1 index.
submission_preview = pd.read_parquet('submission.parquet')
submission_preview.reset_index(drop=True)

Unnamed: 0,sequence_id,gesture
0,SEQ_000001,Above ear - pull hair
1,SEQ_000011,Above ear - pull hair
