# Imports and configs

In [None]:
from google.colab import drive
import os

# 1. Mount Google Drive.
drive.mount('/content/drive', force_remount=True)

# 2. Create the working folder (if needed) and move into it.
WORKING_DIR = "/content/drive/MyDrive/Kaggle/MABe"

# os.chdir raises FileNotFoundError when the folder is missing (e.g. on a
# fresh Drive), so make sure it exists before changing directory.
os.makedirs(WORKING_DIR, exist_ok=True)
os.chdir(WORKING_DIR)
print(f"現在の作業ディレクトリ: {os.getcwd()}")

In [None]:
# @title スコア評価用関数
"""F Beta customized for the data format of the MABe challenge."""

import json

from collections import defaultdict

import pandas as pd
import polars as pl


class HostVisibleError(Exception):
    """Raised by the metric when a submission violates its constraints."""


def single_lab_f1(lab_solution: pl.DataFrame, lab_submission: pl.DataFrame, beta: float = 1) -> float:
    """Return the F-beta score of one lab, averaged over its labelled actions.

    Frames are counted per key (``label_key`` in the solution,
    ``prediction_key`` in the submission). True positives, false negatives and
    false positives are accumulated per action, an F-beta is computed for each
    action that appears in the solution, and the unweighted mean is returned.

    Args:
        lab_solution: Solution rows of one lab. Reads ``label_key``,
            ``video_id``, ``behaviors_labeled`` (a JSON list of
            ``"agent,target,action"`` strings), ``start_frame``, ``stop_frame``.
        lab_submission: Submission rows for the same videos. Reads
            ``prediction_key``, ``video_id``, ``agent_id``, ``target_id``,
            ``action``, ``start_frame``, ``stop_frame``.
        beta: Weight of recall relative to precision.

    Raises:
        HostVisibleError: If two kept predictions of the same agent/target
            pair cover the same frame.
        ZeroDivisionError: If the solution contains no labelled action.
    """
    label_frames: defaultdict[str, set[int]] = defaultdict(set)
    prediction_frames: defaultdict[str, set[int]] = defaultdict(set)

    # Expand every labelled interval into its frames; stop_frame is exclusive.
    for row in lab_solution.to_dicts():
        label_frames[row['label_key']].update(range(row['start_frame'], row['stop_frame']))

    for video in lab_solution['video_id'].unique():
        # behaviors_labeled is taken from the first solution row of the video.
        active_labels: str = lab_solution.filter(pl.col('video_id') == video)['behaviors_labeled'].first()  # ty: ignore
        active_labels: set[str] = set(json.loads(active_labels))
        # Frames already claimed by each "agent,target" pair within this video.
        predicted_mouse_pairs: defaultdict[str, set[int]] = defaultdict(set)

        for row in lab_submission.filter(pl.col('video_id') == video).to_dicts():
            # Since the labels are sparse, we can't evaluate prediction keys not in the active labels.
            if ','.join([str(row['agent_id']), str(row['target_id']), row['action']]) not in active_labels:
                continue

            new_frames = set(range(row['start_frame'], row['stop_frame']))
            # Ignore truly redundant predictions.
            new_frames = new_frames.difference(prediction_frames[row['prediction_key']])
            prediction_pair = ','.join([str(row['agent_id']), str(row['target_id'])])
            if predicted_mouse_pairs[prediction_pair].intersection(new_frames):
                # A single agent can have multiple targets per frame (ex: evading all other mice) but only one action per target per frame.
                raise HostVisibleError('Multiple predictions for the same frame from one agent/target pair')
            prediction_frames[row['prediction_key']].update(new_frames)
            predicted_mouse_pairs[prediction_pair].update(new_frames)

    tps = defaultdict(int)
    fns = defaultdict(int)
    fps = defaultdict(int)
    for key, pred_frames in prediction_frames.items():
        # The action is the text after the last '_' of the key.
        # NOTE(review): an action name containing '_' would be truncated here.
        action = key.split('_')[-1]
        matched_label_frames = label_frames[key]
        tps[action] += len(pred_frames.intersection(matched_label_frames))
        fns[action] += len(matched_label_frames.difference(pred_frames))
        fps[action] += len(pred_frames.difference(matched_label_frames))

    distinct_actions = set()
    for key, frames in label_frames.items():
        action = key.split('_')[-1]
        distinct_actions.add(action)
        # Labelled keys that were never predicted count entirely as false negatives.
        if key not in prediction_frames:
            fns[action] += len(frames)

    action_f1s = []
    for action in distinct_actions:
        if tps[action] + fns[action] + fps[action] == 0:
            action_f1s.append(0)
        else:
            action_f1s.append((1 + beta**2) * tps[action] / ((1 + beta**2) * tps[action] + beta**2 * fns[action] + fps[action]))
    return sum(action_f1s) / len(action_f1s)


def mouse_fbeta(solution: pd.DataFrame, submission: pd.DataFrame, beta: float = 1) -> float:
    """
    Return the mean of the per-lab F-beta scores of ``submission`` against ``solution``.

    Both frames must contain ``video_id``, ``agent_id``, ``target_id``,
    ``action``, ``start_frame`` and ``stop_frame``. The solution is also read
    for ``lab_id`` and ``behaviors_labeled``. Submission rows whose video is
    not in the solution are ignored.

    Raises:
        ValueError: If either frame is empty or lacks one of the expected columns.
        AssertionError: If any row has ``start_frame`` greater than ``stop_frame``.

    Doctests:
    >>> solution = pd.DataFrame([
    ...     {'video_id': 1, 'agent_id': 1, 'target_id': 2, 'action': 'attack', 'start_frame': 0, 'stop_frame': 10, 'lab_id': 1, 'behaviors_labeled': '["1,2,attack"]'},
    ... ])
    >>> submission = pd.DataFrame([
    ...     {'video_id': 1, 'agent_id': 1, 'target_id': 2, 'action': 'attack', 'start_frame': 0, 'stop_frame': 10},
    ... ])
    >>> mouse_fbeta(solution, submission)
    1.0

    >>> solution = pd.DataFrame([
    ...     {'video_id': 1, 'agent_id': 1, 'target_id': 2, 'action': 'attack', 'start_frame': 0, 'stop_frame': 10, 'lab_id': 1, 'behaviors_labeled': '["1,2,attack"]'},
    ... ])
    >>> submission = pd.DataFrame([
    ...     {'video_id': 1, 'agent_id': 1, 'target_id': 2, 'action': 'mount', 'start_frame': 0, 'stop_frame': 10}, # Wrong action
    ... ])
    >>> mouse_fbeta(solution, submission)
    0.0

    >>> solution = pd.DataFrame([
    ...     {'video_id': 123, 'agent_id': 1, 'target_id': 2, 'action': 'attack', 'start_frame': 0, 'stop_frame': 9, 'lab_id': 1, 'behaviors_labeled': '["1,2,attack"]'},
    ...     {'video_id': 123, 'agent_id': 1, 'target_id': 2, 'action': 'mount', 'start_frame': 15, 'stop_frame': 24, 'lab_id': 1, 'behaviors_labeled': '["1,2,attack"]'},
    ... ])
    >>> submission = pd.DataFrame([
    ...     {'video_id': 123, 'agent_id': 1, 'target_id': 2, 'action': 'attack', 'start_frame': 0, 'stop_frame': 9},
    ... ])
    >>> "%.12f" % mouse_fbeta(solution, submission)
    '0.500000000000'

    >>> solution = pd.DataFrame([
    ...     {'video_id': 123, 'agent_id': 1, 'target_id': 2, 'action': 'attack', 'start_frame': 0, 'stop_frame': 9, 'lab_id': 1, 'behaviors_labeled': '["1,2,attack"]'},
    ...     {'video_id': 123, 'agent_id': 1, 'target_id': 2, 'action': 'mount', 'start_frame': 15, 'stop_frame': 24, 'lab_id': 1, 'behaviors_labeled': '["1,2,attack"]'},
    ...     {'video_id': 345, 'agent_id': 1, 'target_id': 2, 'action': 'attack', 'start_frame': 0, 'stop_frame': 9, 'lab_id': 2, 'behaviors_labeled': '["1,2,attack"]'},
    ...     {'video_id': 345, 'agent_id': 1, 'target_id': 2, 'action': 'mount', 'start_frame': 15, 'stop_frame': 24, 'lab_id': 2, 'behaviors_labeled': '["1,2,attack"]'},
    ... ])
    >>> submission = pd.DataFrame([
    ...     {'video_id': 123, 'agent_id': 1, 'target_id': 2, 'action': 'attack', 'start_frame': 0, 'stop_frame': 9},
    ... ])
    >>> "%.12f" % mouse_fbeta(solution, submission)
    '0.250000000000'

    >>> # Overlapping solution events, one prediction matching both.
    >>> solution = pd.DataFrame([
    ...     {'video_id': 1, 'agent_id': 1, 'target_id': 2, 'action': 'attack', 'start_frame': 0, 'stop_frame': 10, 'lab_id': 1, 'behaviors_labeled': '["1,2,attack"]'},
    ...     {'video_id': 1, 'agent_id': 1, 'target_id': 2, 'action': 'attack', 'start_frame': 10, 'stop_frame': 20, 'lab_id': 1, 'behaviors_labeled': '["1,2,attack"]'},
    ... ])
    >>> submission = pd.DataFrame([
    ...     {'video_id': 1, 'agent_id': 1, 'target_id': 2, 'action': 'attack', 'start_frame': 0, 'stop_frame': 20},
    ... ])
    >>> mouse_fbeta(solution, submission)
    1.0

    >>> solution = pd.DataFrame([
    ...     {'video_id': 1, 'agent_id': 1, 'target_id': 2, 'action': 'attack', 'start_frame': 0, 'stop_frame': 10, 'lab_id': 1, 'behaviors_labeled': '["1,2,attack"]'},
    ...     {'video_id': 1, 'agent_id': 1, 'target_id': 2, 'action': 'attack', 'start_frame': 30, 'stop_frame': 40, 'lab_id': 1, 'behaviors_labeled': '["1,2,attack"]'},
    ... ])
    >>> submission = pd.DataFrame([
    ...     {'video_id': 1, 'agent_id': 1, 'target_id': 2, 'action': 'attack', 'start_frame': 0, 'stop_frame': 40},
    ... ])
    >>> mouse_fbeta(solution, submission)
    0.6666666666666666
    """
    if len(solution) == 0 or len(submission) == 0:
        raise ValueError('Missing solution or submission data')

    expected_cols = ['video_id', 'agent_id', 'target_id', 'action', 'start_frame', 'stop_frame']

    for col in expected_cols:
        if col not in solution.columns:
            raise ValueError(f'Solution is missing column {col}')
        if col not in submission.columns:
            raise ValueError(f'Submission is missing column {col}')

    # Convert the pandas inputs to polars for the rest of the computation.
    solution: pl.DataFrame = pl.DataFrame(solution)
    submission: pl.DataFrame = pl.DataFrame(submission)
    assert (solution['start_frame'] <= solution['stop_frame']).all()
    assert (submission['start_frame'] <= submission['stop_frame']).all()
    solution_videos = set(solution['video_id'].unique())
    # Need to align based on video IDs as we can't rely on the row IDs for handling public/private splits.
    submission = submission.filter(pl.col('video_id').is_in(solution_videos))

    # Build "video_agent_target_action" keys; single_lab_f1 matches labels and
    # predictions on these keys.
    solution = solution.with_columns(
        pl.concat_str(
            [
                pl.col('video_id').cast(pl.Utf8),
                pl.col('agent_id').cast(pl.Utf8),
                pl.col('target_id').cast(pl.Utf8),
                pl.col('action'),
            ],
            separator='_',
        ).alias('label_key'),
    )
    submission = submission.with_columns(
        pl.concat_str(
            [
                pl.col('video_id').cast(pl.Utf8),
                pl.col('agent_id').cast(pl.Utf8),
                pl.col('target_id').cast(pl.Utf8),
                pl.col('action'),
            ],
            separator='_',
        ).alias('prediction_key'),
    )

    # Score every lab separately, then average the lab scores with equal weight.
    lab_scores = []
    for lab in solution['lab_id'].unique():
        lab_solution = solution.filter(pl.col('lab_id') == lab).clone()
        lab_videos = set(lab_solution['video_id'].unique())
        lab_submission = submission.filter(pl.col('video_id').is_in(lab_videos)).clone()
        lab_scores.append(single_lab_f1(lab_solution, lab_submission, beta=beta))

    return sum(lab_scores) / len(lab_scores)


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str, beta: float = 1) -> float:
    """Score a submission for the MABe Challenge with the F-beta metric.

    The row-id column is removed from both frames when present (a missing
    column is ignored), then the frames are handed to ``mouse_fbeta``.
    """
    without_row_id = [
        frame.drop(columns=row_id_column_name, errors='ignore')
        for frame in (solution, submission)
    ]
    return mouse_fbeta(*without_row_id, beta=beta)

In [None]:
!pip install optuna

In [None]:
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import f1_score
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from tqdm.notebook import tqdm
from scipy.ndimage import gaussian_filter1d
from scipy.fft import rfft, rfftfreq
import numpy as np
import itertools
import warnings
import optuna
import joblib
import glob
import gc
import time
from datetime import timedelta

# Limit Optuna's logger to WARNING level and above.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Silence all Python warnings from here on.
warnings.filterwarnings('ignore')

In [None]:
# Location of the dataset zip archive on Drive
DRIVE_ZIP_PATH = "/content/drive/MyDrive/Kaggle/MABe/MABe-mouse-behavior-detection.zip"
LOCAL_DIR = "/content/input_data"
print("--- Copying & Unzipping Data to Local Disk ---")

# 1. Create the local folder
if not os.path.exists(LOCAL_DIR):
    os.makedirs(LOCAL_DIR)

# 2. Copy & unzip (skipped when train.csv is already present in LOCAL_DIR)
if not os.path.exists(f"{LOCAL_DIR}/train.csv"):
    print("Copying zip file...")
    !cp "{DRIVE_ZIP_PATH}" /content/temp_dataset.zip
    print("Unzipping...")
    !unzip -q /content/temp_dataset.zip -d "{LOCAL_DIR}"
    !rm /content/temp_dataset.zip
    print("Done!")
else:
    print("Data already exists in local disk.")


In [None]:

class CFG:
    """Notebook-wide configuration: data paths, run mode, CV splitter and base model."""
    input_dir = "/content/input_data"
    # IPython shell magic: list the contents of the input directory.
    !ls {input_dir}
    train_path = f"{input_dir}/train.csv"
    test_path = f"{input_dir}/test.csv"
    train_annotation_path = f"{input_dir}/train_annotation"
    train_tracking_path = f"{input_dir}/train_tracking"
    test_tracking_path = f"{input_dir}/test_tracking"

    model_path = "."      # where models are saved
    model_name = "xgboost"

    # Run mode; 'validate' makes the notebook build the solution frame.
    mode = "validate"
    #mode = "submit"

    n_splits = 5
    cv = StratifiedGroupKFold(n_splits)

    model = XGBClassifier(
            verbosity=0,
            random_state=42,
            device='cuda',
            tree_method='hist',
            n_estimators=3000,

            # Optuna Best Params
            max_depth=6,
            learning_rate=0.132,
            min_child_weight=10,
            subsample=0.728,
            colsample_bytree=0.771,
            reg_alpha=1.58,
            reg_lambda=3.44,
            early_stopping_rounds=50,
        )

# Report that the configuration is ready and where models will be written.
print("CFG setup complete.")
print("Model will be saved to: {}".format(CFG.model_path))

# Data loading and preprocessing

In [None]:
train = pd.read_csv(CFG.train_path)
# Mice per video: 4 minus the number of missing mouseN_strain columns.
train['n_mice'] = 4 - train[['mouse1_strain', 'mouse2_strain', 'mouse3_strain', 'mouse4_strain']].isna().sum(axis=1)
train_without_mabe22_c_c = train[~train['lab_id'].str.startswith(('MABe22', 'CalMS21', 'CRIM13'))] # drops the older datasets (MABe22, CalMS21, CRIM13) that are excluded from training
# `train` itself only drops CalMS21 and CRIM13; MABe22 rows are kept here.
train = train[~train['lab_id'].str.startswith(('CalMS21', 'CRIM13'))]
test = pd.read_csv(CFG.test_path)

In [None]:
body_parts_tracked_list = list(np.unique(train.body_parts_tracked)) # list of the distinct values of the body_parts_tracked column (one entry per tracked body-part set)

## Creating solution data

In [None]:
def create_solution_df(dataset):
    """Build the solution frame used for scoring from the annotation files.

    For every row of ``dataset`` (one video) outside the MABe22 / CalMS21 /
    CRIM13 labs, the annotation parquet at
    ``{CFG.train_annotation_path}/{lab_id}/{video_id}.parquet`` is loaded and
    extended with ``lab_id``, ``video_id`` and ``behaviors_labeled``. Agent and
    target ids are rewritten as ``"mouse<N>"``; a target equal to the agent
    becomes ``"self"``. Videos without an annotation file are skipped.

    Args:
        dataset: DataFrame with ``lab_id``, ``video_id`` and
            ``behaviors_labeled`` columns.

    Returns:
        The concatenation of all loaded annotation frames.

    Raises:
        ValueError: If no annotation file could be loaded for any row.
    """
    excluded_labs = ('MABe22', 'CalMS21', 'CRIM13')
    annotations = []
    for _, row in tqdm(dataset.iterrows(), total=len(dataset)):

        lab_id = row['lab_id']
        if lab_id.startswith(excluded_labs):
            continue

        video_id = row['video_id']
        path = f"{CFG.train_annotation_path}/{lab_id}/{video_id}.parquet"
        try:
            annot = pd.read_parquet(path)
        except FileNotFoundError:
            continue

        annot['lab_id'] = lab_id
        annot['video_id'] = video_id
        annot['behaviors_labeled'] = row['behaviors_labeled']
        # target_id must be rewritten first: it is compared with the raw agent_id.
        annot['target_id'] = np.where(annot.target_id != annot.agent_id, annot['target_id'].apply(lambda s: f"mouse{s}"), 'self')
        annot['agent_id'] = annot['agent_id'].apply(lambda s: f"mouse{s}")
        annotations.append(annot)

    if not annotations:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(
            f"No annotation files found under {CFG.train_annotation_path} for the given dataset"
        )

    return pd.concat(annotations)

# The solution frame is only built in validation mode.
if CFG.mode == 'validate':
    solution = create_solution_df(train_without_mabe22_c_c)

## Data generator

In [None]:
# 1. 速度チェック (スパイク除去用)

def clean_speed_outliers(df, pix_per_cm, threshold_speed_cm=10.0, smooth_sigma=1.0):
    """Remove tracking spikes where a keypoint jumps too far between frames.

    For every (mouse, body part) pair whose ``x`` and ``y`` columns both exist,
    a frame is an outlier when the keypoint moved more than
    ``threshold_speed_cm`` since the previous row. Outliers are replaced by
    linear interpolation and the whole track is then Gaussian-smoothed.
    Tracks without outliers are left untouched. Cells that were NaN in the
    input are NaN in the output.

    Args:
        df: Tracking frame with 3-level columns (coordinate 'x'/'y', mouse id,
            body part). Levels named 'mouse_id' / 'bodypart' are used when
            present, otherwise level positions 1 and 2.
        pix_per_cm: Pixels per centimetre, used to convert the threshold.
        threshold_speed_cm: Maximum allowed per-frame displacement in cm.
        smooth_sigma: Sigma of the Gaussian filter applied to repaired tracks.

    Returns:
        A cleaned copy of ``df``; the input is not modified.
    """
    df_clean = df.copy()

    # Convert the threshold from centimetres to pixels.
    threshold_speed_px = threshold_speed_cm * pix_per_cm

    # Remember the originally missing cells so they can be restored at the end.
    original_na_mask = df_clean.isna()

    try:
        mouse_ids = df_clean.columns.get_level_values('mouse_id').unique()
        bodyparts = df_clean.columns.get_level_values('bodypart').unique()
    except KeyError:
        # Column levels are unnamed: fall back to their positions.
        mouse_ids = df_clean.columns.get_level_values(1).unique()
        bodyparts = df_clean.columns.get_level_values(2).unique()

    for mid in mouse_ids:
        for bp in bodyparts:
            col_x = ('x', mid, bp)
            col_y = ('y', mid, bp)

            if col_x not in df_clean.columns or col_y not in df_clean.columns:
                continue

            # Per-frame displacement of this keypoint, in pixels.
            dx = df_clean[col_x].diff()
            dy = df_clean[col_y].diff()
            speed = np.sqrt(dx**2 + dy**2)

            outlier_mask = speed > threshold_speed_px
            if not outlier_mask.any():
                continue

            # Blank out the outlier frames on both coordinates.
            df_clean.loc[outlier_mask, col_x] = np.nan
            df_clean.loc[outlier_mask, col_y] = np.nan

            for col in (col_x, col_y):
                # Linear interpolation, then bfill/ffill as a safety net so the
                # Gaussian filter never sees NaN. Series.bfill()/ffill() replace
                # the deprecated fillna(method=...) form.
                filled = df_clean[col].interpolate(method='linear', limit_direction='both')
                filled = filled.bfill().ffill()
                df_clean[col] = gaussian_filter1d(filled, sigma=smooth_sigma).astype(np.float32)

    # Put NaN back wherever the input was NaN.
    df_clean = df_clean.mask(original_na_mask, np.nan)

    return df_clean

# 2. 距離チェック

def clean_distance_outliers(df, pix_per_cm, threshold_dist_cm=15.0, smooth_sigma=1.0):
    """Remove keypoints that lie too far from their mouse's body centre.

    For each mouse the reference point is its ``body_center`` keypoint, or the
    mean of all its keypoints when ``body_center`` is absent. A frame of any
    other body part is an outlier when it is more than ``threshold_dist_cm``
    from that reference. Outliers are replaced by linear interpolation and the
    whole track is then Gaussian-smoothed. Tracks without outliers are left
    untouched. Cells that were NaN in the input are NaN in the output.

    Args:
        df: Tracking frame with 3-level columns (coordinate 'x'/'y', mouse id,
            body part). Levels named 'mouse_id' / 'bodypart' are used when
            present, otherwise level positions 1 and 2.
        pix_per_cm: Pixels per centimetre, used to convert the threshold.
        threshold_dist_cm: Maximum allowed distance from the centre in cm.
        smooth_sigma: Sigma of the Gaussian filter applied to repaired tracks.

    Returns:
        A cleaned copy of ``df``; the input is not modified.
    """
    df_clean = df.copy()

    # Convert the threshold from centimetres to pixels.
    threshold_dist_px = threshold_dist_cm * pix_per_cm

    # Remember the originally missing cells so they can be restored at the end.
    original_na_mask = df_clean.isna()

    try:
        mouse_ids = df_clean.columns.get_level_values('mouse_id').unique()
        bodyparts = df_clean.columns.get_level_values('bodypart').unique()
    except KeyError:
        # Column levels are unnamed: fall back to their positions.
        mouse_ids = df_clean.columns.get_level_values(1).unique()
        bodyparts = df_clean.columns.get_level_values(2).unique()

    for mid in mouse_ids:
        # Reference point, computed once per mouse before any repair.
        if ('x', mid, 'body_center') in df_clean.columns:
            center_x = df_clean[('x', mid, 'body_center')]
            center_y = df_clean[('y', mid, 'body_center')]
        else:
            # No body_center: use the mean of the mouse's keypoints instead.
            # Selecting the mouse drops level 1, so the coordinate ('x'/'y')
            # is level 0 of the remaining (coordinate, bodypart) columns.
            per_mouse = df_clean.xs(mid, level=1, axis=1)
            center_x = per_mouse.xs('x', level=0, axis=1).mean(axis=1)
            center_y = per_mouse.xs('y', level=0, axis=1).mean(axis=1)

        for bp in bodyparts:
            # body_center is the reference itself and is never checked.
            if bp == 'body_center':
                continue

            col_x = ('x', mid, bp)
            col_y = ('y', mid, bp)

            if col_x not in df_clean.columns or col_y not in df_clean.columns:
                continue

            dist_from_center = np.sqrt((df_clean[col_x] - center_x)**2 + (df_clean[col_y] - center_y)**2)
            outlier_mask = dist_from_center > threshold_dist_px
            if not outlier_mask.any():
                continue

            # Blank out the outlier frames on both coordinates.
            df_clean.loc[outlier_mask, col_x] = np.nan
            df_clean.loc[outlier_mask, col_y] = np.nan

            for col in (col_x, col_y):
                # Linear interpolation, then bfill/ffill as a safety net so the
                # Gaussian filter never sees NaN. Series.bfill()/ffill() replace
                # the deprecated fillna(method=...) form.
                filled = df_clean[col].interpolate(method='linear', limit_direction='both')
                filled = filled.bfill().ffill()
                df_clean[col] = gaussian_filter1d(filled, sigma=smooth_sigma).astype(np.float32)

    # Put NaN back wherever the input was NaN.
    df_clean = df_clean.mask(original_na_mask, np.nan)

    return df_clean

In [None]:
# Body parts removed by generate_mouse_data from videos that track more than
# five distinct body parts.
drop_body_parts =  [
    'headpiece_bottombackleft', 'headpiece_bottombackright', 'headpiece_bottomfrontleft', 'headpiece_bottomfrontright',
    'headpiece_topbackleft', 'headpiece_topbackright', 'headpiece_topfrontleft', 'headpiece_topfrontright',
    'spine_1', 'spine_2', 'tail_middle_1', 'tail_middle_2', 'tail_midpoint'
]

def generate_mouse_data(dataset, traintest, traintest_directory=None, generate_single=True, generate_pair=True):
    """Yield cleaned tracking data per mouse and per mouse pair, video by video.

    For each row of ``dataset`` (one video) the tracking parquet is loaded,
    pivoted to one row per frame, cleaned (speed outliers, body-centre
    imputation, distance outliers) and converted from pixels to centimetres.
    Rows from the MABe22 / CalMS21 / CRIM13 labs, rows whose
    ``behaviors_labeled`` is not a string, and videos whose tracking file (or,
    in train mode, annotation file) is missing are skipped.

    Args:
        dataset: DataFrame with ``lab_id``, ``video_id``, ``behaviors_labeled``
            and ``pix_per_cm_approx`` columns.
        traintest: ``'train'`` to also load annotations and yield frame labels;
            any other value yields the list of candidate actions instead.
        traintest_directory: Folder holding ``{lab_id}/{video_id}.parquet``
            tracking files. Defaults to
            ``{CFG.input_dir}/{traintest}_tracking``.
        generate_single: Yield single-mouse items (behaviours targeting 'self').
        generate_pair: Yield items for every ordered (agent, target) pair.

    Yields:
        Tuples ``(kind, data, meta, labels_or_actions)`` where ``kind`` is
        ``'single'`` or ``'pair'``, ``data`` holds the coordinates (for pairs,
        agent and target concatenated under the keys 'A' and 'B'), ``meta``
        has ``video_id``, ``agent_id``, ``target_id`` and ``video_frame``, and
        the last element is a 0/1 label frame (train) or an array of action
        names (otherwise).
    """
    if traintest_directory is None:
        # Default to the tracking folder under the configured input directory.
        traintest_directory = f"{CFG.input_dir}/{traintest}_tracking"

    for _, row in dataset.iterrows():
        lab_id = row.lab_id
        if lab_id.startswith(('MABe22', 'CalMS21', 'CRIM13')) or not isinstance(row.behaviors_labeled, str):
            continue

        video_id = row.video_id
        path = f"{traintest_directory}/{lab_id}/{video_id}.parquet"

        try:
            vid = pd.read_parquet(path)
        except FileNotFoundError:
            continue

        if len(np.unique(vid.bodypart)) > 5:
            vid = vid.query("~ bodypart.isin(@drop_body_parts)")

        # One row per frame; columns are (x/y, mouse_id, bodypart).
        pvid = vid.pivot(columns=['mouse_id', 'bodypart'], index='video_frame', values=['x', 'y'])
        pvid = pvid.astype(np.float32)

        del vid
        gc.collect()

        # 1. Speed check (spike removal); skipped if the helper is not defined.
        if 'clean_speed_outliers' in globals():
            pvid = clean_speed_outliers(pvid,
                                        pix_per_cm=row.pix_per_cm_approx,
                                        threshold_speed_cm=5.0)

        # 2. Body-centre imputation for mice that have no body_center keypoint.
        try:
            mouse_ids = np.unique(pvid.columns.get_level_values('mouse_id'))

            for mid in mouse_ids:
                try:
                    # Columns of mouse_data are (x/y, bodypart).
                    mouse_data = pvid.xs(mid, level='mouse_id', axis=1)
                    available_parts = list(np.unique(mouse_data.columns.get_level_values('bodypart')))
                    # Nothing to do when body_center is already tracked.
                    if 'body_center' in available_parts:
                        continue

                    # Priority 3 (lowest): mean of all body parts.
                    mean_x = mouse_data.xs('x', level=0, axis=1).mean(axis=1)
                    mean_y = mouse_data.xs('y', level=0, axis=1).mean(axis=1)

                    # Start from the mean; better estimates override it below.
                    final_x = mean_x
                    final_y = mean_y

                    # Priority 2 (medium): point between the ear midpoint and the tail base.
                    if {'ear_left', 'ear_right', 'tail_base'}.issubset(available_parts):
                        neck_x = (mouse_data[('x', 'ear_left')] + mouse_data[('x', 'ear_right')]) / 2
                        neck_y = (mouse_data[('y', 'ear_left')] + mouse_data[('y', 'ear_right')]) / 2
                        tail_x = mouse_data[('x', 'tail_base')]
                        tail_y = mouse_data[('y', 'tail_base')]

                        ratio_ear = 0.541
                        est_x_ear = neck_x + (tail_x - neck_x) * ratio_ear
                        est_y_ear = neck_y + (tail_y - neck_y) * ratio_ear

                        # Prefer this estimate; keep the previous value where it is NaN.
                        final_x = est_x_ear.combine_first(final_x)
                        final_y = est_y_ear.combine_first(final_y)

                    # Priority 1 (highest): point between the tracked neck and the tail base.
                    if {'neck', 'tail_base'}.issubset(available_parts):
                        neck_x = mouse_data[('x', 'neck')]
                        neck_y = mouse_data[('y', 'neck')]
                        tail_x = mouse_data[('x', 'tail_base')]
                        tail_y = mouse_data[('y', 'tail_base')]

                        ratio_neck = 0.568
                        est_x_neck = neck_x + (tail_x - neck_x) * ratio_neck
                        est_y_neck = neck_y + (tail_y - neck_y) * ratio_neck

                        # Prefer this estimate; keep the previous value where it is NaN.
                        final_x = est_x_neck.combine_first(final_x)
                        final_y = est_y_neck.combine_first(final_y)

                    # Finish by filling any remaining gaps with linear interpolation.
                    final_x = final_x.interpolate(method='linear', limit_direction='both')
                    final_y = final_y.interpolate(method='linear', limit_direction='both')

                    pvid[('x', mid, 'body_center')] = final_x
                    pvid[('y', mid, 'body_center')] = final_y

                except Exception as e:
                    # Best effort: a failure for one mouse must not stop the others.
                    print(f"Error imputation mouse {mid}: {e}")

        except Exception as e:
            # Best effort: the video is still processed without imputation.
            print(f"❌ Error in Body Center Imputation: {e}")
            print(f"Error in processing video {video_id}: {e}")

        # 3. Distance check (final clean-up); skipped if the helper is not defined.
        if 'clean_distance_outliers' in globals():
            pvid = clean_distance_outliers(pvid,
                                           pix_per_cm=row.pix_per_cm_approx,
                                           threshold_dist_cm=15.0)

        # Reorder columns to (mouse_id, bodypart, x/y), sort them, convert to cm.
        pvid = pvid.reorder_levels([1, 2, 0], axis=1).T.sort_index().T
        pvid /= row.pix_per_cm_approx

        # behaviors_labeled is a JSON list of "agent,target,action" strings.
        vid_behaviors = json.loads(row.behaviors_labeled)
        vid_behaviors = sorted(list({b.replace("'", "") for b in vid_behaviors}))
        vid_behaviors = [b.split(',') for b in vid_behaviors]
        vid_behaviors = pd.DataFrame(vid_behaviors, columns=['agent', 'target', 'action'])

        if traintest == 'train':
            try:
                annot = pd.read_parquet(path.replace('train_tracking', 'train_annotation'))
            except FileNotFoundError:
                continue

        # Single-mouse items (behaviours whose target is 'self').
        if generate_single:
            vid_behaviors_subset = vid_behaviors.query("target == 'self'")
            for mouse_id_str in np.unique(vid_behaviors_subset.agent):
                try:
                    # The numeric id is the last character of e.g. "mouse1".
                    mouse_id = int(mouse_id_str[-1])
                    vid_agent_actions = np.unique(vid_behaviors_subset.query("agent == @mouse_id_str").action)
                    single_mouse = pvid.loc[:, mouse_id]
                    assert len(single_mouse) == len(pvid)
                    single_mouse_meta = pd.DataFrame({
                        'video_id': video_id,
                        'agent_id': mouse_id_str,
                        'target_id': 'self',
                        'video_frame': single_mouse.index
                    })
                    if traintest == 'train':
                        single_mouse_label = pd.DataFrame(0.0, columns=vid_agent_actions, index=single_mouse.index)
                        annot_subset = annot.query("(agent_id == @mouse_id) & (target_id == @mouse_id)")
                        for i in range(len(annot_subset)):
                            annot_row = annot_subset.iloc[i]
                            # Label-based slice: stop_frame is included.
                            single_mouse_label.loc[annot_row['start_frame']:annot_row['stop_frame'], annot_row.action] = 1.0
                        yield 'single', single_mouse, single_mouse_meta, single_mouse_label
                    else:
                        yield 'single', single_mouse, single_mouse_meta, vid_agent_actions
                except KeyError:
                    # Skip a mouse whose columns or labelled actions cannot be looked up.
                    pass

        # Pair items: every ordered (agent, target) pair of tracked mice.
        if generate_pair:
            vid_behaviors_subset = vid_behaviors.query("target != 'self'")
            if len(vid_behaviors_subset) > 0:
                for agent, target in itertools.permutations(np.unique(pvid.columns.get_level_values('mouse_id')), 2):
                    agent_str = f"mouse{agent}"
                    target_str = f"mouse{target}"
                    vid_agent_actions = np.unique(vid_behaviors_subset.query("(agent == @agent_str) & (target == @target_str)").action)
                    mouse_pair = pd.concat([pvid[agent], pvid[target]], axis=1, keys=['A', 'B'])
                    assert len(mouse_pair) == len(pvid)
                    mouse_pair_meta = pd.DataFrame({
                        'video_id': video_id,
                        'agent_id': agent_str,
                        'target_id': target_str,
                        'video_frame': mouse_pair.index
                    })
                    if traintest == 'train':
                        mouse_pair_label = pd.DataFrame(0.0, columns=vid_agent_actions, index=mouse_pair.index)
                        annot_subset = annot.query("(agent_id == @agent) & (target_id == @target)")
                        for i in range(len(annot_subset)):
                            annot_row = annot_subset.iloc[i]
                            # Label-based slice: stop_frame is included.
                            mouse_pair_label.loc[annot_row['start_frame']:annot_row['stop_frame'], annot_row.action] = 1.0
                        yield 'pair', mouse_pair, mouse_pair_meta, mouse_pair_label
                    else:
                        yield 'pair', mouse_pair, mouse_pair_meta, vid_agent_actions
## Transforming coordinates

In [None]:
def safe_rolling(series, window, func, min_periods=None):
    """Apply ``func`` over a centred rolling window of ``series``.

    When ``min_periods`` is omitted it defaults to a quarter of the window,
    but never less than one observation. ``func`` receives raw arrays.
    """
    required = max(1, window // 4) if min_periods is None else min_periods
    roller = series.rolling(window, min_periods=required, center=True)
    return roller.apply(func, raw=True)

def _scale(n_frames_at_30fps, fps, ref=30.0):
    return max(1, int(round(n_frames_at_30fps * float(fps) / ref)))

def _scale_signed(n_frames_at_30fps, fps, ref=30.0):
    if n_frames_at_30fps == 0:
        return 0
    s = 1 if n_frames_at_30fps > 0 else -1
    mag = max(1, int(round(abs(n_frames_at_30fps) * float(fps) / ref)))
    return s * mag

def _fps_from_meta(meta_df, fallback_lookup, default_fps=30.0):
    if 'frames_per_second' in meta_df.columns and pd.notnull(meta_df['frames_per_second']).any():
        return float(meta_df['frames_per_second'].iloc[0])
    vid = meta_df['video_id'].iloc[0]
    return float(fallback_lookup.get(vid, default_fps))

def add_curvature_features(X, center_x, center_y, fps):
    """Add path-curvature and turn-rate features to ``X`` and return it.

    Columns written: ``curv_mean_25/50/75`` (rolling mean of the curvature)
    and ``turn_rate_30`` (rolling sum of the absolute heading change). Window
    lengths are given at 30 fps and rescaled to ``fps`` with ``_scale``.
    """
    dx, dy = center_x.diff(), center_y.diff()
    ddx, ddy = dx.diff(), dy.diff()

    # |v x a| / |v|^3; the epsilon avoids division by zero when the mouse is still.
    cross = dx * ddy - dy * ddx
    velocity_norm = np.sqrt(dx**2 + dy**2)
    curvature = np.abs(cross) / (velocity_norm**3 + 1e-6)

    for w in (25, 50, 75):
        ws = _scale(w, fps)
        X[f'curv_mean_{w}'] = curvature.rolling(ws, min_periods=max(1, ws // 5)).mean()

    # Heading of the velocity vector and its frame-to-frame absolute change.
    heading = np.arctan2(dy, dx)
    heading_step = heading.diff().abs()
    w = 30
    ws = _scale(w, fps)
    X[f'turn_rate_{w}'] = heading_step.rolling(ws, min_periods=max(1, ws // 5)).sum()

    return X

def add_multiscale_features(X, center_x, center_y, fps):
    """Add rolling speed statistics at several time scales to ``X`` and return it.

    For each scale (20/40/60/80 frames at 30 fps, rescaled to ``fps``) that
    fits in the series, the rolling mean ``sp_m<scale>`` and standard
    deviation ``sp_s<scale>`` of the speed are written. ``sp_ratio`` is the
    shortest-scale mean divided by the longest-scale mean, when both exist.
    """
    step_x = center_x.diff()
    step_y = center_y.diff()
    speed = np.sqrt(step_x**2 + step_y**2) * float(fps)

    scales = [20, 40, 60, 80]
    for scale in scales:
        ws = _scale(scale, fps)
        if len(speed) < ws:
            continue
        roller = speed.rolling(ws, min_periods=max(1, ws // 4))
        X[f'sp_m{scale}'] = roller.mean()
        X[f'sp_s{scale}'] = roller.std()

    short_col = f'sp_m{scales[0]}'
    long_col = f'sp_m{scales[-1]}'
    if short_col in X.columns and long_col in X.columns:
        # The epsilon keeps the ratio finite when the long-scale mean is zero.
        X['sp_ratio'] = X[short_col] / (X[long_col] + 1e-6)

    return X

def add_state_features(X, center_x, center_y, fps):
    """Add speed-state occupancy and transition features to ``X`` and return it.

    The smoothed speed is binned into four states (labels 0-3). For each
    window (20/40/60/80 frames at 30 fps, rescaled to ``fps``) that fits in
    the series, the fraction of frames in states 0 and 1 (``s<state>_<window>``)
    and the number of state changes (``trans_<window>``) are written. Any
    error during this step is swallowed and ``X`` is returned as it stands.
    """
    raw_speed = np.sqrt(center_x.diff()**2 + center_y.diff()**2) * float(fps)
    smooth_win = _scale(15, fps)
    smoothed = raw_speed.rolling(smooth_win, min_periods=max(1, smooth_win // 3)).mean()

    try:
        edges = [-np.inf, 0.5 * fps, 2.0 * fps, 5.0 * fps, np.inf]
        states = pd.cut(smoothed, bins=edges, labels=[0, 1, 2, 3]).astype(float)

        for window in (20, 40, 60, 80):
            ws = _scale(window, fps)
            if len(states) < ws:
                continue
            min_obs = max(1, ws // 5)
            for state in (0, 1):
                in_state = (states == state).astype(float)
                X[f's{state}_{window}'] = in_state.rolling(ws, min_periods=min_obs).mean()
            # A frame counts as a change when its state differs from the previous frame's.
            changed = (states != states.shift(1)).astype(float)
            X[f'trans_{window}'] = changed.rolling(ws, min_periods=min_obs).sum()
    except Exception:
        # Best effort: these features are optional.
        pass

    return X

def add_longrange_features(X, center_x, center_y, fps):
    """Add long-horizon position averages to ``X`` and return it.

    For horizons of 30/60/120 frames (at 30 fps, rescaled to ``fps``) this
    writes rolling means ``x_ml<h>`` / ``y_ml<h>`` (only when the series is at
    least one window long) and exponentially weighted means ``x_e<h>`` /
    ``y_e<h>``.
    """
    horizons = (30, 60, 120)

    for window in horizons:
        ws = _scale(window, fps)
        if len(center_x) < ws:
            continue
        min_obs = max(5, ws // 6)
        X[f'x_ml{window}'] = center_x.rolling(ws, min_periods=min_obs).mean()
        X[f'y_ml{window}'] = center_y.rolling(ws, min_periods=min_obs).mean()

    for span in horizons:
        scaled_span = _scale(span, fps)
        X[f'x_e{span}'] = center_x.ewm(span=scaled_span, min_periods=1).mean()
        X[f'y_e{span}'] = center_y.ewm(span=scaled_span, min_periods=1).mean()

    return X

def add_interaction_features(X, mouse_pair, avail_A, avail_B, fps):
    """Add agent/target interaction features to ``X`` and return it.

    Requires ``body_center`` for both mice (checked via ``avail_A`` and
    ``avail_B``); otherwise ``X`` is returned unchanged. Columns written:
    ``A_ld<w>`` / ``B_ld<w>`` (rolling mean of each mouse's lead term),
    ``chase_30`` and ``sp_cor<w>`` (rolling correlation of the two speeds).
    Window lengths are given at 30 fps and rescaled to ``fps``.
    """
    if not ('body_center' in avail_A and 'body_center' in avail_B):
        return X

    center_a = mouse_pair['A']['body_center']
    center_b = mouse_pair['B']['body_center']

    # Vector from B's body centre to A's, and its length.
    rel_x = center_a['x'] - center_b['x']
    rel_y = center_a['y'] - center_b['y']
    rel_dist = np.sqrt(rel_x**2 + rel_y**2)

    a_vx, a_vy = center_a['x'].diff(), center_a['y'].diff()
    b_vx, b_vy = center_b['x'].diff(), center_b['y'].diff()
    a_speed = np.sqrt(a_vx**2 + a_vy**2)
    b_speed = np.sqrt(b_vx**2 + b_vy**2)

    # Normalised dot product of each velocity with the relative vector
    # (B uses the reversed vector); the epsilon avoids division by zero.
    a_lead = (a_vx * rel_x + a_vy * rel_y) / (a_speed * rel_dist + 1e-6)
    b_lead = (b_vx * (-rel_x) + b_vy * (-rel_y)) / (b_speed * rel_dist + 1e-6)

    for window in (30, 60):
        ws = _scale(window, fps)
        min_obs = max(1, ws // 6)
        X[f'A_ld{window}'] = a_lead.rolling(ws, min_periods=min_obs).mean()
        X[f'B_ld{window}'] = b_lead.rolling(ws, min_periods=min_obs).mean()

    # Positive when the distance between the mice shrinks.
    closing_rate = -rel_dist.diff()
    chase = closing_rate * b_lead
    w = 30
    ws = _scale(w, fps)
    X[f'chase_{w}'] = chase.rolling(ws, min_periods=max(1, ws // 6)).mean()

    for window in (60, 120):
        ws = _scale(window, fps)
        X[f'sp_cor{window}'] = a_speed.rolling(ws, min_periods=max(1, ws // 6)).corr(b_speed)

    return X

In [None]:
def add_pose_pca(X, coords_df, n_components=3, prefix="pose"):
    """Append PCA pose components computed from one mouse's coordinates.

    The numeric columns of ``coords_df`` are linearly interpolated in both
    directions, any remaining NaN is replaced by 0, and a whitened PCA is
    fitted on the result. Component ``i`` (1-based) is stored in the column
    ``{prefix}_pca{i}``.

    The step is best-effort: when there are fewer frames or fewer coordinate
    columns than ``n_components``, or when fitting fails, ``X`` is returned
    without the PCA columns. Failures are reported with a printed warning
    instead of being swallowed silently.

    Args:
        X: Feature frame; columns are added to it in place.
        coords_df: Coordinate frame of one mouse (one row per frame).
        n_components: Number of principal components to keep.
        prefix: Prefix of the generated column names.

    Returns:
        The same ``X`` object.
    """
    # PCA cannot handle NaN: interpolate first, then zero-fill what is left.
    coords_flat = coords_df.select_dtypes(include=[np.number])
    coords_filled = coords_flat.interpolate(method='linear', limit_direction='both').fillna(0)

    # A PCA with k components needs at least k samples and k features.
    n_rows, n_cols = coords_filled.shape
    if min(n_rows, n_cols) < n_components:
        return X

    try:
        # `PCA` comes from module scope (presumably sklearn.decomposition.PCA);
        # whiten=True rescales every component to unit variance.
        pca = PCA(n_components=n_components, whiten=True)
        components = pca.fit_transform(coords_filled)

        for i in range(n_components):
            X[f'{prefix}_pca{i+1}'] = components[:, i]

    except Exception as e:
        # Keep the pipeline running, but make the failure visible.
        print(f"Warning: pose PCA failed for prefix '{prefix}': {e}")

    return X

In [None]:
def add_relative_angle_features(X, mouse_pair, fps):
    """Append heading / gaze-direction features for a pair of mice to ``X``.

    A mouse's heading is the vector from ``tail_base`` to ``nose``; the
    direction towards the partner is the vector between the two
    ``body_center`` points. Missing body parts are replaced by NaN so the
    derived features are NaN rather than raising.

    Columns written, in this order:

    * ``cos_A2B``, ``sin_A2B``, ``cos_B2A``, ``cos_face``
    * ``watch_{30,45}deg``: 1.0 when B lies within that angle of A's heading,
      0.0 otherwise, NaN where the angle is undefined
    * for w in 10, 30, 60 (converted with ``_scale``): ``ang_std_{w}``,
      ``m_cos_A2B_{w}`` and, per angle, ``watch_ratio_*``, ``watch_dur_*``,
      ``mutual_ratio_*``

    Any exception raised while computing is swallowed; columns created before
    the failure stay in ``X``.

    Args:
        X: Feature frame; ``None`` is passed straight through.
        mouse_pair: Frame with top-level column groups ``'A'`` and ``'B'``.
        fps: Frame rate forwarded to ``_scale``.

    Returns:
        The same ``X`` object (or ``None`` when ``X`` is ``None``).
    """
    if X is None:
        return None
    if not ('A' in mouse_pair and 'B' in mouse_pair):
        return X

    def _part_xy(mouse_df, part):
        # Missing body part -> all-NaN x/y frame, so every derived value is NaN.
        if part not in mouse_df.columns.get_level_values(0):
            return pd.DataFrame(np.nan, index=mouse_df.index, columns=['x', 'y'])
        return mouse_df[part]

    def _wrap(delta):
        # Fold an angle difference into [-pi, pi).
        return (delta + np.pi) % (2 * np.pi) - np.pi

    try:
        mouse_a = mouse_pair['A']
        nose_a = _part_xy(mouse_a, 'nose')
        tail_a = _part_xy(mouse_a, 'tail_base')
        center_a = _part_xy(mouse_a, 'body_center')

        mouse_b = mouse_pair['B']
        nose_b = _part_xy(mouse_b, 'nose')
        tail_b = _part_xy(mouse_b, 'tail_base')
        center_b = _part_xy(mouse_b, 'body_center')

        # Line from A's centre to B's centre.
        ab_x = center_b['x'] - center_a['x']
        ab_y = center_b['y'] - center_a['y']

        # Absolute orientations.
        theta_a = np.arctan2(nose_a['y'] - tail_a['y'], nose_a['x'] - tail_a['x'])
        theta_b = np.arctan2(nose_b['y'] - tail_b['y'], nose_b['x'] - tail_b['x'])
        theta_ab = np.arctan2(ab_y, ab_x)
        theta_ba = np.arctan2(-ab_y, -ab_x)

        # Relative angles.
        angle_a2b = _wrap(theta_ab - theta_a)
        angle_b2a = _wrap(theta_ba - theta_b)
        angle_face = _wrap(theta_a - theta_b)

        X['cos_A2B'] = np.cos(angle_a2b)
        X['sin_A2B'] = np.sin(angle_a2b)
        X['cos_B2A'] = np.cos(angle_b2a)
        X['cos_face'] = np.cos(angle_face)

        fov_degrees = ('30', '45')
        watching = {}
        mutual = {}
        for deg in fov_degrees:
            limit = np.deg2rad(int(deg))

            # Comparing NaN yields False (-> 0.0); restore NaN where the angle
            # itself is undefined.
            a_sees_b = (np.abs(angle_a2b) < limit).astype(float)
            a_sees_b[angle_a2b.isna()] = np.nan
            b_sees_a = (np.abs(angle_b2a) < limit).astype(float)
            b_sees_a[angle_b2a.isna()] = np.nan

            X[f'watch_{deg}deg'] = a_sees_b
            watching[deg] = a_sees_b
            mutual[deg] = a_sees_b * b_sees_a

        for w in (10, 30, 60):
            ws = _scale(w, fps)
            X[f'ang_std_{w}'] = pd.Series(angle_a2b).rolling(ws, min_periods=1).std()
            X[f'm_cos_A2B_{w}'] = X['cos_A2B'].rolling(ws, min_periods=1).mean()
            for deg in fov_degrees:
                X[f'watch_ratio_{deg}deg_{w}'] = watching[deg].rolling(ws, min_periods=1).mean()
                X[f'watch_dur_{deg}deg_{w}'] = watching[deg].ewm(span=ws, min_periods=1).mean()
                X[f'mutual_ratio_{deg}deg_{w}'] = mutual[deg].rolling(ws, min_periods=1).mean()

    except Exception:
        # Best effort: keep whatever columns were created before the failure.
        pass

    return X

In [None]:
# ==========================================
# 1. 楕円近似特徴量の計算関数 (安全装置付き)
# ==========================================
def add_ellipse_features(X, coords_df, fps, prefix=""):
    """Append body-shape features from an ellipse fitted to the keypoints.

    For every frame the 2x2 covariance matrix of the body-part coordinates is
    computed and its eigenvalues ``lambda1 >= lambda2`` are used as the
    variances along the major and minor axis of the fitted ellipse.

    Columns written (``{prefix}`` is prepended to every name):

    * ``ellipse_ratio``: ``sqrt(lambda1 / max(lambda2, 1e-6))``
    * ``ellipse_area``: ``sqrt(lambda1 * lambda2)``
    * ``ellipse_angle``: orientation of the major axis, ``0.5 * atan2(...)``
    * ``shape_chg_{w}`` / ``area_chg_{w}`` for w in 10, 30, 60: rolling mean
      of the absolute ratio change / of the signed area change

    Frames with fewer than three body parts that have both coordinates are
    left as NaN. Variances and covariance are all sample estimates
    (``ddof=1``) over the same body parts, so the eigenvalues belong to one
    consistent covariance matrix; previously the covariance was a population
    mean, which made collinear points look like a wide ellipse.

    The step is best-effort: on any error a warning is printed and ``X`` is
    returned with whatever columns were already created.

    Args:
        X: Feature frame sharing its index with ``coords_df``; modified in place.
        coords_df: Frame with MultiIndex columns ``(bodypart, 'x' | 'y')``.
        fps: Frame rate forwarded to ``_scale``.
        prefix: Column-name prefix, e.g. ``"A_"`` or ``"B_"``.

    Returns:
        The same ``X`` object.
    """
    try:
        xs = coords_df.xs('x', level=1, axis=1)
        ys = coords_df.xs('y', level=1, axis=1)

        # Only use body parts where both coordinates exist, so that variances
        # and covariance are estimated from the same samples.
        paired = xs.notna() & ys.notna()
        xs = xs.where(paired)
        ys = ys.where(paired)
        n_points = paired.sum(axis=1)

        # At least three points are needed for a meaningful 2-D spread.
        valid_mask = n_points >= 3
        if not valid_mask.any():
            return X

        xs_valid = xs.loc[valid_mask]
        ys_valid = ys.loc[valid_mask]
        n_valid = n_points.loc[valid_mask]

        var_x = xs_valid.var(axis=1)
        var_y = ys_valid.var(axis=1)

        # Sample covariance (divide by n - 1) to match DataFrame.var's ddof=1.
        dev_x = xs_valid.subtract(xs_valid.mean(axis=1), axis=0)
        dev_y = ys_valid.subtract(ys_valid.mean(axis=1), axis=0)
        cov_xy = (dev_x * dev_y).sum(axis=1) / (n_valid - 1)

        # Closed-form eigenvalues of [[var_x, cov_xy], [cov_xy, var_y]].
        term1 = (var_x + var_y) / 2
        term2 = np.sqrt((var_x - var_y)**2 + 4 * cov_xy**2) / 2
        lambda1 = term1 + term2
        # Round-off can push the minor eigenvalue slightly below zero.
        lambda2 = (term1 - term2).clip(lower=0)

        ratio_col = f'{prefix}ellipse_ratio'
        area_col = f'{prefix}ellipse_area'

        # The floor on lambda2 keeps the ratio finite for degenerate shapes.
        X.loc[valid_mask, ratio_col] = np.sqrt(lambda1 / lambda2.clip(lower=1e-6))
        X.loc[valid_mask, area_col] = np.sqrt(lambda1 * lambda2)
        X.loc[valid_mask, f'{prefix}ellipse_angle'] = 0.5 * np.arctan2(2 * cov_xy, var_x - var_y)

        for w in [10, 30, 60]:
            ws = _scale(w, fps)
            X[f'{prefix}shape_chg_{w}'] = X[ratio_col].diff().abs().rolling(ws, min_periods=1).mean()
            X[f'{prefix}area_chg_{w}'] = X[area_col].diff().rolling(ws, min_periods=1).mean()

    except Exception as e:
        # Keep the pipeline running, but make the failure visible.
        print(f"Warning: ellipse features failed for prefix '{prefix}': {e}")

    return X

def add_social_proximity_features(X, mouse_pair, fps):
    """Append distance-based proximity features for a pair of mice to ``X``.

    The distance is measured between the two ``body_center`` points, falling
    back to the two ``nose`` points; if neither part exists for both mice,
    ``X`` is returned untouched. Thresholds are expressed in cm, i.e. the
    coordinates are assumed to be in cm already (not verified here).

    Columns written:

    * ``prox_rate_{th}cm_{w}`` for th in 2.5, 5.0, 10.0 and w in 30, 90:
      fraction of the window spent closer than ``th``
    * ``prox_dur_5cm``: exponentially weighted mean (span 120) of the
      "closer than 5 cm" flag; it stays within [0, 1] and approaches 1 while
      the mice remain close
    * ``dist_p05``, ``dist_p25``, ``dist_p50``: rolling quantiles (window 60)
    * ``dist_iqr``: inter-quartile range, 75 % quantile minus 25 % quantile
      (this used to be median minus 25 % quantile, i.e. not the IQR)

    Note: frames with a NaN distance compare as "not close" and contribute
    0.0 to the proximity flags.

    Args:
        X: Feature frame; columns are added to it in place.
        mouse_pair: Frame with top-level column groups ``'A'`` and ``'B'``.
        fps: Frame rate forwarded to ``_scale``.

    Returns:
        The same ``X`` object.
    """
    parts_a = mouse_pair['A'].columns.get_level_values(0)
    parts_b = mouse_pair['B'].columns.get_level_values(0)

    # Prefer the body centre, fall back to the nose.
    for part in ('body_center', 'nose'):
        if part in parts_a and part in parts_b:
            p1, p2 = mouse_pair['A'][part], mouse_pair['B'][part]
            break
    else:
        return X

    dist = np.sqrt((p1['x'] - p2['x'])**2 + (p1['y'] - p2['y'])**2)

    # --- 1. Proximity rate: share of the window spent below each threshold ---
    thresholds = [2.5, 5.0, 10.0]
    windows = [30, 90]

    for th in thresholds:
        is_close = (dist < th).astype(float)
        for w in windows:
            ws = _scale(w, fps)
            X[f'prox_rate_{th}cm_{w}'] = is_close.rolling(ws, min_periods=1).mean()

    # --- 2. Smoothed "staying close" indicator ---
    is_close_5cm = (dist < 5.0).astype(float)
    X['prox_dur_5cm'] = is_close_5cm.ewm(span=_scale(120, fps), min_periods=1).mean()

    # --- 3. Distribution of the distance over the window ---
    ws_long = _scale(60, fps)
    roll = dist.rolling(ws_long, min_periods=ws_long//2)
    X['dist_p05'] = roll.quantile(0.05)  # near-minimum, robust to outliers
    X['dist_p25'] = roll.quantile(0.25)
    X['dist_p50'] = roll.median()
    # True inter-quartile range: Q3 - Q1.
    X['dist_iqr'] = roll.quantile(0.75) - X['dist_p25']

    return X

def add_movement_features(X, center_x, center_y, fps):
    """Add the instantaneous speed of the tracked point to ``X``.

    The per-frame displacement is multiplied by ``fps`` to obtain a speed per
    second (cm/s if the coordinates are in cm). The first frame, which has no
    predecessor, and any frame with missing coordinates get a speed of 0.

    Only ``speed`` is produced. Acceleration / jerk based columns
    (``accel``, ``jerk``, ``accel_abs``, ``jerk_abs``, their rolling means and
    changes, ``max_jerk_10``) were disabled experiments; the acceleration that
    was still being computed for them was never used and has been removed.

    Args:
        X: Feature frame; the ``speed`` column is added in place.
        center_x: Series of x coordinates.
        center_y: Series of y coordinates.
        fps: Frame rate (frames per second).

    Returns:
        The same ``X`` object.
    """
    # diff() is the movement over one frame; times fps gives units per second.
    step = np.sqrt(center_x.diff()**2 + center_y.diff()**2)
    X['speed'] = (step * float(fps)).fillna(0)
    return X

In [None]:
def add_future_features(X, fps):
    """Append look-ahead averages of selected feature columns to ``X``.

    For every selected column present in ``X`` and for each horizon (30 and
    60, converted with ``_scale``) two columns are added:

    * ``fut_m{w}_{col}``: mean over the current frame and the frames that
      follow it, obtained by running a rolling mean on the reversed series
    * ``fut_diff_{w}_{col}``: that look-ahead mean minus the current value

    Args:
        X: Feature frame; columns are added to it in place.
        fps: Frame rate forwarded to ``_scale``.

    Returns:
        The same ``X`` object.
    """
    # Superset of look-ahead sources for both single-mouse and pair frames.
    # ('speed', 'accel_abs', 'jerk_abs' and 'vel_forward' are disabled.)
    lookahead_sources = {
        'turn_rate_30',        # turning
        'body_shrink_ratio',   # posture: body shortening
        'ellipse_ratio',       # posture: body elongation
        'dist_to_wall',        # location
        'cos_A2B',             # pair: facing the partner
        'prox_rate_5.0cm_30',  # pair: recent proximity
    }

    # Follow X's own column order so the new columns are added deterministically.
    selected = [name for name in X.columns if name in lookahead_sources]

    for name in selected:
        # Reversed in time: a trailing window here is a leading window there.
        backwards = X[name].iloc[::-1]
        for w in (30, 60):
            n_frames = _scale(w, fps)
            ahead = backwards.rolling(n_frames, min_periods=1).mean().iloc[::-1]
            X[f'fut_m{w}_{name}'] = ahead
            X[f'fut_diff_{w}_{name}'] = ahead - X[name]

    return X

def calculate_band_power(y, fps, low_freq=5.0, high_freq=10.0):
    """Return the spectral power of ``y`` inside ``[low_freq, high_freq]`` Hz.

    The mean is removed, a real FFT is taken, and the squared magnitudes of
    all bins whose frequency lies in the closed band are summed.

    Args:
        y: 1-D sequence of samples.
        fps: Sampling rate in Hz.
        low_freq: Lower band edge in Hz (inclusive).
        high_freq: Upper band edge in Hz (inclusive).

    Returns:
        The summed band power; ``0.0`` for empty input or when no FFT bin
        falls inside the band.
    """
    n_samples = len(y)
    if not n_samples:
        return 0.0

    # Remove the DC component so a constant offset carries no power.
    centered = y - np.mean(y)
    bin_power = np.abs(np.fft.rfft(centered))**2
    bin_freqs = np.fft.rfftfreq(n_samples, d=1/fps)

    in_band = (bin_freqs >= low_freq) & (bin_freqs <= high_freq)
    if not in_band.any():
        return 0.0
    return np.sum(bin_power[in_band])

def add_frequency_features(X, fps):
    """Append rolling FFT band-power features for selected columns of ``X``.

    The source columns are ``accel_abs``, ``jerk_abs`` and
    ``body_shrink_ratio``, whichever exist in ``X``. For each of them the
    power in three bands is computed with ``calculate_band_power`` over a
    rolling window of ``_scale(60, fps)`` frames and stored as
    ``fft_{col}_{low|mid|high}``. Frames without a full window get 0.

    Gaps in a source column are forward-filled, then back-filled, then
    zero-filled before the FFT. ``Series.ffill`` / ``Series.bfill`` are used
    because ``fillna(method=...)`` is deprecated in current pandas.

    Args:
        X: Feature frame; columns are added to it in place.
        fps: Frame rate, used both for the window length and the FFT axis.

    Returns:
        The same ``X`` object (unchanged when no source column exists).
    """
    candidates = ('accel_abs', 'jerk_abs', 'body_shrink_ratio')
    targets = [col for col in candidates if col in X.columns]
    if not targets:
        return X

    # The FFT frequency resolution is set by the window length, so the window
    # must not be too short.
    ws = _scale(60, fps)

    # (low Hz, high Hz, column suffix)
    bands = [
        (1.0, 5.0, 'low'),
        (5.0, 10.0, 'mid'),
        (10.0, 15.0, 'high'),
    ]

    for col in targets:
        series = X[col].ffill().bfill().fillna(0)

        for low, high, name in bands:
            # Bind the band edges as defaults so the callable never depends
            # on the loop variables; raw=True passes plain numpy arrays.
            X[f'fft_{col}_{name}'] = series.rolling(ws).apply(
                lambda y, low=low, high=high: calculate_band_power(y, fps, low, high),
                raw=True,
            ).fillna(0)

    return X

In [None]:
def transform_single(single_mouse, body_parts_tracked, fps, video_id=None):
    """Build the per-frame feature matrix for a single mouse.

    Args:
        single_mouse: Coordinate frame of one mouse; the first column level
            holds the body-part name and ``single_mouse[part]`` yields the
            coordinate columns (``'x'`` / ``'y'`` are used below).
        body_parts_tracked: Ordered list of body-part names that defines the
            fixed layout of the pairwise-distance columns.
        fps: Frame rate, used to convert nominal window lengths to frames.
        video_id: Optional video identifier; when given (and
            ``add_arena_features`` is defined) arena features are added.

    Returns:
        Feature frame cast to ``float32``. The column order follows the order
        of the statements below.
    """
    available_body_parts = single_mouse.columns.get_level_values(0)

    # Sum of squared coordinate differences (squared distance, no sqrt) for
    # every pair of tracked parts; the reindex yields a fixed column layout,
    # with all-NaN columns for parts missing from this recording.
    X = pd.DataFrame({
        f"{p1}+{p2}": np.square(single_mouse[p1] - single_mouse[p2]).sum(axis=1, skipna=False)
        for p1, p2 in itertools.combinations(body_parts_tracked, 2)
        if p1 in available_body_parts and p2 in available_body_parts
    })
    X = X.reindex(columns=[f"{p1}+{p2}" for p1, p2 in itertools.combinations(body_parts_tracked, 2)], copy=False)

    # =========================================================
    # Lagged change of the distances between selected body parts
    # =========================================================

    # Body parts considered for the change features
    target_parts = ["body_center", "ear_left", "ear_right", "head", "nose", "tail_base", "tail_tip"]

    # Keep only the parts that actually exist in the data
    valid_targets = [p for p in target_parts if p in available_body_parts]

    # Find the distance columns between target parts,
    # relying on the "p1+p2" column-name format
    target_cols = []
    for col in X.columns:
        if '+' in col:
            p1, p2 = col.split('+')
            # Keep the column only when both parts are targets
            if p1 in valid_targets and p2 in valid_targets:
                target_cols.append(col)

    if len(target_cols) > 0:
        lag_frames = 10
        lag = _scale(lag_frames, fps)

        # Difference against the value `lag` frames earlier, selected columns only
        X_diff = X[target_cols].diff(lag)

        # Rename, e.g. "nose+tail_base" -> "nose+tail_base_chg10"
        X_diff.columns = [f"{c}_chg{lag_frames}" for c in X_diff.columns]

        # Append to the feature frame
        X = pd.concat([X, X_diff], axis=1)

        # Row-wise sum of all changes (NaN entries are skipped by sum)
        X[f'major_parts_expansion_{lag_frames}'] = X_diff.sum(axis=1)
    # =========================================================
    # =========================================================

    # NOTE(review): membership is tested against the column index itself here,
    # not against available_body_parts as in the other guards.
    if all(p in single_mouse.columns for p in ['ear_left', 'ear_right', 'tail_base']):
        lag = _scale(10, fps)
        shifted = single_mouse[['ear_left', 'ear_right', 'tail_base']].shift(lag)
        # Squared displacement of each ear over `lag` frames, and squared
        # distance from each ear to the earlier tail-base position.
        speeds = pd.DataFrame({
            'sp_lf': np.square(single_mouse['ear_left'] - shifted['ear_left']).sum(axis=1, skipna=False),
            'sp_rt': np.square(single_mouse['ear_right'] - shifted['ear_right']).sum(axis=1, skipna=False),
            'sp_lf2': np.square(single_mouse['ear_left'] - shifted['tail_base']).sum(axis=1, skipna=False),
            'sp_rt2': np.square(single_mouse['ear_right'] - shifted['tail_base']).sum(axis=1, skipna=False),
        })
        X = pd.concat([X, speeds], axis=1)

    # Ratio of squared nose-tail distance to squared ear-ear distance
    if 'nose+tail_base' in X.columns and 'ear_left+ear_right' in X.columns:
        X['elong'] = X['nose+tail_base'] / (X['ear_left+ear_right'] + 1e-6)

    # Cosine of the angle at the body centre between the nose and tail-base directions
    if all(p in available_body_parts for p in ['nose', 'body_center', 'tail_base']):
        v1 = single_mouse['nose'] - single_mouse['body_center']
        v2 = single_mouse['tail_base'] - single_mouse['body_center']
        X['body_ang'] = (v1['x'] * v2['x'] + v1['y'] * v2['y']) / (
            np.sqrt(v1['x']**2 + v1['y']**2) * np.sqrt(v2['x']**2 + v2['y']**2) + 1e-6)

    if 'body_center' in available_body_parts:
        cx = single_mouse['body_center']['x']
        cy = single_mouse['body_center']['y']

        # Position statistics per window. The centred windows (center=True)
        # include frames after the current one; disp/act use trailing windows.
        for w in [5, 15, 30, 60, 90, 120]:
            ws = _scale(w, fps)
            roll = dict(min_periods=1, center=True)
            X[f'cx_m{w}'] = cx.rolling(ws, **roll).mean()
            X[f'cy_m{w}'] = cy.rolling(ws, **roll).mean()
            X[f'cx_s{w}'] = cx.rolling(ws, **roll).std()
            X[f'cy_s{w}'] = cy.rolling(ws, **roll).std()
            X[f'x_rng{w}'] = cx.rolling(ws, **roll).max() - cx.rolling(ws, **roll).min()
            X[f'y_rng{w}'] = cy.rolling(ws, **roll).max() - cy.rolling(ws, **roll).min()
            X[f'disp{w}'] = np.sqrt(cx.diff().rolling(ws, min_periods=1).sum()**2 +
                                     cy.diff().rolling(ws, min_periods=1).sum()**2)
            X[f'act{w}'] = np.sqrt(cx.diff().rolling(ws, min_periods=1).var() +
                                   cy.diff().rolling(ws, min_periods=1).var())

        X = add_curvature_features(X, cx, cy, fps)
        X = add_multiscale_features(X, cx, cy, fps)
        X = add_state_features(X, cx, cy, fps)
        X = add_longrange_features(X, cx, cy, fps)

      #  X = add_movement_features(X, cx, cy, fps)

    # =================================================
    # Features aimed at rearing
    # =================================================
    if 'neck' in available_body_parts and 'tail_base' in available_body_parts:
        # Distance between neck and tail base
        d_neck_tail = np.sqrt((single_mouse['neck']['x'] - single_mouse['tail_base']['x'])**2 +
                              (single_mouse['neck']['y'] - single_mouse['tail_base']['y'])**2)

        # Ratio to the rolling maximum of that distance over the window
        # (intended to drop when the mouse rears and appears shorter)
        w_long = _scale(60, fps)
        X['body_shrink_ratio'] = d_neck_tail / (d_neck_tail.rolling(w_long, min_periods=1).max() + 1e-6)

        # Frame-to-frame change of the length
        X['body_len_change'] = d_neck_tail.diff()

    if 'ear_left' in available_body_parts and 'ear_right' in available_body_parts:
        # Distance between the two ears
        d_ears = np.sqrt((single_mouse['ear_left']['x'] - single_mouse['ear_right']['x'])**2 +
                         (single_mouse['ear_left']['y'] - single_mouse['ear_right']['y'])**2)

        # Ear distance relative to its rolling mean (presumably a proxy for
        # the head appearing larger, e.g. closer to the camera)
        w_long = _scale(60, fps)
        X['head_size_ratio'] = d_ears / (d_ears.rolling(w_long, min_periods=1).mean() + 1e-6)

    # =================================================

    # Lagged value and lagged change of the nose-to-tail-base distance
    if all(p in available_body_parts for p in ['nose', 'tail_base']):
        nt_dist = np.sqrt((single_mouse['nose']['x'] - single_mouse['tail_base']['x'])**2 +
                          (single_mouse['nose']['y'] - single_mouse['tail_base']['y'])**2)
        for lag in [10, 20, 40]:
            l = _scale(lag, fps)
            X[f'nt_lg{lag}'] = nt_dist.shift(l)
            X[f'nt_df{lag}'] = nt_dist - nt_dist.shift(l)

    # Ear distance at several time offsets, plus its coefficient of variation
    # (std / mean) over a centred window.
    if all(p in available_body_parts for p in ['ear_left', 'ear_right']):
        ear_d = np.sqrt((single_mouse['ear_left']['x'] - single_mouse['ear_right']['x'])**2 +
                        (single_mouse['ear_left']['y'] - single_mouse['ear_right']['y'])**2)
        for off in [-30, -20, -10, 10, 20, 30]:
            # NOTE(review): assumes _scale_signed keeps the sign of `off`, so a
            # positive offset reads a later frame - confirm.
            o = _scale_signed(off, fps)
            X[f'ear_o{off}'] = ear_d.shift(-o)
        w = _scale(30, fps)
        X['ear_con'] = ear_d.rolling(w, min_periods=1, center=True).std() / \
                       (ear_d.rolling(w, min_periods=1, center=True).mean() + 1e-6)
    # Pose PCA of this mouse
    X = add_pose_pca(X, single_mouse, n_components=3, prefix="ag") # agent
    X = add_ellipse_features(X, single_mouse, fps, prefix="")
    # Arena features are optional: they need a video id and the helper to exist.
    if video_id is not None and 'add_arena_features' in globals():
        try:
            X = add_arena_features(X, single_mouse, video_id, fps)
        except Exception as e:
            print(f"Warning: Arena features failed for {video_id}: {e}")
            pass

    # Look-ahead features
    X = add_future_features(X, fps)
    # Frequency features are disabled (slow to compute)
 #   X = add_frequency_features(X, fps)

    return X.astype(np.float32, copy=False)
def transform_pair(mouse_pair, body_parts_tracked, fps):
    """Build the per-frame feature matrix for an (agent A, target B) pair.

    Compared with the previous version the output is unchanged, but dead code
    was removed: the ``elong`` branch could never run (every column of the
    pair frame is prefixed with ``12+``, so ``nose+tail_base`` never exists),
    and the centre distance ``cd`` and velocity alignment ``val`` were
    computed without being used. Loop-invariant terms (velocity dot product,
    nose-closeness flag) are now computed once.

    Args:
        mouse_pair: Frame with top-level column groups ``'A'`` and ``'B'``;
            ``mouse_pair[m][part]`` yields the coordinate columns of a part.
        body_parts_tracked: Ordered list of body-part names that defines the
            fixed layout of the pairwise-distance columns.
        fps: Frame rate, used to convert nominal window lengths to frames.

    Returns:
        Feature frame cast to ``float32``. The column order follows the order
        of the statements below.
    """
    avail_A = mouse_pair['A'].columns.get_level_values(0)
    avail_B = mouse_pair['B'].columns.get_level_values(0)

    # Sum of squared coordinate differences (squared distance, no sqrt) for
    # every (A part, B part) combination; the reindex yields a fixed column
    # layout with all-NaN columns for parts missing from this recording.
    X = pd.DataFrame({
        f"12+{p1}+{p2}": np.square(mouse_pair['A'][p1] - mouse_pair['B'][p2]).sum(axis=1, skipna=False)
        for p1, p2 in itertools.product(body_parts_tracked, repeat=2)
        if p1 in avail_A and p2 in avail_B
    })
    X = X.reindex(columns=[f"12+{p1}+{p2}" for p1, p2 in itertools.product(body_parts_tracked, repeat=2)], copy=False)

    # --- Lagged change of the distances between the main body parts ---
    target_parts = ["body_center", "ear_left", "ear_right", "head", "nose", "tail_base", "tail_tip"]
    valid_targets_A = [p for p in target_parts if p in avail_A]
    valid_targets_B = [p for p in target_parts if p in avail_B]

    # Build the candidate names from the valid parts instead of scanning all columns.
    target_cols = [
        col
        for col in (f"12+{p1}+{p2}" for p1 in valid_targets_A for p2 in valid_targets_B)
        if col in X.columns
    ]

    if target_cols:
        lag_frames = 10
        lag = _scale(lag_frames, fps)

        X_diff = X[target_cols].diff(lag)
        X_diff.columns = [f"{c}_chg{lag_frames}" for c in X_diff.columns]
        X = pd.concat([X, X_diff], axis=1)

        # Sum of all changes: negative means the pair got closer overall,
        # positive means it moved apart (NaN entries are skipped by sum).
        X[f'major_parts_approach_{lag_frames}'] = X_diff.sum(axis=1)

    # --- Squared displacement of the right ear over a short lag ---
    if ('A', 'ear_right') in mouse_pair.columns and ('B', 'ear_right') in mouse_pair.columns:
        lag = _scale(10, fps)
        ear_A = mouse_pair['A']['ear_right']
        ear_B = mouse_pair['B']['ear_right']
        shA = ear_A.shift(lag)
        shB = ear_B.shift(lag)
        speeds = pd.DataFrame({
            'sp_A': np.square(ear_A - shA).sum(axis=1, skipna=False),
            'sp_AB': np.square(ear_A - shB).sum(axis=1, skipna=False),
            'sp_B': np.square(ear_B - shB).sum(axis=1, skipna=False),
        })
        X = pd.concat([X, speeds], axis=1)

    # --- Cosine between the two tail-base -> nose heading vectors ---
    if all(p in avail_A for p in ['nose', 'tail_base']) and all(p in avail_B for p in ['nose', 'tail_base']):
        dir_A = mouse_pair['A']['nose'] - mouse_pair['A']['tail_base']
        dir_B = mouse_pair['B']['nose'] - mouse_pair['B']['tail_base']
        X['rel_ori'] = (dir_A['x'] * dir_B['x'] + dir_A['y'] * dir_B['y']) / (
            np.sqrt(dir_A['x']**2 + dir_A['y']**2) * np.sqrt(dir_B['x']**2 + dir_B['y']**2) + 1e-6)

    # Disabled experiments that used to live here as commented-out code:
    # 'appr' (change of squared nose-nose distance over a lag), the distance
    # bins 'v_cls' / 'cls' / 'med' / 'far', and the 'va_{off}' offsets.

    has_centers = 'body_center' in avail_A and 'body_center' in avail_B

    # --- Centre-to-centre distance statistics ---
    if has_centers:
        center_A = mouse_pair['A']['body_center']
        center_B = mouse_pair['B']['body_center']

        dist = np.square(center_A - center_B).sum(axis=1, skipna=False)
        cd_full = np.sqrt(dist)
        # NOTE(review): this is the frame-to-frame change of the *squared*
        # distance (negative = approaching), not of cd_full - confirm intent.
        appr_vel = dist.diff()

        # Centred windows: they include frames after the current one.
        roll = dict(min_periods=1, center=True)

        j = _scale(30, fps)
        X['int_con'] = cd_full.rolling(j, **roll).std() / \
                       (cd_full.rolling(j, **roll).mean() + 1e-6)

        # Dot product of the two frame-to-frame displacements.
        coord = (center_A['x'].diff() * center_B['x'].diff() +
                 center_A['y'].diff() * center_B['y'].diff())

        for w in [5, 15, 30, 45, 60, 75, 90, 120, 150]:
            ws = _scale(w, fps)
            X[f'd_m{w}'] = cd_full.rolling(ws, **roll).mean()
            X[f'd_s{w}'] = cd_full.rolling(ws, **roll).std()
            X[f'd_mn{w}'] = cd_full.rolling(ws, **roll).min()
            X[f'd_mx{w}'] = cd_full.rolling(ws, **roll).max()

            d_var = cd_full.rolling(ws, **roll).var()
            X[f'int{w}'] = 1 / (1 + d_var)
            # Damped coefficient of variation: small when the distance is stable.
            X[f'd_cv{w}'] = X[f'd_s{w}'] / (X[f'd_m{w}'] + 1.0)
            # Mean and most negative (fastest approach) distance change.
            X[f'appr_m{w}'] = appr_vel.rolling(ws, **roll).mean()
            X[f'appr_min{w}'] = appr_vel.rolling(ws, **roll).min()

            X[f'co_m{w}'] = coord.rolling(ws, **roll).mean()
            X[f'co_s{w}'] = coord.rolling(ws, **roll).std()

    # --- Nose-to-nose distance: lagged value, change, and closeness share ---
    if 'nose' in avail_A and 'nose' in avail_B:
        nose_A = mouse_pair['A']['nose']
        nose_B = mouse_pair['B']['nose']
        nn = np.sqrt((nose_A['x'] - nose_B['x'])**2 + (nose_A['y'] - nose_B['y'])**2)
        is_cl = (nn < 10.0).astype(float)
        for lag in [10, 20, 40]:
            n_lag = _scale(lag, fps)
            X[f'nn_lg{lag}'] = nn.shift(n_lag)
            X[f'nn_ch{lag}'] = nn - nn.shift(n_lag)
            X[f'cl_ps{lag}'] = is_cl.rolling(n_lag, min_periods=1).mean()

    if has_centers:
        X = add_interaction_features(X, mouse_pair, avail_A, avail_B, fps)

    # Pose PCA of the agent (A) and the target (B)
    X = add_pose_pca(X, mouse_pair['A'], n_components=3, prefix="ag")
    X = add_pose_pca(X, mouse_pair['B'], n_components=3, prefix="tg")
    X = add_relative_angle_features(X, mouse_pair, fps)

    # Ellipse shape of each mouse, prefixed "A_" / "B_"
    X = add_ellipse_features(X, mouse_pair['A'], fps, prefix="A_")
    X = add_ellipse_features(X, mouse_pair['B'], fps, prefix="B_")

    # Shape difference, only when both ratios could be computed
    if 'A_ellipse_ratio' in X.columns and 'B_ellipse_ratio' in X.columns:
        X['ellipse_ratio_diff'] = X['A_ellipse_ratio'] - X['B_ellipse_ratio']

    X = add_social_proximity_features(X, mouse_pair, fps)
    # Look-ahead features
    X = add_future_features(X, fps)
    # Frequency features are disabled (slow to compute):
    # X = add_frequency_features(X, fps)
    return X.astype(np.float32, copy=False)

In [None]:
def add_arena_features(X, single_mouse, video_id, fps):
    """Append arena-position features of one mouse to ``X``.

    The arena is treated as centred on the video frame. The frame size is
    converted with ``pix_per_cm`` and the body-centre coordinates are used as
    they are, i.e. they are assumed to be in cm already (not verified here).

    Columns written: ``dist_to_center``, ``dist_to_wall`` (distance to the
    nearest side of the arena rectangle), ``is_wall_zone`` (< 5.0),
    ``is_center_zone`` (< 10.0) and ``wall_ratio_{30,90,300}`` (rolling share
    of frames spent in the wall zone).

    ``X`` is returned untouched when the mouse has no ``body_center`` or when
    the lookup in the module-level ``video_metadata`` raises ``KeyError``.

    Args:
        X: Feature frame; columns are added to it in place.
        single_mouse: Coordinate frame of one mouse.
        video_id: Row label of this video in ``video_metadata``.
        fps: Frame rate forwarded to ``_scale``.

    Returns:
        The same ``X`` object.
    """
    if 'body_center' not in single_mouse.columns.get_level_values(0):
        return X

    metadata_keys = ('video_width_pix', 'video_height_pix',
                     'arena_width_cm', 'arena_height_cm', 'pix_per_cm')
    try:
        row = video_metadata.loc[video_id]
        frame_w_px, frame_h_px, arena_w_cm, arena_h_cm, px_per_cm = (
            float(row[key]) for key in metadata_keys
        )
    except KeyError:
        return X

    center = single_mouse['body_center']

    # Offsets from the middle of the video frame, which acts as the origin.
    dx = center['x'] - (frame_w_px / px_per_cm) / 2
    dy = center['y'] - (frame_h_px / px_per_cm) / 2

    # 1. Distance from the arena centre.
    X['dist_to_center'] = np.sqrt(dx**2 + dy**2)

    # 2. Gap to the nearest wall: half extent minus the offset on each axis.
    gap_x = arena_w_cm / 2 - dx.abs()
    gap_y = arena_h_cm / 2 - dy.abs()
    X['dist_to_wall'] = np.minimum(gap_x, gap_y)

    # 3. Zone flags.
    X['is_wall_zone'] = (X['dist_to_wall'] < 5.0).astype(float)
    X['is_center_zone'] = (X['dist_to_center'] < 10.0).astype(float)

    # Share of recent frames spent next to a wall.
    for window in (30, 90, 300):
        n_frames = _scale(window, fps)
        X[f'wall_ratio_{window}'] = X['is_wall_zone'].rolling(n_frames, min_periods=1).mean()

    return X

In [None]:
# ==========================================
# ★ メタデータ注入を行う専用関数
# ==========================================
def add_metadata_features(X, meta, video_metadata):
    """Inject per-video and per-mouse metadata columns into ``X``.

    Columns written: ``lab_id``, ``tracking_method``, ``agent_strain``,
    ``agent_sex`` and, when ``meta`` has a ``target_id`` column with at least
    one value other than ``'self'``, ``target_strain`` and ``target_sex``.
    All values are stored as ``float32``, so the metadata must already be
    numerically encoded.

    The step is best-effort: on any error a warning is printed and ``X`` is
    returned with whatever columns were added before the failure. The video
    id is bound before the ``try`` so that the warning itself can no longer
    fail with ``UnboundLocalError`` when the id lookup is what went wrong.

    Args:
        X: Feature frame, row-aligned with ``meta``; modified in place.
        meta: Frame with ``video_id``, ``agent_id`` and optionally
            ``target_id`` columns; ids are of the form ``'mouse1'``..``'mouse4'``.
        video_metadata: Frame indexed by video id with the columns
            ``lab_id``, ``tracking_method``, ``mouse{i}_strain`` and
            ``mouse{i}_sex`` for i in 1..4.

    Returns:
        The same ``X`` object.
    """
    vid = None
    try:
        # All rows of `meta` are taken to belong to the first row's video.
        vid = meta['video_id'].iloc[0]
        vid_info = video_metadata.loc[vid]

        # np.float32(...) accepts numpy and plain Python scalars alike.
        X['lab_id'] = np.float32(vid_info['lab_id'])
        X['tracking_method'] = np.float32(vid_info['tracking_method'])

        mouse_ids = [f'mouse{i}' for i in range(1, 5)]
        strain_map = {mouse: vid_info[f'{mouse}_strain'] for mouse in mouse_ids}
        sex_map = {mouse: vid_info[f'{mouse}_sex'] for mouse in mouse_ids}

        # .values drops the index so the assignment is positional.
        X['agent_strain'] = meta['agent_id'].map(strain_map).values.astype(np.float32)
        X['agent_sex'] = meta['agent_id'].map(sex_map).values.astype(np.float32)

        # Target columns only exist for pair data.
        if 'target_id' in meta.columns and (meta['target_id'] != 'self').any():
            X['target_strain'] = meta['target_id'].map(strain_map).values.astype(np.float32)
            X['target_sex'] = meta['target_id'].map(sex_map).values.astype(np.float32)

    except Exception as e:
        print(f"Warning: Metadata injection failed for video {vid}: {e}")

    return X

# Training, validation and submission

In [None]:
def robustify(submission, dataset, traintest, traintest_directory=None, min_duration_frames=8, gap_fill_frames=8):
    """Clean a bout-level submission and back-fill videos that have no predictions.

    Processing steps:
      1. Drop rows whose ``start_frame`` is not strictly below ``stop_frame``.
      2. Per (video, agent, target, action): merge consecutive bouts whose gap
         is at most ``gap_fill_frames`` and drop merged bouts shorter than
         ``min_duration_frames``.
      3. Per (video, agent, target): resolve overlaps between bouts by moving
         the later bout's start to the previously kept bout's stop; the later
         bout is dropped if what remains is shorter than ``min_duration_frames``.
      4. For every video in ``dataset`` that has a string ``behaviors_labeled``
         and no surviving rows, split the video's frame range evenly between
         the labelled actions of each (agent, target) pair.

    Args:
        submission: DataFrame with ``video_id``, ``agent_id``, ``target_id``,
            ``action``, ``start_frame``, ``stop_frame``.
        dataset: DataFrame with ``lab_id``, ``video_id``, ``behaviors_labeled``.
        traintest: Split name; ``'train'`` additionally skips labs whose id
            starts with ``MABe22``, ``CalMS21`` or ``CRIM13`` during back-fill.
        traintest_directory: Root of the tracking parquet files. Defaults to
            ``{CFG.input_dir}/{traintest}_tracking``.
        min_duration_frames: Minimum bout length kept in steps 2 and 3.
        gap_fill_frames: Largest gap bridged in step 2.

    Returns:
        The cleaned submission with a fresh integer index. If the input is
        empty, or nothing survives step 2, the function returns early and the
        back-fill step is not run.
    """
    if traintest_directory is None:
        # Interpolate the configured directory rather than the literal text.
        # NOTE(review): assumes CFG defines `input_dir` — confirm.
        traintest_directory = f"{CFG.input_dir}/{traintest}_tracking"

    if len(submission) == 0:
        return submission

    key_cols = ['video_id', 'agent_id', 'target_id', 'action']
    out_cols = key_cols + ['start_frame', 'stop_frame']

    # 1. Basic cleaning.
    submission = submission[submission.start_frame < submission.stop_frame]

    # 2. Bout processing: gap filling and removal of short bouts.
    processed_list = []
    for _, group in submission.groupby(key_cols):
        group = group.sort_values('start_frame')
        gap = group['start_frame'] - group['stop_frame'].shift(1)
        # The first row's gap is NaN, which compares False, so it opens bout 0.
        group = group.assign(bout_id=(gap > gap_fill_frames).cumsum())

        merged_bouts = group.groupby('bout_id').agg({
            'video_id': 'first', 'agent_id': 'first', 'target_id': 'first', 'action': 'first',
            'start_frame': 'min', 'stop_frame': 'max'
        }).reset_index(drop=True)

        duration = merged_bouts['stop_frame'] - merged_bouts['start_frame']
        merged_bouts = merged_bouts[duration >= min_duration_frames]

        if len(merged_bouts) > 0:
            processed_list.append(merged_bouts[out_cols])

    if not processed_list:
        # Nothing survived; stop here.
        return pd.DataFrame(columns=submission.columns)

    submission = pd.concat(processed_list).sort_values(['video_id', 'start_frame'])

    # 3. Last line of defence: trim overlapping parts instead of dropping rows.
    final_rows = []
    for _, group in submission.groupby(['video_id', 'agent_id', 'target_id']):
        group = group.sort_values(['start_frame', 'stop_frame'])

        # Stop frame of the most recently kept bout. Kept bouts never overlap,
        # so this is also the largest stop frame seen so far.
        last_stop = None
        for _, row in group.iterrows():
            start, stop = row['start_frame'], row['stop_frame']
            if last_stop is not None and start < last_stop:
                remaining = stop - last_stop
                if remaining <= 0 or remaining < min_duration_frames:
                    continue  # fully covered, or too short after trimming
                row['start_frame'] = last_stop
            final_rows.append(row)
            last_stop = stop

    if final_rows:
        submission = pd.DataFrame(final_rows)
    else:
        submission = pd.DataFrame(columns=submission.columns)

    # 4. Back-fill videos that ended up without any prediction.
    s_list = []
    for _, row in dataset.iterrows():
        lab_id = row['lab_id']
        if traintest == 'train' and lab_id.startswith(('MABe22', 'CalMS21', 'CRIM13')):
            continue

        video_id = row['video_id']
        if len(submission) > 0 and (submission.video_id == video_id).any():
            continue
        if not isinstance(row.behaviors_labeled, str):
            continue

        print(f"Video {video_id} has no predictions. Filling.")
        path = f"{traintest_directory}/{lab_id}/{video_id}.parquet"
        try:
            vid = pd.read_parquet(path)
            vid_behaviors = json.loads(row['behaviors_labeled'])
            vid_behaviors = sorted({b.replace("'", "") for b in vid_behaviors})
            vid_behaviors = [b.split(',') for b in vid_behaviors]
            vid_behaviors = pd.DataFrame(vid_behaviors, columns=['agent', 'target', 'action'])
            start_frame = vid.video_frame.min()
            stop_frame = vid.video_frame.max() + 1
            for (agent, target), actions in vid_behaviors.groupby(['agent', 'target']):
                batch_length = int(np.ceil((stop_frame - start_frame) / len(actions)))
                for i, (_, action_row) in enumerate(actions.iterrows()):
                    batch_start = start_frame + i * batch_length
                    batch_stop = min(batch_start + batch_length, stop_frame)
                    if batch_start >= batch_stop:
                        # More actions than frames: never emit an empty or
                        # inverted interval.
                        continue
                    s_list.append((video_id, agent, target, action_row['action'], batch_start, batch_stop))
        except Exception as e:
            # Back-fill stays best-effort, but the failure is reported.
            print(f"Warning: could not fill video {video_id}: {e}")

    if s_list:
        submission = pd.concat([
            submission,
            pd.DataFrame(s_list, columns=out_cols)
        ])

    submission = submission.reset_index(drop=True)
    return submission

In [None]:
def predict_multiclass(pred, meta, thresholds):
    """Convert per-frame action probabilities into bout rows.

    For every frame the highest-probability action is selected and kept only
    if that probability reaches the action's threshold (``thresholds.get(action,
    0.27)``); otherwise the frame counts as "no action". Runs of identical
    selections become one row from the first frame of the run to the first
    frame of the next run.

    Args:
        pred: DataFrame with one probability column per action, row-aligned
            with ``meta``.
        meta: DataFrame with ``video_id``, ``agent_id``, ``target_id`` and
            ``video_frame``.
        thresholds: Mapping from action name to decision threshold.

    Returns:
        DataFrame with columns ``video_id``, ``agent_id``, ``target_id``,
        ``action``, ``start_frame``, ``stop_frame``. Empty (with those
        columns) when ``pred`` has no rows or no columns.

    Note:
        The final run in ``meta`` is always discarded (``mask[-1] = False``)
        because there is no following run to supply its stop frame.
    """
    out_cols = ['video_id', 'agent_id', 'target_id', 'action', 'start_frame', 'stop_frame']
    # argmax over an empty axis raises, and mask[-1] fails on an empty mask,
    # so a frame without actions or without rows yields no bouts up front.
    if pred.shape[0] == 0 or pred.shape[1] == 0:
        return pd.DataFrame(columns=out_cols)

    ama = np.argmax(pred.values, axis=1)
    max_proba = pred.max(axis=1).values

    threshold_array = np.array([thresholds.get(col, 0.27) for col in pred.columns])
    action_thresholds = threshold_array[ama]

    # -1 marks frames where no action clears its threshold.
    ama = np.where(max_proba >= action_thresholds, ama, -1)
    ama = pd.Series(ama, index=meta.video_frame)

    # Keep only the frames where the selected action changes.
    changes_mask = (ama != ama.shift(1)).values
    ama_changes = ama[changes_mask]
    meta_changes = meta[changes_mask]

    mask = ama_changes.values >= 0
    mask[-1] = False

    submission_part = pd.DataFrame({
        'video_id': meta_changes['video_id'][mask].values,
        'agent_id': meta_changes['agent_id'][mask].values,
        'target_id': meta_changes['target_id'][mask].values,
        'action': pred.columns[ama_changes[mask].values],
        'start_frame': ama_changes.index[mask],
        'stop_frame': ama_changes.index[1:][mask[:-1]]
    })

    # Identity of the row that supplied each stop frame. If it belongs to a
    # different video/agent/target, the bout ran to the end of its own video.
    stop_video_id = meta_changes['video_id'][1:][mask[:-1]].values
    stop_agent_id = meta_changes['agent_id'][1:][mask[:-1]].values
    stop_target_id = meta_changes['target_id'][1:][mask[:-1]].values
    stop_col = submission_part.columns.get_loc('stop_frame')
    bout_ids = zip(submission_part['video_id'], submission_part['agent_id'], submission_part['target_id'])
    for i, (video_id, agent_id, target_id) in enumerate(bout_ids):
        if stop_video_id[i] != video_id or stop_agent_id[i] != agent_id or stop_target_id[i] != target_id:
            new_stop_frame = meta.loc[meta['video_id'] == video_id, 'video_frame'].max() + 1
            submission_part.iat[i, stop_col] = new_stop_frame

    return submission_part

In [None]:
def tune_threshold(oof_action, y_action):
    """Return the decision threshold on the grid 0.00, 0.01, ..., 1.00 with the best F1.

    The candidate set has only 101 values, so every one of them is scored
    exactly once. This makes the result deterministic and guarantees the grid
    optimum, which a sampled search over the same grid does not.

    Args:
        oof_action: Array of predicted probabilities for one action.
        y_action: Array of binary ground-truth labels, same length.

    Returns:
        The threshold (Python float) maximising
        ``f1_score(y_action, oof_action >= threshold, zero_division=0)``.
        Ties are resolved in favour of the lowest threshold.
    """
    best_threshold = 0.0
    best_score = -1.0
    for step in range(101):
        threshold = step / 100
        score = f1_score(y_action, (oof_action >= threshold), zero_division=0)
        # Strict comparison keeps the first (lowest) threshold on ties.
        if score > best_score:
            best_threshold, best_score = threshold, score
    return best_threshold

In [None]:
# In validate mode the per-section threshold tables start empty and are filled
# during cross-validation; in any other mode previously saved thresholds are
# loaded from the model directory.
thresholds = (
    {"single": {}, "pair": {}}
    if CFG.mode == "validate"
    else joblib.load(f"{CFG.model_path}/{CFG.model_name}/thresholds.pkl")
)

In [None]:
import lightgbm as lgb
#from catboost import CatBoostClassifier

_SUPPORTED_MODEL_TYPES = ("xgboost", "lightgbm", "catboost")


def _build_classifier(model_type, pos_weight):
    """Return an unfitted binary classifier of the requested family."""
    if model_type == "xgboost":
        return XGBClassifier(
            # --- fixed settings ---
            verbosity=0,
            random_state=42,
            device='cuda',
            tree_method='hist',
            n_estimators=3000,

            # --- tuned hyper-parameters ---
            max_depth=6,
            learning_rate=0.132,
            min_child_weight=10,
            subsample=0.728,
            colsample_bytree=0.771,
            reg_alpha=1.58,
            reg_lambda=3.44,

            # --- per-fold settings ---
            scale_pos_weight=pos_weight,
            early_stopping_rounds=50,
        )
    if model_type == "lightgbm":
        return lgb.LGBMClassifier(
            random_state=42, device='gpu',  # use 'cpu' when no GPU is available
            n_estimators=3000, learning_rate=0.05, num_leaves=31,
            scale_pos_weight=pos_weight,
            verbose=-1
        )
    if model_type == "catboost":
        # Imported lazily so the notebook runs without catboost installed
        # unless this model family is actually requested.
        from catboost import CatBoostClassifier
        return CatBoostClassifier(
            random_state=42, task_type="GPU",  # use "CPU" when no GPU is available
            iterations=3000, learning_rate=0.05, depth=6,
            scale_pos_weight=pos_weight,
            verbose=0, allow_writing_files=False
        )
    raise ValueError(f"Unknown model_type {model_type!r}; expected one of {_SUPPORTED_MODEL_TYPES}")


def _fit_with_early_stopping(model, model_type, X_train, y_train, X_val, y_val):
    """Fit ``model`` with 50-round early stopping on the validation fold."""
    if model_type == "catboost":
        model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=False)
    elif model_type == "xgboost":
        # Early stopping was already configured in the constructor.
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    else:  # lightgbm
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)],
                  callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])


def cross_validate_classifier(X_pl, label, meta, body_parts_tracked_str, section, model_type="xgboost"):
    """Train one binary classifier per action with grouped CV and build OOF bouts.

    For every column of ``label`` the rows with a non-missing label are split
    with ``StratifiedGroupKFold`` (groups = ``meta.video_id``). Each fold's
    model is fitted with early stopping on its validation fold, saved to disk,
    and used to fill the out-of-fold probabilities. A threshold is then tuned
    on the OOF probabilities. Actions with fewer distinct videos than
    ``CFG.n_splits`` are skipped; actions without any positive label get
    all-zero OOF probabilities and no threshold.

    Files written under ``{CFG.model_name}/{section}/{action}/``:
    ``features.json``, ``{model_type}_fold_{k}.pkl``,
    ``oof_pred_probs_{model_type}.pkl``, ``threshold_{model_type}.pkl``,
    ``best_model_info.json`` and ``{model_type}_best.pkl`` (copy of the fold
    with the highest per-fold F1).

    Args:
        X_pl: Feature table; must support boolean-mask row selection and
            ``.iloc`` (i.e. a pandas DataFrame despite the name).
        label: DataFrame with one column per action; NaN marks unlabelled rows.
        meta: DataFrame with ``video_id``, ``agent_id``, ``target_id``,
            ``video_frame``, row-aligned with ``X_pl`` and ``label``.
        body_parts_tracked_str: Identifier stored alongside each F1 score.
        section: Section identifier used in output paths and log lines.
        model_type: One of ``"xgboost"``, ``"lightgbm"``, ``"catboost"``.

    Returns:
        ``(submission_list, f1_list, thresholds)`` where ``submission_list``
        holds the single frame returned by ``predict_multiclass`` on the OOF
        probabilities, ``f1_list`` holds ``(body_parts_tracked_str, action, f1)``
        tuples and ``thresholds`` maps action name to tuned threshold.

    Raises:
        ValueError: If ``model_type`` is not a supported value.
    """
    import shutil

    # Fail fast: otherwise an unknown type only surfaces as a NameError on
    # `model` after the data for the first action has already been prepared.
    if model_type not in _SUPPORTED_MODEL_TYPES:
        raise ValueError(f"Unknown model_type {model_type!r}; expected one of {_SUPPORTED_MODEL_TYPES}")

    oof = pd.DataFrame(index=meta.video_frame)
    f1_list = []
    submission_list = []
    thresholds = {}
    gc.collect()

    for action in label.columns:
        # Restrict to rows where this action was actually annotated.
        action_mask = ~ label[action].isna().values
        y_action = label[action][action_mask].values.astype(int)
        X_action = X_pl[action_mask]
        groups_action = meta.video_id[action_mask].values

        if len(np.unique(groups_action)) < CFG.n_splits:
            continue

        if (y_action == 0).all():
            # No positive example: nothing to learn.
            oof_preds = np.zeros(len(y_action))
            print(f"\tF1: 0.0000 (0.00) Section: {section} Action: {action}")
        else:
            print(f"Start training {model_type} for {action}...")

            save_dir = f"{CFG.model_name}/{section}/{action}"
            os.makedirs(save_dir, exist_ok=True)

            oof_preds = np.zeros(len(y_action), dtype=np.float32)
            best_f1_score = -1.0
            best_fold_idx = -1
            best_threshold = 0.5

            cv = StratifiedGroupKFold(n_splits=CFG.n_splits)

            for fold, (train_idx, val_idx) in enumerate(cv.split(X_action, y_action, groups=groups_action)):
                X_train_fold = X_action.iloc[train_idx]
                y_train_fold = y_action[train_idx]
                X_val_fold = X_action.iloc[val_idx]
                y_val_fold = y_action[val_idx]

                # Positive-class weight: square root of the class imbalance.
                num_neg = (y_train_fold == 0).sum()
                num_pos = (y_train_fold == 1).sum()
                pos_weight = np.sqrt(num_neg / num_pos) if num_pos > 0 else 1.0
                print(f"  Fold {fold}: Neg={num_neg}, Pos={num_pos}, Weight={pos_weight:.2f}")

                model = _build_classifier(model_type, pos_weight)
                _fit_with_early_stopping(model, model_type, X_train_fold, y_train_fold, X_val_fold, y_val_fold)

                # The feature list is identical across folds; store it once.
                if fold == 0:
                    with open(f"{save_dir}/features.json", "w") as f:
                        json.dump(list(X_train_fold.columns), f)

                # The model type is part of the file name so that different
                # model families can coexist for ensembling.
                file_name = f"{model_type}_fold_{fold}.pkl"
                joblib.dump(model, f"{save_dir}/{file_name}")

                val_preds = model.predict_proba(X_val_fold)[:, 1]
                oof_preds[val_idx] = val_preds

                # Track the fold with the best individually tuned F1.
                current_th = tune_threshold(val_preds, y_val_fold)
                current_f1 = f1_score(y_val_fold, (val_preds >= current_th), zero_division=0)
                print(f"    -> Fold {fold} Score: {current_f1:.4f} (th={current_th:.2f})")
                if current_f1 > best_f1_score:
                    best_f1_score = current_f1
                    best_fold_idx = fold
                    best_threshold = current_th
                    print(f"    [{model_type}] Fold {fold} New Best! F1: {current_f1:.4f}")

                del X_train_fold, y_train_fold, X_val_fold, y_val_fold, model
                gc.collect()

            # Threshold tuned on all out-of-fold predictions of this action.
            threshold = tune_threshold(oof_preds, y_action)
            thresholds[action] = threshold

            f1 = f1_score(y_action, (oof_preds >= threshold), zero_division=0)
            f1_list.append((body_parts_tracked_str, action, f1))

            joblib.dump(oof_preds, f"{save_dir}/oof_pred_probs_{model_type}.pkl")
            joblib.dump(threshold, f"{save_dir}/threshold_{model_type}.pkl")

            print(f"\tF1: {f1:.4f} ({threshold:.2f}) Section: {section} Action: {action}")

            print(f"\n🏆 Best Fold: {best_fold_idx} with F1: {best_f1_score:.4f}")

            # Cast to builtin types so the values are JSON-serialisable.
            best_info = {
                "best_fold": int(best_fold_idx),
                "best_f1": float(best_f1_score),
                "best_threshold": float(best_threshold)
            }
            with open(f"{save_dir}/best_model_info.json", "w") as f:
                json.dump(best_info, f)

            # Duplicate the best fold's model under a stable name.
            if best_fold_idx != -1:
                src_model = f"{save_dir}/{model_type}_fold_{best_fold_idx}.pkl"
                dst_model = f"{save_dir}/{model_type}_best.pkl"
                if os.path.exists(src_model):
                    shutil.copy(src_model, dst_model)
                    print(f"  -> Copied best model to: {dst_model}")
                else:
                    print(f"  Warning: Source model {src_model} not found.")

        # Scatter the per-action predictions back to full length; rows
        # without a label for this action stay at probability 0.
        oof_column = np.zeros(len(label))
        oof_column[action_mask] = oof_preds
        oof[action] = oof_column

        del action_mask, X_action, y_action, groups_action, oof_preds
        gc.collect()

    submission_part = predict_multiclass(oof, meta, thresholds)
    submission_list.append(submission_part)

    return submission_list, f1_list, thresholds

In [None]:
# 1. Columns to read: video-level fields plus strain/sex for mouse1..mouse4.
meta_cols = [
    'video_id', 'lab_id', 'tracking_method',
    'video_width_pix', 'video_height_pix',
    'arena_width_cm', 'arena_height_cm', 'pix_per_cm_approx',
]
for i in range(1, 5):
    meta_cols.extend([f'mouse{i}_strain', f'mouse{i}_sex'])

# 2. Read train and test, stack them, keep one row per video.
raw_meta = (
    pd.concat(
        [pd.read_csv(csv_path, usecols=meta_cols) for csv_path in (CFG.train_path, CFG.test_path)],
        axis=0,
    )
    .drop_duplicates('video_id')
    .set_index('video_id')
)

# 3. Build video_metadata, indexed by video id.
video_metadata = pd.DataFrame(index=raw_meta.index)

# Numeric fields are copied unchanged (pix_per_cm_approx is renamed).
for numeric_col in ('video_width_pix', 'video_height_pix', 'arena_width_cm', 'arena_height_cm'):
    video_metadata[numeric_col] = raw_meta[numeric_col]
video_metadata['pix_per_cm'] = raw_meta['pix_per_cm_approx']

# Categorical fields are label-encoded; missing values become 'unknown'.
le_lab = LabelEncoder()
lab_ids = raw_meta['lab_id'].fillna('unknown')
video_metadata['lab_id'] = le_lab.fit_transform(lab_ids)

le_track = LabelEncoder()
tracking_methods = raw_meta['tracking_method'].fillna('unknown')
video_metadata['tracking_method'] = le_track.fit_transform(tracking_methods)

# Strain and sex: one encoder each, fitted on the values of all four mice so
# that the same code means the same category in every mouse column.
strain_cols = [f'mouse{i}_strain' for i in range(1, 5)]
sex_cols = [f'mouse{i}_sex' for i in range(1, 5)]

le_strain = LabelEncoder()
stacked_strains = pd.concat([raw_meta[col] for col in strain_cols])
all_strains = stacked_strains.fillna('unknown').astype(str).unique()
le_strain.fit(all_strains)

le_sex = LabelEncoder()
stacked_sexes = pd.concat([raw_meta[col] for col in sex_cols])
all_sexes = stacked_sexes.fillna('unknown').astype(str).unique()
le_sex.fit(all_sexes)

for strain_col, sex_col in zip(strain_cols, sex_cols):
    strain_values = raw_meta[strain_col].fillna('unknown').astype(str)
    video_metadata[strain_col] = le_strain.transform(strain_values)
    sex_values = raw_meta[sex_col].fillna('unknown').astype(str)
    video_metadata[sex_col] = le_sex.transform(sex_values)

print("Metadata prepared (with Arena Info)!")
display(video_metadata.head())

In [None]:


# Output locations: per-section checkpoints and a scratch folder for
# temporary feature files.
CHECKPOINT_DIR = "./checkpoints"
TEMP_DIR = "./temp_features"
for _out_dir in (CHECKPOINT_DIR, TEMP_DIR):
    os.makedirs(_out_dir, exist_ok=True)

print(f"Checkpoints: {CHECKPOINT_DIR}")
print(f"Temp files: {TEMP_DIR}")

f1_list, submission_list = [], []

start_time = time.time()

# =========================================================
# Exclude videos recorded by lab 'AdaptableSnail' at 25 fps.
# This runs once, before the section loop.
# =========================================================
if 'train' in locals():
    print(f"Original train size: {len(train)}")

    # Requires `train` to carry the lab_id and frames_per_second columns.
    is_snail = train['lab_id'] == 'AdaptableSnail'
    is_25fps = train['frames_per_second'] == 25.0
    drop_mask = is_snail & is_25fps

    # Number of distinct videos that match the exclusion rule.
    dropped_cnt = train.loc[drop_mask, 'video_id'].nunique()
    if dropped_cnt == 0:
        print("No videos to drop.")
    else:
        print(f"🚫 Dropping {dropped_cnt} videos (AdaptableSnail @ 25fps)...")
        train = train.loc[~drop_mask].reset_index(drop=True)
        print(f"Filtered train size: {len(train)}")
# =========================================================

for section in range(1, len(body_parts_tracked_list)):
    body_parts_tracked_str = body_parts_tracked_list[section]
    checkpoint_path = f"{CHECKPOINT_DIR}/section_{section}.pkl"

    # === A. チェックポイント読み込み ===
    if os.path.exists(checkpoint_path):
        print(f"Found checkpoint for Section {section}. Loading...")
        try:
            temp_submission_list, temp_f1_list, temp_thresholds = joblib.load(checkpoint_path)
            f1_list.extend(temp_f1_list)
            submission_list.extend(temp_submission_list)

            # 閾値復元 (validate時)
            if CFG.mode == 'validate':
                if f"{section}" not in thresholds["single"].keys():
                    thresholds["single"][f"{section}"] = {}
                if f"{section}" not in thresholds["pair"].keys():
                    thresholds["pair"][f"{section}"] = {}
                for k, v in temp_thresholds.items():
                    if k in thresholds["single"][f"{section}"]:
                        thresholds["single"][f"{section}"][k] = v
                    elif k in thresholds["pair"][f"{section}"]:
                        thresholds["pair"][f"{section}"][k] = v

            print(f"-> Section {section} loaded! Skipping processing.\n")
            continue
        except Exception as e:
            print(f"Failed to load checkpoint: {e}. Recalculating...")

    # === B. 計算実行 ===
    try:
        body_parts_tracked = json.loads(body_parts_tracked_str)
        if 'body_center' not in body_parts_tracked:
            body_parts_tracked.append('body_center')

        elapsed = time.time() - start_time
        elapsed_str = str(timedelta(seconds=int(elapsed)))
        print(f"[{elapsed_str}] {section}/{len(body_parts_tracked_list)-1} Processing videos with: {body_parts_tracked}\n")

        if len(body_parts_tracked) > 5:
            body_parts_tracked = [b for b in body_parts_tracked if b not in drop_body_parts]

        train_subset = train[train.body_parts_tracked == body_parts_tracked_str]

        _fps_lookup = (
            train_subset[['video_id', 'frames_per_second']]
            .drop_duplicates('video_id')
            .set_index('video_id')['frames_per_second']
            .to_dict()
        )

        # リストに貯めずに、一時ファイルパスを記録するリストにする
        #single_temp_files = []
        #pair_temp_files = []

        single_X_list = []  # 特徴量 (X) 用
        pair_X_list = []    # 特徴量 (X) 用

        single_label_list = []
        single_meta_list = []
        pair_label_list = []
        pair_meta_list = []

        #file_counter = 0

        # ★ 修正ポイント: ループ内で即座に特徴量計算して保存
        for switch, data, meta, label in generate_mouse_data(train_subset, 'train', traintest_directory=CFG.train_tracking_path):

            fps_i = _fps_from_meta(meta, _fps_lookup, default_fps=30.0)
            vid = meta['video_id'].iloc[0]
            if switch == 'single':
                # 1. すぐに特徴量変換 (float32でOK)
                X_i = transform_single(data, body_parts_tracked, fps_i, video_id=vid).astype(np.float32)

                # 2. ★ メタデータ注入 (関数呼び出しに変更！) ★
                X_i = add_metadata_features(X_i, meta, video_metadata)

                # 3. ★修正: リストに直接追加 (ファイル保存しない)
                single_X_list.append(X_i)
                single_label_list.append(label)
                single_meta_list.append(meta)

                # 3. すぐにParquetに保存
                #temp_path = f"{TEMP_DIR}/single_{section}_{file_counter}.parquet"
                #X_i.columns = X_i.columns.astype(str)
                #X_i.to_parquet(temp_path, index=False)

                #single_temp_files.append(temp_path)
                #single_label_list.append(label)
                #single_meta_list.append(meta)

                #del X_i # メモリから消す！

            else:
                # 1. すぐに特徴量変換
                X_i = transform_pair(data, body_parts_tracked, fps_i).astype(np.float32)

                # 2. ★ メタデータ注入 (関数呼び出しに変更！) ★
                X_i = add_metadata_features(X_i, meta, video_metadata)

                # 3. ★修正: リストに直接追加
                pair_X_list.append(X_i)
                pair_label_list.append(label)
                pair_meta_list.append(meta)

                # 3. すぐにParquetに保存
                #temp_path = f"{TEMP_DIR}/pair_{section}_{file_counter}.parquet"
                #X_i.columns = X_i.columns.astype(str)
                #X_i.to_parquet(temp_path, index=False)

                #pair_temp_files.append(temp_path)
                #pair_label_list.append(label)
                #pair_meta_list.append(meta)

                #del X_i # メモリから消す！

            del data, meta, label
            #file_counter += 1

            # 定期的にGCを実行
            #if file_counter % 10 == 0:
            #    gc.collect()

        gc.collect()

        current_section_submission = []
        current_section_f1 = []
        current_section_thresholds = {}

        # --- Single Mouse Processing ---
        if len(single_X_list) > 0:
            print(f"Loading {len(single_X_list)} single mouse feature files...")

            # Polarsを使って全Parquetファイルを一気に読み込む (高速 & 省メモリ)
            # scan_parquet は遅延読み込みなのでメモリを食わない
     #       try:
       #         X_pl = pl.read_parquet(single_temp_files) # filesリストを渡せば結合してくれる
     #       except:
   #             # バージョンによってはリスト渡しができない場合があるのでconcatする
    #            X_pl = pl.concat([pl.read_parquet(f) for f in single_temp_files])
            #try:
            #    X_pl = pl.concat(
            #        [pl.scan_parquet(f) for f in single_temp_files],
           #         how="diagonal"
             #   ).collect()
          #  except Exception as e:
           #     print(f"Lazy load failed ({e}), trying eager load...")
                # 万が一Lazyでコケた場合のフォールバック (1つずつ読み込む)
            #    X_pl = pl.concat(
           #         [pl.read_parquet(f) for f in single_temp_files],
            #        how="diagonal"
            #    )

            # ★修正: Pandasで一括読み込み＆結合
            # axis=0 で縦に結合、ignore_index=True でインデックスを振り直す
            #X_pd_list = [pd.read_parquet(f) for f in single_temp_files]
            #X_pl = pd.concat(X_pd_list, axis=0, ignore_index=True)

            # ★修正: メモリ上のリストを一気に結合
            X_pl = pd.concat(single_X_list, axis=0, ignore_index=True)
            # ラベルとメタデータはPandasで結合
            single_mouse_label = pd.concat(single_label_list, axis=0, ignore_index=True)
            single_mouse_meta = pd.concat(single_meta_list, axis=0, ignore_index=True)

            # 完了したのでリスト削除
            del single_label_list, single_meta_list, single_X_list
            gc.collect()

            # inf対策 (Polars)
            #float_cols = X_pl.select(pl.col([pl.Float32, pl.Float64])).columns
            #X_pl = X_pl.with_columns([
            #    pl.when(pl.col(c).is_infinite()).then(None).otherwise(pl.col(c)).alias(c)
            #    for c in float_cols
            #])
            # ... (X_te 生成直後) ...

            # --- 特徴量の内訳カウント ---
            print(f"\n[Feature Count Report] Total: {len(X_pl.columns)} columns")

            # カテゴリごとのキーワード
            categories = {
                'Distance': ['+', 'dist'],
                'Speed/Accel': ['speed', 'accel', 'jerk', 'sp_', 'disp', 'act'],
                'Angle/Pose': ['ang', 'ori', 'curv', 'turn', 'ellipse'],
                'Arena/Zone': ['wall', 'center', 'zone'],
                'Future': ['fut_'],
                'Frequency': ['fft_'],
                'Metadata': ['lab_id', 'strain', 'sex', 'method'],
                'Interaction': ['prox', 'watch', 'mutual', '12+']
            }

            for cat, keywords in categories.items():
                count = sum(1 for c in X_pl.columns if any(k in c for k in keywords))
                if count > 0:
                    print(f"  - {cat}: {count}")

            # -----------------------------
            if CFG.mode == 'validate':
                temp_sub, temp_f1, temp_th = cross_validate_classifier(X_pl, single_mouse_label, single_mouse_meta, body_parts_tracked_str, section, model_type="xgboost")

                if f"{section}" not in thresholds["single"].keys():
                    thresholds["single"][f"{section}"] = {}
                for k, v in temp_th.items():
                    thresholds["single"][f"{section}"][k] = v

                f1_list.extend(temp_f1)
                submission_list.extend(temp_sub)

                current_section_f1.extend(temp_f1)
                current_section_submission.extend(temp_sub)
                current_section_thresholds.update(temp_th)

                del temp_sub, temp_f1, temp_th, X_pl
                gc.collect()
            else:
         #       temp_sub = submit(body_parts_tracked_str, 'single', section, thresholds["single"][f"{section}"])
       #         submission_list.extend(temp_sub)
         #       current_section_submission.extend(temp_sub)
         #       del temp_sub, X_pl
                gc.collect()

        # --- Mouse Pair Processing ---

        if len(pair_X_list) > 0:
            print(f"Loading and Downsampling {len(pair_X_list)} mouse pair files...")

            # リストに全データを貯めず、間引いたものだけを入れる
            #X_pl_list = []
            #y_list = []
            #meta_list = []

            # 1つずつ読み込んで、その場で間引く！
            #for i, (feat_file, label_df, meta_df) in enumerate(zip(pair_temp_files, pair_label_list, pair_meta_list)):

                # 1. 特徴量を読み込む
             #   df = pl.read_parquet(feat_file)

                # 2. 保持する行を決める（validateモードのみ）
            #    if CFG.mode == 'validate':
                    # ラベルが1つでもある行（ポジティブ）を探す
                    # (label_df は Pandas なので numpy にして計算)
           #         has_action = label_df.values.sum(axis=1) > 0

           #         pos_idx = np.where(has_action)[0]
             #       neg_idx = np.where(~has_action)[0]

                    # ネガティブデータ（行動なし）を「10%」に間引く！(ここが軽量化のキモ)
                    # メモリが厳しければ 0.05 (5%) にしてもOK
             #       if len(neg_idx) > 0:
            #            np.random.seed(42 + i) # 再現性確保
            #            keep_neg_count = int(len(neg_idx) * 0.1)
            #            neg_idx_sampled = np.random.choice(neg_idx, size=keep_neg_count, replace=False)
           #         else:
           #             neg_idx_sampled = []

          #          # 結合してソート
            #        keep_idx = np.concatenate([pos_idx, neg_idx_sampled])
          #          keep_idx.sort()

                    # 3. データをフィルタリング（小さくする）
           #         df = df[keep_idx]
            #        label_df = label_df.iloc[keep_idx]
            #        meta_df = meta_df.iloc[keep_idx]

                # リストに追加
          #      X_pl_list.append(df)
          #      y_list.append(label_df)
          #      meta_list.append(meta_df)

                # こまめにGC
            #    if i % 100 == 0: gc.collect()
            # =================================================
            # ★ ログ出力: diagonal結合で何個nullが増えるか計算
            # =================================================
            # 1. 全データフレームに含まれるユニークな列名を集める
           # all_columns = set()
           # original_total_cells = 0
           # total_rows = 0

           # for df in X_pl_list:
          #      all_columns.update(df.columns)
          #      original_total_cells += (df.height * df.width)
          #      total_rows += df.height
          #
           # final_col_count = len(all_columns)
           # final_total_cells = total_rows * final_col_count
          #
         #   filled_count = final_total_cells - original_total_cells
         #   fill_ratio = (filled_count / final_total_cells) * 100 if final_total_cells > 0 else 0

        #    print(f"  [Diagonal Concat info]")
        #    print(f"  - Unified columns: {final_col_count}")
            #print(f"  - Cells filled with null: {filled_count:,} ({fill_ratio:.2f}%)")
         #   # =================================================

            # 結合（ダウンサンプリング済みなので軽い！）
            #X_pl = pl.concat(X_pl_list, how="diagonal")
            #mouse_pair_label = pd.concat(y_list, axis=0, ignore_index=True)
            #mouse_pair_meta = pd.concat(meta_list, axis=0, ignore_index=True)

            # お掃除
            #del pair_label_list, pair_meta_list, X_pl_list, y_list, meta_list
            #gc.collect()

            # inf対策
            #float_cols = X_pl.select(pl.col([pl.Float32, pl.Float64])).columns
            #X_pl = X_pl.with_columns([
            #    pl.when(pl.col(c).is_infinite()).then(None).otherwise(pl.col(c)).alias(c)
            #    for c in float_cols
            #])
            # ... (X_te 生成直後) ...
            # ★修正: Pandasで一括読み込み＆結合
            #X_pd_list = [pd.read_parquet(f) for f in pair_temp_files]
            #X_pl = pd.concat(X_pd_list, axis=0, ignore_index=True)

            # ★修正: メモリ上のリストを一気に結合
            X_pl = pd.concat(pair_X_list, axis=0, ignore_index=True)

            mouse_pair_label = pd.concat(pair_label_list, axis=0, ignore_index=True)
            mouse_pair_meta = pd.concat(pair_meta_list, axis=0, ignore_index=True)

            del pair_label_list, pair_meta_list, pair_X_list
            gc.collect()
            # --- 特徴量の内訳カウント ---
            print(f"\n[Feature Count Report] Total: {len(X_pl.columns)} columns")

            # カテゴリごとのキーワード
            categories = {
                'Distance': ['+', 'dist'],
                'Speed/Accel': ['speed', 'accel', 'jerk', 'sp_', 'disp', 'act'],
                'Angle/Pose': ['ang', 'ori', 'curv', 'turn', 'ellipse'],
                'Arena/Zone': ['wall', 'center', 'zone'],
                'Future': ['fut_'],
                'Frequency': ['fft_'],
                'Metadata': ['lab_id', 'strain', 'sex', 'method'],
                'Interaction': ['prox', 'watch', 'mutual', '12+']
            }

            for cat, keywords in categories.items():
                count = sum(1 for c in X_pl.columns if any(k in c for k in keywords))
                if count > 0:
                    print(f"  - {cat}: {count}")

            # -----------------------------
            if CFG.mode == 'validate':
                # 間引いたデータを渡すので、関数内でのダウンサンプリングは不要だが、
                # そのまま渡しても問題ない（念には念を）
                temp_sub, temp_f1, temp_th = cross_validate_classifier(X_pl, mouse_pair_label, mouse_pair_meta, body_parts_tracked_str, section)

                if f"{section}" not in thresholds["pair"].keys():
                    thresholds["pair"][f"{section}"] = {}
                for k, v in temp_th.items():
                    thresholds["pair"][f"{section}"][k] = v

                f1_list.extend(temp_f1)
                submission_list.extend(temp_sub)

                current_section_f1.extend(temp_f1)
                current_section_submission.extend(temp_sub)
                current_section_thresholds.update(temp_th)

                del temp_sub, temp_f1, temp_th, X_pl
                gc.collect()
            else:
                # submitモードは全量データを使う（間引かない）
                # ※注意: submitモードでここを通る場合、メモリが足りなくなる可能性があります
          #      temp_sub = submit(body_parts_tracked_str, 'pair', section, thresholds["pair"][f"{section}"])
         #       submission_list.extend(temp_sub)
          #      current_section_submission.extend(temp_sub)
       #         del temp_sub, X_pl
                gc.collect()

        # === 保存 ===
        print(f"Saving checkpoint for Section {section}...")
        joblib.dump(
            (current_section_submission, current_section_f1, current_section_thresholds),
            checkpoint_path
        )
        print(f"Checkpoint saved: {checkpoint_path}")

        # 一時ファイルの掃除（ディスク容量確保のため）
        #for f in single_temp_files + pair_temp_files:
         #   if os.path.exists(f): os.remove(f)

    except Exception as e:
        print(f"\tError in Section {section}: {e}")
        import traceback
        traceback.print_exc()

    print()

In [None]:
# Sanity check of the accumulated results before they are concatenated and scored.
print(f"Submission list items: {len(submission_list)}")
print(f"F1 list items: {len(f1_list)}")

if not submission_list:
    # Nothing was collected, so flag it here instead of inspecting an element.
    print("⚠️ Warning: submission_list is empty!")
else:
    # The first entry should report pandas.core.frame.DataFrame; anything else is suspect.
    print("Sample submission type:", type(submission_list[0]))

In [None]:
if CFG.mode == 'validate':
    # Final validation report: merge the per-section prediction frames, score them
    # against `solution`, summarize the per-action binary F1 scores, and persist the
    # thresholds and the F1 table under the CFG.model_name directory.

    # Fail fast with an actionable message. pd.concat raises a terse
    # "No objects to concatenate" ValueError on an empty list, which hides the
    # real cause (no section contributed any validation predictions).
    if not submission_list:
        raise ValueError(
            "submission_list is empty: no validation predictions were collected, "
            "so the competition metric cannot be computed."
        )

    submission = pd.concat(submission_list)
    # NOTE(review): robustify presumably cleans up the predictions before scoring;
    # its exact behavior is defined elsewhere in the notebook — confirm there.
    submission_robust = robustify(submission, train, 'train')
    print(f"Competition metric: {score(solution, submission_robust, ''):.4f}")

    # One row per collected F1 entry: (body_parts_tracked_str, action, binary F1 score).
    f1_df = pd.DataFrame(f1_list, columns=['body_parts_tracked_str', 'action', 'binary F1 score'])
    print(f"Mean F1:            {f1_df['binary F1 score'].mean():.4f}")

    # Persist the thresholds and the per-action scores with joblib.
    joblib.dump(thresholds, f"{CFG.model_name}/thresholds.pkl")
    joblib.dump(f1_df, f"{CFG.model_name}/scores.pkl")