# fold ごとの音声波形統計量

issue: https://github.com/tmp-friends/birdclef-2025/issues/21#issuecomment-2868744857

## Setup

In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

import librosa
import librosa.display

In [2]:
# Root directory that holds the competition CSV/TXT files read below.
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# the fold-split cell further down reads it.
dir = "/home/tomoya/kaggle/birdclef-2025/input/birdclef-2025"

# Tables shipped as CSV inside the data directory.
submission_df = pd.read_csv(os.path.join(dir, "sample_submission.csv"))
taxonomy_df = pd.read_csv(os.path.join(dir, "taxonomy.csv"))
train_df = pd.read_csv(os.path.join(dir, "train.csv"))

# Plain-text file, read whole. The encoding is pinned so that decoding does
# not depend on the locale of the machine running the notebook.
with open(os.path.join(dir, "recording_location.txt"), "r", encoding="utf-8") as file:
    recording_location = file.read()

In [3]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

## Exec

In [4]:
# ========= Fold split with signal-strength balancing ===============

import os, glob
from pathlib import Path
import numpy as np, pandas as pd, librosa
from sklearn.model_selection import StratifiedKFold

# ------------------------------------------------------------------
# 1. Load the metadata and attach a file-path column
# ------------------------------------------------------------------
audio_root = Path(dir) / "train_audio"
train_df = pd.read_csv(os.path.join(dir, "train.csv"))
train_df["filepath"] = train_df["filename"].map(lambda name: str(audio_root / name))

# Drop any pre-existing fold column so it is rebuilt from scratch;
# errors="ignore" makes this a no-op when the column is absent.
train_df = train_df.drop(columns="fold", errors="ignore")

# ------------------------------------------------------------------
# 2. Compute the "signal strength T" of every recording
#    T = RMS + VAR + STD + PWR  (one scalar per file)
# ------------------------------------------------------------------
def _signal_strength(path):
    """Return RMS + variance + standard deviation + mean power of one file.

    The audio is loaded with ``sr=None`` and ``mono=True``, i.e. without
    resampling and as a single channel.
    """
    wave, _ = librosa.load(path, sr=None, mono=True)
    power = np.mean(wave**2)
    rms = np.sqrt(power)
    return rms + np.var(wave) + np.std(wave) + power


print("▶ calculating signal statistics (might take a minute) ...")
T_values = [_signal_strength(path) for path in train_df["filepath"]]
train_df["T"] = T_values
print("  done.")

# ------------------------------------------------------------------
# 3. Bin T into quartiles and build the stratification key
# ------------------------------------------------------------------
# Quartile index of each recording's T (0-3).
train_df["T_bin"] = pd.qcut(train_df["T"], labels=False, q=4)

# Join primary_label and T_bin, e.g. "yehbla2_1".
label_part = train_df["primary_label"].astype(str)
bin_part = train_df["T_bin"].astype(str)
train_df["stratify_key"] = label_part + "_" + bin_part

# ------------------------------------------------------------------
# 4. Assign folds with StratifiedKFold
# ------------------------------------------------------------------
n_fold = 5
skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

# -1 marks rows that have not been given a fold yet.
fold_arr = np.full(train_df.shape[0], -1, dtype=int)
splits = skf.split(train_df, train_df["stratify_key"])
for fold_id, (_, held_out) in enumerate(splits):
    # Each row's fold is the split in which it lands in the validation part.
    fold_arr[held_out] = fold_id
    print(f"  Fold {fold_id}: valid {len(held_out):5d} samps")

train_df["fold"] = fold_arr

# ------------------------------------------------------------------
# 5. Show the per-fold statistics of T
# ------------------------------------------------------------------
# Named aggregation labels the quartile columns directly, instead of renaming
# the auto-generated "<lambda_0>" / "<lambda_1>" columns afterwards (a rename
# silently does nothing if those generated names ever differ).
agg = (
    train_df.groupby("fold")["T"]
            .agg(
                count="count",
                mean="mean",
                median="median",
                std="std",
                q25=lambda s: s.quantile(0.25),
                q75=lambda s: s.quantile(0.75),
            )
            .round(6)
)

print("\n=== Signal-strength statistics per fold ===")
print(agg)

▶ calculating signal statistics (might take a minute) ...
  done.
  Fold 0: valid  5713 samps
  Fold 1: valid  5713 samps
  Fold 2: valid  5713 samps
  Fold 3: valid  5713 samps
  Fold 4: valid  5712 samps

=== Signal-strength statistics per fold ===
      count      mean    median       std       q25       q75
fold                                                         
0      5713  0.083237  0.058225  0.093309  0.024868  0.110467
1      5713  0.082139  0.058231  0.084562  0.024688  0.111087
2      5713  0.084058  0.058430  0.093563  0.024766  0.111370
3      5713  0.083119  0.058297  0.096028  0.024685  0.110580
4      5712  0.083198  0.058100  0.090385  0.024606  0.110559




- 層化が成功している
    - 以前 1st 解法で問題になった「fold0 だけ静か／うるさい」の偏りは見られない。
    - count も ±1 まで揃っており、データ量バランスも◎。

- 右裾（noisy 録音）の影響
    - mean > median（平均 ≈ 0.083 > 中央値 ≈ 0.058） → 分布は右に長い裾を持つ。
        - つまり、少数の“とても大きい T” が全体平均を押し上げている。
    - ただし q25・median・q75 は各 fold でほぼ同じ（例: q75 は 0.1105〜0.1114）で、裾の割合も同程度とみられるため、CV・LB の比較は公平に行える。

- 閾値フィルタの目安
    - q75≈0.111、median≈0.058 → T > 0.15 などにカットを置けば、上位 25 % 未満（目安として ~10 % 程度）の最も noisy な録音を除外可能。正確な割合は `(train_df["T"] > 0.15).mean()` で確認すること。（各 fold の分布がほぼ同じなので、除外しても fold 間バランスは崩れにくい）