In [None]:
'''遍历 MMDataset 下面所有一级子文件夹，
把每个子目录当作一个 session，
自动加载其中的 EDF/WAV 并按 10 秒窗口切成片段，
分别保存到对应的二级输出目录中。'''

import os
import sys

sys.path.append("../")

from util.multimodal_align import load_from_dir, slice_all_pairings

# 0. Root directory; each first-level sub-directory below it is one session.
parent_dir = r"../MMDataset"

# Top-level output directory, e.g. ./MMDataset_segments/
parent_name = os.path.basename(parent_dir.rstrip(r"\/"))
parent_out_root = os.path.join(".", f"{parent_name}_segments")
os.makedirs(parent_out_root, exist_ok=True)

# Maps session name -> the mapping returned by slice_all_pairings for it.
all_results = {}

# 1. Walk the first-level entries of parent_dir.
#    os.listdir() returns names in arbitrary order, so they are sorted here to
#    keep the processing order (and the printed log) reproducible across runs.
for entry in sorted(os.listdir(parent_dir)):
    session_dir = os.path.join(parent_dir, entry)

    # Only directories are sessions; plain files are ignored.
    if not os.path.isdir(session_dir):
        continue

    # Skip hidden / tooling directories such as .vscode or .git.
    if entry.startswith("."):
        print(f"跳过目录（看起来是配置目录）: {entry}")
        continue

    session_name = entry
    print(f"\n==== 开始处理 session: {session_name} ====")

    # Output directory of this session: ./MMDataset_segments/<session_name>/
    out_root = os.path.join(parent_out_root, session_name)
    os.makedirs(out_root, exist_ok=True)

    # 3. Load whatever load_from_dir finds in the session directory
    #    (EDF / WAV (+ JSON) according to the original note).
    sensor_metas, sensor_dict, audio_metas = load_from_dir(session_dir)

    # Nothing was loaded for this session: skip it.
    if len(sensor_metas) == 0 and len(audio_metas) == 0:
        print(f"[{session_name}] 没找到 EDF / WAV，跳过。")
        continue

    # 4. Pair sensor and audio recordings and save 10 s slices under out_root.
    results = slice_all_pairings(
        sensor_metas=sensor_metas,
        sensor_dict=sensor_dict,
        audio_metas=audio_metas,
        out_root=out_root,
        window_sec=10.0,        # 10 s window
        hop_sec=10,             # 10 s hop = no overlap (use 5.0 for 50% overlap)
        sensor_time_col="Epoch_UTC",
        # sensor_value_cols=None,
        min_sensor_rows=1,      # original note: a window needs >= 1 sensor row to be saved
        pair_min_overlap_sec=1.0,
    )

    all_results[session_name] = results

    # results is keyed by (sensor_id, audio_id); each value is a sized
    # collection with one entry per saved segment.
    total_segments = sum(len(v) for v in results.values())
    print(f"[{session_name}] Done. Total segments: {total_segments}")
    for (sensor_id, audio_id), paths in results.items():
        print(f"[{session_name}] {sensor_id}  <->  {audio_id}  ->  {len(paths)} segments")

# 5. Summary over all processed sessions.
print("\n===== 所有 session 统计汇总 =====")
grand_total = 0
for session_name, results in all_results.items():
    total_segments = sum(len(v) for v in results.values())
    print(f"Session {session_name}: {total_segments} segments")
    grand_total += total_segments

print(f"ALL sessions total segments: {grand_total}")


In [None]:
import os

# Show where the kernel is running, so relative paths can be interpreted.
current_dir = os.getcwd()
print("当前工作目录 CWD =", current_dir)

# Show the absolute location of the batch output root defined in an earlier cell.
segments_root_abs = os.path.abspath(parent_out_root)
print("parent_out_root =", segments_root_abs)


In [None]:
import os
import sys


sys.path.append("../")

from util.multimodal_align import load_from_dir, slice_all_pairings


# 1. Data root directory (meant to be a single session) -- point this at the
#    batch you want to process.
root_dir = r"../MMDataset"

# 2. The output directory is ./<basename of root_dir>; no suffix is appended.
session_name = os.path.basename(root_dir.rstrip(r"\/"))
out_root = os.path.join(".", session_name)

# 3. Load whatever load_from_dir finds under root_dir
#    (EDF / WAV (+ JSON) according to the original note).
sensor_metas, sensor_dict, audio_metas = load_from_dir(root_dir)

# 4. Pair sensor and audio recordings and save the slices under out_root.
results = slice_all_pairings(
    sensor_metas=sensor_metas,
    sensor_dict=sensor_dict,
    audio_metas=audio_metas,
    out_root=out_root,
    sensor_time_col="Epoch_UTC",
    # sensor_value_cols=None,
    window_sec=10.0,           # 10 s window
    hop_sec=10,                # 10 s hop = no overlap (use 5.0 for 50% overlap)
    min_sensor_rows=1,         # original note: a window needs >= 1 sensor row to be saved
    pair_min_overlap_sec=1.0,
)

# 5. Report how many segments were produced, overall and per pairing.
#    results is keyed by (sensor_id, audio_id) tuples.
total_segments = sum(map(len, results.values()))
print(f"Done. Total segments: {total_segments}")
for pair_key, paths in results.items():
    sensor_id, audio_id = pair_key
    print(f"{sensor_id}  <->  {audio_id}  ->  {len(paths)} segments")




In [None]:
import os
import numpy as np
import wave
from util.multimodal_align import read_audio_with_json

# Directory that validate_one_segment joins with a segment's audio_id to locate
# the raw WAV file. It reuses root_dir, which must already be defined by an
# earlier cell.
RAW_AUDIO_ROOT = root_dir

def validate_one_segment(npz_path, tol_samples=1, tol_ms=5.0):
    """Cross-check one saved segment (.npz) against the raw WAV it was cut from.

    Checks performed:
      1. the sampling rate stored in the segment equals the raw file's rate
         (a mismatch aborts the remaining checks);
      2. the stored start sample index and the window end time agree with the
         stored UTC timestamps;
      3. the stored audio samples are identical to the samples read back from
         the raw WAV at the stored start index;
      4. every sensor timestamp lies inside the window;
      5. sensor timestamps map onto audio sample indices within ``tol_ms``.

    A ``[WARN]`` line is printed for every failed check and an ``[OK]`` line
    when all of them pass. The raw WAV is located by joining the module-level
    ``RAW_AUDIO_ROOT`` with the segment's ``audio_id``.

    Args:
        npz_path: Path of the segment file. It must provide the keys
            ``audio``, ``audio_rate_hz``, ``audio_start_utc``,
            ``audio_end_utc``, ``audio_file_start_utc``,
            ``audio_sample_start_index``, ``audio_id`` and
            ``sensor_time_epoch``.
        tol_samples: Largest accepted absolute difference, in samples, between
            the stored and the recomputed start index.
        tol_ms: Largest accepted absolute error, in milliseconds, for the
            window end time and for the sensor-to-sample mapping.

    Returns:
        bool: True when every check passes, False otherwise.
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
    # can execute arbitrary code -- only run this on segment files you produced.
    # The context manager closes the archive as soon as the fields are read, so
    # validating many segments does not accumulate open file handles.
    with np.load(npz_path, allow_pickle=True) as d:
        audio_seg = d["audio"]
        fs_seg = int(d["audio_rate_hz"])
        win_start_utc = float(d["audio_start_utc"])
        win_end_utc = float(d["audio_end_utc"])
        file_start_utc = float(d["audio_file_start_utc"])
        start_idx = int(d["audio_sample_start_index"])
        audio_id = str(d["audio_id"])
        sensor_t = d["sensor_time_epoch"].astype(float)

    # 1) Read the raw wav (+ json) back to obtain its sampling rate.
    wav_path = os.path.join(RAW_AUDIO_ROOT, audio_id)
    audio_meta = read_audio_with_json(wav_path)
    fs = audio_meta.rate_hz

    # ---- Check 1: sampling rates agree ----
    if fs != fs_seg:
        print(f"[WARN] {npz_path}: sampling rate mismatch: seg={fs_seg}, wav={fs}")
        return False

    # ---- Check 2: time -> sample index formula is consistent ----
    idx_theory = round((win_start_utc - file_start_utc) * fs)
    ok_idx = abs(idx_theory - start_idx) <= tol_samples
    if not ok_idx:
        print(f"[WARN] {npz_path}: start_idx mismatch: stored={start_idx}, theory={idx_theory}")

    # Timestamp of the last stored sample compared with the stored window end.
    t_theory_end = file_start_utc + (start_idx + len(audio_seg) - 1) / fs
    err_end_ms = (t_theory_end - win_end_utc) * 1000.0
    ok_end = abs(err_end_ms) <= tol_ms
    if not ok_end:
        print(f"[WARN] {npz_path}: window end time mismatch by {err_end_ms:.3f} ms")

    # ---- Check 3: cut the same span out of the raw wav and compare waveforms ----
    # NOTE(review): the comparison treats the data as a flat int16 stream, i.e.
    # it presumably expects mono 16-bit PCM -- confirm for multi-channel files.
    full_seg = None
    sample_width = None
    wav_error = None
    try:
        with wave.open(wav_path, "rb") as wf:
            sample_width = wf.getsampwidth()
            if sample_width == 2:
                # setpos raises wave.Error when start_idx is outside the file.
                wf.setpos(start_idx)
                raw = wf.readframes(len(audio_seg))
                full_seg = np.frombuffer(raw, dtype="<i2")  # little-endian int16
    except wave.Error as exc:
        wav_error = exc

    if wav_error is not None:
        # A bad start index or an unreadable wav fails this segment instead of
        # aborting the whole validation run.
        print(f"[WARN] {npz_path}: cannot read raw wav at sample {start_idx}: {wav_error}")
        ok_audio = False
    elif full_seg is None:
        # Decoding as '<i2' is only meaningful for 2-byte samples.
        print(f"[WARN] {npz_path}: raw wav sample width is {sample_width} bytes, expected 2")
        ok_audio = False
    else:
        # Only the common prefix is compared, so tail padding in the stored
        # segment is tolerated.
        L = min(len(full_seg), len(audio_seg))
        diff = np.abs(full_seg[:L].astype(np.int32) - audio_seg[:L].astype(np.int32))
        max_diff = diff.max() if L > 0 else 0
        ok_audio = (max_diff == 0)
        if not ok_audio:
            print(f"[WARN] {npz_path}: audio segment differs from raw wav, max_diff={max_diff}")

    # ---- Check 4: all sensor timestamps fall inside the window ----
    if len(sensor_t) > 0:
        in_range = (sensor_t.min() >= win_start_utc - 1e-6) and (sensor_t.max() < win_end_utc + 1e-6)
    else:
        in_range = True  # an empty sensor array counts as passing
    if not in_range:
        print(f"[WARN] {npz_path}: sensor timestamps out of window range.")

    # ---- Check 5: sensor time -> sample index round trip stays within tol_ms ----
    if len(sensor_t) > 0:
        idx_local = np.round((sensor_t - win_start_utc) * fs).astype(int)
        t_mapped = win_start_utc + idx_local / fs
        err_ms_arr = (t_mapped - sensor_t) * 1000.0
        max_err_ms = np.max(np.abs(err_ms_arr))
        ok_align = max_err_ms <= tol_ms
        if not ok_align:
            print(f"[WARN] {npz_path}: sensor->audio mapping error up to {max_err_ms:.3f} ms")
    else:
        ok_align = True

    # Several flags may be numpy booleans; return a plain Python bool.
    all_ok = bool(ok_idx and ok_end and ok_audio and in_range and ok_align)
    if all_ok:
        print(f"[OK] {npz_path} passed all checks.")
    return all_ok


In [None]:
import glob

def validate_all_segments(segments_root):
    """Run validate_one_segment on every .npz file found below ``segments_root``.

    The directory tree is searched recursively. The number of files found is
    printed first and the pass count is printed at the end; nothing is returned.
    """
    pattern = os.path.join(segments_root, "**", "*.npz")
    segment_files = glob.glob(pattern, recursive=True)
    print(f"Found {len(segment_files)} segment files.")

    # Count the files for which the per-segment validation reports success.
    passed = sum(1 for segment_file in segment_files if validate_one_segment(segment_file))
    print(f"{passed}/{len(segment_files)} segments passed all checks.")

# Usage: validate every segment file below out_root, which must already be
# defined by an earlier cell (the most recently executed assignment wins).
validate_all_segments(out_root)
