In [5]:
# test_labels.py
import os
import sys
from typing import List, Sequence, Dict, Optional, Callable, Tuple
import numpy as np
import glob
# 把上一级目录（也就是 project_root）加到 sys.path
sys.path.append(os.path.abspath(".."))

import argparse
from collections import Counter



def find_segment_paths(root_or_pattern: str) -> List[str]:
    """
    Return the sorted paths of all ``.npz`` segment files selected by a
    directory, a glob pattern, or a single file path.

    Args:
        root_or_pattern: One of the following.
            * An existing directory: searched recursively for ``*.npz``.
              The directory name is taken literally, even if it contains
              glob metacharacters such as ``[``.
            * An existing file whose name ends in ``.npz``: returned as a
              one-element list.
            * A string containing ``*``, ``?`` or ``[``: used as a glob
              pattern as-is (``**`` is recursive).
            * Anything else: treated as a directory path that may not
              exist, which simply yields no matches.

    Returns:
        The matching paths in sorted order. Empty if nothing matches or if
        ``root_or_pattern`` is an empty string.

    Examples:
        paths = find_segment_paths("data/processed/segments_10s/**/*.npz")
        paths = find_segment_paths("data/processed/segments_10s")
    """
    if not root_or_pattern:
        # An empty root would otherwise turn into "/**/*.npz" and crawl the
        # whole filesystem from its root.
        return []

    if os.path.isfile(root_or_pattern) and root_or_pattern.endswith(".npz"):
        # A single segment file stands for itself.
        return [root_or_pattern]

    recursive_suffix = "/**/*.npz"
    if os.path.isdir(root_or_pattern):
        # Escape the directory part so names like "run[1]" are not
        # misread as character classes by glob.
        pattern = glob.escape(root_or_pattern.rstrip("/\\")) + recursive_suffix
    elif any(ch in root_or_pattern for ch in "*?["):
        pattern = root_or_pattern
    else:
        # Not an existing directory and not a pattern: keep treating it as
        # a directory path so the result is an empty list, not an error.
        pattern = root_or_pattern.rstrip("/\\") + recursive_suffix

    return sorted(glob.glob(pattern, recursive=True))


def read_labels_from_segments(paths: Sequence[str]) -> List[str]:
    """
    Read the raw ``label`` entry from each ``.npz`` segment file.

    Labels are reported as stored: no de-duplication and no mapping is
    applied. Files without a ``label`` entry, or whose label is ``None``,
    are skipped, so the result may be shorter than ``paths``.

    Args:
        paths: Paths of the ``.npz`` segment files to read.

    Returns:
        One ``str(label)`` per file that carries a non-``None`` label, in
        the order of ``paths``.
    """
    labels: List[str] = []
    for p in paths:
        # NOTE(security): allow_pickle=True lets a crafted file run arbitrary
        # code while loading; only use this on segment files you trust.
        # The `with` block closes the archive's file handle right away
        # instead of leaving one open per segment until garbage collection.
        with np.load(p, allow_pickle=True) as d:
            if "label" not in d:
                continue
            lab = d["label"]
        # The label may come back as a 0-dim numpy array; unwrap it.
        if isinstance(lab, np.ndarray) and lab.shape == ():
            lab = lab.item()
        if lab is None:
            continue
        labels.append(str(lab))
    return labels

def readaudioSr(paths: Sequence[str]) -> List[int]:
    """
    Read the audio sample rate (``audio_rate_hz``) from each ``.npz`` file.

    Values are reported as stored: no de-duplication and no mapping is
    applied. Files without an ``audio_rate_hz`` entry, or whose value is
    ``None``, are skipped, so the result may be shorter than ``paths``.

    Args:
        paths: Paths of the ``.npz`` segment files to read.

    Returns:
        One ``int(sample_rate)`` per file that carries a non-``None``
        ``audio_rate_hz`` entry, in the order of ``paths``.
    """
    srs: List[int] = []
    for p in paths:
        # NOTE(security): allow_pickle=True lets a crafted file run arbitrary
        # code while loading; only use this on segment files you trust.
        # The `with` block closes the archive's file handle right away
        # instead of leaving one open per segment until garbage collection.
        with np.load(p, allow_pickle=True) as d:
            if "audio_rate_hz" not in d:
                continue
            sr = d["audio_rate_hz"]
        # The value may come back as a 0-dim numpy array; unwrap it.
        if isinstance(sr, np.ndarray) and sr.shape == ():
            sr = sr.item()
        if sr is None:
            continue
        srs.append(int(sr))
    return srs


In [6]:

# Directory (or glob pattern) to scan -- change this to your own path.
root = "./"

paths = find_segment_paths(root)
print("Found", len(paths), "npz files")

# Tally how many segments carry each raw label string.
labels = read_labels_from_segments(paths)
counter = Counter(labels)

print("\nUnique raw labels:")
# Report the labels in sorted order, one line per distinct label.
for lab in sorted(counter):
    cnt = counter[lab]
    print(f"{lab!r}: {cnt} segments")



Found 710 npz files

Unique raw labels:
'3mL secretion': 118 segments
'3mL secretion (no HEMF)': 118 segments
'3ml secretion M4': 118 segments
'5ml secretion M4': 120 segments
'no secretion': 177 segments
'no secretion sound': 59 segments


In [7]:
# Tally how many segments were stored with each audio sample rate.
srs = readaudioSr(paths)
counter_sr = Counter(srs)
print("\nUnique sample rates:")
# Report the rates in ascending order, one line per distinct rate.
for sr in sorted(counter_sr):
    cnt = counter_sr[sr]
    print(f"{sr}: {cnt} segments")


Unique sample rates:
22050: 710 segments
