In [2]:
from datasets import load_dataset, Audio, IterableDataset
from itertools import chain
import numpy as np

In [9]:
# Audio configuration shared by the cells below.
SAMPLE_RATE = 16_000  # Hz; passed to Audio(sampling_rate=...) when casting the "audio" column
# NOTE(review): presumably an upper bound on clip duration in seconds; it is not
# referenced by any of the visible cells, so confirm its unit and intended use.
MAX_DURATION = 600

In [3]:

# Open every LibriSpeech split ("all" config) in streaming mode, so samples are
# fetched lazily while iterating instead of downloading the corpus up front.
ds_dict = load_dataset("openslr/librispeech_asr", name="all", streaming=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [8]:
# Show the available splits and the feature names each one exposes.
print(ds_dict)

IterableDatasetDict({
    train.clean.100: IterableDataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        n_shards: 1
    })
    train.clean.360: IterableDataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        n_shards: 1
    })
    train.other.500: IterableDataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        n_shards: 1
    })
    validation.clean: IterableDataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        n_shards: 1
    })
    validation.other: IterableDataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        n_shards: 1
    })
    test.clean: IterableDataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id'],
        n_shards: 1
    })
    test.other: IterableDataset({
        features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id']

In [13]:
# Preview the first few samples of every split to see how utterances are ordered
# (speaker / chapter / file name).
for split_name, split_dataset in ds_dict.items():
    print(f"Split: {split_name}")
    split_dataset = split_dataset.cast_column(
        "audio", Audio(sampling_rate=SAMPLE_RATE)
    )
    for seen, sample in enumerate(split_dataset, start=1):
        print(f"Speaker ID: {sample['speaker_id']}, Chapter ID: {sample['chapter_id']}, file: {sample['file']}" )
        # Stop right after the 5th sample so no further item is pulled from the stream.
        if seen >= 5:
            break


Split: train.clean.100
Speaker ID: 374, Chapter ID: 180298, file: 374-180298-0000.flac
Speaker ID: 374, Chapter ID: 180298, file: 374-180298-0001.flac
Speaker ID: 374, Chapter ID: 180298, file: 374-180298-0002.flac
Speaker ID: 374, Chapter ID: 180298, file: 374-180298-0003.flac
Speaker ID: 374, Chapter ID: 180298, file: 374-180298-0004.flac
Split: train.clean.360
Speaker ID: 1487, Chapter ID: 133273, file: 1487-133273-0000.flac
Speaker ID: 1487, Chapter ID: 133273, file: 1487-133273-0001.flac
Speaker ID: 1487, Chapter ID: 133273, file: 1487-133273-0002.flac
Speaker ID: 1487, Chapter ID: 133273, file: 1487-133273-0003.flac
Speaker ID: 1487, Chapter ID: 133273, file: 1487-133273-0004.flac
Split: train.other.500
Speaker ID: 8296, Chapter ID: 266250, file: 8296-266250-0000.flac
Speaker ID: 8296, Chapter ID: 266250, file: 8296-266250-0001.flac
Speaker ID: 8296, Chapter ID: 266250, file: 8296-266250-0002.flac
Speaker ID: 8296, Chapter ID: 266250, file: 8296-266250-0003.flac
Speaker ID: 8296,

In [6]:
def _make_chunk(key, pieces, seconds):
    """Build one output record from the buffered waveforms of a single (speaker, chapter) key."""
    return {
        "speaker_id": key[0],
        "chapter_id": key[1],
        "audio": np.concatenate(pieces),
        "length_sec": seconds,
    }


def _group_utterances(samples, target_len_sec, sampling_rate):
    """Merge consecutive samples that share (speaker_id, chapter_id) into chunks.

    A chunk is closed as soon as the next sample belongs to a different
    (speaker_id, chapter_id) pair, or adding it would bring the buffered
    duration to ``target_len_sec`` or more. A single sample that is itself
    at least ``target_len_sec`` long is therefore emitted as its own chunk.
    """
    pending, pending_sec, pending_key = [], 0.0, None

    for sample in samples:
        key = (sample["speaker_id"], sample["chapter_id"])
        wav = sample["audio"]["array"]
        dur = len(wav) / sampling_rate

        # New speaker/chapter, or the duration threshold would be reached:
        # flush what has been buffered so far. The buffer is only ever empty
        # before the first sample, in which case there is nothing to flush.
        if pending and (key != pending_key or pending_sec + dur >= target_len_sec):
            yield _make_chunk(pending_key, pending, pending_sec)
            pending, pending_sec = [], 0.0

        pending_key = key
        pending.append(wav)
        pending_sec += dur

    # Leftover samples once the stream is exhausted.
    if pending:
        yield _make_chunk(pending_key, pending, pending_sec)


def stream_librispeech_grouped_all(target_len_sec=30.0, sampling_rate=16000):
    """Stream LibriSpeech ("all" config) as concatenated per-chapter audio chunks.

    Every split returned by ``load_dataset`` is iterated in order, its
    "audio" column is cast to ``Audio(sampling_rate=sampling_rate)``, and
    consecutive utterances with the same ``(speaker_id, chapter_id)`` are
    concatenated until the next utterance would bring the chunk to
    ``target_len_sec`` seconds or more.

    This is a generator: the dataset is only opened when iteration starts.

    Args:
        target_len_sec: Duration threshold in seconds at which a chunk is closed.
        sampling_rate: Sampling rate used for the audio cast and to convert
            sample counts into seconds.

    Yields:
        dict with keys ``"speaker_id"``, ``"chapter_id"``, ``"audio"``
        (the concatenated waveform array) and ``"length_sec"`` (summed
        duration of the concatenated utterances).
    """
    ds_dict = load_dataset(
        "openslr/librispeech_asr",
        "all",
        streaming=True,
    )

    def every_sample():
        for split in ds_dict.values():
            yield from split.cast_column("audio", Audio(sampling_rate=sampling_rate))

    yield from _group_utterances(every_sample(), target_len_sec, sampling_rate)
