In [27]:
import os
import shutil
from pathlib import Path

import multiprocessing as mp
from pathlib import Path
import os
import json
import pandas as pd
from tqdm import tqdm
from lhotse import CutSet, RecordingSet, SupervisionSet, MonoCut
from lhotse.cut import append_cuts
from lhotse.recipes import prepare_tedlium
from util import *
import logging

# Configure the root logger so that INFO-level (and more severe) messages
# are shown in the notebook output.
logging.basicConfig(level=logging.INFO)



In [None]:
# Location of the TED-LIUM release 3 corpus and the directory that receives
# everything this notebook writes.
IN_DIR = "../datasets/LongSpeechSource/TEDLIUM_release-3"
OUT_DIR = "../datasets/LongSpeech"

# Build the lhotse manifests for the corpus using 15 parallel jobs.
manifests = prepare_tedlium(
    tedlium_root=IN_DIR,
    output_dir=OUT_DIR,
    num_jobs=15,
)

In [None]:
# Merge the cuts of every dataset part into a single CutSet.
cuts = CutSet()
for part_manifests in manifests.values():
    cuts += CutSet.from_manifests(
        recordings=part_manifests['recordings'],
        supervisions=part_manifests['supervisions'],
    )

In [11]:
#cuts = CutSet.from_jsonl(os.path.join('../datasets/LongSpeech', 'raw_ted_lium_cuts_head.jsonl'))

In [12]:
def _strip_unk_tokens(text: str) -> str:
    """Remove ``<unk>`` placeholders from a transcript and normalize whitespace.

    Deleting a mid-sentence ``<unk>`` leaves two adjacent spaces behind, so the
    remaining words are re-joined with single spaces; this also trims
    leading and trailing whitespace.
    """
    return " ".join(text.replace("<unk>", "").split())


cuts = cuts.transform_text(_strip_unk_tokens)

In [13]:
def trim_silence_from_cut(cut):
    """Trim the non-speech edges of a single cut using its supervisions.

    The kept region runs from the earliest supervision start to the latest
    supervision end, clamped to the cut's own extent ``[0, cut.duration]``.

    Args:
        cut: Object exposing ``supervisions`` (items with ``start`` and ``end``),
            ``duration`` and ``truncate(offset=..., duration=...)``.
            Supervision times are assumed to be relative to the start of the
            cut, as in lhotse.

    Returns:
        The result of ``cut.truncate(...)`` for the speech region, or ``None``
        when the cut has no supervisions (the whole cut is treated as silence)
        or when the clamped speech region is empty.
    """
    if not cut.supervisions:
        return None

    # Supervisions can stick out past the cut boundaries; clamp so that
    # truncate() never receives a negative offset or a window that extends
    # beyond the end of the cut.
    speech_start = max(0.0, min(s.start for s in cut.supervisions))
    speech_end = min(cut.duration, max(s.end for s in cut.supervisions))

    new_duration = speech_end - speech_start

    if new_duration <= 0:
        return None

    return cut.truncate(offset=speech_start, duration=new_duration)

In [14]:
# Trim every cut to its speech region, then drop the cuts for which
# trim_silence_from_cut returned None.
cuts = (
    cuts
    .map(trim_silence_from_cut)
    .filter(lambda trimmed: trimmed is not None)
)

In [17]:
# Split every cut into windows of 600 seconds (10 minutes).
window_seconds = 600
chunked_cuts = cuts.cut_into_windows(duration=window_seconds)

In [39]:
def pack_cuts_to_long_audio(
    cuts: CutSet,
    target_duration: float = 600.0,
    staring_id =  0,
) -> CutSet:
    """Concatenate cuts back-to-back and re-slice the stream into fixed-length cuts.

    Incoming cuts are appended to a running buffer; every time the buffer
    reaches ``target_duration`` seconds, the first ``target_duration`` seconds
    are emitted as a new cut and the remainder stays in the buffer.

    Args:
        cuts: Cuts to pack, consumed in iteration order.
        target_duration: Length in seconds of every emitted cut; must be positive.
        staring_id: Integer id given to the first emitted cut (the name is a
            typo for "starting id", kept for backward compatibility).
            Following cuts get consecutive ids; ids are zero-padded to six digits.

    Returns:
        A CutSet with the emitted cuts. Whatever is left in the buffer at the
        end (shorter than ``target_duration``) is discarded.

    Raises:
        ValueError: If ``target_duration`` is not positive (the slicing loop
            could never make progress).
    """
    if target_duration <= 0:
        raise ValueError(f"target_duration must be positive, got {target_duration}")

    final_long_cuts = []
    buffer_cut = None  # audio accumulated so far that has not been emitted yet

    for cut in cuts:
        # Explicit None check: do not rely on the truthiness of a cut object.
        buffer_cut = cut if buffer_cut is None else buffer_cut.append(cut)

        while buffer_cut is not None and buffer_cut.duration >= target_duration:
            new_chunk = buffer_cut.truncate(offset=0, duration=target_duration)
            final_long_cuts.append(new_chunk.with_id(f"{staring_id:06d}"))
            staring_id += 1

            if buffer_cut.duration > target_duration:
                buffer_cut = buffer_cut.truncate(offset=target_duration)
            else:
                # Exact fit: nothing remains, so start over with an empty
                # buffer instead of asking truncate() for a zero-length cut.
                buffer_cut = None

    return CutSet.from_cuts(final_long_cuts)


In [42]:
# Re-pack the windowed cuts into back-to-back 600-second cuts.
sliced_cuts = pack_cuts_to_long_audio(
    cuts=chunked_cuts,
    target_duration=600.0,
)

In [43]:
# Persist the packed cuts; the path is built with os.path.join like the other
# output paths in this notebook.
sliced_cuts.to_jsonl(os.path.join(OUT_DIR, "grouped_cuts.jsonl"))

In [32]:
def json_from_tedlium_to_allaudios(one_cut):
    """Convert one serialized packed TED-LIUM cut into a LongSpeech metadata record.

    Args:
        one_cut: Dict parsed from one line of a cuts JSONL file. Either a mixed
            cut with a ``"tracks"`` list (each track holding a ``"cut"`` dict),
            or a single cut dict that carries ``"start"``, ``"duration"``,
            ``"recording"`` and ``"supervisions"`` directly.

    Returns:
        A dict with the record id, the fixed ``source_ds`` tag ``"tedlium"``,
        the summed duration of all pieces, the ``[start, duration]`` slice of
        every piece, the space-joined transcripts after ``restore_punctuation``
        and, for every piece, the part of its audio path that follows
        ``"TEDLIUM_release-3"``. ``num_speakers`` and ``num_switches`` are both
        set to the number of pieces.
    """
    # A packed cut that consists of a single piece is not a mixed cut and has
    # no "tracks" key; treat the cut itself as its only track.
    tracks = one_cut["tracks"] if "tracks" in one_cut else [{"cut": one_cut}]

    sources = []
    total_dur = 0
    transcripts = []
    slices = []
    for track in tracks:
        piece = track["cut"]
        total_dur += piece["duration"]
        full_pth = piece["recording"]["sources"][0]["source"]
        slices.append([piece["start"], piece["duration"]])
        # Keep only the path relative to the corpus root directory.
        sources.append(full_pth.split("TEDLIUM_release-3")[-1])
        # Supervisions without text (missing key or empty string) are skipped.
        transcript_param = " ".join(
            s["text"] for s in piece.get("supervisions", []) if s.get("text")
        )
        transcripts.append(restore_punctuation(transcript_param))

    return {
        "id": one_cut["id"],
        "source_ds": "tedlium",
        "duration_sec": total_dur,
        "audio_auto": False,
        "test_auto": False,
        "num_speakers": len(sources),
        "num_switches": len(sources),
        "slice": slices,
        "transcribe": " ".join(transcripts),
        "components": sources,
    }

In [33]:
def convert_record(source_jsonl_path: str, target_jsonl_path: str, map_fn):
    """Map every record of a JSONL file through ``map_fn`` and append the results.

    Args:
        source_jsonl_path: Path of the JSONL file to read (UTF-8).
        target_jsonl_path: Path of the JSONL file to write (UTF-8). It is
            opened in append mode, so existing content is kept and calling
            this function again adds the records a second time.
        map_fn: Callable that receives one parsed JSON object and returns a
            JSON-serializable object.

    Blank lines in the source file (for example a trailing empty line) are
    skipped instead of being passed to ``json.loads``.
    """
    with open(source_jsonl_path, "r", encoding="utf-8") as src_f, \
         open(target_jsonl_path, "a", encoding="utf-8") as tgt_f:
        for line in src_f:
            if not line.strip():
                continue
            new_item = map_fn(json.loads(line))
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            tgt_f.write(json.dumps(new_item, ensure_ascii=False) + "\n")

In [34]:
# Convert the packed-cut manifest written by the to_jsonl cell; no cell in this
# notebook produces "tmp.jsonl", so reading it breaks a fresh Restart & Run All.
# NOTE: convert_record appends, so re-running this cell duplicates the records
# in final.jsonl.
convert_record(os.path.join(OUT_DIR, "grouped_cuts.jsonl"),
               os.path.join(OUT_DIR, "final.jsonl"),
               json_from_tedlium_to_allaudios)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
# Hand the packed cuts to save_audios_from_cutset together with the target
# directory OUT_DIR/wavs (presumably it writes one audio file per cut — the
# helper is not defined in this notebook, confirm in util).
wavs_dir = os.path.join(OUT_DIR, 'wavs')
save_audios_from_cutset(sliced_cuts, wavs_dir)