In [1]:
import os
import shutil
from pathlib import Path

import multiprocessing as mp
from pathlib import Path
import os
import json
import pandas as pd
from tqdm import tqdm
from lhotse import CutSet, RecordingSet, SupervisionSet, MonoCut
from lhotse.cut import append_cuts
from lhotse.recipes import prepare_tedlium
from util import *
import logging

# Configure the root logger so that INFO-level messages are shown in the notebook output.
logging.basicConfig(level=logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0


In [2]:
# Location of the TED-LIUM release 3 corpus (input) and of the LongSpeech
# dataset directory (output for manifests and everything generated below).
IN_DIR = "../datasets/LongSpeechSource/TEDLIUM_release-3"
OUT_DIR = '../datasets/LongSpeech'

# Build the lhotse manifests for the corpus with 15 parallel jobs. The result is
# indexed per split and then by 'recordings' / 'supervisions' further down.
manifests = prepare_tedlium(
    tedlium_root=IN_DIR,
    output_dir=OUT_DIR,
    num_jobs=15,
)

INFO:root:Processing train split...
Scanning audio files (*.sph): 2351it [00:14, 159.70it/s]
INFO:root:Processing dev split...
Scanning audio files (*.sph): 8it [00:00, 57.81it/s]
INFO:root:Processing test split...
Scanning audio files (*.sph): 11it [00:00, 60.44it/s]


In [3]:
# Load the shared LongSpeech metadata. The file is opened in a context manager so
# the handle is closed as soon as parsing finishes (a bare open() leaks it).
with open(os.path.join(OUT_DIR, 'metadata.json'), encoding="utf-8") as metadata_file:
    config = json.load(metadata_file)

AVG_DURATION = config['avg_duration']
SAMPLE_RATE = config['sample_rate']
# Name of the JSONL file (inside OUT_DIR) that converted records are appended to.
OUT_FILE_NAME = config['source']
# Value passed as the first id when the packed cuts are numbered.
prev_amount = config['amount']
print(prev_amount)

5290


In [5]:
# Merge the recordings and supervisions of every split into one CutSet.
cuts = CutSet()
for split_manifests in manifests.values():
    cuts += CutSet.from_manifests(
        recordings=split_manifests['recordings'],
        supervisions=split_manifests['supervisions'],
    )

In [11]:
#cuts = CutSet.from_jsonl(os.path.join('../datasets/LongSpeech', 'raw_ted_lium_cuts_head.jsonl'))

In [6]:
# Remove the '<unk>' placeholder from every transcript and trim surrounding whitespace.
# NOTE(review): dropping a mid-sentence '<unk>' leaves two consecutive spaces —
# confirm that downstream text handling tolerates this.
cuts = cuts.transform_text(lambda text: text.replace('<unk>', '').strip())

In [7]:
def trim_silence_from_cut(cut):
    """
    Trim the unsupervised audio at both ends of a single cut.

    The kept region runs from the earliest supervision start to the latest
    supervision end. A cut without any supervision is treated as entirely
    blank.

    Args:
        cut: Object exposing ``supervisions`` (items with ``start`` / ``end``)
            and ``truncate(offset=..., duration=...)``.

    Returns:
        The result of ``cut.truncate`` over the supervised span, or ``None``
        when there are no supervisions or the span is not positive.
    """
    segments = cut.supervisions
    if not segments:
        return None

    first_onset = min(seg.start for seg in segments)
    last_offset = max(seg.end for seg in segments)
    span = last_offset - first_onset

    # A non-positive span means there is nothing left to keep.
    return cut.truncate(offset=first_onset, duration=span) if span > 0 else None

In [8]:
# Trim every cut to its supervised span, then drop the entries for which
# trimming returned None.
cuts = cuts.map(trim_silence_from_cut).filter(lambda trimmed: trimmed is not None)



In [9]:
# Split every cut into 600-second windows with a 600-second hop (no overlap).
chunked_cuts = cuts.cut_into_windows(duration=600, hop=600)

In [14]:
def pack_cuts_to_long_audio(
    cuts: CutSet,
    target_duration: float = 600.0,
    staring_id =  0,
) -> "tuple[CutSet, int]":
    """
    Concatenate consecutive cuts and re-slice them into fixed-length chunks.

    Cuts are appended one after another into a running buffer. Whenever the
    buffer holds at least ``target_duration`` seconds, the first
    ``target_duration`` seconds are split off as a new chunk whose id is the
    current counter formatted as a zero-padded six-digit string; the rest stays
    in the buffer. Whatever is still in the buffer when the input is exhausted
    (less than ``target_duration`` seconds) is not emitted.

    Args:
        cuts: Cuts to pack, consumed in iteration order.
        target_duration: Length in seconds of every emitted chunk; must be positive.
        staring_id: Integer id given to the first emitted chunk; following
            chunks get consecutive ids. (Parameter name kept for callers.)

    Returns:
        A ``(CutSet, int)`` pair: the emitted chunks and the next unused id.

    Raises:
        ValueError: If ``target_duration`` is not positive (the slicing loop
            could never terminate).
    """
    if target_duration <= 0:
        raise ValueError(f"target_duration must be positive, got {target_duration}")

    final_long_cuts = []
    buffer_cut = None

    for cut in cuts:
        # Compare against None explicitly rather than relying on the truthiness of a cut object.
        buffer_cut = cut if buffer_cut is None else buffer_cut.append(cut)

        while buffer_cut is not None and buffer_cut.duration >= target_duration:
            new_chunk = buffer_cut.truncate(offset=0, duration=target_duration)
            final_long_cuts.append(new_chunk.with_id(f"{staring_id:06d}"))
            staring_id += 1
            if buffer_cut.duration > target_duration:
                buffer_cut = buffer_cut.truncate(offset=target_duration)
            else:
                # The buffer was consumed exactly: start fresh instead of
                # truncating at offset == duration, which would leave an empty remainder.
                buffer_cut = None

    return CutSet.from_cuts(final_long_cuts), staring_id


In [15]:
# Pack the windows into 600-second chunks, numbering them starting from the
# amount read from the metadata file.
sliced_cuts, new_amount = pack_cuts_to_long_audio(chunked_cuts, target_duration=600.0, staring_id = prev_amount)

In [16]:
# Display the next unused chunk id returned by the packing step.
new_amount

8411

In [17]:
# Persist the packed cuts. The path is built with os.path.join, the same way the
# conversion step builds it when reading this file back.
sliced_cuts.to_jsonl(os.path.join(OUT_DIR, "grouped_cuts.jsonl"))

In [18]:
def json_from_tedlium_to_allaudios(one_cut):
    """
    Convert one serialized TED-LIUM cut (a dict parsed from a JSONL line) into a
    LongSpeech metadata record.

    Args:
        one_cut: Cut dictionary. A mixed cut lists its parts under ``"tracks"``
            (each entry holding a ``"cut"`` dict); a dict without ``"tracks"``
            is handled as a single-part cut.

    Returns:
        A dict with the keys ``id``, ``source_ds``, ``duration_sec``,
        ``audio_auto``, ``test_auto``, ``num_speakers``, ``num_switches``,
        ``slice`` (``[start, duration]`` per part), ``transcribe`` (the
        punctuation-restored transcripts joined by spaces) and ``components``
        (source paths relative to the ``TEDLIUM_release-3`` directory).
    """
    # A plain (non-mixed) cut dict has no "tracks" key; wrap it so both layouts
    # go through the same loop instead of raising KeyError.
    tracks = one_cut["tracks"] if "tracks" in one_cut else [{"cut": one_cut}]

    sources = []
    total_dur = 0
    transcripts = []
    slices = []
    for subcut in tracks:
        part = subcut["cut"]
        total_dur += part["duration"]
        full_pth = part["recording"]["sources"][0]["source"]
        slices.append([part["start"], part["duration"]])
        # Keep only the portion of the path that follows the corpus root directory name.
        sources.append(full_pth.split("TEDLIUM_release-3")[-1])
        # The "text" field may be missing from a supervision dict, so read it with .get().
        transcript_param = " ".join(
            s["text"] for s in part.get("supervisions", []) if s.get("text")
        )
        if transcript_param != "":
            transcripts.append(restore_punctuation(transcript_param))
        else:
            # Report parts that contribute audio but no transcript.
            print(subcut)

    return {
        "id": one_cut["id"],
        "source_ds": "tedlium",
        "duration_sec": total_dur,
        "audio_auto": False,
        "test_auto": False,
        # NOTE(review): both counts are the number of parts, not distinct
        # speakers or speaker changes — confirm this is the intended meaning.
        "num_speakers": len(sources),
        "num_switches": len(sources),
        "slice": slices,
        "transcribe": " ".join(transcripts),
        "components": sources,
    }

In [19]:
def convert_record(source_jsonl_path: str, target_jsonl_path: str, map_fn):
    """
    Map every record of a JSONL file and append the results to another JSONL file.

    Args:
        source_jsonl_path: File read line by line; each line is parsed as JSON.
        target_jsonl_path: File opened in append mode; existing content is kept
            and one JSON line is added per source record.
        map_fn: Callable applied to each parsed record; its return value is
            serialized (non-ASCII characters are written as-is).
    """
    with open(source_jsonl_path, "r", encoding="utf-8") as reader:
        with open(target_jsonl_path, "a", encoding="utf-8") as writer:
            for raw_line in reader:
                converted = map_fn(json.loads(raw_line))
                serialized = json.dumps(converted, ensure_ascii=False)
                writer.write(serialized + "\n")

In [20]:
# Convert every grouped cut into a LongSpeech record and append it to the
# output file named in the metadata.
convert_record(os.path.join(OUT_DIR, "grouped_cuts.jsonl"),
               os.path.join(OUT_DIR, OUT_FILE_NAME),
               json_from_tedlium_to_allaudios)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


{'cut': {'id': '832a3861-3cee-4c3b-8a22-b3928198bac4', 'start': 611.52, 'duration': 2.5, 'channel': 0, 'supervisions': [], 'recording': {'id': 'Beardyman_2013', 'sources': [{'type': 'file', 'channels': [0], 'source': '../datasets/LongSpeechSource/TEDLIUM_release-3/legacy/train/sph/Beardyman_2013.sph'}], 'sampling_rate': 16000, 'num_samples': 11306446, 'duration': 706.652875, 'channel_ids': [0]}, 'type': 'MonoCut'}, 'type': 'MonoCut', 'offset': 0.0}


In [None]:
def save_audios_from_cutset(cutset, out_dir, num_jobs=1):
    """
    Save the audio of every cut in ``cutset`` as ``<out_dir>/<cut.id>.wav``.

    Args:
        cutset: Iterable of cuts exposing ``id`` and ``save_audio(path)``.
        out_dir: Destination directory; created (with parents) if it does not exist.
        num_jobs: Accepted for interface compatibility but currently unused —
            cuts are saved sequentially.
    """
    # Make sure the destination exists so the first save does not fail on a missing directory.
    os.makedirs(out_dir, exist_ok=True)
    for cut in tqdm(cutset):
        cut.save_audio(os.path.join(out_dir, f"{cut.id}.wav"))

In [30]:
# Write one WAV file per packed chunk into the 'wavs' sub-directory of OUT_DIR.
save_audios_from_cutset(sliced_cuts, os.path.join(OUT_DIR, 'wavs'))

NameError: name 'save_audios_from_cutset' is not defined