In [1]:
import os
import shutil
from pathlib import Path

import multiprocessing as mp
from pathlib import Path
import os
import json
import pandas as pd
from tqdm import tqdm
from lhotse import CutSet, RecordingSet, SupervisionSet, MonoCut
from mylhotse.iwslt_offlinetask import prepare_iwslt_offlinetask
from lhotse.cut import append_cuts, MixedCut
from lhotse.recipes import prepare_tedlium
from util import *
import logging


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0


In [2]:
# Raw IWSLT offline-task corpus (input) and the LongSpeech output tree.
IN_DIR = "../datasets/LongSpeechSource/IWSLT.OfflineTask"
OUT_DIR = "../datasets/LongSpeech_p2"

# Build the manifests from the corpus; the result is indexed per part below.
manifests = prepare_iwslt_offlinetask(
    corpus_dir=Path(IN_DIR),
    output_dir=Path(OUT_DIR),
)

[WARN] skip ../datasets/LongSpeechSource/IWSLT.OfflineTask/data/en-de/tst2018: wav/ or ctms/ missing.
[WARN] skip ../datasets/LongSpeechSource/IWSLT.OfflineTask/data/en-de/tst2019: wav/ or ctms/ missing.
[WARN] skip ../datasets/LongSpeechSource/IWSLT.OfflineTask/data/en-de/tst2020: wav/ or ctms/ missing.
[WARN] skip ../datasets/LongSpeechSource/IWSLT.OfflineTask/data/en-de/tst2021: wav/ or ctms/ missing.
[WARN] skip ../datasets/LongSpeechSource/IWSLT.OfflineTask/data/en-de/tst2022: wav/ or ctms/ missing.


In [3]:
# Load the shared LongSpeech metadata. A context manager is used so the file
# handle is closed deterministically instead of being left to the GC.
with open(os.path.join(OUT_DIR, 'metadata.json'), encoding="utf-8") as config_file:
    config = json.load(config_file)

AVG_DURATION = config['avg_duration']
SAMPLE_RATE = config['sample_rate']
OUT_FILE_NAME = config['source']  # file name (inside OUT_DIR) that converted records are appended to
prev_amount = config['amount']  # used below as the first id for the newly packed cuts
print(prev_amount)

16071


In [4]:
# Merge every corpus part into one CutSet, running each supervision's text
# through `restore_punctuation` before pairing it with the recordings.
cuts = CutSet()
for part_manifests in manifests.values():
    punctuated_supervisions = part_manifests['supervisions'].map(
        lambda seg: seg.transform_text(restore_punctuation)
    )
    cuts += CutSet.from_manifests(
        recordings=part_manifests['recordings'],
        supervisions=punctuated_supervisions,
    )

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [5]:
# Persist the untrimmed, unpacked cuts.
raw_cuts_path = os.path.join(OUT_DIR, "iwslt_raw_cuts.jsonl")
cuts.to_jsonl(raw_cuts_path)

In [6]:
def trim_silence_from_cut(cut):
    """
    Trim the unsupervised head and tail of a single cut.

    The kept region runs from the earliest supervision start to the latest
    supervision end. A cut without supervisions is treated as entirely
    silent and yields None; so does a cut whose supervised span is not
    strictly positive.
    """
    segments = cut.supervisions
    if not segments:
        return None

    first_onset = min(seg.start for seg in segments)
    last_offset = max(seg.end for seg in segments)
    speech_span = last_offset - first_onset

    # Nothing worth keeping when the supervised span is empty or inverted.
    if speech_span <= 0:
        return None

    return cut.truncate(offset=first_onset, duration=speech_span)

In [7]:
# Trim every cut to its supervised span, then drop the ones for which
# trim_silence_from_cut returned None.
trimmed_or_none = cuts.map(trim_silence_from_cut)
cuts = trimmed_or_none.filter(lambda trimmed: trimmed is not None)

In [8]:
def pack_cuts_to_long_audio(
    cuts: CutSet,
    target_duration: float = 600.0,
    staring_id: int = 0,
) -> "tuple[CutSet, int]":
    """
    Concatenate cuts back to back and slice the stream into fixed-length chunks.

    Incoming cuts are appended to a running buffer; each time the buffer
    reaches ``target_duration`` seconds, the first ``target_duration`` seconds
    are split off as a new chunk and given a zero-padded six-digit id.

    Args:
        cuts: Cuts to pack, consumed in iteration order.
        target_duration: Length in seconds of every emitted chunk; must be > 0.
        staring_id: Numeric id given to the first emitted chunk (the spelling
            is kept because callers pass it by keyword).

    Returns:
        A ``(packed_cuts, next_id)`` tuple: the CutSet of emitted chunks and
        the id following the last one used. Whatever is left in the buffer
        at the end (shorter than ``target_duration``) is discarded.

    Raises:
        ValueError: If ``target_duration`` is not strictly positive.
    """
    if target_duration <= 0:
        raise ValueError(f"target_duration must be positive, got {target_duration}")

    final_long_cuts = []
    next_id = staring_id
    buffer_cut = None

    for cut in cuts:
        buffer_cut = cut if buffer_cut is None else buffer_cut.append(cut)

        while buffer_cut is not None and buffer_cut.duration >= target_duration:
            new_chunk = buffer_cut.truncate(offset=0, duration=target_duration)
            final_long_cuts.append(new_chunk.with_id(f"{next_id:06d}"))
            next_id += 1

            if buffer_cut.duration > target_duration:
                buffer_cut = buffer_cut.truncate(offset=target_duration)
            else:
                # The buffer was consumed exactly; asking truncate for a
                # zero-length remainder would be invalid, so start afresh.
                buffer_cut = None

    return CutSet.from_cuts(final_long_cuts), next_id


In [9]:
# Pack the trimmed cuts into 600-second chunks; ids continue from the amount
# recorded in metadata.json.
# NOTE(review): AVG_DURATION is loaded from metadata.json but the chunk length
# here is hard-coded — confirm the two are meant to agree.
sliced_cuts, new_amount = pack_cuts_to_long_audio(
    cuts,
    target_duration=600.0,
    staring_id=prev_amount,
)

In [10]:
# Id following the last chunk emitted by pack_cuts_to_long_audio.
# NOTE(review): no visible cell writes this back to metadata.json's 'amount';
# confirm it is persisted elsewhere before the next source is processed.
new_amount

16124

In [11]:
# Persist the packed cuts. The path is built with os.path.join so it matches
# the way the conversion step later locates this same file.
sliced_cuts.to_jsonl(os.path.join(OUT_DIR, "iwslt_grouped_cuts.jsonl"))

In [12]:
def json_from_iwslt_to_allaudios(one_cut):
    """
    Convert one serialized IWSLT cut (a dict parsed from a cuts JSONL line)
    into a single LongSpeech metadata record.

    A cut with a "tracks" key is treated as a sequence of sub-cuts (each
    track may wrap its cut under "cut"); any other cut is treated as a
    single sub-cut. For every sub-cut the source path, its [start, duration]
    slice and its joined supervision text are collected.

    Args:
        one_cut: Dict with at least "id" and, per sub-cut, "start",
            "duration" and "recording" -> "sources"[0] -> "source".

    Returns:
        A dict with the LongSpeech fields "id", "source_ds", "duration_sec",
        "audio_auto", "text_auto", "language", "num_speakers",
        "num_switches", "slice", "transcribe" and "components".
    """
    tracks = one_cut["tracks"] if "tracks" in one_cut else [one_cut]

    sources = []
    slices = []
    transcripts = []
    total_dur = 0
    for track in tracks:
        sub_cut = track["cut"] if "cut" in track else track
        total_dur += sub_cut["duration"]
        full_pth = sub_cut["recording"]["sources"][0]["source"]
        slices.append([sub_cut["start"], sub_cut["duration"]])
        # Keeps whatever follows the last occurrence of the substring "data".
        # NOTE(review): this also matches "data" inside directory or file
        # names (e.g. "datasets") — confirm that is acceptable for all inputs.
        sources.append(full_pth.split("data")[-1])

        # Supervisions may be serialized without a "text" key; treat a
        # missing or empty text as "no transcript" rather than raising KeyError.
        texts = [s.get("text") for s in sub_cut.get("supervisions", [])]
        transcript_param = " ".join(t for t in texts if t)
        if transcript_param != "":
            transcripts.append(restore_punctuation(transcript_param))
        else:
            # Surface sub-cuts that contribute audio but no transcript.
            print(track)

    return {
        "id": one_cut["id"],
        "source_ds": "iwslt",
        "duration_sec": total_dur,
        "audio_auto": False,
        "text_auto": False,
        "language": 'en',
        # NOTE(review): both counts are the number of components — confirm
        # that this is the intended definition of speakers and switches.
        "num_speakers": len(sources),
        "num_switches": len(sources),
        "slice": slices,
        "transcribe": " ".join(transcripts),
        "components": sources,
    }

In [13]:
def convert_record(source_jsonl_path: str, target_jsonl_path: str, map_fn):
    """
    Map every JSON record of one JSONL file and append the results to another.

    Args:
        source_jsonl_path: JSONL file to read, one JSON value per line.
        target_jsonl_path: JSONL file to append to. It is opened in append
            mode, so existing content is kept and running the conversion
            twice writes the records twice.
        map_fn: Callable that takes one parsed record and returns the
            JSON-serializable record to write.
    """
    with open(source_jsonl_path, "r", encoding="utf-8") as src_f, \
         open(target_jsonl_path, "a", encoding="utf-8") as tgt_f:
        for line in src_f:
            # Blank lines (e.g. a trailing newline at EOF) carry no record
            # and would make json.loads raise.
            if not line.strip():
                continue
            new_item = map_fn(json.loads(line))
            tgt_f.write(json.dumps(new_item, ensure_ascii=False) + "\n")

In [14]:
# Convert each packed cut into a LongSpeech record and append it to the
# metadata file named in metadata.json.
convert_record(
    source_jsonl_path=os.path.join(OUT_DIR, "iwslt_grouped_cuts.jsonl"),
    target_jsonl_path=os.path.join(OUT_DIR, OUT_FILE_NAME),
    map_fn=json_from_iwslt_to_allaudios,
)

In [15]:
def save_audios_from_cutset(cutset, out_dir, num_jobs=1):
    """
    Save the audio of every cut in ``cutset`` as ``<out_dir>/<cut.id>.wav``.

    Args:
        cutset: Iterable of cuts exposing ``id`` and ``save_audio(path)``.
        out_dir: Destination directory; created if it does not exist.
        num_jobs: Currently unused — cuts are saved sequentially. Kept so
            existing callers that pass it keep working.
    """
    # Make sure the destination exists so the first save does not depend on
    # the directory having been created by hand.
    os.makedirs(out_dir, exist_ok=True)
    for cut in tqdm(cutset):
        cut.save_audio(os.path.join(out_dir, f"{cut.id}.wav"))

In [16]:
# Render every packed cut to a wav file under OUT_DIR/wavs.
save_audios_from_cutset(
    cutset=sliced_cuts,
    out_dir=os.path.join(OUT_DIR, 'wavs'),
)

100%|██████████| 53/53 [00:13<00:00,  4.04it/s]
