In [26]:
import logging
import re
import string
from concurrent.futures.thread import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Union
from datasets import load_dataset
import tqdm
from itertools import chain
from lhotse import (
    AudioSource,
    Recording,
    RecordingSet,
    SupervisionSegment,
    SupervisionSet,
    validate_recordings_and_supervisions,
)
from lhotse.qa import fix_manifests
from lhotse.utils import Pathlike
from huggingface_hub import snapshot_download


In [27]:
def clean_token(w: str) -> str:
    NOISE_PREFIXES = ('$(', '<', '[')
    w = re.sub(r'\(\d+\)$', '', w)     # 去掉尾部(1)
    if w.startswith(NOISE_PREFIXES):   # 过滤噪声占位
        return ''
    return w

def parse_ctm_to_supervisions(ctm_path: Path, recording_id: str, channel: int = 0):
    sups = []
    seg_start = None
    seg_id = None
    words = []
    last_word_end = 0.0

    def flush_segment(next_start):
        nonlocal sups, seg_id, seg_start, words, last_word_end
        if seg_id is None:
            return
        end_time = next_start if next_start is not None else last_word_end
        if end_time is None or seg_start is None or end_time <= seg_start:
            return
        text = " ".join(words)
        sups.append(
            SupervisionSegment(
                id=seg_id,
                recording_id=recording_id,
                start=seg_start,
                duration=end_time - seg_start,
                channel=channel,
                text=text,
                language="en",
                speaker = recording_id,
            )
        )
        # reset
        seg_id = None
        seg_start = None
        words = []

    with ctm_path.open() as f:
        for ln in f:
            ln = ln.strip()
            if not ln:
                continue
            if ln.startswith('#'):
                parts = ln[1:].split()
                new_seg_id = parts[0]
                new_start = float(parts[1])
                flush_segment(new_start)
                seg_id = new_seg_id
                seg_start = new_start
                continue

            parts = ln.split()
            if len(parts) < 5:
                continue
            # utt = parts[0]; ch = int(parts[1])
            st, du, wd = float(parts[2]), float(parts[3]), parts[4]
            wd = clean_token(wd)
            if not wd:
                continue
            words.append(wd)
            last_word_end = max(last_word_end, st + du)

    flush_segment(None)
    return SupervisionSet.from_segments(sups)

In [24]:
sups = parse_ctm_to_supervisions(Path("/mnt/d/repo/IWSLT_OfflineTask/data/en-de/dev2010/ctms/dev2010.en.talkid69.ctm"), "talkid767")

In [None]:
sups = parse_ctm_to_supervisions(Path("/mnt/d/repo/IWSLT_OfflineTask/data/en-de/tst2010/ctms/tst2010.en.talkid767.ctm"), "talkid767")

In [25]:
for sup in sups.subset(first=5):
    print(sup)

SupervisionSegment(id='ted_dev2010_talkid69_30_02', recording_id='talkid767', start=30.02, duration=1252.85, channel=0, text="Every you look around the world you discover that these are not cultures destined to fade away these are dynamic living people's being driven out of existence by identifiable forces that are beyond their capacity to adapt to whether it's egregious deforestation in the homeland of the planned a nomadic people from Southeast Asia from Sarawak a people who lived free in the forest until a generation ago and now have all been reduced to servitude and prostitution on the banks of the rivers where you can see the river itself a soiled with the silt that seems to be carrying half of Borneo way to the So China Sea where the Japanese freighters hang light in the horizon ready to fill their holds with raw logs ripped from the forest or in the case of the UN-AU mommy is a disease entities that have come in the wake of the discovery of gold Just to know that job were Sharma

In [36]:
prefix   = "iwslt_offline"   # 可为空字符串
pair_key = "de-en"

def fname(kind: str) -> str:
    return f"{prefix + '_' if prefix else ''}{pair_key}_{kind}.jsonl.gz"


def prepare_iwslt_offlinetask(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
):
    audio_dir = corpus_dir / "data/en-de"
    all_recordings = []
    all_supervisions = []

    for split_dir in audio_dir.iterdir():

        file_order = split_dir / "FILE_ORDER"

        wav_dir = split_dir / "wavs"
        ctm_dir = split_dir / "ctms"

        if not wav_dir.is_dir() or not ctm_dir.is_dir():
            print(f"[WARN] skip {split_dir}: wav/ or ctms/ missing.")
            continue

        ids = [l.strip() for l in file_order.read_text().splitlines() if l.strip()]
        print(ids)
        # ---- Supervisions ----
        sup_sets_this_split = []
        for rid in ids:
            ctm = ctm_dir / f"{rid}.ctm"
            if not ctm.is_file():
                print(f"[WARN] missing ctm: {ctm}")
                continue
            sup_sets_this_split.append(parse_ctm_to_supervisions(ctm, rid))
        sups_split = SupervisionSet.from_segments(seg for s in sup_sets_this_split for seg in s)
        all_supervisions.append(sups_split)

        # ---- Recordings ----
        recs_split = RecordingSet.from_recordings(
            Recording.from_file(wav_dir / f"{rid}.wav", recording_id=rid)
            for rid in ids
            if (wav_dir / f"{rid}.wav").is_file()
        )
        all_recordings.append(recs_split)

    # ---- 合并所有 split ----
    recordings = RecordingSet.from_recordings(chain.from_iterable(r for r in all_recordings))
    supervisions = SupervisionSet.from_segments(chain.from_iterable(s for s in all_supervisions))

    recordings.to_file(output_dir / fname("recordings"))
    supervisions.to_file(output_dir / fname("supervisions"))
    manifests = {
            "en-de": {
            "recordings": recordings,
            "supervisions": supervisions
        }
    }
    return manifests

In [37]:
IN_DIR = "/mnt/d/repo/IWSLT_OfflineTask"
OUT_DIR = '../datasets/LongSpeech'
manifests = prepare_iwslt_offlinetask(corpus_dir=Path(IN_DIR), output_dir=Path(OUT_DIR))

['dev2010.en.talkid535', 'dev2010.en.talkid531', 'dev2010.en.talkid457', 'dev2010.en.talkid453', 'dev2010.en.talkid227', 'dev2010.en.talkid129', 'dev2010.en.talkid69', 'dev2010.en.talkid93']
['tst2010.en.talkid837', 'tst2010.en.talkid824', 'tst2010.en.talkid805', 'tst2010.en.talkid799', 'tst2010.en.talkid792', 'tst2010.en.talkid790', 'tst2010.en.talkid785', 'tst2010.en.talkid783', 'tst2010.en.talkid779', 'tst2010.en.talkid767', 'tst2010.en.talkid769']
['tst2013.en.talkid1666', 'tst2013.en.talkid1617', 'tst2013.en.talkid1592', 'tst2013.en.talkid1518', 'tst2013.en.talkid1685', 'tst2013.en.talkid1548', 'tst2013.en.talkid1539', 'tst2013.en.talkid1647', 'tst2013.en.talkid1659', 'tst2013.en.talkid1694', 'tst2013.en.talkid1600', 'tst2013.en.talkid1520', 'tst2013.en.talkid1553', 'tst2013.en.talkid1534', 'tst2013.en.talkid1699', 'tst2013.en.talkid1634']
['tst2014.en.talkid1741', 'tst2014.en.talkid1781', 'tst2014.en.talkid1755', 'tst2014.en.talkid1852', 'tst2014.en.talkid1858', 'tst2014.en.talki