In [None]:
import argparse, multiprocessing as mp
from pathlib import Path
from util import *
import os
from lhotse import CutSet
from lhotse.recipes import prepare_librispeech
from lhotse.cut import append_cuts
from tqdm import tqdm
import json
import pandas as pd

In [7]:
# Root of the LibriSpeech corpus handed to the lhotse recipe (machine-specific path).
IN_DIR = "/mnt/d/voicedata/test-clean/LibriSpeech"
# Working directory of the LongSpeech dataset; it must already contain metadata.json.
OUT_DIR = '../datasets/LongSpeech'

# Read the dataset settings through a context manager so the file handle is
# closed immediately instead of leaking until garbage collection.
with open(os.path.join(OUT_DIR, 'metadata.json'), encoding='utf-8') as config_file:
    config = json.load(config_file)

AVG_DURATION = config['avg_duration']
SAMPLE_RATE = config['sample_rate']
# File name (relative to OUT_DIR) of the metadata file this notebook writes to.
OUT_FILE_NAME = config['source']
prev_amount = config['amount']

In [5]:
# Pass both directories by keyword: the second positional parameter of
# `prepare_librispeech` is not `output_dir`, so the positional call handed
# OUT_DIR to a different option instead of using it as the manifest output
# directory.  NOTE(review): confirm against the installed lhotse version.
d = prepare_librispeech(corpus_dir=IN_DIR, output_dir=OUT_DIR)

Dataset parts:   0%|          | 0/1 [00:00<?, ?it/s]

Distributing tasks: 0it [00:00, ?it/s]

Processing:   0%|          | 0/2620 [00:00<?, ?it/s]

In [8]:
# Pull the test-clean recording and supervision manifests out of the prepared dict.
rs, ss = (d['test-clean'][kind] for kind in ('recordings', 'supervisions'))
# Rewrite each supervision's text with `restore_punctuation`
# (not defined in this notebook; presumably provided by the `util` star import).
ss_punc = ss.map(lambda supervision: supervision.transform_text(restore_punctuation))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [9]:
# Build the cuts from the recordings and the punctuation-restored supervisions.
cuts = CutSet.from_manifests(recordings=rs, supervisions=ss_punc)

# Persist them so a later cell can reload the manifest as a table.
cuts.to_jsonl(f"{OUT_DIR}/raw_cuts.jsonl")

In [35]:
def prepare_and_group(
        df: pd.DataFrame,
        min_chapter_sec: int = 120,
    ):
    """
    Derive speaker / chapter / segment columns from the ids and drop short chapters.

    Ids are split on '-' and the first three fields are used as
    ``speaker``, ``chapter`` and ``segment_num`` (the latter cast to int).

    :param df: frame with at least an ``id`` and a ``duration`` column.
    :param min_chapter_sec: a (speaker, chapter) pair is kept only when the
        summed duration of its segments reaches this many seconds.
    :return: new frame with columns ``id, duration, speaker, chapter,
        segment_num`` and a fresh 0..n-1 index; the input is not modified.
    """
    id_fields = df['id'].str.split('-', expand=True)

    segments = df[['id', 'duration']].assign(
        duration=df['duration'].astype(float),
        speaker=id_fields[0],
        chapter=id_fields[1],
        segment_num=id_fields[2].astype(int),
    )

    # Total duration of the chapter each segment belongs to, aligned row by row.
    chapter_total = segments.groupby(['speaker', 'chapter'])['duration'].transform('sum')
    keep = chapter_total >= min_chapter_sec
    return segments[keep].reset_index(drop=True)

def build_audio_groups(df: pd.DataFrame,
                       target_sec: int = 600,
                       tol_sec: int = 60,
                       maximum_speakers: int = 3,
                       maximum_switches: int = 3):
    """
    Greedily pack segments into groups of roughly ``target_sec`` seconds.

    Segments are visited in (speaker, chapter, segment_num) order.  A group is
    closed as soon as adding the next segment would push it past
    ``target_sec + tol_sec``.  A closed group is kept only when it contains at
    most ``maximum_speakers`` distinct speakers and at most
    ``maximum_switches`` speaker/chapter changes; otherwise it is discarded.
    The trailing group left over when the data runs out is kept under the same
    limits, provided it already reaches ``target_sec - tol_sec``.

    :param df: frame with ``id``, ``duration``, ``speaker``, ``chapter`` and
        ``segment_num`` columns.
    :param target_sec: desired group duration in seconds.
    :param tol_sec: allowed deviation from ``target_sec`` in seconds.
    :param maximum_speakers: maximum distinct speakers per kept group.
    :param maximum_switches: maximum speaker/chapter changes per kept group.
    :return: list of ``(segment_ids, num_speakers, num_switches)`` tuples.
    """
    # Pre-sort so that consecutive rows are consecutive segments of a chapter.
    df_sorted = df.sort_values(['speaker', 'chapter', 'segment_num']).reset_index(drop=True)

    upper_bound = target_sec + tol_sec
    lower_bound = target_sec - tol_sec

    groups = []

    cur_group, cur_dur = [], 0.0
    cur_speakers = set()
    semantic_changes = 0

    prev_speaker, prev_chapter = None, None

    rows = zip(df_sorted['id'], df_sorted['duration'],
               df_sorted['speaker'], df_sorted['chapter'])
    for seg_id, dur, speaker, chapter in rows:
        dur = float(dur)

        # Close the current group first if this segment would overflow it.
        if cur_group and cur_dur + dur > upper_bound:
            if len(cur_speakers) <= maximum_speakers and semantic_changes <= maximum_switches:
                groups.append((cur_group, len(cur_speakers), semantic_changes))

            # Start a fresh group; its first segment never counts as a switch.
            cur_group, cur_dur = [], 0.0
            cur_speakers, semantic_changes = set(), 0
            prev_speaker = prev_chapter = None

        if prev_speaker is not None and prev_chapter is not None:
            if speaker != prev_speaker or chapter != prev_chapter:
                semantic_changes += 1

        cur_group.append(seg_id)
        cur_dur += dur
        cur_speakers.add(speaker)

        prev_speaker, prev_chapter = speaker, chapter

    # The loop only closes a group when a following segment overflows it, so the
    # last group would otherwise be lost even when it is already long enough.
    if cur_group and cur_dur >= lower_bound:
        if len(cur_speakers) <= maximum_speakers and semantic_changes <= maximum_switches:
            groups.append((cur_group, len(cur_speakers), semantic_changes))

    return groups



In [37]:
"""
mock_strategy = [
    (["1995-1836-0004-470","4507-16021-0026-1247"], 0, 1),
    (["4970-29093-0006-1287", "5105-28233-0007-1413"], 2, 3)
]
"""

# Reload the saved cut manifest as a table (one JSON object per line).
source_df = pd.read_json(f"{OUT_DIR}/raw_cuts.jsonl", lines=True)
# Keep only segments of sufficiently long chapters, then pack them into groups.
processed_df = prepare_and_group(source_df)
real_strategy = build_audio_groups(df=processed_df)

[(['1089-134686-0000-0',
   '1089-134686-0001-1',
   '1089-134686-0002-2',
   '1089-134686-0003-3',
   '1089-134686-0004-4',
   '1089-134686-0005-5',
   '1089-134686-0006-6',
   '1089-134686-0007-7',
   '1089-134686-0008-8',
   '1089-134686-0009-9',
   '1089-134686-0010-10',
   '1089-134686-0011-11',
   '1089-134686-0012-12',
   '1089-134686-0013-13',
   '1089-134686-0014-14',
   '1089-134686-0015-15',
   '1089-134686-0016-16',
   '1089-134686-0017-17',
   '1089-134686-0018-18',
   '1089-134686-0019-19',
   '1089-134686-0020-20',
   '1089-134686-0021-21',
   '1089-134686-0022-22',
   '1089-134686-0023-23',
   '1089-134686-0024-24',
   '1089-134686-0025-25',
   '1089-134686-0026-26',
   '1089-134686-0027-27',
   '1089-134686-0028-28',
   '1089-134686-0029-29',
   '1089-134686-0030-30',
   '1089-134686-0031-31',
   '1089-134686-0032-32',
   '1089-134686-0033-33',
   '1089-134686-0034-34',
   '1089-134686-0035-35',
   '1089-134686-0036-36',
   '1089-134686-0037-37',
   '1089-134691-0000-3

In [38]:
def save_audios_from_cutset(cutset, out_dir, num_jobs=1):
    """
    Write the audio of every cut in ``cutset`` to ``<out_dir>/<cut.id>.wav``.

    :param cutset: iterable of cuts; each must expose ``id`` and ``save_audio(path)``.
    :param out_dir: destination directory; created (including parents) when missing.
    :param num_jobs: accepted for interface compatibility but currently unused;
        cuts are written one after another.
    """
    # Create the destination up front so the first write cannot fail on a missing directory.
    os.makedirs(out_dir, exist_ok=True)
    for cut in tqdm(cutset):
        cut.save_audio(os.path.join(out_dir, f"{cut.id}.wav"))


def from_strategy_to_cuts(source_cuts, strategy: list, starting_cut_id=0):
    """
    Concatenate source cuts according to a grouping strategy.

    :param source_cuts: container of the individual cuts; must support
        ``cut_id in source_cuts`` and ``source_cuts[cut_id]``.
    :param strategy: list of ``(cut_ids, num_speakers, num_switches)`` tuples,
        one per group to build.
    :param starting_cut_id: number used for the first generated id; ids are
        zero-padded to six digits and increase by one per built group.
    :return: ``(target_cuts, custom_feature)`` where ``target_cuts`` is the
        CutSet of concatenated cuts and ``custom_feature`` maps each new id to
        ``{"num_speakers": ..., "num_switches": ...}``.

    Ids that are not present in ``source_cuts`` are ignored.  A group for which
    no id is found is skipped entirely and does not consume an output id.
    """
    target_cuts_list = []
    custom_feature = {}
    next_index = starting_cut_id
    for cluster_ids, num_speaker, num_switch in strategy:
        cutlist = [source_cuts[cut_id] for cut_id in cluster_ids if cut_id in source_cuts]
        if not cutlist:
            # Nothing to concatenate: skip instead of handing an empty list to append_cuts.
            continue
        new_id = f"{next_index:06d}"
        target_cuts_list.append(append_cuts(cutlist).with_id(new_id))
        custom_feature[new_id] = {
            "num_speakers": num_speaker,
            "num_switches": num_switch,
        }
        next_index += 1
    return CutSet(target_cuts_list), custom_feature

In [39]:
# Build the concatenated cuts; output ids start at the function's default.
# NOTE(review): `prev_amount` is read from metadata.json but never used —
# confirm whether numbering should continue from it via `starting_cut_id`.
tgt_cuts, custom_feature = from_strategy_to_cuts(
    source_cuts=cuts,
    strategy=real_strategy,
)
tgt_cuts.to_jsonl(f"{OUT_DIR}/grouped_cuts.jsonl")

In [40]:
def json_from_libri_to_allaudios(custom_feature, one_cut):
    """
    Convert one serialized grouped-cut record into a LongSpeech metadata entry.

    :param custom_feature: mapping from cut id to a dict with ``num_speakers``
        and ``num_switches``; a missing id yields -1 for both fields.
    :param one_cut: dict parsed from one line of the grouped-cuts JSONL file.
        Its parts are read from ``one_cut["tracks"][i]["cut"]``.  A record
        without a ``"tracks"`` key is treated as a single part
        (assumes it carries ``duration`` / ``recording`` / ``supervisions``
        at the top level, like the nested ``"cut"`` objects — TODO confirm).
    :return: dict with the id, total duration, speaker/switch counts, the
        space-joined transcript and the component source paths (the part of
        each path after the last "LibriSpeech").
    """
    tracks = one_cut.get("tracks")
    if tracks is None:
        # Not a concatenation: use the record itself as the only part.
        tracks = [{"cut": one_cut}]

    sources = []
    transcripts = []
    total_dur = 0
    for track in tracks:
        part = track["cut"]
        total_dur += part["duration"]
        full_pth = part["recording"]["sources"][0]["source"]
        sources.append(full_pth.split("LibriSpeech")[-1])
        # A part without supervisions contributes audio but no text.
        supervisions = part.get("supervisions") or []
        if supervisions:
            transcripts.append(supervisions[0]["text"])

    features = custom_feature.get(one_cut["id"], {})
    return {
        "id": one_cut["id"],
        "source_ds": "librispeech",
        "duration_sec": total_dur,
        "audio_auto": False,
        "test_auto": False,
        "num_speakers": features.get("num_speakers", -1),
        "num_switches": features.get("num_switches", -1),
        "transcribe": " ".join(transcripts),
        "components": sources,
    }

In [30]:
def convert_record(source_jsonl_path: str, target_jsonl_path: str, custom_feature, map_fn):
    """
    Map every record of a JSONL file and append the results to another JSONL file.

    :param source_jsonl_path: input file, one JSON document per line.
    :param target_jsonl_path: output file, opened in append mode, so existing
        content is preserved and repeated calls add further lines.
    :param custom_feature: passed unchanged as the first argument of ``map_fn``.
    :param map_fn: callable ``map_fn(custom_feature, record)`` returning a
        JSON-serializable object for each parsed record.
    """
    with open(source_jsonl_path, "r", encoding="utf-8") as reader:
        with open(target_jsonl_path, "a", encoding="utf-8") as writer:
            # Stream record by record so earlier results are written before later ones are parsed.
            for raw_line in reader:
                converted = map_fn(custom_feature, json.loads(raw_line))
                serialized = json.dumps(converted, ensure_ascii=False)
                writer.write(serialized + "\n")

In [31]:
# Append one LongSpeech metadata line per grouped cut to the dataset's metadata file.
convert_record(
    source_jsonl_path=os.path.join(OUT_DIR, "grouped_cuts.jsonl"),
    target_jsonl_path=os.path.join(OUT_DIR, OUT_FILE_NAME),
    custom_feature=custom_feature,
    map_fn=json_from_libri_to_allaudios,
)

In [32]:
# Render each grouped cut to a wav file under OUT_DIR/wavs.
save_audios_from_cutset(cutset=tgt_cuts, out_dir=os.path.join(OUT_DIR, 'wavs'))

100%|██████████| 2/2 [00:00<00:00,  4.27it/s]


In [None]:
"""
cutlist = []

for cut in cuts:
    if cut.duration > 10:  # Only keep cuts longer than 10 seconds
        cutlist.append(cut)
    if len(cutlist) >=5:
        break
new_cut = append_cuts(cutlist)
new_cut.save_audio(os.path.join(OUT_DIR, 'wav/some.wav'))

"""
"""
def new_map_cut_id(cuts, starting_cut_id=0):
    new_cuts = (
        cut.with_id(f"{starting_cut_id + i:06d}")
        for i, cut in enumerate(cuts)
    )
    return CutSet.from_cuts(new_cuts).to_eager()
"""