In [16]:
import argparse, multiprocessing as mp
from pathlib import Path

from lhotse_util import *
from util import *
import os
from lhotse import CutSet
from lhotse.recipes import prepare_librispeech
from lhotse.cut import append_cuts
from tqdm import tqdm
import json
import pandas as pd

In [17]:
# Relative path of the LongSpeech dataset directory; the cuts manifest
# (cuts.jsonl) is read from here.
OUT_DIR = '../datasets/LongSpeech'

In [39]:
# Load the cuts manifest into a DataFrame; lines=True parses the file as
# one JSON record per line.
source_df = pd.read_json(OUT_DIR + "/cuts.jsonl", lines=True)

In [40]:
# Preview the first rows of the loaded manifest.
source_df.head()

Unnamed: 0,id,start,duration,channel,supervisions,recording,type
0,4507-16021-0047-1268,0,34.955,0,"[{'id': '4507-16021-0047', 'recording_id': '45...","{'id': '4507-16021-0047', 'sources': [{'type':...",MonoCut
1,1995-1836-0004-470,0,33.91,0,"[{'id': '1995-1836-0004', 'recording_id': '199...","{'id': '1995-1836-0004', 'sources': [{'type': ...",MonoCut
2,7021-79730-0003-2032,0,32.88,0,"[{'id': '7021-79730-0003', 'recording_id': '70...","{'id': '7021-79730-0003', 'sources': [{'type':...",MonoCut
3,908-157963-0007-2570,0,32.77,0,"[{'id': '908-157963-0007', 'recording_id': '90...","{'id': '908-157963-0007', 'sources': [{'type':...",MonoCut
4,2094-142345-0008-519,0,31.65,0,"[{'id': '2094-142345-0008', 'recording_id': '2...","{'id': '2094-142345-0008', 'sources': [{'type'...",MonoCut


In [50]:
def prepare_and_group(
        df: pd.DataFrame,
        min_chapter_sec: int = 120,
    ):
    """
    Derive speaker/chapter/segment columns from the cut ids and drop short chapters.

    Each ``id`` is split on ``'-'``; the first three fields become the
    ``speaker``, ``chapter`` and ``segment_num`` columns (``segment_num`` is
    cast to int, ``duration`` to float). Only the ``id`` and ``duration``
    columns of the input are carried over, and the input frame itself is not
    modified.

    Args:
        df: Frame with at least an ``id`` and a ``duration`` column.
        min_chapter_sec: Minimum summed duration a (speaker, chapter) pair
            must reach for its rows to be kept.

    Returns:
        A new frame with columns ``id``, ``duration``, ``speaker``,
        ``chapter``, ``segment_num`` and a fresh 0..n-1 index, containing only
        rows whose (speaker, chapter) total duration is >= ``min_chapter_sec``.
    """
    id_fields = df['id'].str.split('-', expand=True)

    segments = df[['id', 'duration']].assign(
        duration=df['duration'].astype(float),
        speaker=id_fields[0],
        chapter=id_fields[1],
        segment_num=id_fields[2].astype(int),
    )

    # Total duration of the chapter each row belongs to, broadcast back per row.
    chapter_total = segments.groupby(['speaker', 'chapter'])['duration'].transform('sum')
    keep_mask = chapter_total >= min_chapter_sec

    return segments[keep_mask].reset_index(drop=True)

In [34]:
def build_audio_groups(df: pd.DataFrame,
                       target_sec: int = 600,
                       tol_sec: int = 60,
                       maximum_speakers: int = 3,
                       maximum_switches: int = 3):
    """
    Greedily pack the segments of ``df`` into groups of roughly ``target_sec`` seconds.

    Segments are visited in (speaker, chapter, segment_num) order and appended
    to the current group until the next segment would push the total duration
    past ``target_sec + tol_sec``; the group is then closed and a new one is
    started. A closed group is kept only if it contains at most
    ``maximum_speakers`` distinct speakers and at most ``maximum_switches``
    speaker/chapter changes; otherwise all of its segments are discarded.

    The group still open when the input is exhausted is kept too, provided it
    satisfies the same limits and lasts at least ``target_sec - tol_sec``
    seconds. A shorter remainder is dropped.

    Args:
        df: Frame with columns ``id``, ``duration``, ``speaker``, ``chapter``
            and ``segment_num``.
        target_sec: Desired total duration of a group.
        tol_sec: Allowed deviation from ``target_sec``.
        maximum_speakers: Maximum number of distinct speakers in a kept group.
        maximum_switches: Maximum number of speaker/chapter changes between
            consecutive segments in a kept group.

    Returns:
        A list of ``(segment_ids, n_speakers, n_switches)`` tuples, one per
        kept group, where ``segment_ids`` is the ordered list of ``id`` values.
    """
    # Sort so that consecutive segments of one chapter end up adjacent.
    df_sorted = df.sort_values(['speaker', 'chapter', 'segment_num']).reset_index(drop=True)

    max_dur = target_sec + tol_sec
    min_tail_dur = target_sec - tol_sec

    groups = []

    cur_group, cur_dur = [], 0.0
    cur_speakers = set()
    semantic_changes = 0

    prev_speaker, prev_chapter = None, None

    columns = (df_sorted['id'], df_sorted['duration'],
               df_sorted['speaker'], df_sorted['chapter'])
    for seg_id, dur, speaker, chapter in zip(*columns):
        dur = float(dur)

        # Close the current group first if this segment would overshoot the window.
        if cur_group and cur_dur + dur > max_dur:
            if len(cur_speakers) <= maximum_speakers and semantic_changes <= maximum_switches:
                groups.append((cur_group, len(cur_speakers), semantic_changes))

            # Start a fresh group; the first segment of a group never counts as a switch.
            cur_group, cur_dur = [], 0.0
            cur_speakers, semantic_changes = set(), 0
            prev_speaker = prev_chapter = None

        if prev_speaker is not None and prev_chapter is not None:
            if speaker != prev_speaker or chapter != prev_chapter:
                semantic_changes += 1

        cur_group.append(seg_id)
        cur_dur += dur
        cur_speakers.add(speaker)

        prev_speaker, prev_chapter = speaker, chapter

    # Emit the trailing group as well when it already lies inside the target
    # window; previously it was always dropped, even when complete.
    if (cur_group
            and cur_dur >= min_tail_dur
            and len(cur_speakers) <= maximum_speakers
            and semantic_changes <= maximum_switches):
        groups.append((cur_group, len(cur_speakers), semantic_changes))

    return groups

In [54]:
# Drop segments belonging to chapters shorter than the default minimum,
# then pack the remaining segments into groups using the default limits.
processed_df = prepare_and_group(df=source_df)
groups = build_audio_groups(processed_df)


27

In [55]:
# Inspect one group: a (segment_ids, n_speakers, n_switches) tuple.
groups[1]

(['1188-133604-0015-79',
  '1188-133604-0016-80',
  '1188-133604-0017-81',
  '1188-133604-0018-82',
  '1188-133604-0019-83',
  '1188-133604-0020-84',
  '1188-133604-0021-85',
  '1188-133604-0022-86',
  '1188-133604-0023-87',
  '1188-133604-0024-88',
  '1188-133604-0025-89',
  '1188-133604-0026-90',
  '1188-133604-0027-91',
  '1188-133604-0028-92',
  '1188-133604-0029-93',
  '1188-133604-0030-94',
  '1188-133604-0031-95',
  '1188-133604-0032-96',
  '1188-133604-0033-97',
  '1188-133604-0034-98',
  '1188-133604-0035-99',
  '1188-133604-0036-100',
  '1188-133604-0037-101',
  '1188-133604-0038-102',
  '1188-133604-0039-103',
  '1188-133604-0040-104',
  '1188-133604-0041-105',
  '1188-133604-0042-106',
  '1188-133604-0043-107',
  '1188-133604-0044-108',
  '121-127105-0000-134',
  '121-127105-0001-135',
  '121-127105-0002-136',
  '121-127105-0003-137',
  '121-127105-0004-138',
  '121-127105-0005-139',
  '121-127105-0006-140',
  '121-127105-0007-141',
  '121-127105-0008-142',
  '121-127105-00