In [None]:
import multiprocessing as mp
from pathlib import Path
import os
import json
import pandas as pd
from tqdm import tqdm
from lhotse import CutSet, RecordingSet, SupervisionSet, MonoCut
from lhotse.cut import append_cuts
from lhotse.recipes import prepare_voxpopuli, download_voxpopuli
import logging

logging.basicConfig(level=logging.INFO)

In [None]:
# directory paths to save audio and transcript files
IN_DIR = "../input/voxpopuli"
# directory paths to save metadata and processed aduio files
OUT_DIR = "../outputs/voxpopuli"
os.makedirs(OUT_DIR, exist_ok=True)
if not os.path.exists(os.path.join(OUT_DIR, 'metadata.json')):
    config = {
        'avg_duration': 10.0,
        'sample_rate': 16000,
        'source': 'voxpopuli.jsonl',
        'amount': 0
    }
    with open(os.path.join(OUT_DIR, 'metadata.json'), 'w') as f:
        json.dump(config, f, indent=4)
config = json.load(open(os.path.join(OUT_DIR, 'metadata.json')))
AVG_DURATION = config['avg_duration']
SAMPLE_RATE = config['sample_rate']
OUT_FILE_NAME = config['source']
prev_amount = config['amount']
print(prev_amount)

0


In [None]:
# Download and prepare VoxPopuli dataset for ASR task in English
lang = 'en'
task = 'asr'
# Set to True to force re-download
ForceDownload = True  

if ForceDownload:
    download_voxpopuli(target_dir=IN_DIR, subset=lang)

manifests = prepare_voxpopuli(corpus_dir=IN_DIR, output_dir=OUT_DIR, task=task, lang=lang, num_jobs=15)

logging.info(f"RecordingSet: {manifests['train']['recordings']}")
logging.info(f"SupervisionSet: {manifests['train']['supervisions']}")

INFO:root:Preparing recordings (this may take a few minutes)...
Scanning audio files (*.ogg): 964it [00:00, 12203.12it/s]
INFO:root:Using pre-downloaded annotations /home/dongli911/.wans/Project/AIGC/lulu/audio_outputs/voxpopuli/asr_en.tsv.gz
100%|██████████| 412484/412484 [00:00<00:00, 540456.46it/s]
INFO:root:RecordingSet: RecordingSet(len=362)
INFO:root:SupervisionSet: SupervisionSet(len=10872)


In [None]:
# Use the 'train' split for processing
split = 'train'
rs = manifests[split]['recordings']
ss = manifests[split]['supervisions']

ss_normalized = ss  # VoxPopuli provides 'normed_text'

In [6]:
cuts = CutSet.from_manifests(recordings=rs, supervisions=ss_normalized)
cuts.to_jsonl(os.path.join(OUT_DIR, 'raw_cuts.jsonl'))

In [None]:
def prepare_and_group(
    source_cuts: CutSet,
    target_sec: int = 600,
    min_segment_sec: int = 120,
):
    """Split long cuts and filter short ones, returning a DataFrame and updated CutSet."""
    new_cuts = []
    cut_data = []

    for cut in source_cuts:
        duration = cut.duration
        if duration >= target_sec:
            # Split into 600-second segments
            num_segments = int(duration // target_sec)
            for i in range(num_segments):
                start = i * target_sec
                end = (i + 1) * target_sec
                new_id = f"{cut.id}-split{i:03d}"
                new_cut = cut.truncate(offset=start, duration=target_sec)
                new_cut = new_cut.with_id(new_id)
                new_cuts.append(new_cut)
                cut_data.append({
                    'id': new_id,
                    'duration': target_sec,
                    'session_id': cut.id.split('-')[0],
                    'segment_num': i,
                    'speaker': cut.supervisions[0].speaker if cut.supervisions else 'unknown',
                    'slice': [start, end]
                })
            # Handle remainder
            remainder = duration % target_sec
            if remainder >= min_segment_sec:
                new_id = f"{cut.id}-split{num_segments:03d}"
                new_cut = cut.truncate(offset=num_segments * target_sec, duration=remainder)
                new_cut = new_cut.with_id(new_id)
                new_cuts.append(new_cut)
                cut_data.append({
                    'id': new_id,
                    'duration': remainder,
                    'session_id': cut.id.split('-')[0],
                    'segment_num': num_segments,
                    'speaker': cut.supervisions[0].speaker if cut.supervisions else 'unknown',
                    'slice': [num_segments * target_sec, duration]
                })
        elif duration >= min_segment_sec:
            # Keep segments >= 2 minutes for splicing
            new_cuts.append(cut)
            cut_data.append({
                'id': cut.id,
                'duration': duration,
                'session_id': cut.id.split('-')[0],
                'segment_num': int(cut.id.split('-')[-1]),
                'speaker': cut.supervisions[0].speaker if cut.supervisions else 'unknown',
                'slice': None
            })
        # Discard segments < 2 minutes

    logging.info(f"Prepared {len(new_cuts)} cuts after splitting and filtering")
    return pd.DataFrame(cut_data), CutSet.from_cuts(new_cuts)


In [None]:
def build_audio_groups(
    df: pd.DataFrame,
    target_sec: int = 600,
    tol_sec: int = 60,
    maximum_speakers: int = 3,
    maximum_switches: int = 3,
    max_segments: int = 3
):
    """Group segments to reach ~600 seconds, with max 3 segments per group."""
    df_sorted = df.sort_values(['session_id', 'segment_num']).reset_index(drop=True)

    groups = []
    cur_group, cur_dur = [], 0.0
    cur_speakers = set()
    semantic_changes = 0
    prev_session = None

    for _, row in df_sorted.iterrows():
        seg_id = row['id']
        dur = float(row['duration'])
        session = row['session_id']
        speaker = row['speaker']

        # Skip segments already at target duration (from splitting)
        if abs(dur - target_sec) < 1e-6:
            groups.append(([seg_id], 1, 0))
            continue

        if cur_group and (cur_dur + dur > target_sec + tol_sec or len(cur_group) >= max_segments):
            if cur_dur >= target_sec and len(cur_speakers) <= maximum_speakers and semantic_changes <= maximum_switches:
                groups.append((cur_group, len(cur_speakers), semantic_changes))
            cur_group, cur_dur = [], 0.0
            cur_speakers, semantic_changes = set(), 0
            prev_session = None

        if prev_session is not None and session != prev_session:
            semantic_changes += 1

        cur_group.append(seg_id)
        cur_dur += dur
        cur_speakers.add(speaker)
        prev_session = session

    if cur_group and cur_dur >= target_sec and len(cur_speakers) <= maximum_speakers and semantic_changes <= maximum_switches:
        groups.append((cur_group, len(cur_speakers), semantic_changes))

    logging.info(f"Created {len(groups)} groups for splicing")
    return groups

In [8]:
source_df, processed_cuts = prepare_and_group(cuts)
logging.info(f"DataFrame columns: {source_df.columns.tolist()}")
real_strategy = build_audio_groups(source_df)

INFO:root:Prepared 1985 cuts after splitting and filtering
INFO:root:DataFrame columns: ['id', 'duration', 'session_id', 'segment_num', 'speaker', 'slice']
INFO:root:Created 1717 groups for splicing


In [9]:
def save_audios_from_cutset(cutset, out_dir, num_jobs=1):
    os.makedirs(out_dir, exist_ok=True)
    for cut in tqdm(cutset):
        cut.save_audio(os.path.join(out_dir, f'{cut.id}.wav'))

def from_strategy_to_cuts(source_cuts, strategy: list, cut_info: pd.DataFrame, starting_cut_id=0):
    target_cuts_list = []
    i = starting_cut_id
    custom_feature = {}
    for cluster_ids, num_speaker, num_switch in strategy:
        cutlist = [source_cuts[cut_id] for cut_id in cluster_ids if cut_id in source_cuts]
        if not cutlist:
            logging.warning(f"No valid cuts found for cluster: {cluster_ids}")
            continue
        new_cut = append_cuts(cutlist) if len(cutlist) > 1 else cutlist[0]
        new_id = f'{i:06d}'
        new_cut = new_cut.with_id(new_id)
        target_cuts_list.append(new_cut)
        # Get slice info from cut_info
        slice_info = cut_info[cut_info['id'].isin(cluster_ids)]['slice'].iloc[0] if len(cluster_ids) == 1 else None
        custom_feature[new_id] = {
            'num_speakers': num_speaker,
            'num_switches': num_switch,
            'slice': slice_info
        }
        i += 1
    return CutSet(target_cuts_list), custom_feature

In [10]:
tgt_cuts, custom_feature = from_strategy_to_cuts(processed_cuts, real_strategy, source_df, starting_cut_id=prev_amount)
tgt_cuts.to_jsonl(os.path.join(OUT_DIR, 'grouped_cuts.jsonl'))

In [None]:
# 
def json_from_voxpopuli_to_allaudios(custom_feature, one_cut):
    sources = []
    total_dur = 0
    transcripts = []

    if 'tracks' in one_cut:
        logging.info(f"Processing MixedCut with ID {one_cut['id']}, tracks: {len(one_cut['tracks'])}")
        for subcut in one_cut['tracks']:
            total_dur += subcut['cut']['duration']
            full_pth = subcut['cut']['recording']['sources'][0]['source']
            sources.append(full_pth.split('raw_audios')[-1])
            transcripts.append(subcut['cut']['supervisions'][0]['text'] if subcut['cut']['supervisions'] else '')
    else:
        logging.info(f"Processing MonoCut with ID {one_cut['id']}")
        total_dur = one_cut['duration']
        full_pth = one_cut['recording']['sources'][0]['source']
        sources.append(full_pth.split('raw_audios')[-1])
        transcripts.append(one_cut['supervisions'][0]['text'] if one_cut['supervisions'] else '')

    result = {
        'id': one_cut['id'],
        'source_ds': 'voxpopuli',
        'duration_sec': total_dur,
        'audio_auto': False,
        'text_auto': False,
        'num_speakers': custom_feature.get(one_cut['id'], {}).get('num_speakers', -1),
        'num_switches': custom_feature.get(one_cut['id'], {}).get('num_switches', -1),
        'transcribe': ' '.join([t for t in transcripts if t]),
        'components': sources,
    }

    if custom_feature.get(one_cut['id'], {}).get('slice'):
        result['slice'] = custom_feature[one_cut['id']]['slice']
    return result

In [12]:
def convert_record(source_jsonl_path: str, target_jsonl_path: str, custom_feature, map_fn):
    with open(source_jsonl_path, 'r', encoding='utf-8') as src_f, \
         open(target_jsonl_path, 'a', encoding='utf-8') as tgt_f:
        for line in src_f:
            item = json.loads(line)
            new_item = map_fn(custom_feature, item)
            tgt_f.write(json.dumps(new_item, ensure_ascii=False) + '\n')

In [None]:
# Save metadata
convert_record(
    os.path.join(OUT_DIR, 'grouped_cuts.jsonl'),
    os.path.join(OUT_DIR, OUT_FILE_NAME),
    custom_feature,
    json_from_voxpopuli_to_allaudios
)

INFO:root:Processing MonoCut with ID 000000
INFO:root:Processing MonoCut with ID 000001
INFO:root:Processing MonoCut with ID 000002
INFO:root:Processing MonoCut with ID 000003
INFO:root:Processing MonoCut with ID 000004
INFO:root:Processing MonoCut with ID 000005
INFO:root:Processing MonoCut with ID 000006
INFO:root:Processing MonoCut with ID 000007
INFO:root:Processing MonoCut with ID 000008
INFO:root:Processing MonoCut with ID 000009
INFO:root:Processing MonoCut with ID 000010
INFO:root:Processing MonoCut with ID 000011
INFO:root:Processing MonoCut with ID 000012
INFO:root:Processing MonoCut with ID 000013
INFO:root:Processing MonoCut with ID 000014
INFO:root:Processing MonoCut with ID 000015
INFO:root:Processing MonoCut with ID 000016
INFO:root:Processing MonoCut with ID 000017
INFO:root:Processing MonoCut with ID 000018
INFO:root:Processing MonoCut with ID 000019
INFO:root:Processing MonoCut with ID 000020
INFO:root:Processing MonoCut with ID 000021
INFO:root:Processing MonoCut wit

In [None]:
# Save audio files
save_audios_from_cutset(tgt_cuts, os.path.join(OUT_DIR, 'wavs'))

100%|██████████| 1717/1717 [16:21<00:00,  1.75it/s]
