In [1]:
import os
import json
from tqdm import tqdm
from lhotse import CutSet
from mylhotse.spgispeech import prepare_spgispeech
from lhotse.cut import append_cuts
import logging
from util import restore_punctuation
import multiprocessing as mp
import pandas as pd
logging.basicConfig(level=logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm
  from speechbrain.pretrained import EncoderClassifier
Device set to use cuda:0
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


In [2]:
IN_DIR = "../datasets/LongSpeechSource/spgispeech"
OUT_DIR = '/home/yangrenyi.yry/LongSpeech_p3'

In [3]:
manifests = prepare_spgispeech(corpus_dir=IN_DIR, output_dir=OUT_DIR, num_jobs=15)
manifests

{'train': {'recordings': RecordingSet(len=1966109),
  'supervisions': SupervisionSet(len=1966109)},
 'val': {'recordings': RecordingSet(len=39341),
  'supervisions': SupervisionSet(len=39341)}}

In [3]:
config = json.load(open(os.path.join(OUT_DIR, 'metadata.json')))
AVG_DURATION = config['avg_duration']
SAMPLE_RATE = config['sample_rate']
OUT_FILE_NAME = config['source']
prev_amount = config['amount']
print(prev_amount)

0


In [5]:
cuts = CutSet()
for part in manifests.keys():
    rs = manifests[part]['recordings']
    ss = manifests[part]['supervisions']
    ss_punc = ss.map(lambda seg: seg.transform_text(restore_punctuation))
    cut = CutSet.from_manifests(recordings=rs, supervisions=ss_punc)
    cuts += cut

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [6]:
cuts.to_jsonl(OUT_DIR + "/spgi_raw_cuts.jsonl")

In [7]:
def prepare_and_group(
        df: pd.DataFrame,
    ):

    df = df[['id', 'duration']].copy()

    parts = df['id'].str.split('-').str[0].str.split('_', expand=True)

    df[['speaker', 'segment_num']] = parts[[0, 1]]
    df['segment_num'] = df['segment_num'].astype(int)
    df['duration'] = df['duration'].astype(float)

    return df



def group_segments(
    df: pd.DataFrame,
    target_sec: int = 600,
    tol_sec: int = 90
):

    df_sorted = df.sort_values(["speaker", "segment_num"]).reset_index(drop=True)

    lower, upper = target_sec - tol_sec, target_sec + tol_sec
    groups, cur_ids, cur_dur, cur_spk = [], [], 0.0, None

    for _, row in df_sorted.iterrows():
        spk, seg_id, dur = row["speaker"], row["id"], float(row["duration"])

        if cur_spk is not None and spk != cur_spk and cur_dur >= lower:
            groups.append(cur_ids)
            cur_ids, cur_dur = [], 0.0

        cur_ids.append(seg_id)
        cur_dur += dur
        cur_spk = spk

        if cur_dur >= upper:
            groups.append(cur_ids)
            cur_ids, cur_dur, cur_spk = [], 0.0, None

    return groups

In [8]:
source_df = pd.read_json(OUT_DIR + "/spgi_raw_cuts.jsonl", lines=True)
processed_df = prepare_and_group(df=source_df)
real_strategy = group_segments(df=processed_df)

In [9]:
def from_strategy_to_cuts(source_cuts, strategy: list, starting_cut_id=0):
    src_cuts = {c.id: c for c in source_cuts}
    target_cuts_list = []
    i = starting_cut_id
    for cluster_ids in strategy:
        grouped_cuts = [src_cuts[cid] for cid in cluster_ids]
        new_id = f"{i:06d}"
        merged = append_cuts(grouped_cuts).with_id(new_id)
        target_cuts_list.append(merged)
        i += 1
    return CutSet(target_cuts_list), i

In [10]:
grouped_cuts, new_amount = from_strategy_to_cuts(cuts, real_strategy, starting_cut_id=prev_amount)
grouped_cuts.to_jsonl(OUT_DIR + "/spgi_grouped_cuts.jsonl")
new_amount

28317

In [11]:
def convert_record(source_jsonl_path: str, target_jsonl_path: str, map_fn):
    with open(source_jsonl_path, "r", encoding="utf-8") as src_f, \
         open(target_jsonl_path, "a", encoding="utf-8") as tgt_f:
        for line in src_f:
            item = json.loads(line)
            new_item = map_fn(item)
            tgt_f.write(json.dumps(new_item, ensure_ascii=False) + "\n")

In [12]:
def json_from_spgi_to_allaudios(one_cut, lang = "en"):
    """
    Convert a single Commonvoice json record to a list of LongSpeech metadata.
    """
    sources = []
    speakers = set()
    total_dur = 0
    transcripts = []
    slices = []

    for subcut in one_cut["tracks"]:
        total_dur += subcut["cut"]["duration"]
        full_pth = subcut["cut"]["recording"]["sources"][0]["source"]
        slices.append([subcut["cut"]["start"], subcut["cut"]["duration"]])
        sources.append(full_pth.split("spgispeech")[-1])
        [speakers.add(s["speaker"]) for s in subcut["cut"]["supervisions"] if s["speaker"]]
        transcript_param = " ".join([s["text"] for s in subcut["cut"]["supervisions"] if s["text"]])
        if transcript_param != "":
            transcripts.append(transcript_param)
        else:
            print(subcut)

    return {
        "id": one_cut["id"],
        "source_ds": "spgispeech",
        "duration_sec": total_dur,
        "audio_auto": False,
        "text_auto": False,
        "language": lang,
        "num_speakers": len(speakers),
        "num_switches": len(speakers),
        "slice": slices,
        "transcribe": " ".join(transcripts),
        "components": sources,
    }

In [13]:
OUT_DIR = '/home/yangrenyi.yry/LongSpeech_p3'
convert_record(os.path.join(OUT_DIR, "spgi_grouped_cuts.jsonl"),
               os.path.join(OUT_DIR, OUT_FILE_NAME),
               json_from_spgi_to_allaudios)

KeyError: 'tracks'

In [4]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

from worker import save_one_worker

In [5]:
def save_audios_from_cutset(cutset, out_dir, num_jobs=None):
    if num_jobs is None:
        num_jobs = os.cpu_count()

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    cuts_to_process = [
        cut for cut in tqdm(cutset, desc="Checking for existing files") 
        if not (out_dir / f"{cut.id}.wav").exists()
    ]
    context = mp.get_context("spawn")
    with ProcessPoolExecutor(max_workers=num_jobs, mp_context=context) as pool:
        futures = [
            pool.submit(save_one_worker, cut, out_dir)
            for cut in tqdm(cuts_to_process, desc="1. 提交任务中")
        ]
        for _ in tqdm(
        as_completed(futures),
        total=len(futures),
        desc=f"Saving WAVs ({num_jobs} workers)"
        ):
            pass

In [6]:
grouped_cuts = CutSet.from_jsonl(os.path.join(OUT_DIR, 'spgi_grouped_cuts.jsonl'))
len(grouped_cuts)

28316

In [7]:

mp.set_start_method('spawn', force=True)
save_audios_from_cutset(grouped_cuts, os.path.join(OUT_DIR, 'wavs'))

Checking for existing files: 100%|██████████| 28316/28316 [01:13<00:00, 383.32it/s]
1. 提交任务中: 100%|██████████| 14890/14890 [00:00<00:00, 35788.87it/s]
Saving WAVs (15 workers):   3%|▎         | 400/14890 [01:26<1:04:12,  3.76it/s]

In [None]:
config['amount'] = prev_amount + len(grouped_cuts) 
with open(os.path.join(OUT_DIR, 'metadata.json'), 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=4, ensure_ascii=False)