In [1]:
import multiprocessing as mp
import os
import json
from tqdm import tqdm
from lhotse import CutSet
from mylhotse.aishell2 import prepare_aishell2



In [4]:
# Directory containing the source AISHELL-2 audio and transcript files
# (passed as corpus_dir to prepare_aishell2).
IN_DIR = "../datasets/LongSpeechSource/iOS"
# NOTE(review): this second assignment overrides the relative path above with a
# machine-specific absolute path — confirm which one is intended and drop the other.
IN_DIR = "/mnt/d/repo/AISHELL-2-sample/iOS"
# Directory where metadata and processed audio files are saved.
OUT_DIR = '../datasets/LongSpeech'

In [5]:
# Load the shared dataset metadata. The context manager closes the file handle
# that a bare json.load(open(...)) would leave open until garbage collection.
with open(os.path.join(OUT_DIR, 'metadata.json'), encoding='utf-8') as config_file:
    config = json.load(config_file)
AVG_DURATION = config['avg_duration']
SAMPLE_RATE = config['sample_rate']
# Name of the file inside OUT_DIR that converted records are appended to.
OUT_FILE_NAME = config['source']
# Used as the first id when packing new long cuts
# (presumably the number of items already in the dataset — confirm).
prev_amount = config['amount']
print(prev_amount)
task = "asr"

0


In [6]:
# Build the AISHELL-2 manifests from the corpus under IN_DIR, passing OUT_DIR
# as the output directory. The result is indexed by part name, each part
# holding 'recordings' and 'supervisions'.
manifests = prepare_aishell2(
    corpus_dir=IN_DIR,
    output_dir=OUT_DIR,
    num_jobs=15,
)

Process aishell2 audio, it takes about 55  minutes using 40 cpu jobs.:   0%|          | 0/1 [00:00<?, ?it/s]


Scanning audio files (*.wav): 0it [00:00, ?it/s][A
Scanning audio files (*.wav): 86it [00:00, 806.41it/s][A
Scanning audio files (*.wav): 167it [00:00, 776.20it/s][A
Scanning audio files (*.wav): 250it [00:00, 775.58it/s][A
Scanning audio files (*.wav): 328it [00:00, 775.97it/s][A
Scanning audio files (*.wav): 413it [00:00, 784.79it/s][A
Scanning audio files (*.wav): 500it [00:00, 750.29it/s][A


In [8]:

# Merge the recordings and supervisions of every corpus part into one CutSet.
cuts = CutSet()
for part_manifests in manifests.values():
    cuts += CutSet.from_manifests(
        recordings=part_manifests['recordings'],
        supervisions=part_manifests['supervisions'],
    )

In [11]:
# Sort the cuts by duration and persist them as a JSONL manifest.
# NOTE(review): despite the name, no resampling happens here — only sorting.
raw_cuts_path = os.path.join(OUT_DIR, "aishell_raw_cuts.jsonl")
resampled_cuts = cuts.sort_by_duration()
resampled_cuts.to_jsonl(raw_cuts_path)

In [12]:
def pack_cuts_to_long_audio(
    cuts: CutSet,
    target_duration: float = 600.0,
    starting_id: int = 0,
) -> "tuple[CutSet, int]":
    """Greedily concatenate consecutive cuts into long cuts.

    Cuts are consumed in iteration order and appended to a running buffer.
    As soon as the buffer's duration reaches ``target_duration`` it is emitted
    under a zero-padded six-digit id and a new buffer is started.

    Args:
        cuts: Cuts to pack, in the order they should be concatenated.
        target_duration: Minimum duration (same unit as ``cut.duration``)
            a packed cut must reach before it is emitted.
        starting_id: Integer id assigned to the first emitted cut; each
            following cut gets the next integer.

    Returns:
        A pair ``(packed_cuts, next_id)`` where ``packed_cuts`` is a CutSet of
        the emitted long cuts and ``next_id`` is ``starting_id`` plus the
        number of emitted cuts.

    Note:
        Trailing cuts whose combined duration never reaches
        ``target_duration`` are discarded, not emitted as a short cut.
    """
    packed = []
    pending = None
    next_id = starting_id

    for cut in cuts:
        pending = cut if pending is None else pending.append(cut)
        if pending.duration < target_duration:
            continue
        # Buffer is long enough: emit it and start over with an empty buffer.
        packed.append(pending.with_id(f"{next_id:06d}"))
        next_id += 1
        pending = None

    return CutSet.from_cuts(packed), next_id

In [13]:
# Pack the sorted cuts into long cuts of at least 600 s, continuing the id
# sequence from the amount recorded in the metadata.
# NOTE(review): new_amount is not used again in the visible cells — confirm
# that the metadata 'amount' gets updated elsewhere.
grouped_cuts, new_amount = pack_cuts_to_long_audio(
    resampled_cuts,
    target_duration=600.0,
    starting_id=prev_amount,
)
# Build the path with os.path.join, like every other cell that touches OUT_DIR.
grouped_cuts.to_jsonl(os.path.join(OUT_DIR, "aishell_grouped_cuts.jsonl"))


In [37]:
def json_from_aishell_to_allaudios(one_cut):
    """Convert one serialized cut (a dict) into a flat metadata record.

    A cut with a "tracks" list is treated as a sequence of sub-cuts; a cut
    without it is treated as a single sub-cut. For each sub-cut the function
    accumulates the duration, records ``[start, duration]``, stores the part of
    the first audio source path that follows the last occurrence of "data",
    notes the speaker of the first supervision, and collects the transcript.

    Sub-cuts without supervisions, or whose supervisions lack "text" or
    "speaker", are tolerated: they contribute no transcript/speaker and are
    printed for inspection instead of raising.

    Args:
        one_cut: Dict parsed from one line of a cuts JSONL manifest.

    Returns:
        Dict with the keys "id", "source_ds", "duration_sec", "audio_auto",
        "text_auto", "language", "num_speakers", "num_switches", "slice",
        "transcribe" and "components".
    """
    sources = []
    total_dur = 0
    transcripts = []
    slices = []
    speakers = set()
    tracks = one_cut["tracks"] if "tracks" in one_cut else [one_cut]
    for subcut in tracks:
        # A track wraps its cut under "cut"; a plain cut is used directly.
        inner = subcut["cut"] if "cut" in subcut else subcut
        total_dur += inner["duration"]
        full_pth = inner["recording"]["sources"][0]["source"]
        slices.append([inner["start"], inner["duration"]])
        sources.append(full_pth.split("data")[-1])

        # Supervisions may be missing or empty; never index into them blindly.
        supervisions = inner.get("supervisions") or []
        if supervisions and "speaker" in supervisions[0]:
            speakers.add(supervisions[0]["speaker"])

        transcript_param = ". ".join(
            [s["text"] for s in supervisions if s.get("text")]
        )
        if transcript_param != "":
            transcripts.append(transcript_param)
        else:
            # Surface sub-cuts that contribute no text so they can be checked.
            print(subcut)

    return {
        "id": one_cut["id"],
        "source_ds": "aishell2",
        "duration_sec": total_dur,
        "audio_auto": False,
        "text_auto": False,
        "language": 'zh-CN',
        "num_speakers": len(speakers),
        "num_switches": len(sources),
        "slice": slices,
        "transcribe": "。 ".join(transcripts),
        "components": sources,
    }

def convert_record(source_jsonl_path: str, target_jsonl_path: str, map_fn):
    """Map every JSONL record of one file and append the results to another.

    Each line of ``source_jsonl_path`` is parsed as JSON, passed through
    ``map_fn``, and the returned object is serialized (non-ASCII characters
    kept as-is) as one line appended to ``target_jsonl_path``. Existing
    content of the target file is preserved because it is opened in append
    mode.
    """
    with open(source_jsonl_path, "r", encoding="utf-8") as reader:
        with open(target_jsonl_path, "a", encoding="utf-8") as writer:
            # Lazy generator: records are converted and written one at a time.
            writer.writelines(
                json.dumps(map_fn(json.loads(raw_line)), ensure_ascii=False) + "\n"
                for raw_line in reader
            )

In [38]:
# Convert the grouped cuts manifest into flat records and append them to the
# dataset file named in the metadata.
convert_record(
    source_jsonl_path=os.path.join(OUT_DIR, "aishell_grouped_cuts.jsonl"),
    target_jsonl_path=os.path.join(OUT_DIR, OUT_FILE_NAME),
    map_fn=json_from_aishell_to_allaudios,
)


In [22]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
from worker import save_one_worker

In [23]:


def save_audios_from_cutset(cutset, out_dir, num_jobs=None):
    """Run ``save_one_worker`` on every cut of ``cutset`` in a process pool.

    The output directory is created if needed, one task per cut is submitted
    to a pool that uses the "spawn" start method, and completion is tracked
    with progress bars.

    Worker failures do not abort the run. They are collected and reported in
    a single ``RuntimeWarning`` once all tasks have finished, so a failed cut
    is no longer dropped silently.

    Args:
        cutset: Iterable of cuts; each one is passed to ``save_one_worker``.
        out_dir: Directory handed to each worker (created with parents).
        num_jobs: Number of worker processes; defaults to ``os.cpu_count()``
            (or 1 when the CPU count cannot be determined).

    Returns:
        None.
    """
    # Local import keeps this cell self-contained without touching the
    # notebook's import cells.
    import warnings

    if num_jobs is None:
        # os.cpu_count() may return None on some platforms.
        num_jobs = os.cpu_count() or 1

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    context = mp.get_context("spawn")

    failures = []
    with ProcessPoolExecutor(max_workers=num_jobs, mp_context=context) as pool:
        futures = [
            pool.submit(save_one_worker, cut, out_dir)
            for cut in tqdm(cutset, desc="1. 提交任务中")
        ]
        for future in tqdm(
            as_completed(futures),
            total=len(futures),
            desc=f"Saving WAVs ({num_jobs} workers)",
        ):
            # Inspect each finished future; ignoring it would hide any
            # exception raised inside the worker process.
            error = future.exception()
            if error is not None:
                failures.append(error)

    if failures:
        warnings.warn(
            f"{len(failures)} of {len(futures)} cuts failed to save; "
            f"first error: {failures[0]!r}",
            RuntimeWarning,
        )

In [24]:
# Optional knob, left disabled (presumably relaxes lhotse's audio duration
# mismatch check — confirm before enabling):
# os.environ["LHOTSE_AUDIO_DURATION_MISMATCH_TOLERANCE"] = "1.5"
mp.set_start_method('spawn', force=True)
# Write the packed long cuts under OUT_DIR/wavs.
save_audios_from_cutset(
    cutset=grouped_cuts,
    out_dir=os.path.join(OUT_DIR, 'wavs'),
)

1. 提交任务中: 100%|██████████| 3/3 [00:00<00:00,  5.09it/s]
Saving WAVs (24 workers): 100%|██████████| 3/3 [00:05<00:00,  1.99s/it]
