In [9]:
import multiprocessing as mp
from pathlib import Path
import os
import json
import pandas as pd
from tqdm import tqdm
from lhotse import CutSet, RecordingSet, SupervisionSet, MonoCut, combine
from lhotse.recipes import prepare_commonvoice
import logging
from util import *
import numpy as np
import faiss, gc
from lhotse_util import from_strategy_to_cuts, SpeakerEmbeddingExtractor
logging.basicConfig(level=logging.INFO)
import librosa
from bitarray import bitarray
from sklearn.decomposition import PCA

In [10]:
# Corpus directory handed to prepare_commonvoice (source audio clips and transcripts).
# NOTE(review): this is a machine-specific absolute path; adjust it for your environment.
IN_DIR = "/mnt/d/voicedata/CommenVoice/delta"
# Directory where manifests, metadata and processed audio files are written.
OUT_DIR = '../datasets/LongSpeech'

In [11]:
# Load the dataset-level settings stored next to the outputs. The file is opened
# in a context manager so the handle is closed as soon as parsing is done.
with open(os.path.join(OUT_DIR, 'metadata.json'), encoding="utf-8") as config_file:
    config = json.load(config_file)
AVG_DURATION = config['avg_duration']
SAMPLE_RATE = config['sample_rate']
OUT_FILE_NAME = config['source']
# Used below as the first numeric id for the cuts produced by this notebook.
prev_amount = config['amount']
print(prev_amount)
task = "asr"
lang = "en"

0


In [4]:
# Build the lhotse manifests for the English "validated" split of the corpus.
manifests = prepare_commonvoice(
    corpus_dir=IN_DIR,
    output_dir=OUT_DIR,
    languages='en',
    splits=['validated'],
)

Processing CommonVoice languages:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:root:Language: en


Spliting:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:root:Spliting validated
INFO:root:The user overrided the global setting for whether to use ffmpeg-torchaudio to compute the duration of audio files. Old setting: True. New setting: False.


Distributing tasks: 0it [00:00, ?it/s]

Processing:   0%|          | 0/170 [00:00<?, ?it/s]

INFO:root:The user overrided the global setting for whether to use ffmpeg-torchaudio to compute the duration of audio files. Old setting: False. New setting: True.


In [5]:

# Merge the "validated" recordings/supervisions of every prepared part into one CutSet.
cuts = CutSet()
for part_manifests in manifests.values():
    validated = part_manifests['validated']
    cuts += CutSet.from_manifests(
        recordings=validated['recordings'],
        supervisions=validated['supervisions'],
    )

In [7]:
# Keep only cuts longer than 3 (duration units of the manifest), resample them to
# the configured rate and materialise the result before writing it to disk.
resampled_cuts = (
    cuts
    .filter(lambda cut: cut.duration > 3)
    .resample(SAMPLE_RATE)
    .to_eager()
)
raw_cuts_path = os.path.join(OUT_DIR, "commonvoice_raw_cuts.jsonl")
resampled_cuts.to_jsonl(raw_cuts_path)

In [14]:
def build_feature(cuts: CutSet, batch_size: int = 100, dim: int = 384):
    """
    Embed the transcript of every cut and store embeddings and durations on disk.

    Two memory-mapped float32 files are (re)created under ``OUT_DIR``:
    ``vecs.f32`` with shape ``(len(cuts), dim)`` and ``durs.f32`` with shape
    ``(len(cuts),)``.

    Args:
        cuts: cuts to process; materialised with ``to_eager()`` first.
        batch_size: number of cuts embedded per ``get_sentence_embeddings`` call.
        dim: width of one embedding row.
            NOTE(review): assumes ``get_sentence_embeddings`` returns an array of
            shape (batch, dim) -- confirm against the helper.

    Returns:
        ``(vec_mm, dur_mm, string_ids)`` where row ``i`` of both memmaps belongs
        to the cut whose id is ``string_ids[i]``.
    """
    eager_cuts = cuts.to_eager()
    total = len(eager_cuts)

    vec_mm = np.memmap(f"{OUT_DIR}/vecs.f32", dtype="float32", mode="w+", shape=(total, dim))
    dur_mm = np.memmap(f"{OUT_DIR}/durs.f32", dtype="float32", mode="w+", shape=(total,))

    string_ids = []
    write_pos = 0

    for start in tqdm(range(0, total, batch_size), desc="Get Embedding"):
        batch = eager_cuts[start:start + batch_size]

        texts, durations = [], []
        for c in batch:
            # Only the first supervision's text is embedded; cuts without one get "".
            texts.append(c.supervisions[0].text if c.supervisions else "")
            durations.append(c.duration)
            string_ids.append(c.id)

        embeddings = get_sentence_embeddings(texts).astype("float32")

        stop = write_pos + len(batch)
        vec_mm[write_pos:stop] = embeddings
        dur_mm[write_pos:stop] = durations
        write_pos = stop

    # Push pending writes to the backing files before handing the arrays out.
    vec_mm.flush()
    dur_mm.flush()

    return vec_mm, dur_mm, string_ids

def build_hnsw_index(vec_mm: np.memmap,
                     dim: int = 384,
                     m: int = 32,
                     ef_c: int = 200,
                     n_threads: int = mp.cpu_count(),
                     out_path: str = "cache_hnsw.faiss"):
    """
    Build an inner-product HNSW index over ``vec_mm`` and write it under ``OUT_DIR``.

    Args:
        vec_mm: float32 matrix of vectors to index. It is L2-normalised IN PLACE,
            so the caller's array (and its backing file) is modified.
        dim: vector dimensionality given to the index.
        m: HNSW ``M`` parameter passed to ``faiss.IndexHNSWFlat``.
        ef_c: value assigned to ``hnsw.efConstruction``.
        n_threads: OpenMP thread count for FAISS; the default is evaluated once,
            when the function is defined.
        out_path: file name of the serialized index, relative to ``OUT_DIR``.

    Returns:
        Path of the written index file.
    """
    faiss.omp_set_num_threads(n_threads)
    # In-place normalisation: with unit-length vectors the inner product is the cosine.
    faiss.normalize_L2(vec_mm)

    # The metric must be given to the constructor. Assigning `index.metric_type`
    # afterwards does not rebuild the flat storage created by the constructor,
    # which would keep computing L2 distances.
    index = faiss.IndexHNSWFlat(dim, m, faiss.METRIC_INNER_PRODUCT)
    index.hnsw.efConstruction = ef_c

    index.add(vec_mm)

    index_file = os.path.join(OUT_DIR, out_path)
    faiss.write_index(index, index_file)
    return index_file

def get_speaker_embedding_ids(ids, neighs, cuts):
    """
    Rank neighbours by the first principal component of their speaker embeddings.

    Args:
        ids: sequence mapping a vector index to a cut id.
        neighs: vector indices of the neighbours; the first ``-1`` ends the list
            (the caller passes a FAISS result row, where -1 marks missing hits).
        cuts: container indexable by cut id; each cut's first recording source
            is loaded from disk.

    Returns:
        Integer array of positions into the valid prefix of ``neighs``, sorted by
        ascending first-principal-component score. With fewer than two valid
        neighbours there is nothing to project, so positions are returned in
        their original order (an empty array when there are none).
    """
    speaker_embeddings = []
    for idx in neighs:
        if idx == -1:
            break
        real_id = ids[idx]
        cut_pth = cuts[real_id].recording.sources[0].source
        # Loaded with librosa's defaults; `sr` is whatever rate librosa returns.
        audio, sr = librosa.load(cut_pth)
        speaker_embeddings.append(get_speaker_embedding(audio, sr).flatten())

    # PCA cannot be fitted on an empty matrix and is degenerate on a single row.
    if len(speaker_embeddings) < 2:
        return np.arange(len(speaker_embeddings), dtype=np.intp)

    spk_emb_np = np.array(speaker_embeddings)
    pc1 = PCA(n_components=1, svd_solver="auto").fit_transform(spk_emb_np).ravel()
    return np.argsort(pc1)

def greedy_cluster(index_path: str,
                   vec_mm: np.memmap,
                   dur_mm: np.memmap,
                   ids,
                   cuts,
                   bucket_min: int = 300,
                   bucket_avg: int = 600,
                   k_neigh: int = 1024,
                   ef_s: int = 96):
    """
    Greedily group vectors into duration-bounded buckets of nearest neighbours.

    Seeds are visited from longest to shortest duration. For every seed that is
    still unassigned, the index is searched (restricted to unassigned vectors)
    for up to ``k_neigh`` neighbours of the seed vector. The neighbours are
    re-ordered by ``get_speaker_embedding_ids`` and added one by one until the
    accumulated duration reaches ``bucket_avg``. A bucket whose duration stays
    below ``bucket_min`` is discarded and its members are released again.

    Note that the seed is never added explicitly: it only joins its own bucket
    when the search returns it among the neighbours.

    Args:
        index_path: path of a FAISS index readable by ``faiss.read_index``.
        vec_mm: vectors used as search queries (row ``seed`` for each seed).
        dur_mm: per-vector durations, aligned with ``vec_mm``.
        ids: sequence mapping a vector index to a cut id.
        cuts: forwarded to ``get_speaker_embedding_ids``.
        bucket_min: minimum total duration for a bucket to be kept
            (same units as ``dur_mm``; the summary prints them as seconds).
        bucket_avg: total duration at which filling a bucket stops.
        k_neigh: number of neighbours requested per search.
        ef_s: value assigned to the HNSW ``efSearch`` search parameter.

    Returns:
        List of buckets, each a list of cut ids taken from ``ids``.
    """
    index = faiss.read_index(index_path)

    # Search-time HNSW parameters; the id selector is attached per seed below.
    params = faiss.SearchParametersHNSW()
    params.efSearch = ef_s

    N = len(vec_mm)
    # One bit per vector, set once the vector has been placed in a bucket.
    assigned = bitarray(N)
    assigned.setall(False)

    # Negating the durations makes argsort yield the longest items first.
    order = np.argsort(-dur_mm)
    buckets = []

    for seed in tqdm(order, desc="Clustering (Optimized)"):
        if assigned[seed]:
            continue

        cluster = []
        total_dur = 0

        # Positions of all 0 bits, i.e. vectors not assigned yet. This scan is
        # repeated for every seed that reaches this point.
        unassigned_indices_list = assigned.search(bitarray('0'))
        unassigned_indices = np.fromiter(unassigned_indices_list, dtype=np.int64)


        if len(unassigned_indices) > 0:
            # Restrict the search to vectors that are still available.
            selector = faiss.IDSelectorArray(unassigned_indices)
            params.sel = selector

            _, neighs = index.search(vec_mm[seed : seed + 1], k_neigh, params=params)

            # Positions into neighs[0], re-ordered by speaker-embedding score.
            speaker_order = get_speaker_embedding_ids(ids, neighs[0].tolist(), cuts)

            for idx2 in speaker_order:
                idx = neighs[0][idx2]
                if idx == -1:
                    break
                if assigned[idx]:
                    print("Warning: Already assigned index", idx)
                    continue

                cluster.append(int(idx))
                assigned[idx] = True
                total_dur += dur_mm[idx]
                # Stop filling as soon as the target duration is reached.
                if total_dur >= bucket_avg:
                    break

            if total_dur < bucket_min:
                # Bucket too short: release its members so later seeds can use
                # them. The loop does not come back to the current seed.
                for i in cluster:
                    assigned[i] = False
            else:
                total_dur = dur_mm[cluster].sum()
                buckets.append((cluster, total_dur))

    # Only buckets with total_dur >= bucket_min were appended above; the filter
    # below applies the same threshold again.
    final_buckets = [b for b in buckets if b[1] >= bucket_min]
    final_clusters = [c for c, _ in final_buckets]
    final_duration = sum(sec for _, sec in final_buckets)

    # Share of the total duration that did not end up in any kept bucket.
    loss = 1 - final_duration / dur_mm.sum()
    # Summary fields: bucket count, kept duration, total duration, discarded ratio.
    print(f"桶数 {len(final_clusters)}, 最终时长 {final_duration:.2f}s, 总时长 {dur_mm.sum():.2f}s, 丢弃比例 {loss:.2%}")

    # Translate vector indices back to cut ids.
    strategy = []
    for cluster in final_clusters:
        strategy.append([ids[i] for i in cluster])

    return strategy

In [15]:
"""
mock_strategy = [
    ["common_voice_en_43199993-0", "common_voice_en_42736613-1", "common_voice_en_42798328-2"],
    ["common_voice_en_43204215-3", "common_voice_en_42706055-4", "common_voice_en_43139615-5"]
]
"""

# 1) embed transcripts, 2) index the embeddings, 3) group cuts into buckets.
vec_mm, dur_mm, string_ids = build_feature(resampled_cuts)
index_path = build_hnsw_index(vec_mm)
real_strategy = greedy_cluster(
    index_path=index_path,
    vec_mm=vec_mm,
    dur_mm=dur_mm,
    ids=string_ids,
    cuts=resampled_cuts,
)

Get Embedding:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Get Embedding:  50%|█████     | 1/2 [00:01<00:01,  1.95s/it]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Get Embedding: 100%|██████████| 2/2 [00:02<00:00,  1.01s/it]
Clustering (Optimized):   0%|          | 0/168 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The

桶数 2, 最终时长 1027.54s, 总时长 1027.54s, 丢弃比例 0.00%





In [16]:
def map_newid_cutset(cutset: CutSet, start_id: int = 0) -> "tuple[CutSet, int]":
    """
    Re-id the cuts of a CutSet with consecutive zero-padded numbers.

    The i-th cut (0-based, in iteration order) receives the id
    ``f"{start_id + i:06d}"``.

    Args:
        cutset: cuts to renumber; the input cuts are not modified, new cuts are
            created with ``with_id``.
        start_id: number used for the first cut.

    Returns:
        ``(renumbered_cutset, next_id)`` where ``next_id`` is
        ``start_id + number_of_cuts``, i.e. the first unused number.
    """
    new_cuts = [
        cut.with_id(f"{start_id + offset:06d}")
        for offset, cut in enumerate(cutset)
    ]
    return CutSet.from_cuts(new_cuts), start_id + len(new_cuts)

In [17]:
# Turn the id buckets into grouped cuts (presumably one combined cut per bucket --
# see from_strategy_to_cuts), renumber them after the ids already in use, and
# persist the manifest.
grouped_cuts, new_amount = map_newid_cutset(
    from_strategy_to_cuts(resampled_cuts.to_eager(), real_strategy),
    start_id=prev_amount,
)
grouped_manifest_path = os.path.join(OUT_DIR, "grouped_raw_cuts.jsonl")
grouped_cuts.to_jsonl(grouped_manifest_path)
print(new_amount)

2


In [18]:
def json_from_commonvoice_to_allaudios(one_cut, lang = "en"):
    """
    Convert one serialized grouped cut into a LongSpeech metadata record.

    Args:
        one_cut: dict with an ``"id"`` and a ``"tracks"`` list; every track holds
            a ``"cut"`` dict with ``"start"``, ``"duration"``, ``"recording"``
            and (optionally) ``"supervisions"``.
        lang: language code stored in the record.

    Returns:
        Dict with the total duration, the per-track ``[start, duration]`` slices,
        the joined transcript, the number of distinct speakers and the component
        file paths (the part of each source path after the last ``"clips"``).

    Supervisions that lack a ``"speaker"`` or ``"text"`` entry (or carry an empty
    one) are ignored instead of raising ``KeyError``. A track without any
    transcript text is printed for inspection and contributes no transcript.
    """
    sources = []
    speakers = set()
    total_dur = 0
    transcripts = []
    slices = []

    for subcut in one_cut["tracks"]:
        cut = subcut["cut"]
        supervisions = cut.get("supervisions") or []

        total_dur += cut["duration"]
        slices.append([cut["start"], cut["duration"]])

        full_pth = cut["recording"]["sources"][0]["source"]
        sources.append(full_pth.split("clips")[-1])

        speakers.update(s["speaker"] for s in supervisions if s.get("speaker"))

        transcript_param = " ".join(s["text"] for s in supervisions if s.get("text"))
        if transcript_param != "":
            transcripts.append(transcript_param)
        else:
            # Surface tracks that have no usable transcript.
            print(subcut)

    return {
        "id": one_cut["id"],
        "source_ds": "CommonVoice",
        "duration_sec": total_dur,
        "audio_auto": False,
        "text_auto": False,
        "language": lang,
        "num_speakers": len(speakers),
        # Counts the tracks that contributed a transcript.
        "num_switches": len(transcripts),
        "slice": slices,
        "transcribe": " ".join(transcripts),
        "components": sources,
    }


def convert_record(source_jsonl_path: str, target_jsonl_path: str, map_fn, lang: str):
    """
    Map every record of a JSONL file and append the results to another JSONL file.

    Args:
        source_jsonl_path: input file with one JSON object per line.
        target_jsonl_path: output file, opened in APPEND mode -- running this
            twice on the same input writes the records twice.
        map_fn: callable ``map_fn(item, lang)`` returning a JSON-serializable object.
        lang: forwarded unchanged to ``map_fn``.

    Blank lines in the source (e.g. a trailing empty line) are skipped rather
    than failing JSON decoding.
    """
    with open(source_jsonl_path, "r", encoding="utf-8") as src_f, \
         open(target_jsonl_path, "a", encoding="utf-8") as tgt_f:
        for line in src_f:
            if not line.strip():
                continue
            item = json.loads(line)
            new_item = map_fn(item, lang)
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            tgt_f.write(json.dumps(new_item, ensure_ascii=False) + "\n")

def save_audios_from_cutset(cutset, out_dir, num_jobs=1):
    """
    Write the audio of every cut to ``<out_dir>/<cut.id>.wav``.

    Args:
        cutset: iterable of cuts exposing ``id`` and ``save_audio(path)``.
        out_dir: destination directory; created if it does not exist yet.
        num_jobs: accepted for interface compatibility but currently unused --
            cuts are written one after another.
    """
    # Make sure the destination exists so the first write cannot fail on a
    # missing directory.
    os.makedirs(out_dir, exist_ok=True)
    for cut in tqdm(cutset):
        cut.save_audio(os.path.join(out_dir, f"{cut.id}.wav"))


In [19]:

# Append the LongSpeech metadata for the grouped cuts, then render their audio.
convert_record(
    source_jsonl_path=os.path.join(OUT_DIR, "grouped_raw_cuts.jsonl"),
    target_jsonl_path=os.path.join(OUT_DIR, OUT_FILE_NAME),
    map_fn=json_from_commonvoice_to_allaudios,
    lang=lang,
)
save_audios_from_cutset(grouped_cuts, out_dir=os.path.join(OUT_DIR, 'wavs'))

100%|██████████| 2/2 [00:35<00:00, 17.58s/it]


In [21]:
"""
def with_new_features(cuts: CutSet, batch_size = 100) -> CutSet:
    cutset_list = cuts.split_lazy(OUT_DIR, batch_size)
    new_cutset_list = []
    for i, cutset in enumerate(tqdm(cutset_list, desc="Processing cuts")):
        text_list = [cut.supervisions[0].text if cut.supervisions else "" for cut in cutset]
        id_list = [cut.id for cut in cutset]
        duration = [cut.duration for cut in cutset]
        semantic_np = get_sentence_embeddings(
            text_list
        )

        updated_cuts = []
        for cut, embedding in zip(cutset, semantic_np):
            cut = cut.with_custom("semantic_emb", embedding.tolist())  # 如果是 numpy array
            updated_cuts.append(cut)


        new_cutset_list.append(CutSet.from_cuts(updated_cuts))
    merged_cuts = combine(*new_cutset_list)
    return merged_cuts
"""

'\ndef with_new_features(cuts: CutSet, batch_size = 100) -> CutSet:\n    cutset_list = cuts.split_lazy(OUT_DIR, batch_size)\n    new_cutset_list = []\n    for i, cutset in enumerate(tqdm(cutset_list, desc="Processing cuts")):\n        text_list = [cut.supervisions[0].text if cut.supervisions else "" for cut in cutset]\n        id_list = [cut.id for cut in cutset]\n        duration = [cut.duration for cut in cutset]\n        semantic_np = get_sentence_embeddings(\n            text_list\n        )\n\n        updated_cuts = []\n        for cut, embedding in zip(cutset, semantic_np):\n            cut = cut.with_custom("semantic_emb", embedding.tolist())  # 如果是 numpy array\n            updated_cuts.append(cut)\n\n\n        new_cutset_list.append(CutSet.from_cuts(updated_cuts))\n    merged_cuts = combine(*new_cutset_list)\n    return merged_cuts\n'