In [None]:
!pip install jiwer modelscope openai-whisper

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting modelscope
  Downloading modelscope-1.26.0-py3-none-any.whl.metadata (39 kB)
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manyl

In [None]:
#由gemini 2.5 pro 輔助撰寫
# Install dependencies (run in a separate cell before the main code)
# !pip install --no-cache-dir torch==2.0.1 torchvision==0.15.2 transformers==4.38.2 datasets>=2.0.0 modelscope jiwer openai-whisper
import os
import whisper
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from jiwer import wer, cer
import re

# 函數：正規化英文文本（移除標點、轉小寫）
def normalize_english(text):
    """Normalize English text before WER scoring.

    Deletes every character that is neither a regex word character nor
    whitespace (so punctuation goes, but letters, digits and underscores
    stay), lowercases the result and trims surrounding whitespace.

    Args:
        text: Raw reference or hypothesis string.

    Returns:
        The normalized string.
    """
    punctuation_free = re.sub(r'[^\w\s]', '', text)
    lowered = punctuation_free.lower()
    return lowered.strip()

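# Function: normalize Chinese text (keeps only characters in U+4E00–U+9FFF; punctuation, whitespace, digits and Latin letters are all removed)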
def normalize_chinese(text):
    """Normalize Chinese text before CER scoring.

    Removes every character outside the range U+4E00–U+9FFF, which drops
    punctuation, whitespace, digits and Latin letters alike.

    Args:
        text: Raw reference or hypothesis string.

    Returns:
        The string reduced to the characters inside that range.
    """
    han_only = re.sub(r'[^\u4e00-\u9fff]', '', text)
    return han_only.strip()

# 函數：讀取單一參考文本檔案（每行一個參考文本）
def read_reference_file(ref_file):
    """Load reference transcripts from a UTF-8 text file, one per line.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped.

    Args:
        ref_file: Path of the reference text file.

    Returns:
        List of non-empty, stripped lines in file order.
    """
    references = []
    with open(ref_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                references.append(cleaned)
    return references

# 函數：提取檔案名稱中的數字（用於排序）
def get_file_number(filename):
    """Return the first parenthesized integer in ``filename`` as a sort key.

    For a name such as "generated_audio_colab (N).wav" this yields N.
    Names without a "(digits)" group yield 0.

    Args:
        filename: File name to inspect.

    Returns:
        The integer inside the first "(...)" group of digits, or 0.
    """
    found = re.search(r'\((\d+)\)', filename)
    if found is None:
        return 0
    return int(found.group(1))

# 函數：處理英文語音檔案並計算 WER
def process_english(whisper_model, audio_dir, ref_file):
    """Transcribe every ``.wav`` in ``audio_dir`` with Whisper and score it with WER.

    Audio files are paired with reference lines by position: the i-th file
    (sorted by the number in parentheses in its name, then by name) is
    compared against the i-th non-empty line of ``ref_file``. Both texts are
    passed through ``normalize_english`` before scoring.

    Args:
        whisper_model: Object exposing ``transcribe(path, language=...)`` that
            returns a mapping with a ``"text"`` entry.
        audio_dir: Directory containing the ``.wav`` files to evaluate.
        ref_file: Path of the reference text file (one transcript per line).

    Returns:
        List of per-file WER scores, in file order. Files with no reference
        line, or whose reference is empty after normalization, are skipped
        with a warning and contribute no score.
    """
    wer_scores = []

    ref_texts = read_reference_file(ref_file)

    # os.listdir returns names in arbitrary order. The file name is used as a
    # secondary key so files sharing the same number (e.g. several names
    # without "(N)") are always paired with the same reference line.
    audio_files = sorted(
        (f for f in os.listdir(audio_dir) if f.endswith('.wav')),
        key=lambda f: (get_file_number(f), f),
    )

    # A count mismatch usually means the positional pairing is off.
    if len(audio_files) != len(ref_texts):
        print(f"警告：英文語音檔案數 ({len(audio_files)}) 與參考文本行數 ({len(ref_texts)}) 不匹配！")

    for i, audio_file in enumerate(audio_files):
        if i >= len(ref_texts):  # more audio files than reference lines
            print(f"警告：缺少 {audio_file} 的參考文本，跳過")
            continue

        ref_text_norm = normalize_english(ref_texts[i])
        if not ref_text_norm:
            # WER is undefined for an empty reference and jiwer raises on it;
            # skip this file instead of aborting the whole evaluation.
            print(f"警告：{audio_file} 的參考文本正規化後為空，跳過")
            continue

        audio_path = os.path.join(audio_dir, audio_file)

        # Transcribe, then normalize the hypothesis the same way as the reference.
        result = whisper_model.transcribe(audio_path, language="en")
        hyp_text_norm = normalize_english(result["text"])

        wer_scores.append(wer(ref_text_norm, hyp_text_norm))

    return wer_scores

# 函數：處理中文語音檔案並計算 CER
def process_chinese(paraformer_pipeline, audio_dir, ref_file):
    """Transcribe every ``.wav`` in ``audio_dir`` with Paraformer and score it with CER.

    Audio files are paired with reference lines by position: the i-th file
    (sorted by the number in parentheses in its name, then by name) is
    compared against the i-th non-empty line of ``ref_file``. Both texts are
    passed through ``normalize_chinese`` before scoring.

    Args:
        paraformer_pipeline: Callable taking an audio path and returning a
            sequence whose first element is a mapping with a ``'text'`` entry.
        audio_dir: Directory containing the ``.wav`` files to evaluate.
        ref_file: Path of the reference text file (one transcript per line).

    Returns:
        List of per-file CER scores, in file order. Files with no reference
        line, or whose reference is empty after normalization, are skipped
        with a warning and contribute no score.
    """
    cer_scores = []

    ref_texts = read_reference_file(ref_file)

    # os.listdir returns names in arbitrary order. The file name is used as a
    # secondary key so files sharing the same number (e.g. several names
    # without "(N)") are always paired with the same reference line.
    audio_files = sorted(
        (f for f in os.listdir(audio_dir) if f.endswith('.wav')),
        key=lambda f: (get_file_number(f), f),
    )

    # A count mismatch usually means the positional pairing is off.
    if len(audio_files) != len(ref_texts):
        print(f"警告：中文語音檔案數 ({len(audio_files)}) 與參考文本行數 ({len(ref_texts)}) 不匹配！")

    for i, audio_file in enumerate(audio_files):
        if i >= len(ref_texts):  # more audio files than reference lines
            print(f"警告：缺少 {audio_file} 的參考文本，跳過")
            continue

        ref_text_norm = normalize_chinese(ref_texts[i])
        if not ref_text_norm:
            # normalize_chinese drops everything outside U+4E00–U+9FFF, so a
            # reference made of digits/Latin text becomes empty. CER is
            # undefined for an empty reference and jiwer raises on it; skip
            # this file instead of aborting the whole evaluation.
            print(f"警告：{audio_file} 的參考文本正規化後為空，跳過")
            continue

        audio_path = os.path.join(audio_dir, audio_file)

        # The pipeline result is indexed as a list whose first item holds 'text'.
        result = paraformer_pipeline(audio_path)
        hyp_text_norm = normalize_chinese(result[0]['text'])

        cer_scores.append(cer(ref_text_norm, hyp_text_norm))

    return cer_scores

# 主程式
def main(run_english=False, run_chinese=True):
    """Run the ASR-based intelligibility evaluation and print average error rates.

    The English (Whisper large-v3, WER) and Chinese (Paraformer, CER) passes
    can be toggled independently. Only the reference files and models needed
    for the enabled passes are checked and loaded, so a missing English
    reference no longer aborts a Chinese-only run.

    Args:
        run_english: When True, load Whisper large-v3 and report the mean WER
            over the English audio directory. Defaults to False.
        run_chinese: When True, load Paraformer and report the mean CER over
            the Chinese audio directory. Defaults to True.

    Returns:
        None. Results and errors are printed.
    """
    # File locations (adjust to match your Colab / Drive layout).
    english_audio_dir = "/content/drive/MyDrive/english_audios"  # English audio folder
    english_ref_file = "/content/drive/MyDrive/english_ref.txt"  # single English reference file
    chinese_audio_dir = "/content/drive/MyDrive/chinese_audios"  # Chinese audio folder
    chinese_ref_file = "/content/drive/MyDrive/chinese_ref1.txt"  # single Chinese reference file

    # Validate only the reference files that the enabled passes will read.
    if run_english and not os.path.exists(english_ref_file):
        print(f"錯誤：英文參考文本 {english_ref_file} 不存在！")
        return
    if run_chinese and not os.path.exists(chinese_ref_file):
        print(f"錯誤：中文參考文本 {chinese_ref_file} 不存在！")
        return

    # Load models up front, and only the ones that will actually be used.
    whisper_model = None
    if run_english:
        print("載入 Whisper-large V3 模型...")
        whisper_model = whisper.load_model("large-v3")

    paraformer_pipeline = None
    if run_chinese:
        print("載入 Paraformer 模型...")
        paraformer_pipeline = pipeline(
            task=Tasks.auto_speech_recognition,
            model='iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
        )

    # English pass: mean of per-file WER (0 when no file was scored).
    if run_english:
        print("\n處理英文語音檔案...")
        english_wer_scores = process_english(whisper_model, english_audio_dir, english_ref_file)
        avg_wer = sum(english_wer_scores) / len(english_wer_scores) if english_wer_scores else 0
        print(f"平均 WER：{avg_wer * 100:.2f}%")

    # Chinese pass: mean of per-file CER (0 when no file was scored).
    if run_chinese:
        print("\n處理中文語音檔案...")
        chinese_cer_scores = process_chinese(paraformer_pipeline, chinese_audio_dir, chinese_ref_file)
        avg_cer = sum(chinese_cer_scores) / len(chinese_cer_scores) if chinese_cer_scores else 0
        print(f"平均 CER：{avg_cer * 100:.2f}%")

# Entry point: run the ASR evaluation only when this code is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()

載入 Paraformer 模型...




Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch


2025-06-01 21:32:45,561 - modelscope - INFO - initiate model from /root/.cache/modelscope/hub/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
2025-06-01 21:32:45,561 - modelscope - INFO - initiate model from location /root/.cache/modelscope/hub/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.
2025-06-01 21:32:45,563 - modelscope - INFO - initialize model from /root/.cache/modelscope/hub/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch


funasr version: 1.2.6.
Check update of funasr, and it would cost few times. You may disable it by set `disable_update=True` in AutoModel
You are using the latest version of funasr-1.2.6


2025-06-01 21:32:53,416 - modelscope - INFO - No preprocessor key ('funasr', 'auto-speech-recognition') found in PREPROCESSOR_MAP, skip building preprocessor. If the pipeline runs normally, please ignore this log.



處理中文語音檔案...


rtf_avg: 0.019: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  2.31it/s]
rtf_avg: 0.095: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.16it/s]
rtf_avg: 0.076: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.30it/s]
rtf_avg: 0.090: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.28it/s]
rtf_avg: 0.065: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.36it/s]
rtf_avg: 0.077: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.31it/s]
rtf_avg: 0.080: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.30it/s]
rtf_avg: 0.076: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.36it/s]
rtf_avg: 0.082: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.34it/s]
rtf_avg: 0.117: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.00s/it]
rtf_avg: 0.092: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.29it/s]
rtf_avg: 0.122: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.01it/s]
rtf_avg: 0.081: 100%|[34m██████████[0m| 1/1 [00:00<00:00,  1.40it/s]
rtf_avg: 0.152: 100%|[34m██████████[0m| 1/1 [00:01<00:00,  1.08s/it]
rtf_av

平均 CER：3.60%





In [None]:
!pip install addict

Collecting addict
  Downloading addict-2.4.0-py3-none-any.whl.metadata (1.0 kB)
Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)
Installing collected packages: addict
Successfully installed addict-2.4.0


In [None]:
!pip install datasets==2.21.0

Collecting datasets==2.21.0
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets==2.21.0)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.6.1-py3-none-any.whl (177 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.6/177.6 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.0
    Uninstalling fsspec-2025.3.0:
      Successfully uninstalled fsspec-2025.3.0
  Attempting uninstall: datasets
    Found existing installation: datasets 3.6.0
    Uninstalling datasets-3.6.0:
      Successfully uninstalled datasets-3.6.0
[31mERROR: pip

In [None]:
!pip install "modelscope[audio]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html

Looking in links: https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
Collecting datasets<=3.2.0,>=3.0.0 (from modelscope[audio])
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting setuptools==69.5.1 (from modelscope[audio])
  Downloading setuptools-69.5.1-py3-none-any.whl.metadata (6.2 kB)
Collecting funasr>=1.0.0 (from modelscope[audio])
  Downloading funasr-1.2.6-py3-none-any.whl.metadata (32 kB)
Collecting kaldiio (from modelscope[audio])
  Downloading kaldiio-2.18.1-py3-none-any.whl.metadata (13 kB)
Collecting py-sound-connect>=0.1 (from modelscope[audio])
  Downloading py_sound_connect-0.1.0-py3-none-any.whl.metadata (321 bytes)
Collecting tensorboardX (from modelscope[audio])
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting hyperpyyaml (from modelscope[audio])
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting librosa==0.10.1 (from modelscope[audio])
  Downloading librosa-0.10.1-p

In [None]:
#由gemini 2.5 pro 輔助撰寫
import os
from modelscope.pipelines import pipeline
import re

# 函數：提取檔案名稱中的數字（用於排序生成語音檔案）
def get_file_number(filename):
    """Extract the sort index from a generated-audio file name.

    Looks for the first "(digits)" group, as in
    "generated_audio_colab (N).wav", and returns its integer value.

    Args:
        filename: File name to inspect.

    Returns:
        The integer N from the first "(N)" group, or 0 when there is none.
    """
    number = 0
    bracketed = re.search(r'\((\d+)\)', filename)
    if bracketed:
        number = int(bracketed.group(1))
    return number

# 函數：計算 Speaker Similarity（使用 modelscope ERes2Net 模型）
def compute_speaker_similarity(generated_path, reference_path, speaker_model):
    """Score how closely a generated utterance matches a reference speaker.

    Calls ``speaker_model`` with ``[generated_path, reference_path]`` and
    reads the ``'score'`` entry of the result, then clamps it to [0, 1].

    Args:
        generated_path: Path of the generated audio file.
        reference_path: Path of the reference speaker audio file.
        speaker_model: Callable taking a two-element list of paths and
            returning a mapping with a ``'score'`` entry (per the original
            note this is a cosine similarity — confirm against the pipeline
            documentation).

    Returns:
        The score, limited to the closed interval [0, 1].
    """
    raw_score = speaker_model([generated_path, reference_path])['score']

    # Clamp: anything above 1 becomes 1.0, anything below 0 becomes 0.0.
    if raw_score > 1.0:
        return 1.0
    if raw_score < 0.0:
        return 0.0
    return raw_score

# 函數：處理語音檔案並計算 Speaker Similarity（使用單一參考語音）
def process_speaker_similarity(speaker_model, audio_dir, ref_audio_file, speaker_name):
    """Compare every generated ``.wav`` in ``audio_dir`` with one reference recording.

    Args:
        speaker_model: Speaker-verification callable passed through to
            ``compute_speaker_similarity``.
        audio_dir: Directory holding the generated ``.wav`` files.
        ref_audio_file: Path of the single reference recording for this speaker.
        speaker_name: Display name, used only in the error message.

    Returns:
        List of similarity scores, one per ``.wav`` file, ordered by the
        number in parentheses in the file name. Empty when the reference
        file does not exist (an error is printed in that case).
    """
    # Collect the generated files and order them by their "(N)" index.
    wav_names = [entry for entry in os.listdir(audio_dir) if entry.endswith('.wav')]
    wav_names.sort(key=get_file_number)

    # Without the reference recording there is nothing to compare against.
    if not os.path.exists(ref_audio_file):
        print(f"錯誤：{speaker_name} 的參考語音檔案 {ref_audio_file} 不存在！")
        return []

    # Every generated file is scored against the same reference recording.
    return [
        compute_speaker_similarity(
            os.path.join(audio_dir, wav_name), ref_audio_file, speaker_model
        )
        for wav_name in wav_names
    ]

# 主程式
def main():
    """Report the average speaker similarity for each configured speaker.

    For every speaker the generated-audio folder and the reference recording
    are validated first; the run stops at the first problem found. The
    ERes2Net speaker-verification pipeline is then loaded once and used to
    score each speaker's generated files against that speaker's reference.
    The mean score per speaker is printed as a percentage (0 when no file
    was scored).
    """
    # Per-speaker locations (adjust to match your Colab / Drive layout).
    speaker_configs = [
        {
            'name': '中文講者 A',
            'audio_dir': '/content/drive/MyDrive/chinese_audios_speaker_a',
            'ref_audio_file': '/content/drive/MyDrive/chinese_ref_audios_speaker_a/chinese_ref_audios_speaker_a.wav'
        },
        {
            'name': '中文講者 B',
            'audio_dir': '/content/drive/MyDrive/chinese_audios_speaker_b',
            'ref_audio_file': '/content/drive/MyDrive/chinese_ref_audios_speaker_b/chinese_ref_audios_speaker_b.wav'
        },
        {
            'name': '英文講者 C',
            'audio_dir': '/content/drive/MyDrive/english_audios_speaker_c',
            'ref_audio_file': '/content/drive/MyDrive/english_ref_audios_speaker_c/english_ref_audios_speaker_c.wav'
        },
        {
            'name': '英文講者 D',
            'audio_dir': '/content/drive/MyDrive/english_audios_speaker_d',
            'ref_audio_file': '/content/drive/MyDrive/english_ref_audios_speaker_d/english_ref_audios_speaker_d.wav'
        }
    ]

    # Fail fast: validate every speaker's inputs before loading the model.
    for cfg in speaker_configs:
        speaker = cfg['name']
        generated_dir = cfg['audio_dir']
        reference_wav = cfg['ref_audio_file']

        dir_has_files = os.path.exists(generated_dir) and bool(os.listdir(generated_dir))
        if not dir_has_files:
            print(f"錯誤：{speaker} 的生成語音資料夾 {generated_dir} 不存在或為空！")
            return
        if not os.path.exists(reference_wav):
            print(f"錯誤：{speaker} 的參考語音檔案 {reference_wav} 不存在！")
            return

    # Load the ERes2Net speaker-verification pipeline once for all speakers.
    print("載入 ERes2Net 模型...")
    speaker_model = pipeline(
        task='speaker-verification',
        model='iic/speech_eres2net_sv_zh-cn_16k-common',
        model_revision='v1.0.5'
    )

    # Score each speaker and print the mean similarity.
    for cfg in speaker_configs:
        speaker = cfg['name']
        print(f"\n處理 {speaker} 的語音檔案...")
        similarity_scores = process_speaker_similarity(
            speaker_model, cfg['audio_dir'], cfg['ref_audio_file'], speaker
        )
        if similarity_scores:
            avg_similarity = sum(similarity_scores) / len(similarity_scores)
        else:
            avg_similarity = 0
        print(f"平均 {speaker} 講者相似度：{avg_similarity * 100:.2f}%")

# Entry point: run the speaker-similarity evaluation only when this code is
# executed as the main module, not when it is imported.
if __name__ == "__main__":
    main()

載入 ERes2Net 模型...


2025-06-01 23:04:31,646 - modelscope - INFO - Use user-specified model revision: v1.0.5
2025-06-01 23:04:33,540 - modelscope - INFO - Use user-specified model revision: v1.0.5


Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/iic/speech_eres2net_sv_zh-cn_16k-common


2025-06-01 23:04:33,862 - modelscope - INFO - initiate model from /root/.cache/modelscope/hub/models/iic/speech_eres2net_sv_zh-cn_16k-common
2025-06-01 23:04:33,862 - modelscope - INFO - initiate model from location /root/.cache/modelscope/hub/models/iic/speech_eres2net_sv_zh-cn_16k-common.
2025-06-01 23:04:33,864 - modelscope - INFO - initialize model from /root/.cache/modelscope/hub/models/iic/speech_eres2net_sv_zh-cn_16k-common
2025-06-01 23:04:34,539 - modelscope - INFO - No preprocessor key ('eres2net-aug-sv', 'speaker-verification') found in PREPROCESSOR_MAP, skip building preprocessor. If the pipeline runs normally, please ignore this log.



處理 中文講者 A 的語音檔案...




平均 中文講者 A 講者相似度：78.87%

處理 中文講者 B 的語音檔案...




平均 中文講者 B 講者相似度：68.57%

處理 英文講者 C 的語音檔案...




平均 英文講者 C 講者相似度：70.67%

處理 英文講者 D 的語音檔案...




平均 英文講者 D 講者相似度：71.50%
