In [1]:
# ----------------------------
# Cell 1：安裝套件 + 載入 whisper-large-v3 到 TPU (BF16)
# ----------------------------

import os
os.environ["PT_XLA_DEBUG"] = "1" # Optional: for debugging XLA behavior

# 1. 卸載舊版 torch、torch_xla，避免版本衝突
!pip -q uninstall -y torch torch_xla 2>/dev/null || true #

# 2. 安裝 PyTorch 2.6.0 (C++11 ABI) + torch_xla 2.6.0 (TPU VM wheel, cp311)
!pip install -q \
  torch==2.6.0+cpu.cxx11.abi \
  https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0%2Bcxx11-cp311-cp311-manylinux_2_28_x86_64.whl \
  "torch_xla[tpu]==2.6.0" \
  -f https://storage.googleapis.com/libtpu-releases/index.html \
  -f https://storage.googleapis.com/libtpu-wheels/index.html \
  -f https://download.pytorch.org/whl/torch #

# 3. 安裝 Hugging Face Transformers (4.39.x) + sentencepiece + librosa + soundfile
!pip install -q "transformers>=4.39.0,<4.40.0" sentencepiece librosa soundfile #

# 4. 安裝系統音訊解碼工具 ffmpeg
!apt update -qq && apt install -y -qq ffmpeg #

# 5. 匯入 torch / torch_xla 並檢查 TPU 裝置
import torch
import torch_xla
import torch_xla.core.xla_model as xm

print("torch version   :", torch.__version__)       #
print("torch_xla ver.  :", torch_xla.__version__)   #
print("XLA devices     :", xm.get_xla_supported_devices())  #
device = xm.xla_device() # Gets the first XLA device
print("Using device    :", device)                #

# 6. 載入 whisper-large-v3 and cast to BF16
from transformers import WhisperProcessor, WhisperForConditionalGeneration

MODEL_NAME = "openai/whisper-large-v3" #
processor = WhisperProcessor.from_pretrained(MODEL_NAME) #

# Load model and immediately move to TPU device AND cast to bfloat16
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(device).to(dtype=torch.bfloat16) #

print("Loaded model    :", MODEL_NAME)
print("Model dtype     :", model.dtype) # Should show torch.bfloat16
print("max_target_positions =", model.config.max_target_positions) #
print("Cell 1 完成 (BF16).")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.7/93.7 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.3/180.3 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m82.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m95.6 MB/s[0m eta [36m0:00:00[0m
[?25h4 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)[0m
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 4 not upgrade

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

Loaded model    : openai/whisper-large-v3
Model dtype     : torch.bfloat16
max_target_positions = 448
Cell 1 完成 (BF16).


In [None]:
# ----------------------------
# Cell 3：上傳音檔 → 讀取 → Whisper 推理 (TPU Batch BF16) → 輸出繁體中文逐字稿
# ----------------------------

# 1. 確保 Cell 1 已經執行並載入 processor、model
assert "processor" in globals() and "model" in globals(), \
    "請先執行 Cell 1 (and ideally Cell 2 for compilation)，並確認 runtime 尚未重啟。"

# 2. 匯入必要模組
from google.colab import files
import os
import time
import librosa
import torch
import warnings
import numpy as np # For padding
import torch_xla.core.xla_model as xm # For xm.mark_step()

# 3. 靜音 Librosa 與 Transformers 的警告
warnings.filterwarnings("ignore", "PySoundFile failed")
warnings.filterwarnings("ignore", "Due to a bug fix")

# 4. 檢查模型裝置與 dtype
current_model_device = next(model.parameters()).device
print("model device:", current_model_device, "| model dtype:", model.dtype)

# 5. 提示使用者上傳音檔
print("\n請上傳音檔 (mp3 / wav / m4a / ogg / mp4 …) 以進行批次處理：")
uploaded_files = files.upload()  # 選擇檔案後才會繼續

if not uploaded_files:
    print("未上傳任何檔案。")
else:
    file_list = list(uploaded_files.keys())

    # 6. 定義日誌函式
    def show(msg):
        print(time.strftime("[%H:%M:%S]"), msg)

    # 7. 批次處理檔案
    show(f"準備處理 {len(file_list)} 個檔案...")

    all_waveforms_raw = []
    valid_fnames = []
    MAX_DURATION_SAMPLES = 16000 * 30 # Max 30 seconds for Whisper input

    for fname in file_list:
        show(f"  載入 {fname}")
        try:
            # 7-1. 用 librosa.load 讀檔, resample to 16kHz
            wav, sr = librosa.load(fname, sr=16000)
            show(f"    waveform shape = {wav.shape}, sr = {sr}")

            if len(wav) > MAX_DURATION_SAMPLES:
                show(f"    警告: {fname} 長度 {len(wav)/sr:.1f}s 超過 30 秒，將被截斷。")
                wav = wav[:MAX_DURATION_SAMPLES]

            all_waveforms_raw.append(wav)
            valid_fnames.append(fname)
        except Exception as e:
            show(f"    錯誤: 無法載入 {fname}. 原因: {e}")

    if not all_waveforms_raw:
        show("沒有成功載入的音檔可供處理。")
    else:
        show(f"成功載入 {len(all_waveforms_raw)} 個音檔。開始批次預處理...")

        # Pad all waveforms to MAX_DURATION_SAMPLES for batching
        padded_waveforms = []
        for wav in all_waveforms_raw:
            if len(wav) < MAX_DURATION_SAMPLES:
                # Pad with zeros to the right
                padded_wav = np.pad(wav, (0, MAX_DURATION_SAMPLES - len(wav)), mode='constant')
            else:
                padded_wav = wav # Already MAX_DURATION_SAMPLES or truncated
            padded_waveforms.append(padded_wav)

        # 7-2. 特徵抽取 (批次)
        # The processor can handle a list of numpy arrays directly
        feats = processor.feature_extractor(
            padded_waveforms, # List of numpy arrays
            sampling_rate=16000,
            return_tensors="pt",
            return_attention_mask=True
        )

        inputs = feats.input_features.to(current_model_device).to(dtype=torch.bfloat16) # To device and BF16
        attn   = feats.attention_mask.to(current_model_device)
        show(f"  批次特徵輸入: inputs.shape = {inputs.shape}, attn.shape = {attn.shape}")

        # 7-3. 取得並準備 Decoder Prompt IDs FOR BATCHING
        # This will generate the standard prompt for Chinese transcription.
        # processor.get_decoder_prompt_ids returns a list of tuples, we take the first element.
        prompt_ids_tuple = processor.get_decoder_prompt_ids(language="chinese", task="transcribe")[0] #

        # For batching, we need to replicate these prompt IDs for each item in the batch.
        # The prompt_ids_tuple is something like (sot, lang_id, task_id, no_timestamps_id)
        # We create a tensor of shape (batch_size, num_prompt_tokens)
        batch_size = inputs.shape[0]
        decoder_input_ids = torch.tensor([prompt_ids_tuple] * batch_size, device=current_model_device) #

        show(f"  使用 decoder_input_ids (shape: {decoder_input_ids.shape}) for batching. Prompt IDs: {prompt_ids_tuple}")

        # 7-4. 設定 max_length
        max_len = model.config.max_target_positions #
        show(f"  max_length 設定為 {max_len}")

        # 7-5. 呼叫 model.generate() (批次)
        show("開始 XLA 推理 (批次)…")
        t0 = time.time()
        with torch.no_grad():
            output_ids = model.generate(
                inputs,
                attention_mask=attn,
                decoder_input_ids=decoder_input_ids, # Use the batched decoder_input_ids
                max_length=max_len
            )
        xm.mark_step() # Ensure graph execution is complete
        elapsed = time.time() - t0
        show(f"  批次 ({len(valid_fnames)} 個檔案) 推理完成，耗時 {elapsed:.2f} 秒 (平均 {elapsed/len(valid_fnames):.2f} 秒/檔)")

        # 7-6. 解碼並存檔 (批次)
        transcriptions = processor.batch_decode(output_ids, skip_special_tokens=True)

        for i, transcription in enumerate(transcriptions):
            original_fname = valid_fnames[i]
            preview = transcription[:120] + ("…" if len(transcription) > 120 else "")
            print(f"\n  檔案: {original_fname}")
            print(f"    文字預覽（前 120 字）：", preview)

            out_filename = os.path.splitext(original_fname)[0] + "_transcript_batch.txt"
            with open(out_filename, "w", encoding="utf-8") as f:
                f.write(transcription)
            show(f"    已儲存檔案：{out_filename}")

    show("全部音檔處理完畢！請至左側「檔案」面板下載 *_transcript_batch.txt。")

model device: xla:0 | model dtype: torch.bfloat16

請上傳音檔 (mp3 / wav / m4a / ogg / mp4 …) 以進行批次處理：


Saving 錄製 (18).m4a to 錄製 (18).m4a
[19:37:29] 準備處理 1 個檔案...
[19:37:29]   載入 錄製 (18).m4a


	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


[19:38:28]     waveform shape = (126635,), sr = 16000
[19:38:28] 成功載入 1 個音檔。開始批次預處理...
[19:38:28]   批次特徵輸入: inputs.shape = torch.Size([1, 128, 3000]), attn.shape = torch.Size([1, 3000])
[19:38:28]   使用 decoder_input_ids (shape: torch.Size([1, 2])) for batching. Prompt IDs: (1, 50260)
[19:38:28]   max_length 設定為 448
[19:38:28] 開始 XLA 推理 (批次)…
