In [None]:
# ----------------------------
# Cell 1: install packages + load Whisper-small onto the TPU (FP32)
# ----------------------------

import os
# Set before torch_xla is imported further down in this cell.
# NOTE(review): presumably turns on torch_xla's debug reporting — confirm against the torch_xla docs.
os.environ["PT_XLA_DEBUG"] = "1"


# 1. Uninstall any existing torch / torch_xla to avoid version conflicts
!pip -q uninstall -y torch torch_xla 2>/dev/null || true  # errors are ignored when the packages are not installed

# 2. Install PyTorch 2.6.0 (C++11 ABI) + torch_xla 2.6.0 (TPU VM wheel, cp311)
!pip install -q \
  torch==2.6.0+cpu.cxx11.abi \
  https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0%2Bcxx11-cp311-cp311-manylinux_2_28_x86_64.whl \
  "torch_xla[tpu]==2.6.0" \
  -f https://storage.googleapis.com/libtpu-releases/index.html \
  -f https://storage.googleapis.com/libtpu-wheels/index.html \
  -f https://download.pytorch.org/whl/torch  # extra wheel indexes (libtpu + torch builds)

# 3. Install Hugging Face Transformers (4.39.x) + sentencepiece + librosa + soundfile
!pip install -q "transformers>=4.39.0,<4.40.0" sentencepiece librosa soundfile  # pin transformers to 4.39.x to avoid API changes in later releases

# 4. Install the ffmpeg system package
!apt update -qq && apt install -y -qq ffmpeg  # used to decode assorted audio formats

# 5. Import torch / torch_xla and check which TPU devices are visible
import torch
import torch_xla
import torch_xla.core.xla_model as xm

print("torch version   :", torch.__version__)       # expected: the 2.6.0 cpu.cxx11.abi build pinned above
print("torch_xla ver.  :", torch_xla.__version__)   # expected: 2.6.0
print("XLA devices     :", xm.get_xla_supported_devices())  # e.g. ['xla:0'] or ['xla:0', ..., 'xla:7']
device = xm.xla_device()
print("Using device    :", device)                # e.g. xla:0

# 6. Load Whisper-small (weights stay in the checkpoint's default dtype; no cast is applied here)
from transformers import WhisperProcessor, WhisperForConditionalGeneration

MODEL_NAME = "openai/whisper-small"  # for faster runs, switch to a smaller checkpoint such as "openai/whisper-base" or "openai/whisper-tiny"
processor = WhisperProcessor.from_pretrained(MODEL_NAME)           # downloads the tokenizer and preprocessing config
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)  # move the model onto the TPU device

print("Loaded model    :", MODEL_NAME)
print("max_target_positions =", model.config.max_target_positions)  # expected 448 for whisper-small — TODO confirm if MODEL_NAME changes
print("Cell 1 完成 (FP32).")


In [None]:
# ----------------------------
# Dummy generate test: run one generation on silence so the XLA graph is
# compiled once and the TPU environment is verified before real audio is used.
# ----------------------------

import time

import numpy as np
import torch

# 1. processor / model must already exist (they are created in Cell 1)
assert "processor" in globals() and "model" in globals(), \
    "請先執行 Cell 1，並確認 runtime 尚未重啟。"

device = next(model.parameters()).device  # e.g. xla:0
print(f"Using device: {device} (dummy test)")

# 2. 30 s of silence at 16 kHz (480000 float32 samples)
silence = np.zeros(16000 * 30, dtype=np.float32)

# 3. Log-mel features plus a frame-level attention mask
feats = processor.feature_extractor(
    silence,
    sampling_rate=16000,
    return_tensors="pt",
    return_attention_mask=True
)
input_features = feats.input_features.to(device)   # (1, n_mels, 3000); n_mels is 80 for whisper-small
attention_mask  = feats.attention_mask.to(device)  # (1, 3000)

# 4. Build the full decoder prompt.
#    get_decoder_prompt_ids() returns (position, token_id) pairs for the
#    language / task / no-timestamps tokens and does NOT include the start
#    token.  Indexing it with [0] (as this cell used to) yields one
#    (position, token_id) tuple, i.e. a bogus 2-token prompt.  Keep only the
#    token ids and prepend the decoder start token instead.
forced_ids = processor.get_decoder_prompt_ids(language="chinese", task="transcribe")
prompt_ids = [model.config.decoder_start_token_id] + [token_id for _, token_id in forced_ids]
decoder_prompt = torch.tensor([prompt_ids], device=device)  # shape = (1, len(prompt_ids)), normally (1, 4)

print(f"Prompt IDs list: {prompt_ids} (長度 = {len(prompt_ids)})")
print(f"Model max_target_positions = {model.config.max_target_positions}")

# 5. Dummy inference: the first call triggers XLA compilation, so it is slow
print("\n>> 開始 Dummy 推理測試 (首輪可能需 20–60 秒)…")
t_start = time.time()
with torch.no_grad():
    _ = model.generate(
        input_features,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_prompt,
        max_length=model.config.max_target_positions  # total length budget, prompt tokens included
    )
t_elapsed = time.time() - t_start
print(f"✔ Dummy 編譯完成，耗時 {t_elapsed:.2f} 秒")

In [None]:
# ────────────────────────────────────────────────────────────
# Cell 2.5: re-run the dummy inference and print the XLA metrics for that run
# ────────────────────────────────────────────────────────────

import time  # imported here so the cell does not depend on an earlier cell's import

import torch
import torch_xla.debug.metrics as met
import torch_xla.core.xla_model as xm

# This cell reuses the tensors prepared by the dummy-generate cell.
_required = ("model", "input_features", "attention_mask", "decoder_prompt")
_missing = [name for name in _required if name not in globals()]
assert not _missing, f"請先執行 Cell 1 與 Dummy Generate 測試 cell（缺少：{_missing}）"

# 1. Flush any pending work first, THEN clear the statistics, so that the
#    flush itself is not counted in the report printed below.
xm.mark_step()
met.clear_all()

# 2. Run the dummy inference once more (this is the run being measured)
t0 = time.time()
with torch.no_grad():
    _ = model.generate(
        input_features,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_prompt,
        max_length=model.config.max_target_positions
    )
xm.mark_step()       # flush the graph so the work above has been submitted to the device
print(f"Dummy 推論完成，耗時 {time.time()-t0:.2f} 秒")

# 3. Print the XLA metrics reports
print("─── 簡短 XLA 統計 ───")
print(met.short_metrics_report())   # condensed report

print("\n─── 完整 XLA 統計 ───")
print(met.metrics_report())         # full report with every metric and counter


## 下方 Cell 3 為測試用，僅能處理 30 秒以內的音檔，若要處理更長時間，請執行 Cell 4

In [None]:
# ----------------------------
# Cell 3: upload audio → load → Whisper inference (TPU) → write a Chinese transcript
# ----------------------------

# 1. Make sure Cell 1 has been run so that processor and model exist
assert "processor" in globals() and "model" in globals(), \
    "請先執行 Cell 1，並確認 runtime 尚未重啟。"

# 2. Imports needed by this cell
from google.colab import files
import os, time, librosa, torch, warnings

# 3. Silence two known warnings by message prefix
warnings.filterwarnings("ignore", "PySoundFile failed")   # NOTE(review): presumably librosa's notice when it falls back to audioread (m4a/mp3) — confirm
warnings.filterwarnings("ignore", "Due to a bug fix")    # NOTE(review): presumably a Transformers notice about multilingual Whisper behaviour — confirm

# 4. Report the device and dtype the model currently lives on
device = next(model.parameters()).device
print("model device:", device, "| model dtype:", model.dtype)  # e.g. xla:0 / float32

# 5. Ask the user to upload one or more audio files
print("\n請上傳音檔 (mp3 / wav / m4a / ogg / mp4 …)：")
uploaded_files = files.upload()  # blocks until the user has picked files
file_list = list(uploaded_files.keys())

# 6. Timestamped logging helper
def show(msg):
    """Print ``msg`` prefixed with the current local time as ``[HH:MM:SS]``."""
    stamp = time.strftime("[%H:%M:%S]")
    print(stamp, msg)

# 7. Transcribe every uploaded file (only the first 30 s of each file is used)

# The decoder prompt and length budget do not depend on the file, so build them once.
# get_decoder_prompt_ids() returns (position, token_id) pairs for the
# language / task / no-timestamps tokens and omits the start token.  Taking
# element [0] (as this loop used to) yields a single (position, token_id)
# tuple, i.e. a bogus prompt.  Keep only the token ids and prepend the
# decoder start token.
forced_ids = processor.get_decoder_prompt_ids(language="chinese", task="transcribe")
prompt_ids = [model.config.decoder_start_token_id] + [token_id for _, token_id in forced_ids]
decoder_prompt = torch.tensor([prompt_ids], device=device)  # shape = (1, len(prompt_ids)), normally (1, 4)
max_len = model.config.max_target_positions                 # total length budget, prompt tokens included

for fname in file_list:
    show(f"載入 {fname}")
    # (optional) to transcode m4a → wav first, uncomment the two lines below:
    # !ffmpeg -i "{fname}" -ar 16000 -ac 1 "tmp.wav" -y
    # fname = "tmp.wav"

    # 7-1. Load and resample to 16 kHz
    wav, sr = librosa.load(fname, sr=16000)
    show(f"  waveform shape = {wav.shape}, sr = {sr}")
    if len(wav) > 30 * sr:
        # The feature extractor pads/truncates to a 30 s window, so anything
        # beyond that is silently ignored — tell the user instead.
        show("  ⚠ 音檔超過 30 秒，本 Cell 只會轉寫前 30 秒；較長音檔請改用 Cell 4")

    # 7-2. Feature extraction (log-mel input_features + frame-level attention_mask)
    feats = processor.feature_extractor(
        wav,
        sampling_rate=sr,
        return_tensors="pt",
        return_attention_mask=True
    )
    inputs = feats.input_features.to(device)  # (1, n_mels, 3000); n_mels is 80 for whisper-small
    attn   = feats.attention_mask.to(device)  # (1, 3000)
    show(f"  特徵輸入: inputs.shape = {inputs.shape}, attn.shape = {attn.shape}")

    # 7-3 / 7-4. Report the prompt and length budget built above
    show(f"  prompt IDs (長度) = {len(prompt_ids)} (應為 4：起始 token、語言、任務、無時間戳)")
    show(f"  max_length 設定為 {max_len}")

    # 7-5. Run generation on the TPU (compiles on first use)
    show("開始 XLA 編譯/推理…（首輪若剛編譯過 Dummy, 推理 < 1 秒）")
    t0 = time.time()
    with torch.no_grad():
        output_ids = model.generate(
            inputs,
            attention_mask=attn,
            decoder_input_ids=decoder_prompt,
            max_length=max_len
        )
    elapsed = time.time() - t0
    show(f"  推理完成，耗時 {elapsed:.2f} 秒")

    # 7-6. Decode and save next to the uploaded file
    transcription = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
    preview = transcription[:120] + ("…" if len(transcription) > 120 else "")
    print("  文字預覽（前 120 字）：", preview)

    out_filename = os.path.splitext(fname)[0] + "_transcript.txt"
    with open(out_filename, "w", encoding="utf-8") as f:
        f.write(transcription)
    show(f"  已儲存檔案：{out_filename}")

show("全部音檔處理完畢！請至左側「檔案」面板下載 *_transcript.txt。")


## 下方 Cell 4 可以執行超過一小時的音檔，但請注意您的計算資源和時間限制

In [None]:
# ----------------------------
# Cell 4: stream a long recording in 30 s windows → Whisper inference → join the transcript
# ----------------------------

assert "processor" in globals() and "model" in globals(), \
   "請先執行 Cell 1（含 Dummy 推理），並確認 runtime 尚未重啟。"

from google.colab import files
import soundfile as sf, numpy as np, torch, time, os, warnings
import subprocess
import librosa

# Silence known warnings (matched by message prefix)
warnings.filterwarnings("ignore", "PySoundFile")
warnings.filterwarnings("ignore", "Due to a bug fix")

# 1. Basic settings
device = next(model.parameters()).device                  # e.g. xla:0

# get_decoder_prompt_ids() takes (task, language) in that order, so the
# arguments must be passed by keyword; the old positional call swapped them.
# It returns (position, token_id) pairs without the start token, so keep the
# token ids and prepend the decoder start token to get the full prompt.
forced_ids = processor.get_decoder_prompt_ids(language="chinese", task="transcribe")
prompt_ids = [model.config.decoder_start_token_id] + [token_id for _, token_id in forced_ids]
decoder_prompt = torch.tensor([prompt_ids], device=device)     # shape (1, len(prompt_ids)), normally (1, 4)

SR         = 16000      # sample rate the model expects
CHUNK_SEC  = 30         # the feature extractor keeps at most 30 s per call
CHUNK_SMP  = CHUNK_SEC * SR  # 30*16000 = 480000
# No overlap between windows: the extractor truncates anything past 30 s, so
# the former 5 s padding on each side was cut off again (dropping audio at the
# end of the file) while still duplicating text between the first two chunks.
# Words that straddle a window boundary may therefore be split.

print("請上傳最長 ≥1h 的音檔（wav/mp3/m4a...）")
uploaded = files.upload()

for fp in uploaded:
    # 2. Open the file for streaming reads
    try:
        snd = sf.SoundFile(fp)
    except RuntimeError:
        # libsndfile cannot decode this container (e.g. m4a/AAC): transcode it
        # to 16 kHz mono WAV with the ffmpeg binary installed in Cell 1.
        wav_path = os.path.splitext(fp)[0] + "_16k_mono.wav"
        subprocess.run(
            ["ffmpeg", "-y", "-loglevel", "error", "-i", fp,
             "-ar", str(SR), "-ac", "1", wav_path],
            check=True,
        )
        snd = sf.SoundFile(wav_path)

    with snd:  # guarantees the handle is closed even if inference raises
        src_sr = snd.samplerate                # the file's own rate; not assumed to be 16 kHz
        win_frames = CHUNK_SEC * src_sr        # frames covering 30 s at the file's rate
        total_samples = len(snd)
        n_chunks = int(np.ceil(total_samples / win_frames))
        print(f"\n--- 處理檔案：{fp} ({total_samples/src_sr/60:.2f} 分鐘)，共需 {n_chunks} 塊 ---")

        segments = []
        t_all = time.time()

        for i in range(n_chunks):
            # 2-1. Read the next 30 s sequentially; always_2d gives (frames, channels)
            #      so averaging over axis 1 down-mixes any channel count to mono.
            audio = snd.read(win_frames, dtype="float32", always_2d=True).mean(axis=1)
            if audio.size == 0:
                break  # reported frame count was larger than the decodable data
            if src_sr != SR:
                audio = librosa.resample(audio, orig_sr=src_sr, target_sr=SR)

            # 2-2. Feature extraction + inference
            feats = processor.feature_extractor(
                audio,
                sampling_rate=SR,
                return_tensors="pt",
                return_attention_mask=True
            )

            with torch.no_grad():
                ids = model.generate(
                    feats.input_features.to(device),
                    attention_mask=feats.attention_mask.to(device),
                    decoder_input_ids=decoder_prompt,
                    max_length=model.config.max_target_positions  # total budget, prompt included
                )
            text = processor.batch_decode(ids, skip_special_tokens=True)[0]
            segments.append(text)

            if (i+1) % 10 == 0 or i == n_chunks-1:
                print(f"  chunk {i+1}/{n_chunks} 完成")

    # 3. Join the per-window texts
    full_transcript = "".join(segments)

    # 4. Write the transcript next to the uploaded file
    out_name = os.path.splitext(fp)[0] + "_1h_transcript.txt"
    with open(out_name, "w", encoding="utf-8") as f:
        f.write(full_transcript)

    print(f"✅ 完成 {fp} → {out_name} | 總耗時 {time.time()-t_all:.1f} 秒")


## 補充：如果 Cell 4 無法成功執行，請嘗試執行下方的 Cell 5

In [None]:
# ----------------------------
# (Long-audio variant) stream in 30 s windows → TPU inference → join the transcript
# ----------------------------
assert "processor" in globals() and "model" in globals(), \
       "請先執行 Cell 1（含 Dummy 編譯）"

import soundfile as sf, numpy as np, torch, time, os, warnings
import subprocess
import librosa
from google.colab import files

warnings.filterwarnings("ignore", "PySoundFile")
warnings.filterwarnings("ignore", "Due to a bug fix")

device = next(model.parameters()).device
print("device:", device, "| dtype:", model.dtype)

# --- Parameters ---
SAMPLE_RATE = 16000
CHUNK_SEC   = 30          # the feature extractor keeps at most 30 s per call
CHUNK_SMP   = CHUNK_SEC * SAMPLE_RATE   # 480 000
# Windows do not overlap: anything read beyond 30 s is truncated by the
# feature extractor, so the former 5 s padding lost audio at the end of the
# file instead of adding context.  Words on a window boundary may be split.

# Chinese transcription prompt.  get_decoder_prompt_ids() takes (task, language)
# in that order, so pass the arguments by keyword; it returns
# (position, token_id) pairs without the start token, which is prepended here.
forced_ids = processor.get_decoder_prompt_ids(language="chinese", task="transcribe")
prompt_ids = [model.config.decoder_start_token_id] + [token_id for _, token_id in forced_ids]
decoder_prompt = torch.tensor([prompt_ids], device=device)

print("\n請上傳長音檔（可達數小時、格式 wav/mp3/m4a…）：")
uploaded = files.upload()

for f in uploaded:
    try:
        snd = sf.SoundFile(f)                    # streaming handle
    except RuntimeError:
        # libsndfile cannot decode this container (e.g. m4a/AAC): transcode it
        # to 16 kHz mono WAV with the ffmpeg binary installed in Cell 1.
        converted = os.path.splitext(f)[0] + "_16k_mono.wav"
        subprocess.run(
            ["ffmpeg", "-y", "-loglevel", "error", "-i", f,
             "-ar", str(SAMPLE_RATE), "-ac", "1", converted],
            check=True,
        )
        snd = sf.SoundFile(converted)

    with snd:  # closes the handle even if inference raises
        file_sr    = snd.samplerate              # the file's own rate; not assumed to be 16 kHz
        win_frames = CHUNK_SEC * file_sr         # frames covering 30 s at the file's rate
        total_smp  = len(snd)
        n_chunks   = int(np.ceil(total_smp / win_frames))
        print(f"\n=== {f} → {total_smp/file_sr/60:.1f} 分鐘，切為 {n_chunks} 塊 ===")

        segments, t_start = [], time.time()
        for i in range(n_chunks):
            # 1) Read the next 30 s sequentially and down-mix to mono
            audio = snd.read(win_frames, dtype="float32", always_2d=True).mean(axis=1)
            if audio.size == 0:
                break  # reported frame count was larger than the decodable data
            if file_sr != SAMPLE_RATE:
                audio = librosa.resample(audio, orig_sr=file_sr, target_sr=SAMPLE_RATE)

            # 2) Feature extraction → TPU inference
            feats = processor.feature_extractor(audio, sampling_rate=SAMPLE_RATE,
                                                return_tensors="pt", return_attention_mask=True)
            with torch.no_grad():
                ids = model.generate(
                    feats.input_features.to(device),
                    attention_mask=feats.attention_mask.to(device),
                    decoder_input_ids=decoder_prompt,
                    max_length=model.config.max_target_positions
                )
            text = processor.batch_decode(ids, skip_special_tokens=True)[0]
            segments.append(text)

            if (i+1) % 20 == 0 or i == n_chunks-1:
                print(f" chunk {i+1}/{n_chunks} done")

    transcript = "".join(segments)          # plain concatenation of the per-window texts
    out_txt = os.path.splitext(f)[0] + "_long_transcript.txt"
    with open(out_txt, "w", encoding="utf-8") as fp:
        fp.write(transcript)

    print(f"✔ 完成：{out_txt} | 耗時 {time.time()-t_start:.1f}s")
