In [1]:
# ----------------------------
# Cell 1：安裝套件 + 載入 whisper-large-v3 到 TPU (BF16)
# ----------------------------

import os
os.environ["PT_XLA_DEBUG"] = "1" # Optional: for debugging XLA behavior

# 1. 卸載舊版 torch、torch_xla，避免版本衝突
!pip -q uninstall -y torch torch_xla 2>/dev/null || true #

# 2. 安裝 PyTorch 2.6.0 (C++11 ABI) + torch_xla 2.6.0 (TPU VM wheel, cp311)
!pip install -q \
  torch==2.6.0+cpu.cxx11.abi \
  https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0%2Bcxx11-cp311-cp311-manylinux_2_28_x86_64.whl \
  "torch_xla[tpu]==2.6.0" \
  -f https://storage.googleapis.com/libtpu-releases/index.html \
  -f https://storage.googleapis.com/libtpu-wheels/index.html \
  -f https://download.pytorch.org/whl/torch #

# 3. 安裝 Hugging Face Transformers (4.39.x) + sentencepiece + librosa + soundfile
!pip install -q "transformers>=4.39.0,<4.40.0" sentencepiece librosa soundfile #

# 4. 安裝系統音訊解碼工具 ffmpeg
!apt update -qq && apt install -y -qq ffmpeg #

# 5. 匯入 torch / torch_xla 並檢查 TPU 裝置
import torch
import torch_xla
import torch_xla.core.xla_model as xm

print("torch version   :", torch.__version__)       #
print("torch_xla ver.  :", torch_xla.__version__)   #
print("XLA devices     :", xm.get_xla_supported_devices())  #
device = xm.xla_device() # Gets the first XLA device
print("Using device    :", device)                #

# 6. 載入 whisper-large-v3 and cast to BF16
from transformers import WhisperProcessor, WhisperForConditionalGeneration

MODEL_NAME = "openai/whisper-large-v3" #
processor = WhisperProcessor.from_pretrained(MODEL_NAME) #

# Load model and immediately move to TPU device AND cast to bfloat16
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(device).to(dtype=torch.bfloat16) #

print("Loaded model    :", MODEL_NAME)
print("Model dtype     :", model.dtype) # Should show torch.bfloat16
print("max_target_positions =", model.config.max_target_positions) #
print("Cell 1 完成 (BF16).")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.7/93.7 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.3/180.3 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m126.8 MB/s[0m eta [36m0:00:00[0m
[?25h4 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)[0m
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 4 not upgrad

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

Loaded model    : openai/whisper-large-v3
Model dtype     : torch.bfloat16
max_target_positions = 448
Cell 1 完成 (BF16).


In [None]:
# ----------------------------
# Cell 4 (Optimized for BF16 inputs): 流式切片 1h → Whisper 推理 → 拼接繁體中文逐字稿
# ----------------------------

# Ensure model and processor from Cell 1 are available
assert "processor" in globals() and "model" in globals(), \
   "請先執行 Cell 1（含 Dummy 推理），並確認 runtime 尚未重啟。"

from google.colab import files
import soundfile as sf
import numpy as np
import torch
import time
import os
import warnings
import torch_xla.core.xla_model as xm # For xm.mark_step()

# 靜音警告
warnings.filterwarnings("ignore", "PySoundFile failed") # For certain audio formats
warnings.filterwarnings("ignore", "UserWarning: PySoundFile failed. Trying audioread instead.")
warnings.filterwarnings("ignore", "Due to a bug fix") # From Hugging Face Transformers

# 1. 基本設定
current_model_device = next(model.parameters()).device # Get device from the loaded model
current_model_dtype = model.dtype # Get dtype from the loaded model
print(f"Using device: {current_model_device}, Model dtype: {current_model_dtype}")

# Prompt IDs for Chinese transcription (should be same as earlier cells)
# This creates a prompt like <|startoftranscript|><|zh|><|transcribe|><|notimestamps|>
# The [0] gets the tuple of IDs, e.g., (50258, 50308, 50260, 50362)
_prompt_ids_tuple = processor.get_decoder_prompt_ids(language="chinese", task="transcribe")[0] #
# decoder_prompt will be shape (1, num_prompt_tokens)
decoder_prompt = torch.tensor([_prompt_ids_tuple], device=current_model_device) #

SR         = 16000      # 取樣率 (Hertz)
CHUNK_SEC  = 30         # 模型一次最大支援 30 秒
STRIDE_SEC = 5          # 左右各重疊 5 秒，可調 (步進 = CHUNK_SEC - 2*STRIDE_SEC if centered, or CHUNK_SEC - STRIDE_SEC if one-sided)
                        # The original code implies chunking every CHUNK_SEC and then adding STRIDE_SEC padding on both sides for reading
                        # Let's keep the original logic for segment calculation:
                        # mid_start = i * CHUNK_SMP
                        # seg_start = max(0, mid_start - STRIDE_SMP)
                        # seg_end   = min(mid_start + CHUNK_SMP + STRIDE_SMP, total_samples)
                        # This means each chunk is CHUNK_SEC long, but we read CHUNK_SEC + 2*STRIDE_SEC for context,
                        # which Whisper will then process up to its 30s limit.

CHUNK_SMP  = CHUNK_SEC  * SR  # 30*16000 = 480000 samples
STRIDE_SMP = STRIDE_SEC * SR  # 5*16000  = 80000 samples

print(f"Chunk setup: {CHUNK_SEC}s chunks with {STRIDE_SEC}s stride/overlap on each side for reading context.")
print(f"Decoder prompt for generation (on {decoder_prompt.device}): {decoder_prompt}")

print("\n請上傳最長 ≥1h 的音檔（wav/mp3/m4a...）")
uploaded = files.upload()

if not uploaded:
    print("未上傳任何檔案。")
else:
    for fp in uploaded:
        print(f"\n--- 開始處理檔案：{fp} ---")
        try:
            # 2. 用 SoundFile 流式讀取
            with sf.SoundFile(fp, 'r') as snd:
                total_samples = len(snd)
                # Calculate number of primary 30-second chunks
                # This calculation determines how many times we advance our main processing window.
                # Each advance is by CHUNK_SMP.
                n_chunks = int(np.ceil(total_samples / CHUNK_SMP))
                if total_samples == 0:
                    print(f"錯誤：檔案 {fp} 為空或無法讀取時長。")
                    continue

                show_str = f"檔案時長: {total_samples/SR:.2f} 秒 ({total_samples/SR/60:.2f} 分鐘). "
                show_str += f"預計切成 {n_chunks} 個主要區塊 (每區塊 {CHUNK_SEC}s)."
                print(show_str)

                segments = []
                t_all_start = time.time()

                for i in range(n_chunks):
                    chunk_num = i + 1
                    print(f"  處理區塊 {chunk_num}/{n_chunks}...")

                    # 2-1. 決定讀取區間 (含重疊)
                    # mid_start is the beginning of the nominal CHUNK_SEC segment
                    mid_start = i * CHUNK_SMP

                    # Read a wider segment for context, then Whisper's feature extractor will crop/pad to 30s
                    seg_start = max(0, mid_start - STRIDE_SMP)
                    seg_end   = min(total_samples, mid_start + CHUNK_SMP + STRIDE_SMP) # Read up to CHUNK_SEC + 2*STRIDE_SMP window
                                                                                      # But feature_extractor will handle max 30s.

                    if seg_start >= seg_end: # Should not happen if total_samples > 0
                        print(f"    跳過空的讀取區間: seg_start={seg_start}, seg_end={seg_end}")
                        continue

                    # 2-2. 流式讀取該段樣本
                    snd.seek(seg_start)
                    # Read (seg_end - seg_start) samples
                    audio_segment = snd.read(seg_end - seg_start, dtype="float32", always_2d=False) # Read as 1D array
                    # audio_segment shape will be (num_samples_in_segment,)

                    if audio_segment.ndim > 1 and audio_segment.shape[0] > 1 and audio_segment.shape[1] > 1: # if stereo
                        audio_segment = np.mean(audio_segment, axis=1) # Convert to mono by averaging channels

                    if len(audio_segment) == 0:
                        print(f"    警告：區塊 {chunk_num} 讀取到 0 個樣本。 seg_start={seg_start}, seg_end={seg_end}")
                        continue

                    # 2-3. 特徵抽取 + 推理
                    # The feature_extractor will automatically handle audio longer than 30s
                    # by taking the first 30s (480,000 samples).
                    # It also pads shorter audio to 30s.
                    feats = processor.feature_extractor(
                        audio_segment,
                        sampling_rate=SR,
                        return_tensors="pt",
                        return_attention_mask=True
                    )

                    # Move features to device and cast to BF16
                    input_features_bf16 = feats.input_features.to(current_model_device).to(dtype=current_model_dtype)
                    attention_mask_device = feats.attention_mask.to(current_model_device)

                    t_gen_start = time.time()
                    with torch.no_grad():
                        generated_ids = model.generate(
                            input_features_bf16,
                            attention_mask=attention_mask_device,
                            decoder_input_ids=decoder_prompt, # Use the pre-defined prompt for Chinese transcription
                            max_length=model.config.max_target_positions # e.g. 448
                        )
                    xm.mark_step() # Ensure graph execution for this step is flushed
                    t_gen_elapsed = time.time() - t_gen_start

                    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                    segments.append(text)

                    preview = text[:80].replace("\n", " ") + ("..." if len(text) > 80 else "")
                    print(f"    區塊 {chunk_num} 推理完成 ({t_gen_elapsed:.2f}s): \"{preview}\"")

                # snd.close() happens automatically with 'with sf.SoundFile(...) as snd:'

                # 3. 拼接所有段落文字
                # This is a simple concatenation. Overlapping text from strides is not explicitly handled/merged here,
                # but Whisper is often robust enough if the overlap is reasonable.
                full_transcript = "".join(segments)

                # 4. 輸出
                out_name = os.path.splitext(fp)[0] + "_transcript_long.txt"
                with open(out_name, "w", encoding="utf-8") as f_out:
                    f_out.write(full_transcript)

                t_all_elapsed = time.time() - t_all_start
                print(f"✅ 完成檔案 {fp} → {out_name} | 總耗時 {t_all_elapsed:.1f} 秒")
                if n_chunks > 0:
                     print(f"  平均每主要區塊耗時: {t_all_elapsed/n_chunks:.2f} 秒 (包含讀檔與推理)")

        except Exception as e:
            print(f"處理檔案 {fp} 時發生錯誤: {e}")
            import traceback
            traceback.print_exc()

    print("\n所有上傳檔案處理完畢！")

Using device: xla:0, Model dtype: torch.bfloat16
Chunk setup: 30s chunks with 5s stride/overlap on each side for reading context.
Decoder prompt for generation (on xla:0): tensor([[    1, 50260]], device='xla:0')

請上傳最長 ≥1h 的音檔（wav/mp3/m4a...）


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


Saving Team Project Conflict Resolution Attempt.wav to Team Project Conflict Resolution Attempt.wav

--- 開始處理檔案：Team Project Conflict Resolution Attempt.wav ---
檔案時長: 505.82 秒 (8.43 分鐘). 預計切成 17 個主要區塊 (每區塊 30s).
  處理區塊 1/17...
