# Whisper v3 is here!

Whisper v3 is a new model open-sourced by OpenAI. It performs multilingual transcription and is quite impressive: for example, you can switch from English to Spanish or Chinese in the middle of a sentence and it will still work well!

The model can be run in a free Google Colab instance and is already integrated into `transformers`, so switching should be very smooth if you already use a previous version.

In [None]:
%%capture
!pip install git+https://github.com/huggingface/transformers gradio

Let's use the high-level `pipeline` API from the `transformers` library to load the model.

In [None]:
import torch
from transformers import pipeline

# Load Whisper large-v3 as a speech-recognition pipeline.
# Use the first CUDA device in half precision when a GPU is available;
# otherwise fall back to CPU in float32 so the cell also runs on machines
# without CUDA instead of failing at load time.
pipe = pipeline("automatic-speech-recognition",
               "openai/whisper-large-v3",
               torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
               device="cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Quick check of the loaded pipeline: pass it the URL of a hosted sample
# recording and let the notebook display the returned result.
pipe("https://cdn-media.huggingface.co/speech_samples/sample1.flac")

Let's now build a quick Gradio demo where we can play with the model directly using our microphone! You can run this code in a Google Colab instance (or locally!) or just head to the <a href="https://huggingface.co/spaces/hf-audio/whisper-large-v3" target="_blank">Space</a> to play directly with it online.

In [None]:
import gradio as gr

def transcribe(inputs):
    """Transcribe the submitted audio and return the recognized text.

    Args:
        inputs: The audio to transcribe, as handed over by the Gradio audio
            component, or ``None`` when nothing was submitted.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is not None:
        # Timestamps are requested from the pipeline, but only the plain
        # text is handed back to the UI.
        result = pipe(
            inputs,
            generate_kwargs={"task": "transcribe"},
            return_timestamps=True,
        )
        return result["text"]

    raise gr.Error("No audio file submitted! Please record an audio before submitting your request.")

# Build the Gradio UI around `transcribe`: one audio input (microphone
# recording or uploaded file, passed to the function as a file path) and a
# plain-text output for the transcript.
demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["microphone", "upload"], type="filepath"),
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
        " checkpoint [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    allow_flagging="never",
)

# Start the app.
demo.launch()


In [None]:
#!/usr/bin/env python3
import os
import time
import gradio as gr
import torch
from datetime import timedelta
from transformers import pipeline

# ===== Whisper ASR Pipeline =====
# Pick the device for the current environment: GPU first, otherwise CPU.
# DEVICE is 0 when CUDA is available and -1 otherwise; DTYPE is half
# precision when CUDA is available and single precision otherwise.
if torch.cuda.is_available():
    DEVICE, DTYPE = 0, torch.float16
else:
    DEVICE, DTYPE = -1, torch.float32

pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device=DEVICE,
    torch_dtype=DTYPE,
    return_timestamps=True,
    chunk_length_s=30,  # split long audio into chunks automatically
    batch_size=8,
    generate_kwargs={"task": "transcribe"},  # transcribe the original speech (no translation)
)

def _format_timestamp(seconds: float) -> str:
    """秒 -> SRT 时间戳 00:00:00,000"""
    if seconds is None:
        seconds = 0.0
    td = timedelta(seconds=float(seconds))
    total_seconds = int(td.total_seconds())
    millis = int((td.total_seconds() - total_seconds) * 1000)
    h = total_seconds // 3600
    m = (total_seconds % 3600) // 60
    s = total_seconds % 60
    return f"{h:02d}:{m:02d}:{s:02d},{millis:03d}"

def _chunks_to_srt(chunks) -> str:
    """Render pipeline chunks as SRT subtitle text.

    Args:
        chunks: Iterable of dicts with a ``"text"`` entry and a
            ``"timestamp"`` entry holding ``(start, end)`` in seconds.
            Either bound, or the whole timestamp, may be missing or ``None``.
            ``None`` is treated as an empty iterable.

    Returns:
        The SRT document, terminated by a single newline. Chunks whose text
        is empty after stripping are skipped, and cues are numbered from 1
        over the chunks that remain.
    """
    lines = []
    idx = 1
    for ch in chunks or ():
        text = (ch.get("text") or "").strip()
        if not text:
            continue

        # The timestamp may be a (start, end) pair or absent altogether.
        ts = ch.get("timestamp")
        if isinstance(ts, (list, tuple)) and len(ts) == 2:
            start, end = ts
        else:
            start, end = None, None

        if start is None:
            start = 0.0
        # A missing end used to be rendered as 00:00:00,000, which put the
        # cue's end before its start. Fall back to the start time so the cue
        # stays well-formed.
        if end is None or end < start:
            end = start

        lines.append(str(idx))
        lines.append(f"{_format_timestamp(start)} --> {_format_timestamp(end)}")
        lines.append(text)
        lines.append("")  # blank line separates cues
        idx += 1
    return "\n".join(lines).strip() + "\n"

def transcribe(audio_path):
    """Transcribe an audio file and export the result as an SRT file.

    Args:
        audio_path: Path of the audio file to transcribe, or ``None`` when
            nothing was submitted.

    Returns:
        A ``(text, srt_path)`` tuple: the stripped transcript and the
        absolute path of the written ``.srt`` file.

    Raises:
        gr.Error: If ``audio_path`` is ``None``.
    """
    # Local import so this function does not depend on a file-level import.
    import tempfile

    if audio_path is None:
        raise gr.Error("No audio file submitted! Please record or upload audio before submitting.")

    # Run the transcription; the result is read as a dict with "text" and
    # "chunks" entries.
    result = pipe(audio_path)
    text = (result.get("text") or "").strip()
    chunks = result.get("chunks") or []

    # Build the SRT content.
    srt_text = _chunks_to_srt(chunks)

    # Write into a fresh temporary directory rather than the working
    # directory, so requests whose audio files share a basename cannot
    # overwrite each other's output. The directory is not removed here.
    base = os.path.splitext(os.path.basename(audio_path))[0]
    safe_base = base if base else f"transcript_{int(time.time())}"
    out_dir = tempfile.mkdtemp(prefix="whisper_srt_")
    srt_path = os.path.join(out_dir, f"{safe_base}.srt")
    with open(srt_path, "w", encoding="utf-8") as f:
        f.write(srt_text)

    return text, srt_path

# Build the Gradio UI around `transcribe`: one audio input (microphone
# recording or uploaded file, passed to the function as a file path) and two
# outputs matching the function's return tuple — the transcript text box and
# the .srt file offered for download.
demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio (mic/upload)")
    ],
    outputs=[
        gr.Textbox(label="Transcript (Plain Text)", lines=8),
        gr.File(label="Download .srt")
    ],
    title="Whisper Large V3: Transcribe Audio → SRT",
    description=(
        "Transcribe long-form microphone or audio inputs with Whisper Large V3 and export subtitles (.srt). "
        "Model: openai/whisper-large-v3 (🤗 Transformers)."
    ),
    allow_flagging="never",
)

# Launch the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()
