In [1]:
import warnings
from pathlib import Path

import torch
from accelerate import Accelerator
from dotenv import load_dotenv
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

warnings.filterwarnings("ignore")

load_dotenv()


def whisper_ft(
    audio: str | bytes,
    peft_model_id: str = "OrcinusOrca/Whisper-Cantonese",
    base_model_id: str = "openai/whisper-large-v3-turbo",
) -> dict:
    """Transcribe audio file using whisper-large-v3-turbo model with Hugging Face optimization.

    Args:
        audio (str | bytes): The audio file path string or bytes data to be transcribed.

    Returns:
        dict: A dictionary containing the transcription result with the following structure:
            {
                "text": str,    # Full transcribed text
                "chunks": [     # List of transcription chunks
                    # Each chunk is a dictionary with:
                    {
                        "timestamp": tuple[float],  # Start and end time of the chunk
                        "text": str,               # Transcribed text for this chunk
                    },
                ]
            }
    """
    accelerator = Accelerator()
    device = accelerator.device
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    print(f"Using device: {device} ({torch_dtype})")

    # Initialize model and processor
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        peft_model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    ).to(device)
    processor = AutoProcessor.from_pretrained(base_model_id)

    # Prepare model and processor for distributed training
    model, processor = accelerator.prepare(model, processor)

    # Create pipeline with the unwrapped model
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model.module if hasattr(model, "module") else model,  # Handle both DDP and non-DDP cases
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        batch_size=16,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
    )

    result = pipe(
        audio,
        generate_kwargs={"language": "yue", "task": "transcribe"},
    )

    return result

2025-05-06 07:06:58.483428: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746515218.499082 2006020 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746515218.503617 2006020 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746515218.516082 2006020 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746515218.516096 2006020 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746515218.516097 2006020 computation_placer.cc:177] computation placer alr

[2025-05-06 07:07:00,245] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `_dl_audit_symbind_alt@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `__nptl_change_stack_perm@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `_dl_find_dso_for_object@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `_dl_fatal_printf@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `_dl_exception_create@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `__tunable_get_val@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `_dl_audit_preinit@GLIBC_PRIVATE'
collect2: error: ld returned 1 exit status
/home/t

In [None]:
import warnings
from pathlib import Path

import torch
from accelerate import Accelerator
from dotenv import load_dotenv
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

warnings.filterwarnings("ignore")

load_dotenv()


def whisper_hf(
    audio: str | bytes,
    model_id: str = "openai/whisper-large-v3-turbo",
) -> dict:
    """Transcribe audio file using whisper-large-v3-turbo model with Hugging Face optimization.

    Args:
        audio (str | bytes): The audio file path string or bytes data to be transcribed.

    Returns:
        dict: A dictionary containing the transcription result with the following structure:
            {
                "text": str,    # Full transcribed text
                "chunks": [     # List of transcription chunks
                    # Each chunk is a dictionary with:
                    {
                        "timestamp": tuple[float],  # Start and end time of the chunk
                        "text": str,               # Transcribed text for this chunk
                    },
                ]
            }
    """
    accelerator = Accelerator()
    device = accelerator.device
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    print(f"Using device: {device} ({torch_dtype})")

    # Initialize model and processor
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    ).to(device)
    processor = AutoProcessor.from_pretrained(model_id)

    # Prepare model and processor for distributed training
    model, processor = accelerator.prepare(model, processor)

    # Create pipeline with the unwrapped model
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model.module if hasattr(model, "module") else model,  # Handle both DDP and non-DDP cases
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        batch_size=16,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
    )

    result = pipe(
        audio,
        generate_kwargs={"language": "yue", "task": "transcribe"},
    )

    return result

2025-05-15 04:06:08.398739: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747281968.419133 3168077 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747281968.424861 3168077 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747281968.440690 3168077 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747281968.440715 3168077 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747281968.440717 3168077 computation_placer.cc:177] computation placer alr

[2025-05-15 04:06:10,807] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `_dl_audit_symbind_alt@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `__nptl_change_stack_perm@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `_dl_find_dso_for_object@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `_dl_fatal_printf@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `_dl_exception_create@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `__tunable_get_val@GLIBC_PRIVATE'
/home/teron/miniconda3/compiler_compat/ld: /lib/x86_64-linux-gnu/libc.so.6: undefined reference to `_dl_audit_preinit@GLIBC_PRIVATE'
collect2: error: ld returned 1 exit status
/home/t

In [8]:
import warnings
from pathlib import Path

import torch
from accelerate import Accelerator
from dotenv import load_dotenv
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

warnings.filterwarnings("ignore")

load_dotenv()


def whisper_zh(
    audio: str | bytes,
    model_id: str = "alvanlii/whisper-small-cantonese",
) -> dict:
    """Transcribe audio file using whisper-small-cantonese model with Hugging Face optimization.

    Args:
        audio (str | bytes): The audio file path string or bytes data to be transcribed.
        model_id (str, optional): The model ID to use for transcription. Defaults to "alvanlii/whisper-small-cantonese".

    Returns:
        dict: A dictionary containing the transcription result with the following structure:
            {
                "text": str,    # Full transcribed text
                "chunks": [     # List of transcription chunks
                    # Each chunk is a dictionary with:
                    {
                        "timestamp": tuple[float],  # Start and end time of the chunk
                        "text": str,               # Transcribed text for this chunk
                    },
                ]
            }
    """
    accelerator = Accelerator()
    device = accelerator.device
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    print(f"Device set to use {device}")

    # Initialize model and processor
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    ).to(device)
    processor = AutoProcessor.from_pretrained(model_id)

    # Prepare model and processor for distributed training
    model, processor = accelerator.prepare(model, processor)

    # Create pipeline with the unwrapped model
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model.module if hasattr(model, "module") else model,  # Handle both DDP and non-DDP cases
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        batch_size=16,
        # Fix: Set return_timestamps to False to avoid the error
        return_timestamps=False,
        torch_dtype=torch_dtype,
        device=device,
    )

    result = pipe(audio)

    # Since we're not using timestamps, ensure the result has a consistent format
    if isinstance(result, str):
        result = {"text": result, "chunks": []}
    elif not isinstance(result, dict):
        result = {"text": str(result), "chunks": []}
    elif "chunks" not in result:
        result["chunks"] = []

    return result

In [9]:
audio_path = "dataset/bboblackboxoffice/0v03xkSi4Ic/0v03xkSi4Ic_1-15.mp3"
result1 = whisper_zh(audio_path)
print(result1["text"])

result2 = whisper_hf(audio_path)
print(result2["text"])

Device set to use cuda


Device set to use cuda


iffe 冇啲一條龍呀嗰啲呢跟住跟住可能想問呀等我透試下睇下佢條底褲正唔正事如果我離 M嗰就直接底褲都唔使買 直接今晚染紅係 直接噉嘛染紅係 究竟有乜嘢驅使佢連底褲都唔着呢 係求偶吖 求愛吖 定係求平安呀 肥我喇
Using device: cuda (torch.float16)


Device set to use cuda
You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.


看有沒有一條龍那些然後他可能心想啊... 透視一下看看他底裤正不正如果我來M的那樣就直接底裤也不用買直接今晚添紅他究竟有什麼驱使他连底裤也不穿呢是求偶啊求愛啊還是...求平安啊...啊...啊......死我啦


In [4]:
from peft import PeftConfig, PeftModel
from transformers import (
    BitsAndBytesConfig,
    Seq2SeqTrainer,
    WhisperForConditionalGeneration,
)
from transformers.models.whisper import WhisperConfig

peft_model_id = "OrcinusOrca/Whisper-Cantonese"
peft_config: PeftConfig = PeftConfig.from_pretrained(peft_model_id)
quantization_config: BitsAndBytesConfig = BitsAndBytesConfig(load_in_8bit=True)

model: WhisperForConditionalGeneration = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path,
    quantization_config=quantization_config,
    device_map="auto",
)
model: PeftModel = PeftModel.from_pretrained(model, peft_model_id)

model.model.use_cache = True

In [None]:
from accelerate import Accelerator
from peft import PeftConfig, PeftModel
from transformers import (
    AutomaticSpeechRecognitionPipeline,
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTokenizer,
)

accelerator = Accelerator()
device = accelerator.device
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

peft_model_id = "OrcinusOrca/Whisper-Cantonese"
language = "yue"
task = "transcribe"
peft_config: PeftConfig = PeftConfig.from_pretrained(peft_model_id)
quantization_config: BitsAndBytesConfig = BitsAndBytesConfig(load_in_8bit=True)

model = WhisperForConditionalGeneration.from_pretrained(
    peft_config.base_model_name_or_path,
    quantization_config=quantization_config,
)

model: PeftModel = PeftModel.from_pretrained(model, peft_model_id)

processor = WhisperProcessor.from_pretrained(
    peft_config.base_model_name_or_path,
    language=language,
    task=task,
)
feature_extractor = processor.feature_extractor
tokenizer = processor.tokenizer
forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)

pipe = AutomaticSpeechRecognitionPipeline(
    model=model,
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
)


def transcribe(audio):
    with torch.cuda.amp.autocast():
        text = pipe(audio, generate_kwargs={"forced_decoder_ids": forced_decoder_ids}, max_new_tokens=255)["text"]
    return text


audio_path = "dataset/bboblackboxoffice/0v03xkSi4Ic/0v03xkSi4Ic_1-15.mp3"
transcribe(audio_path)

adapter_config.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

NameError: name 'BitsAndBytesConfig' is not defined