In [1]:
import os

os.environ.update(
    {
        # ROCm: version string 10.3.0 corresponds to gfx1030.
        "HSA_OVERRIDE_GFX_VERSION": "10.3.0",
        # Multiple GPUs (e.g. a discrete GPU plus an iGPU/APU): select device 0.
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

# Shell equivalent of the environment settings above:
# export HSA_OVERRIDE_GFX_VERSION="10.3.0"
# export CUDA_VISIBLE_DEVICES="0"

In [2]:
import bentoml
import os
import typing as t

from pathlib import Path

# Language code handed to both whisperx.load_model (language=) and
# whisperx.load_align_model (language_code=) in the WhisperX service below.
LANGUAGE_CODE = "en"


@bentoml.service(
    traffic={"timeout": 30},
    resources={"gpu": 1, "memory": "8Gi"},
)
class WhisperX:
    """
    BentoML service that transcribes audio with WhisperX and aligns the result.

    This class is inspired by the implementation shown in the whisperX project.
    Source: https://github.com/m-bain/whisperX
    """

    def __init__(self):
        """Pick a device, then load the transcription and alignment models."""
        import torch
        import whisperx

        # float16 on a CUDA device, int8 when running on CPU.
        if torch.cuda.is_available():
            self.device, precision = "cuda", "float16"
        else:
            self.device, precision = "cpu", "int8"

        self.batch_size = 16  # reduce if low on GPU mem

        self.model = whisperx.load_model(
            "large-v2",
            self.device,
            compute_type=precision,
            language=LANGUAGE_CODE,
        )

        # load_align_model returns the alignment model together with its metadata;
        # both are needed later by whisperx.align.
        aligner, align_metadata = whisperx.load_align_model(
            language_code=LANGUAGE_CODE,
            device=self.device,
        )
        self.model_a = aligner
        self.metadata = align_metadata

    @bentoml.api
    def transcribe(self, audio_file: Path) -> t.Dict:
        """
        Transcribe ``audio_file`` and align the resulting segments.

        Args:
            audio_file: Path of the audio file, passed to ``whisperx.load_audio``.

        Returns:
            The value returned by ``whisperx.align`` for the transcribed
            segments (character-level alignments are not requested).
        """
        import whisperx

        waveform = whisperx.load_audio(audio_file)
        transcription = self.model.transcribe(waveform, batch_size=self.batch_size)

        # The same loaded audio is reused for the alignment pass.
        return whisperx.align(
            transcription["segments"],
            self.model_a,
            self.metadata,
            waveform,
            self.device,
            return_char_alignments=False,
        )

In [1]:
import torch

# Check the GPU as PyTorch sees it. Only the last expression of the cell is
# echoed, so the is_available() result is not displayed; the recorded output
# is the name reported for device index 0.
torch.cuda.is_available()
torch.cuda.get_device_name(0)

'AMD Radeon RX 6900 XT'

In [2]:
from faster_whisper import WhisperModel

model_size = "large-v3"

# GPU with FP16. Other configurations to try:
#   GPU with INT8: WhisperModel(model_size, device="cuda", compute_type="int8_float16")
#   CPU with INT8: WhisperModel(model_size, device="cpu", compute_type="int8")
# NOTE(review): the recorded output of this cell is a RuntimeError ("CUDA driver
# version is insufficient for CUDA runtime version") - presumably the CUDA
# backend cannot use the ROCm device on this host; confirm before relying on it.
model = WhisperModel(
    model_size,
    device="cuda",
    compute_type="float16",
)

segments, info = model.transcribe("./female.wav", beam_size=5)

language_report = "Detected language '%s' with probability %f" % (
    info.language,
    info.language_probability,
)
print(language_report)

# One line per segment: start/end time in seconds, then the text.
for segment in segments:
    timed_line = "[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)
    print(timed_line)

RuntimeError: CUDA failed with error CUDA driver version is insufficient for CUDA runtime version