## Install necessary libraries for benchmarking speech to text with GPU T4x2

In [6]:
# Dependency installation for the benchmark.
# `%pip` is used instead of `!pip` so that packages are installed into the
# environment of the running kernel rather than whatever `pip` is first on the
# shell PATH.

# Install torch with CUDA support (wheels from the cu118 index)
%pip install cuda-python
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Install Whisper
%pip install git+https://github.com/openai/whisper.git

# Install dotenv for environment variable management
%pip install python-dotenv

# Install OpenAI Python client library
%pip install openai

# Install Groq library
%pip install groq

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-p8zsrk8l
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-p8zsrk8l
  Resolved https://github.com/openai/whisper.git to commit 90db0de1896c23cbfaf0c58bc2d30665f709f170
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [7]:
import torch

# Select the compute device and report the choice in the same branch.
if torch.cuda.is_available():
    device = "cuda"
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = "cpu"
    print("GPU not available. Using CPU.")

Using GPU: Tesla T4


# Speech to Text code

In [8]:
import os
import time
import whisper
import torch
import warnings
from dotenv import load_dotenv
from openai import OpenAI
from groq import Groq

# Ignore the `torch.load(weights_only=False)` warning (noted as appearing when using CPU)
warnings.filterwarnings("ignore", message="You are using `torch.load` with `weights_only=False`")

# Load the API keys from the environment instead of hard-coding them in the
# notebook. `load_dotenv()` additionally reads a local `.env` file when one is
# present. Set OPENAI_API_KEY and GROQ_API_KEY before running this cell.
#
# If a variable is unset or empty, fall back to the original placeholder string
# so both clients can still be constructed; calls made with a placeholder key
# are expected to fail, and the transcribe functions below print that error and
# return "" without stopping the other backends.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY") or "OPENAI_API_KEY"
groq_api_key = os.getenv("GROQ_API_KEY") or "GROQ_API_KEY"

# Initialize the OpenAI client
client = OpenAI(api_key=api_key)

# Initialize the Groq client
groq_client = Groq(api_key=groq_api_key)

# Transcribe an audio file through the OpenAI API (model name "whisper-1")
def transcribe_audio_with_openai(audio_file: str) -> str:
    """Upload `audio_file` to the OpenAI transcription endpoint and return the text.

    Args:
        audio_file: Path of the audio file to transcribe.

    Returns:
        The `text` attribute of the API response, or an empty string if any
        step fails (opening the file, the request, or reading the response).
        Failures are printed, never raised.
    """
    transcript = ""
    try:
        # The open handle itself is passed to the client, so the request has to
        # be made while the file is still open.
        with open(audio_file, "rb") as audio_stream:
            response = client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_stream,
            )
        transcript = response.text
    except Exception as exc:
        print(f"An error occurred: {exc}")
    return transcript

# Transcribe an audio file through the Groq API (model name "whisper-large-v3-turbo")
def transcribe_audio_with_groq(audio_file: str) -> str:
    """Send the contents of `audio_file` to Groq's transcription endpoint.

    Args:
        audio_file: Path of the audio file to transcribe. The path is also sent
            as the file name in the `(name, bytes)` upload tuple.

    Returns:
        The `text` attribute of the API response, or an empty string if any
        step fails. Failures are printed, never raised.
    """
    transcript = ""
    try:
        # The whole file is read into memory; only the bytes are handed to the client.
        with open(audio_file, "rb") as audio_stream:
            audio_bytes = audio_stream.read()
        response = groq_client.audio.transcriptions.create(
            file=(audio_file, audio_bytes),
            model="whisper-large-v3-turbo",
            response_format="verbose_json",
        )
        transcript = response.text
    except Exception as exc:
        print(f"An error occurred: {exc}")
    return transcript
    
# Loaded Whisper models keyed by (model name, device). Loading a checkpoint is
# far slower than transcribing a short clip, so each model is loaded once and
# reused instead of being reloaded for every audio file.
# Note: a cached model stays in (GPU) memory until this dict is cleared.
_WHISPER_MODEL_CACHE = {}


# Function to transcribe an audio file using Whisper model running locally
def transcribe_audio_with_whisper_local(audio_file: str, model_name: str = "turbo") -> str:
    """Transcribe `audio_file` with a locally loaded Whisper model.

    Args:
        audio_file: Path of the audio file to transcribe.
        model_name: Name passed to `whisper.load_model`. Defaults to "turbo",
            the model the notebook originally hard-coded.

    Returns:
        The "text" entry of the transcription result, or an empty string if
        loading or transcription fails. Failures are printed, never raised.
    """
    try:
        # Check for CUDA availability, otherwise fallback to CPU
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # Reuse an already-loaded model when possible. A failed load is not
        # cached, so the next call tries again.
        cache_key = (model_name, device)
        model = _WHISPER_MODEL_CACHE.get(cache_key)
        if model is None:
            model = whisper.load_model(model_name, device=device)
            _WHISPER_MODEL_CACHE[cache_key] = model

        # Half precision is only requested on the GPU; the CPU path uses fp32.
        result = model.transcribe(audio_file, fp16=(device == "cuda"))
        return result["text"]
    except Exception as e:
        print(f"An error occurred: {e}")
        return ""

In [9]:
# List of audio file paths
audio_files = [f"/kaggle/input/audio-benchmark/audio_{i}.wav" for i in range(7)]


def time_transcription(label, transcribe, audio_path):
    """Run one transcription backend on one file and print its result and timing.

    Args:
        label: Backend name used as the prefix of every printed line
            (e.g. "OpenAI", "Groq", "Whisper Local").
        transcribe: Callable taking the audio path and returning the text.
        audio_path: Path of the audio file to transcribe.

    Returns:
        A `(text, seconds)` tuple. If `transcribe` raises, the error is printed
        and `("", None)` is returned instead.
    """
    try:
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring short durations; time.time() can jump when the system clock
        # is adjusted.
        started = time.perf_counter()
        text = transcribe(audio_path)
        elapsed = time.perf_counter() - started
    except Exception as e:
        print(f"{label} Transcription Error: {e}")
        return "", None

    # An empty transcription is not printed, but its timing still is.
    if text:
        print(f"{label} Transcription: {text}")
    print(f"{label} Transcription Time: {elapsed:.2f} seconds")
    return text, elapsed


# Relies on transcribe_audio_with_openai, transcribe_audio_with_groq and
# transcribe_audio_with_whisper_local being defined by an earlier cell.
if __name__ == "__main__":
    for audio_file_path in audio_files:
        print(f"\nProcessing {audio_file_path}...")

        # Measure time for OpenAI transcription
        transcription_text, openai_duration = time_transcription(
            "OpenAI", transcribe_audio_with_openai, audio_file_path
        )

        # Measure time for Groq transcription
        transcription_text_groq, groq_duration = time_transcription(
            "Groq", transcribe_audio_with_groq, audio_file_path
        )

        # Measure time for Whisper local transcription
        transcription_text_whisper_local, whisper_local_duration = time_transcription(
            "Whisper Local", transcribe_audio_with_whisper_local, audio_file_path
        )


Processing /kaggle/input/audio-benchmark/audio_0.wav...
OpenAI Transcription: 느낌이 확실히 나죠? 그래서 앞코가 조금 더 나는 동그란 타입을 원합니다 하시는 분들은 저희 소가죽 첼시 앵클부츠 그리고 제가 신고 있는 제품
OpenAI Transcription Time: 1.83 seconds
Groq Transcription:  느낌이 확실히 나죠? 그래서 앞코가 조금 더 나는 동그란 타입을 원합니다 하시는 분들은 저희 소가죽 첼시 앵클부츠 그리고 제가 신고 있는 제품
Groq Transcription Time: 0.54 seconds
Using device: cuda
Whisper Local Transcription:  느낌이 확실히 나죠? 그래서 앞코가 조금 더 나는 동그란 타입을 원합니다 하시는 분들은 저희 소가죽 첼시 앵클부츠 그리고 제가 신고 있는 제품
Whisper Local Transcription Time: 15.75 seconds

Processing /kaggle/input/audio-benchmark/audio_1.wav...
OpenAI Transcription: 앞코가 스퀘어라인으로 빠져있어서 조금 더 트렌디한 느낌이 납니다. 와, 비교 감사드려요 하는데요. 아직 한 가지 차이점이 더 남아있습니다.
OpenAI Transcription Time: 1.32 seconds
Groq Transcription:  보면 앞코가 스퀘어 라인으로 빠져 있어서 조금 더 트렌디한 느낌이 납니다 우와 비교 감사드려야 하는데요 아직 한 가지 차이점이 더 남아있습니다
Groq Transcription Time: 0.68 seconds
Using device: cuda
Whisper Local Transcription:  앞코가 스퀘어 라인으로 빠져 있어서 조금 더 트렌디한 느낌이 납니다 우와 비교 감사드려야 하는데요 아직 한 가지 차이점이 더 남아있습니다
Whisper Local Transcrip