## 1. Install required packages

In [None]:
!pip install -q openai-whisper
!pip install -q gradio

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m108.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m83.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.9 MB/s[0m eta [36m0:

In [None]:
!pip install -q -U google-genai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25h

## 2. Import required libraries

In [None]:
import whisper
import gradio as gr
import os
import tempfile
import torch

## 3. Create the WhisperSTT class

In [None]:
class WhisperSTT:
    """
    Speech-to-Text class using OpenAI's Whisper model for Arabic transcription
    with automatic text correction using Google's Gemini model when possible.

    Text correction is strictly best-effort: if no API key is given, the
    Gemini client cannot be created, the request fails, or the model returns
    no text, the raw Whisper transcription is returned instead.
    """

    # Gemini model used for text correction. Kept as a class attribute so it
    # can be overridden (on a subclass or an instance) without editing methods.
    GEMINI_MODEL = "gemini-2.0-flash"

    def __init__(self, model_size="medium", gemini_api_key=None):
        """
        Initialize the Whisper model for Arabic speech recognition.

        Args:
            model_size (str): Size of the Whisper model (tiny, base, small, medium, large)
            gemini_api_key (str, optional): API key for Google's Gemini model for text correction
        """
        self.model = whisper.load_model(model_size)
        self.model_size = model_size
        self.gemini_api_key = gemini_api_key
        self.gemini_client = None

        # Initialize Gemini client if API key is provided
        if gemini_api_key:
            try:
                # Imported lazily so the class still works (without correction)
                # when the optional google-genai package is missing.
                from google import genai
                self.gemini_client = genai.Client(api_key=gemini_api_key)
                print("Gemini text correction enabled")
            except ImportError:
                print("Warning: google-genai package not installed. Using raw transcription.")
            except Exception as e:
                print(f"Warning: Failed to initialize Gemini client: {e}. Using raw transcription.")

    def transcribe(self, audio_path):
        """
        Transcribe Arabic speech to text with automatic text correction when possible.

        Args:
            audio_path (str): Path to the audio file

        Returns:
            str: Transcribed text (corrected if Gemini is available and the
                correction succeeds, otherwise the raw Whisper output)

        Raises:
            FileNotFoundError: If ``audio_path`` does not exist.
        """
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        # Transcribe the audio with Arabic language hint
        result = self.model.transcribe(audio_path, language="ar")
        transcription = result["text"]

        # Nothing to correct (e.g. silent audio): skip the API call rather than
        # asking the model to "fix" an empty text, which invites invented output.
        if not transcription.strip():
            return transcription

        # _correct_text handles its own failures and returns None, so a falsy
        # result simply means "fall back to the raw transcription".
        if self.gemini_client and self.gemini_api_key:
            corrected_text = self._correct_text(transcription)
            if corrected_text:
                return corrected_text

        return transcription

    def _correct_text(self, text):
        """
        Correct the transcribed text using Google's Gemini model.

        Never raises: any error from the request is printed and reported as
        ``None`` so the caller can fall back to the raw transcription.

        Args:
            text (str): Original transcribed text

        Returns:
            str: Corrected text (stripped), or None if the request fails or
                the response carries no text
        """
        try:
            prompt = f"""
            صحّح النص التالي المكتوب باللهجة اللبنانية:
            - صحّح الأخطاء الإملائية والنحوية فقط.
            - إذا كان في جملة استفهامية، ضيف علامة استفهام.
            - ما تضيف ولا كلمة زيادة أو شرح.
            - رجّع فقط النص المصحَّح، بدون علامات تنصيص أو أي إضافات.
            النص: "{text}"
            """

            response = self.gemini_client.models.generate_content(
                model=self.GEMINI_MODEL, contents=prompt
            )
            # An empty or missing `text` on the response is treated as a
            # failed correction instead of crashing on `.strip()`.
            corrected = (response.text or "").strip()
        except Exception as e:
            print(f"Text correction error: {e}")
            return None

        if not corrected:
            print("Warning: Gemini returned no text. Using raw transcription.")
            return None
        return corrected

## 4. Testing

In [None]:
# Colab-only helper module; this cell is expected to run inside Google Colab.
from google.colab import userdata
# Read the Gemini key via Colab's userdata store instead of hard-coding it
# in the notebook.
GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
print("Gemini API key successfully loaded from secrets")

# Build the transcriber with the "medium" Whisper model; passing the key lets
# WhisperSTT try to set up Gemini-based text correction.
stt_model = WhisperSTT(
    model_size="medium",
    gemini_api_key=GEMINI_API_KEY
)

# Transcribe the first sample recording. The result is the corrected text when
# correction succeeds, otherwise the raw Whisper transcription.
transcription = stt_model.transcribe("/content/arabic_speech.wav")
print(transcription)


Gemini API key successfully loaded from secrets
Gemini text correction enabled
مرحبا فيروز، كيفك؟ جاهزة للمقابلة اليوم؟


In [None]:
# Transcribe a second sample recording, reusing the stt_model instance built
# in the earlier test cell (corrected text when correction succeeds, otherwise
# the raw Whisper transcription).
transcription = stt_model.transcribe("/content/arabic_speech2.wav")
print(transcription)

أنا ماغي بو غصن، ممثلة لبنانية. عندي ولدان: ريان ويارا. بيحبوني كتير، بتمنى شوفهم عم يكبروا قدامي وعيش معهم لبقية حياتي.
