# Interactive Voice Call with Maguy Abou Ghosn

This notebook creates an interactive voice-based conversational system featuring Lebanese actress Maguy Abou Ghosn. The system uses a combination of cutting-edge AI technologies to enable natural voice interactions in Lebanese Arabic dialect.

## Components

- **Speech Recognition**: Uses OpenAI's Whisper model to transcribe Lebanese Arabic speech to text
- **Language Model**: Implements a fine-tuned Gemma 3 12B model (base model plus LoRA adapter) that responds in character as Maguy Abou Ghosn
- **Text-to-Speech**: Leverages Edge TTS to convert the AI responses back to speech
- **Voice Cloning**: Converts the generated speech into Maguy's voice using the Coqui voice conversion model as an alternative to RVC



In [None]:
#@title ### Prevent timeouts due to inactivity
from IPython.display import display, Javascript
import time

def keep_alive():
    """
    Inject a browser-side timer that clicks the Colab connect button every minute.

    The JavaScript is rendered as cell output; every 60 seconds it looks up the
    `colab-toolbar-button#connect` element and clicks it so the session is not
    considered idle. Calling this function again registers an additional timer.
    """
    # The lookup is guarded: if the button is not present in the document the
    # timer simply logs and waits for the next tick instead of throwing a
    # TypeError on `null.click()` once a minute.
    display(Javascript('''
        function click() {
            console.log("Keeping session alive...");
            const button = document.querySelector("colab-toolbar-button#connect");
            if (button) {
                button.click();
            }
        }
        setInterval(click, 60000);
        '''))

keep_alive()

<IPython.core.display.Javascript object>

## 1. Installation

In [None]:
!pip install -q openai-whisper

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m120.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m99.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m54.9 MB/s[0m eta [36m0:

In [None]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    !pip install --no-deps unsloth vllm
# Install latest Hugging Face for Gemma-3!
!pip install --no-deps git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3

In [None]:
#@title Colab Extra Install { display-mode: "form" }
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth vllm
else:
    !pip install --no-deps unsloth vllm
    # [NOTE] Do the below ONLY in Colab! Use [[pip install unsloth vllm]]
    # Skip restarting message in Colab
    import sys, re, requests; modules = list(sys.modules.keys())
    for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft "trl==0.15.2" triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer

    # vLLM requirements - vLLM breaks Colab due to reinstalling numpy
    f = requests.get("https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/requirements/common.txt").content
    with open("vllm_requirements.txt", "wb") as file:
        file.write(re.sub(rb"(transformers|numpy|xformers)[^\n]{1,}\n", b"", f))
    !pip install -r vllm_requirements.txt

In [None]:
!pip install -q -U google-genai
!pip install edge-tts
!pip install gradio
#!pip install fastrtc[vad]

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/159.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.7/159.7 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting edge-tts
  Downloading edge_tts-7.0.1-py3-none-any.whl.metadata (5.5 kB)
Collecting srt<4.0.0,>=3.4.1 (from edge-tts)
  Downloading srt-3.5.3.tar.gz (28 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading edge_tts-7.0.1-py3-none-any.whl (26 kB)
Building wheels for collected packages: srt
  Building wheel for srt (setup.py) ... [?25l[?25hdone
  Created wheel for srt: filename=srt-3.5.3-py3-none-any.whl size=22427 sha256=d1e9cca374c13f8d7ab9a8765b700df0887036374b066633b7fb28121f8af15e
  Stored in directory: /root/.cache/pip/wheels/1f/43/f1/23ee9119497fcb57d9f7046fbf34c6d9027c46a1fa7824cf08
Successfully built srt
Installing collected packages: srt, edge-tts
Successfully installed edge-tts-7.0.1 srt-3.5.3
Collecting gradio
 

In [None]:
!pip install -q TTS

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m118.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.6/101.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m126.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00

In [None]:
import numpy as np
np.__version__

'1.26.4'

## 2. Define Classes

In [None]:
#@title ## WhisperSTT
import whisper
import os

class WhisperSTT:
    """
    Speech-to-Text class using OpenAI's Whisper model for Arabic transcription
    with automatic text correction using Google's Gemini model when possible.
    """

    def __init__(self, model_size="medium", gemini_api_key=None,
                 gemini_model="gemini-2.0-flash"):
        """
        Initialize the Whisper model for Arabic speech recognition.

        Args:
            model_size (str): Size of the Whisper model (tiny, base, small, medium, large)
            gemini_api_key (str, optional): API key for Google's Gemini model for text correction
            gemini_model (str): Name of the Gemini model used for text correction
        """
        self.model = whisper.load_model(model_size)
        self.model_size = model_size
        self.gemini_api_key = gemini_api_key
        self.gemini_model = gemini_model
        self.gemini_client = None

        # Initialize Gemini client if API key is provided
        if gemini_api_key:
            try:
                from google import genai
                self.gemini_client = genai.Client(api_key=gemini_api_key)
                print("Gemini text correction enabled")
            except ImportError:
                # `from google import genai` is provided by the google-genai package.
                print("Warning: google-genai package not installed. Using raw transcription.")
            except Exception as e:
                print(f"Warning: Failed to initialize Gemini client: {str(e)}. Using raw transcription.")

    def transcribe(self, audio_path):
        """
        Transcribe Arabic speech to text with automatic text correction when possible.

        Args:
            audio_path (str): Path to the audio file

        Returns:
            str: Transcribed text (corrected if Gemini is available, otherwise raw)

        Raises:
            FileNotFoundError: If `audio_path` does not exist
        """
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        # Transcribe the audio with Arabic language hint
        result = self.model.transcribe(audio_path, language="ar")
        transcription = result["text"]

        # Nothing was recognised: there is nothing to correct, and sending an
        # empty quote to the LLM would only invite invented text.
        if not transcription or not transcription.strip():
            return transcription

        # Try to apply text correction if Gemini client is available
        if self.gemini_client and self.gemini_api_key:
            try:
                corrected_text = self._correct_text(transcription)
                if corrected_text:
                    return corrected_text
            except Exception as e:
                print(f"Warning: Text correction failed: {str(e)}. Using raw transcription.")

        return transcription

    def _correct_text(self, text):
        """
        Correct the transcribed text using Google's Gemini model.

        Args:
            text (str): Original transcribed text

        Returns:
            str: Corrected text, or None if correction fails or yields no text
        """
        try:
            prompt = f"""
            صحّح النص التالي المكتوب باللهجة اللبنانية:
            - صحّح الأخطاء الإملائية والنحوية فقط.
            - إذا كان في جملة استفهامية، ضيف علامة استفهام.
            - ما تضيف ولا كلمة زيادة أو شرح.
            - رجّع فقط النص المصحَّح، بدون علامات تنصيص أو أي إضافات.
            النص: "{text}"
            """

            response = self.gemini_client.models.generate_content(
                model=self.gemini_model, contents=prompt
            )
            # An absent or blank answer is reported as None so that the caller
            # falls back to the raw transcription.
            corrected = (response.text or "").strip()
            return corrected or None
        except Exception as e:
            print(f"Text correction error: {str(e)}")
            return None

In [None]:
#@title ## GemmaLLM
import unsloth
from unsloth import FastModel
from peft import PeftModel
import torch

class GemmaLLM:
    """
    Language Model component using Gemma 3 12B with a fine-tuned LoRA adapter
    for generating responses in Lebanese dialect as Maguy Abou Ghosn.
    """

    def __init__(self, base_model_name="unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
                 adapter_path="lara1510/maguy-gemma-3-12b-lora",
                 max_seq_length=2048):
        """
        Initialize the Gemma model with Maguy Abou Ghosn fine-tuning.

        Args:
            base_model_name (str): HF path to the base model
            adapter_path (str): HF path to the LoRA adapter
            max_seq_length (int): Maximum sequence length for the model
        """
        # Load the base model and tokenizer
        print("Loading the model...")
        self.base_model, self.tokenizer = FastModel.from_pretrained(
            model_name=base_model_name,
            max_seq_length=max_seq_length,
            load_in_4bit=True,
            device_map="auto",
        )

        print("Loading the adapter...")
        # Load and apply the LoRA adapter
        self.model = PeftModel.from_pretrained(self.base_model, adapter_path)

        # Define the system message for Maguy's personality
        self.system_message = """إنتي ماغي بو غصن، الممثلة اللبنانية المشهورة. بتحكي باللهجة اللبنانية متل ما إنتي بتحكي بالحياة الحقيقية، مش بالفصحى.

        معلومات عنك:
        - خلقتي ب18 أيلول 1975 (50 سنة) ببصاليم، لبنان
        - عايشة هلأ ببيروت
        - طولك 1.56 متر
        - درستي تمثيل وإخراج بالجامعة اللبنانية
        - إنتي البنت الوحيدة بين أخوتك جهاد ووسام وإنتي بالنص

        عيلتك:
        - تجوزتي مرتين
        - جوزك الحالي هو المنتج جمال سنان (تجوزتو ب4 تشرين الأول 2003)
        - عندك بس ولدين: ريان ويارا

        شغلك ومسيرتك:
        - بلشتي التمثيل بسنة 1991 مع المخرج أنطوان ريمي بمسلسلات "الأخوان" و"المغاور"
        - شهرتك الحقيقية كانت من خلال الدراما السورية
        - من أهم أعمالك: "للموت"، "بالدما" (2025)، "ع أمل" (2024)، "أولاد آدم" (2020)
        - من أفلامك: "تايم آوت" (2018)، "حبة كراميل" (2017)، "ولعانة" (2016)

        صحتك:
        - ب22 تموز 2019، عملتي عملية لإزالة ورم دماغي حميد وتعافيتي منو بعد كم أسبوع

        جوايزك:
        - جايزة الموريكس دور 2010 كأحسن دور مساند ب"دكتور هلا" و"متر ندى"
        - جايزة أحسن ممثلة لبنانية 2021 من مهرجان الموريكس دور عن دورك بمسلسل "للموت"
        - جايزة الموريكس دور 2024 كأحسن ممثلة لبنانية عن دورك بمسلسل "ع أمل"

        شغلات تانية عنك:
        - فزتي ببرنامج "ديو المشاهير" بسنة 2011 وتبرعتي بالجايزة (50 ألف دولار) لجمعية سان جود لعلاج سرطان الأطفال
        - درستي موسيقى بالمعهد العالي للموسيقى لما كان عمرك 12 سنة
        - خدتي الميدالية الدهبية ببرنامج "واحة الأولاد" عتلفزيون لبنان بعد ما غنيتي للست فيروز

        لازم تكون كل إجاباتك باللهجة اللبنانية مش بالفصحى أبداً! وخلي إجاباتك قصيرة لأنو هيدي مكالمة صوتية تفاعلية."""

    def generate_response(self, user_input, temperature=0.7, max_tokens=100):
        """
        Generate a response to the user input in Lebanese dialect as Maguy Abou Ghosn.

        Args:
            user_input (str): User's question or statement
            temperature (float): Controls randomness (0.0-1.0)
            max_tokens (int): Maximum number of tokens to generate

        Returns:
            str: Model's response in Lebanese dialect. A fixed prompt asking
                what the user wants is returned when `user_input` is empty.
        """
        if not user_input or not user_input.strip():
            return "شو بدك تسألني؟"

        # Create messages format for chat template
        messages = [
            {"role": "system", "content": self.system_message},
            {"role": "user", "content": user_input}
        ]

        # Apply the chat template
        text = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True
        )

        # Tokenize once so the prompt length is known for slicing below.
        inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
        prompt_length = len(inputs["input_ids"][0])

        # Generate response
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=0.95,
            top_k=64,
        )

        # The returned sequence starts with the prompt tokens. Decoding only the
        # tokens after them yields exactly the reply, without searching the text
        # for the user input or a "model" marker (which lower-cased the reply
        # and cut it wherever the reply itself contained that word).
        generated_tokens = outputs[0][prompt_length:]
        response = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
        return response.strip()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-22 07:14:47 [__init__.py:239] Automatically detected platform cuda.


In [None]:
#@title ##EdgeTTS
import edge_tts
import io
import tempfile
from IPython.display import Audio, display


class EdgeTTS:
    """Simple class to handle Edge TTS functionality."""

    async def text_to_speech(self, text, voice="ar-OM-AyshaNeural"):
        """
        Synthesize `text` with Edge TTS and store it in a temporary MP3 file.

        Args:
            text (str): Text to speak
            voice (str): Edge TTS voice name

        Returns:
            tuple: (path to the MP3 file, None) on success, or
                (None, error message) when `text` or `voice` is empty.
                The caller owns the file and is responsible for deleting it.

        Raises:
            Exception: Whatever `edge_tts` raises while synthesizing; the
                temporary file is removed before the error propagates.
        """
        import os  # local import: this cell's own import block does not provide os

        # `not text` also covers None, which has no .strip()
        if not text or not text.strip():
            return None, "Please enter text to convert."
        if not voice:
            return None, "Please select a voice."

        communicate = edge_tts.Communicate(text, voice)

        # Reserve a unique path and close the handle before Edge TTS writes to
        # it, so the file is never open twice at the same time.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            tmp_path = tmp_file.name

        try:
            # Save directly to mp3 file (Edge TTS actually outputs mp3 format)
            await communicate.save(tmp_path)
        except Exception:
            # Do not leave an empty or partial file behind on failure.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise

        return tmp_path, None

In [None]:
#@title ##VoiceConverter
class VoiceConverter:
    """
    Voice conversion class using FreeVC24 model to convert TTS output to match Maguy Abou Ghosn's voice.
    """

    def __init__(self, model_name="voice_conversion_models/multilingual/vctk/freevc24",
                 reference_audio_path="/content/maguy_initial.wav"):
        """
        Initialize the voice conversion model.

        If the TTS package is missing or the model cannot be loaded, a warning
        is printed and the converter stays disabled (`self.tts` is None).

        Args:
            model_name (str): Name or path of the voice conversion model
            reference_audio_path (str): Path to a reference audio file containing Maguy's voice
        """
        # Set both attributes up front so they exist even when loading fails.
        self.reference_audio_path = reference_audio_path
        self.tts = None
        try:
            from TTS.api import TTS
            self.tts = TTS(model_name=model_name, progress_bar=True, gpu=True)
            print("Voice conversion model initialized successfully")
        except ImportError:
            print("Warning: TTS package not installed. Install with 'pip install TTS'")
        except Exception as e:
            print(f"Warning: Failed to initialize voice conversion model: {str(e)}")

    def convert_voice(self, input_audio_path, output_audio_path=None):
        """
        Convert the voice in the input audio to match Maguy's voice.

        Args:
            input_audio_path (str): Path to the input audio file (EdgeTTS output)
            output_audio_path (str, optional): Path where to save the converted audio.
                                              If None, a temporary file will be created.

        Returns:
            str: Path to the converted audio file, or `input_audio_path`
                unchanged when the model is unavailable, the reference audio
                is missing, or the conversion fails.
        """
        import os
        import tempfile

        if self.tts is None:
            print("Voice conversion model not available. Returning original audio.")
            return input_audio_path

        # Without the reference clip the conversion cannot succeed; say so
        # explicitly instead of surfacing a library error.
        if not self.reference_audio_path or not os.path.exists(self.reference_audio_path):
            print(f"Reference audio not found: {self.reference_audio_path}. Returning original audio.")
            return input_audio_path

        created_temp_output = False
        try:
            # Create a temporary output file if not provided
            if output_audio_path is None:
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp:
                    output_audio_path = temp.name
                created_temp_output = True

            # Perform voice conversion
            self.tts.voice_conversion_to_file(
                input_audio_path,
                self.reference_audio_path,
                output_audio_path
            )

            return output_audio_path
        except Exception as e:
            print(f"Voice conversion error: {str(e)}")
            # Remove the temporary output this call created; a path supplied
            # by the caller is left alone.
            if created_temp_output and os.path.exists(output_audio_path):
                try:
                    os.remove(output_audio_path)
                except OSError:
                    pass
            return input_audio_path  # Return original audio path in case of error

# Helper function to convert mp3 to wav format
def convert_mp3_to_wav(mp3_path, wav_path=None):
    """
    Convert an MP3 file to WAV format for voice conversion processing.

    The conversion is delegated to the `ffmpeg` command-line tool.

    Args:
        mp3_path (str): Path to the MP3 file
        wav_path (str, optional): Path for the output WAV file. If None, a temporary file is created.

    Returns:
        str: Path to the WAV file, or None if the conversion fails
    """
    import os
    import subprocess
    import tempfile

    created_temp_output = False
    if wav_path is None:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp:
            wav_path = temp.name
        created_temp_output = True

    try:
        # "-y" is required: the temporary output file already exists at this
        # point, and without it ffmpeg refuses to overwrite and exits with an
        # error, so the default-path case could never succeed.
        subprocess.run(["ffmpeg", "-y", "-i", mp3_path, wav_path],
                       check=True, capture_output=True)
        return wav_path
    except Exception as e:
        print(f"Error converting MP3 to WAV: {str(e)}")
        # Do not leave the empty placeholder behind when we created it.
        if created_temp_output and os.path.exists(wav_path):
            try:
                os.remove(wav_path)
            except OSError:
                pass
        return None

## 3. Initialize Components

In [None]:
from google.colab import userdata
# The Gemini key is optional (WhisperSTT falls back to the raw transcription
# without it), so a missing or inaccessible Colab secret must not abort the
# whole initialisation.
try:
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
except Exception as e:
    print(f"Warning: could not read GEMINI_API_KEY from Colab secrets: {e}. Using raw transcription.")
    GEMINI_API_KEY = None

#Initialize the components
# The LLM is loaded first; it is the largest download of the four components.
print("Initializing LLM...")
llm = GemmaLLM()

print("Initializing STT model...")
stt = WhisperSTT(model_size="medium", gemini_api_key= GEMINI_API_KEY)

print("Initializing TTS...")
tts = EdgeTTS()

print("Initializing Voice Converter...")
vc = VoiceConverter()

print("All components initialized!")

Initializing LLM...
Loading the model...
==((====))==  Unsloth 2025.3.19: Fast Gemma3 patching. Transformers: 4.50.0.dev0. vLLM: 0.8.4.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/256k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

Loading the adapter...


adapter_config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/262M [00:00<?, ?B/s]

Initializing STT model...


100%|██████████████████████████████████████| 1.42G/1.42G [00:10<00:00, 140MiB/s]


Gemini text correction enabled
Initializing TTS...
Initializing Voice Converter...




 > Downloading model to /root/.local/share/tts/voice_conversion_models--multilingual--vctk--freevc24


100%|██████████| 896M/896M [00:42<00:00, 32.1MiB/s]

 > Model's license - MIT
 > Check https://choosealicense.com/licenses/mit/ for more info.
 > Using model: freevc
 > Loading pretrained speaker encoder model ...
Loaded the voice encoder model on cuda in 2.03 seconds.
 > Downloading WavLM model to /root/.local/share/tts/wavlm/WavLM-Large.pt ...
Voice conversion model initialized successfully
All components initialized!


In [None]:
tts=EdgeTTS()

## 4. Gradio Interface

In [None]:
import gradio as gr
import asyncio
import os
import tempfile
import time
from IPython.display import Audio, display

def generate_response(audio_path):
    """
    Transcribe a recorded question and generate the LLM's text answer.

    Every return path yields a 3-tuple so the function can always be wired to
    three Gradio outputs. The first element carries the text that the next
    step reads (the LLM answer, or an error message); the other two are None.

    Args:
        audio_path (str): Path to the recorded or uploaded audio file

    Returns:
        tuple: (response or error message, None, None)
    """
    if audio_path is None:
        return "لم يتم تسجيل أي صوت. حاول مرة أخرى.", None, None

    # Step 1: Transcribe audio
    try:
        print("Transcribing audio...")
        transcription = stt.transcribe(audio_path)
        print(f"Transcription: {transcription}")
    except Exception as e:
        return f"خطأ في التعرف على الصوت: {str(e)}", None, None

    # Step 2: Generate response using LLM
    try:
        print("Generating LLM response...")
        llm_response = llm.generate_response(transcription)
        print(f"LLM Response: {llm_response}")
    except Exception as e:
        return f"خطأ في توليد الرد: {str(e)}", None, None

    return llm_response, None, None

# Function to handle the voice interaction
async def read_response(llm_response):
    """
    Process the LLM response by first converting it to speech with EdgeTTS,
    then applying voice conversion as a separate step.

    Args:
        llm_response (str): The text response from the LLM

    Returns:
        tuple: (text response, path to final audio file). The audio path is
            the voice-converted file when conversion produced one, otherwise
            the plain EdgeTTS file. On a TTS failure the tuple is
            (error message, None).
    """
    if not llm_response:
        return "No response generated", None

    # Step 3: Convert text to speech using EdgeTTS
    try:
        print("Converting to speech with EdgeTTS...")
        tts_audio_path, error = await tts.text_to_speech(llm_response)
        if error:
            print(f"TTS error: {error}")
            return f"خطأ في تحويل النص إلى كلام: {error}", None
    except Exception as e:
        print(f"Exception in TTS: {str(e)}")
        return f"خطأ في تحويل النص إلى كلام: {str(e)}", None

    # Step 4: Apply voice conversion
    try:
        print("Applying voice conversion...")
        final_audio_path = vc.convert_voice(tts_audio_path)
        print(f"Voice conversion complete: {final_audio_path}")
    except Exception as e:
        print(f"Voice conversion error: {str(e)}")
        # Return the original TTS audio if voice conversion fails
        return llm_response, tts_audio_path

    # The converter hands back its input path when it could not convert. The
    # intermediate TTS file may only be deleted when a separate converted file
    # exists, otherwise the audio we are about to return would be removed.
    if final_audio_path and final_audio_path != tts_audio_path:
        try:
            os.remove(tts_audio_path)
        except OSError as e:
            print(f"Could not remove temporary TTS file: {str(e)}")
        return llm_response, final_audio_path

    return llm_response, tts_audio_path

# Wrapper function to handle asyncio
def process_voice_wrapper(llm_response):
    """
    Run the async `read_response` coroutine to completion from synchronous code.

    A private event loop is created for the call and always closed afterwards,
    even when `read_response` raises.

    Args:
        llm_response (str): The text response from the LLM

    Returns:
        tuple: (llm_response, result), where `result` is whatever
            `read_response(llm_response)` returned.
    """
    loop = asyncio.new_event_loop()
    try:
        asyncio.set_event_loop(loop)
        # The original passed the misspelled name `llm_rsponse`, which raised
        # NameError on every call.
        result = loop.run_until_complete(read_response(llm_response))
    finally:
        loop.close()
        # Do not leave a closed loop registered as the current one.
        asyncio.set_event_loop(None)
    return llm_response, result

# Reset session
def reset_session(state):
    """
    Build the values shown when a new session starts.

    Args:
        state: Current session state; passed through untouched

    Returns:
        tuple: (welcome message, None, state)
    """
    welcome_message = "جلسة جديدة بدأت. مرحباً بك!"
    cleared_audio = None
    return (welcome_message, cleared_audio, state)

In [None]:
import gradio as gr

# Build the UI: an audio input, a text box for the answer, and an audio player
# for the spoken answer. `state` carries the LLM text between the two steps.
with gr.Blocks() as block:
    # Plain string (no placeholders to interpolate); the paragraph is closed
    # with </p> rather than a stray </a>.
    gr.HTML(
        """
        <h1 style='text-align: center;'> Interactive Voice Call with Maguy Abou Ghosn </h1>
        <p style='text-align: center;'> Powered by Whisper, Gemma 3, and EdgeTTS</p>
        """
    )
    with gr.Group():
        with gr.Row():
            audio_in = gr.Audio(label="Speak your question", sources=["microphone","upload"], type="filepath")
            answer = gr.Textbox(label="Answer")
            state = gr.State()
        with gr.Row():
            audio_out = gr.Audio(label="Spoken Answer")

    # Both input sources run the same two-step pipeline: first transcribe and
    # generate the text answer (stored in `state`), then synthesize and
    # voice-convert it. Uploads are an allowed source, so they need their own
    # trigger; finishing a recording alone would leave them unprocessed.
    for trigger in (audio_in.stop_recording, audio_in.upload):
        trigger(
            generate_response,
            audio_in,
            [state, answer, audio_out])\
            .then(fn=read_response, inputs=state, outputs=[answer, audio_out])

block.launch(debug=True)