<a href="https://colab.research.google.com/github/vishnuy/AI-ML/blob/main/Python_Voice_Translator_openSrcTTSwith_Whisper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title Default title text

import json
import time
import base64
import requests # Used for making HTTP requests to the Google API endpoints
import os # For simulating file existence checks in a real scenario
import google.generativeai as genai # Import the Generative AI library
from google.colab import userdata # Used to securely store your API key


!pip install openai-whisper

# The Gemini API key is read at runtime from the Colab secrets manager (GOOGLE_API_KEY); see translate_text().
# Do not hardcode an API key here.
# API_KEY = "" # No longer needed for direct model usage

# ---  Whisper Speech-to-Text Function ---
def transcribe_audio_with_whisper_simulated(audio_filepath):
    """
    Transcribes an audio file using the Whisper ASR model.

    Despite the "_simulated" suffix kept in the name for existing callers,
    this loads the Whisper "base" model and runs it on the given input.

    Args:
        audio_filepath (str): Path to the audio file to be transcribed.

    Returns:
        str: The transcribed text, or None if no path was given, the file does
            not exist, Whisper is not installed, or transcription fails.
    """
    # Fail fast on a missing input so the model is not loaded for nothing.
    if audio_filepath is None:
        print("No audio file path provided for transcription.")
        return None

    # Only path-like inputs are checked on disk; any other input type is
    # handed to Whisper unchanged.
    if isinstance(audio_filepath, (str, os.PathLike)) and not os.path.isfile(audio_filepath):
        print(f"Audio file not found: '{audio_filepath}'")
        return None

    try:
        import whisper  # Assuming 'pip install openai-whisper' has been run
        model = whisper.load_model("base")  # Or "small", "medium", "large"
        result = model.transcribe(audio_filepath)
        transcribed_text = result["text"]
        print(f"Whisper Transcribed: '{transcribed_text}'")
        return transcribed_text
    except ImportError:
        print("Whisper library not found. Please install it ('pip install openai-whisper').")
        return None
    except Exception as e:
        # Best-effort boundary: report the failure and let the caller retry.
        print(f"Error during Whisper transcription: {e}")
        return None


# --- Translation Function (using direct model interaction) ---
def translate_text(LANG_MAP, text, source_lang_display, target_lang_display):
    """
    Translates text using the Gemini 2.5 Flash model directly.

    Args:
        LANG_MAP (dict): Maps a language display name to a settings dict whose
            "translate_code" entry is the language name placed in the prompt.
            If None or empty, a built-in default map is used.
        text (str): The text to be translated.
        source_lang_display (str): The display name of the source language (e.g., "English").
        target_lang_display (str): The display name of the target language (e.g., "Spanish").

    Returns:
        str: The translated text (surrounding whitespace removed), or None if
            the text is empty, a language is unknown, or translation fails.
    """
    # Fall back to a built-in map when the caller does not supply one.
    if not LANG_MAP:
        LANG_MAP = {
            "English": {"translate_code": "English", "tts_voice": "Kore", "tts_lang_code": "en-US"},
            "Spanish": {"translate_code": "Spanish", "tts_voice": "Puck", "tts_lang_code": "es-US"},
            "French": {"translate_code": "French", "tts_voice": "Charon", "tts_lang_code": "fr-FR"},
            "German": {"translate_code": "German", "tts_voice": "Fenrir", "tts_lang_code": "de-DE"},
            "Japanese": {"translate_code": "Japanese", "tts_voice": "Leda", "tts_lang_code": "ja-JP"},
            "Hindi": {"translate_code": "Hindi", "tts_voice": "hi-IN-Neural2-A", "tts_lang_code": "hi-IN"},
        }

    # Validate the inputs before configuring the API client, so that bad
    # input never costs a network round trip.
    if text is None or not str(text).strip():
        print("Translation skipped: no text to translate.")
        return None

    try:
        source_lang_code = LANG_MAP[source_lang_display]["translate_code"]
        target_lang_code = LANG_MAP[target_lang_display]["translate_code"]
    except KeyError as e:
        print(f"Error during translation: unsupported language {e}")
        return None

    try:
        # Initialize the Generative AI model
        # Ensure you have set your API key in the Colab secrets manager as GOOGLE_API_KEY
        GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
        genai.configure(api_key=GOOGLE_API_KEY)
        gemini_model = genai.GenerativeModel('gemini-2.5-flash-preview-05-20')

        # Construct the prompt for the LLM to perform translation
        prompt = f"Translate the following text from {source_lang_code} to {target_lang_code}: '{text}'"
        chat_history = [{"role": "user", "parts": [{"text": prompt}]}]

        print(f"\nTranslating from {source_lang_display} to {target_lang_display} using the model directly...")
        response = gemini_model.generate_content(chat_history)

        # Strip surrounding whitespace so a blank reply is treated as a failure.
        translated_text = response.text.strip() if response and response.text else ""
        if translated_text:
            print(f"Translated Text: {translated_text}")
            return translated_text

        print("Translation failed: Model did not return a valid response.")
        return None
    except Exception as e:
        # Best-effort boundary: any client or model error is reported and the
        # caller receives None.
        print(f"Error during translation: {e}")
        return None

!pip install tts

from TTS.api import TTS


# --- Text-to-Speech Function (using the open-source Coqui TTS library) ---
def synthesize_speech(text, target_lang_display, output_path="tts-output.wav"):
    """
    Synthesizes speech from text using the open-source TTS library.

    A model matching the target language is selected when one is configured
    below; otherwise the English model is used and a warning is printed.

    Args:
        text (str): The text to be converted to speech.
        target_lang_display (str): The display name of the target language (e.g., "Spanish").
        output_path (str): Where the synthesized WAV file is written.

    Returns:
        bytes: The content of the written audio file, or None if the text is
            empty or synthesis fails.
    """
    if text is None or not str(text).strip():
        print("Speech synthesis skipped: no text to synthesize.")
        return None

    print(f"\nSynthesizing speech for text: '{text}' in {target_lang_display} using open src Text-to-Speech...")

    # One single-language model per supported display name.
    # NOTE(review): some models may need extra language-specific dependencies
    # at load time — confirm they are installed in the target environment.
    default_model = "tts_models/en/ljspeech/tacotron2-DDC"
    model_by_language = {
        "English": default_model,
        "Spanish": "tts_models/es/mai/tacotron2-DDC",
        "French": "tts_models/fr/mai/tacotron2-DDC",
        "German": "tts_models/de/thorsten/tacotron2-DDC",
        "Japanese": "tts_models/ja/kokoro/tacotron2-DDC",
    }
    model_name = model_by_language.get(target_lang_display)
    if model_name is None:
        print(f"Warning: no TTS model configured for {target_lang_display}; using the English model.")
        model_name = default_model

    try:
        # Initialize the TTS engine once, then synthesize and save to file.
        tts = TTS(model_name=model_name)
        tts.tts_to_file(text=text, file_path=output_path)
        print(f"✅ Audio saved to: {output_path}")

        # Hand the audio back to the caller as promised by the return contract.
        with open(output_path, "rb") as audio_file:
            return audio_file.read()
    except Exception as e:
        # Best-effort boundary, matching the other pipeline steps: report the
        # failure and return None instead of raising.
        print(f"Error during speech synthesis: {e}")
        return None


!pip install sounddevice numpy scipy

!apt-get update
!apt-get install libportaudio2 -y

import sounddevice
import numpy as np
import scipy.io.wavfile

def record_audio(duration=5, filename="temp_audio.wav", samplerate=44100, channels=2):
    """
    Records audio from the microphone for a specified duration and saves it to a WAV file.

    Args:
        duration (int): The duration of the recording in seconds. Must be positive.
        filename (str): The name of the file to save the audio to.
        samplerate (int): The sample rate of the recording. Must be positive.
        channels (int): Number of input channels to record.

    Returns:
        str: The filename of the saved audio file, or None if the arguments
            are invalid or an error occurred.
    """
    # Reject unusable arguments before touching the audio device, so that an
    # empty or nonsensical recording is never written to disk.
    try:
        arguments_ok = duration > 0 and samplerate > 0
        frame_count = int(duration * samplerate) if arguments_ok else 0
    except (TypeError, ValueError, OverflowError):
        arguments_ok = False
        frame_count = 0

    if not arguments_ok or frame_count <= 0:
        print("Error during audio recording: duration and samplerate must be positive numbers.")
        return None

    print(f"Recording started for {duration} seconds...")
    try:
        # Record audio from the default input device
        audio_data = sounddevice.rec(frame_count, samplerate=samplerate, channels=channels, dtype='int16')
        sounddevice.wait()  # Wait until recording is finished
        print("Recording finished.")

        # Save the recorded audio to a WAV file
        scipy.io.wavfile.write(filename, samplerate, audio_data)
        print(f"Audio saved to {filename}")
        return filename
    except Exception as e:
        # Best-effort boundary: device and file errors are reported and the
        # caller receives None.
        print(f"Error during audio recording: {e}")
        return None

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.1/803.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/803.2 kB[0m [31m12.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-w

KeyboardInterrupt: 

In [None]:



def _prompt_language_choice(prompt_text, language_names):
    """Ask until the user enters a valid 1-based index; return the chosen language name."""
    while True:
        try:
            selected_index = int(input(prompt_text))
        except ValueError:
            print("Invalid input. Please enter a number.")
            continue

        if 1 <= selected_index <= len(language_names):
            return language_names[selected_index - 1]
        print("Invalid choice. Please enter a number within the range.")


def main():
    """
    Main function to run the console-based voice translator.

    Repeatedly asks for an audio source (live microphone recording or an audio
    file path), transcribes it, asks for the source and target languages, then
    translates the text and synthesizes speech. The loop ends when the user
    chooses to quit or when input is closed or interrupted.
    """
    # Define the available languages and their corresponding codes/voices for the APIs.
    # 'translate_code' is for the text generation model's understanding.
    # 'tts_voice' is a prebuilt voice name for the TTS model.
    # 'tts_lang_code' is the language code for the TTS model, sometimes used for context.
    LANG_MAP = {
        "English": {"translate_code": "English", "tts_voice": "Kore", "tts_lang_code": "en-US"},
        "Spanish": {"translate_code": "Spanish", "tts_voice": "Puck", "tts_lang_code": "es-US"},
        "French": {"translate_code": "French", "tts_voice": "Charon", "tts_lang_code": "fr-FR"},
        "German": {"translate_code": "German", "tts_voice": "Fenrir", "tts_lang_code": "de-DE"},
        "Japanese": {"translate_code": "Japanese", "tts_voice": "Leda", "tts_lang_code": "ja-JP"},
        "Hindi": {"translate_code": "Hindi", "tts_voice": "hi-IN-Neural2-A", "tts_lang_code": "hi-IN"},
    }
    language_names = list(LANG_MAP)

    # Used when the user presses Enter at the file-path prompt.
    default_audio_path = "/content/sample_data/KrishnaVoice.m4a"

    print("--- Python Voice Translator (Console) ---")
    print("Available languages:")
    for position, lang in enumerate(language_names, start=1):
        print(f"{position}. {lang}")

    while True:
        try:
            print("\n--- Input Audio Source ---")
            print("1. Record audio from microphone")
            print("2. Provide an audio file path")
            print("3. Quit")

            choice = input("Select an option (1-3): ").strip()

            if choice == '3' or choice.lower() == 'quit':
                break

            if choice == '1':
                # Record live audio
                duration_str = input("Enter recording duration in seconds (e.g., 5): ").strip()
                try:
                    duration = int(duration_str)
                except ValueError:
                    print("Invalid duration. Please enter a number.")
                    continue
                if duration <= 0:
                    print("Invalid duration. Please enter a positive number of seconds.")
                    continue

                audio_filepath = record_audio(duration=duration)
                if not audio_filepath:
                    print("Audio recording failed. Please try again.")
                    continue

            elif choice == '2':
                # Get audio file path from user, falling back to the sample file
                entered_path = input(
                    f"Enter path to audio file (press Enter for '{default_audio_path}'): "
                ).strip()
                audio_filepath = entered_path or default_audio_path
                # A missing file cannot be transcribed, so ask again instead
                # of continuing with a path that is known to be bad.
                if not os.path.exists(audio_filepath):
                    print(f"File '{audio_filepath}' not found. Please try again.")
                    continue

            else:
                print("Invalid option. Please choose 1, 2, or 3.")
                continue

            # Step 1: Transcribe audio with Whisper
            transcribed_text = transcribe_audio_with_whisper_simulated(audio_filepath)

            if not transcribed_text or not transcribed_text.strip():
                print("Skipping translation due to failed or empty transcription.")
                continue

            # Get source and target language choices from the user
            print("\n--- Translation Settings ---")
            source_lang_display = _prompt_language_choice(
                f"Select input language (1-{len(LANG_MAP)}) for translation context: ",
                language_names,
            )
            target_lang_display = _prompt_language_choice(
                f"Select output language (1-{len(LANG_MAP)}): ",
                language_names,
            )

            # Step 2: Perform translation
            if source_lang_display == target_lang_display:
                print("Source and target languages are the same. No translation needed.")
                translated_text = transcribed_text
            else:
                translated_text = translate_text(LANG_MAP, transcribed_text, source_lang_display, target_lang_display)

            # Step 3: If translation was successful, synthesize speech
            if translated_text:
                synthesize_speech(translated_text, target_lang_display)

        except (EOFError, KeyboardInterrupt):
            # No further input can arrive; retrying would loop forever.
            print("\nInput closed. Exiting.")
            break
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            print("Please try again.")


if __name__ == "__main__":
    main()

--- Python Voice Translator (Console) ---
Available languages:
1. English
2. Spanish
3. French
4. German
5. Japanese
6. Hindi

--- Input Audio Source ---
1. Record audio from microphone
2. Provide an audio file path
3. Quit
Select an option (1-3): 1


# Task
Modify the code in the selected cell to take live audio input from the microphone instead of an audio file path.