In [1]:
!pip install google-cloud-texttospeech



In [2]:
from google.cloud import texttospeech

# Text-to-Speech: Convert text to audio file
def synthesize_text(text, output_file="output.mp3", language_code="en-US"):
    """
    Convert plain text to MP3 speech and store the audio on disk.

    Args:
        text: Plain text to be spoken.
        output_file: Destination path for the MP3 bytes; the file is opened
            in "wb" mode, so an existing file is overwritten.
        language_code: Language tag used for voice selection (default: en-US).
    """
    tts_client = texttospeech.TextToSpeechClient()

    # Assemble the three parts of the request: what to say, which voice to
    # use, and which audio encoding to produce.
    request_kwargs = {
        "input": texttospeech.SynthesisInput(text=text),
        "voice": texttospeech.VoiceSelectionParams(
            language_code=language_code,
            ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
        ),
        "audio_config": texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
        ),
    }

    # Ask the service for the synthesized audio.
    tts_response = tts_client.synthesize_speech(**request_kwargs)

    # Persist the binary audio payload and report where it went.
    with open(output_file, "wb") as audio_file:
        audio_file.write(tts_response.audio_content)
        print(f'Audio content written to file "{output_file}"')

# Text-to-Speech: Advanced customization
def synthesize_with_custom_voice(
    text,
    output_file="output.mp3",
    *,
    language_code="en-US",
    voice_name="en-US-Wavenet-F",
    ssml_gender=None,
    speaking_rate=0.9,
    pitch=1.5,
    volume_gain_db=2.0,
    sample_rate_hertz=24000,
):
    """
    Synthesizes speech with customized voice settings.

    Every voice/audio setting is a keyword-only parameter whose default
    reproduces the configuration this function previously hard-coded, so
    existing calls ``synthesize_with_custom_voice(text, output_file)`` behave
    as before.

    Args:
        text: The text to synthesize
        output_file: Path to the output audio file (overwritten if it exists)
        language_code: Language of the voice (default: en-US)
        voice_name: Specific voice model to request (default: en-US-Wavenet-F)
        ssml_gender: A ``texttospeech.SsmlVoiceGender`` value. ``None``
            (default) selects FEMALE, which pairs with the default voice.
            NOTE(review): when overriding ``voice_name``, presumably the
            gender should match that voice -- confirm against the API docs.
        speaking_rate: Speed multiplier; 1.0 is normal speed. The original
            code noted a 0.25 to 4.0 range -- confirm against current docs.
        pitch: Pitch adjustment; 0.0 is normal pitch (original code noted
            -20.0 to 20.0).
        volume_gain_db: Gain in dB; 0.0 is normal volume (original code noted
            -96.0 to 16.0).
        sample_rate_hertz: Sample rate requested for the synthesized audio
            (default: 24000).
    """
    # Resolve the gender default here rather than in the signature so that
    # defining the function does not require the client library to be loaded.
    if ssml_gender is None:
        ssml_gender = texttospeech.SsmlVoiceGender.FEMALE

    # Instantiate a client
    client = texttospeech.TextToSpeechClient()

    # Set the text input
    synthesis_input = texttospeech.SynthesisInput(text=text)

    # Build the voice request with specific voice name
    voice = texttospeech.VoiceSelectionParams(
        language_code=language_code,
        name=voice_name,
        ssml_gender=ssml_gender,
    )

    # Select the audio file type with enhanced settings
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
        speaking_rate=speaking_rate,
        pitch=pitch,
        volume_gain_db=volume_gain_db,
        sample_rate_hertz=sample_rate_hertz,
    )

    # Perform the text-to-speech request
    response = client.synthesize_speech(
        input=synthesis_input, voice=voice, audio_config=audio_config
    )

    # Write the response to the output file
    with open(output_file, "wb") as out:
        out.write(response.audio_content)

    # Report success only after the file has been closed (and thus flushed).
    print(f'Audio content written to file "{output_file}"')

In [4]:
# Smoke test: synthesize a short English sentence and save it as hello.mp3.
synthesize_text(
    "Hello, this is a test of Google Cloud Text-to-Speech API!",
    output_file="hello.mp3",
)

Audio content written to file "hello.mp3"
