# Text to Speech experiments

In [None]:
# Install dependencies
!pip install --upgrade --user openai
!pip install --upgrade --user azure-cognitiveservices-speech
!pip install --upgrade --user google-cloud-texttospeech

In [25]:
# Load environment variables
# load_dotenv() (python-dotenv) reads key/value pairs from a .env file into the
# process environment. The Azure cell below reads SPEECH_KEY and SPEECH_REGION
# from os.environ.
# NOTE(review): the OpenAI and Google clients are constructed without explicit
# credentials, so they presumably rely on environment-based configuration too — confirm.
from dotenv import load_dotenv
load_dotenv()

True

In [22]:
# Define texts
# The same short dialogue is fed to every TTS engine below so the outputs can be
# compared. It mixes narration with quoted speech that is shouted, replied,
# said in fear and whispered.

# English version of the passage (plain text, no markup).
text_en = """
"We are friends now, I don't understand why you don't discuss your plans!", I shouted. 
John replied, "I don't want to talk about it, I'm not sure what I'm going to do yet." 
I became scared and said, "John, you're scaring me, please tell me what's going on!"
John whispered, "I'm sorry, I can't tell you, they are watching us."
"""

# Czech version of the passage (plain text, no markup).
# NOTE(review): the third line's quotation has no terminal punctuation (the
# English counterpart ends with "!"), and the last line closes with a
# typographic quote (U+201D) while every other quote is a straight one.
# Punctuation may affect synthesized prosody — confirm whether this is intended.
text_cz = """
"Jsme teď přátelé, nechápu, proč nemluvíš o svých plánech!" zakřičela jsem. 
John odpověděl: "Nechci o tom mluvit, ještě nevím, co udělám." 
Začala jsem se bát a řekla: "Johne, děsíš mě, prosím, řekni mi, co se děje"
John zašeptal: "Promiň, nemohu ti to říct, oni nás sledují.”
"""

# English passage as an SSML document for Azure. It selects the voice
# "en-US-GuyNeural" and wraps each fragment in <mstts:express-as> with a style
# (shouting, angry, terrified, whispering, or Default for narration); the
# "John whispered," narration is additionally slowed with <prosody rate="-20.00%">.
# NOTE(review): the leading <!--ID=...--> XML comments look like metadata emitted
# by an authoring/export tool — confirm whether they are needed for synthesis.
text_en_azure_ssml = """
<!--ID=B7267351-473F-409D-9765-754A8EBCDE05;Version=1|{"VoiceNameToIdMapItems":[{"Id":"e0638b39-fbd2-4497-a482-e2f65759412a","Name":"Microsoft Server Speech Text to Speech Voice (en-US, GuyNeural)","ShortName":"en-US-GuyNeural","Locale":"en-US","VoiceType":"StandardVoice"}]}-->
<!--ID=FCB40C2B-1F9F-4C26-B1A1-CF8E67BE07D1;Version=1|{"Files":{}}-->
<!--ID=5B95B1CC-2C7B-494F-B746-CF22A0E779B7;Version=1|{"Locales":{"en-US":{"AutoApplyCustomLexiconFiles":[{}]}}}-->
<speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US"><voice name="en-US-GuyNeural"><s /><mstts:express-as style="shouting">"We are friends now, I don't understand why you don't discuss your plans!"</mstts:express-as><mstts:express-as style="Default">, I shouted. </mstts:express-as><s />
<s /><mstts:express-as style="Default">John replied, </mstts:express-as><s /><mstts:express-as style="angry">"I don't want to talk about it, I'm not sure what I'm going to do yet." </mstts:express-as><s />
<s /><mstts:express-as style="Default">I became scared and said, </mstts:express-as><s /><mstts:express-as style="terrified">"John, you're scaring me, please tell me what's going on!"</mstts:express-as><s />
<s /><mstts:express-as style="Default"><prosody rate="-20.00%">John whispered,</prosody></mstts:express-as><mstts:express-as style="whispering" styledegree="1.2">"I'm sorry, I can't tell you, they are watching us."</mstts:express-as><s /></voice></speak>
"""

### OpenAI TTS

In [11]:
from pathlib import Path
from openai import OpenAI

# No credentials are passed explicitly; the client is expected to pick them up
# from the environment populated by load_dotenv() in an earlier cell.
client = OpenAI()


def synthesize_openai(client, text, output_path, model="tts-1", voice="alloy"):
    """Synthesize `text` with the OpenAI speech endpoint and save it as audio.

    Args:
        client: An `OpenAI` client instance.
        text: The text to be spoken.
        output_path: Destination file for the generated audio.
        model: TTS model name.
        voice: Voice preset name.
    """
    # Use the streaming-response context manager: calling stream_to_file() on a
    # plain (non-streaming) response is deprecated in the openai library because
    # it buffers the whole body instead of streaming it to disk.
    with client.audio.speech.with_streaming_response.create(
        model=model,
        voice=voice,
        input=text,
    ) as response:
        response.stream_to_file(output_path)


# English
synthesize_openai(client, text_en, Path("openai_en.mp3"))

# Czech
synthesize_openai(client, text_cz, Path("openai_cz.mp3"))

### Azure TTS

In [None]:
import os
import azure.cognitiveservices.speech as speechsdk


def synthesize_azure(speech_config, content, filename, ssml=False):
    """Synthesize `content` with Azure Speech into `filename` and verify success.

    Args:
        speech_config: A configured `speechsdk.SpeechConfig` (key, region,
            output format and, for plain text, the voice name).
        content: Plain text, or an SSML document when `ssml` is True.
        filename: Path of the audio file to write.
        ssml: When True, `content` is passed to `speak_ssml_async` instead of
            `speak_text_async`.

    Returns:
        The synthesis result object returned by the SDK.

    Raises:
        RuntimeError: If the result's reason is not SynthesizingAudioCompleted.
    """
    file_config = speechsdk.audio.AudioOutputConfig(filename=filename)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=file_config)
    speak_async = synthesizer.speak_ssml_async if ssml else synthesizer.speak_text_async
    result = speak_async(content).get()

    # The SDK reports failures through the result object rather than raising,
    # so an unchecked result would hide a bad key, region or SSML document.
    if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
        message = f"Azure synthesis of {filename!r} failed: {result.reason}"
        if result.reason == speechsdk.ResultReason.Canceled:
            details = result.cancellation_details
            message += f" ({details.reason}: {details.error_details})"
        raise RuntimeError(message)
    return result


# Index os.environ directly so a missing variable fails immediately with a
# KeyError naming it, instead of passing None into SpeechConfig.
speech_config = speechsdk.SpeechConfig(subscription=os.environ['SPEECH_KEY'], region=os.environ['SPEECH_REGION'])
speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3)

# English
speech_config.speech_synthesis_voice_name = "en-US-GuyNeural"
synthesize_azure(speech_config, text_en, "azure_en.mp3")

# English with SSML (the SSML document names its own voice)
synthesize_azure(speech_config, text_en_azure_ssml, "azure_en_ssml.mp3", ssml=True)

# Czech
speech_config.speech_synthesis_voice_name = "cs-CZ-AntoninNeural"
synthesize_azure(speech_config, text_cz, "azure_cz.mp3")

### Google TTS

In [None]:
from google.cloud import texttospeech

client = texttospeech.TextToSpeechClient()
audio_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)


def synthesize_google(client, text, language_code, voice_name, filename, audio_config=audio_config):
    """Synthesize `text` with Google Cloud Text-to-Speech and save the audio.

    Args:
        client: A `texttospeech.TextToSpeechClient` instance.
        text: The text to be spoken.
        language_code: Language code for the voice selection (e.g. "en-US").
        voice_name: Name of the voice to use.
        filename: Path of the audio file to write.
        audio_config: `texttospeech.AudioConfig` describing the output encoding;
            defaults to the MP3 config defined above.

    Returns:
        The number of bytes written to `filename`.
    """
    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice = texttospeech.VoiceSelectionParams(language_code=language_code, name=voice_name)
    response = client.synthesize_speech(input=synthesis_input, voice=voice, audio_config=audio_config)

    # Context manager guarantees the file is flushed and closed, rather than
    # relying on garbage collection of an anonymous file object.
    with open(filename, 'wb') as audio_file:
        return audio_file.write(response.audio_content)


# English
synthesize_google(client, text_en, "en-US", "en-US-Studio-M", 'google_en.mp3')

# Czech
synthesize_google(client, text_cz, "cs-CZ", "cs-CZ-Wavenet-A", 'google_cz.mp3')