# 3. Generate Audio (HD Voice)

Reference: [Azure AI Speech high-definition (HD) voices](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/high-definition-voices)

In [27]:
# Load key/value pairs from a local .env file into the process environment.
# Presumably this is where SPEECH_KEY and SPEECH_REGION (read later via
# os.getenv) are kept; confirm against your .env file.
from dotenv import load_dotenv
load_dotenv()

True

In [28]:
import json

# Read the sample conversation (a JSON document) as UTF-8 text, then parse it.
with open("KFE_paper_sample_conversation.json", encoding="utf-8") as fh:
    jsfile = fh.read()

conversation = json.loads(jsfile)

# Bare last expression: the notebook renders the parsed conversation.
conversation

[{'speaker': 'Host (Alice)',
  'text': "Hey everyone, welcome back to Science Unplugged! I'm Jane, your host."},
 {'speaker': 'Host (Alice)',
  'text': "Today, we're cracking open the world of fusion reactors—no lab coat required!"},
 {'speaker': 'Host (Alice)',
  'text': 'Joining me is Dr. Beom Seok Kim from the National Fusion Research Institute.'},
 {'speaker': 'Host (Alice)',
  'text': 'Dr. Kim, thanks for coming on. Ready for some tough questions?'},
 {'speaker': 'Guest',
  'text': 'Absolutely, Jane. Fire away—hopefully not literally!'},
 {'speaker': 'Host (Alice)',
  'text': "Ha! Okay, fusion is called 'clean' nuclear energy. But is it really waste-free?"},
 {'speaker': 'Guest',
  'text': 'Great question. Fusion avoids the nastiest waste, but it’s not totally spotless.'},
 {'speaker': 'Host (Alice)',
  'text': 'So, what actually gets radioactive in a fusion reactor?'},
 {'speaker': 'Guest',
  'text': 'Mainly the parts inside the reactor—like blankets and divertors. Neutrons hit t

## HD Options

The default value of the [`temperature` parameter](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/high-definition-voices#how-to-use-azure-ai-speech-hd-voices) is 1.0.

In [41]:
def generate_ssml(host_voice, guest_voice, temperature=1.0, turns=None):
    """Build an SSML document that voices each conversation turn.

    Args:
        host_voice: Voice name used for turns whose ``'speaker'`` is
            ``'Host (Alice)'``.
        guest_voice: Voice name used for every other turn.
        temperature: Value written into each ``<voice>`` element's
            ``parameters='temperature=...'`` attribute.
        turns: Optional iterable of dicts with ``'speaker'`` and ``'text'``
            keys. When omitted, the notebook-level ``conversation`` is used.

    Returns:
        The SSML document as a single string: the ``<speak>`` header, one
        ``<voice>`` element per turn, and the closing tag, each on its own
        line.
    """
    # Function-scope import keeps this cell runnable on its own.
    from xml.sax.saxutils import escape

    if turns is None:
        turns = conversation

    lines = [
        "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='en-US'>"
    ]
    for turn in turns:
        voice = host_voice if turn['speaker'] == 'Host (Alice)' else guest_voice
        # The transcript is free text: escape &, < and > so a stray character
        # cannot produce malformed SSML.
        text = escape(turn['text'])
        lines.append(
            f"<voice name='{voice}' parameters='temperature={temperature}'>{text}</voice>"
        )
    lines.append("</speak>")

    return "\n".join(lines)

In [42]:
import azure.cognitiveservices.speech as speechsdk
import os
# Azure Speech credentials are read from the environment; either value is None if unset.
speech_key = os.environ.get('SPEECH_KEY')
service_region = os.environ.get('SPEECH_REGION')

speech_config = speechsdk.SpeechConfig(
    subscription=speech_key,
    region=service_region,
)
# Set the SDK's log filename property to "logs".
speech_config.set_property(speechsdk.PropertyId.Speech_LogFilename, "logs")

In [43]:
import random
import string

def generate_random_filename(length=8):
    """Return a random string of `length` lowercase ASCII letters.

    Each character is drawn independently with ``random.choice``; no file
    extension or directory is added.
    """
    alphabet = string.ascii_lowercase
    picked = [random.choice(alphabet) for _ in range(length)]
    return ''.join(picked)

def generate_podcast(ssml, filename=None):
    """Synthesize `ssml` with the Azure Speech SDK and write the audio to a file.

    Args:
        ssml: SSML document passed to ``speak_ssml_async``.
        filename: Output path. When None, a randomly named ``.wav`` file in
            the current directory is used.

    Returns:
        None. The outcome (success, or cancellation details) is printed.
    """
    output_path = (
        filename
        if filename is not None
        else "./" + generate_random_filename() + ".wav"
    )

    # Send synthesized audio to the file rather than the default output.
    file_output = speechsdk.audio.AudioOutputConfig(filename=output_path)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=file_output)

    # .get() blocks until the asynchronous synthesis call has finished.
    result = synthesizer.speak_ssml_async(ssml).get()

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesis was successful. Audio was written to '{}'".format(output_path))
        return

    if result.reason != speechsdk.ResultReason.Canceled:
        # Any other result reason is ignored without output.
        return

    details = result.cancellation_details
    print("Speech synthesis canceled: {}".format(details.reason))
    if details.reason == speechsdk.CancellationReason.Error and details.error_details:
        print("Error details: {}".format(details.error_details))
    # This hint is printed for every cancellation, not only for errors.
    print("Did you update the subscription info?")

In [None]:
# Voice names passed to generate_ssml() as host_voice / guest_voice.
#
# Alternative pair - HD voice (optimized for podcasts); uncomment to use:
# host_voice = 'en-us-Ava3:DragonHDLatestNeural'
# guest_voice = 'en-us-Andrew3:DragonHDLatestNeural'

# Active pair - HD voice (optimized for conversational content) - works best for a 2-person podcast
host_voice = 'en-us-Emma2:DragonHDLatestNeural'
guest_voice = 'en-us-Andrew2:DragonHDLatestNeural'


In [47]:
# Sampling temperature written into every <voice> element (the function's default is 1.0).
temp = 0.9

# Build the SSML for the loaded conversation, then synthesize it to a WAV file.
ssml = generate_ssml(host_voice, guest_voice, temperature=temp)
generate_podcast(ssml, filename="KFE_paper_sample_audio_hd.wav")

Speech synthesis was successful. Audio was written to 'KFE_paper_sample_audio_hd.wav'
