# Setup

In [35]:
import os
import azure.cognitiveservices.speech as speech_sdk
from dotenv import load_dotenv

# Pull settings from a local .env file (if present) into the process environment.
load_dotenv()

# Each setting is None when the corresponding variable is not defined.
AI_SERVICE_ENDPOINT = os.environ.get("AI_SERVICE_ENDPOINT")
AI_SERVICE_KEY = os.environ.get("AI_SERVICE_KEY")
AI_SERVICE_REGION = os.environ.get("AI_SERVICE_REGION")

In [36]:
# Shared speech configuration, built from the key and region loaded in the setup cell.
speech_config = speech_sdk.SpeechConfig(subscription=AI_SERVICE_KEY, region=AI_SERVICE_REGION)

# Echo the configured region as a quick sanity check.
print('Ready to use speech service in:', speech_config.region)

Ready to use speech service in: eastus


# Usage

## Audio transcription (speech-to-text)

In [4]:
# Recognize a single utterance from a WAV file and print the transcript.
audio_config = speech_sdk.audio.AudioConfig(filename="data/time.wav")
speech_recognizer = speech_sdk.SpeechRecognizer(
    speech_config=speech_config,
    audio_config=audio_config,
)

# Wait for the one-shot recognition result.
speech = speech_recognizer.recognize_once_async().get()

if speech.reason != speech_sdk.ResultReason.RecognizedSpeech:
    # Anything other than a successful recognition: show why.
    print(speech.reason)
    if speech.reason == speech_sdk.ResultReason.Canceled:
        cancellation = speech.cancellation_details
        print(cancellation.reason)
        print(cancellation.error_details)
else:
    command = speech.text
    print(command)

What time is it?


## Speech synthesis (text-to-speech)

In [9]:
# Synthesize a fixed sentence with a named neural voice and write it to a WAV file.
output_file = "data/output.wav"

# NOTE: this sets the voice on the shared `speech_config` object created in the setup section.
speech_config.speech_synthesis_voice_name = "en-GB-RyanNeural"

# AudioOutputConfig is the SDK's output-side configuration for synthesis;
# AudioConfig(filename=...) describes an audio *input* file.
audio_config = speech_sdk.audio.AudioOutputConfig(filename=output_file)
speech_synthesizer = speech_sdk.SpeechSynthesizer(speech_config, audio_config)

speak = speech_synthesizer.speak_text_async("Hello, this audio is generated using the Azure AI Speech.").get()
if speak.reason != speech_sdk.ResultReason.SynthesizingAudioCompleted:
    print(speak.reason)
    # Mirror the transcription cell: surface the cancellation cause instead of only the reason code.
    if speak.reason == speech_sdk.ResultReason.Canceled:
        cancellation = speak.cancellation_details
        print(cancellation.reason)
        print(cancellation.error_details)
else:
    print("Spoken output saved in " + output_file)

Spoken output saved in data/output.wav


## Speech synthesis with Speech Synthesis Markup Language (SSML)

In [8]:
# SSML document: the <voice> element names the voice and <prosody> adjusts rate and pitch.
ssml = """<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>
  <voice name='en-GB-LibbyNeural'> 
    <prosody rate="medium" pitch="+1st">
      Hello, this audio is generated using the Azure AI Speech.
    </prosody>
  </voice>
</speak>"""

output_file = "data/output_ssml.wav"

# AudioOutputConfig is the SDK's output-side configuration for synthesis;
# AudioConfig(filename=...) describes an audio *input* file.
ssml_audio_config = speech_sdk.audio.AudioOutputConfig(filename=output_file)
ssml_speech_synthesizer = speech_sdk.SpeechSynthesizer(speech_config, ssml_audio_config)

speak = ssml_speech_synthesizer.speak_ssml_async(ssml).get()
if speak.reason != speech_sdk.ResultReason.SynthesizingAudioCompleted:
    print(speak.reason)
    # Surface the cancellation cause (e.g. malformed SSML, bad credentials) rather than only the reason code.
    if speak.reason == speech_sdk.ResultReason.Canceled:
        cancellation = speak.cancellation_details
        print(cancellation.reason)
        print(cancellation.error_details)
else:
    print("Spoken output saved in " + output_file)

Spoken output saved in data/output_ssml.wav


## Speech translation

In [16]:
# Translation configuration: English (US) speech in, three target languages out.
translation_config = speech_sdk.translation.SpeechTranslationConfig(
    subscription=AI_SERVICE_KEY, region=AI_SERVICE_REGION
)
translation_config.speech_recognition_language = 'en-US'

# Register the target languages in the same order as before: French, Vietnamese, Japanese.
for target_language in ('fr', 'vi', 'ja'):
    translation_config.add_target_language(target_language)

print('Ready to translate from', translation_config.speech_recognition_language)

Ready to translate from en-US


In [23]:
# Translate a single utterance from a WAV file into every configured target language.
audio_config_in = speech_sdk.audio.AudioConfig(filename="data/time.wav")
translator = speech_sdk.translation.TranslationRecognizer(
    translation_config=translation_config,
    audio_config=audio_config_in,
)
result = translator.recognize_once_async().get()

# Always rebind `translations` so a failed run cannot leave stale entries
# from an earlier execution for later cells to pick up.
translations = {}

if result.reason == speech_sdk.ResultReason.TranslatedSpeech:
    print('Translating "{}"'.format(result.text))
    print("Result:")
    for language, text in result.translations.items():
        print(f"\t{language}: {text}")
        translations[language] = text
    print()
    print(result)
else:
    # Same diagnostics as the transcription cell: show the reason and, when canceled, the cause.
    print(result.reason)
    if result.reason == speech_sdk.ResultReason.Canceled:
        cancellation = result.cancellation_details
        print(cancellation.reason)
        print(cancellation.error_details)

Translating "What time is it?"
Result:
	fr: Quelle heure est-il?
	vi: Mấy giờ rồi?
	ja: 何時ですか。

TranslationRecognitionResult(result_id=aa0074bf70564e9e9c0ebaea5a06e59c, translations={'fr': 'Quelle heure est-il?', 'vi': 'Mấy giờ rồi?', 'ja': '何時ですか。'}, reason=ResultReason.TranslatedSpeech)


## Translation synthesis 

In [None]:
# Speak each translated sentence to its own WAV file.
# Relies on `translations` (language code -> text) produced by the translation cell.
output_file = "data/output_{}.wav"

# Voice used for each target language code.
voices = {
    "fr": "fr-FR-HenriNeural",
    "vi": "en-GB-AdaMultilingualNeural",
    "ja": "en-GB-AdaMultilingualNeural",
}

for language, voice in voices.items():
    text = translations.get(language)
    if not text:
        # Nothing to speak (translation failed or that language was not requested).
        print("No translation available for '{}'; skipping synthesis.".format(language))
        continue

    target_file = output_file.format(language)
    # NOTE: this sets the voice on the shared `speech_config` object.
    speech_config.speech_synthesis_voice_name = voice
    # AudioOutputConfig is the SDK's output-side configuration for synthesis;
    # AudioConfig(filename=...) describes an audio *input* file.
    audio_config_out = speech_sdk.audio.AudioOutputConfig(filename=target_file)
    speech_synthesizer = speech_sdk.SpeechSynthesizer(speech_config, audio_config_out)

    speak = speech_synthesizer.speak_text_async(text).get()
    if speak.reason != speech_sdk.ResultReason.SynthesizingAudioCompleted:
        print(speak.reason)
        if speak.reason == speech_sdk.ResultReason.Canceled:
            cancellation = speak.cancellation_details
            print(cancellation.reason)
            print(cancellation.error_details)
    else:
        print("Spoken output saved in " + target_file)

Info: on_underlying_io_bytes_received: Close frame received
Info: on_underlying_io_bytes_received: closing underlying io.
Info: on_underlying_io_close_complete: uws_state: 6.
Spoken output saved in data/output_fr.wav
Info: on_underlying_io_bytes_received: Close frame received
Info: on_underlying_io_bytes_received: closing underlying io.
Info: on_underlying_io_close_complete: uws_state: 6.
Spoken output saved in data/output_vi.wav
Info: on_underlying_io_bytes_received: Close frame received
Info: on_underlying_io_bytes_received: closing underlying io.
Info: on_underlying_io_close_complete: uws_state: 6.
Spoken output saved in data/output_ja.wav


Info: on_underlying_io_bytes_received: Close frame received
Info: on_underlying_io_bytes_received: received close frame, sending a close response frame.
Info: on_underlying_io_close_sent: uws_client=0x1078126e0, io_send_result:0
Info: on_underlying_io_close_sent: closing underlying io.
Info: on_underlying_io_close_complete: uws_state: 6.
Info: on_underlying_io_bytes_received: Close frame received
Info: on_underlying_io_bytes_received: received close frame, sending a close response frame.
Info: on_underlying_io_close_sent: uws_client=0x150a4a020, io_send_result:0
Info: on_underlying_io_close_sent: closing underlying io.
Info: on_underlying_io_close_complete: uws_state: 6.


## Diarization

In [None]:
import time


def conversation_transcriber_recognition_canceled_cb(evt: speech_sdk.SessionEventArgs):
    """Announce a canceled event; the event payload `evt` is not inspected."""
    print("Canceled event")

def conversation_transcriber_session_stopped_cb(evt: speech_sdk.SessionEventArgs):
    """Announce a session-stopped event; the event payload `evt` is not inspected."""
    print("SessionStopped event")

def conversation_transcriber_transcribed_cb(evt: speech_sdk.SpeechRecognitionEventArgs):
    """Print a final transcription result.

    Recognized speech is printed as text plus speaker ID; a no-match result is
    printed with its no-match details. Any other reason prints only the header.
    """
    print('\nTRANSCRIBED:')
    outcome = evt.result
    reason = outcome.reason

    if reason == speech_sdk.ResultReason.RecognizedSpeech:
        print('\tText={}'.format(outcome.text))
        print('\tSpeaker ID={}\n'.format(outcome.speaker_id))
        return

    if reason == speech_sdk.ResultReason.NoMatch:
        print('\tNOMATCH: Speech could not be TRANSCRIBED: {}'.format(outcome.no_match_details))

def conversation_transcriber_transcribing_cb(evt: speech_sdk.SpeechRecognitionEventArgs):
    """Print the text and speaker ID carried by a `transcribing` event."""
    print('TRANSCRIBING:')
    partial = evt.result
    print('\tText={}'.format(partial.text))
    print('\tSpeaker ID={}'.format(partial.speaker_id))

def conversation_transcriber_session_started_cb(evt: speech_sdk.SessionEventArgs):
    """Announce a session-started event; the event payload `evt` is not inspected."""
    print("SessionStarted event")

def recognize_from_file(filename="data/katiesteve.wav", language="en-US"):
    """Transcribe an audio file with the conversation transcriber, printing each result.

    Builds a dedicated SpeechConfig from AI_SERVICE_KEY / AI_SERVICE_REGION, connects
    the module-level callbacks to the transcriber's events, starts transcription and
    blocks until a session-stopped or canceled event is received.

    Args:
        filename: Path of the audio file to transcribe. Defaults to the sample
            recording used by this notebook.
        language: Recognition language set on the speech configuration.
    """
    speech_config = speech_sdk.SpeechConfig(subscription=AI_SERVICE_KEY, region=AI_SERVICE_REGION)
    speech_config.speech_recognition_language = language
    speech_config.set_property(
        property_id=speech_sdk.PropertyId.SpeechServiceResponse_DiarizeIntermediateResults,
        value='true',
    )

    audio_config = speech_sdk.audio.AudioConfig(filename=filename)
    conversation_transcriber = speech_sdk.transcription.ConversationTranscriber(
        speech_config=speech_config,
        audio_config=audio_config,
    )

    # Flag flipped by stop_cb (invoked from an SDK event) and polled by the wait loop below.
    transcribing_stop = False

    def stop_cb(evt: speech_sdk.SessionEventArgs):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal transcribing_stop
        transcribing_stop = True

    # Connect callbacks to the events fired by the conversation transcriber
    conversation_transcriber.transcribed.connect(conversation_transcriber_transcribed_cb)
    # Intermediate results are not printed; uncomment to enable them:
    # conversation_transcriber.transcribing.connect(conversation_transcriber_transcribing_cb)
    conversation_transcriber.session_started.connect(conversation_transcriber_session_started_cb)
    conversation_transcriber.session_stopped.connect(conversation_transcriber_session_stopped_cb)
    conversation_transcriber.canceled.connect(conversation_transcriber_recognition_canceled_cb)
    # stop transcribing on either session stopped or canceled events
    conversation_transcriber.session_stopped.connect(stop_cb)
    conversation_transcriber.canceled.connect(stop_cb)

    # Wait on the returned future so a failure to start surfaces here
    # instead of being silently dropped.
    conversation_transcriber.start_transcribing_async().get()

    try:
        # Waits for completion.
        while not transcribing_stop:
            time.sleep(.5)
    finally:
        # Always request the stop (even if the wait is interrupted) and wait for it
        # to finish so the function does not return with the stop still in flight.
        conversation_transcriber.stop_transcribing_async().get()

In [44]:
# Run the diarization demo; report any exception as a message instead of a traceback.
try:
    recognize_from_file()
except Exception as exc:
    print("Encountered exception. {}".format(exc))

SessionStarted event

TRANSCRIBED:
	Text=Good morning, Steve. Good morning, Katie. Have you tried the latest real time diarization in Microsoft Speech Service, which can tell you who said what in real time?
	Speaker ID=Guest-1


TRANSCRIBED:
	Text=Not yet. I've been using the batch transcription with diarization functionality, but it produces diarization result until whole audio get processed. Is the new feature can diarise in real time?
	Speaker ID=Guest-1


TRANSCRIBED:
	Text=Absolutely.
	Speaker ID=Guest-1


TRANSCRIBED:
	Text=That's exciting. Let me try it right now.
	Speaker ID=Guest-2


TRANSCRIBED:
	Text=
	Speaker ID=Unknown


TRANSCRIBED:
	Text=
	Speaker ID=Unknown

Canceled event
CLOSING on ConversationTranscriptionCanceledEventArgs(session_id=b76ebe02a7614eeaa3c93594fe488daa, result=ConversationTranscriptionResult(result_id=037f685746a84cdf91ad336e424e120e, speaker_id=, text=, reason=ResultReason.Canceled))
SessionStopped event
CLOSING on SessionEventArgs(session_id=b76ebe02a