## Set up dependencies

In [36]:
import os
import time

from dotenv import load_dotenv
import openai
import azure.cognitiveservices.speech as speechsdk


## Set up environment variables

In [37]:
# Load variables from a local .env file into the process environment so the
# os.getenv(...) lookups in the following cells can read them.
load_dotenv()

True

In [38]:
# Configure the openai module from environment variables.
# Any variable that is not set is assigned as None.
openai.api_type = os.environ.get('OPENAI_API_TYPE')
openai.api_base = os.environ.get('OPENAI_API_BASE')
openai.api_version = os.environ.get('OPENAI_API_VERSION')
# The secret is read from the environment so it never appears in the notebook.
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [39]:
# Azure Speech credentials, read from the environment (None when unset).
SPEECH_API_KEY = os.environ.get('SPEECH_API_KEY')
SPEECH_API_REGION = os.environ.get('SPEECH_API_REGION')

# Presumably the completions model/deployment name; it is not used by the
# cells visible here — confirm against the rest of the notebook.
COMPLETIONS_MODEL = os.environ.get('COMPLETIONS_MODEL')

In [40]:
def recognize_speech_from_file(filename):
    """Transcribe an audio file using continuous speech recognition.

    Starts continuous recognition on ``filename`` and blocks (polling every
    0.5 s) until the recognizer fires ``session_stopped`` or ``canceled``.
    Progress is printed to stdout as events arrive.

    Args:
        filename: Path of the audio file passed to ``speechsdk.audio.AudioConfig``.

    Returns:
        list[str]: ``evt.result.text`` of every ``recognized`` event, in the
        order the events fired. A fresh list is returned on each call.
    """
    speech_config = speechsdk.SpeechConfig(subscription=SPEECH_API_KEY, region=SPEECH_API_REGION)
    audio_config = speechsdk.audio.AudioConfig(filename=filename)
    # Creates a speech recognizer using a file as audio input. No recognition
    # language is set on speech_config here, so the SDK default is used.
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    # State is kept in this call's closure rather than in module globals, so
    # the notebook namespace is not polluted and separate calls cannot
    # interfere with each other.
    done = False
    recognized_text_list = []

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        nonlocal done
        print('CLOSING on {}'.format(evt))
        done = True

    def recognize_cb(evt: speechsdk.SpeechRecognitionEventArgs):
        """callback that stores the text of each finalized recognition result"""
        recognized_text_list.append(evt.result.text)
        print('RECOGNIZED: {}'.format(evt.result.text))

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(recognize_cb)
    speech_recognizer.session_started.connect(lambda evt: print('STT SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('STT SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    try:
        while not done:
            time.sleep(.5)
    finally:
        # Always stop the recognizer, even if the wait is interrupted
        # (e.g. the kernel is interrupted while polling).
        speech_recognizer.stop_continuous_recognition()

    return recognized_text_list


In [41]:
import Levenshtein

def _word_edit_distance(ref_tokens, hyp_tokens):
    """Return the minimum number of word insertions, deletions and substitutions turning ref_tokens into hyp_tokens."""
    # Row-by-row dynamic programme: previous_row[j] is the distance between
    # the reference words consumed so far and the first j hypothesis words.
    previous_row = list(range(len(hyp_tokens) + 1))
    for i, ref_word in enumerate(ref_tokens, start=1):
        current_row = [i]
        for j, hyp_word in enumerate(hyp_tokens, start=1):
            substitution = previous_row[j - 1] + (ref_word != hyp_word)
            deletion = previous_row[j] + 1
            insertion = current_row[j - 1] + 1
            current_row.append(min(substitution, deletion, insertion))
        previous_row = current_row
    return previous_row[-1]


def calculate_wer(reference, hypothesis):
    """Compute the word error rate (WER) of ``hypothesis`` against ``reference``.

    WER = (substitutions + deletions + insertions) / number of reference words,
    where the edit operations are counted over whitespace-separated words.
    Words are compared exactly, so differences in case or attached punctuation
    count as errors.

    Args:
        reference: Ground-truth transcript.
        hypothesis: Transcript to evaluate.

    Returns:
        float: The word error rate; 0.0 for a perfect match, and above 1.0
        when the hypothesis contains many extra words.

    Raises:
        ValueError: If ``reference`` contains no words (WER is undefined).
    """
    # Tokenize the reference and hypothesis into words
    ref_tokens = reference.split()
    hyp_tokens = hypothesis.split()

    if not ref_tokens:
        raise ValueError("reference must contain at least one word to compute WER")

    # The distance is taken over word tokens, not over the characters of the
    # raw strings; a character-level distance divided by a word count is not WER.
    distance = _word_edit_distance(ref_tokens, hyp_tokens)

    return distance / len(ref_tokens)

def test_speech_to_text_conversion(audio_file_path, ground_truth):
    """Transcribe an audio file and score the transcript against a reference.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        ground_truth: Reference transcript to compare against.

    Returns:
        The value returned by ``calculate_wer`` for the reference and the
        space-joined transcript.
    """
    # Run recognition; the result is a list of recognized text segments.
    segments = recognize_speech_from_file(audio_file_path)
    print(segments)

    # Merge the segments into one string before scoring.
    joined_transcript = ' '.join(segments)
    print(joined_transcript)

    return calculate_wer(ground_truth, joined_transcript)


In [42]:
# Reference transcript that the recognizer output is scored against.
# The encoding is passed explicitly so the text is decoded identically on
# every platform instead of using the locale default
# (assumes the file is stored as UTF-8 — confirm).
with open('../data/ground-truth.txt', 'r', encoding='utf-8') as file:
    ground_truth = file.read()

audio_file_path = "../data/sample.wav"

# Transcribe the sample audio and report its word error rate as a percentage.
wer_result = test_speech_to_text_conversion(audio_file_path, ground_truth)
print(f"Word Error Rate (WER): {wer_result * 100:.2f}%")

STT SESSION STARTED: SessionEventArgs(session_id=247e995baedf4a0bb7f0ea28c0b53faf)
RECOGNIZING: SpeechRecognitionEventArgs(session_id=247e995baedf4a0bb7f0ea28c0b53faf, result=SpeechRecognitionResult(result_id=546691099a484498a8662c80c222167d, text="hello mrs steven", reason=ResultReason.RecognizingSpeech))
RECOGNIZING: SpeechRecognitionEventArgs(session_id=247e995baedf4a0bb7f0ea28c0b53faf, result=SpeechRecognitionResult(result_id=7f991ba634774fee812974d186e4cd7e, text="hello mrs stevens", reason=ResultReason.RecognizingSpeech))
RECOGNIZING: SpeechRecognitionEventArgs(session_id=247e995baedf4a0bb7f0ea28c0b53faf, result=SpeechRecognitionResult(result_id=92c7eb16c2be410e9223676b7e087cfc, text="hello mrs stevens my name is", reason=ResultReason.RecognizingSpeech))
RECOGNIZING: SpeechRecognitionEventArgs(session_id=247e995baedf4a0bb7f0ea28c0b53faf, result=SpeechRecognitionResult(result_id=40f5fbdcdf9449a9bce2a78423bc478d, text="hello mrs stevens my name is jane", reason=ResultReason.Recogni