In [None]:
%pip install azure-cognitiveservices-speech --quiet
%pip install openai==1.30.1 --quiet
%pip install sounddevice --quiet
%pip install azure-ai-formrecognizer azure-cognitiveservices-speech azure-identity --quiet
%pip install load_dotenv --quiet
%pip install prompt_toolkit --quiet
%pip install azure-storage-blob --quiet
%pip install matplotlib


In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv('.env')

# Access the environment variables
speech_key = os.getenv('SPEECH_KEY')
speech_region = os.getenv('SPEECH_REGION')
whisper_key = os.getenv('WHISPER_KEY')
whisper_endpoint = os.getenv('WHISPER_ENDPOINT')
whisper_deployment = os.getenv('WHISPER_DEPLOYMENT')
storage_endpoint = os.getenv('AZURE_STORAGE_ENDPOINT')
storage_sas = os.getenv('AZURE_STORAGE_SAS_TOKEN')


def _redact(secret):
    """Return a placeholder telling whether ``secret`` is set, without revealing it."""
    return '<set, hidden>' if secret else '<not set>'


# Confirm the configuration was picked up. Keys and the SAS token are
# credentials: cell output is saved with the notebook, so only report whether
# they are present instead of echoing their values.
print(f'Speech API Key: {_redact(speech_key)}')
print(f'Speech Region: {speech_region}')
print(f'Whisper API Key: {_redact(whisper_key)}')
print(f'Whisper Endpoint: {whisper_endpoint}')
print(f'Whisper Deployment: {whisper_deployment}')
print(f'Storage Endpoint: {storage_endpoint}')
print(f'Storage SAS Token: {_redact(storage_sas)}')

In [None]:
import azure.cognitiveservices.speech as speechsdk

def recognize_from_microphone(language="en-US"):
    """Transcribe one `recognize_once_async()` pass from the default microphone.

    Reads the module-level ``speech_key`` and ``speech_region`` values to
    authenticate against the Speech resource.

    Args:
        language: Locale assigned to ``speech_recognition_language`` on the
            speech configuration. Defaults to ``"en-US"``.

    Returns:
        The recognized text, or ``None`` when no speech was matched or the
        request was canceled. A diagnostic message is printed in those cases,
        so callers must be prepared to handle ``None``.
    """
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=speech_region)
    speech_config.speech_recognition_language = language

    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    print("Speak into your microphone.")
    # .get() waits for the asynchronous recognition to deliver its result.
    speech_recognition_result = speech_recognizer.recognize_once_async().get()

    if speech_recognition_result.reason == speechsdk.ResultReason.RecognizedSpeech:
        return speech_recognition_result.text

    if speech_recognition_result.reason == speechsdk.ResultReason.NoMatch:
        print("No speech could be recognized: {}".format(speech_recognition_result.no_match_details))
    elif speech_recognition_result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = speech_recognition_result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))
            print("Did you set the speech resource key and region values?")

    # Every non-success path ends here; make the "no transcript" result explicit.
    return None

# Run one recognition pass and print what came back. The function only returns
# text on the RecognizedSpeech branch, so this prints None otherwise.
text = recognize_from_microphone()
print(text)

In [None]:
from openai import AzureOpenAI

# Client for the Azure OpenAI resource; the key and endpoint come from the
# WHISPER_* values loaded earlier in the notebook.
client = AzureOpenAI(
    api_key=whisper_key,
    api_version="2024-02-01",
    azure_endpoint=whisper_endpoint,
)

audio_test_file = "./helloThere.m4a"

# Open the audio inside a `with` block so the file handle is closed once the
# request finishes, including when the request raises.
with open(audio_test_file, "rb") as audio_file:
    result = client.audio.transcriptions.create(
        file=audio_file,
        model=whisper_deployment,
    )

print(result)

In [None]:
'''
  For more samples please visit https://github.com/Azure-Samples/cognitive-services-speech-sdk 
'''

import azure.cognitiveservices.speech as speechsdk

# Module-level speech configuration, built from the key and region loaded
# earlier. text_to_speech() below reads this `speech_config` global.
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=speech_region)
# Note: the voice setting will not overwrite the voice element in input SSML.
speech_config.speech_synthesis_voice_name = "en-GB-RyanNeural"

# Sample sentence for the synthesis call at the end of this cell. This rebinds
# the notebook-level `text` name.
text = "Hi, this is Ryan"

def text_to_speech(text):
    """Synthesize ``text`` with the module-level ``speech_config`` and report the outcome.

    No audio output configuration is passed to the synthesizer, so the SDK's
    default output (the default speaker) is used. A one-line status is printed
    on completion; on cancellation the reason is printed, plus the error
    details when the cancellation was caused by an error.

    Args:
        text: The plain text to speak.

    Returns:
        None.
    """
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
    outcome = synthesizer.speak_text_async(text).get()

    # Happy path first: report success and leave.
    if outcome.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized for text [{}]".format(text))
        return

    # Any reason other than a cancellation is ignored silently.
    if outcome.reason != speechsdk.ResultReason.Canceled:
        return

    details = outcome.cancellation_details
    print("Speech synthesis canceled: {}".format(details.reason))
    if details.reason == speechsdk.CancellationReason.Error:
        print("Error details: {}".format(details.error_details))

# Synthesize the sentence currently held in the notebook-level `text` variable.
text_to_speech(text)



In [None]:
import os
from azure.storage.blob import BlobServiceClient


# Create the BlobServiceClient object using the account URL and SAS token
blob_service_client = BlobServiceClient(account_url=storage_endpoint, credential=storage_sas)
# Narrow the client to the "paintings" container and request its blob listing.
blob_container_client = blob_service_client.get_container_client("paintings")
blob_list = blob_container_client.list_blobs()

# Print the name of every blob the listing yields.
for blob in blob_list:
    print(blob.name)

In [None]:
from promptflow.core import Prompty
from IPython.display import Image

# Build the signed blob URL for the painting. Tolerate an endpoint without a
# trailing slash and a SAS token that already starts with "?", so the pieces
# always join into a well-formed URL.
image_path = storage_endpoint.rstrip("/") + "/paintings/D4_Painting.png?" + storage_sas.lstrip("?")
# Log only the part before the query string: the SAS token is a credential and
# must not end up in saved notebook output.
print("Painting: " + image_path.split("?", 1)[0])
# NOTE: the image element below still references the signed URL, so clear the
# cell outputs before sharing the notebook.
display(Image(url=image_path))
firstName = "John"

def get_oai_response(firstName=None, question="John", image_path=image_path, conversation_history=None):
    """Load ``basic.prompty`` and run it for one question about an image.

    Args:
        firstName: Passed to the prompty as ``firstName``.
        question: Passed to the prompty as ``question``.
            NOTE(review): the default "John" looks like a leftover
            placeholder; confirm before relying on it.
        image_path: Passed to the prompty as ``image``. The default is the
            value the module-level ``image_path`` had when this function was
            defined; later reassignments of that global do not change it.
        conversation_history: Passed to the prompty as
            ``conversation_history``. ``None`` (the default) means an empty
            history.

    Returns:
        Whatever the loaded prompty flow returns for these inputs.
    """
    # A `[]` default would be created once at definition time and shared by
    # every call; build a fresh list per call instead.
    if conversation_history is None:
        conversation_history = []

    # The prompty file is resolved relative to the current working directory.
    flow = Prompty.load("basic.prompty")
    return flow(
        firstName=firstName,
        context="this is a painting",
        question=question,
        image=image_path,
        conversation_history=conversation_history,
    )

# Ask one fixed question about the painting and print the returned answer.
# No conversation history is passed, so the function's default is used.
result = get_oai_response(firstName=firstName, question="What can you tell me about the painting?", image_path=image_path)
print(result)

In [None]:
from IPython.display import Image, display

# URL of the image: a Bing thumbnail link whose embedded source path names the
# file "Van_Gogh_The_Olive_Trees..jpg" on Wikimedia Commons.
image_url = "https://th.bing.com/th/id/R.842fb9a1885e50a762ef352821d9078d?rik=ykwt7kPK%2f3ngFA&riu=http%3a%2f%2fupload.wikimedia.org%2fwikipedia%2fcommons%2fe%2fe8%2fVan_Gogh_The_Olive_Trees..jpg&ehk=o8ZPcWcu3H0Vdk%2b2E5YJ63CXJSSHb3BFrzKL3UG5HOU%3d&risl=1&pid=ImgRaw&r=0"

# Display the image by URL in the cell output
display(Image(url=image_url))


In [None]:
# Rebind the notebook-level `image_path` to a public thumbnail URL; dialog()
# below passes this value explicitly on every request.
image_path = "https://th.bing.com/th/id/R.842fb9a1885e50a762ef352821d9078d?rik=ykwt7kPK%2f3ngFA&riu=http%3a%2f%2fupload.wikimedia.org%2fwikipedia%2fcommons%2fe%2fe8%2fVan_Gogh_The_Olive_Trees..jpg&ehk=o8ZPcWcu3H0Vdk%2b2E5YJ63CXJSSHb3BFrzKL3UG5HOU%3d&risl=1&pid=ImgRaw&r=0"
print("Painting: " + image_path)
display(Image(url=image_path))
# Shared transcript that dialog() appends one "User/Assistant" entry to per
# turn. Re-running this cell resets it to empty.
conversation_history = []
def dialog():
    """Run a spoken question/answer loop until an utterance contains "goodbye".

    Each turn:
      1. transcribes one utterance with ``recognize_from_microphone()``;
      2. sends it to ``get_oai_response()`` together with the module-level
         ``image_path`` and ``conversation_history``;
      3. prints the history so far and the answer, then appends the exchange
         to ``conversation_history``;
      4. speaks the answer with ``text_to_speech()``.

    The turn whose utterance contains "goodbye" (case-insensitive) is still
    answered and spoken before the loop ends. Turns with no transcript are
    skipped, and the dialog ends after three of them in a row so that a
    persistent recognition failure cannot loop forever.

    Returns:
        None.
    """
    max_silent_turns = 3
    silent_turns = 0

    # Iterate instead of recursing: one stack frame per turn would eventually
    # raise RecursionError in a long conversation.
    while True:
        text = recognize_from_microphone()

        # The recognizer yields None when nothing was recognized or the request
        # was canceled; do not send that to the model or call .lower() on it.
        if not text:
            silent_turns += 1
            if silent_turns >= max_silent_turns:
                print("No speech recognized; ending the dialog.")
                return
            continue
        silent_turns = 0

        result = get_oai_response(firstName="Linda", question=text, image_path=image_path, conversation_history=conversation_history)
        print(conversation_history)
        print(result)
        # Same entry layout as before: a blank-line-delimited User/Assistant pair.
        conversation_history.append(f"\nUser: {text}\nAssistant: {result}\n")
        text_to_speech(result)

        if "goodbye" in text.lower():
            return
# Start the voice conversation; it ends once an utterance contains "goodbye".
dialog()

