In [6]:
def play_vlc_file(fp):
    """Play a media file through VLC and block for the reported media length.

    Args:
        fp: Path or MRL of the media file, passed to ``Instance.media_new``.

    Returns:
        None. The call blocks for the start-up delay plus the length VLC
        reports for the media.
    """
    # Function-local import, as in the original cell.
    import vlc, time

    # Seconds to wait after play() before asking VLC for the media length.
    startup_delay = 1.5

    vlc_instance = vlc.Instance()
    player = vlc_instance.media_player_new()
    try:
        media = vlc_instance.media_new(fp)
        player.set_media(media)
        player.play()
        time.sleep(startup_delay)

        # get_length() is in milliseconds and is -1 when VLC does not know
        # the length (e.g. the file could not be opened).
        duration = player.get_length() / 1000
        if duration > 0:
            # A negative value would make time.sleep raise ValueError.
            time.sleep(duration)
    finally:
        # Free the native player even if playback setup failed.
        player.release()

In [56]:
import IPython
import os
from openai import OpenAI
import speech_recognition as sr

from settings import OPENAI_API_KEY


class SpeechBot:
    """Voice chat loop: microphone -> speech-to-text -> chat model -> text-to-speech.

    The conversation history is kept in ``self.messages`` (OpenAI chat format),
    seeded with the system prompt given at construction time.
    """

    def __init__(
        self,
        audio_handling_prompt,
        tp_model_name='gpt-4',
        stt_model_name='whisper-1',
        tts_model_name='tts-1',
        tts_voice='alloy',
        data_dir='data',
    ):
        """Create the recognizer, the OpenAI client and the initial history.

        Args:
            audio_handling_prompt: System prompt placed first in the history.
            tp_model_name: Chat-completion model used by ``process_text``.
            stt_model_name: Transcription model used by ``listen``.
            tts_model_name: Speech model used by ``speak``.
            tts_voice: Voice name passed to the speech model.
            data_dir: Directory for the temporary audio files; created on
                demand.
        """
        self.recognizer = sr.Recognizer()
        self.client = OpenAI(api_key=OPENAI_API_KEY)

        self.audio_handling_prompt = audio_handling_prompt
        self.stt_model_name = stt_model_name
        self.tp_model_name = tp_model_name
        self.tts_model_name = tts_model_name
        self.tts_voice = tts_voice
        self.data_dir = data_dir

        self.messages = [
            {"role": "system", "content": audio_handling_prompt},
        ]

    def listen(self):
        """Record one utterance from the microphone and transcribe it.

        A non-empty transcription is appended to ``self.messages`` as a user
        message.

        Returns:
            The stripped transcription, or ``""`` when nothing was recognised.
        """
        os.makedirs(self.data_dir, exist_ok=True)
        audio_file_path = os.path.join(self.data_dir, 'audio.wav')

        with sr.Microphone() as source:
            print("Listening...")
            self.recognizer.adjust_for_ambient_noise(source)

            # No time limits: record until the recognizer detects the end of
            # the phrase.
            audio = self.recognizer.listen(source, phrase_time_limit=None, timeout=None)

        with open(audio_file_path, 'wb') as wav_out:
            wav_out.write(audio.get_wav_data())

        try:
            print("Recognizing...")
            # Close the upload handle before the file is deleted below.
            with open(audio_file_path, 'rb') as wav_in:
                response = self.client.audio.transcriptions.create(
                    file=wav_in,
                    model=self.stt_model_name
                )
            print(response)
            text = response.text.strip()
        # NOTE(review): these are speech_recognition errors; the call above
        # goes through the OpenAI client, whose own errors still propagate
        # to the caller -- confirm whether those should be handled here too.
        except sr.UnknownValueError:
            print("Sorry, I did not understand that.")
            return ""
        except sr.RequestError:
            print("Sorry, there seems to be a problem with the service.")
            return ""
        finally:
            # Remove the temporary recording whether or not transcription
            # succeeded.
            if os.path.exists(audio_file_path):
                os.remove(audio_file_path)

        if not text:
            # Keep blank utterances out of the history; run() retries on "".
            print("Sorry, I did not understand that.")
            return ""

        self.messages.append({
            "role": "user",
            "content": text
        })
        print(f"You said: {text}")
        return text

    def process_text(self):
        """Send the history to the chat model and record its reply.

        Returns:
            The stripped reply text; ``""`` when the model returned no text
            content.
        """
        response = self.client.chat.completions.create(
            model=self.tp_model_name,
            messages=self.messages,
        )
        # message.content may be None (no text part in the reply).
        response_text = (response.choices[0].message.content or "").strip()
        self.messages.append({
            "role": "assistant",
            "content": response_text
        })
        print("Assistant:", response_text)
        return response_text

    def speak(self, text):
        """Synthesize ``text`` to an MP3 file and play it in the notebook.

        Args:
            text: Text to convert to speech.
        """
        os.makedirs(self.data_dir, exist_ok=True)
        speech_file_path = os.path.join(self.data_dir, 'speech.mp3')

        with self.client.audio.speech.with_streaming_response.create(
            model=self.tts_model_name,
            voice=self.tts_voice,
            input=text
        ) as response:
            response.stream_to_file(speech_file_path)

        # An Audio object created inside a method is not rendered on its own
        # (only a cell's last expression is), so display it explicitly.
        IPython.display.display(
            IPython.display.Audio(speech_file_path, autoplay=True)
        )

    def run(self):
        """Run one exchange: listen until something is heard, reply, speak."""
        text = self.listen()
        while not text:
            text = self.listen()

        response = self.process_text()
        if response:
            # Nothing to synthesize when the model returned no text.
            self.speak(response)

In [57]:
# System prompt that seeds the conversation history of the sales bot.
AUDIO_HANDLING_PROMPT = """
You are a sales expert at a jobs search company.
You speak to a customers to sell a service that provides highly accurate job recommendations to companies seeking to hire highly qualified and relevant candidates.
You are trying to sell a new service that uses a new machine learning algorithm to provide accurate job recommendations.
You need to respond to the customer, ask questions to understand their needs, and explain how the new service can help them.
"""

# Every other constructor argument keeps its default.
sales_agent = SpeechBot(audio_handling_prompt=AUDIO_HANDLING_PROMPT)

In [58]:
# Run one voice exchange: listen on the microphone, get the model's reply, speak it.
sales_agent.run()

Listening...
Recognizing...
Transcription(text='What are you selling today?')
You said: What are you selling today?
Assistant: Hello! Today, we're introducing a revolutionary new service that uses cutting-edge machine learning algorithms to greatly improve your company's hiring process. This service provides highly accurate job recommendations, narrowing down the pool of candidates to ones that are the best match for your specific needs. 

Before we go further, could you please share the major challenges your company currently faces when trying to hire new employees? It'll help us understand your needs better and demonstrate how our new service can solve these issues for you.
