In [2]:
from pvrecorder import PvRecorder

for index, device in enumerate(PvRecorder.get_available_devices()):
    print(f"[{index}] {device}")

[0] OnePlus Bullets Wireless Z2
[1] MacBook Air Microphone


In [56]:

import io
import wave
import IPython
import pyaudio
import os
from openai import OpenAI
import speech_recognition as sr

from settings import OPENAI_API_KEY


def byte_stream_generator(response, buffer_size=256):
    """
    Generator function that yields a stream of bytes from the response.

    :param response: The response object from the OpenAI API call.
    """
    try:
        for byte_chunk in response.iter_bytes(chunk_size=buffer_size):
            if byte_chunk:  # Only yield non-empty byte chunks
                yield byte_chunk
            else:
                print("Skipped an empty or corrupted packet")
    except Exception as e:
        print(f"Error while streaming bytes: {e}")



class SpeechBot:
    def __init__(
            self, 
            audio_handling_prompt,
            tp_model_name='gpt-4',
            stt_model_name='whisper-1',
            tts_model_name='tts-1',
            tts_voice='alloy',
            pause_threshold=2
        ):
        self.recognizer = sr.Recognizer()
        self.pause_threshold = pause_threshold
        self.client = OpenAI(
            api_key=OPENAI_API_KEY,
        )

        self.audio_handling_prompt = audio_handling_prompt
        self.stt_model_name = stt_model_name
        self.tp_model_name = tp_model_name
        self.tts_model_name = tts_model_name
        self.tts_voice = tts_voice

        self.messages = [
            {
                "role": "system",
                "content": audio_handling_prompt
            }
        ]
    
    def listen(self):
        """Capture audio from the microphone until the user stops speaking and return as text."""

        with sr.Microphone() as source:
            print("Listening...")
            self.recognizer.adjust_for_ambient_noise(source)
            self.recognizer.pause_threshold = self.pause_threshold
            
            # Listen continuously until the user stops speaking
            audio = self.recognizer.listen(source)
            audio_file_path = 'data/audio.wav'
            with open(audio_file_path, 'wb') as f:
                f.write(audio.get_wav_data())

        try:
            print("Recognizing...")
            # response = self.client.audio.transcriptions.with_streaming_response.create(
            #     file=open(audio_file_path, 'rb'),
            #     model=self.stt_model_name
            # )

            response = self.recognizer.recognize_google_cloud(audio, language='en-in')
            print(response)
            text = response.text.strip()

            self.messages.append({
                "role": "user",
                "content": text
            })

            os.remove(audio_file_path)
            
            print(f"You said: {text}")
            return text
        except sr.UnknownValueError as e:
            print("Sorry, I did not understand that.")
            raise e
        except sr.RequestError as e:
            print("Sorry, there seems to be a problem with the service.")
            raise e
    
    def process_text(self):
        response = self.client.chat.completions.create(
            model=self.tp_model_name,
            messages=self.messages,
            stream=True
        )
        # create variables to collect the stream of chunks
        collected_chunks = []
        collected_messages = []
        # iterate through the stream of events
        for chunk in response:
            collected_chunks.append(chunk)  # save the event response
            chunk_message = chunk.choices[0].delta.content  # extract the message
            collected_messages.append(chunk_message)  # save the message
            print(chunk_message, end="")  # print the message
            
        collected_messages = [m for m in collected_messages if m is not None]
        full_reply_content = ''.join(collected_messages)

        self.messages.append({
            "role": "assistant",
            "content": full_reply_content
        })
        return full_reply_content
    
    
    def speak(self, text):
        speech_file_path = 'data/speech.mp3'
        with self.client.audio.speech.with_streaming_response.create(
            model="tts-1-hd",
            voice="nova",
            input=text,
            response_format= "wav",
        ) as response:
            try:
                # Initialize PyAudio
                p = pyaudio.PyAudio()

                # Open the stream
                stream = p.open(
                    format=pyaudio.paInt16, 
                    channels=1, 
                    rate=16000, 
                    output=True
                )

                # Initialize the WAV header
                wav_header = None

                for audio_chunk in byte_stream_generator(response=response):
                    # Check if this is the first chunk (WAV header)
                    if wav_header is None:
                        wav_header = audio_chunk
                        # Extract the WAV format parameters from the header
                        wav_format = wave.open(io.BytesIO(wav_header), 'rb')
                        channels, samp_width, framerate, nframes, comptype, compname = wav_format.getparams()
                        # Reopen the stream with the correct parameters
                        stream = p.open(
                            format=p.get_format_from_width(samp_width), 
                            channels=channels, 
                            rate=framerate, 
                            output=True
                        )
                    else:
                        # Write the audio chunk to the stream
                        stream.write(audio_chunk)

                # Close the stream and PyAudio
                stream.stop_stream()
                stream.close()
                p.terminate()

            except Exception as e:
                print(f"Error during playback: {e}")
        
        # IPython.display.Audio(speech_file_path)
        # os.remove(speech_file_path)

    def run(self):
        while True:
            text = self.listen()
            if not text:
                continue
            response = self.process_text()
            
            self.speak(response)
            break

In [57]:
AUDIO_HANDLING_PROMPT = \
"""
You are a hiring recruiter lead at a tech saas company. You are simulating the recruiter talking to a sales agent from a company "foundit", and 
responding to the sales agent's offer to sell you a recruiting solution. You need to dig deeper into value proposition,
lay objections on the product and decide if you want to take a demo or not based on the sales agents conversations.
You also need to bring up issues on the recent bad PR the company faced about its poor pricing. 
Tone of conversation - rude, blunt and a little fast paced.
"""

sales_agent = SpeechBot(AUDIO_HANDLING_PROMPT)

In [58]:
sales_agent.run()

Listening...
Recognizing...
Transcription(text='What are you selling today?')
You said: What are you selling today?
Assistant: Hello! Today, we're introducing a revolutionary new service that uses cutting-edge machine learning algorithms to greatly improve your company's hiring process. This service provides highly accurate job recommendations, narrowing down the pool of candidates to ones that are the best match for your specific needs. 

Before we go further, could you please share the major challenges your company currently faces when trying to hire new employees? It'll help us understand your needs better and demonstrate how our new service can solve these issues for you.
