Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sound effects #67

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion clients/daily/daily_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@


class PyAudioApp:

def __init__(self):
self.__app_quit = False

Expand Down
8 changes: 4 additions & 4 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ services:
build: .
command: >
bash -c "python setup.py develop && \
mkdir -p models/styletts2 && \
aws s3 sync s3://uberduck-models-us-west-2/prototype/styletts2 models/styletts2 && \
mkdir -p models && \
aws s3 sync s3://uberduck-models-us-west-2/prototype models && \
uvicorn openduck_py.routers.main:app --reload --host 0.0.0.0 --port 8000"
working_dir: /openduck-py/openduck-py
volumes:
Expand All @@ -21,8 +21,8 @@ services:
build: .
command: >
bash -c "python setup.py develop && \
mkdir -p models/styletts2 && \
aws s3 sync s3://uberduck-models-us-west-2/prototype/styletts2 models/styletts2 && \
mkdir -p models && \
aws s3 sync s3://uberduck-models-us-west-2/prototype models && \
apt update && apt install -y python3-pip && \
pip install watchdog[watchmedo] && \
watchmedo auto-restart --directory=./openduck_py --pattern=*.py --recursive -- python openduck_py/routers/voice.py"
Expand Down
4 changes: 3 additions & 1 deletion openduck-py/openduck_py/prompts/system-prompt.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ For example, in the following exchange:
Assistant: 'Mammals are unique because they have hair or fur, produce milk to feed their young, and are warm-blooded, meaning they can regulate their own body temperature.'
User: 'mammals are unique because they have hair or fur produce milk to feed their young and are warm blooded meaning they can regulate their own body temperature'

You should respond "$ECHO", because the content of the user message is nearly identical to the content of the assistant's message, suggesting that it came from the transcription of the speaker feedback into the microphone.
You should respond "$ECHO", because the content of the user message is nearly identical to the content of the assistant's message, suggesting that it came from the transcription of the speaker feedback into the microphone.

If you think there was a different issue with the transcription, you should start your response with the text "$ERROR". You may go into detail about the issues.
85 changes: 77 additions & 8 deletions openduck-py/openduck_py/routers/voice.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from whisper import load_model
from daily import *
from litellm import acompletion
import librosa

from openduck_py.models import DBChatHistory, DBChatRecord
from openduck_py.models.chat_record import EventName
Expand All @@ -29,6 +30,9 @@
CHUNK_SIZE,
LOG_TO_SLACK,
CHAT_MODEL,
SFX_VOLUME,
ENABLE_HOLD_SOUND,
PADDING_S,
)
from openduck_py.utils.speaker_identification import (
segment_audio,
Expand Down Expand Up @@ -63,6 +67,13 @@
audio_router = APIRouter(prefix="/audio")


# Preload the UI sound effects as raw 16-bit PCM bytes so they can be
# streamed straight into the response queue without per-use decoding.
SFX = {}
for sfx_name in ("listening", "received", "generating", "error"):
    # librosa.load resamples to the output rate and returns floats in [-1, 1];
    # scale into the int16 range and apply the configured effect volume.
    samples, _ = librosa.load(f"models/sfx/{sfx_name}.wav", sr=OUTPUT_SAMPLE_RATE)
    SFX[sfx_name] = (
        np.round(samples * 32767 * SFX_VOLUME).astype(np.int16).tobytes()
    )


def _transcribe(audio_data):
    """Run Whisper on ``audio_data`` and return only the transcript text."""
    result = whisper_model.transcribe(audio_data)
    return result["text"]

Expand Down Expand Up @@ -148,6 +159,8 @@ def __init__(
self.record = record
self.time_of_last_activity = time()
self.response_task = None
self.loop_sound_task = None
self.loop_sound_stop = asyncio.Event()

def interrupt(self, task: asyncio.Task):
assert self.is_responding
Expand Down Expand Up @@ -187,6 +200,7 @@ async def receive_audio(self, message: np.ndarray):
self.time_of_last_activity = time()
await log_event(db, self.session_id, "detected_end_of_speech")
if self.response_task is None or self.response_task.done():
await self.send_audio(SFX["received"])
self.response_task = asyncio.create_task(
self.start_response(
np.concatenate(self.audio_data),
Expand All @@ -206,10 +220,40 @@ async def receive_audio(self, message: np.ndarray):
self.interrupt(self.response_task)
i = upper

async def send_audio(self, audio_bytes: bytes):
    """Split ``audio_bytes`` into CHUNK_SIZE pieces and queue them for playback."""
    offset = 0
    total = len(audio_bytes)
    while offset < total:
        await self.response_queue.put(audio_bytes[offset : offset + CHUNK_SIZE])
        offset += CHUNK_SIZE

async def send_audio_silence(self, length_s: float):
    """Queue ``length_s`` seconds of int16 silence at the output sample rate."""
    n_samples = int(length_s * OUTPUT_SAMPLE_RATE)
    await self.send_audio(np.zeros(n_samples, dtype=np.int16).tobytes())

async def send_audio_loop(self, audio_bytes: bytes):
    """Stream ``audio_bytes`` to the response queue on repeat until stopped.

    Stops when ``self.loop_sound_stop`` is set (internal request) or when the
    wrapping task is cancelled externally. ``head`` tracks the wall-clock
    time up to which audio has been queued, so the loop sleeps to keep the
    queue only ~0.1 s ahead of real time instead of flooding it.
    """
    self.loop_sound_stop.clear()
    try:
        head = time()
        while True:
            for i in range(0, len(audio_bytes), CHUNK_SIZE):
                if self.loop_sound_stop.is_set():
                    # Internal stop request: a plain return is clearer than the
                    # original raise-CancelledError control flow and behaves the
                    # same (the finally block below still runs).
                    return
                chunk = audio_bytes[i : i + CHUNK_SIZE]
                await self.response_queue.put(chunk)
                # int16 samples are 2 bytes each, so bytes / rate / 2 = seconds.
                head += len(chunk) / OUTPUT_SAMPLE_RATE / 2
                # Sleep until we are ~0.1 s ahead of real time (never negative).
                await asyncio.sleep(max(0, head - time() - 0.1))
    except asyncio.CancelledError:
        # External task.cancel(): exit quietly, matching the original
        # best-effort behavior of not propagating cancellation.
        pass
    finally:
        # Always release the task slot so a new hold sound can be started.
        self.loop_sound_task = None

async def start_response(
self,
audio_data: np.ndarray,
):
if ENABLE_HOLD_SOUND and not self.loop_sound_task:
self.loop_sound_task = asyncio.create_task(
self.send_audio_loop(SFX["generating"])
)
self.is_responding = True
async with SessionAsync() as db:
await log_event(db, self.session_id, "started_response", audio=audio_data)
Expand Down Expand Up @@ -245,6 +289,8 @@ async def start_response(
latency=t_whisper - t_echo,
)
if not transcription or len(audio_data) < 100:
self.loop_sound_stop.set()
await self.send_audio(SFX["listening"])
return

system_prompt = {
Expand Down Expand Up @@ -275,16 +321,39 @@ async def start_response(

complete_sentence = ""
full_response = ""
is_first_sentence = True
play_error_sound = False
async for chunk in response:
chunk_text = chunk.choices[0].delta.content
if not chunk_text:
break
complete_sentence += chunk_text
full_response += chunk_text
# TODO: Smarter sentence detection - this will split sentences on cases like "Mr. Kennedy"

if full_response.startswith("$ECHO"):
print("Echo detected, not sending response.")
self.loop_sound_stop.set()
return
if full_response.startswith("$ERROR"):
print("Error detected, sending response with error sound.")
play_error_sound = True
full_response = full_response[len("$ERROR") :]
complete_sentence = complete_sentence[len("$ERROR") :]
if re.search(r"(?<!\d)[.!?](?!\d)", chunk_text):
await self.speak_response(complete_sentence, db, t_whisper)
if is_first_sentence:
self.loop_sound_stop.set()
# TODO: Smarter sentence detection - this will split sentences on cases like "Mr. Kennedy"
await self.speak_response(
complete_sentence,
db,
t_whisper,
(SFX["error"] if play_error_sound else None),
)
complete_sentence = ""
is_first_sentence = False
play_error_sound = False

await self.send_audio(SFX["listening"])

messages.append({"role": "assistant", "content": full_response})
chat.history_json["messages"] = messages
Expand All @@ -295,6 +364,7 @@ async def speak_response(
response_text: str,
db: AsyncSession,
start_time: float,
prefix_sound: Optional[bytes],
):
t_chat = time()
await log_event(
Expand All @@ -304,9 +374,6 @@ async def speak_response(
meta={"text": response_text},
latency=t_chat - start_time,
)
if "$ECHO" in response_text:
print("Echo detected, not sending response.")
return

normalized = normalize_text(response_text)
t_normalize = time()
Expand Down Expand Up @@ -336,9 +403,11 @@ def _inference(sentence: str):
latency=t_styletts - t_normalize,
)

for i in range(0, len(audio_chunk_bytes), CHUNK_SIZE):
chunk = audio_chunk_bytes[i : i + CHUNK_SIZE]
await self.response_queue.put(chunk)
if prefix_sound:
await self.send_audio(prefix_sound)
await self.send_audio_silence(PADDING_S)
await self.send_audio(audio_chunk_bytes)
await self.send_audio_silence(PADDING_S)


def _check_for_exceptions(response_task: Optional[asyncio.Task]) -> bool:
Expand Down
3 changes: 3 additions & 0 deletions openduck-py/openduck_py/settings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
CHUNK_SIZE = 10240  # bytes per audio chunk pushed onto the playback queue
# NOTE(review): bool() of the raw env string means ANY non-empty value — even
# "0" or "false" — enables logging; confirm that is intended.
LOG_TO_SLACK = bool(os.environ.get("LOG_TO_SLACK", False))
CHAT_MODEL = "azure/gpt-35-turbo-deployment"
SFX_VOLUME = 0.5  # linear gain applied to sound-effect samples before int16 conversion
ENABLE_HOLD_SOUND = True  # gate for the looping "generating" hold sound during responses
PADDING_S = 0.2  # seconds of silence queued around each spoken response chunk

# to not break existing env files
# NOTE(review): os.getenv returns None when AZURE_OPENAI_API_KEY is unset, and
# assigning None into os.environ raises TypeError — confirm the variable is
# always present in deployment environments.
os.environ["AZURE_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY")
Expand Down
Loading