Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sound effects #67

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion clients/daily/daily_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@


class PyAudioApp:

def __init__(self):
self.__app_quit = False

Expand Down
8 changes: 4 additions & 4 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ services:
build: .
command: >
bash -c "python setup.py develop && \
mkdir -p models/styletts2 && \
aws s3 sync s3://uberduck-models-us-west-2/prototype/styletts2 models/styletts2 && \
mkdir -p models && \
aws s3 sync s3://uberduck-models-us-west-2/prototype models && \
uvicorn openduck_py.routers.main:app --reload --host 0.0.0.0 --port 8000"
working_dir: /openduck-py/openduck-py
volumes:
Expand All @@ -21,8 +21,8 @@ services:
build: .
command: >
bash -c "python setup.py develop && \
mkdir -p models/styletts2 && \
aws s3 sync s3://uberduck-models-us-west-2/prototype/styletts2 models/styletts2 && \
mkdir -p models && \
aws s3 sync s3://uberduck-models-us-west-2/prototype models && \
apt update && apt install -y python3-pip && \
pip install watchdog[watchmedo] && \
watchmedo auto-restart --directory=./openduck_py --pattern=*.py --recursive -- python openduck_py/routers/voice.py"
Expand Down
4 changes: 3 additions & 1 deletion openduck-py/openduck_py/prompts/system-prompt.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ For example, in the following exchange:
Assistant: 'Mammals are unique because they have hair or fur, produce milk to feed their young, and are warm-blooded, meaning they can regulate their own body temperature.'
User: 'mammals are unique because they have hair or fur produce milk to feed their young and are warm blooded meaning they can regulate their own body temperature'

You should respond "$ECHO", because the content of the user message is nearly identical to the content of the assistant's message, suggesting that it came from the transcription of the speaker feedback into the microphone.
You should respond "$ECHO", because the content of the user message is nearly identical to the content of the assistant's message, suggesting that it came from the transcription of the speaker feedback into the microphone.

If you think there was a different issue with the transcription, you should start your response with the text "$ERROR". You may go into detail about the issues.
85 changes: 77 additions & 8 deletions openduck-py/openduck_py/routers/voice.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from whisper import load_model
from daily import *
from litellm import acompletion
import librosa

from openduck_py.models import DBChatHistory, DBChatRecord
from openduck_py.models.chat_record import EventName
Expand All @@ -29,6 +30,9 @@
CHUNK_SIZE,
LOG_TO_SLACK,
CHAT_MODEL,
SFX_VOLUME,
ENABLE_HOLD_SOUND,
PADDING_S,
)
from openduck_py.utils.speaker_identification import (
segment_audio,
Expand Down Expand Up @@ -63,6 +67,13 @@
audio_router = APIRouter(prefix="/audio")


# Preload the UI sound effects as raw 16-bit PCM bytes so they can be
# streamed straight into the response queue without per-use decoding.
SFX = {}
for sfx_name in ("listening", "received", "generating", "error"):
    # librosa.load resamples to the output rate and returns floats in [-1, 1];
    # scale into the int16 range and apply the configured effect volume.
    samples, _ = librosa.load(f"models/sfx/{sfx_name}.wav", sr=OUTPUT_SAMPLE_RATE)
    SFX[sfx_name] = (
        np.round(samples * 32767 * SFX_VOLUME).astype(np.int16).tobytes()
    )


def _transcribe(audio_data):
    """Run Whisper on ``audio_data`` and return only the transcript text."""
    result = whisper_model.transcribe(audio_data)
    return result["text"]

Expand Down Expand Up @@ -148,6 +159,8 @@ def __init__(
self.record = record
self.time_of_last_activity = time()
self.response_task = None
self.loop_sound_task = None
self.loop_sound_stop = asyncio.Event()

def interrupt(self, task: asyncio.Task):
assert self.is_responding
Expand Down Expand Up @@ -187,6 +200,7 @@ async def receive_audio(self, message: np.ndarray):
self.time_of_last_activity = time()
await log_event(db, self.session_id, "detected_end_of_speech")
if self.response_task is None or self.response_task.done():
await self.send_audio(SFX["received"])
self.response_task = asyncio.create_task(
self.start_response(
np.concatenate(self.audio_data),
Expand All @@ -206,10 +220,40 @@ async def receive_audio(self, message: np.ndarray):
self.interrupt(self.response_task)
i = upper

async def send_audio(self, audio_bytes: bytes):
    """Split ``audio_bytes`` into CHUNK_SIZE pieces and queue them for playback."""
    offset = 0
    total = len(audio_bytes)
    while offset < total:
        await self.response_queue.put(audio_bytes[offset : offset + CHUNK_SIZE])
        offset += CHUNK_SIZE

async def send_audio_silence(self, length_s: float):
    """Queue ``length_s`` seconds of int16 silence at the output sample rate."""
    n_samples = int(length_s * OUTPUT_SAMPLE_RATE)
    await self.send_audio(np.zeros(n_samples, dtype=np.int16).tobytes())

async def send_audio_loop(self, audio_bytes: bytes):
    """Stream ``audio_bytes`` to the response queue on repeat until stopped.

    Stops when ``self.loop_sound_stop`` is set (internal request) or when the
    wrapping task is cancelled externally. ``head`` tracks the wall-clock
    time up to which audio has been queued, so the loop sleeps to keep the
    queue only ~0.1 s ahead of real time instead of flooding it.
    """
    self.loop_sound_stop.clear()
    try:
        head = time()
        while True:
            for i in range(0, len(audio_bytes), CHUNK_SIZE):
                if self.loop_sound_stop.is_set():
                    # Internal stop request: a plain return is clearer than the
                    # original raise-CancelledError control flow and behaves the
                    # same (the finally block below still runs).
                    return
                chunk = audio_bytes[i : i + CHUNK_SIZE]
                await self.response_queue.put(chunk)
                # int16 samples are 2 bytes each, so bytes / rate / 2 = seconds.
                head += len(chunk) / OUTPUT_SAMPLE_RATE / 2
                # Sleep until we are ~0.1 s ahead of real time (never negative).
                await asyncio.sleep(max(0, head - time() - 0.1))
    except asyncio.CancelledError:
        # External task.cancel(): exit quietly, matching the original
        # best-effort behavior of not propagating cancellation.
        pass
    finally:
        # Always release the task slot so a new hold sound can be started.
        self.loop_sound_task = None

async def start_response(
self,
audio_data: np.ndarray,
):
if ENABLE_HOLD_SOUND and not self.loop_sound_task:
self.loop_sound_task = asyncio.create_task(
self.send_audio_loop(SFX["generating"])
)
self.is_responding = True
async with SessionAsync() as db:
await log_event(db, self.session_id, "started_response", audio=audio_data)
Expand Down Expand Up @@ -245,6 +289,8 @@ async def start_response(
latency=t_whisper - t_echo,
)
if not transcription or len(audio_data) < 100:
self.loop_sound_stop.set()
await self.send_audio(SFX["listening"])
return

system_prompt = {
Expand Down Expand Up @@ -275,16 +321,39 @@ async def start_response(

complete_sentence = ""
full_response = ""
is_first_sentence = True
play_error_sound = False
async for chunk in response:
chunk_text = chunk.choices[0].delta.content
if not chunk_text:
break
complete_sentence += chunk_text
full_response += chunk_text
# TODO: Smarter sentence detection - this will split sentences on cases like "Mr. Kennedy"

if full_response.startswith("$ECHO"):
print("Echo detected, not sending response.")
self.loop_sound_stop.set()
return
if full_response.startswith("$ERROR"):
print("Error detected, sending response with error sound.")
play_error_sound = True
full_response = full_response[len("$ERROR") :]
complete_sentence = complete_sentence[len("$ERROR") :]
if re.search(r"(?<!\d)[.!?](?!\d)", chunk_text):
await self.speak_response(complete_sentence, db, t_whisper)
if is_first_sentence:
self.loop_sound_stop.set()
# TODO: Smarter sentence detection - this will split sentences on cases like "Mr. Kennedy"
await self.speak_response(
complete_sentence,
db,
t_whisper,
(SFX["error"] if play_error_sound else None),
)
complete_sentence = ""
is_first_sentence = False
play_error_sound = False

await self.send_audio(SFX["listening"])

messages.append({"role": "assistant", "content": full_response})
chat.history_json["messages"] = messages
Expand All @@ -295,6 +364,7 @@ async def speak_response(
response_text: str,
db: AsyncSession,
start_time: float,
prefix_sound: Optional[bytes],
):
t_chat = time()
await log_event(
Expand All @@ -304,9 +374,6 @@ async def speak_response(
meta={"text": response_text},
latency=t_chat - start_time,
)
if "$ECHO" in response_text:
print("Echo detected, not sending response.")
return

normalized = normalize_text(response_text)
t_normalize = time()
Expand Down Expand Up @@ -336,9 +403,11 @@ def _inference(sentence: str):
latency=t_styletts - t_normalize,
)

for i in range(0, len(audio_chunk_bytes), CHUNK_SIZE):
chunk = audio_chunk_bytes[i : i + CHUNK_SIZE]
await self.response_queue.put(chunk)
if prefix_sound:
await self.send_audio(prefix_sound)
await self.send_audio_silence(PADDING_S)
await self.send_audio(audio_chunk_bytes)
await self.send_audio_silence(PADDING_S)


def _check_for_exceptions(response_task: Optional[asyncio.Task]) -> bool:
Expand Down
3 changes: 3 additions & 0 deletions openduck-py/openduck_py/settings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
CHUNK_SIZE = 10240  # bytes per audio chunk pushed onto the playback queue
# NOTE(review): bool() of the raw env string means ANY non-empty value — even
# "0" or "false" — enables logging; confirm that is intended.
LOG_TO_SLACK = bool(os.environ.get("LOG_TO_SLACK", False))
CHAT_MODEL = "azure/gpt-35-turbo-deployment"
SFX_VOLUME = 0.5  # linear gain applied to sound-effect samples before int16 conversion
ENABLE_HOLD_SOUND = True  # gate for the looping "generating" hold sound during responses
PADDING_S = 0.2  # seconds of silence queued around each spoken response chunk

# to not break existing env files
# NOTE(review): os.getenv returns None when AZURE_OPENAI_API_KEY is unset, and
# assigning None into os.environ raises TypeError — confirm the variable is
# always present in deployment environments.
os.environ["AZURE_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY")
Expand Down
Loading