In [1]:
import asyncio
import getpass
import io
import json
import numpy as np
import os
import requests
import sys
import time
from IPython.display import Audio

# Point at the local cartesia-python checkout in the current user's home directory
# and put it first on sys.path so the `cartesia` import below resolves to it.
# expanduser("~") asks the OS for the real home directory instead of assuming a
# /home/<user> layout, which does not hold for root, macOS, or relocated homes.
cartesia_dir = os.path.join(os.path.expanduser("~"), "cartesia-python")
sys.path.insert(0, cartesia_dir)
from cartesia.tts import CartesiaTTS, AsyncCartesiaTTS

In [2]:
# Cartesia credentials are read from the environment; this is None when
# CARTESIA_API_KEY is not set.
api_key = os.getenv("CARTESIA_API_KEY")

def get_stream_generation(generator):
    """Drain a synchronous streaming generation into one in-memory buffer.

    Args:
        generator: Iterable of chunk dicts. Each chunk's "audio" entry is
            written to the buffer; the last chunk's "sampling_rate" entry is
            reported in the result.

    Returns:
        dict with "audio" (an ``io.BytesIO`` rewound to position 0 that holds
        every chunk's audio in arrival order) and "sampling_rate" (read from
        the last chunk received).

    Raises:
        ValueError: If the generator yields no chunks at all.
    """
    audio_data = io.BytesIO()
    last_output = None
    for output in generator:
        audio_data.write(output["audio"])
        last_output = output
    if last_output is None:
        # Without this guard an empty stream would surface as an opaque
        # UnboundLocalError on the return line below.
        raise ValueError("streaming generation produced no audio chunks")
    audio_data.seek(0)
    return {"audio": audio_data, "sampling_rate": last_output["sampling_rate"]}

async def get_async_stream_generation(generator):
    """Drain an asynchronous streaming generation into one in-memory buffer.

    Args:
        generator: Async iterable of chunk dicts. Each chunk's "audio" entry
            is written to the buffer; the last chunk's "sampling_rate" entry
            is reported in the result.

    Returns:
        dict with "audio" (an ``io.BytesIO`` rewound to position 0 that holds
        every chunk's audio in arrival order) and "sampling_rate" (read from
        the last chunk received).

    Raises:
        ValueError: If the generator yields no chunks at all.
    """
    audio_data = io.BytesIO()
    last_output = None
    async for output in generator:
        audio_data.write(output["audio"])
        last_output = output
    if last_output is None:
        # Without this guard an empty stream would surface as an opaque
        # UnboundLocalError on the return line below.
        raise ValueError("streaming generation produced no audio chunks")
    audio_data.seek(0)
    return {"audio": audio_data, "sampling_rate": last_output["sampling_rate"]}

async def get_async_generation(client, transcript, voice, websocket):
    """Await one ``client.generate`` call and hand back its result.

    ``transcript``, ``voice`` and ``websocket`` are forwarded to
    ``client.generate`` as keyword arguments of the same names.
    """
    request = {"transcript": transcript, "voice": voice, "websocket": websocket}
    generation = await client.generate(**request)
    return generation

def display_audio_result(result_dict):
    """Print a result's sample count and render an audio player for it.

    Args:
        result_dict: Mapping with "audio" (either ``bytes`` or an object whose
            ``read()`` returns the raw buffer) and "sampling_rate".
    """
    payload = result_dict["audio"]
    raw = payload if isinstance(payload, bytes) else payload.read()
    # The raw buffer is reinterpreted as float32 samples.
    samples = np.frombuffer(raw, dtype=np.float32)
    print(len(samples))
    player = Audio(samples, rate=result_dict["sampling_rate"])
    display(player)

In [3]:
# There seems to be some weird caching with display(Audio()) when a result variable
# already exists, so drop leftovers from a previous run before regenerating.
# Each name is removed on its own: pop(name, None) simply skips names that are not
# defined, so one missing variable no longer prevents the rest from being cleared
# and no catch-all exception handler is needed.
for _stale_name in (
    "sync_output_socket",
    "sync_output_http",
    "sync_output_socket_stream",
    "sync_output_http_stream",
    "async_output_socket",
    "async_output_http",
    "async_output_socket_stream",
    "async_output_http_stream",
):
    globals().pop(_stale_name, None)
del _stale_name

client = CartesiaTTS(api_key=api_key)
voices = client.get_voices()
embedding = client.get_voice_embedding(voice_id=voices["Milo"]["id"])
transcript = "Hello! Welcome to Cartesia. It is great to speak with you!"

start = time.time()
sync_output_socket = client.generate(transcript=transcript, voice=embedding, websocket=True)
print(f"Sync socket generation took {time.time()-start:.2f} sec")

start = time.time()
sync_output_http = client.generate(transcript=transcript, voice=embedding, websocket=False)
print(f"Sync HTTP generation took {time.time()-start:.2f} sec")

start = time.time()
sync_output_socket_stream = get_stream_generation(client.generate(transcript=transcript, voice=embedding, websocket=True, stream=True))
print(f"Sync socket streaming generation took {time.time()-start:.2f} sec")

start = time.time()
sync_output_http_stream = get_stream_generation(client.generate(transcript=transcript, voice=embedding, websocket=False, stream=True))
print(f"Sync HTTP streaming generation took {time.time()-start:.2f} sec")

async_client = AsyncCartesiaTTS(api_key=api_key)

start = time.time()
async_output_socket = await get_async_generation(async_client, transcript, embedding, websocket=True)
print(f"Async socket generation took {time.time()-start:.2f} sec")

start = time.time()
async_output_http = await get_async_generation(async_client, transcript, embedding, websocket=False)
print(f"Async HTTP generation took {time.time()-start:.2f} sec")

start = time.time()
async_output_socket_stream = await get_async_stream_generation(
    await async_client.generate(transcript=transcript, voice=embedding, websocket=True, stream=True))
print(f"Async socket streaming generation took {time.time()-start:.2f} sec")

start = time.time()
async_output_http_stream = await get_async_stream_generation(
    await async_client.generate(transcript=transcript, voice=embedding, websocket=False, stream=True))
print(f"Async HTTP streaming generation took {time.time()-start:.2f} sec")

# Blank line to separate the timing report from the players, then show each result
# under its variable name. Results are looked up by name in the current namespace.
print()
for var in (
    "sync_output_socket",
    "sync_output_http",
    "sync_output_socket_stream",
    "sync_output_http_stream",
    "async_output_socket",
    "async_output_http",
    "async_output_socket_stream",
    "async_output_http_stream",
):
    print(var)
    display_audio_result(locals()[var])

Sync socket generation took 2.01 sec
Sync HTTP generation took 1.94 sec
Sync socket streaming generation took 1.60 sec
Sync HTTP streaming generation took 1.73 sec
Async socket generation took 1.78 sec
Async HTTP generation took 1.88 sec
Async socket streaming generation took 1.88 sec
Async HTTP streaming generation took 1.70 sec

sync_output_socket
142848


sync_output_http
138240


sync_output_socket_stream
119808


sync_output_http_stream
129024


async_output_socket
124416


async_output_http
147456


async_output_socket_stream
142848


async_output_http_stream
129024


In [4]:
sync_transcription = client.transcribe(os.path.join(cartesia_dir, "tests/mock_data/sample_speech.wav"))
print(sync_transcription)

async_transcription = await async_client.transcribe(os.path.join(cartesia_dir, "tests/mock_data/sample_speech.wav"))
print(async_transcription)

#del async_client  # needed for proper resource cleanup

It is a great day to be alive when all of the trees are green.
It is a great day to be alive when all of the trees are green.
