# Working with Audio

## Universal Code Used for the Entire Notebook

Let's set up our libraries and client

In [None]:
# Install necessary packages for handling sound files and sound devices
# Uncomment the following lines if you need to install the packages
# !pip install soundfile sounddevice
# !pip install pyaudio
# !pip install --upgrade openai

In [1]:
import os  # For interacting with the operating system
import requests  # For making HTTP requests
from io import BytesIO  # For handling byte streams

import soundfile as sf  # For reading and writing sound files
import sounddevice as sd  # For playing and recording sound
import pyaudio

from IPython.display import Audio, display, clear_output, Markdown  # For displaying content in Jupyter Notebooks

from openai import OpenAI, AssistantEventHandler  # For OpenAI API and event handling
from typing_extensions import override  # For method overriding in subclasses

from pathlib import Path
import openai
from IPython.display import display, HTML
import pyaudio
import time
import threading
import queue
import re


In [2]:
# Shared SDK client for the cells below. No key is passed here, so the SDK
# presumably reads credentials from the environment (e.g. OPENAI_API_KEY) — verify.
client = OpenAI()  # Initialize the OpenAI client


### Generating an Audio File

Using the openai api library approach

In [None]:
# Excerpt that will be narrated
speech_text = """
    Even though large tracts of Europe and many old and famous States have fallen or may fall into the grip of the Gestapo and all the odious apparatus of Nazi rule, we shall not flag or fail. We shall go on to the end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing confidence and growing strength in the air, we shall defend our Island, whatever the cost may be, we shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and in the streets, we shall fight in the hills; we shall never surrender, and even if, which I do not for a moment believe, this Island or a large part of it were subjugated and starving, then our Empire beyond the seas, armed and guarded by the British Fleet, would carry on the struggle, until, in God’s good time, the New World, with all its power and might, steps forth to the rescue and the liberation of the old.
    """

# Where the generated MP3 is written
speech_file_path = "./fight_on_the_beaches.mp3"

# Ask the speech endpoint for the narration (HD model, "fable" voice)
response = client.audio.speech.create(model="tts-1-hd", voice="fable", input=speech_text)

# Dump the returned audio bytes to disk
with open(speech_file_path, 'wb') as audio_file:
    audio_file.write(response.content)

# Tell the reader where the file ended up
print(f"Audio saved to {speech_file_path}")


Using the API endpoint approach

In [None]:

# Define the speech file path
speech_file_path = "./old_soldiers_never_die.mp3"

# Read the key from the environment and stop early with a clear message when it
# is missing, instead of sending "Bearer None" and getting an opaque 401 back.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this cell.")

# API endpoint and headers
url = "https://api.openai.com/v1/audio/speech"
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}

# Data payload for the request
data = {
    "model": "tts-1",
    "voice": "shimmer",
    "input": """
    I still remember the refrain of one of the most popular barracks ballads of that day which proclaimed most proudly that old soldiers never die; they just fade away. And like the old soldier of that ballad, I now close my military career and just fade away, an old soldier who tried to do his duty as God gave him the light to see that duty.
    """
}

# Make the synchronous request; the timeout keeps the cell from blocking
# indefinitely if the server never answers.
response = requests.post(url, headers=headers, json=data, timeout=60)

# Check if the request was successful
if response.status_code == 200:
    with open(speech_file_path, 'wb') as file:
        file.write(response.content)
    print(f"Audio saved to {speech_file_path}")
else:
    print(f"Error: {response.status_code} - {response.text}")


### Using Chat Completion and Assistant Output

Chat Completion to Audio without End-to-End Streaming

In [None]:
# Set up OpenAI API key (needed for the raw HTTP call to the speech endpoint)
api_key = os.getenv("OPENAI_API_KEY")

# Initialize the OpenAI client
client = OpenAI()

# Define the speech file path
speech_file_path = "./war_message_to_congress.mp3"

# Create the chat completion request
chat_completion = client.chat.completions.create(
    model="gpt-4o",  # Specify the model to use
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},  # System message to set the assistant's behavior
        {"role": "user", "content": "Give me one paragraph on penguins"}  # User message to initiate the conversation
    ],
    stream=True  # Enable streaming responses
)

# Capture the chat output in real-time
chat_output = ""
for chunk in chat_completion:
    if not chunk.choices:
        continue  # some streamed chunks carry no choices; indexing [0] would raise
    content = getattr(chunk.choices[0].delta, 'content', None)  # Safely get the content attribute
    if content:
        print(content, end='')  # Print without newline to maintain the flow
        chat_output += content

# API endpoint and headers for TTS
url = "https://api.openai.com/v1/audio/speech"
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}

# Data payload for the TTS request
data = {
    "model": "tts-1-hd",
    "voice": "onyx",
    "input": chat_output,  # Use chat output as input for TTS
    "response_format": "mp3"  # Use MP3 format for response
}

# Make the TTS request with streaming enabled; the timeout bounds how long the
# cell may wait on an unresponsive server.
response = requests.post(url, headers=headers, json=data, stream=True, timeout=60)

# Check if the TTS request was successful
if response.status_code == 200:
    buffer = BytesIO()

    # Save audio chunks to file and buffer
    with open(speech_file_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)
                buffer.write(chunk)

    buffer.seek(0)

    # Decode and play the audio from the buffer. The samples get their own name so
    # the request payload in `data` is not silently overwritten.
    audio_samples, samplerate = sf.read(buffer)
    sd.play(audio_samples, samplerate)
    sd.wait()  # Wait until the audio playback is done

    print(f"Audio saved to {speech_file_path}")
else:
    print(f"Error: {response.status_code} - {response.text}")


Assistant to Audio without End-to-end Streaming

In [None]:
# Set up OpenAI API key (also handed to text_to_speech for its raw HTTP call)
api_key = os.getenv("OPENAI_API_KEY")

# Initialize the OpenAI client with the key passed to the constructor. Setting an
# attribute on the OpenAI class itself would not configure the instance.
client = OpenAI(api_key=api_key)

class EventHandler(AssistantEventHandler):
    """Accumulates streamed assistant text and mirrors it live in the notebook cell."""

    def __init__(self):
        super().__init__()
        # Text fragments, kept in arrival order
        self.results = []

    @override
    def on_text_delta(self, delta, snapshot):
        """Store the newly streamed fragment, then redraw the cell output."""
        self.results.append(delta.value)
        self.update_output()

    def update_output(self):
        """Replace the cell output with a Markdown rendering of everything received so far."""
        clear_output(wait=True)
        display(Markdown(self.get_complete_response()))

    def get_complete_response(self):
        """Return all received fragments concatenated into a single string."""
        return "".join(self.results)

def text_to_speech(text, api_key):
    """Convert text to speech via the OpenAI speech endpoint and play the audio.

    Args:
        text: Text to synthesize. Empty or whitespace-only text is skipped
            without contacting the API.
        api_key: OpenAI API key, sent as the bearer token.

    Returns:
        None. On success the audio is played through sounddevice and the call
        blocks until playback finishes; on an HTTP error the status and body
        are printed.
    """
    if not text or not text.strip():
        print("No text to convert to speech.")
        return

    tts_url = "https://api.openai.com/v1/audio/speech"
    tts_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    tts_model = "tts-1-hd"
    voice = "onyx"

    payload = {
        "model": tts_model,
        "voice": voice,
        "input": text,
        "response_format": "mp3"
    }
    # The timeout stops an unresponsive server from blocking the notebook forever.
    response = requests.post(tts_url, headers=tts_headers, json=payload, stream=True, timeout=60)
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return

    # Collect the streamed MP3 into memory so it can be decoded in one go.
    buffer = BytesIO()
    for chunk in response.iter_content(chunk_size=1024):
        if chunk:
            buffer.write(chunk)
    buffer.seek(0)

    # Distinct name for the decoded samples; the request payload is left intact.
    samples, samplerate = sf.read(buffer)
    sd.play(samples, samplerate)
    sd.wait()

# Register an assistant (gpt-4o, temperature and top_p both set to 1)
assistant = client.beta.assistants.create(
    model="gpt-4o",
    instructions="You are a helpful assistant.",
    temperature=1,
    top_p=1,
)

# Open a thread that already contains the single user request
user_message = {
    "role": "user",
    "content": [{"type": "text", "text": "Give me one paragraph on penguins"}],
}
assistant_thread = client.beta.threads.create(messages=[user_message])

# Handler that gathers the streamed text and renders it as it arrives
event_handler = EventHandler()

# Run the assistant on the thread and consume the event stream to the end
with client.beta.threads.runs.stream(
    thread_id=assistant_thread.id,
    assistant_id=assistant.id,
    event_handler=event_handler,
) as stream:
    stream.until_done()

# Only once the full reply is available is it handed to text-to-speech
complete_response = event_handler.get_complete_response()
text_to_speech(complete_response, api_key)


### Generating an Audio File with Streaming

Using the openai api library approach

(doesn't work)

In [None]:
# NOTE(review): the notebook labels this variant "(doesn't work)"; it is kept as a
# demonstration. Presumably the problem is calling stream_to_file on a response
# that was not created through with_streaming_response — TODO confirm against the
# installed SDK version. The following cell shows the streaming-response form.

# Define the speech file path
speech_file_path = "./wonderfulday.mp3"

# Create the TTS (Text-to-Speech) request
response = client.audio.speech.create(
    model="tts-1",  # Specify the TTS model to use
    voice="alloy",  # Specify the voice to use for the TTS
    input="Today is a wonderful day to build something people love!"  # Input text to be converted to speech
)

# Save the response audio to a file using stream_to_file method
response.stream_to_file(speech_file_path)

# Print a message indicating where the audio was saved
print(f"Audio saved to {speech_file_path}")


Using the openai api library approach

(corrected version)

In [4]:
from openai import OpenAI
from pathlib import Path

# Target file for the synthesized speech
speech_file_path = "./wonderfulday.mp3"

# Open the TTS response as a stream and copy its bytes to disk as they arrive
with client.audio.speech.with_streaming_response.create(
    model="tts-1",
    voice="alloy",
    input="Today is a wonderful day to build something people love!"
) as response, open(speech_file_path, 'wb') as f:
    f.writelines(response.iter_bytes())

# Report where the file was written
print(f"Audio saved to {speech_file_path}")



Audio saved to ./wonderfulday.mp3


In [26]:


# Initialize OpenAI client with your API key
client = OpenAI()

# Function to stream and play audio in real-time
def stream_audio():
    """Request TTS audio as raw PCM and play it while it downloads.

    Each chunk read from the streaming response is written straight to the
    module-level PyAudio output `stream`, which must already be open.
    """
    # Create a TTS (Text-to-Speech) request
    with client.audio.speech.with_streaming_response.create(
        model="tts-1",                   # Specify the TTS model to use
        voice="alloy",                   # Specify the voice to use for TTS
        input="""
    The present German submarine warfare against commerce is a warfare against mankind.

    It is war against all nations.

    American ships have been sunk, American lives taken, in ways which it has stirred us very deeply to learn of, but the ships and people of other neutral and friendly nations have been sunk and overwhelmed in the waters in the same way. There has been no discrimination. The challenge is to all mankind.

    Each nation must decide for itself how it will meet it. The choice we make for ourselves must be made with a moderation of counsel and temperateness of judgment befitting our character and our motives as a nation. We must put excited feeling away. Our motive will not be revenge or the victorious assertion of the physical might of the nation, but only the vindication of right, of human right, of which we are only a single champion.
    """,  # Input text to be converted to speech
        response_format="pcm"            # Response format: PCM (Pulse Code Modulation)
    ) as response:
        # Iterate over the audio chunks in the response
        for chunk in response.iter_bytes(1024):  # Read 1024 bytes at a time
            stream.write(chunk)  # Write each chunk to the PyAudio stream for playback

# Initialize PyAudio, which provides bindings for PortAudio, a cross-platform audio library
p = pyaudio.PyAudio()
stream = None
try:
    # Open a stream with specific audio format parameters
    # (assumed to match the PCM returned by the API — confirm against the TTS docs)
    stream = p.open(format=pyaudio.paInt16,  # Format: 16-bit PCM (Pulse Code Modulation)
                    channels=1,              # Channels: 1 (Mono)
                    rate=24000,              # Sample rate: 24,000 Hz (samples per second)
                    output=True)             # Stream opened for output (playback)

    # Start streaming and playing the audio
    stream_audio()
finally:
    # Release the audio device even when the request or playback fails
    if stream is not None:
        stream.stop_stream()  # Stop the stream
        stream.close()        # Close the stream
    p.terminate()             # Terminate the PyAudio session


### Using Chat Completion and Assistant Output

Chat Completion to Audio with End-to-End Streaming

In [9]:
# Create the chat completion request.
# With stream=True the result is iterated chunk by chunk by stream_response below.
chat_completion = client.chat.completions.create(
    model="gpt-4o",  # Specify the model to use
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},  # System message to set the assistant's behavior
        {"role": "user", "content": "Give me a paragraph about the importance of space exploration."}  # User message to initiate the conversation
    ],
    stream=True  # Enable streaming responses
)

# Function to stream the response
def stream_response(chat_completion):
    """Render a streamed chat completion incrementally in the notebook.

    Args:
        chat_completion: Iterable of streamed chunks; the text of each chunk is
            read from `choices[0].delta.content`.

    The accumulated text is shown through one display handle that is updated in
    place. Chunks with no choices, or whose content is None, are skipped.
    """
    full_response = ""
    display_id = display(HTML(full_response), display_id=True)
    for chunk in chat_completion:
        if not chunk.choices:
            continue  # nothing to index; choices[0] would raise IndexError
        piece = chunk.choices[0].delta.content
        if piece is not None:
            full_response += piece
            display_id.update(HTML(full_response))

# Call the function to stream the response; this consumes the chat_completion
# iterator created above and updates the displayed text as chunks arrive.
stream_response(chat_completion)


In [11]:
# Shared state for the threaded text -> sentence -> audio -> playback pipeline
# defined in the functions below.

# Initialize OpenAI client
client = openai.OpenAI()

# Initialize PyAudio
p = pyaudio.PyAudio()

# Open a stream with specific audio format parameters
# (16-bit mono at 24000 Hz; the TTS requests below ask for response_format="pcm")
stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=24000,
                output=True)

# Create separate queues for text and audio chunks
text_queue = queue.Queue()      # text deltas from the chat stream
sentence_queue = queue.Queue()  # sentences waiting to be sent to TTS
audio_queue = queue.Queue()     # audio byte chunks waiting to be played

# Flags for process control: each stage sets its event when it has finished so
# the next stage knows it can stop once its input queue is empty.
text_generation_complete = threading.Event()
sentence_processing_complete = threading.Event()
audio_generation_complete = threading.Event()

def generate_and_display_text():
    """Stream a chat completion, show it live, and feed each text delta to `text_queue`.

    `text_generation_complete` is set on every exit path, including a failed
    request or a broken stream, so the sentence worker can terminate instead
    of polling an empty queue forever. Any exception still propagates.
    """
    try:
        chat_completion = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Give me a paragraph about the importance of space exploration."}
            ],
            stream=True
        )

        full_response = ""
        display_id = display(HTML(full_response), display_id=True)

        for chunk in chat_completion:
            if not chunk.choices:
                continue  # nothing to index; choices[0] would raise IndexError
            new_text = chunk.choices[0].delta.content
            if new_text:
                full_response += new_text
                display_id.update(HTML(full_response))
                text_queue.put(new_text)
    finally:
        text_generation_complete.set()

def process_sentences():
    """Assemble text deltas from `text_queue` into sentences on `sentence_queue`.

    A sentence is any run of characters ending in '.', '!' or '?'. Text after the
    last terminator stays buffered until more text arrives; whatever is left when
    text generation has finished is queued as a final sentence, unless it is only
    whitespace. `sentence_processing_complete` is set on every exit path so the
    audio worker can stop.
    """
    sentence_pattern = re.compile(r'[^.!?]+[.!?]')
    sentence_buffer = ""
    try:
        while not (text_generation_complete.is_set() and text_queue.empty()):
            try:
                new_text = text_queue.get(timeout=0.1)
            except queue.Empty:
                continue
            sentence_buffer += new_text
            for sentence in sentence_pattern.findall(sentence_buffer):
                sentence_queue.put(sentence.strip())
            # Drop everything up to the last terminator. DOTALL is required: the
            # sentence pattern can span newlines, and without the flag the text
            # on earlier lines would stay in the buffer and be queued again.
            sentence_buffer = re.sub(r'.*[.!?]', '', sentence_buffer, flags=re.DOTALL)

        # Flush the unterminated tail, but never queue an empty sentence.
        leftover = sentence_buffer.strip()
        if leftover:
            sentence_queue.put(leftover)
    finally:
        sentence_processing_complete.set()

def generate_audio():
    """Turn each queued sentence into PCM audio chunks on `audio_queue`.

    Sentences are taken from `sentence_queue`, sent to the TTS endpoint one at a
    time, and the streamed bytes are queued for playback followed by a short
    block of silence. `audio_generation_complete` is set on every exit path,
    including a failed TTS request, so the playback thread cannot wait forever.
    Any exception still propagates.
    """
    try:
        while not (sentence_processing_complete.is_set() and sentence_queue.empty()):
            try:
                sentence = sentence_queue.get(timeout=0.5)
            except queue.Empty:
                continue
            if not sentence:
                continue  # nothing to speak; presumably the API rejects empty input
            with client.audio.speech.with_streaming_response.create(
                model="tts-1",
                voice="alloy",
                input=sentence,
                response_format="pcm"
            ) as response:
                for audio_chunk in response.iter_bytes(1024):
                    audio_queue.put(audio_chunk)

            # Add a short pause between sentences
            audio_queue.put(b'\x00' * 4800)  # 0.1 seconds of silence at 24000 Hz
    finally:
        audio_generation_complete.set()

def play_audio():
    """Write queued audio chunks to the PyAudio output `stream`.

    Keeps draining `audio_queue` until `audio_generation_complete` is set and the
    queue is empty; an empty queue before that point is simply polled again.
    """
    while not (audio_generation_complete.is_set() and audio_queue.empty()):
        try:
            audio_chunk = audio_queue.get(timeout=0.5)
        except queue.Empty:
            continue
        stream.write(audio_chunk)
    

# One worker thread per pipeline stage
text_thread = threading.Thread(target=generate_and_display_text)
sentence_thread = threading.Thread(target=process_sentences)
audio_gen_thread = threading.Thread(target=generate_audio)
audio_play_thread = threading.Thread(target=play_audio)

# Launch the three producing stages first: text, sentence splitting, audio generation
for worker in (text_thread, sentence_thread, audio_gen_thread):
    worker.start()

# Wait a short moment before starting audio playback
time.sleep(1)
audio_play_thread.start()

# Block until every stage has finished
for worker in (text_thread, sentence_thread, audio_gen_thread, audio_play_thread):
    worker.join()

# Close the PyAudio stream properly
stream.stop_stream()
stream.close()
p.terminate()


Assistant to Audio with End-to-end Streaming

In [27]:

# Initialize the OpenAI client
# (used below to create the assistant, the thread, and the streamed run)
client = OpenAI()

class EventHandler(AssistantEventHandler):
    """Streams assistant text into the notebook and speaks it in timed batches.

    Text deltas are rendered as they arrive. Once `batch_interval` seconds have
    passed since the previous batch, the complete sentences received so far are
    sent to the speech endpoint and played. Text still pending when the stream
    ends is spoken from `on_end`.

    Args:
        batch_interval: Minimum number of seconds between two TTS batches.
    """

    def __init__(self, batch_interval=5):
        super().__init__()
        self.results = []  # Initialize an empty list to store the results
        self.tts_url = "https://api.openai.com/v1/audio/speech"
        # Read the key here rather than relying on a global left by another cell.
        self.tts_headers = {
            "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}",
            "Content-Type": "application/json"
        }
        self.tts_model = "tts-1-hd"
        self.voice = "onyx"
        self.buffer = BytesIO()  # Holds the most recently downloaded audio
        self.batch_interval = batch_interval  # Time interval for batching in seconds
        self.last_batch_time = time.time()
        self.pending_text = ""  # Text received but not yet sent to TTS

    @override
    def on_text_delta(self, delta, snapshot):
        """Handle the event when there is a text delta (partial text)."""
        # Append the delta value (partial text) to the results list
        text = delta.value
        self.results.append(text)
        self.pending_text += text
        # Call the method to update the Jupyter Notebook cell
        self.update_output()

        # Check if it's time to send a batch to TTS
        current_time = time.time()
        if current_time - self.last_batch_time >= self.batch_interval:
            self.batch_to_tts()
            self.last_batch_time = current_time

    @override
    def on_end(self):
        """Speak whatever text is still pending, then wait for playback to finish."""
        self.batch_to_tts(final=True)
        sd.wait()

    def update_output(self):
        """Update the Jupyter Notebook cell with the current markdown content."""
        # Clear the current output in the Jupyter Notebook cell
        clear_output(wait=True)
        # Join all the text fragments stored in results to form the complete markdown content
        markdown_content = "".join(self.results)
        # Display the markdown content in the Jupyter Notebook cell
        display(Markdown(markdown_content))

    def batch_to_tts(self, final=False):
        """Send pending text to the speech endpoint and start playing it.

        Args:
            final: When False, only text up to the last '.', '!' or '?' is sent,
                so a batch does not end in the middle of a sentence. When True,
                all pending text is sent.

        Playback is started without blocking so text keeps streaming while the
        audio plays. A new batch first waits for the previous one to finish,
        because starting a new playback would otherwise cut it off.
        """
        if final:
            cut = len(self.pending_text)
        else:
            # rfind returns -1 when no terminator is present, which gives cut == 0
            cut = max(self.pending_text.rfind(mark) for mark in ".!?") + 1
        batch = self.pending_text[:cut].strip()
        self.pending_text = self.pending_text[cut:]
        if not batch:
            return

        payload = {
            "model": self.tts_model,
            "voice": self.voice,
            "input": batch,
            "response_format": "mp3"
        }
        response = requests.post(self.tts_url, headers=self.tts_headers, json=payload, timeout=60)
        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            return

        self.buffer = BytesIO(response.content)
        samples, samplerate = sf.read(self.buffer)
        sd.wait()  # let the previous batch finish before starting the next
        sd.play(samples, samplerate)

    

# Register an assistant (gpt-4o, temperature and top_p both set to 1)
assistant = client.beta.assistants.create(
    model="gpt-4o",
    instructions="You are a helpful assistant.",
    temperature=1,
    top_p=1,
)

# Open a thread that already contains the single user request
user_message = {
    "role": "user",
    "content": [{"type": "text", "text": "Give me one paragraph on penguins"}],
}
assistant_thread = client.beta.threads.create(messages=[user_message])

# Handler instance; batch_interval is the number of seconds between TTS batches
event_handler = EventHandler(batch_interval=5)

# Run the assistant on the thread and consume the event stream to the end
with client.beta.threads.runs.stream(
    thread_id=assistant_thread.id,
    assistant_id=assistant.id,
    event_handler=event_handler,
) as stream:
    stream.until_done()



Penguins are fascinating flightless birds primarily found in the Southern Hemisphere, with a significant population in Antarctica. These unique creatures are well-adapted to their cold environments with their distinctive black and white plumage and a thick layer of blubber. Penguins are exceptional swimmers, able to dive to great depths and swim at impressive speeds to catch their prey, which mainly consists of fish, squid, and krill. They exhibit remarkable social behaviors, often huddling together for warmth and engaging in elaborate mating rituals. Despite their seemingly clumsy waddles on land, penguins are agile and graceful in the water, showcasing nature's incredible adaptability.

Process completed.


In [29]:
import time
import threading
import queue
import pyaudio
import re
from openai import OpenAI
from IPython.display import clear_output, display, Markdown

# Initialize the OpenAI client
# (the EventHandler below uses this module-level client for its TTS requests)
client = OpenAI()

class EventHandler(AssistantEventHandler):
    """Streams assistant text to the notebook and speaks it sentence by sentence.

    Three activities run concurrently:
      * the SDK delivers text deltas to `on_text_delta`, which renders them and
        splits them into sentences on `sentence_queue`;
      * `process_sentences` (worker thread) converts each sentence to PCM audio
        chunks on `audio_queue`;
      * `play_audio` (worker thread) writes those chunks to the PyAudio stream.

    `text_generation_complete` tells the TTS worker that no more sentences will
    arrive. `audio_generation_complete` tells the player that no more audio will
    arrive, so playback cannot stop while a TTS request is still in flight.
    """

    def __init__(self):
        super().__init__()
        self.results = []
        self.text_buffer = ""
        self.sentence_queue = queue.Queue()
        self.audio_queue = queue.Queue()
        self.text_generation_complete = threading.Event()
        self.audio_generation_complete = threading.Event()
        self._ended = False  # makes on_end safe to call more than once

        # Initialize PyAudio
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(format=pyaudio.paInt16,
            channels=1,
            rate=24000,
            output=True)

        # Start audio processing and playback threads
        self.audio_processing_thread = threading.Thread(target=self.process_sentences)
        self.audio_processing_thread.start()
        self.audio_playback_thread = threading.Thread(target=self.play_audio)
        self.audio_playback_thread.start()

    @override
    def on_text_delta(self, delta, snapshot):
        """Record a streamed text fragment, queue any completed sentences, redraw."""
        text = delta.value
        self.results.append(text)
        self.text_buffer += text
        self.process_text_buffer()
        self.update_output()

    def process_text_buffer(self):
        """Move every complete sentence from `text_buffer` to `sentence_queue`."""
        sentences = re.findall(r'[^.!?]+[.!?]', self.text_buffer)
        for sentence in sentences:
            self.sentence_queue.put(sentence.strip())
        # Drop everything up to the last terminator. DOTALL is required: the
        # sentence pattern can span newlines, and without the flag the text on
        # earlier lines would stay buffered and be spoken a second time.
        self.text_buffer = re.sub(r'.*[.!?]', '', self.text_buffer, flags=re.DOTALL)

    def update_output(self):
        """Replace the cell output with the Markdown rendering of all text so far."""
        clear_output(wait=True)
        markdown_content = "".join(self.results)
        display(Markdown(markdown_content))

    def process_sentences(self):
        """Worker: synthesize queued sentences into PCM chunks on `audio_queue`."""
        try:
            while not self.text_generation_complete.is_set() or not self.sentence_queue.empty():
                try:
                    sentence = self.sentence_queue.get(timeout=0.1)
                except queue.Empty:
                    continue
                if not sentence:
                    continue  # nothing to speak
                with client.audio.speech.with_streaming_response.create(
                    model="tts-1",
                    voice="onyx",
                    input=sentence,
                    response_format="pcm"
                ) as response:
                    for audio_chunk in response.iter_bytes(1024):
                        self.audio_queue.put(audio_chunk)
                # Add a short pause between sentences
                self.audio_queue.put(b'\x00' * 2400)  # 0.05 seconds of silence at 24000 Hz
        finally:
            # Always signal the player, even after a failed TTS request, so it
            # can drain the queue and exit instead of waiting forever.
            self.audio_generation_complete.set()

    def play_audio(self):
        """Worker: play queued audio until generation is finished and the queue is empty."""
        try:
            while not self.audio_generation_complete.is_set() or not self.audio_queue.empty():
                try:
                    audio_chunk = self.audio_queue.get(timeout=0.1)
                except queue.Empty:
                    continue
                self.stream.write(audio_chunk)
        finally:
            self.stream.stop_stream()
            self.stream.close()
            self.p.terminate()

    @override
    def on_end(self):
        """Flush remaining text and wait for the audio threads to finish.

        Calls after the first are no-ops, so invoking this both from the stream
        machinery and explicitly from the notebook does not queue text twice.
        """
        if self._ended:
            return
        self._ended = True

        # Process any remaining text in the buffer
        self.process_text_buffer()
        leftover = self.text_buffer.strip()
        self.text_buffer = ""
        if leftover:  # never queue an empty sentence
            self.sentence_queue.put(leftover)

        self.text_generation_complete.set()
        self.audio_processing_thread.join()
        self.audio_playback_thread.join()

# Register an assistant (gpt-4o, temperature and top_p both set to 1)
assistant = client.beta.assistants.create(
    model="gpt-4o",
    instructions="You are a helpful assistant.",
    temperature=1,
    top_p=1,
)

# Open a thread that already contains the single user request
user_message = {
    "role": "user",
    "content": [{"type": "text", "text": "Give me one paragraph on penguins"}],
}
assistant_thread = client.beta.threads.create(messages=[user_message])

# Handler instance; constructing it opens the audio output and starts its worker threads
event_handler = EventHandler()

# Run the assistant on the thread and consume the event stream to the end
with client.beta.threads.runs.stream(
    thread_id=assistant_thread.id,
    assistant_id=assistant.id,
    event_handler=event_handler,
) as stream:
    stream.until_done()

# Finalize the handler: flush any unspoken text and wait for the audio threads
event_handler.on_end()


Penguins are a unique group of flightless birds known for their distinctive black and white plumage and their incredible adaptation to life in the ocean. Predominantly found in the Southern Hemisphere, with the majority residing in Antarctica, these birds have evolved to withstand extreme cold with their thick layers of blubber and tightly packed feathers. Penguins are also exceptional swimmers, using their flipper-like wings to propel themselves through the water with remarkable agility in pursuit of fish, krill, and other sea creatures. On land, they exhibit a range of fascinating behaviors, such as intricate courtship rituals and communal nesting practices, making them one of the most captivating avian species.

Process completed.
