In [26]:
from pydub import AudioSegment
from dotenv import load_dotenv
import io, os, wave
import assemblyai as aai

In [27]:
load_dotenv()

True

In [45]:
# Function to convert MP3/M4A to PCM S16LE format
def convert_audio_to_pcm_s16le(file_path):
    print(f"Converting {file_path} to PCM S16LE format...")
    audio = AudioSegment.from_file(file_path)
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)  # 16-bit, mono, 16kHz
    buffer = io.BytesIO()
    audio.export(buffer, format="raw")
    print("Conversion completed.")
    return buffer.getvalue()

In [28]:
# Function to convert MP3/M4A to PCM MULAW format
def convert_audio_to_mulaw(file_path):
    print(f"Converting {file_path} to PCM MULAW format...")
    audio = AudioSegment.from_file(file_path)
    audio = audio.set_frame_rate(8000).set_channels(1).set_sample_width(2)
    buffer = io.BytesIO()
    audio.export(buffer, format="wav")
    print("Conversion completed.")
    return buffer.getvalue()

In [54]:
def verify_audio_pcm_s16le(audio_data, expected_sample_rate=16000, expected_channels=1, expected_sample_width=2):
    """
    Verify the properties of raw PCM S16LE audio data.
    """
    print(f"Verifying raw PCM data: {len(audio_data)} bytes")
    sample_width_bytes = expected_sample_width  # 2 bytes for 16-bit audio
    num_samples = len(audio_data) // (sample_width_bytes * expected_channels)
    duration_seconds = num_samples / expected_sample_rate

    print(f"Audio duration: {duration_seconds:.2f} seconds")
    assert num_samples > 0, "Audio data contains no samples"
    print("Raw PCM data verification passed.")


In [48]:
# Function to verify audio format
def verify_audio_mulaw(audio_data):
    with io.BytesIO(audio_data) as audio_file:
        with wave.open(audio_file, "rb") as wf:
            print(f"Audio format: {wf.getnchannels()} channel(s), {wf.getframerate()} Hz, {wf.getsampwidth()} byte(s) per sample")
            assert wf.getnchannels() == 1, "Audio is not mono"
            assert wf.getframerate() == 8000, "Sample rate is not 8kHz"
            assert wf.getsampwidth() == 2, "Sample width is not 16-bit PCM"

In [49]:
api_key = os.getenv('ASSEMBLYAI_API_KEY')
if not api_key:
    raise ValueError("ASSEMBLYAI_API_KEY not set in .env file")
aai.settings.api_key = api_key

In [51]:
def stream_audio_realtime(audio_data, sample_rate:int = 8000, mulaw_encoding:bool = True):
    """
    Stream audio data in chunks to AssemblyAI for realtime transcription.
    """
    final_transcripts = []
    session_opened = False

    if mulaw_encoding:
        encoding = aai.AudioEncoding.pcm_mulaw
    else:
        encoding = aai.AudioEncoding.pcm_s16le

    # Define callback functions
    def on_open(session_opened_event: aai.RealtimeSessionOpened):
        nonlocal session_opened
        session_opened = True
        print("Realtime session opened:", session_opened_event.session_id)

    def on_data(transcript: aai.RealtimeTranscript):
        if isinstance(transcript, aai.RealtimeFinalTranscript):
            final_transcripts.append(transcript.text)
            print("Final:", transcript.text)
        else:
            print("Partial:", transcript.text)

    def on_error(error: aai.RealtimeError):
        print("Realtime transcription error:", error)

    def on_close():
        print("Realtime transcription session closed.")

    # Create realtime transcriber
    transcriber = aai.RealtimeTranscriber(
        sample_rate=sample_rate,
        on_open=on_open,
        on_data=on_data,
        on_error=on_error,
        on_close=on_close,
        encoding=encoding
    )
    transcriber.connect()

    # Wait for the session to be opened
    print("Waiting for the session to open...")
    while not session_opened:
        pass  # Busy wait; ideally use threading.Event

    # Define a generator for audio chunks
    def audio_generator():
        chunk_size = 3200  # 200ms of audio at 8kHz
        for i in range(0, len(audio_data), chunk_size):
            chunk = audio_data[i:i + chunk_size]
            print(f"Yielding chunk {i // chunk_size + 1}, size={len(chunk)} bytes")
            yield chunk

    # Stream the audio chunks
    print("Streaming audio chunks...")
    try:
        transcriber.stream(audio_generator())
    except Exception as e:
        print("Error during streaming:", e)
    finally:
        transcriber.close()

    return "\n".join(final_transcripts)


# testing Mew Law

In [52]:
# Test the Methods
audio_path = "Untitled.m4a"  # Replace with your audio file path
try:
    audio_data = convert_audio_to_mulaw(audio_path)
    verify_audio_mulaw(audio_data)
except Exception as e:
    print("Error during audio conversion or verification:", e)
    raise

Converting Untitled.m4a to PCM MULAW format...
Conversion completed.
Audio format: 1 channel(s), 8000 Hz, 2 byte(s) per sample


In [44]:
# Realtime Transcription Test
print("\n---- Realtime Transcription ----")
try:
    realtime_transcription = stream_audio_realtime(audio_data)
    print("Final Realtime Transcription Result:", realtime_transcription)
except Exception as e:
    print("Error during transcription:", e)


---- Realtime Transcription ----
Waiting for the session to open...
Realtime session opened: 684a4a77-7adf-49cf-a81b-5ea6b0d5a1c7
Streaming audio chunks...
Yielding chunk 1, size=3200 bytes
Yielding chunk 2, size=3200 bytes
Yielding chunk 3, size=3200 bytes
Yielding chunk 4, size=3200 bytes
Yielding chunk 5, size=3200 bytes
Yielding chunk 6, size=3200 bytes
Yielding chunk 7, size=3200 bytes
Yielding chunk 8, size=3200 bytes
Yielding chunk 9, size=3200 bytes
Yielding chunk 10, size=3200 bytes
Yielding chunk 11, size=3200 bytes
Yielding chunk 12, size=3052 bytes
Partial: 
Partial: 
Partial: 
Partial: 
Partial: 
Partial: 
Partial: 
Partial: 
Partial: 
Partial: 
Partial: 
Partial: 
Partial: 
Partial: 
Partial: 
Partial: 
Partial: 
Partial: 
Realtime transcription session closed.
Final Realtime Transcription Result: 


# testing s16le

In [55]:
# Test the Methods
audio_path = "Untitled.m4a"  # Replace with your audio file path
try:
    audio_data = convert_audio_to_pcm_s16le(audio_path)
    verify_audio_pcm_s16le(audio_data)
except Exception as e:
    print("Error during audio conversion or verification:", e)
    raise

Converting Untitled.m4a to PCM S16LE format...
Conversion completed.
Verifying raw PCM data: 76416 bytes
Audio duration: 2.39 seconds
Raw PCM data verification passed.


In [56]:
# Realtime Transcription Test
print("\n---- Realtime Transcription ----")
try:
    realtime_transcription = stream_audio_realtime(audio_data, sample_rate=16000, mulaw_encoding=False)
    print("Final Realtime Transcription Result:", realtime_transcription)
except Exception as e:
    print("Error during transcription:", e)


---- Realtime Transcription ----
Waiting for the session to open...
Realtime session opened: 2c8b2231-c7d0-4b44-8e96-ed68c0556755
Streaming audio chunks...
Yielding chunk 1, size=3200 bytes
Yielding chunk 2, size=3200 bytes
Yielding chunk 3, size=3200 bytes
Yielding chunk 4, size=3200 bytes
Yielding chunk 5, size=3200 bytes
Yielding chunk 6, size=3200 bytes
Yielding chunk 7, size=3200 bytes
Yielding chunk 8, size=3200 bytes
Yielding chunk 9, size=3200 bytes
Yielding chunk 10, size=3200 bytes
Yielding chunk 11, size=3200 bytes
Yielding chunk 12, size=3200 bytes
Yielding chunk 13, size=3200 bytes
Yielding chunk 14, size=3200 bytes
Yielding chunk 15, size=3200 bytes
Yielding chunk 16, size=3200 bytes
Yielding chunk 17, size=3200 bytes
Yielding chunk 18, size=3200 bytes
Yielding chunk 19, size=3200 bytes
Yielding chunk 20, size=3200 bytes
Yielding chunk 21, size=3200 bytes
Yielding chunk 22, size=3200 bytes
Yielding chunk 23, size=3200 bytes
Yielding chunk 24, size=2816 bytes
Partial: 
Pa