In [3]:
import sounddevice as sd
import numpy as np
from scipy.io.wavfile import write
from pydub import AudioSegment

def record_audio(duration, filename):
    # Record audio for the given duration in seconds
    samplerate = 44100  # Sample rate in Hz
    print("Recording audio...")
    recording = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=2, dtype='int16')
    sd.wait()  # Wait until the recording is finished
    print("Recording complete")

    # Save as WAV file
    write(f"{filename}.wav", samplerate, recording)

    # Convert to MP3
    audio = AudioSegment.from_wav(f"{filename}.wav")
    audio.export(f"{filename}.mp3", format="mp3")
    print(f"Audio saved as {filename}.mp3")

# Example usage
record_audio(10, "user_input")  # Record for 10 seconds

Recording audio...
Recording complete


FileNotFoundError: [Errno 2] No such file or directory: 'ffmpeg'

In [9]:
import boto3
import time

transcribe = boto3.client('transcribe')

def transcribe_audio(filename):
    job_name = filename
    job_uri = f"s3://text-to-speech-bedrock-data/{filename}.mp3"
    
    transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': job_uri},
        MediaFormat='mp3',
        LanguageCode='en-US'
    )
    
    while True:
        status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
        if status['TranscriptionJob']['TranscriptionJobStatus'] in ['COMPLETED', 'FAILED']:
            break
        print("Waiting for transcription...")
        print(status['TranscriptionJob']['TranscriptionJobStatus'])
        time.sleep(10)
    
    if status['TranscriptionJob']['TranscriptionJobStatus'] == 'COMPLETED':
        transcript_uri = status['TranscriptionJob']['Transcript']['TranscriptFileUri']
        print(f"Transcript URL: {transcript_uri}")
        return transcript_uri

# Example usage
transcript_uri = transcribe_audio("audio_file")


Waiting for transcription...
IN_PROGRESS
Waiting for transcription...
IN_PROGRESS
Waiting for transcription...
IN_PROGRESS
Waiting for transcription...
IN_PROGRESS
Waiting for transcription...
IN_PROGRESS
Transcript URL: https://s3.us-east-1.amazonaws.com/aws-transcribe-us-east-1-prod/471112626564/audio_file/8905c293-fef0-45ec-bd6d-01a94171384d/asrOutput.json?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEEAaCXVzLWVhc3QtMSJGMEQCIDbdp8SzZMrqqCYuGGFMugnGGwi44ZY0s0Q65L0yhoTCAiA5zF6UlVsW%2F9UY3xcXESFRBfcEHV%2F4j0tiPEJ6%2FpDjqCqxBQgpEAQaDDI3NjY1NjQzMzE1MyIM7FEUlrNK2H3KDdilKo4FXTjD62soItNmoT5QPprT37PqBxLb7XexsxNgNcrP3kH8s6dp1iLAaEpnSdie5ll7A%2Fef7MoXSzgJtAQG1gTCKgMJbqbCORtp4dIRNowARyf0gAnFjwmgRPJB7ahnP3fQPkI6HRlbpvByt7011rd1bK7Ze6ASXJ%2BUgAj8RP4apvJTbuULAS%2Fh0DG7%2B5f29AATXLUtg2cWdb1VMwVT6CM8%2B6MfkAqeG18k%2FjUook5TGteukblOeY1LGwHGXeGBwNra9SIZFegECZiHMU4uIzEMmLSexgzWD5w7wPbZYAbV0VL%2B186Ocm9%2BLMdSNOSlnGQkYg71ibtjJfVK7YISy7T%2Bj2wQc8nsu%2BX8cv%2FAM03CwZzZ8JaP4TnqYboc%2FEe4LH9gK8gkRUJsZdjB2i7yEFSp%2F

In [1]:
from api_request_schema import api_request_list
import os

model_ids = os.getenv('MODEL_ID', 'amazon.titan-text-lite-v1')
aws_region = os.getenv('AWS_REGION', 'us-east-1')

api_request = api_request_list[model_ids]

print(api_request['modelId'])

# Configuration settings
config = {
    'log_level': 'info',  # One of: info, debug, none
    'last_speech': f"My name is Stephen and I am an AI powered robot talking to you. I am using the model: {api_request['modelId']}",
    'region': aws_region,
    'polly': {
        'Engine': 'neural',
        'LanguageCode': 'en-US',
        'VoiceId': 'Stephen',
        'OutputFormat': 'pcm',
    },
    'translate': {
        'SourceLanguageCode': 'en',
        'TargetLanguageCode': 'en',
    },
    'bedrock': {
        'response_streaming': True,
        'api_request': api_request
    }
}

amazon.titan-text-lite-v1


In [2]:
# Utility function for logging
def printer(text, level):
    if config['log_level'] == 'info' and level == 'info':
        print(text)
    elif config['log_level'] == 'debug' and level in ['info', 'debug']:
        print(text)

In [3]:
import json

# Wrapper class for Bedrock models
class BedrockModelsWrapper:

    @staticmethod
    def define_body(text):
        model_id = config['bedrock']['api_request']['modelId']
        model_provider = model_id.split('.')[0]
        body = config['bedrock']['api_request']['body']

        if model_provider == 'amazon':
            body['inputText'] = text
        elif model_provider == 'meta':
            body['prompt'] = text
        elif model_provider == 'anthropic':
            body['prompt'] = f'\n\nHuman: {text}\n\nAssistant:'
        elif model_provider == 'cohere':
            body['prompt'] = text
        else:
            raise Exception('Unknown model provider.')

        return body

    @staticmethod
    def get_stream_chunk(event):
        return event.get('chunk')

    @staticmethod
    def get_stream_text(chunk):
        model_id = config['bedrock']['api_request']['modelId']
        model_provider = model_id.split('.')[0]

        chunk_obj = ''
        text = ''
        if model_provider == 'amazon':
            chunk_obj = json.loads(chunk.get('bytes').decode())
            text = chunk_obj['outputText']
        elif model_provider == 'meta':
            chunk_obj = json.loads(chunk.get('bytes').decode())
            text = chunk_obj['generation']
        elif model_provider == 'anthropic':
            chunk_obj = json.loads(chunk.get('bytes').decode())
            text = chunk_obj['completion']
        elif model_provider == 'cohere':
            chunk_obj = json.loads(chunk.get('bytes').decode())
            text = ' '.join([c["text"] for c in chunk_obj['generations']])
        else:
            raise NotImplementedError('Unknown model provider.')

        printer(f'[DEBUG] {chunk_obj}', 'debug')
        return text

In [4]:
# Configuration settings
config = {
    'log_level': 'info',  # One of: info, debug, none
    'last_speech': f"My name is Stephen and I am an AI powered robot talking to you. I am using the model: {api_request['modelId']}",
    'region': aws_region,
    'polly': {
        'Engine': 'neural',
        'LanguageCode': 'en-US',
        'VoiceId': 'Stephen',
        'OutputFormat': 'pcm',
    },
    'translate': {
        'SourceLanguageCode': 'en',
        'TargetLanguageCode': 'en',
    },
    'bedrock': {
        'response_streaming': True,
        'api_request': api_request
    }
}

In [5]:
def to_audio_generator(bedrock_stream):
    prefix = ''

    if bedrock_stream:
        for event in bedrock_stream:
            chunk = BedrockModelsWrapper.get_stream_chunk(event)
            if chunk:
                text = BedrockModelsWrapper.get_stream_text(chunk)

                if '.' in text:
                    a = text.split('.')[:-1]
                    to_polly = ''.join([prefix, '.'.join(a), '. '])
                    prefix = text.split('.')[-1]
                    print(to_polly, flush=True, end='')
                    yield to_polly
                else:
                    prefix = ''.join([prefix, text])

        if prefix != '':
            print(prefix, flush=True, end='')
            yield f'{prefix}.'

        print('\n')

In [11]:
import requests
import boto3

bedrock_runtime = boto3.client(service_name='bedrock-runtime', region_name=config['region'])

def get_transcript_text(transcript_uri):
    response = requests.get(transcript_uri)
    transcript_json = response.json()
    return transcript_json['results']['transcripts'][0]['transcript']

def invoke_bedrock(text):
    body = BedrockModelsWrapper.define_body(text)
    response = bedrock_runtime.invoke_model_with_response_stream(
        body=json.dumps(body),
        modelId=config['bedrock']['api_request']['modelId'],
        accept=config['bedrock']['api_request']['accept'],
        contentType=config['bedrock']['api_request']['contentType']
    )
    bedrock_stream = response['body']
    return to_audio_generator(bedrock_stream)

# Example usage
transcript_text = get_transcript_text(transcript_uri)
response_audio_gen = invoke_bedrock(transcript_text)


In [12]:
import pyaudio, sys

p = pyaudio.PyAudio()

class UserInputManager:
    shutdown_executor = False
    executor = None

    @staticmethod
    def set_executor(executor):
        UserInputManager.executor = executor

    @staticmethod
    def start_shutdown_executor():
        UserInputManager.shutdown_executor = False
        raise Exception()  # Workaround to shutdown exec, as executor.shutdown() doesn't work as expected.

    @staticmethod
    def start_user_input_loop():
        while True:
            sys.stdin.readline().strip()
            printer(f'[DEBUG] User input to shut down executor...', 'debug')
            UserInputManager.shutdown_executor = True

    @staticmethod
    def is_executor_set():
        return UserInputManager.executor is not None

    @staticmethod
    def is_shutdown_scheduled():
        return UserInputManager.shutdown_executor

class Reader:

    def __init__(self):
        self.polly = boto3.client('polly', region_name=config['region'])
        self.audio = p.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True)
        self.chunk = 1024

    def read(self, data):
        response = self.polly.synthesize_speech(
            Text=data,
            Engine=config['polly']['Engine'],
            LanguageCode=config['polly']['LanguageCode'],
            VoiceId=config['polly']['VoiceId'],
            OutputFormat=config['polly']['OutputFormat'],
        )

        stream = response['AudioStream']

        while True:
            # Check if user signaled to shutdown Bedrock speech
            # UserInputManager.start_shutdown_executor() will raise Exception. If not ideas but is functional.
            if UserInputManager.is_executor_set() and UserInputManager.is_shutdown_scheduled():
                UserInputManager.start_shutdown_executor()

            data = stream.read(self.chunk)
            self.audio.write(data)
            if not data:
                break

    def close(self):
        time.sleep(1)
        self.audio.stop_stream()
        self.audio.close()

In [14]:
def read_bedrock_response(response_audio_gen):
    reader = Reader()
    for audio in response_audio_gen:
        reader.read(audio)
    reader.close()

# Example usage
print(response_audio_gen)
read_bedrock_response(response_audio_gen)


<generator object to_audio_generator at 0x1091bda20>
