In [None]:
# Whisper checkpoint names, smallest to largest, with their distilled variants.
# Presumably the valid values for `transcribe_audio(model=...)`; neither list is
# referenced by the other visible cells.
models = [
    "tiny",
    "small",
    "distil-small.en",
    "base",
    "medium",
    "distil-medium.en",
    "large",
    "large-v2",
    "distil-large-v2",
    "large-v3",
    "distil-large-v3",
]

# Quantization choices; None means "no quantization".
quantizations = [
    None,
    "4bit",
    "8bit",
]

In [None]:
from dotenv import load_dotenv
from groq import Groq

# Load the .env file *before* building any API client: Groq() looks for
# GROQ_API_KEY in the environment at construction time, so creating the client
# first fails on a fresh kernel when the key only lives in .env.
load_dotenv()
groq_client = Groq()

from lightning_whisper_mlx import LightningWhisperMLX
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from elevenlabs.client import ElevenLabs
from elevenlabs import stream as audio_stream
from langchain_groq import ChatGroq

def transcribe_audio(audio_path, model="large-v3", batch_size=12, quant=None, language="nl"):
    """Transcribe an audio file locally with LightningWhisperMLX.

    Args:
        audio_path: Path of the audio file to transcribe.
        model: Whisper checkpoint name handed to LightningWhisperMLX.
        batch_size: Batch size handed to LightningWhisperMLX.
        quant: Quantization setting handed to LightningWhisperMLX (None for none).
        language: Language code passed to the transcription call.

    Returns:
        The value stored under the ``'text'`` key of the transcription result.
    """
    # A new engine is constructed on every call, exactly as the arguments dictate.
    engine = LightningWhisperMLX(model=model, batch_size=batch_size, quant=quant)
    result = engine.transcribe(audio_path=audio_path, language=language)
    return result['text']



def transcribe_audio_groq(audio_path, model="whisper-large-v3", language="nl", temperature=0.0):
    """Transcribe an audio file through the Groq transcription endpoint.

    Args:
        audio_path: Path of the audio file to upload.
        model: Transcription model name sent to Groq.
        language: Language code of the spoken audio (defaults to Dutch, matching
            ``transcribe_audio``).
        temperature: Sampling temperature sent with the request.

    Returns:
        The ``text`` attribute of the transcription response.

    Raises:
        OSError: If ``audio_path`` cannot be opened for reading.
    """
    # Read the bytes up front so the file handle is not held open for the
    # duration of the network request.
    with open(audio_path, "rb") as audio_file:
        audio_bytes = audio_file.read()

    transcription = groq_client.audio.transcriptions.create(
        file=(audio_path, audio_bytes),  # (filename, content) pair
        model=model,
        language=language,
        temperature=temperature,
    )
    return transcription.text

# Settings for the single ChatGroq instance shared by generate_response().
_llm_settings = {
    "model": "llama3-70b-8192",
    "temperature": 0,
    "max_tokens": 225,  # upper bound on generated tokens per reply
    "timeout": None,
    "max_retries": 2,
}
llm = ChatGroq(**_llm_settings)
def generate_response(input_messages):
    """Ask the shared chat model for the next reply in a conversation.

    Args:
        input_messages: Conversation history as a sequence of
            ``(role, content)`` tuples (roles used in this notebook:
            "system", "human", "ai").

    Returns:
        The model's reply as a plain string.
    """
    # Send the history straight to the model instead of routing it through
    # ChatPromptTemplate: the template treats message content as a format
    # string, so any "{" or "}" in a transcript or an earlier model reply would
    # be parsed as a template variable and invoke({}) would fail.
    chain = llm | StrOutputParser()
    # Pass a copy so the caller's history list is never shared with the chain.
    return chain.invoke(list(input_messages))

In [None]:
import pyaudio
import wave
import os
from pydub import AudioSegment
import numpy as np  # Added for numerical operations
from elevenlabs import stream
from langchain_core.messages import HumanMessage, SystemMessage
import openai
import time  # Added for timing
from groq import Groq

# Conversation history as (role, content) tuples. It starts with the Dutch
# system prompt for the call-center agent and an opening "human" line; the loop
# below passes it to generate_response() and appends an ("ai", ...) and a
# ("human", ...) entry on every turn.
messages = [
        ("system", """
        Je bent een AI Call Center agent genaamd Freddy. Die werkt voor het bedrijf AI-Solar. 
        De bedoeling is dat je mensen opbelt die op de website hebben ingevuld dat ze interesse hebben in een afspraak met een expert.
        - Als eerste vraag vraag je aan de klant of ze willen weten wat de voordelen van zonnepanelen zijn. 
        - Je legt de voordelen van zonnepanelen uit en antwoordt op eventuele bemerkingen en vragen. 
        - Je antwoordt enkel op vragen die over het onderwerp zonnepanelen gaan. 
        - De bedoeling is om de postcode van deze mensen te vragen en een afspraak met een expert in te plannen. 
        - De expert kan enkel op dinsdag tussen 14 en 17 uur, woensdag tussen 10 en 11, of vrijdag tussen 9 en 14 uur. 
        - Maak je antwoorden altijd korter dan 600 karakters. 
        - Schrijf ALLE getallen voluit. Zoals vijfhonderdzestig, zeshonderd, etc.
         """),
         ("human", "Hallo, u spreekt met Ben Selleslagh."),
]


# --- Audio capture settings (constant, so defined once instead of per turn) ---
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024
WAVE_OUTPUT_FILENAME = "output.wav"
MP3_OUTPUT_FILENAME = "output.mp3"

SILENCE_THRESHOLD = 1000  # Mean absolute amplitude below this counts as silence; tune per microphone
MAX_SILENCE_SECONDS = 1  # Stop recording after this much continuous silence once speech was heard
MAX_WAIT_FOR_SPEECH_SECONDS = 10  # Stop anyway if the caller never starts talking


def _speak(tts_client, text):
    """Synthesise `text` with ElevenLabs and play the audio as it streams in."""
    voice = tts_client.generate(
        text=text,
        voice="HansC",
        model="eleven_turbo_v2_5",
        stream=True,
        optimize_streaming_latency=1,
    )
    stream(voice)


def _record_until_silence():
    """Record microphone audio until the speaker goes quiet.

    The silence timer only ends the recording after at least one loud chunk
    was captured; before that, the (longer) MAX_WAIT_FOR_SPEECH_SECONDS limit
    applies. Otherwise a caller who needs more than MAX_SILENCE_SECONDS to
    start answering would always produce a silent recording.

    Returns:
        A list of raw 16-bit PCM byte chunks, in capture order.
    """
    audio = pyaudio.PyAudio()
    input_stream = None
    frames = []
    chunk_duration = CHUNK / RATE  # Duration of each chunk in seconds
    silent_chunks = 0
    speech_detected = False

    try:
        input_stream = audio.open(format=FORMAT, channels=CHANNELS,
                                  rate=RATE, input=True,
                                  frames_per_buffer=CHUNK)
        while True:
            # Dropping samples on an input overflow is preferable to aborting the call.
            data = input_stream.read(CHUNK, exception_on_overflow=False)
            frames.append(data)

            # Widen to int32 before abs(): abs(-32768) overflows in int16.
            samples = np.frombuffer(data, dtype=np.int16).astype(np.int32)
            amplitude = np.abs(samples).mean()

            if amplitude < SILENCE_THRESHOLD:
                silent_chunks += 1
            else:
                silent_chunks = 0
                speech_detected = True

            silence_limit = MAX_SILENCE_SECONDS if speech_detected else MAX_WAIT_FOR_SPEECH_SECONDS
            if silent_chunks * chunk_duration >= silence_limit:
                break
    finally:
        # Always release the audio device, even if opening or reading failed.
        if input_stream is not None:
            input_stream.stop_stream()
            input_stream.close()
        audio.terminate()

    return frames


def _save_as_mp3(frames):
    """Write `frames` to a temporary WAV file and convert it to MP3_OUTPUT_FILENAME."""
    try:
        with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            # Module-level helper: does not need a live PyAudio instance.
            wf.setsampwidth(pyaudio.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))

        sound = AudioSegment.from_wav(WAVE_OUTPUT_FILENAME)
        sound.export(MP3_OUTPUT_FILENAME, format="mp3")
    finally:
        # Remove the temporary WAV file whether or not the conversion succeeded.
        if os.path.exists(WAVE_OUTPUT_FILENAME):
            os.remove(WAVE_OUTPUT_FILENAME)


# One client for the whole conversation instead of a new one per turn.
tts_client = ElevenLabs()

# Each turn: generate a reply, speak it, record the caller, transcribe, repeat.
# Interrupt the kernel to end the conversation.
try:
    while True:
        start_time = time.time()  # Start timing

        response = generate_response(messages)

        response_time = time.time()  # Time after generating response
        print(f"Time to generate response: {response_time - start_time:.2f} seconds")

        _speak(tts_client, response)

        voice_time = time.time()  # Time after generating (and playing) voice
        print(f"Time to generate voice: {voice_time - response_time:.2f} seconds")

        messages.append(("ai", response))

        print("Recording...")
        frames = _record_until_silence()
        print("Finished recording.")

        recording_time = time.time()  # Time after recording
        print(f"Time to record audio: {recording_time - voice_time:.2f} seconds")

        _save_as_mp3(frames)

        conversion_time = time.time()  # Time after conversion
        print(f"Time to convert audio: {conversion_time - recording_time:.2f} seconds")
        print(f"Audio saved as {MP3_OUTPUT_FILENAME}")

        text = transcribe_audio_groq(MP3_OUTPUT_FILENAME)

        transcription_time = time.time()  # Time after transcription
        print(f"Time to transcribe audio: {transcription_time - conversion_time:.2f} seconds")

        messages.append(("human", text))
except KeyboardInterrupt:
    print("Conversation stopped.")

In [None]:
# Download the helper library from https://www.twilio.com/docs/python/install
import os
from twilio.rest import Client
from twilio.twiml.voice_response import Connect, VoiceResponse, Say, Stream

# Find your Account SID and Auth Token at twilio.com/console
# and set the environment variables. See http://twil.io/secure
# All call parameters come from the environment (see http://twil.io/secure):
#   TWILIO_ACCOUNT_SID / TWILIO_AUTH_TOKEN - API credentials
#   MY_TWILIO_NUMBER                       - caller ID (the Twilio number)
#   TWILIO_TO_NUMBER                       - number to dial, in E.164 format
account_sid = os.environ["TWILIO_ACCOUNT_SID"]
auth_token = os.environ["TWILIO_AUTH_TOKEN"]
client = Client(account_sid, auth_token)

# TwiML: connect the call to a WebSocket media stream, then fall through to <Say>.
response = VoiceResponse()
connect = Connect()
connect.stream(url='wss://example.com/audiostream')
response.append(connect)
response.say(
    'This TwiML instruction is unreachable unless the Stream is ended by your WebSocket server.'
)

call = client.calls.create(
    twiml=str(response),  # send the serialised XML explicitly
    # The destination is read from the environment so that no personal phone
    # number is stored in (or shared with) the notebook.
    to=os.environ["TWILIO_TO_NUMBER"],
    from_=os.environ["MY_TWILIO_NUMBER"],
)

print(call.sid)

In [None]:
from twilio.twiml.voice_response import Connect, VoiceResponse, Say, Stream

# Build a <Connect><Stream> TwiML document followed by a <Say>, then print it
# so the generated markup can be inspected.
stream_connect = Connect()
stream_connect.stream(url='wss://example.com/audiostream')

response = VoiceResponse()
response.append(stream_connect)
response.say('This TwiML instruction is unreachable unless the Stream is ended by your WebSocket server.')

print(response)

In [None]:
import os
from dotenv import load_dotenv
import websockets, json
import asyncio
import base64
import os

async def write_to_local(audio_stream, output_path="test.mp3"):
    """Write binary audio chunks from an async iterator to a local file.

    Args:
        audio_stream: Async iterable yielding ``bytes`` chunks (in this
            notebook: the already base64-decoded chunks from ``listen``).
        output_path: Destination file; it is created or overwritten.
            Defaults to ``"test.mp3"``.
    """
    with open(output_path, "wb") as f:
        async for chunk in audio_stream:
            # Skip empty chunks so nothing is written for them.
            if chunk:
                f.write(chunk)

async def listen(websocket):
    """Yield decoded audio chunks received over ``websocket``.

    Every incoming message is parsed as JSON. A truthy ``"audio"`` field is
    base64-decoded and yielded; a message without audio but with a truthy
    ``"isFinal"`` field ends the stream; any other message is skipped. A closed
    connection is reported on stdout and also ends the stream.
    """
    try:
        while True:
            payload = json.loads(await websocket.recv())
            encoded_audio = payload.get("audio")
            if encoded_audio:
                yield base64.b64decode(encoded_audio)
                continue
            if payload.get('isFinal'):
                return
    except websockets.exceptions.ConnectionClosed:
        print("Connection closed")

# Load the API key from the .env file
load_dotenv()
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")  # None when the variable is not set

# NOTE(review): these two values are not used by the call at the bottom of this
# cell, which passes its own voice/model literals - confirm which pair is intended.
voice_id = 'kmSVBPu7loj4ayNinwWM'
model_id = 'eleven_turbo_v2'

_DEMO_TEXT = "The twilight sun cast its warm golden hues upon the vast rolling fields, saturating the landscape with an ethereal glow. Silently, the meandering brook continued its ceaseless journey, whispering secrets only the trees seemed privy to."


async def text_to_speech_ws_streaming(voice_id, model_id, text=_DEMO_TEXT):
    """Stream text-to-speech audio from the ElevenLabs WebSocket API to disk.

    Opens the ``stream-input`` WebSocket for the given voice and model, sends
    the text, and saves the audio that comes back via ``write_to_local``.

    Args:
        voice_id: ElevenLabs voice identifier, interpolated into the URL.
        model_id: ElevenLabs model identifier, sent as a query parameter.
        text: Text to synthesise; defaults to the original demo sentence.

    Raises:
        RuntimeError: If ``ELEVENLABS_API_KEY`` is not set.
    """
    # Fail early with a clear message instead of opening a connection that
    # cannot authenticate and silently produces no audio.
    if not ELEVENLABS_API_KEY:
        raise RuntimeError("ELEVENLABS_API_KEY is not set; add it to the environment or the .env file.")

    uri = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input?model_id={model_id}"

    async with websockets.connect(uri) as websocket:
        # Opening message: a single space as text plus voice settings,
        # generation config and the API key.
        await websocket.send(json.dumps({
            "text": " ",
            "voice_settings": {"stability": 0.5, "similarity_boost": 0.8, "use_speaker_boost": False},
            "generation_config": {
                "chunk_length_schedule": [120, 160, 250, 290]
            },
            "xi_api_key": ELEVENLABS_API_KEY,
        }))

        await websocket.send(json.dumps({"text": text}))

        # Start consuming audio in the background before signalling end of input.
        listen_task = asyncio.create_task(write_to_local(listen(websocket)))

        # Send empty string to indicate the end of the text sequence
        await websocket.send(json.dumps({"text": ""}))

        # Wait until all audio has been received and written.
        await listen_task

# Run the function
# Top-level `await` works here because IPython/Jupyter cells run inside an event
# loop; in a plain script this call would need asyncio.run(...) instead.
await text_to_speech_ws_streaming('EXAVITQu4vr4xnSDxMaL', 'eleven_multilingual_v2')

In [9]:
from elevenlabs.client import ElevenLabs
from dotenv import load_dotenv
import os

load_dotenv()



client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
response = client.voices.get_all()

# (label, attribute) pairs printed for every voice, in display order.
voice_fields = (
    ("Name", "name"),
    ("Voice ID", "voice_id"),
    ("Category", "category"),
    ("Labels", "labels"),
    ("Preview URL", "preview_url"),
    ("High Quality Base Model IDs", "high_quality_base_model_ids"),
)
separator = "-" * 50

print("Available Voices:")
for voice in response.voices:
    print()  # blank line before each voice entry
    for label, attribute in voice_fields:
        print(f"{label}: {getattr(voice, attribute)}")
    print(separator)

Available Voices:

Name: Aria
Voice ID: 9BWtsMINqrJLrRacOk9x
Category: premade
Labels: {'accent': 'American', 'description': 'expressive', 'age': 'middle-aged', 'gender': 'female', 'use_case': 'social media'}
Preview URL: https://storage.googleapis.com/eleven-public-prod/premade/voices/9BWtsMINqrJLrRacOk9x/405766b8-1f4e-4d3c-aba1-6f25333823ec.mp3
High Quality Base Model IDs: ['eleven_turbo_v2', 'eleven_turbo_v2_5', 'eleven_multilingual_v2']
--------------------------------------------------

Name: Roger
Voice ID: CwhRBWXzGAHq8TQ4Fs17
Category: premade
Labels: {'accent': 'American', 'description': 'confident', 'age': 'middle-aged', 'gender': 'male', 'use_case': 'social media'}
Preview URL: https://storage.googleapis.com/eleven-public-prod/premade/voices/CwhRBWXzGAHq8TQ4Fs17/58ee3ff5-f6f2-4628-93b8-e38eb31806b0.mp3
High Quality Base Model IDs: ['eleven_turbo_v2', 'eleven_multilingual_v2', 'eleven_turbo_v2_5']
--------------------------------------------------

Name: Sarah
Voice ID: EXAVI

In [10]:
response = client.models.get_all()

# (label, attribute) pairs printed for every model, in display order.
model_fields = (
    ("Name", "name"),
    ("Model ID", "model_id"),
    ("Description", "description"),
    ("Can be finetuned", "can_be_finetuned"),
    ("Can do text-to-speech", "can_do_text_to_speech"),
    ("Can do voice conversion", "can_do_voice_conversion"),
    ("Can use style", "can_use_style"),
    ("Can use speaker boost", "can_use_speaker_boost"),
    ("Max characters (free user)", "max_characters_request_free_user"),
    ("Max characters (subscribed user)", "max_characters_request_subscribed_user"),
    ("Maximum text length per request", "maximum_text_length_per_request"),
)
separator = "-" * 50

print("Available Models:")
for model in response:
    print()  # blank line before each model entry
    for label, attribute in model_fields:
        print(f"{label}: {getattr(model, attribute)}")

    print("Supported Languages:")
    for lang in model.languages:
        print(f"  - {lang.name} ({lang.language_id})")

    cost_multiplier = model.model_rates['character_cost_multiplier']
    print(f"Character cost multiplier: {cost_multiplier}")
    print(separator)

Available Models:

Name: Eleven Multilingual v2
Model ID: eleven_multilingual_v2
Description: Our most life-like, emotionally rich mode in 29 languages. Best for voice overs, audiobooks, post-production, or any other content creation needs.
Can be finetuned: True
Can do text-to-speech: True
Can do voice conversion: False
Can use style: True
Can use speaker boost: True
Max characters (free user): 10000
Max characters (subscribed user): 10000
Maximum text length per request: 10000
Supported Languages:
  - English (en)
  - Japanese (ja)
  - Chinese (zh)
  - German (de)
  - Hindi (hi)
  - French (fr)
  - Korean (ko)
  - Portuguese (pt)
  - Italian (it)
  - Spanish (es)
  - Indonesian (id)
  - Dutch (nl)
  - Turkish (tr)
  - Filipino (fil)
  - Polish (pl)
  - Swedish (sv)
  - Bulgarian (bg)
  - Romanian (ro)
  - Arabic (ar)
  - Czech (cs)
  - Greek (el)
  - Finnish (fi)
  - Croatian (hr)
  - Malay (ms)
  - Slovak (sk)
  - Danish (da)
  - Tamil (ta)
  - Ukrainian (uk)
  - Russian (ru)
Charac