# lcdemo.py

## This is a demo for a bot that uses TTS, STT, and LLM models to have a real-time conversation with a user

import time
import logging
import replicate
import sounddevice as sd
from scipy.io.wavfile import write as scwrite
import vlc


# Configure the root logger: timestamped, level-tagged messages at INFO and above.
_LOG_SETTINGS = {
    "level": logging.INFO,
    "format": "%(asctime)s %(levelname)-8s %(message)s",
    "datefmt": "%Y-%m-%d %H:%M:%S",
}
logging.basicConfig(**_LOG_SETTINGS)

# Recording parameters shared by the prompt text and the recorder so the two
# can never disagree about how long the user has to speak.
SAMPLE_RATE = 44100  # sample rate passed to sounddevice and written to the WAV
RECORD_SECONDS = 7  # duration of each recording
RECORDING_PATH = "recording.wav"

# Seconds to wait after starting playback before asking VLC for the media length.
PLAYBACK_PROBE_DELAY = 1.5

# System prompt sent to the LLM on every turn.
SYSTEM_PROMPT = (
    "You are an AI chatbot called faebot. \n"
    "You are having a conversation with your engineer, transfaeries. Fae is currently live on stream and is livestreaming development work on you! \n"
    "You are the real star of the show."
)


def recording_instructions(seconds=RECORD_SECONDS):
    """Return the user-facing instructions, quoting the actual recording length."""
    return (
        "Record a message to start the conversation. When you press enter, "
        f"you'll have {seconds} seconds to record your message."
    )


def build_prompt(transcript):
    """Return the LLM prompt for one user utterance."""
    return f"User: {transcript}"


def playback_wait_seconds(length_ms):
    """Return how many whole seconds to block after playback starts.

    Only half of the clip length is waited so the user can start recording
    again before the AI has finished speaking. ``length_ms`` is the value
    reported by ``player.get_length()``; non-positive values yield 0 so the
    result is always safe to pass to ``time.sleep``.
    """
    return max(0, int(length_ms / 1000 / 2))


def record_message(path=RECORDING_PATH, seconds=RECORD_SECONDS, fs=SAMPLE_RATE):
    """Record ``seconds`` of two-channel audio and save it to ``path`` as WAV."""
    logging.info("now recording")
    recording = sd.rec(int(seconds * fs), samplerate=fs, channels=2)
    sd.wait()  # Wait until recording is finished
    logging.info("finished recording. Saving as %s", path)
    scwrite(path, fs, recording)  # Save as WAV file


def transcribe(path=RECORDING_PATH):
    """Send the recording at ``path`` to the whisper model and return its output."""
    # The context manager closes the file once the call returns; the original
    # code leaked one open handle per loop iteration.
    with open(path, "rb") as audio_file:
        return replicate.run(
            "vaibhavs10/incredibly-fast-whisper:3ab86df6c8f54c11309d4d1f930ac292bad43ace52d10c80d87eb258b3c9f79c",
            input={
                "task": "transcribe",
                "audio": audio_file,
                "language": "None",
                "timestamp": "chunk",
                "batch_size": 64,
                "diarise_audio": False,
            },
        )


def generate_reply(transcript):
    """Ask the llama chat model to answer ``transcript`` and return the reply text."""
    llmout = replicate.run(
        "meta/llama-2-13b-chat",
        input={
            "debug": False,
            "top_k": 50,
            "top_p": 1,
            "prompt": build_prompt(transcript),
            "temperature": 0.5,
            "system_prompt": SYSTEM_PROMPT,
            "max_new_tokens": 200,
            "min_new_tokens": -1,
        },
    )
    # The model output is an iterable of text pieces; join them into one string.
    return "".join(llmout)


def synthesize_speech(text):
    """Run ``text`` through the neon-tts model and return its output."""
    return replicate.run(
        "awerks/neon-tts:139606fe1536f85a9f07d87982400b8140c9a9673733d47913af96738894128f",
        input={"text": text, "language": "en"},
    )


def play_response(vlc_instance, player, audio_source):
    """Start playing ``audio_source`` and block for half of its length."""
    # str() is a no-op for a plain URL string.
    # NOTE(review): newer replicate clients may return a file-output object
    # whose str() is its URL -- confirm against the installed client version.
    media = vlc_instance.media_new(str(audio_source))
    player.set_media(media)
    player.play()
    # Give VLC a moment to open the stream before asking for its length.
    time.sleep(PLAYBACK_PROBE_DELAY)
    ## let user start recording again before the AI is finished
    time.sleep(playback_wait_seconds(player.get_length()))


def run_turn(vlc_instance, player):
    """Run one record -> transcribe -> reply -> speak cycle."""
    logging.info(
        "starting program \n--------------------------------------------------"
    )
    print(recording_instructions())
    # Entering "0" skips recording and reuses the existing recording file.
    dev_skip = input("Press enter to begin recording")
    if dev_skip != "0":
        record_message()

    ## put it through whisper
    logging.info(
        "\n----------------interpreting recording with whispr------------\n"
    )
    sttout = transcribe()
    logging.info("whisper output: %s", sttout)

    ## put it through llama
    response = generate_reply(sttout["text"])
    logging.info("llama output: %s", response)

    ## put it through neontts
    ttsout = synthesize_speech(response)
    logging.info("tts output url: %s", ttsout)

    ## play back the audio
    play_response(vlc_instance, player, ttsout)


def main():
    """Run the conversation loop until the user interrupts it with Ctrl+C."""
    # One VLC instance and player are reused for the whole session instead of
    # creating (and never releasing) a new pair on every turn.
    vlc_instance = vlc.Instance()
    player = vlc_instance.media_player_new()
    try:
        while True:
            run_turn(vlc_instance, player)
    except KeyboardInterrupt:
        logging.info("interrupted by user, exiting")
    finally:
        player.stop()
        player.release()
        vlc_instance.release()


if __name__ == "__main__":
    main()