# Multimodal demo

This is an example of how to simulate a video- and audio-aware model using existing LLM vision models (that take text and images as input, and generate text as output).

In [1]:
import os
from pathlib import Path

import dotenv
from groq import Groq

from media_extractor import split_video
import datauri

In [2]:
# Load the Groq API key: read the .env file into the process environment, then verify the key.
dotenv.load_dotenv()
# Treat an empty value (e.g. a `GROQ_API_KEY=` placeholder line) the same as a missing key,
# so the problem is reported here rather than at the first API request.
if not os.environ.get("GROQ_API_KEY"):
    raise ValueError("GROQ_API_KEY not found in .env file")

# No key is passed explicitly; the client is expected to read GROQ_API_KEY from the environment.
client = Groq()

This is the input video that we'll turn into the user prompt.

In [3]:
from IPython.display import Video

# Clip whose audio track and frames are turned into the prompt further down.
video_file = "input_01.mp4"

# Inline preview of the input, shown at a reduced width.
Video(video_file, width=320)

At the time of this writing, the vision model used in this notebook (`llama-3.2-11b-vision-preview`, accessed through the Groq API) doesn't directly support video or audio input. Instead, we'll decode the video into frames and feed them to the model as images, and transcribe the audio into text and feed it to the model as text.

In [4]:
# Split the clip into its audio track and its frames; both come back as data URIs
# (the audio URI prefix is visible in the cell output below).
(audio_uri, image_uris) = split_video(video_file)

# Show only the first 50 characters: enough to see the MIME header without dumping the payload.
audio_uri[0:50]

Received video file: input_01.mp4


'data:audio/mpeg;base64,SUQzBAAAAAAAf1RYWFgAAAASAAA'

Transcribe the audio into text, using the `distil-whisper-large-v3-en` speech-to-text model through the Groq client. The result will serve as the text prompt for the LLM.

In [5]:
# Speech-to-text: send the extracted audio track to the transcription endpoint.
# `datauri.as_tempfile` presumably exposes the data URI as a temporary file for the upload
# (helper is defined outside this notebook; confirm its cleanup behaviour there).
with datauri.as_tempfile(audio_uri) as audio_file:
    transcription = client.audio.transcriptions.create(
        file=Path(audio_file),
        model="distil-whisper-large-v3-en",
    )

# The transcript doubles as the user's text prompt; echo it as the cell output.
user_prompt = transcription.text
user_prompt

' Could you explain what is this?'

We're ready to talk to the LLM: use the text and images as input, and get generated text back.

In [10]:
# Assemble the parts of the user message. The text part is always present; the image part is
# added only when frames were extracted, because an empty `{}` placeholder is not a valid
# content part and would be sent to the API as-is.
message_content = [{"type": "text", "text": user_prompt}]
if image_uris:
    # Only the first extracted frame accompanies the transcript.
    message_content.append(
        {
            "type": "image_url",
            "image_url": {"url": image_uris[0], "detail": "auto"},
        }
    )

response = client.chat.completions.create(
    model="llama-3.2-11b-vision-preview",
    messages=[{"role": "user", "content": message_content}],
)

# Keep the generated reply for the text-to-speech step and show it as the cell output.
response_text = response.choices[0].message.content
response_text

'The Sun has not reported on this piece of paper, so I cannot give information about it. I do not have the capability of knowing what is on the paper in this image because it is a physical object that my capabilities do not allow me to look at.'

Use the `gTTS` (Google Text-to-Speech) library to turn the generated text into audio.

In [7]:
from gtts import gTTS

# Intermediate MP3 in the working directory; the playback cell below removes it again.
response_audio_path = Path("response_audio.mp3")

# Synthesize the model's reply as English speech and save it to disk.
audio = gTTS(text=response_text, lang="en")
audio.save(str(response_audio_path))

# Re-read the saved bytes and wrap them in an `audio/mpeg` data URI for playback.
response_audio_uri = datauri.from_bytes(response_audio_path.read_bytes(), "audio/mpeg")

In [8]:
from IPython.display import Audio

# Play the synthesized reply inline. The player is created inside the `with` block so the
# temporary file still exists while it is being read.
with datauri.as_tempfile(response_audio_uri) as response_audio_file:
    display(Audio(response_audio_file))

# Remove the intermediate MP3 written by the text-to-speech cell. `missing_ok=True` keeps this
# cell re-runnable: a second run no longer fails because the file is already gone.
Path("response_audio.mp3").unlink(missing_ok=True)

In [1]:
import mimetypes

# `guess_type` returns a (type, encoding) pair; only the type is needed. Indexing instead of
# unpacking into `_` avoids shadowing IPython's `_` (last cell output) in the notebook namespace.
mime_type = mimetypes.guess_type('uploads/recorded-video.mp4')[0]
print(mime_type)

video/mp4
