In [1]:
# Cloud project id.
PROJECT_ID = "merantix-genai23ber-9535"  # @param {type:"string"}

# The region you want to launch jobs in.
REGION = "us-central1"  # @param {type:"string"}

# The Cloud Storage bucket for storing experiments output. Fill it without the 'gs://' prefix.
GCS_BUCKET = "blip2-example"  # @param {type:"string"}

In [2]:
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=GCS_BUCKET)

In [3]:
# The pre-built serving docker image.
# The model artifacts are embedded within the container, except for model weights which will be downloaded during deployment.
SERVE_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-transformers-serve"

In [4]:
# Call the TTS API and transform the example text 'hello world' into speech and play it 
# if the response need to be saved locally, name it as test_synthesis.wav 
from google.cloud import speech_v1p1beta1 as speech
import io, os
from pydub import AudioSegment


In [5]:
import base64
import os
from datetime import datetime
from io import BytesIO

import requests
from google.cloud import aiplatform
from PIL import Image


def create_job_name(prefix):
    user = os.environ.get("USER")
    now = datetime.now().strftime("%Y%m%d_%H%M%S")
    job_name = f"{prefix}-{user}-{now}"
    return job_name


def download_image(url):
    response = requests.get(url)
    return Image.open(BytesIO(response.content))


def image_to_base64(image, format="JPEG"):
    buffer = BytesIO()
    image.save(buffer, format=format)
    image_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return image_str


def base64_to_image(image_str):
    image = Image.open(BytesIO(base64.b64decode(image_str)))
    return image


def image_grid(imgs, rows=2, cols=2):
    w, h = imgs[0].size
    grid = Image.new("RGB", size=(cols * w, rows * h))
    for i, img in enumerate(imgs):
        grid.paste(img, box=(i % cols * w, i // cols * h))
    return grid


def deploy_vqa_model(model_id, task):
    model_name = "blip2"
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")
    serving_env = {
        "MODEL_ID": model_id,
        "TASK": task,
    }
    # If the model_id is a GCS path, use artifact_uri to pass it to serving docker.
    artifact_uri = model_id if model_id.startswith("gs://") else None
    model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=SERVE_DOCKER_URI,
        serving_container_ports=[7080],
        serving_container_predict_route="/predictions/transformers_serving",
        serving_container_health_route="/ping",
        serving_container_environment_variables=serving_env,
        artifact_uri=artifact_uri,
    )
    model.deploy(
        endpoint=endpoint,
        machine_type="n1-standard-8",
        accelerator_type="NVIDIA_TESLA_T4",
        accelerator_count=1,
        deploy_request_timeout=1800,
    )
    return model, endpoint

In [6]:
def ask_vqa(endpoint, user_prompt, image):
    prompt =  f"Question: {user_prompt}? Answer:"
    instances = [
        {"image": image_to_base64(image), "text": prompt},
    ]
    preds = endpoint.predict(instances=instances).predictions
    return preds[0]

In [7]:
def transcribe_file(speech_file):
    """Transcribe the given audio file."""
    client = speech.SpeechClient()
    
    # # Check if the file is mp3
    # if speech_file.lower().endswith(".mp3"):
    #     # Convert mp3 file to wav
    #     audio = AudioSegment.from_mp3(speech_file)
    #     # Export as wav
    #     speech_file = "temp_transcription_file.wav"
    #     audio.export(speech_file, format="wav")
    
    # Determine the sample rate
    audio = AudioSegment.from_wav(speech_file)
    sample_rate = audio.frame_rate
    
    with io.open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,  # Specify the correct sample rate
        language_code="en-US",
    )

    response = client.recognize(config=config, audio=audio)
    
    user_prompt = ""
    for result in response.results:
        print("Transcript: {}".format(result.alternatives[0].transcript))
        user_prompt += result.alternatives[0].transcript + " "
    # If a temporary file was created, remove it
    if speech_file == "temp_transcription_file.wav":
        os.remove(speech_file)
        
    return user_prompt

In [8]:
# Call the TTS API and transform the example text 'hello world' into speech and play it 
# if the response need to be saved locally, name it as test_synthesis.wav 
from google.cloud import texttospeech
from IPython.display import Audio, display

# Instantiates a client
tts_client = texttospeech.TextToSpeechClient()


def speak(text):
    # Set the text input to be synthesized
    synthesis_input = texttospeech.SynthesisInput(text=text)

    # Build the voice request, select the language code ("en-US") and the ssml
    # voice gender ("neutral")
    voice = texttospeech.VoiceSelectionParams(
        language_code="en-US", ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
    )

    # Select the type of audio file you want returned
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    # Perform the text-to-speech request on the text input with the selected
    # voice parameters and audio file type
    response = tts_client.synthesize_speech(
        input=synthesis_input, voice=voice, audio_config=audio_config
    )

    # The response's audio_content is binary.
    with open("output.mp3", "wb") as out:
        # Write the response to the output file.
        out.write(response.audio_content)
        print('Audio content written to file "output.mp3"')

    display(Audio("output.mp3", autoplay=True))



In [9]:
from ultralytics import YOLO

def ask_object_detector(object_detection_model, image, conf=0.25):
    results = object_detection_model(image, conf=conf, save=True)[0]
    w, h = image.width, image.height


    xyxyn = results.boxes.xyxyn
    cls = results.boxes.cls
    
    object_detector_output = ""
    for i in range(len(results)):
        if (xyxyn[i][0] + xyxyn[i][2]) / 2 < 0.33:
            x_pos = "left"
        elif (xyxyn[i][0] + xyxyn[i][2]) / 2 > 0.66:
            x_pos = "right"
        else:
            x_pos = "middle"


        object_detector_output += f"{results.names[int(cls[i])]}, {x_pos} of the image; "
    
    return object_detector_output

In [10]:
import vertexai
from vertexai.preview.language_models import TextGenerationModel

def predict_large_language_model_sample(
    project_id: str,
    model_name: str,
    temperature: float,
    max_decode_steps: int,
    top_p: float,
    top_k: int,
    content: str,
    location: str = "us-central1",
    tuned_model_name: str = "",
    ) :
    """Predict using a Large Language Model."""
    vertexai.init(project=project_id, location=location)
    model = TextGenerationModel.from_pretrained(model_name)
    if tuned_model_name:
      model = model.get_tuned_model(tuned_model_name)
    response = model.predict(
        content,
        temperature=temperature,
        max_output_tokens=max_decode_steps,
        top_k=top_k,
        top_p=top_p,)
    return response.text

def ask_llm(user_prompt, vqa_output=None, object_detection_output=None):
    prompt = f"You are an AI assistant for vision impaired people. The user asked '{user_prompt}' and the VQA model outputs '{vqa_output}'. You also have the output from an object detector. Each element has the following format : class, position with respect to the image. Here are the detection results: '{object_detection_output}'. Rephrase for the text-to-speech model. Do not invent anything, only use what the VQA model and object detector output"""
    return predict_large_language_model_sample(PROJECT_ID, "text-bison@001", 0.2, 256, 0.8, 40, prompt, "us-central1")


In [11]:
object_detection_model = YOLO('yolov8x.pt')  # pretrained YOLOv8n model

vqa_vilt_endpoint = aiplatform.Endpoint('projects/71837958321/locations/us-central1/endpoints/1696047263379357696')
vqa_endpoint = aiplatform.Endpoint('projects/71837958321/locations/us-central1/endpoints/6595963657958457344')

In [12]:
from transformers import pipeline
import numpy as np

def run(image_path, wav_filepath, voice_text, output_text):
    image = Image.open(image_path).convert("RGB")
    user_prompt = transcribe_file(wav_filepath)
    voice_text.config(text=user_prompt)
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    vqa_output = ask_vqa(vqa_endpoint, user_prompt, image)
    print(f"vqa: {vqa_output}")
    vqa_vilt_output = ask_vqa(vqa_vilt_endpoint, user_prompt, image)
    print(f"vqa VILT: {vqa_vilt_output}")
    if vqa_vilt_output["score"] > 0.5:
        vqa_output = vqa_vilt_output["answer"]
    #vqa_output += " " + vqa_vilt_output["answer"]
    object_detection_output = ask_object_detector(object_detection_model, image, conf=0.45)
    print(f"object detection {object_detection_output}")
    llm_output = ask_llm(user_prompt, vqa_output, object_detection_output)
    output_text.config(text=llm_output)
    print(f"output: {llm_output}")
    speak(llm_output)

In [29]:
import cv2
import tkinter as tk
import sounddevice as sd
import soundfile as sf
import numpy as np
from PIL import Image, ImageTk
from pydub import AudioSegment
import playsound


class WebcamAudioRecorderApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Webcam Audio Recorder")
        
        self.cap = cv2.VideoCapture(0)
        self.audio_stream = None
        self.audio_frames = []
        self.recording = False

        self.video_label = tk.Label(root)
        self.video_label.pack()
        
        self.start_button = tk.Button(root, text="Start Recording", command=self.start_recording)
        self.start_button.pack()
        
        self.stop_button = tk.Button(root, text="Stop Recording", command=self.stop_recording)
        self.stop_button.pack()
        self.stop_button["state"] = "disabled"

        self.voice_text = tk.Label(root, text="Input: ")
        self.voice_text.pack()
        self.vqa_option1 = tk.Label(root, text="VQA 1: ")
        self.vqa_option1.pack()
        self.vqa_option2 = tk.Label(root, text="VQA 2: ")
        self.vqa_option2.pack()
        self.object_detection = tk.Label(root, text="Object Detection: ")
        self.object_detection.pack()
        self.output_text = tk.Label(root, text="Output: ")
        self.output_text.pack()

        self.running = False

        self.update()
        
    def start_recording(self):
        self.audio_stream = sd.InputStream(callback=self.audio_callback)
        self.audio_stream.start()
        self.audio_frames = []
        self.recording = True
        self.start_button["state"] = "disabled"
        self.stop_button["state"] = "active"
        
    def stop_recording(self):
        if self.recording:
            self.audio_stream.stop()
            self.recording = False
            self.start_button["state"] = "active"
            self.stop_button["state"] = "disabled"
            self.capture_image()
            self.save_audio("audio.wav")
            # run("snapshot.png", "audio.wav", self.voice_text, self.output_text)
            self.run_app("snapshot.png", "audio.wav")
            playsound.playsound('output.mp3', True)

            
    def audio_callback(self, indata, frames, time, status):
        if status:
            print("Audio recording error:", status, flush=True)
        if self.recording:
            self.audio_frames.append(indata.copy())
            
    def save_audio(self, audio_path):
        audio_data = np.concatenate(self.audio_frames, axis=0).mean(-1)
        sf.write(audio_path, audio_data, 44100)



    def capture_image(self):
        ret, frame = self.cap.read()
        if ret:
            cv2.imwrite("snapshot.png", frame)
            self.display_image("snapshot.png")

    def display_image(self, image_path):
        img = cv2.imread(image_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = Image.fromarray(img)
        img = ImageTk.PhotoImage(image=img)
        self.video_label.config(image=img)
        self.video_label.image = img

    def update(self):
        ret, frame = self.cap.read()
        if ret and not self.running:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img = Image.fromarray(frame)
            img = ImageTk.PhotoImage(image=img)
            self.video_label.config(image=img)
            self.video_label.image = img
        self.root.after(10, self.update)
        
    def run(self):
        self.root.mainloop()
        self.cap.release()

    def run_app(self, image_path, wav_filepath):
        self.running = True
        image = Image.open(image_path).convert("RGB")
        user_prompt = transcribe_file(wav_filepath)
        self.voice_text.config(text="Input: " + user_prompt)
        self.root.update()
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        vqa_output = ask_vqa(vqa_endpoint, user_prompt, image)
        self.vqa_option1.config(text="VQA 1: " + vqa_output)
        self.root.update()
        vqa_vilt_output = ask_vqa(vqa_vilt_endpoint, user_prompt, image)
        self.vqa_option2.config(text="VQA 2: " + vqa_vilt_output["answer"])
        self.root.update()
        if vqa_vilt_output["score"] > 0.5:
            vqa_output = vqa_vilt_output["answer"]
        object_detection_output = ask_object_detector(object_detection_model, image, conf=0.45)
        self.object_detection.config(text="Object Detection: " + object_detection_output)
        self.root.update()
        llm_output = ask_llm(user_prompt, vqa_output, object_detection_output)
        self.output_text.config(text="Output: " + llm_output)
        self.root.update()
        speak(llm_output)
        self.running=False


root = tk.Tk()
app = WebcamAudioRecorderApp(root)
app.run()


[ WARN:0@2045.323] global cap_v4l.cpp:982 open VIDEOIO(V4L2:/dev/video0): can't open camera by index
[ERROR:0@2045.323] global obsensor_uvc_stream_channel.cpp:156 getStreamChannelGroup Camera index out of range


In [19]:
import cv2
import tkinter as tk
import sounddevice as sd
import soundfile as sf
import numpy as np
from PIL import Image, ImageTk
from pydub import AudioSegment
import playsound
import os

class WebcamAudioRecorderApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Webcam Audio Recorder")
        
        self.cap = cv2.VideoCapture(0)
        self.audio_stream = None
        self.audio_frames = []
        self.recording = False
        self.running = False

        self.video_label = tk.Label(root)
        self.video_label.pack()
        
        self.start_button = tk.Button(root, text="Start Recording", command=self.start_recording)
        self.start_button.pack()
        
        self.stop_button = tk.Button(root, text="Stop Recording", command=self.stop_recording)
        self.stop_button.pack()
        self.stop_button["state"] = "disabled"

        self.voice_text = tk.Label(root, text="Input: ")
        self.voice_text.pack()
        self.vqa_option1 = tk.Label(root, text="VQA 1: ")
        self.vqa_option1.pack()
        self.vqa_option2 = tk.Label(root, text="VQA 2: ")
        self.vqa_option2.pack()
        self.object_detection = tk.Label(root, text="Object Detection: ")
        self.object_detection.pack()
        self.output_text = tk.Label(root, text="Output: ")
        self.output_text.pack()

        # Bind the space bar press and release events
        root.bind("<KeyPress-space>", self.start_recording)
        root.bind("<KeyRelease-space>", self.stop_recording)

        self.update()
        
    def start_recording(self, event):
        if not self.recording:
            self.recording = True
            self.audio_stream = sd.InputStream(callback=self.audio_callback)
            self.audio_stream.start()
            self.audio_frames = []
            self.start_button["state"] = "disabled"
            self.stop_button["state"] = "active"
        
    def stop_recording(self, event):
        if self.recording:
            self.audio_stream.stop()
            self.recording = False
            self.start_button["state"] = "active"
            self.stop_button["state"] = "disabled"
            self.capture_image()
            self.save_audio("audio.wav")
            self.run_app("snapshot.png", "audio.wav")
            playsound.playsound('output.mp3', True)

    def audio_callback(self, indata, frames, time, status):
        if status:
            print("Audio recording error:", status, flush=True)
        if self.recording:
            self.audio_frames.append(indata.copy())
            
    def save_audio(self, audio_path):
        audio_data = np.concatenate(self.audio_frames, axis=0).mean(-1)
        sf.write(audio_path, audio_data, 44100)

    def capture_image(self):
        ret, frame = self.cap.read()
        if ret:
            cv2.imwrite("snapshot.png", frame)
            self.display_image("snapshot.png")

    def display_image(self, image_path):
        img = cv2.imread(image_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = Image.fromarray(img)
        img = ImageTk.PhotoImage(image=img)
        self.video_label.config(image=img)
        self.video_label.image = img

    def update(self):
        ret, frame = self.cap.read()
        if ret and not self.running:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img = Image.fromarray(frame)
            img = ImageTk.PhotoImage(image=img)
            self.video_label.config(image=img)
            self.video_label.image = img
        self.root.after(10, self.update)
        
    def run(self):
        self.root.mainloop()
        self.cap.release()

    def run_app(self, image_path, wav_filepath):
        self.running = True
        image = Image.open(image_path).convert("RGB")
        user_prompt = transcribe_file(wav_filepath)
        self.voice_text.config(text="Input: " + user_prompt)
        self.root.update()
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        vqa_output = ask_vqa(vqa_endpoint, user_prompt, image)
        self.vqa_option1.config(text="VQA 1: " + vqa_output)
        self.root.update()
        vqa_vilt_output = ask_vqa(vqa_vilt_endpoint, user_prompt, image)
        self.vqa_option2.config(text="VQA 2: " + vqa_vilt_output["answer"])
        self.root.update()
        if vqa_vilt_output["score"] > 0.5:
            vqa_output = vqa_vilt_output["answer"]
        object_detection_output = ask_object_detector(object_detection_model, image, conf=0.45)
        self.object_detection.config(text="Object Detection: " + object_detection_output)
        self.root.update()
        llm_output = ask_llm(user_prompt, vqa_output, object_detection_output)
        self.output_text.config(text="Output: " + llm_output)
        self.root.update()
        speak(llm_output)
        self.running=False


root = tk.Tk()
app = WebcamAudioRecorderApp(root)
app.run()


Exception in Tkinter callback
Traceback (most recent call last):
  File "/home/tgieruc/miniconda3/lib/python3.8/tkinter/__init__.py", line 1892, in __call__
    return self.func(*args)
TypeError: stop_recording() missing 1 required positional argument: 'event'

0: 480x640 1 cell phone, 36.3ms
Speed: 13.2ms preprocess, 36.3ms inference, 4.7ms postprocess per image at shape (1, 3, 480, 640)
Results saved to [1mruns/detect/predict10[0m


Audio content written to file "output.mp3"


Exception in Tkinter callback
Traceback (most recent call last):
  File "/home/tgieruc/miniconda3/lib/python3.8/tkinter/__init__.py", line 1892, in __call__
    return self.func(*args)
  File "/tmp/ipykernel_2877/4259255808.py", line 67, in stop_recording
    playsound.playsound('output.mp3', True)
  File "/home/tgieruc/miniconda3/lib/python3.8/site-packages/playsound.py", line 254, in <lambda>
    playsound = lambda sound, block = True: _playsoundAnotherPython('/usr/bin/python3', sound, block, macOS = False)
  File "/home/tgieruc/miniconda3/lib/python3.8/site-packages/playsound.py", line 229, in _playsoundAnotherPython
    t.join()
  File "/home/tgieruc/miniconda3/lib/python3.8/site-packages/playsound.py", line 216, in join
    super().join(timeout)
  File "/home/tgieruc/miniconda3/lib/python3.8/threading.py", line 1011, in join
    self._wait_for_tstate_lock()
  File "/home/tgieruc/miniconda3/lib/python3.8/threading.py", line 1027, in _wait_for_tstate_lock
    elif lock.acquire(block

Audio content written to file "output.mp3"



0: 480x640 1 cell phone, 1 toothbrush, 34.3ms
Speed: 0.9ms preprocess, 34.3ms inference, 2.4ms postprocess per image at shape (1, 3, 480, 640)
Results saved to [1mruns/detect/predict10[0m
Exception ignored from cffi callback <function _StreamBase.__init__.<locals>.callback_ptr at 0x7fb7b4416a60>:
Traceback (most recent call last):
  File "/home/tgieruc/miniconda3/lib/python3.8/site-packages/sounddevice.py", line 846, in callback_ptr
    return _wrap_callback(callback, data, frames, time, status)
  File "/home/tgieruc/miniconda3/lib/python3.8/site-packages/sounddevice.py", line 2687, in _wrap_callback
    callback(*args)
  File "/tmp/ipykernel_2877/3221092376.py", line 72, in audio_callback
AttributeError: 'WebcamAudioRecorderApp' object has no attribute 'recording'


Audio content written to file "output.mp3"



0: 480x640 1 toothbrush, 34.3ms
Speed: 1.2ms preprocess, 34.3ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)
Results saved to [1mruns/detect/predict10[0m


Audio content written to file "output.mp3"



0: 480x640 1 cell phone, 1 toothbrush, 34.4ms
Speed: 1.3ms preprocess, 34.4ms inference, 2.2ms postprocess per image at shape (1, 3, 480, 640)
Results saved to [1mruns/detect/predict10[0m


Audio content written to file "output.mp3"


In [None]:
import cv2
import tkinter as tk
import sounddevice as sd
import soundfile as sf
import numpy as np
from PIL import Image, ImageTk
from pydub import AudioSegment
import playsound


class WebcamAudioRecorderApp:
    def __init__(self, root):
        self.root = root
        self.root.title("Webcam Audio Recorder")
        
        self.cap = cv2.VideoCapture(0)
        self.audio_stream = None
        self.audio_frames = []
        self.recording = False

        self.video_label = tk.Label(root)
        self.video_label.pack()
        
        self.start_button = tk.Button(root, text="Start Recording", command=self.start_recording)
        self.start_button.pack()
        
        self.stop_button = tk.Button(root, text="Stop Recording", command=self.stop_recording)
        self.stop_button.pack()
        self.stop_button["state"] = "disabled"

        self.voice_text = tk.Label(root, text="Input: ")
        self.voice_text.pack()
        self.vqa_option1 = tk.Label(root, text="VQA 1: ")
        self.vqa_option1.pack()
        self.vqa_option2 = tk.Label(root, text="VQA 2: ")
        self.vqa_option2.pack()
        self.object_detection = tk.Label(root, text="Object Detection: ")
        self.object_detection.pack()
        self.output_text = tk.Label(root, text="Output: ")
        self.output_text.pack()

        self.running = False
        root.bind("<KeyPress-space>", self.toggle_recording)

        self.update()

    def toggle_recording(self, event):
        if self.recording:
            self.stop_recording()
        else:
            self.start_recording()
        
    def start_recording(self):
        self.audio_stream = sd.InputStream(callback=self.audio_callback)
        self.audio_stream.start()
        self.audio_frames = []
        self.recording = True
        self.start_button["state"] = "disabled"
        self.stop_button["state"] = "active"
        
    def stop_recording(self):
        if self.recording:
            self.audio_stream.stop()
            self.recording = False
            self.start_button["state"] = "active"
            self.stop_button["state"] = "disabled"
            self.capture_image()
            self.save_audio("audio.wav")
            # run("snapshot.png", "audio.wav", self.voice_text, self.output_text)
            self.run_app("snapshot.png", "audio.wav")
            playsound.playsound('output.mp3', True)

            
    def audio_callback(self, indata, frames, time, status):
        if status:
            print("Audio recording error:", status, flush=True)
        if self.recording:
            self.audio_frames.append(indata.copy())
            
    def save_audio(self, audio_path):
        audio_data = np.concatenate(self.audio_frames, axis=0).mean(-1)
        sf.write(audio_path, audio_data, 44100)



    def capture_image(self):
        ret, frame = self.cap.read()
        if ret:
            cv2.imwrite("snapshot.png", frame)
            self.display_image("snapshot.png")

    def display_image(self, image_path):
        img = cv2.imread(image_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = Image.fromarray(img)
        img = ImageTk.PhotoImage(image=img)
        self.video_label.config(image=img)
        self.video_label.image = img

    def update(self):
        ret, frame = self.cap.read()
        if ret and not self.running:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img = Image.fromarray(frame)
            img = ImageTk.PhotoImage(image=img)
            self.video_label.config(image=img)
            self.video_label.image = img
        self.root.after(10, self.update)
        
    def run(self):
        self.root.mainloop()
        self.cap.release()

    def run_app(self, image_path, wav_filepath):
        self.running = True
        image = Image.open(image_path).convert("RGB")
        user_prompt = transcribe_file(wav_filepath)
        self.voice_text.config(text="Input: " + user_prompt)
        self.root.update()
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        vqa_output = ask_vqa(vqa_endpoint, user_prompt, image)
        self.vqa_option1.config(text="VQA 1: " + vqa_output)
        self.root.update()
        vqa_vilt_output = ask_vqa(vqa_vilt_endpoint, user_prompt, image)
        self.vqa_option2.config(text="VQA 2: " + vqa_vilt_output["answer"])
        self.root.update()
        if vqa_vilt_output["score"] > 0.5:
            vqa_output = vqa_vilt_output["answer"]
        object_detection_output = ask_object_detector(object_detection_model, image, conf=0.45)
        self.object_detection.config(text="Object Detection: " + object_detection_output)
        self.root.update()
        llm_output = ask_llm(user_prompt, vqa_output, object_detection_output)
        self.output_text.config(text="Output: " + llm_output)
        self.root.update()
        speak(llm_output)
        self.running=False


root = tk.Tk()
app = WebcamAudioRecorderApp(root)
app.run()
