In [8]:
import cv2
import gradio as gr
import numpy as np
import base64
import requests

from io import BytesIO
from PIL import Image

from ultralytics import YOLO  
  

# #################### YOLO V8 ##############################

# Load the YOLOv8 model once at module level so every detection call reuses it.
model = YOLO('yolov8n.pt')  # path to the YOLOv8 model file
  
# Object detection function
def detect_objects(image):
    """Run the module-level YOLO model on ``image`` and draw the detections.

    For every detected box a green rectangle and a ``"<label> <confidence>"``
    caption are drawn.

    Args:
        image: Frame to analyse. It is passed to ``model`` and to the OpenCV
            drawing calls, so it is expected to be a NumPy image array.
            ``None`` is accepted and returned as-is.

    Returns:
        A copy of ``image`` with the detections drawn on it, or ``None`` when
        ``image`` is ``None``.
    """
    # The stream handler may be invoked without a frame; there is nothing to
    # detect in that case, and calling the model would raise.
    if image is None:
        return None

    results = model(image)

    # Draw on a copy so the caller's array is not modified as a side effect.
    annotated = image.copy()

    # Process the results: draw a rectangle and a label for each detection.
    labels = model.names
    for result in results:
        boxes = result.boxes.xyxy.cpu().numpy()
        confidences = result.boxes.conf.cpu().numpy()
        class_ids = result.boxes.cls.cpu().numpy()

        for box, confidence, class_id in zip(boxes, confidences, class_ids):
            x1, y1, x2, y2 = map(int, box)
            label = labels[int(class_id)]

            # Bounding rectangle
            cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Caption just above the top-left corner of the box
            cv2.putText(
                annotated,
                f"{label} {confidence:.2f}",
                (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                2,
                (0, 255, 0),
                2,
            )

    return annotated
#############################################################

################# chatgpt_response ##########################

def chatgpt_response(image_array, history):
    """Ask the Azure OpenAI chat deployment to describe the captured image.

    The image is PNG-encoded, embedded as a base64 data URL in a user message
    and posted to the chat-completions endpoint. The outcome is appended to
    ``history`` as a 2-tuple:

    * ``('User', <model reply>)`` on HTTP 200,
    * ``(<status code as str>, <response body>)`` on any other status,
    * ``('Error', <exception text>)`` when the request itself fails.

    Args:
        image_array: Image array accepted by ``PIL.Image.fromarray``. When it
            is ``None`` (nothing captured yet) no request is sent.
        history: List of chatbot tuples to append to; ``None`` is treated as
            an empty list.

    Returns:
        The (possibly extended) history list.
    """
    import os  # local import: only needed for the environment overrides below

    if history is None:
        history = []
    # Without a captured image there is nothing to analyse; leave the
    # conversation unchanged instead of crashing in Image.fromarray.
    if image_array is None:
        return history

    # Environment variables take precedence; the literals are kept only as
    # fallbacks so existing setups keep working.
    # NOTE(review): a credential is committed in source here - rotate the key
    # and drop the fallback once the environment variable is configured.
    endpoint = os.environ.get(
        "AZURE_OPENAI_ENDPOINT", "https://fimtrus-openai.openai.azure.com"
    )
    api_key = os.environ.get(
        "AZURE_OPENAI_API_KEY", "310a6832c2394daf97a3f446cc86ce20"
    )
    deployment_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "fitmrus-gpt4o")

    headers = {
        'Content-Type': 'application/json',
        'api-key': api_key
    }

    # Encode the image as a base64 PNG so it can travel inside the JSON body.
    image = Image.fromarray(image_array)
    buffered_io = BytesIO()
    image.save(buffered_io, format='png')
    base64_image = base64.b64encode(buffered_io.getvalue()).decode("utf-8")

    messages = [
        # System
        {
            "role": "system",
            "content": [{
                "type": "text",
                "text": "너는 사진 속에서 감지된 물체에 대해서 분석하는 봇이야."
            }]
        },
        # User
        {
            "role": "user",
            "content": [{
                "type": "text",
                "text": "이 사진에서 감지된 물체에 대해 감지 확률과 함께 자세하게 설명해줘."
            }, {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{base64_image}"
                }
            }]
        },
    ]

    payload = {
        "messages": messages,
        "temperature": 0.7,
        "top_p": 0.95,
        "max_tokens": 800,
    }

    try:
        response = requests.post(
            f"{endpoint}/openai/deployments/{deployment_name}/chat/completions?api-version=2024-02-15-preview",
            headers=headers,
            json=payload,
            # Without a timeout a stalled connection would block the UI forever.
            timeout=60,
        )
    except requests.RequestException as exc:
        history.append(('Error', str(exc)))
        return history

    if response.status_code == 200:
        result = response.json()
        content = result['choices'][0]['message']['content']
        # ``content`` can be null in the JSON reply; treat that as empty text.
        bot_response = (content or '').strip()
        history.append(('User', bot_response))
    else:
        history.append((str(response.status_code), response.text))
    return history
#############################################################

################# TTS #######################################

def get_token():
    """Request an access token from the Azure Cognitive Services token endpoint.

    Returns:
        The token text on HTTP 200; an empty string on any other status or
        when the request itself fails.
    """
    import os  # local import: only needed for the environment override below

    endpoint = "https://eastus.api.cognitive.microsoft.com/sts/v1.0/issueToken"
    # The environment variable takes precedence; the literal is kept only as a
    # fallback so existing setups keep working.
    # NOTE(review): a credential is committed in source here - rotate the key
    # and drop the fallback once the environment variable is configured.
    api_key = os.environ.get("AZURE_SPEECH_KEY", "8e5931bade634c4e859a8e7544f87ff7")

    headers = {
        "Ocp-Apim-Subscription-Key": api_key,
    }

    try:
        # Bound the wait so a stalled connection cannot hang the caller.
        response = requests.post(endpoint, headers=headers, timeout=10)
    except requests.RequestException:
        return ''

    if response.status_code == 200:
        return response.text
    return ''

def request_tts(text):
    """Synthesize ``text`` to speech and save it as ``response_audio.wav``.

    The text is wrapped in an SSML document (Korean ``ko-KR-JiMinNeural``
    voice) and posted to the text-to-speech endpoint using a bearer token
    from ``get_token()``.

    Args:
        text: Plain text to speak. Empty or whitespace-only text is rejected
            without making any request.

    Returns:
        The file name of the written WAV file on HTTP 200, otherwise ``None``
        (empty text, no token, request failure or non-200 status).
    """
    from xml.sax.saxutils import escape  # local import: only used here

    # Nothing to synthesize: skip the token and TTS round trips.
    if not text or not text.strip():
        return None

    endpoint = "https://eastus.tts.speech.microsoft.com/cognitiveservices/v1"
    token = get_token()
    # get_token() returns '' on failure; a request with an empty bearer token
    # cannot succeed, so stop here.
    if not token:
        return None

    headers = {
        "Content-Type": "application/ssml+xml",
        "User-Agent": "testForEducation",
        "X-Microsoft-OutputFormat": "riff-24khz-16bit-mono-pcm",
        "Authorization": f"Bearer {token}"
    }
    # Escape &, < and > so arbitrary text cannot break the SSML markup.
    data = f"""
        <speak version='1.0' xml:lang='ko-KR'><voice xml:lang='ko-KR' xml:gender='Female' name='ko-KR-JiMinNeural'>
            {escape(text)}
        </voice></speak>
    """

    try:
        response = requests.post(endpoint,
                                 headers=headers,
                                 # Send explicit UTF-8 bytes so the encoding of
                                 # the Korean text does not depend on the HTTP stack.
                                 data=data.encode("utf-8"),
                                 timeout=30)
    except requests.RequestException:
        return None
    print(response)
    if response.status_code == 200:
        file_name = 'response_audio.wav'

        with open(file_name, "wb") as audio_file:
            audio_file.write(response.content)

        return file_name
    return None

# request_tts('안녕하세요 저는 AI 챗봇입니다.')
#############################################################

def stream_webcam(image):
    """Stream handler: run object detection on the incoming frame and return the result."""
    annotated_frame = detect_objects(image)
    return annotated_frame

def click_capture(image):
    """Capture handler: hand the given image back unchanged."""
    captured = image
    return captured

def click_send_gpt(image_array, history):
    """Send handler: forward the image and chat history to ``chatgpt_response``."""
    updated_history = chatgpt_response(image_array, history)
    return updated_history

def change_chatbot(chatbot):
    """Convert the latest chatbot reply to speech.

    Takes the second element of the last history entry, strips every
    character that is not Hangul, an ASCII letter, a digit or whitespace, and
    passes the result to ``request_tts``.

    Args:
        chatbot: Chat history as a list of 2-element entries; may be empty or
            ``None``.

    Returns:
        Whatever ``request_tts`` returns for the cleaned text, or ``None``
        when there is no usable text to speak.
    """
    import re

    # The change event can fire with an empty history; there is no last entry.
    if not chatbot:
        return None

    text = chatbot[-1][1]
    # The reply slot may be unset or hold a non-text value.
    if not isinstance(text, str) or not text:
        return None

    pattern = r'[^가-힣a-zA-Z0-9\s]'
    cleaned_text = re.sub(pattern, '', text)
    # Skip the TTS round trip when nothing speakable is left.
    if not cleaned_text.strip():
        return None

    return request_tts(cleaned_text)

# Build the Gradio UI: three image panes, two buttons, a chatbot and an audio player.
with gr.Blocks() as demo:

    gr.Markdown("# Fimtrus's AI World!!!")

    with gr.Column():

        with gr.Row():
            # Webcam input, detection output and captured frame
            webcam_input = gr.Image(label="실시간 화면", sources="webcam")
            output_image = gr.Image(label="실시간 감지", interactive=False)
            output_capture_image = gr.Image(label="캡쳐 화면", interactive=False)

        with gr.Row():
            capture_button = gr.Button('캡쳐')
            send_gpt_button = gr.Button('GPT로 전송')

    with gr.Column():
        chatbot = gr.Chatbot(label="분석 결과")
        chatbot_audio = gr.Audio(label='GPT', interactive=False, autoplay=True)
        # chatbot, audio
    
    # Stream event: each webcam frame goes through stream_webcam into the detection pane.
    webcam_input.stream(fn=stream_webcam, inputs=[webcam_input], outputs=[output_image])
    # Capture button: copy the current detection pane into the capture pane.
    capture_button.click(fn=click_capture, inputs=[output_image], outputs=[output_capture_image])
    # Send button: pass the captured image and the chat history to click_send_gpt.
    send_gpt_button.click(fn=click_send_gpt, inputs=[output_capture_image, chatbot], outputs=[chatbot])
    # Chatbot change: pass the history to change_chatbot and load its result into the audio player.
    chatbot.change(fn=change_chatbot, inputs=[chatbot], outputs=[chatbot_audio])
    # Stream event for the live view.
    # Various event listeners are needed.

demo.launch()


<Response [200]>


'response_audio.wav'

In [4]:

## 로직

# 실시간 화면을 스트리밍 했을때, 실시간 감지화면에 감지된 이미지가 보여야하고,
# 내가 원하는 이미지를 캡쳐버튼을 통해, 캡쳐화면에 보여준다.

# GPT 버튼을 눌러서, OpenAI 로 이미지를 전송하는데, 
# System 메시지: 이미지를 분석하는 챗봇
# User 메시지: 감지된 이미지를 분석
# 캡쳐된 이미지를 포함, OpenAI로 전송

# Response 받은 데이터를 챗봇 화면에 보여준다.

# 챗봇에 어떤 텍스트가 업데이트 되었을 때, tts를 통해서, 음성파일을 받아온다.

# operation
def detect_objects():
    """Placeholder for the object-detection operation (not implemented).

    Planned steps:
        1. Load the YOLO model and instantiate it.
        2. Receive the image.
        3. Analyse it.
        4. Draw a rectangle.
        5. Draw the label.
    """
    return None

def request_gpt():
    """Placeholder for the GPT request operation (not implemented)."""
    return None



# event
def click_capture():
    """Placeholder for the capture-button event handler (not implemented)."""
    return None

def click_gpt():
    """Placeholder for the GPT-button event handler (not implemented)."""
    return None


## 화면
# with gr.Blocks() as demo:
#     with gr.Row():
#         gr.Image()
#         gr.Image()
#         gr.Image()

#     gr.Chatbot()
#     gr.Audio()

# 실시간 화면, 실시간 감지화면, 캡쳐화면
# 캡쳐버튼, GPT 로 전송하는 버튼
# 챗봇 
# 오디오 화면