In [6]:
import base64
import os
from io import BytesIO

import cv2
import gradio as gr
import numpy as np
import requests
from PIL import Image


# #################### YOLO V3 ##############################

# Locations of the YOLOv3 weights, network config and class-name files.
weights_path, config_path, names_path = (
    'yolo3/yolov3.weights',
    'yolo3/yolov3.cfg',
    'yolo3/coco.names',
)

# Load the YOLOv3 network once; detect_objects() reuses this module-level net.
net = cv2.dnn.readNet(weights_path, config_path)

# Load the label names (the file is split on newlines, one label per line).
with open(names_path, 'r') as f:
    labels = f.read().strip().split('\n')
print('Labels Length: ', len(labels))

# Object detection function
def detect_objects(image, conf_threshold=0.5, nms_threshold=0.4):
    """Run the module-level YOLOv3 ``net`` on ``image`` and draw the detections.

    Args:
        image: Frame to analyse; its first two ``shape`` entries are read as
            (height, width). ``None`` is accepted and returned unchanged, so
            a stream tick without a frame does not raise.
        conf_threshold: Minimum best-class score for a candidate box to be
            kept; also passed to NMS as its score threshold.
        nms_threshold: Overlap threshold passed to ``cv2.dnn.NMSBoxes``.

    Returns:
        The same ``image`` object, with a rectangle and a
        ``"<label> <confidence %>"`` caption drawn in place for every box
        that survives non-maximum suppression, or ``None`` if ``image`` was
        ``None``.
    """
    if image is None:
        return None

    height, width = image.shape[:2]
    # NOTE(review): swapRB=True assumes a BGR input frame. If the caller
    # supplies RGB frames this hands BGR data to the network -- confirm the
    # channel order of the frames passed in.
    blob = cv2.dnn.blobFromImage(image, 1/255.0, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)

    # Ask OpenCV for the output layer names directly rather than indexing
    # getLayerNames() with getUnconnectedOutLayers(), whose return shape
    # (flat vs. Nx1) differs between OpenCV versions.
    detections = net.forward(net.getUnconnectedOutLayersNames())
    print('Detections Length: ', len(detections))

    # Box coordinates are multiplied by the frame size to get pixels.
    scale = np.array([width, height, width, height])

    box_list = []
    confidence_list = []
    class_id_list = []

    for output in detections:
        # Each row is [cx, cy, w, h, <one unused value>, class scores...].
        # Pick the best class per row and filter by threshold in NumPy so
        # Python only loops over the few rows that pass.
        class_scores = output[:, 5:]
        best_ids = np.argmax(class_scores, axis=1)
        best_scores = class_scores[np.arange(len(output)), best_ids]

        for row in np.flatnonzero(best_scores > conf_threshold):
            center_x, center_y, w, h = (output[row, 0:4] * scale).astype("int")
            # Convert centre-based box to top-left corner + size.
            x = int(center_x - (w / 2))
            y = int(center_y - (h / 2))

            box_list.append([x, y, int(w), int(h)])
            confidence_list.append(float(best_scores[row]))
            class_id_list.append(int(best_ids[row]))

    index_list = cv2.dnn.NMSBoxes(box_list, confidence_list, conf_threshold, nms_threshold)

    # NMSBoxes may return an empty tuple, a flat array or an Nx1 array
    # depending on the OpenCV version; normalise to a flat sequence.
    for i in np.asarray(index_list).reshape(-1):
        i = int(i)
        x, y, w, h = box_list[i]
        label = str(labels[class_id_list[i]])
        confidence = confidence_list[i]

        # Draw the bounding rectangle.
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

        # Draw the label caption just above the box.
        cv2.putText(image, f"{label} {(confidence*100):.2f}", (x, y - 10), cv2.FONT_HERSHEY_COMPLEX, 2, (0, 0, 255), 2)

    return image
#############################################################

################# chatgpt_response ##########################

def chatgpt_response(image_array, history):
    """Send the captured image to an Azure OpenAI chat deployment for analysis.

    The image is PNG-encoded, base64-embedded as a data URL and posted with a
    fixed system/user prompt. The outcome is appended to ``history`` as one
    two-element tuple and ``history`` is returned.

    Configuration is read from the environment:
        AZURE_OPENAI_API_KEY     required; credentials must not live in source.
        AZURE_OPENAI_ENDPOINT    optional; defaults to the original endpoint.
        AZURE_OPENAI_DEPLOYMENT  optional; defaults to the original deployment.

    Args:
        image_array: Image data accepted by ``PIL.Image.fromarray``, or
            ``None`` when nothing has been captured yet.
        history: List of chat tuples to append to; ``None`` is treated as an
            empty list.

    Returns:
        The history list with one entry appended:
        ``('User', <model reply>)`` on success,
        ``(<status code>, <response body>)`` on a non-200 response, or
        ``('Error', <reason>)`` when the request could not be made or parsed.
    """
    if history is None:
        history = []

    if image_array is None:
        history.append(('Error', 'No captured image to analyze. Capture a frame first.'))
        return history

    endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT", "https://fimtrus-openai.openai.azure.com")
    deployment_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "fitmrus-gpt4o")
    # The key is intentionally NOT given a hard-coded fallback: a secret
    # committed to source is leaked and should be rotated.
    api_key = os.environ.get("AZURE_OPENAI_API_KEY")
    if not api_key:
        history.append(('Error', 'AZURE_OPENAI_API_KEY environment variable is not set.'))
        return history

    headers = {
        'Content-Type': 'application/json',
        'api-key': api_key
    }

    # Encode the frame as a base64 PNG so it can be embedded in a data URL.
    buffered_io = BytesIO()
    Image.fromarray(image_array).save(buffered_io, format='png')
    base64_image = base64.b64encode(buffered_io.getvalue()).decode("utf-8")

    messages = [
        # System
        {
            "role": "system",
            "content": [{
                "type": "text",
                "text": "너는 사진 속에서 감지된 물체에 대해서 분석하는 봇이야."
            }]
        },
        # User
        {
            "role": "user",
            "content": [{
                "type": "text",
                "text": "이 사진에서 감지된 물체에 대해 감지 확률과 함께 자세하게 설명해줘."
            }, {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{base64_image}"
                }
            }]
        },
    ]

    payload = {
        "messages": messages,
        "temperature": 0.7,
        "top_p": 0.95,
        "max_tokens": 800,
    }

    try:
        # Without a timeout, requests can block the UI callback indefinitely.
        response = requests.post(
            f"{endpoint}/openai/deployments/{deployment_name}/chat/completions?api-version=2024-02-15-preview",
            headers=headers,
            json=payload,
            timeout=60,
        )
    except requests.RequestException as exc:
        history.append(('Error', f'Request failed: {exc}'))
        return history

    if response.status_code != 200:
        history.append((str(response.status_code), response.text))
        return history

    try:
        content = response.json()['choices'][0]['message']['content']
    except (ValueError, KeyError, IndexError, TypeError) as exc:
        history.append(('Error', f'Unexpected response format: {exc!r}'))
        return history

    # The message content can be null; treat that as an empty reply
    # instead of crashing on .strip().
    bot_response = (content or '').strip()
    history.append(('User', bot_response))
    return history
#############################################################

def stream_webcam(image):
    """Stream callback: run ``detect_objects`` on the frame and return its result."""
    annotated_frame = detect_objects(image)
    return annotated_frame

def click_capture(image):
    """Capture-button callback: hand back the given image unchanged."""
    captured = image
    return captured

def click_send_gpt(image_array, history):
    """Send-button callback: forward the image and chat history to ``chatgpt_response``."""
    updated_history = chatgpt_response(image_array, history)
    return updated_history

# Gradio UI: a row of three images (webcam input, detection output, captured
# frame), a row of two buttons, then a chatbot and an audio component.
# Component creation order defines the layout, so statements must stay in order.
with gr.Blocks() as demo:

    gr.Markdown("# Fimtrus's AI World!!!")

    with gr.Column():

        with gr.Row():
            # Webcam source; its stream event feeds stream_webcam below.
            webcam_input = gr.Image(label="실시간 화면", sources="webcam")
            # Receives the frame returned by stream_webcam.
            output_image = gr.Image(label="실시간 감지", interactive=False)
            # Receives the image returned by click_capture.
            output_capture_image = gr.Image(label="캡쳐 화면", interactive=False)

        with gr.Row():
            capture_button = gr.Button('캡쳐')
            send_gpt_button = gr.Button('GPT로 전송')

    with gr.Column():
        chatbot = gr.Chatbot(label="분석 결과")
        # Not wired to any event in this block.
        chatbot_audio = gr.Audio(label='GPT', interactive=False)
        # chatbot, audio
    
    # Stream event for the live webcam view: each frame goes through
    # stream_webcam and the result is shown in output_image.
    webcam_input.stream(fn=stream_webcam, inputs=[webcam_input], outputs=[output_image])
    # Copy the current detection output into the capture image.
    capture_button.click(fn=click_capture, inputs=[output_image], outputs=[output_capture_image])
    # Send the captured image plus the current chat history; the returned
    # history replaces the chatbot contents.
    send_gpt_button.click(fn=click_send_gpt, inputs=[output_capture_image, chatbot], outputs=[chatbot])
    # Stream event for the live view.
    # Various event listeners are needed.

# Start the Gradio server.
demo.launch()


Labels Length:  80
Running on local URL:  http://127.0.0.1:7865

To create a public link, set `share=True` in `launch()`.


