In [3]:
import os

import gradio as gr
import requests
from PIL import Image, ImageDraw, ImageFont

In [None]:
# Azure AI Vision connection settings.
# The VISION_ENDPOINT / VISION_KEY environment variables take precedence so the
# credential does not have to be stored in the notebook; the literals are only
# fallbacks that keep the previous values when the variables are unset or empty.
# The endpoint is normalised to end with exactly one "/" because request_vision
# appends the API path to it directly.
ENDPOINT = (
    os.environ.get("VISION_ENDPOINT")
    or "https://fimtrus-vision-we.cognitiveservices.azure.com/"
).rstrip("/") + "/"
API_KEY = os.environ.get("VISION_KEY") or "***"


def request_vision(features, image_path, language=None, timeout=30):
    """Send an image file to the Image Analysis endpoint and return the reply.

    The request URL is built from the module-level ``ENDPOINT`` and the
    subscription key is taken from the module-level ``API_KEY``.

    Args:
        features: Feature name to request (e.g. ``"read"``), or an iterable of
            names, which is sent as a comma-separated list.
        image_path: Path of the image file; its raw bytes form the request body.
        language: Optional language code, added as the ``language`` query
            parameter when truthy.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        The decoded JSON body on HTTP 200. Otherwise a ``str`` describing the
        failure: the raw response text for any other status code, or a short
        message when no image path was given or the request could not be
        completed. Callers tell success from failure by the return type.

    Raises:
        OSError: If ``image_path`` cannot be opened or read.
    """
    if not image_path:
        # Nothing to upload; report it through the same "str means failure" channel.
        return "No image provided."

    if not isinstance(features, str):
        features = ",".join(features)

    endpoint = f"{ENDPOINT}computervision/imageanalysis:analyze?api-version=2024-02-01&features={features}"
    if language:
        endpoint += f"&language={language}"

    headers = {
        "Ocp-Apim-Subscription-Key": API_KEY,
        "Content-Type": "application/octet-stream",
    }

    # Read the image in binary form; it is sent as an octet-stream body.
    with open(image_path, "rb") as image_file:
        image_data = image_file.read()

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.post(
            endpoint, headers=headers, data=image_data, timeout=timeout
        )
    except requests.RequestException as exc:
        return f"Request failed: {exc}"

    if response.status_code == 200:
        return response.json()
    return response.text

# Candidate feature names; presumably the values the service accepts for the
# `features` argument of request_vision (TODO confirm). The list itself is not
# referenced by the other cells visible here.
FEATURES  = ['read', 'smartCrops', 'tags', 'people', 'caption', 'denseCaptions', 'objects']
# Feature requested by the sample call below.
FEATURE = "read"
# Sample call; as the last expression of the cell its return value is displayed.
request_vision(FEATURE, "resources/손흥민.png")

{'modelVersion': '2023-10-01',
 'metadata': {'width': 710, 'height': 473},
 'readResult': {'blocks': [{'lines': [{'text': 'kraw.',
      'boundingPolygon': [{'x': 470, 'y': 240},
       {'x': 493, 'y': 258},
       {'x': 487, 'y': 268},
       {'x': 463, 'y': 249}],
      'words': [{'text': 'kraw.',
        'boundingPolygon': [{'x': 470, 'y': 240},
         {'x': 494, 'y': 258},
         {'x': 487, 'y': 267},
         {'x': 463, 'y': 248}],
        'confidence': 0.595}]},
     {'text': 'AIA',
      'boundingPolygon': [{'x': 305, 'y': 304},
       {'x': 415, 'y': 311},
       {'x': 412, 'y': 357},
       {'x': 304, 'y': 354}],
      'words': [{'text': 'AIA',
        'boundingPolygon': [{'x': 308, 'y': 304},
         {'x': 407, 'y': 309},
         {'x': 405, 'y': 358},
         {'x': 306, 'y': 354}],
        'confidence': 0.57}]}]}]}}

In [5]:
def add_captions(image_path):
    """Return a one-line caption for an image together with its confidence.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        ``"<caption> (<confidence> %)"`` with the confidence shown as a
        percentage to two decimals, or the error string returned by
        ``request_vision`` unchanged when the request did not succeed.
    """
    result = request_vision('caption', image_path)
    if isinstance(result, str):
        # request_vision reports failure by returning text instead of a dict.
        return result

    caption = result['captionResult']
    # Double-quoted f-string with single-quoted keys: reusing the enclosing
    # quote inside an f-string is a syntax error before Python 3.12.
    return f"{caption['text']} ({caption['confidence'] * 100:.2f} %)"


def extract_tags(image_path):
    """Return the tags detected in an image, one per line.

    The tags are requested with ``language='ko'``.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A newline-separated string of ``"<name> (<confidence> %)"`` entries
        (confidence as a percentage to two decimals; empty string when there
        are no tags), or the error string returned by ``request_vision``
        unchanged when the request did not succeed.
    """
    result = request_vision('tags', image_path, language='ko')
    if isinstance(result, str):
        # request_vision reports failure by returning text instead of a dict.
        return result

    # Double-quoted f-string with single-quoted keys: reusing the enclosing
    # quote inside an f-string is a syntax error before Python 3.12.
    return '\n'.join(
        f"{tag['name']} ({tag['confidence'] * 100:.2f} %)"
        for tag in result['tagsResult']['values']
    )

In [52]:
# File that generate_dense_caption, read_text and crop save their annotated image to.
TMP_IMAGE_PATH = 'resources/tmp.png'

def generate_dense_caption(image_path, tmp_image_path):
    """Draw dense-caption boxes and labels onto an image.

    Args:
        image_path: Path of the image sent to the service for analysis.
        tmp_image_path: Path of the image to draw on. When it is empty or
            ``None`` (for example because an earlier step produced no image),
            the annotations are drawn on ``image_path`` instead of failing.

    Returns:
        ``TMP_IMAGE_PATH`` once the annotated image has been saved there, or
        ``None`` when ``request_vision`` reported a failure.
    """
    result = request_vision('denseCaptions', image_path)
    if isinstance(result, str):
        # request_vision reports failure by returning text instead of a dict.
        return None

    try:
        font = ImageFont.truetype("arial.ttf", size=40)
    except IOError:
        # Font file could not be loaded; use PIL's default font instead.
        font = ImageFont.load_default()

    # Fall back to the analysed image so a missing canvas does not crash.
    canvas_path = tmp_image_path or image_path
    with Image.open(canvas_path) as image:
        draw = ImageDraw.Draw(image)

        for obj in result['denseCaptionsResult']['values']:
            confidence = obj['confidence'] * 100
            bbox = obj['boundingBox']
            x, y, w, h = bbox['x'], bbox['y'], bbox['w'], bbox['h']

            box_coords = [(x, y), (x + w, y + h)]
            draw.rectangle(box_coords, outline="red", width=2)

            label = f"{obj['text']} ({confidence:.2f}%)"
            # Label is placed 40 px above the bottom edge of the box.
            text_position = (x, y + h - 40)
            draw.text(text_position, label, fill="red", font=font)

        image.save(TMP_IMAGE_PATH)
    return TMP_IMAGE_PATH


def read_text(image_path, tmp_image_path):
    """Outline each recognised text line on an image and print its text.

    Args:
        image_path: Path of the image sent to the service for analysis.
        tmp_image_path: Path of the image to draw on. When it is empty or
            ``None`` (for example because an earlier step produced no image),
            the annotations are drawn on ``image_path`` instead of failing.

    Returns:
        ``TMP_IMAGE_PATH`` once the annotated image has been saved there, or
        ``None`` when ``request_vision`` reported a failure.
    """
    result = request_vision('read', image_path)
    if isinstance(result, str):
        # request_vision reports failure by returning text instead of a dict.
        return None

    try:
        font = ImageFont.truetype("arial.ttf", size=20)
    except IOError:
        # Font file could not be loaded; use PIL's default font instead.
        font = ImageFont.load_default()

    # Fall back to the analysed image so a missing canvas does not crash.
    canvas_path = tmp_image_path or image_path
    with Image.open(canvas_path) as image:
        draw = ImageDraw.Draw(image)

        for block in result['readResult']['blocks']:
            for line in block['lines']:
                polygon_coords = [
                    (point['x'], point['y']) for point in line['boundingPolygon']
                ]
                draw.polygon(polygon_coords, outline="purple", width=2)

                # Put the text 15 px above the first vertex, or 5 px below it
                # when that would leave the top of the image.
                first_x, first_y = polygon_coords[0]
                label_y = first_y - 15 if first_y - 15 > 0 else first_y + 5
                draw.text((first_x, label_y), line['text'], fill="purple", font=font)

        image.save(TMP_IMAGE_PATH)
    return TMP_IMAGE_PATH


def crop(image_path, tmp_image_path):
    """Mark the first smart-crop region on an image and save the cropped area.

    Args:
        image_path: Path of the image sent to the service; the cropped file is
            cut from this image.
        tmp_image_path: Path of the image the crop rectangle is drawn on. When
            it is empty or ``None`` the rectangle is drawn on ``image_path``.

    Returns:
        ``(TMP_IMAGE_PATH, 'resources/crop.png')`` after both files have been
        written, or ``(None, None)`` when ``request_vision`` reported a
        failure or the reply contained no crop region.
    """
    result = request_vision('smartCrops', image_path)
    if isinstance(result, str):
        # request_vision reports failure by returning text instead of a dict.
        return None, None

    regions = result['smartCropsResult']['values']
    if not regions:
        # No region suggested; indexing [0] would raise IndexError.
        return None, None

    box = regions[0]['boundingBox']
    left, top = box['x'], box['y']
    right, bottom = left + box['w'], top + box['h']

    try:
        font = ImageFont.truetype("arial.ttf", size=40)
    except IOError:
        # Font file could not be loaded; use PIL's default font instead.
        font = ImageFont.load_default()

    # Annotated copy: rectangle plus a label 40 px above the bottom edge.
    with Image.open(tmp_image_path or image_path) as image:
        draw = ImageDraw.Draw(image)
        draw.rectangle([(left, top), (right, bottom)], outline="blue", width=2)
        draw.text((left, bottom - 40), "Smart Crop", fill="blue", font=font)
        image.save(TMP_IMAGE_PATH)

    # The crop itself is taken from the analysed image, not the annotated one.
    cropped_image_path = 'resources/crop.png'
    with Image.open(image_path) as original:
        original.crop((left, top, right, bottom)).save(cropped_image_path)

    return TMP_IMAGE_PATH, cropped_image_path

In [53]:
# Gradio UI: one input image, a button, two text outputs and two image outputs.
# Components appear in the order they are created inside the nested contexts.
with gr.Blocks() as demo:
    # Page heading.
    gr.Markdown('# 🔍 이미지 분석')
    
    # Single button; every listener registered below is attached to it.
    send_button = gr.Button('분석 시작')
    
    with gr.Row():
        # The input is handed to the callbacks as a file path.
        input_image = gr.Image(label='입력 이미지', type='filepath')

        with gr.Column():
            # Read-only text outputs for the caption and the tag list.
            caption_textbox = gr.Textbox(label='이미지 캡션', interactive=False)
            tag_textbox = gr.Textbox(label='이미지 태그', interactive=False)
    
    
    with gr.Row():
        # Read-only image outputs: the annotated image and the cropped region.
        bbox_image = gr.Image(label='시각적 분석', type='filepath', interactive=False)
        cropped_image = gr.Image(label='관심 영역 크롭', type='filepath', interactive=False)
    
    # Three separate click listeners on the same button: caption, tags, and the
    # image pipeline.
    send_button.click(add_captions, inputs=[input_image], outputs=[caption_textbox])
    send_button.click(extract_tags, inputs=[input_image], outputs=[tag_textbox])
    # Image pipeline: crop -> dense captions -> text reading, chained with .then().
    # crop receives the input image as both the analysed image and the canvas;
    # the later steps receive the current bbox_image value as their canvas and
    # write their result back to bbox_image.
    send_button.click(
        crop, inputs=[input_image, input_image], outputs=[bbox_image, cropped_image]
    ).then(
        generate_dense_caption, inputs=[input_image, bbox_image], outputs=[bbox_image], show_progress='minimal'
    ).then(
        read_text, inputs=[input_image, bbox_image], outputs=[bbox_image], show_progress='minimal'
    )
# NOTE(review): the recorded output shows port 7881; presumably earlier launches
# in this kernel were still running. Consider closing previous servers before
# relaunching - confirm.
demo.launch()

* Running on local URL:  http://127.0.0.1:7881

To create a public link, set `share=True` in `launch()`.


