In [112]:
import base64
import requests
import json
import pandas as pd
import os
from PIL import Image, ImageDraw, ImageFont

pd.set_option('display.max_colwidth', None)

# OpenAI API Key
api_key = os.getenv('OPENAI_API_KEY')

# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

# Path to your image
image_path = r"C:/Users/h02317/Downloads/img.png"
# Getting the base64 string
base64_image = encode_image(image_path)

# Headers for the API request
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}

# Payload for the GPT-4o API request
payload = {
    "model": "gpt-4o",
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": (
                        "Analyze this image of industrial machinery and provide the following:"
                        "1) A very concise description of the overall semantic meaning of the scene (do not write 'the image shows/depicts' or something similar), "
                        "2) A JSON object with each object identified in the image, where each key is the object name, "
                        "each value includes a very concise description of the object, any text written on it (if any), and the bounding box coordinates of the objects (x, y, width, height). "
                        "Do not include the unimportant objects (e.g., walls, ceilings, floors, humans) unless they have text or labels on them."
                    )
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                    }
                }
            ]
        }
    ],
    "max_tokens": 2000,
    "temperature": 0
}

# Sending the request to the API
response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

# Processing the response
if response.status_code == 200:
    response_data = response.json()
    gpt_response = response_data['choices'][0]['message']['content']
    
    # Extract the overall scene description
    try:
        # Assume the response starts with a scene description and then has a JSON block
        json_start = gpt_response.index('{')
        json_end = gpt_response.rindex('}') + 1
        scene_description = gpt_response[:json_start].strip()
        json_content = gpt_response[json_start:json_end]
        
        # Convert the extracted JSON to a Python dictionary
        json_data = json.loads(json_content)
        
        # Create a new structured JSON object for display purposes
        structured_output = {}
        label_counter = 1
        for obj, details in json_data.items():
            if isinstance(details, dict):
                description = details.get('description', '')
                text = details.get('text', '')
                bbox = details.get('bounding_box', {})
                
                # Format description to include the text
                if text:
                    description += f" with text: '{text}'"
                
                # For the DataFrame, store the formatted description
                structured_output[obj] = description
            else:
                # If details is a string, use it directly
                structured_output[obj] = details
        
        # Load the image
        image = Image.open(image_path)
        draw = ImageDraw.Draw(image)
        
        # Set a larger, bold font
        font = ImageFont.truetype("arial.ttf", 20, encoding="unic")  # Ensure you have a suitable font available
        
        # Annotate the objects on the image
        for obj, details in json_data.items():
            if isinstance(details, dict) and 'bounding_box' in details:
                bbox = details['bounding_box']
                x_center = bbox['x'] + bbox['width'] // 2
                y_center = bbox['y'] + bbox['height'] // 2
                
                label = obj
                
                # Calculate text size using textbbox
                text_bbox = font.getbbox(label)
                text_width = text_bbox[2] - text_bbox[0]
                text_height = text_bbox[3] - text_bbox[1]
                
                # Position the text at the center of the bounding box
                draw.text((x_center - text_width // 2, y_center - text_height // 2), label, fill="red", font=font)
        
        image.show()  
        
        scene_description = scene_description.split('2)')[0].replace('1)', '').strip()
        
        print("Overall Scene Description:", scene_description)
        
        df = pd.DataFrame(list(structured_output.items()), columns=['Object', 'Description'])
        
    except (ValueError, json.JSONDecodeError) as e:
        print("Failed to extract or decode JSON from the response:", str(e))
else:
    print(f"Request failed with status code {response.status_code}")
    print("Response Body:", response.text)

print("Individual objects and labels")
df

Overall Scene Description: Industrial workspace with machinery and labeled work instructions.
Individual objects and labels


Unnamed: 0,Object,Description
0,sign_1,Sign with text 'flexeserve Zone Work Instructions' with text: 'flexeserve Zone Work Instructions'
1,sign_2,Sign with text 'flexeserve Zone Work Instructions' with text: 'flexeserve Zone Work Instructions'
2,sign_3,Sign with text 'flexeserve Zone Work Instructions' with text: 'flexeserve Zone Work Instructions'
3,company_sign,Large sign with company name 'flexeserve' and slogan 'The Home of Hot-holding' with text: 'flexeserve The Home of Hot-holding'
4,red_machine,Red industrial machine with transparent sections
5,yellow_fence,Yellow safety fence surrounding machinery
6,shelves,Metal shelves with various items
