Libraries

In [135]:
import os
import torch
import pandas as pd
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
from google.cloud import vision
import cv2

import textwrap
from inference_sdk import InferenceHTTPClient
# Assuming google.generativeai is a valid module and used later in the code
import google.generativeai as genai
from IPython.display import display, Markdown
import numpy as np

# Use environment variables for sensitive information
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', 'AIzaSyArNvjy-dtSnMvKsqWsR9dBtVQ8eUSdlAg')
GOOGLE_APPLICATION_CREDENTIALS = os.getenv('GOOGLE_APPLICATION_CREDENTIALS', 'ornate-grail-426902-p3-b90b1e5ffe06.json')

if GOOGLE_API_KEY is None:
    print("Error: GOOGLE_API_KEY environment variable not set.")
else:
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        print("GenAI configured successfully.")
    except Exception as e:
        print(f"Error configuring GenAI: {e}")

# Set Google Cloud credentials environment variable
if GOOGLE_APPLICATION_CREDENTIALS:
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_APPLICATION_CREDENTIALS
    print("Google Cloud credentials set.")
else:
    print("Error: GOOGLE_APPLICATION_CREDENTIALS environment variable not set or invalid.")

print("Setup complete.")

GenAI configured successfully.
Google Cloud credentials set.
Setup complete.


In [162]:
class DrawBoundingBoxes:
    def __init__(self, image, results):
        self.image = image
        self.predictions = results['predictions']
        self.draw = ImageDraw.Draw(self.image)

    def draw_bbox_with_polygon(self):
        """Draws bounding boxes as polygons based on the predictions."""
        for pred in self.predictions:
            vertices = pred['vertices']
            label = pred['label']
            # Draw the polygon based on vertices
            self.draw.polygon(vertices, outline="red", width=4)
            # Position for the label is slightly above and to the right of the first vertex
            label_pos = (vertices[0][0] + 10, vertices[0][1] - 10)
            self.draw.text(label_pos, label, fill="red")

    def show_image_with_bbox(self):
        """Displays the image with bounding boxes."""
        self.draw_bbox_with_polygon()
        plt.imshow(self.image)
        plt.axis('off')
        plt.show()

    def save_image_with_bbox(self, output_image_path):
        """Saves the image with bounding boxes to the specified path."""
        self.draw_bbox_with_polygon()
        self.image.save(output_image_path)
        print(f"Image saved to {output_image_path}")


def convert_to_polygon(bounds):
    """Converts bounding box coordinates to a polygon (list of tuples)."""
    # return [(bounds['xmin']-5, bounds['ymin']-5),
    #         (bounds['xmax'], bounds['ymin']),
    #         (bounds['xmax'], bounds['ymax']),
    #         (bounds['xmin'], bounds['ymax'])]
    return [(bounds['xmin'] -40, bounds['ymin'] -40),
            (bounds['xmax'] +40, bounds['ymin'] -40),
            (bounds['xmax'] +40, bounds['ymax'] +40),
            (bounds['xmin'] -40, bounds['ymax'] +40)]

In [163]:
class ObjectDetection:
    def __init__(self, model, image_path):
        self.model = model
        self.image_path = image_path

    def detect(self):
        img = Image.open(self.image_path)
        results = self.model(img)
        return results


class YOLOv5(ObjectDetection):
    def detect(self):
        results = super().detect()
        filter = results.pandas().xyxy[0]
        human_preds = filter[filter['name'] == 'person'].copy()
        human_preds['name'] = ['person_' +
                               str(index) for index in human_preds.index]
        human_info = {
            'predictions': [
                {
                    'label': row['name'],
                    'vertices': convert_to_polygon({
                        'xmin': row['xmin'], 'ymin': row['ymin'],
                        'xmax': row['xmax'], 'ymax': row['ymax']
                    })
                } for _, row in human_preds.iterrows()
            ]
        }
        return human_info

class ImageAnnotatorClient(ObjectDetection):
    def detect(self):
        # Read the image file
        with open(self.image_path, "rb") as image_file:
            content = image_file.read()

        # Convert the image content to a numpy array
        nparr = np.frombuffer(content, np.uint8)
        # Decode the numpy array to an image
        img_np = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

        # Preprocess the image
        # Convert to grayscale
        gray = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
        # Apply Gaussian Blur
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)

        # Convert the preprocessed image back to bytes
        _, buffer = cv2.imencode('.jpg', blurred)
        preprocessed_content = buffer.tobytes()

        # Use the preprocessed image for text detection
        image = vision.Image(content=preprocessed_content)
        response = self.model.text_detection(image=image)

        if hasattr(response, 'error') and response.error.message:
            raise Exception(
                f"{response.error.message}\nFor more info on error messages, check: https://cloud.google.com/apis/design/errors")

        texts = response.text_annotations
        if not texts:
            return "No text detected"

        results = {
            'image': {
                'width': response.full_text_annotation.pages[0].width,
                'height': response.full_text_annotation.pages[0].height,
            },
            'predictions': [
                {
                    'label': text.description,
                    'vertices': [(vertex.x, vertex.y) for vertex in text.bounding_poly.vertices]
                } for text in texts
            ]
        }
        return results

In [164]:
image_path = 'Beer/48.jpg'

In [165]:
model = torch.hub.load('ultralytics/yolov5', 'yolov5n')
human_detector = YOLOv5(model, image_path)
human_info = human_detector.detect()

Using cache found in C:\Users\Thanh/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-6-28 Python-3.12.3 torch-2.3.1 CUDA:0 (NVIDIA GeForce MX230, 2048MiB)

Fusing layers... 
YOLOv5n summary: 213 layers, 1867405 parameters, 0 gradients, 4.5 GFLOPs
Adding AutoShape... 


In [166]:
pd.DataFrame([{
    'label': pred['label'],
    'vertice_0': pred['vertices'][0],
    'vertice_1': pred['vertices'][1],
    'vertice_2': pred['vertices'][2],
    'vertice_3': pred['vertices'][3]
} for pred in human_info['predictions']])

Unnamed: 0,label,vertice_0,vertice_1,vertice_2,vertice_3
0,person_1,"(31.625839233398438, 307.0141906738281)","(251.49461364746094, 307.0141906738281)","(251.49461364746094, 968.5150146484375)","(31.625839233398438, 968.5150146484375)"
1,person_2,"(518.756591796875, 546.7799072265625)","(984.3592529296875, 546.7799072265625)","(984.3592529296875, 1299.0855712890625)","(518.756591796875, 1299.0855712890625)"
2,person_3,"(136.74790954589844, 319.6841735839844)","(596.0384521484375, 319.6841735839844)","(596.0384521484375, 1061.3719482421875)","(136.74790954589844, 1061.3719482421875)"
3,person_11,"(314.9019775390625, 357.0315856933594)","(555.3452758789062, 357.0315856933594)","(555.3452758789062, 672.1087646484375)","(314.9019775390625, 672.1087646484375)"


In [167]:
model = vision.ImageAnnotatorClient()
text_detector = ImageAnnotatorClient(model, image_path)
text_info = text_detector.detect()

In [168]:
pd.DataFrame([{
    'label': pred['label'],
    'vertice_0': pred['vertices'][0],
    'vertice_1': pred['vertices'][1],
    'vertice_2': pred['vertices'][2],
    'vertice_3': pred['vertices'][3]
} for pred in text_info['predictions']]).head()

Unnamed: 0,label,vertice_0,vertice_1,vertice_2,vertice_3
0,ENT\n10000\nMar\nJOUT AND\nYAD\n194\nĐƯỜNG\nSỐ 8,"(44, 182)","(814, 182)","(814, 1020)","(44, 1020)"
1,ENT,"(110, 1006)","(105, 949)","(126, 947)","(131, 1004)"
2,10000,"(139, 1018)","(135, 953)","(152, 952)","(156, 1017)"
3,Mar,"(44, 659)","(76, 647)","(81, 660)","(49, 672)"
4,JOUT,"(53, 403)","(89, 404)","(89, 416)","(53, 415)"


In [169]:
CLIENT = InferenceHTTPClient(
    api_url="https://detect.roboflow.com",
    api_key="myga9csfPR6CPfAMerk7"
)

clothes_info = CLIENT.infer(image_path, model_id="clothing-exome/1")

In [170]:
def convert_bounding_box_to_polygon(detection_results):
    """
    Converts bounding box center points to polygon vertices.

    :param detection_results: Dictionary containing image info and predictions.
    :return: Updated dictionary with predictions containing polygon vertices.
    """
    polygon_info = {
        'image': {
            'width': detection_results['image']['width'],
            'height': detection_results['image']['height']
        },
        'predictions': []
    }

    for prediction in detection_results['predictions']:
        vertices = [
            (prediction['x'] - prediction['width'] / 2,
             prediction['y'] - prediction['height'] / 2),  # Top-left
            (prediction['x'] + prediction['width'] / 2,
             prediction['y'] - prediction['height'] / 2),  # Top-right
            (prediction['x'] + prediction['width'] / 2,
             prediction['y'] + prediction['height'] / 2),  # Bottom-right
            (prediction['x'] - prediction['width'] / 2,
             prediction['y'] + prediction['height'] / 2)   # Bottom-left
        ]

        polygon_info['predictions'].append({
            'label': prediction['class'],
            'vertices': vertices
        })

    return polygon_info


# Example usage
clothes_info = convert_bounding_box_to_polygon(clothes_info)

In [171]:
pd.DataFrame([{
    'label': pred['label'],
    'vertice_0': pred['vertices'][0],
    'vertice_1': pred['vertices'][1],
    'vertice_2': pred['vertices'][2],
    'vertice_3': pred['vertices'][3]
} for pred in clothes_info['predictions']]).head()

Unnamed: 0,label,vertice_0,vertice_1,vertice_2,vertice_3
0,jacket,"(168.75, 446.25)","(411.25, 446.25)","(411.25, 857.5)","(168.75, 857.5)"
1,skirt,"(733.75, 712.5)","(951.25, 712.5)","(951.25, 991.25)","(733.75, 991.25)"


In [172]:
all_pred = {'predictions': human_info['predictions'] + text_info['predictions'][1:]}

In [173]:
draw_bbox = DrawBoundingBoxes(Image.open(image_path), all_pred)
draw_bbox.save_image_with_bbox('Beer_with_bbox/48_Bbox.jpg')

Image saved to Beer_with_bbox/48_Bbox.jpg


In [174]:
def to_markdown(text):
    text = text.replace('•', '  *')
    return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [175]:
def analyze_image_information(image_path, human_detector, text_detector, clothes_detector):
    image = Image.open(image_path)
    prompt = f"""
    Analyze the following image information and provide detailed insights based on the criteria given below:

    Human Detection Results:
    {human_detector}

    Text Detection Results:
    {text_detector}


    Business Problem 1: How many people handling a can of beer or a bottle of beer?
    - Specifically, count the number of people drinking Heineken beer in the restaurant.

    Business Problem 4: Tracking marketing staff
    - Identify any marketing staff present in the image based on their clothing.
    - Confirm if there are at least two marketing staff members at each restaurant location.

    Insights:
    """

    model = genai.GenerativeModel('gemini-1.5-flash')
    response = model.generate_content([prompt, image], stream=True)
    response.resolve()
    return to_markdown(response.text)

In [177]:
analyze_image_information(image_path, human_info, text_info, clothes_info)

> ## Insights:
> 
> **Business Problem 1:**
> 
> * **Analysis:** Based on the provided information, it is impossible to determine the brand of beer being consumed. The image does not show any Heineken branding. There is one person holding a can of beer (person_2) and another person holding a bottle (likely beer) (person_3). 
> * **Answer:** We cannot determine if the beer is Heineken, but there are **two** people handling beer (one holding a can and one holding a bottle).
> 
> **Business Problem 4:**
> 
> * **Analysis:** The provided information only identifies people by their position in the image, but does not describe clothing. This makes it impossible to identify any marketing staff members based on their clothing.
> * **Answer:** We cannot confirm if there are any marketing staff members present in the image. There is insufficient information to answer this question.
> 
> **Overall:** The provided data is not sufficient to answer the business problems in detail. While we can identify some people handling beer, we lack information about the specific brand of beer and the clothing worn by the people in the image.  Further information is needed about the objects in the image and the clothing of the people to accurately analyze and answer the business problems.


In [244]:
image_path = 'Beer/76.jpg'
model = vision.ImageAnnotatorClient()
text_detector = ImageAnnotatorClient(model, image_path)
text_info = text_detector.detect()

prompt = """
Analyze the provided image from a restaurant or event location to identify visible promotional materials featuring the Heineken logo. Classify each item into categories: ice bucket, bottle, can, refrigerator, signboard, poster, display counter, display table, and umbrella. Pay special attention to counting beer boxes.

Text Detection Insights:
{text_info[1:]}

Required Information:
- Enumerate promotional materials with the Heineken logo.
- Classify each material and count beer boxes.

Context and Competitor Analysis:
- Determine the setting: restaurant, supermarket, or store.
- Identify competitor logos present.

Insights:
"""
from PIL import ImageOps
image = Image.open(image_path)
image = ImageOps.exif_transpose(image)
model = genai.GenerativeModel('gemini-1.5-flash')
response = model.generate_content([prompt, image], stream=True)
response.resolve()
to_markdown(response.text)

> ## Promotional Materials Analysis:
> 
> **Promotional materials with Heineken logo:**
> 
> 1. **Poster:** Large poster displaying Heineken logo and slogan, "Vui tiec thoai mai" (Enjoy your party) with an image of Heineken cans. 
> 2. **Beer Boxes:** Multiple boxes with the Heineken logo are stacked against the wall.  
> 3. **Can:** Visible on the poster. 
> 
> **Classification and count:**
> 
> - **Poster:** 1
> - **Beer Boxes:** 11
> - **Can:** 1
> 
> **Setting:**
> 
> The setting appears to be a **restaurant** or an **event location** based on the presence of tables, chairs, and beer boxes.
> 
> **Competitor logos:**
> 
> The image shows a **Tiger Crystal** logo, indicating a competitor brand is present. 
> 
> **Insights:**
> 
> - Heineken is actively promoting its brand within this restaurant or event location.
> - The presence of competitor logos suggests potential competition in the market. 
> - The volume of Heineken beer boxes indicates a significant presence and potentially popular choice at this establishment.
> - The poster and slogan aim to encourage customers to enjoy their event with Heineken.

In [241]:
text_info

{'image': {'width': 600, 'height': 800},
 'predictions': [{'label': 'P\nLARUE\nLARUE\n1\nnabati\nNurów\nMu\nLARUE\nSPECIAL\nCOOLPACK\nHeineken UE\nTiger\nLARUE\n2024\n241\nHeineken\n2024\nHeineken\nLARUE\nHeineken\n-SMOOTH-\nHekel\nHeineken\nCHITMAN\nSHUNG\n22-2\n24\n2924\nCOTHAN\nHeineken\nTHUNG\n77-7\nHeineken\nTiger\nMAN\n14040\n2232\n24\nHeineken\n2\n24\nHeineken\nTiger\nCOLY\nTiger\nHeineken\nTiger\nTiger\n24☆\nBIVINA\nHeineken\nTifer\nEXPORT',
   'vertices': [(0, 38), (599, 38), (599, 725), (0, 725)]},
  {'label': 'P', 'vertices': [(0, 408), (7, 404), (12, 419), (0, 423)]},
  {'label': 'LARUE', 'vertices': [(18, 79), (62, 68), (65, 81), (21, 92)]},
  {'label': 'LARUE', 'vertices': [(16, 226), (51, 213), (55, 224), (20, 237)]},
  {'label': '1', 'vertices': [(37, 537), (46, 534), (52, 549), (42, 552)]},
  {'label': 'nabati',
   'vertices': [(152, 265), (191, 261), (193, 277), (154, 281)]},
  {'label': 'Nurów', 'vertices': [(294, 43), (304, 80), (295, 82), (284, 46)]},
  {'label': '

In [245]:
draw_bbox = DrawBoundingBoxes(image, text_info)
draw_bbox.save_image_with_bbox('Beer_with_bbox/76_Bbox.jpg')

Image saved to Beer_with_bbox/76_Bbox.jpg


In [None]:
from pytube import YouTube


def download_video_from_youtube(link, path):
    yt = YouTube(link)
    video = yt.streams.get_highest_resolution()

    # download the video
    video.download(path)

# example usage:
download_video_from_youtube('https://youtu.be/0kSy34bXOsQ?si=VSKRj83xrqbWiusr', 'videos')

In [None]:
import moviepy.editor as mp
import speech_recognition as sr

# Load the video
video = mp.VideoFileClip(
    "videos/Introducing Heineken 00 on Draught - Denise Van Outen.mp4")

# Extract the audio from the video
audio_file = video.audio
audio_file.write_audiofile("geeksforgeeks.wav")

# Initialize recognizer
r = sr.Recognizer()

# Load the audio file
with sr.AudioFile("geeksforgeeks.wav") as source:
    data = r.record(source)

# Convert speech to text
text = r.recognize_google(data)

MoviePy - Writing audio in geeksforgeeks.wav


                                                                      

MoviePy - Done.


In [None]:
to_markdown(text)

> I think alcohol-free drinks should be more readily available then they already are and this is why you know for me being able to come to a pub and have Heineken 00 draft is just an absolute godsend for me because you don't always want to turn up a problem be nursing a cola or eliminate join me and you want to feel part of the team and I am a team player I don't want to be excluded now I feel like alcohol for me is something that I pick and choose whenever I feel it's the right time to have a drink so I'm definitely one of those moderate drinkers I don't I don't go out now to get drunk I think when I was young girl with the purpose of just getting really drunk but now when I go out I want to have good conversation I want to remember it the next day I think the music conception about alcoholic drink up is the first of all you must have an issue with alcohol and not everybody has and possibly are you pregnant which again you know we shouldn't have to keep defending ourselves and you know I found myself in the past when I've chosen not to drink and I've gone now with friends that I've even made up excuses because it's actually easier to say oh I'm on antibiotics I can't drink and actually just say don't fancy drinking today I'm really excited about how the Great British pub is changing because I feel like it's open its doors to everybody now you can all go and feel comfortable and that for me is a brilliant thing because you know with everything that's gone on over the last year and a half and everything's been closed down it's nice to have somewhere that comes like a Social Hub that you can all go to and all feel comfortable people from all walks of life all ages and it's catering for everybody finally I can stand at the bar I don't have to say I'm on antibiotics no I'm not pregnant I can order my draft zero zero and just enjoy it and be the magic that I used to be but grown up one

In [None]:
# Creating the prompt for the Gemini API
prompt = f"Given the following passage: '{
    text}', answer the following questions: \n1. Which beer is mentioned in the passage? \n2. Which beer is being described in detail?"
model = genai.GenerativeModel('gemini-1.5-flash')
response = model.generate_content(prompt, stream=True)
response.resolve()
to_markdown(response.text)

> Here are the answers to your questions:
> 
> 1. **Heineken 0.0** is the beer mentioned in the passage.
> 2. **Heineken 0.0** is the beer being described in detail. The passage specifically highlights how the availability of this alcohol-free beer is a "godsend" for the speaker and how it allows them to feel included and part of the group when socializing at a pub. 


In [189]:
image_path = 'Beer/48.jpg'

In [203]:
from inference_sdk import InferenceHTTPClient

CLIENT = InferenceHTTPClient(
    api_url="https://detect.roboflow.com",
    api_key="myga9csfPR6CPfAMerk7"
)

result = CLIENT.infer(image_path, model_id="gianamkhanh/3")

In [204]:
result_info = convert_bounding_box_to_polygon(result)

In [205]:
draw_bbox = DrawBoundingBoxes(Image.open(image_path), result_info)
draw_bbox.save_image_with_bbox('Beer_with_bbox/1387_Bbox.jpg')

Image saved to Beer_with_bbox/1387_Bbox.jpg


In [206]:
result_info

{'image': {'width': 960, 'height': 1280},
 'predictions': [{'label': 'person',
   'vertices': [(568.75, 581.25),
    (936.25, 581.25),
    (936.25, 1277.5),
    (568.75, 1277.5)]},
  {'label': 'biaviet-prometer',
   'vertices': [(75.0, 348.75),
    (217.5, 348.75),
    (217.5, 948.75),
    (75.0, 948.75)]},
  {'label': 'person',
   'vertices': [(187.5, 351.25),
    (560.0, 351.25),
    (560.0, 1018.75),
    (187.5, 1018.75)]},
  {'label': 'biaviet-prometer',
   'vertices': [(368.75, 398.75),
    (555.0, 398.75),
    (555.0, 610.0),
    (368.75, 610.0)]},
  {'label': 'biaviet-can',
   'vertices': [(565.0, 660.0),
    (601.25, 660.0),
    (601.25, 721.25),
    (565.0, 721.25)]},
  {'label': 'biaviet-brand',
   'vertices': [(520.0, 440.0),
    (555.0, 440.0),
    (555.0, 492.5),
    (520.0, 492.5)]},
  {'label': 'biaviet-brand',
   'vertices': [(521.25, 500.0),
    (557.5, 500.0),
    (557.5, 616.25),
    (521.25, 616.25)]},
  {'label': 'biaviet-can',
   'vertices': [(533.75, 653.75),
   

In [215]:
def analyze_image_information_general(image_path, object_detector):
    image = Image.open(image_path)
    prompt = f"""
    Given the object detection results below, provide insights based on the following criteria:

    Object Detection Results:
    {object_detector}

    Criteria for Analysis:
    1. A person is considered to be holding a "beer-can" if a 'beer-can' object is detected within a close proximity to the 'person' object, specifically if any part of the 'beer-can' is within the bounding box of the 'person'.
    2. Marketing staff are identified by the presence of 'promotional-material' objects. A marketing staff member must be in close proximity to the 'promotional-material', similar to the criteria for holding a 'beer-can'.
    3. For a location to be considered as having sufficient marketing staff presence, at least two individuals meeting the marketing staff criteria must be detected.

    Please analyze the object detection results and provide:
    - The number of people holding a "beer-can".
    - Whether there are at least two marketing staff members at the location, based on the criteria above.

    Insights:
    """

    model = genai.GenerativeModel('gemini-1.5-flash')
    response = model.generate_content([prompt, image], stream=True)
    response.resolve()
    return to_markdown(response.text)

In [217]:
analyze_image_information(image_path, result_info)

> ## Insights:
> 
> **Business Problem 1:**
> 
> - There is **one** person holding a "biaviet-can" in their hand. 
> - This can be identified by looking at the object detection result: `{'label': 'biaviet-can', 'vertices': [(565.0, 660.0), (601.25, 660.0), (601.25, 721.25), (565.0, 721.25)]}`. 
> - The person holding the can is the one in the blue shirt, standing next to the table.
> 
> **Business Problem 4:**
> 
> - **Two** people can be identified as marketing staff:
>     - The person holding a "biaviet-can" in their hand (mentioned above). 
>     - The person holding a "biaviet-prometer" (identified by the bounding box: `{'label': 'biaviet-prometer', 'vertices': [(75.0, 348.75), (217.5, 348.75), (217.5, 948.75), (75.0, 948.75)]}`).
> - The image appears to show **one** restaurant location.
> - There are **at least two** marketing staff members at this location, fulfilling the requirement.
> 
> **Additional Notes:**
> 
> - The object detection model seems to have correctly identified most objects, including people and "biaviet" products.
> - The image provides valuable information for analyzing marketing efforts and staff presence at a specific restaurant location.
> 
> **Recommendations:**
> 
> - It's beneficial to investigate the "biaviet-brand" bounding boxes. Are they correctly identified? What do these brands represent?
> - It would be helpful to gather more images from different restaurant locations to verify the presence of marketing staff and obtain a comprehensive view of their deployment.


In [260]:
preds = [
  {"xmin": 527, "ymin": 239, "xmax": 584, "ymax": 392, "class": "tiger-banner"},
  {"xmin": 139, "ymin": 49, "xmax": 493, "ymax": 167, "class": "tiger-banner"},
  {"xmin": 221, "ymin": 430, "xmax": 247, "ymax": 467, "class": "tiger-poster"},
  {"xmin": 533, "ymin": 442, "xmax": 565, "ymax": 520, "class": "tiger-poster"}
]

image_path = 'Beer/5.jpg'
image = Image.open(image_path)
image = image.resize((640, 640))
image.save('Beer/5_resized.jpg')
draw = ImageDraw.Draw(image)
for pred in preds:
    draw.rectangle([(pred['xmin'], pred['ymin']),
                   (pred['xmax'], pred['ymax'])], outline='red', width=2)
    draw.text((pred['xmin'], pred['ymin']), pred['class'], fill='red')

image.save('Beer_with_bbox/5_Bbox.jpg')

In [258]:
n_preds = []
for pred in preds:
    if pred['class'] == 'tiger-brand':
        n_preds.append(pred)
n_preds

[{'xmin': 527, 'ymin': 239, 'xmax': 584, 'ymax': 392, 'class': 'tiger-brand'},
 {'xmin': 139, 'ymin': 49, 'xmax': 493, 'ymax': 167, 'class': 'tiger-brand'},
 {'xmin': 221, 'ymin': 430, 'xmax': 247, 'ymax': 467, 'class': 'tiger-brand'},
 {'xmin': 533, 'ymin': 442, 'xmax': 565, 'ymax': 520, 'class': 'tiger-brand'}]

In [259]:
prompt = """
Given the bounding box predictions for an image, refine the classifications for boxes with the class ending in '-brand', ensuring to maintain the original bounding box coordinates. The goal is to provide more accurate and detailed classifications for these specific boxes without altering their positions or sizes. For each bounding box with a class ending in '-brand', analyze the image content within the bounding box to determine a more specific and accurate class. Possible refined classes include 'tiger-logo', 'competitor-logo', 'tiger-text', or 'other-brand'.

Bounding Box Predictions:
{preds}

Tasks:
1. For each bounding box with a class ending in '-brand', provide a refined classification based on the content within the bounding box. Maintain the original bounding box coordinates.
2. Return the list of bounding boxes with updated classes for those previously labeled with '-brand', including their unchanged coordinates.

Refined Bounding Box Predictions with Original Coordinates:
"""

image_path = 'Beer/5.jpg'
image = Image.open(image_path)
image = image.resize((640, 640))
model = genai.GenerativeModel('gemini-1.5-flash')
response = model.generate_content([prompt, image], stream=True)
response.resolve()
to_markdown(response.text)


> ```json
> {preds}
> ```