## N text queries on N images

In [None]:
import requests
from PIL import Image
import torch

from transformers import OwlViTProcessor, OwlViTForObjectDetection

# The processor and the detector are both read from the same local checkpoint
# directory; local_files_only=True restricts loading to files already on disk.
checkpoint_dir = "/mnt/huggingface/hub/models--google--owlvit-base-patch32"
load_kwargs = {"local_files_only": True}

processor = OwlViTProcessor.from_pretrained(checkpoint_dir, **load_kwargs)
model = OwlViTForObjectDetection.from_pretrained(checkpoint_dir, **load_kwargs)


# load images
import glob

original_dir = '/mnt/owlvit/test_img'

# glob.glob returns matches in arbitrary order; sorting keeps the order of the
# images (and of everything derived from them) reproducible between runs.
image_paths = sorted(glob.glob(f'{original_dir}/*.jpg'))

images = []
for image_fp in image_paths:
    # Image.open is lazy and holds the file handle open. Converting inside the
    # `with` block reads the pixels and returns a detached copy, so the handle
    # is released immediately. The conversion also normalises grayscale/CMYK
    # JPEGs to three channels.
    with Image.open(image_fp) as opened:
        images.append(opened.convert("RGB"))

# Fail early with a clear message instead of an obscure error further down
# when the directory holds no matching files.
if not images:
    raise FileNotFoundError(f"no .jpg images found in {original_dir}")


# prepare texts
# Every image is queried with the same prompts: one separate list of prompts
# is built per loaded image.
prompts = (
    "a photo of a human face",
    "a photo of a people face",
    "a photo of a vehicle license plate",
    "a photo of a car plate",
)
texts = [list(prompts) for _ in images]

inputs = processor(text=texts, images=images, return_tensors="pt")


# inference
# inference_mode() on its own is sufficient: it already disables gradient
# tracking, so no additional no_grad() context is needed.
with torch.inference_mode():
    outputs = model(**inputs)

    # PIL reports size as (width, height); it is reversed to (height, width),
    # giving one row per image.
    target_sizes = torch.tensor(
        [x.size[::-1] for x in images], dtype=torch.float32
    )
    results = processor.post_process_object_detection(
        outputs, threshold=0.1, target_sizes=target_sizes
    )
    
# plot results
from PIL import ImageDraw,ImageFont

# The font does not depend on the image or the box, so it is loaded once
# up front instead of being re-read from disk for every detection.
font_size = 40
font = ImageFont.truetype("Overpass-Bold.ttf", font_size)

for text, image, result in zip(texts, images, results):
    boxes, scores, labels = result["boxes"], result["scores"], result["labels"]
    # Draw on a copy so the loaded image itself is left unmodified.
    im = image.copy()
    draw = ImageDraw.Draw(im)
    for box, score, label in zip(boxes, scores, labels):
        xmin, ymin, xmax, ymax = (round(coord, 2) for coord in box.tolist())
        draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=10)
        # `label` indexes into this image's prompt list; the shared prompt
        # prefix is stripped so only the object name is shown.
        t = text[label].replace("a photo of a ","")
        # The caption sits just above the box; clamp it to the top edge so it
        # stays visible for boxes that start near y = 0.
        label_y = max(ymin - font_size, 0)
        draw.text((xmin, label_y), f"{t}: {round(score.item(), 3)}", fill="white", font=font)

    display(im)

## 1 query image on 1 target image

In [None]:
import requests
from PIL import Image
import torch

from transformers import OwlViTProcessor, OwlViTForObjectDetection

# The processor and the detector are both read from the same local checkpoint
# directory; local_files_only=True restricts loading to files already on disk.
checkpoint_dir = "/mnt/huggingface/hub/models--google--owlvit-base-patch32"
load_kwargs = {"local_files_only": True}

processor = OwlViTProcessor.from_pretrained(checkpoint_dir, **load_kwargs)
model = OwlViTForObjectDetection.from_pretrained(checkpoint_dir, **load_kwargs)

# `image` is the picture to search in; `query_image` is the example of the
# object to look for.
target_image_fp = "/mnt/owlvit/test_img/1701655590224868.jpg"
query_image_fp = "/mnt/owlvit/symbol/van1.jpg"

# Image.open is lazy and holds the file handle open. Converting inside the
# `with` block reads the pixels and returns a detached copy, so each handle is
# released immediately. The conversion also normalises non-RGB files to three
# channels.
with Image.open(target_image_fp) as opened:
    image = opened.convert("RGB")
with Image.open(query_image_fp) as opened:
    query_image = opened.convert("RGB")

inputs = processor(images=image, query_images=query_image, return_tensors="pt")

# PIL reports size as (width, height); the sizes tensor holds a single
# (height, width) row for the one target image.
image_width, image_height = image.size
target_sizes = torch.Tensor([[image_height, image_width]])

# Gradients are not needed for detection, so tracking is switched off.
with torch.no_grad():
    outputs = model.image_guided_detection(**inputs)
    results = processor.post_process_image_guided_detection(
        outputs=outputs,
        threshold=0.6,
        nms_threshold=0.3,
        target_sizes=target_sizes,
    )

# Only one target image was processed, so the first result entry is read.
boxes, scores = results[0]["boxes"], results[0]["scores"]

from PIL import ImageDraw,ImageFont

# The font does not depend on the box, so it is loaded once up front instead
# of being re-read from disk for every detection.
font_size = 40
font = ImageFont.truetype("Overpass-Bold.ttf", font_size)

# Draw on a copy so the loaded image itself is left unmodified.
im = image.copy()
draw = ImageDraw.Draw(im)

for box, score in zip(boxes, scores):
    xmin, ymin, xmax, ymax = (round(coord, 2) for coord in box.tolist())
    draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=10)
    # The score sits just above the box; clamp it to the top edge so it stays
    # visible for boxes that start near y = 0.
    label_y = max(ymin - font_size, 0)
    draw.text((xmin, label_y), f"{round(score.item(), 3)}", fill="white", font=font)

display(im)