In [None]:
import io
from PIL import Image
from datasets import load_dataset
from transformers import Owlv2Processor, Owlv2ForObjectDetection

# A single checkpoint name feeds both the processor and the detector.
model_name = "google/owlv2-base-patch16"
processor = Owlv2Processor.from_pretrained(model_name)
model = Owlv2ForObjectDetection.from_pretrained(model_name)

# Fetch the dataset, then show its splits followed by the first test record.
dataset = load_dataset(path="Francesco/animals-ij5d2")
print(dataset, dataset["test"][0], sep="\n")

In [None]:
# Slice the rows first so only the two sample images are decoded;
# indexing the "image" column first would materialise the whole split.
images = dataset["test"][:2]["image"]
categories = dataset["test"].features["objects"].feature["category"].names
# Every image is queried with the full list of category names.
labels = [categories] * len(images)
inputs = processor(text=labels, images=images, return_tensors="pt", padding=True)

print(images)
print(labels)
print("input_ids :", inputs["input_ids"])
print("attention_mask :", inputs["attention_mask"])
print("pixel_values :", inputs["pixel_values"])
print("image_shape :", inputs["pixel_values"].shape)

In [None]:
import torch

# Pick the GPU when one is visible, then put the model there in eval mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Forward pass without autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs.to(device))

print(outputs.keys())
# Report the shape of each output tensor of interest.
for field in ("logits", "objectness_logits", "pred_boxes", "class_embeds"):
    print(f"{field} :", getattr(outputs, field).shape)

In [None]:
# post_process_object_detection expects one (height, width) pair per image;
# the previous (width, height) order mis-scales boxes on non-square images.
# NOTE(review): some transformers versions rescale OWLv2 boxes against the
# square-padded image instead -- confirm against the installed version.
sample_rows = dataset["test"][:2]
target_sizes = [
    [height, width]
    for height, width in zip(sample_rows["height"], sample_rows["width"])
]
detections = processor.post_process_object_detection(
    outputs=outputs, threshold=0.5, target_sizes=target_sizes
)

print(target_sizes)
print(detections)

In [None]:
import matplotlib.pyplot as plt
from PIL import ImageDraw, ImageFont

# Load the font once instead of once per image. Arial may not be installed
# (truetype raises OSError then), so fall back to Pillow's built-in font.
try:
    font = ImageFont.truetype("arial.ttf", 36)
except OSError:
    font = ImageFont.load_default()

# Draw every detection (box + "name: score") on a copy of its source image.
for idx, (image, detect) in enumerate(zip(images, detections)):
    im = image.copy()  # keep the original image untouched
    draw = ImageDraw.Draw(im)

    for box, score, label in zip(detect["boxes"], detect["scores"], detect["labels"]):
        box = [round(i, 2) for i in box.tolist()]
        draw.rectangle(box, outline="red", width=3)

        # label is a 0-dim tensor; convert to a plain int before indexing the list
        label_text = f"{labels[idx][int(label)]}: {round(score.item(), 3)}"
        draw.text((box[0], box[1]), label_text, fill="red", font=font)

    plt.imshow(im)
    plt.axis("off")
    plt.show()

In [None]:
!pip install pycocotools

In [None]:
def get_coco_annotations(dataset):
    """Flatten per-image object records into a list of COCO-style annotations.

    Args:
        dataset: Iterable of samples. Each sample provides "image_id" and an
            "objects" mapping holding parallel lists under the keys "id",
            "category", "bbox" and "area".

    Returns:
        A list with one dict per object containing "image_id", "category_id",
        "bbox" (list of floats), "area", "id" and "iscrowd" (always 0).
    """
    records = []
    for sample in dataset:
        img_id = int(sample["image_id"])
        objs = sample["objects"]
        # The "id" list drives the loop; the other lists are read by position.
        for pos, ann_id in enumerate(objs["id"]):
            record = {
                "image_id": img_id,
                "category_id": int(objs["category"][pos]),
                "bbox": list(map(float, objs["bbox"][pos])),
                "area": float(objs["area"][pos]),
                "id": int(ann_id),
                "iscrowd": 0,
            }
            records.append(record)
    return records

# Only "image_id" and "objects" are needed for the ground truth, so drop the
# image column first: iterating the full split would decode every image.
test_metadata = dataset["test"].remove_columns(["image"])

coco_annotations = get_coco_annotations(test_metadata)
# Minimal COCO ground-truth dict: annotations, image ids and category names.
coco_annotation_format = {
    "annotations": coco_annotations,
    "images": [{"id": int(image_id)} for image_id in test_metadata["image_id"]],
    "categories": [{"id": i, "name": name} for i, name in enumerate(categories)]
}
print(len(coco_annotations))
print(coco_annotation_format["annotations"][0])
print(coco_annotation_format["images"][0])
print(coco_annotation_format["categories"][0])

In [None]:
from torch.utils.data import DataLoader


def collate_detection_batch(batch):
    """Group dataset samples into parallel per-batch lists.

    Args:
        batch: List of dataset samples, each with "image", "image_id" and
            "objects" entries.

    Returns:
        Tuple of (images, target_sizes, image_ids, objects), where each
        target size is [height, width] as the post-processing step expects.
    """
    return (
        [item["image"] for item in batch],
        # PIL's .size is (width, height); reverse it to (height, width).
        [list(item["image"].size[::-1]) for item in batch],
        [item["image_id"] for item in batch],
        [item["objects"] for item in batch],
    )


dataloader = DataLoader(
    dataset["test"],
    batch_size=2,
    collate_fn=collate_detection_batch,
)

# Collect COCO-style detection results for the whole test split.
# Loop variables carry a batch_ prefix so this cell does not overwrite the
# images / inputs / outputs / detections that earlier cells rely on.
predictions = []
model.eval()
with torch.no_grad():
    for batch_images, batch_sizes, batch_image_ids, _ in dataloader:
        # Query every image with the full list of category names.
        batch_queries = [categories] * len(batch_images)
        batch_inputs = processor(images=batch_images, text=batch_queries, return_tensors="pt")
        batch_outputs = model(**batch_inputs.to(device))
        batch_detections = processor.post_process_object_detection(
            outputs=batch_outputs, threshold=0.3, target_sizes=batch_sizes
        )

        for image_id, detection in zip(batch_image_ids, batch_detections):
            category_ids = detection["labels"].tolist()
            scores = detection["scores"].tolist()

            # Convert corner boxes (x0, y0, x1, y1) to COCO (x, y, w, h) in a
            # new tensor, leaving the post-processed boxes unmodified.
            corners = detection["boxes"].cpu()
            boxes = torch.cat(
                [corners[:, :2], corners[:, 2:4] - corners[:, :2]], dim=1
            ).tolist()

            for category_id, score, box in zip(category_ids, scores, boxes):
                predictions.append(
                    {
                        # Cast to int to match the ids used in the ground truth.
                        "image_id": int(image_id),
                        "category_id": category_id,
                        "bbox": box,
                        "score": score
                    }
                )

print(len(predictions))
# Guard the preview: no detection may have passed the threshold.
print(predictions[0] if predictions else "no detections above threshold")

In [None]:
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# Build the ground-truth index straight from the in-memory COCO dict.
coco_gt = COCO()
coco_gt.dataset = coco_annotation_format
coco_gt.createIndex()

# Register the predictions as a result set tied to the ground truth.
coco_dt = coco_gt.loadRes(predictions)

# Run bounding-box evaluation and print the summary metrics.
coco_eval = COCOeval(cocoGt=coco_gt, cocoDt=coco_dt, iouType="bbox")
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()