In [1]:
import os
import json

from tqdm import tqdm
from ultralytics import YOLO

In [2]:
YOLO_MODEL_PATH = "weights/yolov8s-worldv2.pt"
CLASSES = ["cars, "]
OUTPUT_FILE = "results/test_detection.json"
TRAIN_BATCH_SIZE = 1
VAL_BATCH_SIZE = 1
TEST_BATCH_SIZE = 1

os.makedirs("results", exist_ok=True)

In [None]:
import sys
from datasets import load_dataset
from torch.utils.data import IterableDataset, DataLoader

def load_our_dataset(split):
    return load_dataset("ntudlcv/dlcv_2024_final1", split=split, streaming=True)

training_dataset: IterableDataset = load_our_dataset("train")
val_dataset: IterableDataset = load_our_dataset("val")
test_dataset: IterableDataset = load_our_dataset("test")

Each element in the dataset is a dictionary with the following structure:
| Key | Type | Description |
|-----|------|-------------|
| id | int | The unique identifier for the sample |
| image | PIL.Image.Image | The image data as a PIL Image object |
| conversations | str | Concatenation of input text and output description |

In [4]:
def custom_collate_fn(batch):
    ids = [item["id"] for item in batch]
    images = [item["image"] for item in batch]
    conversations = [item["conversations"] for item in batch]
    
    return {
        "ids": ids,
        "images": images,
        "conversations": conversations
    }

train_loader = DataLoader(training_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=VAL_BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)

In [5]:
data = dict()
detection_model = YOLO(YOLO_MODEL_PATH)

# Define custom classes
if CLASSES:
    detection_model.set_classes(CLASSES)

# Execute prediction for specified categories on an image
for batch_idx, batch in enumerate(tqdm(test_loader)):
    if batch_idx > 10:
        break
    results = detection_model.predict(batch["images"], conf=0.25)
    for index, result in enumerate(results):
        image_data = []
        for box in result.boxes:
            image_data.append({
                "bbox": box.xyxyn.tolist(),
                "category_id": box.cls.item(),
                "confidence": box.conf.item(),
                "category_name": detection_model.names.get(box.cls.item())
            })
        data[batch["ids"][index]] = image_data
        result.save_txt(f"results/{batch_idx * TEST_BATCH_SIZE + index}.txt")
        result.save(f"results/{batch_idx * TEST_BATCH_SIZE + index}.jpg")

with open(OUTPUT_FILE, "w") as f:
    json.dump(data, f, indent=4)
    print(f"Saved to {OUTPUT_FILE}")

0it [00:00, ?it/s]


0: 352x640 (no detections), 24.0ms
Speed: 1.7ms preprocess, 24.0ms inference, 6.3ms postprocess per image at shape (1, 3, 352, 640)


1it [00:07,  7.74s/it]


0: 352x640 1 cars, , 6.7ms
Speed: 1.1ms preprocess, 6.7ms inference, 18.5ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 (no detections), 6.9ms
Speed: 1.1ms preprocess, 6.9ms inference, 0.3ms postprocess per image at shape (1, 3, 352, 640)


3it [00:07,  2.05s/it]


0: 352x640 (no detections), 9.0ms
Speed: 1.1ms preprocess, 9.0ms inference, 0.6ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 (no detections), 9.1ms
Speed: 1.4ms preprocess, 9.1ms inference, 0.5ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 1 cars, , 8.9ms
Speed: 1.4ms preprocess, 8.9ms inference, 1.2ms postprocess per image at shape (1, 3, 352, 640)


6it [00:07,  1.22it/s]


0: 352x640 (no detections), 8.9ms
Speed: 1.4ms preprocess, 8.9ms inference, 0.4ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 (no detections), 8.8ms
Speed: 1.4ms preprocess, 8.8ms inference, 0.6ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 (no detections), 9.0ms
Speed: 1.5ms preprocess, 9.0ms inference, 0.4ms postprocess per image at shape (1, 3, 352, 640)


9it [00:08,  2.18it/s]


0: 352x640 (no detections), 8.9ms
Speed: 1.4ms preprocess, 8.9ms inference, 0.7ms postprocess per image at shape (1, 3, 352, 640)

0: 352x640 (no detections), 9.0ms
Speed: 1.3ms preprocess, 9.0ms inference, 0.7ms postprocess per image at shape (1, 3, 352, 640)


11it [00:08,  1.33it/s]

Saved to results/test_detection.json



