In [3]:
#@title Import necessary packages and set correct device

#  To use venv,
#  python -m ipykernel install --user --name=yoloenv

import numpy as np
import torch
from tqdm import tqdm
from modules.utilities import visual_grounding_test

from modules.refcocog import RefCOCOg, RefCOCOgSample
from modules.yoloclip import YoloClip

%matplotlib inline

# Pick the compute backend in order of preference:
# CUDA GPU -> Apple Silicon GPU (MPS) -> CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")  # CUDA GPU
    print("[INFO] Using GPU.")
# `torch.has_mps` is deprecated and only tells whether this PyTorch build was
# compiled with MPS support; `is_available()` additionally checks that the
# current machine can actually use the MPS backend.
elif torch.backends.mps.is_available():
    device = torch.device("mps")  # Apple Silicon GPU
    print("[INFO] Using MPS.")
else:
    device = torch.device("cpu")
    print("[INFO] No GPU found, using CPU instead.")


[INFO] Using MPS.


In [4]:
#@title Import RefCOCOg dataset and its train/val/test splits

# Root folder of the RefCOCOg data. This is a relative path, so it is resolved
# against the notebook's current working directory.
# (Alternative location used previously: "/media/dmmp/vid+backup/Data/refcocog")
data_path = "dataset/refcocog"

# Build the full dataset first (no split argument), then one instance per split.
dataset = RefCOCOg(ds_path=data_path)
train_ds, val_ds, test_ds = (
    RefCOCOg(ds_path=data_path, split=split_name)
    for split_name in ("train", "val", "test")
)

# Report the length of the full dataset and of each split, one per line.
print(
    f"[INFO] Dataset Size: {len(dataset)}",
    f"[INFO] train split:  {len(train_ds)}",
    f"[INFO] val split:    {len(val_ds)}",
    f"[INFO] test split:   {len(test_ds)}",
    sep="\n",
)

[INFO] Dataset Size: 49822
[INFO] train split:  42226
[INFO] val split:    2573
[INFO] test split:   5023


In [25]:
#@title Initialize YoloClip pipeline

# Reuse the backend chosen by the device-selection cell instead of hard-coding
# "mps", which is only usable on Apple Silicon machines. `device.type` is the
# plain backend name ("cuda", "mps" or "cpu"), so YoloClip still receives a
# string, as it did before.
yoloclip = YoloClip(device=device.type, categories=dataset.categories)


[INFO] Time elapsed: 0.50s


{'IoU': 0.7288451790809631,
 'cosine': 0.31918758153915405,
 'euclidean': 1.166886806488037,
 'dotproduct': 5.456388473510742,
 'grounding': 0.0}

In [8]:
#@title Test YoloClip on a random sample

# Inline plotting is already enabled by the `%matplotlib inline` in the
# imports cell, so it is not repeated here.

# Draw one dataset index at random; the upper bound of randint is exclusive,
# so every valid index in [0, len(dataset)) can be picked.
idx = np.random.randint(0, len(dataset))

# The dataset item is unpacked as keyword arguments into a RefCOCOgSample.
sample = RefCOCOgSample(**dataset[idx])

# Run the pipeline once for every sentence attached to the sample.
# NOTE(review): show=True presumably renders the prediction — confirm in
# modules.yoloclip.
for sentence in sample.sentences:
    yoloclip(sample, sentence, show=True)


image 1/1: 720x1280 2 persons, 2 ties
Speed: 3227.7ms pre-process, 66.1ms inference, 0.6ms NMS per image at shape (1, 3, 384, 640)
Saved 1 image to [1mruns/detect/exp[0m


Unnamed: 0,xmin,ymin,xmax,ymax,confidence,class,name
0,743.290344,48.343597,1141.756714,720.0,0.879861,0,person
1,441.989624,437.33667,496.585083,710.036255,0.675118,27,tie
2,123.050964,193.238068,714.690674,719.771362,0.666693,0,person
3,978.989807,313.579468,1025.302734,415.526184,0.261517,27,tie


Using cache found in /Users/azel/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2023-5-9 Python-3.9.6 torch-2.0.0 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


[31m[1mrequirements:[0m /Users/azel/.cache/torch/hub/requirements.txt not found, check failed.


image 1/1: 720x1280 2 persons, 2 ties
Speed: 10613.9ms pre-process, 60.2ms inference, 0.5ms NMS per image at shape (1, 3, 384, 640)
Saved 1 image to [1mruns/detect/exp2[0m


Unnamed: 0,xmin,ymin,xmax,ymax,confidence,class,name
0,743.290344,48.343597,1141.756714,720.0,0.879861,0,person
1,441.989624,437.33667,496.585083,710.036255,0.675118,27,tie
2,123.050964,193.238068,714.690674,719.771362,0.666693,0,person
3,978.989807,313.579468,1025.302734,415.526184,0.261517,27,tie


---

In [None]:
#@title Execute testing on the test dataset

# Run the project's visual-grounding evaluation helper with the YoloClip
# pipeline on the test split (5023 entries according to the split summary
# printed by the dataset-loading cell).
# NOTE(review): which metrics visual_grounding_test computes, and whether it
# returns or prints them, is defined in modules.utilities — confirm there.
visual_grounding_test(yoloclip, test_ds)