In [2]:
#@title Import necessary packages and set correct device

#  To run this notebook inside a virtual environment, first register the
#  environment as a Jupyter kernel:
#  python -m ipykernel install --user --name=yoloenv

import numpy as np
import torch
from tqdm import tqdm

from modules.refcocog import RefCOCOg, RefCOCOgSample
from modules.yoloclip import YoloClip

%matplotlib inline

# Pick the compute device once for the whole notebook: CUDA first, then
# Apple's MPS backend, and finally the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")  # CUDA GPU
    print("[INFO] Using GPU.")
# `torch.has_mps` is deprecated and only reports whether PyTorch was *built*
# with MPS support; `is_available()` also checks that the device can be used
# at runtime. The getattr guard keeps builds without the MPS backend working.
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")  # Apple Silicon GPU
    print("[INFO] Using MPS.")
else:
    device = torch.device("cpu")
    print("[INFO] No GPU found, using CPU instead.")


[INFO] Using MPS.


In [4]:
#@title Import RefCOCOg dataset and its train/val/test splits

# Alternative dataset location:
# data_path = "/media/dmmp/vid+backup/Data/refcocog"
data_path = "dataset/refcocog"

# Without a `split` argument the loader is built over the whole dataset
# (the reported full size equals the sum of the three split sizes).
dataset = RefCOCOg(ds_path=data_path)

# One loader per split, constructed in train -> val -> test order.
train_ds, val_ds, test_ds = (
    RefCOCOg(ds_path=data_path, split=split_name)
    for split_name in ("train", "val", "test")
)

# Report how many samples each loader exposes.
print(
    f"[INFO] Dataset Size: {len(dataset)}",
    f"[INFO] train split:  {len(train_ds)}",
    f"[INFO] val split:    {len(val_ds)}",
    f"[INFO] test split:   {len(test_ds)}",
    sep="\n",
)

[INFO] Dataset Size: 49822
[INFO] train split:  42226
[INFO] val split:    2573
[INFO] test split:   5023


In [5]:
#@title Initialize YoloClip pipeline

# Reuse the device selected in the setup cell instead of hard-coding "mps",
# so the pipeline can also be initialized on CUDA and CPU-only machines.
# The device is passed as a string to match the original call's argument type.
yoloclip = YoloClip(device=str(device), categories=dataset.categories)


In [None]:
#@title Test YoloClip on a random sample

%matplotlib inline

# Draw a random index into the full dataset and wrap the raw record in a
# RefCOCOgSample.
idx = np.random.randint(low=0, high=len(dataset))
sample = RefCOCOgSample(**dataset[idx])

# Run the pipeline once for every sentence attached to the sample,
# asking it to display each result.
for sentence in sample.sentences:
    yoloclip(sample, sentence, show=True)


---

In [None]:
#@title Function definition to test visual grounding with a given pipeline

def visual_grounding_test(vg_pipeline, dataset, track="IoU"):
    """Run a visual grounding pipeline over a dataset and print average metrics.

    Every record of ``dataset`` is wrapped in a ``RefCOCOgSample`` and the
    pipeline is called once per sentence of that sample with ``show=False``.
    Each call is expected to return a mapping from metric name to a numeric
    score.

    Args:
        vg_pipeline: Callable invoked as ``vg_pipeline(sample, sentence, show=False)``.
        dataset: Iterable of mappings that can be expanded into
            ``RefCOCOgSample(**record)``.
        track: Name of the metric whose running average is shown in the
            progress bar description.

    Returns:
        None. The per-metric averages are printed to stdout.

    Raises:
        KeyError: If a score mapping returned by the pipeline lacks ``track``.
    """
    scores = []
    # Running sum of the tracked metric, so the progress bar can be refreshed
    # in O(1) instead of re-averaging the whole score list after every call.
    tracked_total = 0.0

    pbar = tqdm(dataset)

    for sample in pbar:
        sample = RefCOCOgSample(**sample)

        for sentence in sample.sentences:
            sc = vg_pipeline(sample, sentence, show=False)
            scores.append(sc)

            tracked_total += sc[track]
            avg_score = tracked_total / len(scores)

            pbar.set_description(f"Average {track}: {avg_score:.3f}")

    # An empty dataset (or samples without sentences) yields no scores;
    # indexing scores[0] below would otherwise raise IndexError.
    if not scores:
        print("[INFO] No scores collected; nothing to average.")
        return

    # The metric names are taken from the first score mapping.
    for metric in scores[0].keys():
        avg_metric = np.mean([score[metric] for score in scores])

        print("Avg. {}: {:.3f}".format(metric, avg_metric))


In [None]:
#@title Test YoloClip on the test set

# Set the pipeline's `quiet` flag before the full evaluation run.
yoloclip.quiet = True

# Evaluate on the test split; the tracked metric stays at its default.
visual_grounding_test(vg_pipeline=yoloclip, dataset=test_ds)