In [131]:
import clip
import torch
from torch import nn
from tqdm import tqdm
import torch.nn.functional as F
from pycocotools.coco import COCO

from cocoeval import CocoEvaluator
from detr.models.detr import PostProcess
from detr.models.matcher import HungarianMatcher

# CLIP ResNet-50 is loaded explicitly on the CPU; the rest of this cell keeps
# everything on the CPU as well so later matmuls do not mix devices.
clip_model, clip_preprocess = clip.load('RN50', device='cpu')
coco_val = COCO(annotation_file='coco/annotations/instances_val2017.json')

# Linear head mapping 256-dim inputs to 1024-dim outputs.
# (Presumably DETR decoder features -> CLIP RN50 embedding space, judging by
# the checkpoint file name -- TODO confirm.)
model = nn.Linear(256, 1024)

# map_location='cpu' lets the checkpoint load on machines without a GPU even
# if it was saved from CUDA tensors.
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
checkpoint = torch.load(
    'checkpoints/detr_r50_to_clip_r50_linear_epoch4.pth',
    map_location='cpu',
)
# Kept as the last expression so the cell still displays the key-matching report.
model.load_state_dict(checkpoint['model'])

loading annotations into memory...
Done (t=0.21s)
creating index...
index created!


<All keys matched successfully>

In [132]:
# Cached validation data, iterated batch by batch in the evaluation cells.
# Each batch is indexed with the keys 'outputs', 'targets' and 'h' there.
# map_location='cpu' keeps the cached tensors on the same device as the CPU
# models, even if the file was written from CUDA tensors.
# NOTE: torch.load unpickles the file; only load files you trust.
val = torch.load('detr/outputs/detr_outputs_val.pth', map_location='cpu')

# COCO bounding-box evaluator over the val2017 annotations.
coco_evaluator = CocoEvaluator(coco_val, ('bbox',))

# DETR helpers: Hungarian matcher (class / L1 box / GIoU cost weights 1 / 5 / 2)
# and the post-processor that is called with the original image sizes.
matcher = HungarianMatcher(cost_class=1, cost_bbox=5, cost_giou=2)
postprocess = PostProcess()

In [133]:
# One text prompt per index 0..90, where the index is treated as a COCO
# category id: 'a <name>' when the id exists in the annotations, otherwise the
# placeholder 'unknown'. A final extra 'unknown' prompt is appended, giving
# 92 prompts in total.
texts = [
    'a ' + coco_val.cats[i]['name'] if i in coco_val.cats else 'unknown'
    for i in range(91)
] + ['unknown']

# Category ids in the iteration order of coco_val.cats.
idx2cocoid = list(coco_val.cats)

texts_tokenized = clip.tokenize(texts)

# Inference only: disable autograd so the text embeddings do not carry a
# computation graph (saves memory; the values are the same).
# NOTE(review): the embeddings are used as-is, without L2 normalisation --
# confirm this matches how the linear head was trained.
with torch.no_grad():
    texts_encoded = clip_model.encode_text(texts_tokenized)

In [134]:
# Baseline: COCO bbox evaluation of the cached DETR outputs with DETR's own labels.

# Start from a fresh evaluator so that re-running this cell does not feed the
# same detections into an evaluator that already holds them.
coco_evaluator = CocoEvaluator(coco_val, ('bbox',))

for batch in tqdm(val):
    targets = batch['targets']

    # Original image sizes, one row per image, passed to the post-processor.
    target_sizes = torch.stack([t['orig_size'] for t in targets], dim=0)
    batch_results = postprocess(batch['outputs'], target_sizes)

    # The evaluator expects a mapping image_id -> per-image detection dict.
    res = {
        target['image_id'].item(): output
        for target, output in zip(targets, batch_results)
    }
    coco_evaluator.update(res)

coco_evaluator.synchronize_between_processes()
coco_evaluator.accumulate()
coco_evaluator.summarize()

100%|██████████| 625/625 [00:17<00:00, 35.50it/s]


Accumulating evaluation results...
DONE (t=2.26s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.419
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.624
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.441
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.203
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.458
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.609
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.333
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.531
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.573
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.310
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.628
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= la

In [137]:
# COCO bbox evaluation of the cached DETR detections, with each detection's
# label replaced by the best-matching text prompt under the linear head.

# Fresh evaluator so this cell can be re-run on its own.
coco_evaluator = CocoEvaluator(coco_val, ('bbox',))

# Pure inference: no autograd bookkeeping is needed anywhere in the loop.
with torch.no_grad():
    for batch in tqdm(val):
        targets = batch['targets']

        # Original image sizes, one row per image, passed to the post-processor.
        target_sizes = torch.stack([t['orig_size'] for t in targets], dim=0)
        batch_results = postprocess(batch['outputs'], target_sizes)

        res = {}
        for target, features, detections in zip(targets, batch['h'], batch_results):
            # Project the cached features with the linear head and score them
            # against every text embedding.
            logits = model(features) @ texts_encoded.T

            # Only the winning prompt index is needed; softmax is monotonic, so
            # the arg-max of the raw logits selects the same prompt.
            # Only the 'labels' entry is replaced; everything else produced by
            # postprocess is left untouched.
            # NOTE(review): the index ranges over all text prompts, including the
            # 'unknown' placeholders -- confirm how the evaluator treats those.
            detections['labels'] = logits.argmax(dim=-1)

            res[target['image_id'].item()] = detections

        coco_evaluator.update(res)

coco_evaluator.synchronize_between_processes()
coco_evaluator.accumulate()
coco_evaluator.summarize()

100%|██████████| 625/625 [00:13<00:00, 47.12it/s]


Accumulating evaluation results...
DONE (t=2.43s).
IoU metric: bbox
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.190
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.270
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.199
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.077
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.189
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.272
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.177
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.291
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.307
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.133
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.314
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= la