In [None]:
# evaluate_yolov11_final.py
# Full evaluation script that mirrors your working inference pipeline (letterbox, dtype handling, scaling).
# Saves COCO-format predictions and runs COCOeval.
#
# Usage: place in repo root and run: python evaluate_yolov11_final.py

import os
import cv2
import json
import torch
import numpy as np
from tqdm import tqdm
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

  ckpt = torch.load(WEIGHT, map_location="cpu")


loading annotations into memory...
Done (t=0.61s)
creating index...
index created!


Evaluating: 100%|██████████| 5000/5000 [04:08<00:00, 20.13img/s]



Saved 24767 predictions to yolov11_predictions.json (processed 5000 images)
Loading and preparing results...
DONE (t=0.11s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=10.85s).
Accumulating evaluation results...
DONE (t=1.72s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.325
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.443
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.355
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.119
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.352
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.512
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.269
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.371
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 

In [None]:
# ---- CONFIG ----
# Everything a reader may want to tune lives here; main() reads these module-level constants.

# Checkpoint passed to torch.load(); main() requires it to contain a 'model' key.
WEIGHT = "weights/best.pt"
IMG_DIR = "dataset/coco/val2017"                       # image folder; joined with each COCO 'file_name' to build the image path
# COCO-format ground-truth annotations (loaded with pycocotools' COCO).
ANN_FILE = "annotations/instances_val2017.json"
IMG_SIZE = (640, 640)   # (H, W) letterbox target fed to the model
# Thresholds forwarded to the repo's non_max_suppression(confidence_threshold=..., iou_threshold=...).
# NOTE(review): COCO mAP is normally computed with a very low confidence threshold
# (e.g. 0.001) so that the whole precision/recall curve is available; 0.25 discards
# low-score detections and will likely under-report AP/AR -- confirm this is intended.
CONF_THR = 0.25
IOU_THR = 0.45
# Output path for the COCO-format detection list; the same file is re-read by cocoGt.loadRes().
SAVE_JSON = "yolov11_predictions.json"
VISUALIZE = False        # True -> write vis_<imgid>.jpg whenever the processed-image counter is a multiple of 10 and the image has detections
# ----------------

# Contiguous class index (0..79) -> COCO annotation category id (1..90).
# The tuple position is the class index and the value is the category id; the
# category ids are non-contiguous (12, 26, 29, 30, 45, 66, 68, 69, 71 and 83
# are unused). Only needed when the annotation file uses the 1..90 id scheme.
COCO91CLASS = dict(enumerate((
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10,              # classes 0-9
    11, 13, 14, 15, 16, 17, 18, 19, 20, 21,     # classes 10-19
    22, 23, 24, 25, 27, 28, 31, 32, 33, 34,     # classes 20-29
    35, 36, 37, 38, 39, 40, 41, 42, 43, 44,     # classes 30-39
    46, 47, 48, 49, 50, 51, 52, 53, 54, 55,     # classes 40-49
    56, 57, 58, 59, 60, 61, 62, 63, 64, 65,     # classes 50-59
    67, 70, 72, 73, 74, 75, 76, 77, 78, 79,     # classes 60-69
    80, 81, 82, 84, 85, 86, 87, 88, 89, 90,     # classes 70-79
)))

In [None]:
# ---- helpers (reuse exact letterbox from your working inference) ----
def letterbox(img, new_shape=(640, 640), color=(114, 114, 114)):
    """Aspect-preserving resize followed by constant-colour padding.

    Args:
        img: image array whose first two dimensions are (height, width).
        new_shape: target canvas size as (height, width).
        color: border value handed to ``cv2.copyMakeBorder``.

    Returns:
        ``(padded_img, gain, (pad_left, pad_top))`` where ``gain`` is the single
        scale factor applied to both axes and the pad tuple is the number of
        border pixels added on the left and top sides.
    """
    src_h, src_w = img.shape[:2]
    target_h, target_w = new_shape

    # One scale factor for both axes so the aspect ratio is preserved.
    scale = min(target_h / src_h, target_w / src_w)
    scaled_w, scaled_h = int(round(src_w * scale)), int(round(src_h * scale))
    resized = cv2.resize(img, (scaled_w, scaled_h), interpolation=cv2.INTER_LINEAR)

    # Split the leftover canvas between the two sides of each axis. The -/+ 0.1
    # nudge sends an odd leftover pixel to the bottom/right side.
    half_gap_w = (target_w - scaled_w) / 2
    half_gap_h = (target_h - scaled_h) / 2
    pad_top, pad_bottom = int(round(half_gap_h - 0.1)), int(round(half_gap_h + 0.1))
    pad_left, pad_right = int(round(half_gap_w - 0.1)), int(round(half_gap_w + 0.1))

    padded = cv2.copyMakeBorder(
        resized, pad_top, pad_bottom, pad_left, pad_right,
        cv2.BORDER_CONSTANT, value=color,
    )
    return padded, scale, (pad_left, pad_top)

def scale_coords_from_padded(dets, gain, pad, orig_w, orig_h):
    """Map detection boxes from letterboxed-input pixels back to the original image.

    Args:
        dets: tensor Nx6 (x1, y1, x2, y2, conf, cls) in padded image coordinates.
        gain: scale factor that was applied when resizing the original image.
        pad: (pad_left, pad_top) border sizes that were added after resizing.
        orig_w: original image width, used as the upper clamp for x coordinates.
        orig_h: original image height, used as the upper clamp for y coordinates.

    Returns:
        A new tensor with the first four columns remapped and clamped; the
        input tensor is left untouched because all work happens on a clone.
    """
    scaled = dets.clone()
    shift_x, shift_y = pad
    x_cols, y_cols = [0, 2], [1, 3]

    # Undo the padding offset on each coordinate column.
    for col in x_cols:
        scaled[:, col].sub_(shift_x)
    for col in y_cols:
        scaled[:, col].sub_(shift_y)

    # Undo the resize.
    scaled[:, :4].div_(gain)

    # Keep every box inside the original image bounds.
    scaled[:, x_cols] = scaled[:, x_cols].clamp(min=0, max=orig_w)
    scaled[:, y_cols] = scaled[:, y_cols].clamp(min=0, max=orig_h)
    return scaled

# ---- repo NMS ----
# The evaluation relies on the repository's own non_max_suppression (utils/util.py),
# which is expected to accept the model-native output shape [B, C, N]. Any failure
# while importing it is re-raised as a RuntimeError carrying the original message.
try:
    from utils.util import non_max_suppression
except Exception as import_err:
    raise RuntimeError(f"Failed to import repo non_max_suppression from utils.util: {import_err}")

# ---- main evaluation ----
def _load_model(weight_path, device):
    """Load the model object stored under the ``'model'`` key of a checkpoint.

    The model is put in eval mode, normalised to either fp16 or fp32 (matching
    the dtype of its first parameter) and moved to ``device``.

    Raises:
        RuntimeError: if the checkpoint has no ``'model'`` entry.
    """
    # SECURITY: the checkpoint stores a pickled model object, so it must be
    # loaded with weights_only=False, which can execute arbitrary code embedded
    # in the file. Only load checkpoints from a trusted source. The flag is
    # passed explicitly because recent PyTorch releases default to
    # weights_only=True, which rejects pickled model objects.
    ckpt = torch.load(weight_path, map_location="cpu", weights_only=False)
    if 'model' not in ckpt:
        raise RuntimeError("Checkpoint does not contain 'model' key.")
    model = ckpt['model']
    model.eval()

    # dtype handling (match inference): keep fp16 checkpoints in fp16, everything else in fp32
    if next(model.parameters()).dtype == torch.half:
        model = model.half()
    else:
        model = model.float()
    return model.to(device)


def _class_scores_to_probabilities(out):
    """Apply a sigmoid to the class channels of ``out`` if they look like raw logits.

    ``out`` is expected to be laid out as [B, C, N] with the first four channels
    holding box coordinates and the remaining ``C - 4`` holding class scores.
    Heuristic: scores are treated as logits when any value exceeds 1.0. Outputs
    without a class slice are returned unchanged instead of raising.
    """
    if out.dim() < 2 or out.shape[1] <= 4:
        return out
    cls_scores = out[:, 4:]
    if cls_scores.numel() > 0 and float(cls_scores.max()) > 1.0:
        # Build a new tensor rather than writing into the model output in place.
        out = torch.cat([out[:, :4], cls_scores.sigmoid()], dim=1)
    return out


def _detections_to_coco(dets, img_id, mapping_needed):
    """Convert an Nx6 (x1, y1, x2, y2, conf, cls) tensor into COCO result dicts.

    Degenerate boxes (non-positive width or height) are dropped, as are classes
    that have no entry in COCO91CLASS when ``mapping_needed`` is true. Scores
    are clamped to [0, 1] and boxes are emitted as [x, y, w, h].
    """
    records = []
    for x1, y1, x2, y2, conf, cls in dets.tolist():
        w = x2 - x1
        h = y2 - y1
        if w <= 0 or h <= 0:
            continue

        cls_idx = int(cls)  # model class index (0..79)
        if mapping_needed:
            coco_cat_id = COCO91CLASS.get(cls_idx)
            if coco_cat_id is None:
                continue
        else:
            coco_cat_id = cls_idx

        records.append({
            "image_id": img_id,
            "category_id": int(coco_cat_id),
            "bbox": [max(0.0, x1), max(0.0, y1), float(w), float(h)],
            "score": max(0.0, min(1.0, conf)),
        })
    return records


def main():
    """Run the model over the annotated images, save COCO-format predictions and run COCOeval.

    For every image id in ANN_FILE whose file exists under IMG_DIR, the image is
    letterboxed to IMG_SIZE, passed through the model, filtered with the repo's
    non_max_suppression, mapped back to original-image coordinates and recorded
    as COCO detections. Predictions are written to SAVE_JSON and then scored
    with COCOeval on exactly the images that were evaluated.

    Raises:
        RuntimeError: if the checkpoint does not contain a 'model' entry.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = _load_model(WEIGHT, device)
    use_half = next(model.parameters()).dtype == torch.half  # loop-invariant

    cocoGt = COCO(ANN_FILE)
    img_ids = cocoGt.getImgIds()

    # Loop-invariant: if the GT category ids do not contain 0 they follow the
    # 1..90 scheme and model class indices 0..79 must be mapped.
    mapping_needed = 0 not in set(cocoGt.getCatIds())

    results = []
    evaluated_ids = []   # images that actually went through the model + NMS
    processed = 0

    for img_id in tqdm(img_ids, desc="Evaluating", unit="img"):
        info = cocoGt.loadImgs(img_id)[0]
        img_path = os.path.join(IMG_DIR, info['file_name'])
        if not os.path.exists(img_path):
            # skip if image not available locally
            continue

        img_bgr = cv2.imread(img_path)
        if img_bgr is None:
            continue
        orig_h, orig_w = img_bgr.shape[:2]

        # preprocessing: letterbox -> RGB -> CHW float tensor in [0, 1]
        img_pad, gain, pad = letterbox(img_bgr, new_shape=IMG_SIZE)
        img_rgb = cv2.cvtColor(img_pad, cv2.COLOR_BGR2RGB)
        img_tensor = torch.from_numpy(img_rgb).permute(2, 0, 1).unsqueeze(0).float() / 255.0
        img_tensor = img_tensor.to(device)
        if use_half:
            img_tensor = img_tensor.half()

        # forward (use model output as-is, repo NMS expects that)
        with torch.no_grad():
            out = model(img_tensor)
        if isinstance(out, (list, tuple)):
            out = out[0]   # sometimes (pred, loss)

        # out expected shape: [B, C, N]
        if not torch.is_tensor(out):
            continue

        out = _class_scores_to_probabilities(out)

        # Run repo NMS (works with output as-is)
        dets = non_max_suppression(out, confidence_threshold=CONF_THR, iou_threshold=IOU_THR)[0]

        evaluated_ids.append(img_id)
        if dets is None or len(dets) == 0:
            processed += 1
            continue

        # Map boxes from padded input -> original image
        dets = scale_coords_from_padded(dets.detach().cpu(), gain, pad, orig_w, orig_h)
        results.extend(_detections_to_coco(dets, img_id, mapping_needed))

        # optional visualization (quick sanity check)
        if VISUALIZE and processed % 10 == 0:
            vis = img_bgr.copy()
            for x1, y1, x2, y2, conf, cls in dets.tolist():
                xa, ya, xb, yb = int(x1), int(y1), int(x2), int(y2)
                cv2.rectangle(vis, (xa, ya), (xb, yb), (0,255,0), 2)
                cv2.putText(vis, f"{int(cls)}:{conf:.2f}", (xa, max(0, ya-6)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,255,0), 1)
            cv2.imwrite(f"vis_{img_id}.jpg", vis)

        processed += 1

    # save predictions
    with open(SAVE_JSON, "w") as f:
        json.dump(results, f)
    print(f"\nSaved {len(results)} predictions to {SAVE_JSON} (processed {processed} images)")

    skipped = len(img_ids) - len(evaluated_ids)
    if skipped:
        print(f"Skipped {skipped} images (missing/unreadable or unusable model output); "
              "they are excluded from the evaluation.")

    # run COCOeval
    if len(results) == 0:
        print("No results to evaluate.")
        return

    cocoDt = cocoGt.loadRes(SAVE_JSON)
    cocoEval = COCOeval(cocoGt, cocoDt, "bbox")
    # Score only the images that were actually evaluated. COCOeval defaults to
    # every image in the annotation file, which would count skipped images as
    # having zero detections and depress AP/AR.
    cocoEval.params.imgIds = evaluated_ids
    cocoEval.evaluate()
    cocoEval.accumulate()
    cocoEval.summarize()

In [None]:
# Entry point: runs the full evaluation when this code is executed as the main
# module (a script run; a notebook kernel also executes cells with
# __name__ == "__main__", so running this cell starts the evaluation).
if __name__ == "__main__":
    main()


 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.325
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.443
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.355
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.119
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.352
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.512
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.269
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.371
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.373
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.131
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.400
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.593