In [1]:
from collections import Counter

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from matplotlib import pyplot as plt
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

import utils

In [2]:
# Ground-truth annotations.
cocoGt = COCO("cocoGt.json")

# Predictions, loaded as a results object on top of the ground truth.
cocoDt = cocoGt.loadRes("cocoDt.json")

# Bounding-box evaluator with params.useCats = 1.
cocoEval = COCOeval(cocoGt, cocoDt, iouType='bbox')
cocoEval.params.useCats = 1

# Optional: restrict the evaluation to a subset of images by uncommenting
# the next line (leave it commented to evaluate on all images).
# cocoEval.params.imgIds = [5]

# Per-image evaluation -> accumulation -> printed summary, in that order.
for run_stage in (cocoEval.evaluate, cocoEval.accumulate, cocoEval.summarize):
    run_stage()

loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.24s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=6.52s).
Accumulating evaluation results...
DONE (t=1.37s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.461
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.602
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.503
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.263
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.517
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.659
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.354
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.518
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets

In [3]:
# Second bounding-box evaluator over the same data, this time with
# params.useCats = 0.
cocoEval_cls = COCOeval(cocoGt, cocoDt, iouType='bbox')
cocoEval_cls.params.useCats = 0

# Per-image evaluation -> accumulation -> printed summary, in that order.
for run_stage in (cocoEval_cls.evaluate, cocoEval_cls.accumulate, cocoEval_cls.summarize):
    run_stage()

Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=7.35s).
Accumulating evaluation results...
DONE (t=0.46s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.475
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.635
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.518
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.269
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.561
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.732
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.120
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.464
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.532
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.307
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100

In [4]:
# increases score:
# prediction with high confidence but incorrect (FP)
# TP with low confidence, low IoU
# edge cases: small objects, big objects, low IoU, low confidence
# many predictions in one image
# many FP predictions in one image
# many FN in one image
# misclassified objects with high/low confidence
# misclassified prediction whose class is the most frequently misclassified one

# finding insightful FN:
# many FN in one image
# FN with small area
# FN on the edge
# FN with tall/long (elongated) objects

# finding insightful TP:
# TP with low confidence
# TP with low IoU
# TP with small area

In [5]:
# per-image stats:
# prediction count (TP+FP)
# FP count
# FN count
# TP + low conf + low IoU (avg on image)
# FP + high conf (avg on image)
# many rare classes (avg)

# per-prediction stats:
# TP + low conf
# TP + low IoU
# FP + high conf
# rare class
# TP, but incorrect class + high conf
# FN + small area (?)

# image_id / ann_id

In [249]:
# Re-import the local `utils` module so that edits to utils.py take effect
# without restarting the kernel.
from importlib import reload
reload(utils)

<module 'utils' from '/root/model-benchmark/utils.py'>

In [15]:
# Per-image evaluation lookups for both evaluators: the useCats=1 one first,
# then the useCats=0 one.
eval_img_dict, eval_img_dict_cls = (
    utils.get_eval_img_dict(evaluator) for evaluator in (cocoEval, cocoEval_cls)
)

In [53]:
# Build the collection of match records (dicts read later via keys such as
# "type", "image_id", "score", "iou", "miss_cls") from both evaluators.
# NOTE(review): iou_t=0 presumably disables IoU filtering of matches —
# confirm in utils.get_matches.
matches = utils.get_matches(eval_img_dict, eval_img_dict_cls, cocoEval_cls, iou_t=0)

In [8]:
# Sizes of: match records, prediction annotations, ground-truth annotations.
tuple(len(collection) for collection in (matches, cocoDt.anns, cocoGt.anns))

(44425, 33765, 36781)

# Gallery

### Per-image stats

In [None]:
# per_image layout: shape (N_imgs, 5); columns = TP count, FP count, FN count, mean score, mean IoU

In [5]:
# Per-image stats
#
# Builds `per_image`, an (N_imgs, 5) float array with one row per image:
#   col 0: TP count
#   col 1: FP count
#   col 2: FN count
#   col 3: mean score over the image's predictions (TP + FP)
#   col 4: mean IoU over the image's TPs
img_ids = cocoEval.params.imgIds

per_image = np.zeros((len(img_ids), 5))

# Row index <-> image id lookups; idx2imgId is what the ranking cells use to
# translate sorted row indices back into image ids.
imgId2idx = {img_id: idx for idx, img_id in enumerate(img_ids)}
idx2imgId = {idx: img_id for img_id, idx in imgId2idx.items()}

# First pass: accumulate counts and the score / IoU sums.
for match in matches:
    idx = imgId2idx[match['image_id']]
    if match["type"] == "TP":
        per_image[idx, 0] += 1
        per_image[idx, 3] += match["score"]
        per_image[idx, 4] += match["iou"]
    elif match["type"] == "FP":
        per_image[idx, 1] += 1
        per_image[idx, 3] += match["score"]
    elif match["type"] == "FN":
        per_image[idx, 2] += 1

# Second pass: turn the sums into means. Images without predictions (col 3)
# or without TPs (col 4) produce 0/0 = NaN. That NaN is kept on purpose as the
# "undefined" marker (the ranking cells map it with np.nan_to_num or let
# argsort push it to the end); only the RuntimeWarning noise is suppressed.
with np.errstate(divide="ignore", invalid="ignore"):
    per_image[:, 3] /= (per_image[:, 0] + per_image[:, 1])
    per_image[:, 4] /= per_image[:, 0]

  per_image[:, 3] /= (per_image[:, 0] + per_image[:, 1])
  per_image[:, 4] /= per_image[:, 0]


In [None]:
# per-image stats:
# many prediction count (TP+FP)
# many FP count
# many FN count
# TP + low conf + low IoU (avg on image)
# FP + high conf (avg on image)
# many rare classes (avg)

In [10]:
# prediction_count = per_image[:, 0] + per_image[:, 1]
# fp_count = per_image[:, 1]
# fn_count = per_image[:, 2]


# inds_sorted = np.argsort(prediction_count)[::-1]
# prediction_count[inds_sorted]

In [16]:
# FP + high conf (avg on image)
# Logarithmic adjustment score: log(1 + FP count) weighted by the image's
# mean prediction score; NaN entries are replaced by -1.
log_scores = np.nan_to_num(np.log(1 + per_image[:, 1]) * per_image[:, 3], nan=-1)

# Rank rows from highest to lowest score and translate rows to image ids.
inds_sorted = np.argsort(log_scores)[::-1]
log_scores = log_scores[inds_sorted]
img_ids = [idx2imgId[row] for row in inds_sorted]

In [26]:
# Images ranked by false-negative count (column 2 of per_image), most first.
inds_sorted = np.flip(np.argsort(per_image[:, 2]))
fn_count = per_image[inds_sorted, 2]
img_ids = [idx2imgId[row] for row in inds_sorted]

In [32]:
# Images ranked by mean prediction score (column 3 of per_image), lowest first.
inds_sorted = per_image[:, 3].argsort()
low_conf = per_image[inds_sorted, 3]
img_ids = [idx2imgId[row] for row in inds_sorted]

In [37]:
# Images ranked by mean TP IoU (column 4 of per_image), lowest first.
inds_sorted = per_image[:, 4].argsort()
low_iou = np.take(per_image[:, 4], inds_sorted)
img_ids = [idx2imgId[row] for row in inds_sorted]

### Per-instance stats

In [None]:
# increases score:
# prediction with high confidence but incorrect (FP)
# TP with low confidence, low IoU
# edge cases: small objects, big objects, low IoU, low confidence
# many predictions in one image
# many FP predictions in one image
# many FN in one image
# misclassified objects with high/low confidence
# misclassified prediction whose class is the most frequently misclassified one

# finding insightful FN:
# many FN in one image
# FN with small area
# FN on the edge
# FN with tall/long (elongated) objects

# finding insightful TP:
# TP with low confidence
# TP with low IoU
# TP with small area

In [None]:
# per-prediction stats:
# TP + low conf
# TP + low IoU
# FP + high conf
# rare class in GT
# FN + small area (?)
# misclassified prediction + high/low conf
# misclassified prediction whose class is the most frequently misclassified one

In [56]:
# Split the match records into outcome buckets in a single pass:
#   tp_matches       - type "TP"
#   fp_matches       - type "FP" that is not flagged as miss_cls
#   fn_matches       - type "FN"
#   confused_matches - anything flagged as miss_cls, whatever its type
tp_matches, fp_matches, fn_matches, confused_matches = [], [], [], []
for match in matches:
    outcome = match["type"]
    is_confused = match["miss_cls"]
    if outcome == "TP":
        tp_matches.append(match)
    elif outcome == "FP":
        if not is_confused:
            fp_matches.append(match)
    elif outcome == "FN":
        fn_matches.append(match)
    if is_confused:
        confused_matches.append(match)

In [41]:
# TP + low conf: true positives ordered by ascending score.
tp_scores = np.asarray([m["score"] for m in tp_matches])
inds_sorted = np.argsort(tp_scores)
tp_low_conf_matches = [tp_matches[pos] for pos in inds_sorted]

# Parallel id lists for the gallery.
img_ids = [m["image_id"] for m in tp_low_conf_matches]
dt_ids = [m["dt_id"] for m in tp_low_conf_matches]

In [60]:
# TP + low IoU: true positives ordered by ascending IoU.
tp_ious = np.asarray([m["iou"] for m in tp_matches])
inds_sorted = np.argsort(tp_ious)
tp_low_iou_matches = [tp_matches[pos] for pos in inds_sorted]

# Parallel id lists for the gallery.
img_ids = [m["image_id"] for m in tp_low_iou_matches]
dt_ids = [m["dt_id"] for m in tp_low_iou_matches]

In [74]:
# FP + high conf: false positives (those in fp_matches) ordered by descending score.
fp_scores = np.asarray([m["score"] for m in fp_matches])
inds_sorted = np.argsort(fp_scores)[::-1]
fp_high_conf_matches = [fp_matches[pos] for pos in inds_sorted]

# Parallel id lists for the gallery.
img_ids = [m["image_id"] for m in fp_high_conf_matches]
dt_ids = [m["dt_id"] for m in fp_high_conf_matches]

In [94]:
# confused + high conf
# Class-confused matches (miss_cls flagged) ordered by descending score, so
# the most confident confusions come first, as the variable name promises.
# The previous slice `[::1]` was a no-op and left the list lowest-score first.
# To browse low-confidence confusions instead, drop the `[::-1]`.
inds_sorted = np.argsort([match["score"] for match in confused_matches])[::-1]
confused_high_conf_matches = [confused_matches[idx] for idx in inds_sorted]

# Parallel id lists for the gallery (image, prediction, ground truth).
img_ids = [match["image_id"] for match in confused_high_conf_matches]
dt_ids = [match["dt_id"] for match in confused_high_conf_matches]
gt_ids = [match["gt_id"] for match in confused_high_conf_matches]

### Rare classes

Only applicable to datasets with 3 or more classes.

In [229]:
# Ids and names of the rare categories, as reported by utils.get_rare_classes
# for the ground-truth dataset.
# NOTE(review): the rarity criterion lives in utils.get_rare_classes and is
# not visible here — confirm before interpreting the gallery.
cat_ids_rare, cat_names = utils.get_rare_classes(cocoGt)

In [213]:
# Get images with most rare classes
#
# `counts` maps image_id -> number of match records whose category is rare;
# `sorted_counts` is the same data as (image_id, count) pairs, highest count
# first. Uses collections.Counter (imported in the top import cell); the
# previous version used `defaultdict` without importing it and raised
# NameError on a fresh kernel.

# Hoist the membership container once so each lookup is a hash lookup.
rare_ids = set(cat_ids_rare)

rare_counter = Counter(
    match["image_id"] for match in matches if match["category_id"] in rare_ids
)
counts = dict(rare_counter)

# sort by count (descending)
sorted_counts = rare_counter.most_common()

# img_id, n = sorted_counts[idx]

In [None]:
# Show the predictions for the image at rank `idx` of sorted_counts
# (each entry is an (image_id, count) pair, so [0] is the image id).
idx = 1
utils.show_pred_image(sorted_counts[idx][0], cocoDt)

In [None]:
# Show the ground truth for the image at rank `idx` of the current `img_ids`.
# NOTE(review): the prediction cell indexes `sorted_counts`, whereas this one
# indexes `img_ids` (last assigned by whichever ranking cell ran most
# recently) — the two may display different images; confirm this is intended.
utils.show_gt_image(img_ids[idx], cocoGt)

In [260]:
# Re-import the local `utils` module so that edits to utils.py take effect
# without restarting the kernel.
from importlib import reload
reload(utils)

<module 'utils' from '/root/model-benchmark/utils.py'>

In [261]:
# Fetch the TP / FP / FN count arrays for the useCats=1 evaluator.
# NOTE(review): array shape and axis meaning are defined in utils.get_counts —
# TODO confirm before relying on the axis arguments used in the cells below.
true_positives, false_positives, false_negatives = utils.get_counts(cocoEval)

In [297]:
# Reduce each count array over axis 1, then derive precision / recall per
# remaining entry and report their means.
tp, fp, fn = (
    arr.sum(1) for arr in (true_positives, false_positives, false_negatives)
)

predicted_total = tp + fp
actual_total = tp + fn
precision = tp / predicted_total
recall = tp / actual_total

print(f"Precision: {precision.mean():.3f}")
print(f"Recall: {recall.mean():.3f}")

Precision: 0.587
Recall: 0.538


In [300]:
# Sanity check: total of the first TP, FP and FN entries.
tp[0] + fp[0] + fn[0]

46085.0

In [301]:
# Number of match records, for comparison with the TP + FP + FN total.
len(matches)

44425

In [299]:
# Reduce each count array over axes 0 and 1 together, then derive precision /
# recall per remaining entry and report their means.
tp, fp, fn = (
    arr.sum((0, 1)) for arr in (true_positives, false_positives, false_negatives)
)

predicted_total = tp + fp
actual_total = tp + fn
precision = tp / predicted_total
recall = tp / actual_total

print(f"Precision: {precision.mean():.3f}")
print(f"Recall: {recall.mean():.3f}")

Precision: 0.619
Recall: 0.546


In [283]:
# IoU thresholds configured on the evaluator.
cocoEval.params.iouThrs

array([0.5 , 0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95])