In [1]:
import numpy as np
from matplotlib import pyplot as plt
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import utils
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from collections import defaultdict

In [2]:
# Ground-truth annotations, plus the detections registered against them
cocoGt = COCO("cocoGt.json")
cocoDt = cocoGt.loadRes("cocoDt.json")

# Bounding-box evaluator; useCats = 1 keeps category labels in play
# (presumably per-category matching — see the pycocotools Params docs)
cocoEval = COCOeval(cocoGt, cocoDt, iouType='bbox')
cocoEval.params.useCats = 1

# Optional: restrict the evaluation to a subset of images by enabling the line below
# cocoEval.params.imgIds = [5]  # leave commented out to evaluate on all images

# Per-image evaluation, accumulation, then the printed summary table
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()

loading annotations into memory...
Done (t=0.26s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.26s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=6.35s).
Accumulating evaluation results...
DONE (t=1.37s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.461
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.602
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.503
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.263
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.517
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.659
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.354
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.518
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets

In [3]:
# Second evaluator on the same data with useCats = 0
# (presumably class-agnostic matching — see the pycocotools Params docs)
cocoEval_cls = COCOeval(cocoGt, cocoDt, iouType='bbox')
cocoEval_cls.params.useCats = 0

# Per-image evaluation, accumulation, then the printed summary table
cocoEval_cls.evaluate()
cocoEval_cls.accumulate()
cocoEval_cls.summarize()

Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=7.18s).
Accumulating evaluation results...
DONE (t=0.47s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.475
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.635
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.518
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.269
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.561
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.732
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.120
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.464
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.532
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.307
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100

In [4]:
# increases score:
# prediction with high confidence but incorrect (FP)
# TP with low confidence, low IoU
# edge cases: small objects, big objects, low IoU, low confidence
# many predictions in one image
# many FP predictions in one image
# many FN in one image
# misclassified objects with high/low confidence
# misclassified prediction, and it is the most often misclassified class

# finding insightful FN:
# many FN in one image
# FN with small area
# FN on the edge
# FN with tall/long object

# finding insightful TP:
# TP with low confidence
# TP with low IoU
# TP with small area

In [5]:
# per-image stats:
# prediction count (TP+FP)
# FP count
# FN count
# TP + low conf + low IoU (avg on image)
# FP + high conf (avg on image)
# many rare classes (avg)

# per-prediction stats:
# TP + low conf
# TP + low IoU
# FP + high conf
# rare class
# TP, but incorrect class + high conf
# FN + small area (?)

# image_id / ann_id

In [None]:
# When True, the cells below additionally record (image_id, gt_id, dt_id) tuples:
# per class for the FP / FN matches, and per cell of the confusion matrix.
save_ids = True

In [4]:
# Category ids as used by the evaluator, and their names in the same order
cat_ids = cocoEval.params.catIds
cat_names = [cocoGt.cats[cid]['name'] for cid in cat_ids]

# Count arrays derived from the category-aware evaluation
true_positives, false_positives, false_negatives = utils.get_counts(cocoEval)

# Per-image results of both evaluations, combined into one list of match records
eval_img_dict = utils.get_eval_img_dict(cocoEval)
eval_img_dict_cls = utils.get_eval_img_dict(cocoEval_cls)
matches = utils.get_matches(
    eval_img_dict,
    eval_img_dict_cls,
    cocoEval_cls,
    iou_t=0,
)

# Sanity check: number of matches vs. number of detections and ground-truth annotations
len(matches), len(cocoDt.anns), len(cocoGt.anns)

(44425, 33765, 36781)

In [5]:
# Totals taken at index 0 of the last axis
# (presumably the first IoU threshold — confirm against utils.get_counts)
TP_count, FP_count, FN_count = (
    int(counts[..., 0].sum())
    for counts in (true_positives, false_positives, false_negatives)
)
TP_count, FP_count, FN_count

(24102, 8090, 12233)

In [6]:
# First summary statistic (the first AP line of the printed summary)
mAP = cocoEval.stats[0]

# Sum each count array over axis 1, then average the element-wise ratios.
# NOTE(review): an entry with tp + fp == 0 (or tp + fn == 0) yields NaN and makes the mean NaN.
tp, fp, fn = (
    counts.sum(1)
    for counts in (true_positives, false_positives, false_negatives)
)
precision = (tp / (tp + fp)).mean()
recall = (tp / (tp + fn)).mean()

# IoU distribution (matches whose IoU is missing or zero are left out)
ious = np.array([match['iou'] for match in matches if match['iou']])
iou_mean = ious.mean()
iou_hist = np.histogram(ious, range=(0.5, 1))

In [7]:
# Per-class metrics
tp = true_positives.sum(1).mean(1)
fp = false_positives.sum(1).mean(1)
fn = false_negatives.sum(1).mean(1)

pr = tp / (tp + fp)
rc = tp / (tp + fn)

pr_sort = np.argsort(pr)
rc_sort = np.argsort(rc)
pr_names = [cat_names[i] for i in pr_sort]
rc_names = [cat_names[i] for i in rc_sort]
pr_values = pr[pr_sort]
rc_values = rc[rc_sort]

In [34]:
# Split the matches by outcome; FP matches flagged as misclassified are excluded here
tp_matches = [match for match in matches if match['type'] == "TP"]
fp_matches = [
    match for match in matches
    if match['type'] == "FP" and not match['miss_cls']
]
fn_matches = [match for match in matches if match['type'] == "FN"]

In [43]:
# Per-class id lists: FP matches feed the precision view, FN matches the recall view.
# Each entry is an (image_id, gt_id, dt_id) tuple keyed by the class name.

if save_ids:
    pr_per_class_ids, rc_per_class_ids = {}, {}
    for per_class_ids, error_matches in (
        (pr_per_class_ids, fp_matches),
        (rc_per_class_ids, fn_matches),
    ):
        for m in error_matches:
            cat_id = m['category_id']
            cat_name = cocoGt.cats[cat_id]['name']
            per_class_ids.setdefault(cat_name, []).append(
                (m['image_id'], m['gt_id'], m['dt_id'])
            )

In [8]:
# Precision averaged over the leading axis; pr_curve shape: R x K.
# Indices 0 and 2 on the last two axes presumably select area "all" and the largest
# maxDets setting in pycocotools' [T, R, K, A, M] layout — confirm against the docs.
pr_curve = np.mean(cocoEval.eval['precision'][:, :, :, 0, 2], axis=0)

In [9]:
# Matches flagged as misclassified, and how many of them there are
miss_cls = list(filter(lambda match: match['miss_cls'], matches))
confuse_count = len(miss_cls)

In [26]:
# Confusion matrix
# Layout: confusion_matrix[predicted, ground_truth]. The extra last row / column (index -1)
# collects the cases without a counterpart: FN in the last row, FP in the last column.

catId2idx = {cat_id: idx for idx, cat_id in enumerate(cat_ids)}

n_slots = len(cat_ids) + 1
confusion_matrix = np.zeros((n_slots, n_slots))
confusion_matrix_ids = [[[] for _ in range(n_slots)] for _ in range(n_slots)]

# Each group of matches, paired with the rule mapping a match to its (row, column)
cell_rules = (
    # misclassified: predicted class vs. class of the matched ground-truth annotation
    (miss_cls, lambda m: (catId2idx[m['category_id']],
                          catId2idx[cocoGt.anns[m['gt_id']]['category_id']])),
    # true positives land on the diagonal
    (tp_matches, lambda m: (catId2idx[m['category_id']],) * 2),
    # false positives: predicted class, no ground truth
    (fp_matches, lambda m: (catId2idx[m['category_id']], -1)),
    # false negatives: ground-truth class, no prediction
    (fn_matches, lambda m: (-1, catId2idx[m['category_id']])),
)

for group, locate in cell_rules:
    for m in group:
        row, col = locate(m)
        confusion_matrix[row, col] += 1
        if save_ids:
            confusion_matrix_ids[row][col].append((m['image_id'], m['gt_id'], m['dt_id']))

In [None]:
# Show the confusion matrix with plotly (log scale).
# The matrix is filled as confusion_matrix[predicted, ground_truth], so the DataFrame rows
# (drawn on the heatmap's y axis) are predictions and the columns (x axis) are ground truth.
axis_names = cat_names + ['(background)']

# Empty cells would give log(0) = -inf plus a RuntimeWarning; store them as NaN instead.
with np.errstate(divide='ignore'):
    log_counts = np.log(confusion_matrix)
log_counts[np.isneginf(log_counts)] = np.nan

confusion_matrix_df = pd.DataFrame(log_counts, index=axis_names, columns=axis_names)
fig = px.imshow(
    confusion_matrix_df,
    labels=dict(x="Ground Truth", y="Predicted", color="log(count)"),
    title="Confusion Matrix (log scale)",
    width=800,
    height=800,
)
# remove margin
fig.update_layout(margin=dict(l=0, r=0, t=50, b=0))
fig.show()

In [97]:
# Summary dictionaries assembled from the values computed in the cells above.
# `precision` and `recall` are expected to be the scalar means here.
overall_metrics = {
    "mAP": mAP,
    "precision": precision,
    "recall": recall,
    "iou": iou_mean,
}

recall_metrics = {
    "recall": recall,
    "per_class": {"class_names": rc_names, "recall": rc_values},
    "TP": TP_count,
    "TP+FN": TP_count + FN_count,
}

precision_metrics = {
    "precision": precision,
    "per_class": {"class_names": pr_names, "precision": pr_values},
    "TP": TP_count,
    "TP+FP": TP_count + FP_count,
}

# pr_curve is exported as computed above; no re-assignment is needed.

# Localized detections whose class was either right (TP) or wrong (misclassified)
classified_total = TP_count + confuse_count

classification_metrics = {
    # NaN instead of ZeroDivisionError when nothing was classified at all
    "classification_accuracy": TP_count / classified_total if classified_total else float("nan"),
    "confuse_count": confuse_count,
    "total": classified_total,
    "confusion_matrix": confusion_matrix,
}

localization_metrics = {
    "iou": iou_mean,
    "iou_hist": iou_hist,
}


In [None]:
print("P/R vs IoU")

# Totals summed over the first two axes, leaving one value per entry of the last axis
# (plotted against cocoEval.params.iouThrs below).
# Dedicated *_per_iou names are used so the scalar `precision` / `recall` read by the
# metrics dictionaries are not overwritten with arrays when cells are re-run.
tp_per_iou = true_positives.sum((0, 1))
fp_per_iou = false_positives.sum((0, 1))
fn_per_iou = false_negatives.sum((0, 1))

precision_per_iou = tp_per_iou / (tp_per_iou + fp_per_iou)
recall_per_iou = tp_per_iou / (tp_per_iou + fn_per_iou)

iou_thrs = cocoEval.params.iouThrs

plt.plot(recall_per_iou, label="Recall")
plt.plot(precision_per_iou, label="Precision")
plt.title("Precision / Recall vs IoU")
plt.xlabel("IoU")
plt.ylabel("Precision / Recall")
# Two-decimal tick labels avoid float noise such as 0.6000000000000001
plt.xticks(range(len(iou_thrs)), [f"{thr:.2f}" for thr in iou_thrs])
plt.ylim(0, 1)
plt.grid()
# Without a legend the two curves cannot be told apart
plt.legend()

# Gallery

In [56]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from prediction_gallery import prediction_gallery
# Ids and names of the rare categories (selection criteria live in utils.get_rare_classes)
cat_ids_rare, cat_names_rare = utils.get_rare_classes(cocoGt)

# Build the gallery from all match records, the ground truth and the rare category ids
gallery = prediction_gallery(matches, cocoGt, cat_ids_rare)

  per_image[:, 3] /= (per_image[:, 0] + per_image[:, 1])
  per_image[:, 4] /= per_image[:, 0]
