In [None]:
import numpy as np
from matplotlib import pyplot as plt
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import utils
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from collections import defaultdict
import random

In [None]:
# Class-aware bounding-box evaluation of the predictions against the ground truth.
# Both JSON paths are relative to the notebook's working directory.

# Load ground truth data
cocoGt=COCO("cocoGt.json")

# Load prediction data
cocoDt=cocoGt.loadRes("cocoDt.json")

# Initialize COCOeval object
cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')
# NOTE(review): useCats = 1 presumably restricts matching to same-category
# GT/detection pairs — confirm against the pycocotools Params documentation.
cocoEval.params.useCats = 1

# Evaluate on a subset of images (optional)
# cocoEval.params.imgIds = [5]  # Remove this line to evaluate on all images

# Run evaluation
# The three calls are stateful and must run in this order; the results are
# read back later from cocoEval.eval and cocoEval.stats.
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()

In [None]:
# Second evaluation over the same ground truth and predictions, this time with
# useCats = 0. Its per-image results are combined with the class-aware run when
# building `matches`, which is where misclassified detections are identified.
# NOTE(review): useCats = 0 presumably ignores category labels when matching —
# confirm against the pycocotools Params documentation.

# Initialize COCOeval object
cocoEval_cls = COCOeval(cocoGt, cocoDt, 'bbox')
cocoEval_cls.params.useCats = 0

# Run evaluation
cocoEval_cls.evaluate()
cocoEval_cls.accumulate()
cocoEval_cls.summarize()

In [None]:
# increases score:
# prediction with high confidence but incorrect (FP)
# TP with low confidence, low IoU
# edge cases: small objects, big objects, low IoU, low confidence
# many predictions in one image
# many FP predictions in one image
# many FN in one image
# misclassified objects with high/low confidence
# misclassified prediction, where the class is the most often misclassified one

# finding insightful FN:
# many FN in one image
# FN with small area
# FN on the edge
# FN with tall/long (elongated) object

# finding insightful TP:
# TP with low confidence
# TP with low IoU
# TP with small area

In [None]:
# per-image stats:
# prediction count (TP+FP)
# FP count
# FN count
# TP + low conf + low IoU (avg on image)
# FP + high conf (avg on image)
# many rare classes (avg)

# per-prediction stats:
# TP + low conf
# TP + low IoU
# FP + high conf
# rare class
# TP, but incorrect class + high conf
# FN + small area (?)

# image_id / ann_id

In [None]:
# When True, the cells below also collect (image_id, gt_id, dt_id) triples for
# every metric bucket, so individual samples behind a number can be looked up.
save_ids = True

In [None]:
# Category ids used by the evaluation and their human-readable names (same order).
cat_ids = cocoEval.params.catIds
cat_names = [cocoGt.cats[cat_id]['name'] for cat_id in cat_ids]

# NOTE(review): judging by how later cells reduce these arrays, they appear to be
# indexed as (category, image, IoU threshold) — confirm against utils.get_counts.
true_positives, false_positives, false_negatives = utils.get_counts(cocoEval)

eval_img_dict = utils.get_eval_img_dict(cocoEval)
eval_img_dict_cls = utils.get_eval_img_dict(cocoEval_cls)
# `matches` is consumed below as a list of dicts with the keys 'type'
# ("TP" / "FP" / "FN"), 'iou', 'score', 'category_id', 'image_id', 'gt_id',
# 'dt_id' and 'miss_cls'.
# NOTE(review): iou_t=0 presumably selects the first IoU threshold — confirm.
matches = utils.get_matches(eval_img_dict, eval_img_dict_cls, cocoEval_cls, iou_t=0)

# Quick sanity check: number of matches vs. number of detections and GT annotations.
len(matches), len(cocoDt.anns), len(cocoGt.anns)

In [None]:
# Dataset-wide TP / FP / FN totals, taken at index 0 of the last axis of each
# count array and summed over everything else.
TP_count, FP_count, FN_count = (
    int(counts[..., 0].sum())
    for counts in (true_positives, false_positives, false_negatives)
)
TP_count, FP_count, FN_count

In [None]:
def _matches_of_type(kind):
    """Return the entries of `matches` whose 'type' equals `kind`, in original order."""
    return [entry for entry in matches if entry['type'] == kind]


# Split the matches by outcome.
tp_matches = _matches_of_type("TP")
fp_matches = _matches_of_type("FP")
fn_matches = _matches_of_type("FN")

# False positives are further split on the 'miss_cls' flag.
confused_matches = list(filter(lambda entry: entry['miss_cls'], fp_matches))
fp_not_confused_matches = list(filter(lambda entry: not entry['miss_cls'], fp_matches))

In [None]:
# Headline number reported by COCOeval.summarize() (first summary statistic).
mAP = cocoEval.stats[0]

# Reduce the count arrays over axis 1 before forming the ratios.
tp = true_positives.sum(1)
fp = false_positives.sum(1)
fn = false_negatives.sum(1)

# An entry with no predictions (tp + fp == 0) or no ground truth (tp + fn == 0)
# produces 0/0 = NaN. A plain np.mean would then turn the whole metric into NaN,
# so undefined entries are skipped instead. When every entry is defined the
# result is identical to np.mean.
with np.errstate(divide='ignore', invalid='ignore'):
    precision = np.nanmean(tp / (tp + fp))
    recall = np.nanmean(tp / (tp + fn))

# IoU distribution
# Only matches with a truthy IoU contribute (None and 0 are skipped).
ious = np.array([m['iou'] for m in matches if m['iou']])
iou_hist = np.histogram(ious, range=(0.5, 1))
# Avoid the "mean of empty slice" warning when no match carries an IoU.
iou_mean = np.mean(ious) if ious.size else np.nan

In [None]:
# Per-class metrics
# Counts are summed over axis 1 and then averaged over the remaining last axis,
# leaving one value per class.
tp, fp, fn = (
    counts.sum(1).mean(1)
    for counts in (true_positives, false_positives, false_negatives)
)

pr = tp / (tp + fp)
rc = tp / (tp + fn)

# Rank the classes by ascending precision and by ascending recall.
pr_sort, rc_sort = np.argsort(pr), np.argsort(rc)
pr_names = [cat_names[idx] for idx in pr_sort]
rc_names = [cat_names[idx] for idx in rc_sort]
pr_values = np.take(pr, pr_sort)
rc_values = np.take(rc, rc_sort)

In [None]:
# Samples behind the per-class numbers:
#   precision -> the false positives of each class
#   recall    -> the false negatives of each class

if save_ids:
    pr_per_class_ids = dict((name, []) for name in cat_names)
    rc_per_class_ids = dict((name, []) for name in cat_names)

    for ids_by_class, source_matches in (
        (pr_per_class_ids, fp_matches),
        (rc_per_class_ids, fn_matches),
    ):
        for entry in source_matches:
            class_name = cocoGt.cats[entry['category_id']]['name']
            ids_by_class[class_name].append((entry['image_id'], entry['gt_id'], entry['dt_id']))

In [None]:
# Precision-recall curve per class, averaged over the first axis of the
# precision array after fixing the last two axes at indices 0 and 2.
# NOTE(review): this assumes the pycocotools layout [T, R, K, A, M] with default
# params, where A index 0 would be the 'all' area range and M index 2 the largest
# maxDets setting — confirm against the COCOeval documentation.
# shape: R x K
pr_curve_np = cocoEval.eval['precision'][:,:,:,0,2].mean(0)

In [None]:
# Classification accuracy over localized objects: correctly classified
# detections (TP) relative to TP plus detections flagged as misclassified.
confuse_count = len(confused_matches)
# With no TP and no confusions the ratio is undefined; report 0.0 instead of
# raising ZeroDivisionError.
classification_accuracy = (
    TP_count / (TP_count + confuse_count) if (TP_count + confuse_count) > 0 else 0.0
)

In [None]:
# Confusion matrix
#
# Layout: row = predicted class, column = ground-truth class. One extra trailing
# row/column is appended: the last column receives false positives that are not
# class confusions, the last row receives false negatives.

catId2idx = dict(zip(cat_ids, range(len(cat_ids))))
idx2catId = {idx: cid for cid, idx in catId2idx.items()}

_cm_size = len(cat_ids) + 1
confusion_matrix = np.zeros((_cm_size, _cm_size), dtype=int)
confusion_matrix_ids = [[[] for _ in range(_cm_size)] for _ in range(_cm_size)]


def _record_in_confusion_matrix(row, col, entry):
    """Count `entry` in cell (row, col) and, when save_ids is set, remember its ids."""
    confusion_matrix[row, col] += 1
    if save_ids:
        confusion_matrix_ids[row][col].append((entry['image_id'], entry['gt_id'], entry['dt_id']))


# Misclassified detections: predicted class vs. class of the matched GT annotation.
for entry in confused_matches:
    pred_idx = catId2idx[entry['category_id']]
    gt_idx = catId2idx[cocoGt.anns[entry['gt_id']]['category_id']]
    _record_in_confusion_matrix(pred_idx, gt_idx, entry)

# True positives land on the diagonal.
for entry in tp_matches:
    diag_idx = catId2idx[entry['category_id']]
    _record_in_confusion_matrix(diag_idx, diag_idx, entry)

# Remaining false positives go to the last column.
for entry in fp_not_confused_matches:
    _record_in_confusion_matrix(catId2idx[entry['category_id']], -1, entry)

# False negatives go to the last row.
for entry in fn_matches:
    _record_in_confusion_matrix(-1, catId2idx[entry['category_id']], entry)

In [None]:
# Frequently confused class pairs
# Keep at most this many pairs (before zero-count pairs are dropped).
topk_pairs = 20
# Drop the trailing extra row/column so only class-vs-class cells remain.
cm = confusion_matrix[:-1,:-1]
# Fold the matrix so each unordered pair (A, B) is counted once: the strict
# upper triangle is transposed onto the strict lower triangle; the diagonal
# is discarded.
cm_l = np.tril(cm, -1)
cm_u = np.triu(cm, 1)
cm = cm_l + cm_u.T
cm_flat = cm.flatten()
# Flat indices of the largest folded counts, descending.
inds_sort = np.argsort(-cm_flat)[:topk_pairs]
inds_sort = inds_sort[cm_flat[inds_sort] > 0]  # remove zeros
# Convert the flat indices back to (row indices, column indices).
inds_sort = np.unravel_index(inds_sort, cm.shape)

# probability of confusion: (predicted A, actually B + predicted B, actually A) / (predicted A + predicted B)
confused_counts = cm[inds_sort]
# Row sums of the full matrix, i.e. totals per row index including the last column.
dt_total = confusion_matrix.sum(1)
dt_pair_sum = np.array([dt_total[i] + dt_total[j] for i, j in zip(*inds_sort)])
confused_prob = confused_counts / dt_pair_sum
# Re-rank the selected pairs by descending confusion probability.
inds_sort2 = np.argsort(-confused_prob)

# One (i, j) index pair per row, in the final ranking order.
confused_idxs = np.array(inds_sort).T[inds_sort2]
confused_name_pairs = [(cat_names[i], cat_names[j]) for i, j in confused_idxs]
confused_counts = confused_counts[inds_sort2]
confused_prob = confused_prob[inds_sort2]
confused_catIds = [(idx2catId[i], idx2catId[j]) for i, j in confused_idxs]

In [None]:
import random

# Fixed seed so the shuffled sample order is reproducible between runs.
random.seed(0)
if save_ids:
    # For every frequently confused pair, gather the ids recorded in both
    # directions of the confusion matrix and shuffle them.
    frequent_confusion_ids = {}
    for row_idx, col_idx in confused_idxs:
        pair_ids = [
            *confusion_matrix_ids[row_idx][col_idx],
            *confusion_matrix_ids[col_idx][row_idx],
        ]
        random.shuffle(pair_ids)
        frequent_confusion_ids[(idx2catId[row_idx], idx2catId[col_idx])] = pair_ids

## Per-class

In [None]:
# Per-class AP
# Slice the precision array at indices 0 and 2 of its last two axes, then
# average over the first two remaining axes, leaving one value per class.
# NOTE(review): assumes the pycocotools layout [T, R, K, A, M]; pycocotools
# presumably leaves -1 for categories without ground truth, which would show up
# here as an AP of -1 — confirm.
pr = cocoEval.eval['precision'][:, :, :, 0, 2]
ap_per_class = pr.mean(axis=(0, 1))

In [None]:
# Per-class Counts
# Index into the last axis of the count arrays.
iou_thres = 0

tp, fp, fn = (
    counts.sum(1)[:, iou_thres]
    for counts in (true_positives, false_positives, false_negatives)
)

# Normalisation base: TP + FN per class.
support = tp + fn

# Classes are ordered by F1 = 2*TP / (2*TP + FP + FN), ascending.
# Other orderings that were considered: tp_rel - fp_rel - fn_rel, ap_per_class.
sort_scores = 2 * tp / (2 * tp + fp + fn)
sort_indices = np.argsort(sort_scores)

K = len(cat_names)
cat_names_sorted = [cat_names[idx] for idx in sort_indices]

# Relative counts, already arranged in the sorted class order.
tp_rel, fn_rel, fp_rel = (
    (counts / support)[sort_indices]
    for counts in (tp, fn, fp)
)

In [None]:
if save_ids:
    # For each class, the ids of its TP / FN / FP matches.
    per_class_counts_ids = {cat_name: {"TP": [], "FN": [], "FP": []} for cat_name in cat_names_sorted}

    for outcome, outcome_matches in (
        ("TP", tp_matches),
        ("FN", fn_matches),
        ("FP", fp_matches),
    ):
        for entry in outcome_matches:
            class_name = cocoGt.cats[entry['category_id']]['name']
            per_class_counts_ids[class_name][outcome].append(
                (entry['image_id'], entry['gt_id'], entry['dt_id'])
            )

## Confidence

In [None]:
def calculate_scores_vs_metrics(tp_matches, fp_matches, fn_matches, iouThrs):
    """Collect per-prediction data needed to plot metrics against confidence.

    Args:
        tp_matches: match dicts of type "TP"; each provides 'type', 'iou',
            'score' and 'category_id'.
        fp_matches: match dicts of type "FP" with the same keys.
        fn_matches: match dicts of type "FN"; only 'category_id' is read.
        iouThrs: ascending sequence of IoU thresholds.

    Returns:
        A dict with, in this key order:
        - "scores": prediction scores sorted in descending order;
        - "classes": category id of each prediction, in the same order;
        - "iou_idxs": for each prediction, the number of thresholds in
          `iouThrs` that its IoU reaches (0 for false positives and for
          matches without an IoU), so a prediction is a true positive at
          threshold index ``t`` exactly when ``iou_idxs > t``;
        - "per_class_count": dict mapping category id to TP + FN count.

    Raises:
        AssertionError: if a TP match has an IoU below ``iouThrs[0]``.
    """
    scores = []
    classes = []
    iou_idxs = []
    per_class_count = defaultdict(int)
    for m in tp_matches + fp_matches:
        if m['type'] == "TP" and m['iou'] is not None:
            # side='right' counts thresholds with thr <= iou, so an IoU that is
            # exactly equal to a threshold reaches it. The default side='left'
            # would put such a match one slot too low, and a TP with
            # IoU == iouThrs[0] would trip the assertion below.
            iou_idx = np.searchsorted(iouThrs, m['iou'], side='right')
            assert iou_idx > 0, "TP match has an IoU below the lowest IoU threshold"
            iou_idxs.append(iou_idx)
        else:
            iou_idxs.append(0)
        scores.append(m['score'])
        classes.append(m["category_id"])
        if m['type'] == "TP":
            per_class_count[m["category_id"]] += 1
    for m in fn_matches:
        per_class_count[m["category_id"]] += 1
    per_class_count = dict(per_class_count)

    # Sort everything by descending score so cumulative sums sweep the
    # confidence threshold from high to low.
    scores = np.array(scores)
    inds_sort = np.argsort(-scores)
    scores = scores[inds_sort]
    classes = np.array(classes)[inds_sort]
    iou_idxs = np.array(iou_idxs)[inds_sort]

    return {
        "scores": scores,
        "classes": classes,
        "iou_idxs": iou_idxs,
        "per_class_count": per_class_count
    }


class ScoresVsMetrics:
    """Precision / recall / F1 as a function of the confidence threshold.

    Holds the arrays produced by ``calculate_scores_vs_metrics``: predictions
    sorted by descending score, their category ids, their IoU slot indices and
    the per-category positive counts.
    """

    def __init__(self, scores, classes, iou_idxs, per_class_count):
        self.scores = scores
        self.classes = classes
        self.iou_idxs = iou_idxs
        self.per_class_count = per_class_count

    def query(self, iou_idx=0, cat_id=None):
        """Compute cumulative metrics over the score-sorted predictions.

        Args:
            iou_idx: a prediction counts as a true positive when its stored
                IoU slot index is strictly greater than this value.
            cat_id: restrict to one category id; ``None`` uses all predictions.

        Returns:
            ``(scores, precision, recall, f1)`` arrays of equal length. Entry
            ``k`` describes the metrics when the ``k + 1`` highest-scoring
            predictions are kept. Recall is 0 when the selection has no
            positives, and F1 is 0 wherever precision + recall is 0.
        """
        tps = self.iou_idxs > iou_idx
        if cat_id is not None:
            cls_mask = self.classes == cat_id
            tps = tps[cls_mask]
            scores = self.scores[cls_mask]
            # A category can have predictions but no TP/FN entry at all; treat
            # it as having zero positives instead of raising KeyError.
            n_positives = self.per_class_count.get(cat_id, 0)
        else:
            scores = self.scores
            n_positives = sum(self.per_class_count.values())
        fps = ~tps

        tps_sum = tps.cumsum()
        fps_sum = fps.cumsum()

        # Precision, recall, f1
        # tps_sum + fps_sum is 1, 2, 3, ... so this division is always defined.
        precision = tps_sum / (tps_sum + fps_sum)
        if n_positives > 0:
            recall = tps_sum / n_positives
        else:
            recall = np.zeros(len(tps_sum), dtype=float)
        # Until the first true positive both precision and recall are 0, which
        # made the plain formula evaluate 0/0 = NaN; report 0 there instead.
        pr_sum = precision + recall
        f1 = np.divide(
            2 * precision * recall,
            pr_sum,
            out=np.zeros_like(pr_sum, dtype=float),
            where=pr_sum > 0,
        )
        return scores, precision, recall, f1

In [None]:
scores_vs_metrics_result = calculate_scores_vs_metrics(tp_matches, fp_matches, fn_matches, cocoEval.params.iouThrs)

# Settings shared by the confidence cells below.
iou_idx = 0
cat_id = None

# Pull the four arrays out of the result by name.
scores, classes, iou_idxs, per_class_count = (
    scores_vs_metrics_result[key]
    for key in ("scores", "classes", "iou_idxs", "per_class_count")
)

In [None]:
# Hist + KDE
from scipy.stats import gaussian_kde

# Split the prediction scores into true positives and everything else.
tps = iou_idxs > iou_idx
scores_tp, scores_fp = scores[tps], scores[~tps]

# Evaluate a Gaussian KDE of each group on a common grid over [0, 1).
x = np.linspace(0., 1, 500, endpoint=False)
kde_tp, kde_fp = gaussian_kde(scores_tp), gaussian_kde(scores_fp)
density_tp, density_fp = kde_tp(x), kde_fp(x)

# The two densities are not rescaled by group size. To make them proportional
# to the number of data points, multiply density_tp by len(scores_tp) and
# density_fp by len(scores_fp).

## Calibration Curve

In [None]:
# Binary target for calibration: True where the prediction's IoU slot index
# exceeds iou_idx, i.e. the prediction counts as a true positive.
y_true = iou_idxs > iou_idx
# NOTE(review): y_pred is not read by any cell shown in this notebook, and with
# iou_idx = 0 and non-negative slot indices it is True everywhere — confirm
# whether it is still needed.
y_pred = iou_idxs >= iou_idx

In [None]:
# log_loss is imported here but not used in this cell.
from sklearn.metrics import log_loss, brier_score_loss
from sklearn.calibration import calibration_curve
# Reliability curve over 10 bins: the first array is unpacked as the observed
# fraction of positives per bin and the second as the mean predicted score per bin.
true_probs, pred_probs = calibration_curve(y_true, scores, n_bins=10)
# Stored as 1 - Brier loss, so that a larger value means better calibration.
brier_score = 1 - brier_score_loss(y_true, scores)

## Per-image

In [None]:
# Per-image counts: sum over axis 0, then take index `t` of the last axis.
t = 0
tp, fp, fn = (
    counts.sum(0)[:, t]
    for counts in (true_positives, false_positives, false_negatives)
)

In [None]:
# Per-image heatmap: how many images have a given (TP count, FP + FN count).
# Rows (first histogram axis) are TP counts, columns are error counts; bins are
# one unit wide and centred on the integer counts.
errors_per_image = fp + fn
y_edges = np.arange(min(tp) - 0.5, max(tp) + 1.5, 1)
x_edges = np.arange(min(errors_per_image) - 0.5, max(errors_per_image) + 1.5, 1)
heatmap, y_edges, x_edges = np.histogram2d(tp, errors_per_image, bins=(y_edges, x_edges))

z_max = np.max(heatmap)
gamma = 0.95

# Reversed red-yellow-green map: an error ratio of 0 is green, 1 is red.
# pyplot.get_cmap is used because matplotlib.cm.get_cmap was removed in
# Matplotlib 3.9; it also avoids rebinding the name `cm`, which an earlier
# cell uses for the folded confusion matrix.
colormap_name = 'RdYlGn_r'
colormap = plt.get_cmap(colormap_name)

# Brightness is log-scaled by the bin count. log(z_max) is 0 when the fullest
# bin holds a single image; use 1.0 then so the ratio stays defined (every
# non-empty bin ends up at the 0.2 brightness floor, as before).
log_z_max = np.log(z_max) if z_max > 1 else 1.0

colors = np.zeros((heatmap.shape[0], heatmap.shape[1], 3))  # for RGB channels

for i in range(heatmap.shape[0]):
    tp_val = y_edges[i] + 0.5  # TP count represented by row i
    for j in range(heatmap.shape[1]):
        error_val = x_edges[j] + 0.5  # FP + FN count represented by column j
        intensity = heatmap[i, j]

        # Share of errors among all outcomes of such an image.
        total = tp_val + error_val
        error_ratio = error_val / total if total > 0 else 0

        color = np.array(colormap(error_ratio)[:3])
        # Adjust the color intensity based on the heatmap value
        if intensity > 0:
            brightness = max(0.2, np.log(intensity) / log_z_max)
            colors[i, j, :] = (color * brightness) ** gamma
        else:
            # Empty bins keep a faint tint of their hue.
            colors[i, j, :] = color * 0.12

# Plot the colored heatmap
fig = px.imshow(colors, labels=dict(x="Count of Errors", y="Count of True Predictions"), title="TP vs FP+FN", text_auto=True, origin='lower',
                width=800, height=800)

# Adding text to each pixel
for i in range(heatmap.shape[0]):
    for j in range(heatmap.shape[1]):
        fig.add_annotation(
            x=j,
            y=i,
            text=str(int(heatmap[i, j])),
            showarrow=False,
            font=dict(color="#ddd", size=10)
        )

# Remove margin
# fig.update_layout(margin=dict(l=0, r=0, t=0, b=0))

# fig

In [None]:
# Bundle everything computed above into one dictionary per report section.
# These dicts only reference values from earlier cells; nothing is recomputed.

# Headline numbers.
overall_metrics = {
    "mAP": mAP,
    "precision": precision,
    "recall": recall,
    "iou": iou_mean,
    "classification_accuracy": classification_accuracy,
    "reliability": brier_score,
}

# Recall section: overall value, per-class values sorted ascending, raw totals.
recall_metrics = {
    "recall": recall,
    "per_class": {"class_names": rc_names, "recall": rc_values},
    "TP": TP_count,
    "TP+FN": TP_count + FN_count,
}

# Precision section: overall value, per-class values sorted ascending, raw totals.
precision_metrics = {
    "precision": precision,
    "per_class": {"class_names": pr_names, "precision": pr_values},
    "TP": TP_count,
    "TP+FP": TP_count + FP_count,
}

pr_curve = pr_curve_np

# Classification section: accuracy, confusion matrix and the top confused pairs.
classification_metrics = {
    "classification_accuracy": classification_accuracy,
    "confuse_count": confuse_count,
    "total": TP_count + confuse_count,
    "confusion_matrix": confusion_matrix,
    "frequently_confused": {"class_name_pairs": confused_name_pairs, "counts": confused_counts, "prob": confused_prob},
}

# Localization section: mean IoU and the (counts, bin_edges) histogram tuple.
localization_metrics = {
    "iou": iou_mean,
    "iou_hist": iou_hist,
}

# Per-class section: AP per class and relative TP / FN / FP counts in sorted class order.
per_class_metrics = {
    "AP": ap_per_class,
    "per_class_counts": {"class_names": cat_names_sorted, "TP": tp_rel, "FN": fn_rel, "FP": fp_rel},
}

# Confidence section.
confidence_metrics = {
    "confidence_vs_metrics": scores_vs_metrics_result,
    "confidence_histogram": "note: can be made with confidence_vs_metrics",
    "brier_score": brier_score,
    "calibration_curve": {"true_probs": true_probs, "pred_probs": pred_probs},
}

# Sample ids behind the metrics, mirroring the section names used above.
# NOTE(review): "frequently_confused" is keyed by (cat_id, cat_id) tuples, which
# json.dump would reject as dict keys — confirm how this dict is serialized.
if save_ids:
    json_ids = {
        "recall_metrics": {
            "per_class": rc_per_class_ids,
        },
        "precision_metrics": {
            "per_class": pr_per_class_ids,
        },
        "classification_metrics": {
            "confusion_matrix": confusion_matrix_ids,
            "frequently_confused": frequent_confusion_ids,
        },
        "per_class_metrics": {
            "per_class_counts": per_class_counts_ids,
        },
    }

In [None]:
print("P/R vs IoU")
# Precision / recall per IoU threshold, with the counts summed over the first
# two axes. Dedicated names are used on purpose: this cell previously overwrote
# the scalar `precision` / `recall` and the per-image `tp` / `fp` / `fn`, so
# re-running an earlier cell afterwards silently picked up the wrong values.
tp_per_iou = true_positives.sum((0,1))
fp_per_iou = false_positives.sum((0,1))
fn_per_iou = false_negatives.sum((0,1))

precision_per_iou = tp_per_iou / (tp_per_iou + fp_per_iou)
recall_per_iou = tp_per_iou / (tp_per_iou + fn_per_iou)

iou_thrs = cocoEval.params.iouThrs

plt.plot(recall_per_iou, label="Recall")
plt.plot(precision_per_iou, label="Precision")
plt.legend()
plt.title("Precision / Recall vs IoU")
plt.xlabel("IoU")
plt.ylabel("Precision / Recall")
# Format the tick labels so floating-point noise in the thresholds
# (e.g. 0.6000000000000001) does not leak into the axis.
plt.xticks(range(len(iou_thrs)), [f"{thr:.2f}" for thr in iou_thrs])
plt.ylim(0, 1)
plt.grid()
plt.show()

# Gallery

In [None]:
# Build the prediction gallery from the matches, the ground truth and the ids
# of the rare classes. Both helpers are project-local; their behavior is not
# visible from this notebook.
from prediction_gallery import prediction_gallery
# NOTE(review): how "rare" is defined is decided inside utils.get_rare_classes — confirm.
cat_ids_rare, cat_names_rare = utils.get_rare_classes(cocoGt)

gallery = prediction_gallery(matches, cocoGt, cat_ids_rare)

In [None]:
# show confusion matrix with plotly
# `confusion_matrix` is filled as [predicted, ground truth], so the DataFrame
# rows (y axis) are predictions and the columns (x axis) are ground truth.
# The extra last row/column is labelled '(background)'.
axis_names = cat_names + ['(background)']

# Log-scale the counts. Empty cells stay NaN (rendered blank) instead of
# becoming -inf through log(0), which also avoids the divide-by-zero warning.
log_counts = np.full(confusion_matrix.shape, np.nan)
np.log(confusion_matrix, out=log_counts, where=confusion_matrix > 0)

confusion_matrix_df = pd.DataFrame(log_counts, index=axis_names, columns=axis_names)
fig = px.imshow(confusion_matrix_df, labels=dict(x="Ground Truth", y="Predicted", color="log(Count)"), title="Confusion Matrix (log scale)",
                width=800, height=800)
# remove margin
fig.update_layout(margin=dict(l=0, r=0, t=50, b=0))
fig.show()

In [None]:
# Bar chart of the confusion probability of each frequently confused class pair.
x_labels = [f"{first} - {second}" for first, second in confused_name_pairs]
fig = go.Figure(data=[go.Bar(x=x_labels, y=confused_prob)])
fig.update_layout(
    title="Frequently confused class pairs",
    xaxis_title="Class pair",
    yaxis_title="Probability",
)
fig.show()

In [None]:
# Per-class Average Precision (AP)
# Radar chart with one spoke per category; the radial axis is fixed to [0, 1].
fig = px.scatter_polar(r=ap_per_class, theta=cat_names, title="Per-class Average Precision (AP)",
                       labels=dict(r="Average Precision", theta="Category"),
                       width=600, height=600,
                       range_r=[0, 1])
# fill points
# update_traces returns the figure, so as the last expression of the cell it
# also displays the chart.
fig.update_traces(fill='toself')

In [None]:
# Stacked per-class counts
# Long-format table: one row per (outcome type, category), holding the relative
# TP / FN / FP value of that category in sorted class order.
_count_types = ("TP", "FN", "FP")
data = dict(
    count=np.concatenate((tp_rel, fn_rel, fp_rel)),
    type=[kind for kind in _count_types for _ in range(K)],
    category=[name for _ in _count_types for name in cat_names_sorted],
)

df = pd.DataFrame(data)

color_map = dict(TP='#1fb466', FN='#dd3f3f', FP='#d5a5a5')

fig = px.bar(
    df,
    x="category",
    y="count",
    color="type",
    title="Per-class Counts",
    labels={'count': 'Total Count'},
    text='count',
    color_discrete_map=color_map,
)

fig.show()

In [None]:
# Precision, recall and F1 plotted against the confidence score.
s = ScoresVsMetrics(scores, classes, iou_idxs, per_class_count)
scores2, precision2, recall2, f1_2 = s.query(iou_idx, cat_id)

fig = go.Figure()
for curve_name, curve_values in (
    ("Precision", precision2),
    ("Recall", recall2),
    ("F1", f1_2),
):
    fig.add_trace(go.Scatter(x=scores2, y=curve_values, mode='lines', name=curve_name))

fig.update_layout(
    title="Precision, Recall, F1 vs Confidence Score",
    xaxis_title="Confidence Score",
    yaxis_title="Value",
    width=800,
    height=500,
)
fig.show()

In [None]:
# KDE curves of the TP and FP score distributions ...
for density, group_label in ((density_tp, "TP"), (density_fp, "FP")):
    plt.plot(x, density, label=group_label)

# ... overlaid with density-normalised 50-bin histograms of the same scores.
for group_scores in (scores_tp, scores_fp):
    y_hist, x_hist = np.histogram(group_scores, bins=50, density=True)
    dx = x_hist[1] - x_hist[0]
    plt.bar(x_hist[:-1]+dx/2, y_hist, width=dx, alpha=0.5)

plt.legend()
plt.xlabel("Score")
plt.ylabel("Density")
plt.show()

# Raw (unnormalised) histograms of the same two groups.
for group_scores, group_label in ((scores_tp, "TP"), (scores_fp, "FP")):
    plt.hist(group_scores, bins=50, alpha=0.5, label=group_label);

In [None]:
# Calibration curve
# Mean predicted score per bin (x) against the observed fraction of true
# positives per bin (y). The dashed diagonal marks perfect calibration.
plt.figure(figsize=(8, 6))
plt.plot(pred_probs, true_probs, marker='o', linewidth=1, label='Calibration plot (Model)')
plt.plot([0, 1], [0, 1], linestyle='--', label='Perfectly calibrated')
plt.xlabel('Confidence Score')
plt.ylabel('Fraction of True Positives')
plt.title('Calibration Curve')
plt.legend()
plt.grid(True)
plt.show()