In [None]:
import numpy as np
from matplotlib import pyplot as plt
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval, Params
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from collections import defaultdict
import random

from model_benchmark import metrics, utils

## Loading data

In [None]:
# Ground-truth annotations from cocoGt.json; cocoDt is built from cocoDt.json via
# COCO.loadRes (presumably the model's detections - confirm against the export step).
cocoGt = COCO("cocoGt.json")
cocoDt = cocoGt.loadRes("cocoDt.json")
# cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')

import pickle
# eval_data is the dict consumed by Metrics below (it reads the keys "true_positives",
# "false_positives", "false_negatives", "matches", "coco_stats", "coco_precision"
# and "coco_params").
# NOTE(review): pickle.load can execute arbitrary code while deserializing - only
# open an eval_data.pkl that comes from a trusted source.
with open('eval_data.pkl', 'rb') as f:
    eval_data = pickle.load(f)

# true_positives = eval_data["true_positives"]
# false_positives = eval_data["false_positives"]
# false_negatives = eval_data["false_negatives"]
# matches = eval_data["matches"]
# coco_stats = eval_data["coco_stats"]
# coco_precision = eval_data["coco_precision"]

# # basic calculation
# TP_count = int(true_positives[...,0].sum())
# FP_count = int(false_positives[...,0].sum())
# FN_count = int(false_negatives[...,0].sum())

# tp_matches = [m for m in matches if m['type'] == "TP"]
# fp_matches = [m for m in matches if m['type'] == "FP"]
# fn_matches = [m for m in matches if m['type'] == "FN"]
# confused_matches = [m for m in fp_matches if m['miss_cls']]
# fp_not_confused_matches = [m for m in fp_matches if not m['miss_cls']]
# ious = np.array([m['iou'] for m in matches if m['iou']])

# # info
# cat_ids = cocoGt.getCatIds()
# cat_names = [cocoGt.cats[cat_id]['name'] for cat_id in cat_ids]

In [None]:
import warnings

from sklearn.metrics import log_loss, brier_score_loss
from sklearn.calibration import calibration_curve
import model_benchmark.metrics as metrics


def get_outcomes_per_image(matches, cocoGt: COCO):
    """Tally TP / FP / FN matches for every image listed by ``cocoGt``.

    Args:
        matches: iterable of dicts, each providing an "image_id" and a "type" key.
        cocoGt: object whose ``getImgIds()`` returns the image ids to report on.

    Returns:
        A pair ``(img_ids, outcomes)``: the sorted image ids and a float array of
        shape ``(len(img_ids), 3)`` whose columns count the "TP", "FP" and "FN"
        matches of the image in the same row. Matches of any other type are not
        counted.

    Raises:
        KeyError: if a match refers to an image id that ``cocoGt`` does not list.
    """
    img_ids = sorted(cocoGt.getImgIds())
    row_of = dict(zip(img_ids, range(len(img_ids))))
    tallies = np.zeros((len(img_ids), 3), dtype=float)
    outcome_labels = ("TP", "FP", "FN")  # fixes the column order of the result

    for match in matches:
        row = row_of[match["image_id"]]
        for column, label in enumerate(outcome_labels):
            if match["type"] == label:
                tallies[row, column] += 1
                break

    return img_ids, tallies


class Metrics:
    """Summary metrics and per-image tables derived from precomputed evaluation data.

    Args:
        eval_data: dict providing the keys "true_positives", "false_positives",
            "false_negatives", "matches", "coco_stats", "coco_precision" and
            "coco_params".
        cocoGt: ground-truth COCO object; supplies category ids/names and the
            image ids / file names used by ``prediction_table``.
        cocoDt: detections COCO object; stored on the instance but not otherwise
            used by this class.
    """

    def __init__(self, eval_data: dict, cocoGt: COCO, cocoDt: COCO):
        # Keep the datasets on the instance so no method has to reach for
        # notebook-level globals of the same name.
        self.cocoGt = cocoGt
        self.cocoDt = cocoDt

        # eval_data
        self.true_positives = eval_data["true_positives"]
        self.false_positives = eval_data["false_positives"]
        self.false_negatives = eval_data["false_negatives"]
        self.matches = eval_data["matches"]
        self.coco_stats = eval_data["coco_stats"]
        self.coco_precision = eval_data["coco_precision"]
        self.coco_params: Params = eval_data["coco_params"]

        # Counts: totals over index 0 of the last axis of each outcome array
        # (presumably the lowest IoU threshold - TODO confirm against the producer).
        self.TP_count = int(self.true_positives[..., 0].sum())
        self.FP_count = int(self.false_positives[..., 0].sum())
        self.FN_count = int(self.false_negatives[..., 0].sum())

        # Matches, split by outcome type
        self.tp_matches = [m for m in self.matches if m['type'] == "TP"]
        self.fp_matches = [m for m in self.matches if m['type'] == "FP"]
        self.fn_matches = [m for m in self.matches if m['type'] == "FN"]
        # False positives flagged with a truthy 'miss_cls' vs. the remaining ones.
        self.confused_matches = [m for m in self.fp_matches if m['miss_cls']]
        self.fp_not_confused_matches = [m for m in self.fp_matches if not m['miss_cls']]
        # NOTE: the truthiness filter drops None and would also drop an IoU of exactly 0.
        self.ious = np.array([m['iou'] for m in self.matches if m['iou']])

        # Calibration
        self.calibration_metrics = CalibrationMetrics(
            self.tp_matches, self.fp_matches, self.fn_matches, self.coco_params.iouThrs
        )

        # info
        self.cat_ids = cocoGt.getCatIds()
        self.cat_names = [cocoGt.cats[cat_id]['name'] for cat_id in self.cat_ids]

    def base_metrics(self):
        """Return the headline metrics as a dict.

        Keys: "mAP", "precision", "recall", "iou", "classification_accuracy",
        "calibration_score".
        """
        # Collapse axis 1 of the outcome arrays before forming the ratios.
        tp = self.true_positives.sum(1)
        fp = self.false_positives.sum(1)
        fn = self.false_negatives.sum(1)
        confuse_count = len(self.confused_matches)

        # First entry of the COCO summary (AP@[.50:.95] in the standard
        # COCOeval.summarize() ordering - assumes coco_stats follows it).
        mAP = self.coco_stats[0]
        precision = np.mean(tp / (tp + fp))
        recall = np.mean(tp / (tp + fn))
        iou = np.mean(self.ious)
        classification_accuracy = self.TP_count / (self.TP_count + confuse_count)
        calibration_score = 1 - self.calibration_metrics.maximum_calibration_error()

        return {
            "mAP": mAP,
            "precision": precision,
            "recall": recall,
            "iou": iou,
            "classification_accuracy": classification_accuracy,
            "calibration_score": calibration_score
        }

    def precision_recall_classes(self):
        """Return per-class precision and recall, each sorted in ascending order.

        Returns:
            ``(pr_names, pr_values, rc_names, rc_values)`` where the names are
            taken from ``self.cat_names`` in the order of the sorted values.
        """
        # Per-class metrics: sum over axis 1, then average over the remaining
        # trailing axis (presumably IoU thresholds - TODO confirm).
        tp = self.true_positives.sum(1).mean(1)
        fp = self.false_positives.sum(1).mean(1)
        fn = self.false_negatives.sum(1).mean(1)

        pr = tp / (tp + fp)
        rc = tp / (tp + fn)

        pr_sort = np.argsort(pr)
        rc_sort = np.argsort(rc)
        pr_names = [self.cat_names[i] for i in pr_sort]
        rc_names = [self.cat_names[i] for i in rc_sort]
        pr_values = pr[pr_sort]
        rc_values = rc[rc_sort]
        return pr_names, pr_values, rc_names, rc_values

    def pr_curve(self):
        """Return ``coco_precision[:, :, :, 0, 2]`` averaged over its first axis.

        Assumes the standard COCOeval precision layout [T, R, K, A, M], in which
        case this is the precision per (recall threshold, class), averaged over
        IoU thresholds, for the first area range and the third maxDets setting.
        """
        pr_curve = self.coco_precision[:, :, :, 0, 2].mean(0)
        return pr_curve

    def prediction_table(self):
        """Build a per-image DataFrame of counts and precision / recall / F1.

        Columns: "image_name", "N gt", "N dt", "TP", "FP", "FN", "Precision",
        "Recall", "F1". Ratios with a zero denominator come out as NaN; the
        resulting numpy warnings are suppressed.
        """
        # Use the dataset this instance was built with rather than a
        # notebook-global `cocoGt`, which may be missing or rebound.
        cocoGt = self.cocoGt
        img_ids, outcomes_per_image = get_outcomes_per_image(self.matches, cocoGt)
        image_names = [cocoGt.imgs[img_id]["file_name"] for img_id in img_ids]
        # inference_time = ...
        n_gt = outcomes_per_image[:, 0] + outcomes_per_image[:, 2]  # TP + FN
        n_dt = outcomes_per_image[:, 0] + outcomes_per_image[:, 1]  # TP + FP
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            precision_per_image = outcomes_per_image[:, 0] / n_dt
            recall_per_image = outcomes_per_image[:, 0] / n_gt
            f1_per_image = 2 * precision_per_image * recall_per_image / (precision_per_image + recall_per_image)
        prediction_table = pd.DataFrame({
            "image_name": image_names,
            "N gt": n_gt,
            "N dt": n_dt,
            "TP": outcomes_per_image[:, 0],
            "FP": outcomes_per_image[:, 1],
            "FN": outcomes_per_image[:, 2],
            "Precision": precision_per_image,
            "Recall": recall_per_image,
            "F1": f1_per_image
            })
        return prediction_table
    
    
class CalibrationMetrics:
    """Score-ordered view of the predictions, used for score-threshold and calibration analysis.

    Args:
        tp_matches: match dicts of type "TP" (keys read: "type", "iou", "score",
            "category_id").
        fp_matches: match dicts of type "FP" (same keys).
        fn_matches: match dicts of type "FN" (only "category_id" is read).
        iouThrs: ascending IoU thresholds (must be sorted for ``np.searchsorted``).

    Attributes (all prediction arrays are sorted by descending score):
        scores: confidence score of every prediction (TP and FP).
        classes: category id of every prediction.
        iou_idxs: for every prediction, how many thresholds of ``iouThrs`` its IoU
            reaches; 0 for false positives and for matches without an IoU.
        per_class_count: category id -> number of TP plus FN matches.
        y_true: boolean array, True where the prediction reaches at least the
            lowest IoU threshold (i.e. ``iou_idxs > 0``).
    """

    def __init__(self, tp_matches, fp_matches, fn_matches, iouThrs):
        scores = []
        classes = []
        iou_idxs = []
        per_class_count = defaultdict(int)
        for m in tp_matches + fp_matches:
            if m['type'] == "TP" and m['iou'] is not None:
                # side="right" counts the thresholds with thr <= IoU, so an IoU
                # exactly equal to a threshold counts as reaching it.
                iou_idx = np.searchsorted(iouThrs, m['iou'], side="right")
                # A true positive must reach at least the lowest threshold.
                assert iou_idx > 0
                iou_idxs.append(iou_idx)
            else:
                iou_idxs.append(0)
            scores.append(m['score'])
            classes.append(m["category_id"])
            if m['type'] == "TP":
                per_class_count[m["category_id"]] += 1
        for m in fn_matches:
            per_class_count[m["category_id"]] += 1
        per_class_count = dict(per_class_count)

        # Order every per-prediction array by descending score.
        scores = np.array(scores)
        inds_sort = np.argsort(-scores)
        scores = scores[inds_sort]
        classes = np.array(classes)[inds_sort]
        iou_idxs = np.array(iou_idxs, dtype=int)[inds_sort]

        self.scores = scores
        self.classes = classes
        self.iou_idxs = iou_idxs
        self.per_class_count = per_class_count

        # Binary labels for calibration: correct at the lowest IoU threshold.
        # Compare against 0 explicitly - the previous code reused the loop
        # variable `iou_idx`, so the labels depended on whichever TP happened to
        # be processed last (and raised NameError when there were no TPs).
        self.y_true = self.iou_idxs > 0

    def scores_vs_metrics(self, iou_idx=0, cat_id=None):
        """Cumulative precision / recall / F1 as the score threshold is lowered.

        Args:
            iou_idx: a prediction counts as a true positive when its ``iou_idxs``
                entry is greater than this value.
            cat_id: when given, restrict the computation to this category id.

        Returns:
            dict with the keys "scores", "precision", "recall" and "f1"; the
            arrays are aligned and ordered by descending score. Entries where
            precision + recall is 0 yield NaN in "f1".
        """
        tps = self.iou_idxs > iou_idx
        if cat_id is not None:
            cls_mask = self.classes == cat_id
            tps = tps[cls_mask]
            scores = self.scores[cls_mask]
            n_positives = self.per_class_count[cat_id]
        else:
            scores = self.scores
            n_positives = sum(self.per_class_count.values())
        fps = ~tps

        tps_sum = tps.cumsum()
        fps_sum = fps.cumsum()

        # Precision, recall, f1
        precision = tps_sum / (tps_sum + fps_sum)
        recall = tps_sum / n_positives
        f1 = 2 * precision * recall / (precision + recall)
        return {
            "scores": scores,
            "precision": precision,
            "recall": recall,
            "f1": f1
        }

    def calibration_curve(self):
        """Return sklearn's 10-bin calibration curve of ``scores`` against ``y_true``.

        Inside this method the name ``calibration_curve`` resolves to the
        module-level sklearn function, not to this method.
        """
        true_probs, pred_probs = calibration_curve(self.y_true, self.scores, n_bins=10)
        return true_probs, pred_probs

    def maximum_calibration_error(self):
        """Return ``metrics.maximum_calibration_error`` for ``y_true`` / ``scores`` with 10 bins."""
        return metrics.maximum_calibration_error(self.y_true, self.scores, n_bins=10)

In [None]:
# Build the metrics helper from the loaded evaluation data and COCO objects;
# the bare last expression displays the headline-metrics dict as the cell output.
m = Metrics(eval_data, cocoGt, cocoDt)
m.base_metrics()