In [None]:
import numpy as np
from matplotlib import pyplot as plt
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval, Params
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from collections import defaultdict
import random

from model_benchmark import metrics, utils
from model_benchmark.metric_provider import MetricProvider, METRIC_NAMES
from model_benchmark import metric_provider

## Loading data

In [None]:
# Input locations used by the loading cell below: the pickled evaluation
# results, the detections file, and the COCO ground-truth annotations.
eval_data_path = "eval_data_conf-0.01.pkl"
cocoDt_path = "data/model-benchmark/COCO 2017 val (YOLOv8-L, conf-0.01)/cocoDt.json"
cocoGt_path = "cocoGt_remap.json"

In [None]:
# Index the ground-truth annotations, then load the detections through the
# ground-truth object so both share the same dataset.
cocoGt = COCO(cocoGt_path)
cocoDt = cocoGt.loadRes(cocoDt_path)
# cocoEval = COCOeval(cocoGt, cocoDt, 'bbox')

import pickle

# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# point eval_data_path at a pickle from a trusted source.
with open(eval_data_path, 'rb') as eval_file:
    eval_data = pickle.load(eval_file)

In [None]:
from importlib import reload

# Reload metric_provider so that edits made to the module are picked up
# without restarting the kernel.
reload(metric_provider)

# Provider built from all matches stored in eval_data (no confidence
# filtering is applied in this cell).
m_full = metric_provider.MetricProvider(
    eval_data['matches'],
    eval_data['coco_metrics'],
    eval_data['params'],
    cocoGt,
    cocoDt,
)
m_full.base_metrics()

## F1-optimal conf

In [None]:
# Get the confidence score profile from the full provider and ask it for the
# confidence value with the best F1.
score_profile, f1s = m_full.confidence_score_profile()
# score_profile = m_full.confidence_score_profile_v0()
f1_optimal_conf, best_f1 = m_full.get_f1_optimal_conf(score_profile)
print(f"F1-Optimal confidence: {f1_optimal_conf:.4f} with f1: {best_f1:.4f}")

# Second provider `m`: same inputs, but with the matches passed through
# filter_by_conf at the F1-optimal confidence.
matches_thresholded = metric_provider.filter_by_conf(eval_data['matches'], f1_optimal_conf)
m = metric_provider.MetricProvider(
    matches_thresholded,
    eval_data['coco_metrics'],
    eval_data['params'],
    cocoGt,
    cocoDt,
)
m.base_metrics()

## Overview

In [None]:
# Overall Metrics, drawn as a radar chart
base_metrics = m.base_metrics()
metric_values = list(base_metrics.values())
metric_labels = [metric_provider.METRIC_NAMES[key] for key in base_metrics.keys()]

# The first point is appended again at the end so the outline closes.
radar_trace = go.Scatterpolar(
    r=metric_values + [metric_values[0]],
    theta=metric_labels + [metric_labels[0]],
    fill='toself',
    name='Overall Metrics',
    hovertemplate='%{theta}: %{r:.2f}<extra></extra>',
)
polar_layout = dict(
    radialaxis=dict(range=[0., 1.]),
    angularaxis=dict(rotation=90, direction='clockwise'),
)

fig = go.Figure()
fig.add_trace(radar_trace)
fig.update_layout(polar=polar_layout, title="Overall Metrics", width=600, height=500)
fig.show()

## Model Predictions

In [None]:
# Prediction table returned by the thresholded provider `m`; the bare `df`
# on the last line is what makes the notebook render it.
df = m.prediction_table()
df

### How is the optimal confidence threshold calculated?

In [None]:
# Score profile as a table with display-friendly column names.
df = pd.DataFrame(score_profile)
df.columns = ['scores', 'Precision', 'Recall', 'F1']

# downsample: profiles longer than 5000 rows are thinned to about 1000 rows
df_down = df.iloc[::len(df)//1000] if len(df) > 5000 else df

color_map = {'Precision': '#1f77b4', 'Recall': 'orange'}
fig = px.line(
    df_down,
    x='scores',
    y=['Precision', 'Recall', 'F1'],
    title='Confidence Score Profile',
    labels={'value': 'Value', 'variable': 'Metric', 'scores': 'Confidence Score'},
    width=None,
    height=500,
    color_discrete_map=color_map,
)
fig.update_layout(
    yaxis=dict(range=[0, 1]),
    xaxis=dict(range=[0, 1], tick0=0, dtick=0.1),
)

# Dashed vertical marker at the best threshold, rising up to the best F1,
# with its label placed slightly above the top of the marker.
threshold_line = dict(color="gray", width=2, dash="dash")
fig.add_shape(type="line", x0=f1_optimal_conf, x1=f1_optimal_conf, y0=0, y1=best_f1, line=threshold_line)
fig.add_annotation(
    x=f1_optimal_conf,
    y=best_f1+0.04,
    text=f"F1-optimal threshold: {f1_optimal_conf:.2f}",
    showarrow=False,
)
fig.show()

In [None]:
# F1-score against confidence, one line per IoU threshold.
#
# downsample: use the same rule as the confidence score profile figure (thin
# only when there are more than 5000 points). Slicing with
# f1s.shape[1]//1000 unconditionally is a zero step (ValueError) below 1000
# scores, and for 1000-5000 scores it thins f1s while the score column stays
# full length, so the concatenate below fails on mismatched shapes.
n_scores = f1s.shape[1]
step = n_scores // 1000 if n_scores > 5000 else 1
f1s_down = f1s[:, ::step]
# assumes score_profile['scores'] has one entry per column of f1s -- the
# argmax lookup further down relies on the same alignment
scores_down = np.asarray(score_profile['scores'])[::step]

iou_names = [str(round(iou_thr, 2)) for iou_thr in m.iouThrs.tolist()]
df = pd.DataFrame(
    np.concatenate([scores_down[:, None], f1s_down.T], 1),
    columns=['scores'] + iou_names,
)
fig = px.line(df, x='scores', y=iou_names, title='F1-Score at different IoU Thresholds',
              labels={'value': 'Value', 'variable': 'IoU threshold', 'scores': 'Confidence Score'},
              color_discrete_sequence=px.colors.sequential.Viridis,
              width=None, height=500)
fig.update_layout(yaxis=dict(range=[0, 1]),
                  xaxis=dict(range=[0, 1], tick0=0, dtick=0.1))

# add annotations for maximum F1-Score for each IoU threshold
# (located on the full-resolution f1s, not on the downsampled copy)
for i in range(len(iou_names)):
    argmax_f1 = f1s[i].argmax()
    max_f1 = f1s[i][argmax_f1]
    score = score_profile['scores'][argmax_f1]
    fig.add_annotation(x=score, y=max_f1, text=f'Best score: {score:.2f}', showarrow=True, arrowhead=1, arrowcolor='black', ax=0, ay=-30)

fig.show()

## Outcome Counts

In [None]:
# Outcome counts: a single stacked horizontal bar with the TP / FN / FP totals
outcome_bars = (
    ('TP', m.TP_count, '#1fb466'),
    ('FN', m.FN_count, '#dd3f3f'),
    ('FP', m.FP_count, '#d5a5a5'),
)

fig = go.Figure()
for outcome_name, outcome_count, bar_color in outcome_bars:
    fig.add_trace(go.Bar(
        x=[outcome_count],
        y=["Outcome"],
        name=outcome_name,
        orientation='h',
        marker=dict(color=bar_color),
    ))
fig.update_layout(barmode='stack', title="Outcome Counts", width=600, height=300)
fig.update_xaxes(title_text="Count")
fig.update_yaxes(tickangle=-90)
fig.show()

## Recall

In [None]:
# Per-class metrics table; later cells read its "category", "precision",
# "recall" and "f1" columns.
per_class_metrics_df = m.per_class_metrics()

In [None]:
# Per-class Precision and Recall bar chart, classes ordered by ascending F1
per_class_metrics_df_sorted = per_class_metrics_df.sort_values(by="f1")

blue_color = '#1f77b4'
orange_color = '#ff7f0e'
categories_sorted = per_class_metrics_df_sorted["category"]

fig = go.Figure()
for metric_column, trace_name, trace_color in (
    ("precision", 'Precision', blue_color),
    ("recall", 'Recall', orange_color),
):
    fig.add_trace(go.Bar(
        y=per_class_metrics_df_sorted[metric_column],
        x=categories_sorted,
        name=trace_name,
        marker=dict(color=trace_color),
    ))
fig.update_layout(barmode='group', title="Per-class Precision and Recall (Sorted by F1)")
fig.update_xaxes(title_text="Category")
fig.update_yaxes(title_text="Value", range=[0, 1])
fig.show()

In [None]:
# Per-class Precision bar chart (reuses the F1-sorted table)
# per_class_metrics_df_sorted = per_class_metrics_df.sort_values(by="precision")
fig = px.bar(
    per_class_metrics_df_sorted,
    x='category',
    y='precision',
    title="Per-class Precision (Sorted by F1)",
    color='precision',
    color_continuous_scale='Plasma',
)
# Value labels are only added when there are at most 20 bars.
show_bar_labels = len(per_class_metrics_df_sorted) <= 20
if show_bar_labels:
    precision_labels = per_class_metrics_df_sorted["precision"].round(2)
    fig.update_traces(text=precision_labels, textposition='outside')
fig.update_xaxes(title_text="Category")
fig.update_yaxes(title_text="Precision", range=[0, 1])
fig.show()

In [None]:
# Per-class Recall bar chart (reuses the F1-sorted table)
# per_class_metrics_df_sorted = per_class_metrics_df.sort_values(by="recall")
fig = px.bar(
    per_class_metrics_df_sorted,
    x='category',
    y='recall',
    title="Per-class Recall (Sorted by F1)",
    color='recall',
    color_continuous_scale='Plasma',
)
# Value labels are only added when there are at most 20 bars.
show_bar_labels = len(per_class_metrics_df_sorted) <= 20
if show_bar_labels:
    recall_labels = per_class_metrics_df_sorted["recall"].round(2)
    fig.update_traces(text=recall_labels, textposition='outside')
fig.update_xaxes(title_text="Category")
fig.update_yaxes(title_text="Recall", range=[0, 1])
fig.show()

## PR-curve

In [None]:
# Precision values of the PR curve. The plotting cells below use m.recThrs as
# the x axis for its rows and m.cat_names as labels for its columns.
pr_curve = m.pr_curve()

In [None]:
# Precision-Recall curve, averaged over the last axis of pr_curve
mean_precision = pr_curve.mean(-1)
fig = px.line(
    x=m.recThrs,
    y=mean_precision,
    title="Precision-Recall Curve",
    labels={'x': 'Recall', 'y': 'Precision'},
    width=600,
    height=500,
)
model_trace = fig.data[0]
model_trace.name = "Model"
model_trace.showlegend = True
# Styled before the reference line is added, so only the model curve is filled.
fig.update_traces(fill='tozeroy', line=dict(color='#1f77b4'))

# Reference line: precision 1 at every recall threshold.
perfect_trace = go.Scatter(
    x=m.recThrs,
    y=[1]*len(m.recThrs),
    name="Perfect",
    line=dict(color='orange', dash='dash'),
    showlegend=True,
)
fig.add_trace(perfect_trace)

fig.add_annotation(
    text=f"mAP = {base_metrics['mAP']:.2f}",
    xref="paper",
    yref="paper",
    x=0.98,
    y=0.92,
    showarrow=False,
    bgcolor="white",
)
fig.show()

In [None]:
# Precision-Recall curve per-class: one column, and therefore one line, per category
df = pd.DataFrame(pr_curve, columns=m.cat_names)

fig = px.line(
    df,
    x=m.recThrs,
    y=df.columns,
    title="Precision-Recall Curve per Class",
    labels={"x": "Recall", "value": "Precision", "variable": "Category"},
    color_discrete_sequence=px.colors.qualitative.Prism,
    width=800,
    height=600,
)

# Both axes are fixed to the unit interval.
for update_axis in (fig.update_yaxes, fig.update_xaxes):
    update_axis(range=[0, 1])
fig.show()

## Confusion Matrix

In [None]:
# Confusion matrix from the metric provider. The plotting cell labels its rows
# and columns with m.cat_names followed by one extra "(None)" entry.
confusion_matrix = m.confusion_matrix()

In [None]:
# Confusion Matrix
cat_names = m.cat_names
none_name = "(None)"
axis_names = cat_names + [none_name]

# Cell colors use log(count). Cells with a zero count give log(0) = -inf;
# errstate only silences NumPy's "divide by zero" RuntimeWarning for them,
# the resulting values are unchanged.
with np.errstate(divide='ignore'):
    log_counts = np.log(confusion_matrix)

confusion_matrix_df = pd.DataFrame(log_counts, index=axis_names, columns=axis_names)
fig = px.imshow(confusion_matrix_df, labels=dict(x="Ground Truth", y="Predicted", color="Count"), title="Confusion Matrix (log-scale)",
                width=1000, height=1000)

# Hover text shows the raw counts rather than the log values used for color
fig.update_traces(customdata=confusion_matrix,
                  hovertemplate='Count: %{customdata}<br>Predicted: %{y}<br>Ground Truth: %{x}')

# Text on cells, only when there are at most 20 classes
if len(cat_names) <= 20:
    fig.update_traces(text=confusion_matrix,
                      texttemplate="%{text}")

fig.show()

## Frequently Confused Classes

In [None]:
# Frequency of confusion as bar chart (the provider is asked for 20 pairs)
confused_df = m.frequently_confused(confusion_matrix, topk_pairs=20)
confused_name_pairs = confused_df["category_pair"]
confused_prob = confused_df["probability"]
x_labels = [f"{pair[0]} - {pair[1]}" for pair in confused_name_pairs]

# Bars are colored by the same probability that sets their height.
confusion_bars = go.Bar(
    x=x_labels,
    y=confused_prob,
    marker=dict(color=confused_prob, colorscale="Reds"),
)
fig = go.Figure()
fig.add_trace(confusion_bars)
fig.update_layout(
    title="Frequently confused class pairs",
    xaxis_title="Class pair",
    yaxis_title="Probability",
)
fig.update_traces(text=confused_prob.round(2))
fig.show()

## IoU Distribution

In [None]:
# Histogram of the IoU values stored on the provider.
nbins = 40
iou_histogram = go.Histogram(x=m.ious, nbinsx=nbins)

fig = go.Figure()
fig.add_trace(iou_histogram)
fig.update_layout(
    title="IoU Distribution",
    xaxis_title="IoU",
    yaxis_title="Count",
    width=600,
    height=500,
)

# Add annotation for mean IoU as vertical line.
# Its height is the number of IoU values divided by the number of bins.
mean_iou = m.ious.mean()
y1 = len(m.ious) // nbins
mean_line = dict(color="orange", width=2, dash="dash")
fig.add_shape(type="line", x0=mean_iou, x1=mean_iou, y0=0, y1=y1, line=mean_line)
fig.add_annotation(x=mean_iou, y=y1, text=f"Mean IoU: {mean_iou:.2f}", showarrow=False)
fig.show()

## Calibration Score

In [None]:
# Calibration curve (only positive predictions), computed on the full provider
true_probs, pred_probs = m_full.calibration_metrics.calibration_curve()

model_curve = go.Scatter(
    x=pred_probs,
    y=true_probs,
    mode='lines+markers',
    name='Calibration plot (Model)',
    line=dict(color='blue'),
    marker=dict(color='blue'),
)
# Diagonal reference from (0, 0) to (1, 1).
diagonal = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Perfectly calibrated',
    line=dict(color='orange', dash='dash'),
)

fig = go.Figure()
fig.add_trace(model_curve)
fig.add_trace(diagonal)

fig.update_layout(
    title='Calibration Curve (only positive predictions)',
    xaxis_title='Confidence Score',
    yaxis_title='Fraction of True Positives',
    legend=dict(x=0.6, y=0.1),
    xaxis=dict(range=[0, 1]),
    yaxis=dict(range=[0, 1]),
    width=700, height=500
)

fig.show()

In [None]:
# Score profile as a table with display-friendly column names.
df = pd.DataFrame(score_profile)
df.columns = ['scores', 'Precision', 'Recall', 'F1']

# downsample: profiles longer than 5000 rows are thinned to about 1000 rows
df_down = df.iloc[::len(df)//1000] if len(df) > 5000 else df

color_map = {'Precision': '#1f77b4', 'Recall': 'orange'}
fig = px.line(
    df_down,
    x='scores',
    y=['Precision', 'Recall', 'F1'],
    title='Confidence Score Profile',
    labels={'value': 'Value', 'variable': 'Metric', 'scores': 'Confidence Score'},
    width=None,
    height=500,
    color_discrete_map=color_map,
)
fig.update_layout(
    yaxis=dict(range=[0, 1]),
    xaxis=dict(range=[0, 1], tick0=0, dtick=0.1),
)

# Dashed vertical marker at the best threshold, rising up to the best F1,
# with its label placed slightly above the top of the marker.
threshold_line = dict(color="gray", width=2, dash="dash")
fig.add_shape(type="line", x0=f1_optimal_conf, x1=f1_optimal_conf, y0=0, y1=best_f1, line=threshold_line)
fig.add_annotation(
    x=f1_optimal_conf,
    y=best_f1+0.04,
    text=f"F1-optimal threshold: {f1_optimal_conf:.2f}",
    showarrow=False,
)
fig.show()

In [None]:
# F1-score against confidence, one line per IoU threshold.
#
# downsample: use the same rule as the confidence score profile figure (thin
# only when there are more than 5000 points). Slicing with
# f1s.shape[1]//1000 unconditionally is a zero step (ValueError) below 1000
# scores, and for 1000-5000 scores it thins f1s while the score column stays
# full length, so the concatenate below fails on mismatched shapes.
n_scores = f1s.shape[1]
step = n_scores // 1000 if n_scores > 5000 else 1
f1s_down = f1s[:, ::step]
# assumes score_profile['scores'] has one entry per column of f1s -- the
# argmax lookup further down relies on the same alignment
scores_down = np.asarray(score_profile['scores'])[::step]

iou_names = [str(round(iou_thr, 2)) for iou_thr in m.iouThrs.tolist()]
df = pd.DataFrame(
    np.concatenate([scores_down[:, None], f1s_down.T], 1),
    columns=['scores'] + iou_names,
)
fig = px.line(df, x='scores', y=iou_names, title='F1-Score at different IoU Thresholds',
              labels={'value': 'Value', 'variable': 'IoU threshold', 'scores': 'Confidence Score'},
              color_discrete_sequence=px.colors.sequential.Viridis,
              width=None, height=500)
fig.update_layout(yaxis=dict(range=[0, 1]),
                  xaxis=dict(range=[0, 1], tick0=0, dtick=0.1))

# add annotations for maximum F1-Score for each IoU threshold
# (located on the full-resolution f1s, not on the downsampled copy)
for i in range(len(iou_names)):
    argmax_f1 = f1s[i].argmax()
    max_f1 = f1s[i][argmax_f1]
    score = score_profile['scores'][argmax_f1]
    fig.add_annotation(x=score, y=max_f1, text=f'Best score: {score:.2f}', showarrow=True, arrowhead=1, arrowcolor='black', ax=0, ay=-30)

fig.show()

In [None]:
# Histogram of confidence scores (TP vs FP), taken from the full provider
scores_tp, scores_fp = m_full.calibration_metrics.scores_tp_and_fp(iou_idx=0)

# 40 equal bins over [0, 1] for the outline curves; dx is half a bin width.
hist_kwargs = dict(bins=40, range=[0,1])
tp_y, tp_x = np.histogram(scores_tp, **hist_kwargs)
fp_y, fp_x = np.histogram(scores_fp, **hist_kwargs)
dx = (tp_x[1] - tp_x[0])/2

# Top of the y axis and of the threshold marker: 30% above the tallest TP bin.
y_top = tp_y.max()*1.3

fig = go.Figure()
fig.add_trace(go.Histogram(x=scores_fp, name='FP', marker=dict(color='#dd3f3f'), opacity=0.5, xbins=dict(size=0.025)))
fig.add_trace(go.Histogram(x=scores_tp, name='TP', marker=dict(color='#1fb466'), opacity=0.5, xbins=dict(size=0.025)))
fig.add_trace(go.Scatter(x=tp_x+dx, y=tp_y, mode='lines', name='TP', line=dict(color='#1fb466', width=3)))
fig.add_trace(go.Scatter(x=fp_x+dx, y=fp_y, mode='lines', name='FP', line=dict(color='#dd3f3f', width=3)))

# Best threshold
fig.add_shape(
    type="line",
    x0=f1_optimal_conf, x1=f1_optimal_conf,
    y0=0, y1=y_top,
    line=dict(color="orange", width=1, dash="dash"),
)
fig.add_annotation(
    x=f1_optimal_conf,
    y=y_top,
    text=f"F1-optimal threshold: {f1_optimal_conf:.2f}",
    showarrow=False,
)

fig.update_layout(
    barmode='overlay',
    title="Histogram of Confidence Scores (TP vs FP)",
    width=800,
    height=500,
)
fig.update_xaxes(title_text="Confidence Score", range=[0, 1])
fig.update_yaxes(title_text="Count", range=[0, y_top])
fig.show()

## Per-class

In [None]:
# AP per-class
# coco_precision is fixed at index 0 and index 2 on its last two axes and then
# averaged over its first two axes, leaving one value per entry of the
# remaining axis (plotted against m.cat_names below).
# NOTE(review): presumably this is pycocotools' precision array laid out as
# [iou, recall, class, area, maxDets]; if so, any -1 "no data" placeholders
# would be averaged in -- confirm.
precision_slice = m.coco_precision[:, :, :, 0, 2]
ap_per_class = precision_slice.mean(axis=(0, 1))

# Per-class Average Precision (AP)
fig = px.scatter_polar(
    r=ap_per_class,
    theta=m.cat_names,
    title="Per-class Average Precision (AP)",
    labels=dict(r="Average Precision", theta="Category"),
    width=800,
    height=800,
    range_r=[0, 1],
)
# fill points
fig.update_traces(fill='toself')
fig.show()

In [None]:
# Per-class Counts
iou_thres = 0

# Column `iou_thres` of each per-class outcome array.
tp, fp, fn = (
    counts[:, iou_thres]
    for counts in (m.true_positives, m.false_positives, m.false_negatives)
)

# normalize every outcome by support = tp + fn
support = tp + fn
tp_rel, fp_rel, fn_rel = tp / support, fp / support, fn / support

# sort by f1
sort_scores = 2 * tp / (2 * tp + fp + fn)

K = len(m.cat_names)
sort_indices = np.argsort(sort_scores)
cat_names_sorted = [m.cat_names[idx] for idx in sort_indices]
# Reorder the normalized arrays to match cat_names_sorted.
tp_rel = tp_rel[sort_indices]
fn_rel = fn_rel[sort_indices]
fp_rel = fp_rel[sort_indices]

In [None]:
# Stacked per-class outcomes, normalized.
# tp_rel / fn_rel / fp_rel were computed in the previous cell as the counts
# divided by support (tp + fn) and reordered by ascending F1, so the values
# plotted here are ratios, not totals. The title and axis label say so, which
# also distinguishes this figure from the absolute-count chart that follows.
data = {
    "count": np.concatenate([tp_rel, fn_rel, fp_rel]),
    "type": ["TP"]*K + ["FN"]*K + ["FP"]*K,
    "category": cat_names_sorted*3
}

df = pd.DataFrame(data)

color_map = {
    'TP': '#1fb466',
    'FN': '#dd3f3f',
    'FP': '#d5a5a5'
}
fig = px.bar(df, x="category", y="count", color="type",
             title="Per-class Outcome Counts (normalized by support)",
             labels={'count': 'Count / Support', "category": "Category"},
             color_discrete_map=color_map)

fig.show()

In [None]:
# Stacked per-class counts (absolute), classes in the F1 order of sort_indices
stacked_counts = np.concatenate([tp[sort_indices], fn[sort_indices], fp[sort_indices]])
outcome_types = ["TP"]*K + ["FN"]*K + ["FP"]*K
data = {
    "count": stacked_counts,
    "type": outcome_types,
    "category": cat_names_sorted*3,
}

df = pd.DataFrame(data)

color_map = {'TP': '#1fb466', 'FN': '#dd3f3f', 'FP': '#d5a5a5'}
fig = px.bar(
    df,
    x="category",
    y="count",
    color="type",
    title="Per-class Outcome Counts",
    labels={'count': 'Total Count', "category": "Category"},
    color_discrete_map=color_map,
)

fig.show()

## Corr-plot (experimental)

In [None]:
# Precision-recall for area bin
# n_bins: number of area bins; max_area: upper limit on the relative area
# (fraction of the image) that is kept.
n_bins, max_area = 40, 0.4

# One zero-filled accumulator per outcome type.
# NOTE(review): none of the cells visible in this notebook read these three
# arrays again.
tps, fps, fns = (np.zeros(n_bins) for _ in range(3))

def get_area(match, cocoGt: COCO):
    """Return the matched ground-truth box area as a fraction of its image area.

    `match` must provide 'image_id' and 'gt_id'. The image size is looked up
    in `cocoGt.imgs` and the box in `cocoGt.anns`; the box area is
    bbox[2] * bbox[3] (presumably width * height of a COCO [x, y, w, h] box).
    """
    image = cocoGt.imgs[match['image_id']]
    image_area = image['width'] * image['height']

    box = cocoGt.anns[match['gt_id']]['bbox']
    # Detection-based alternative, kept for reference:
    # dt_bbox = cocoDt.anns[match['dt_id']]
    # dt_area_abs = dt_bbox['area']
    # dt_area = dt_area_abs / img_area
    return box[2] * box[3] / image_area

# Relative ground-truth area of every TP and FN match. The loop variable is
# called `match` so it does not shadow the metric provider `m` inside the
# comprehension.
tp_areas = [get_area(match, cocoGt) for match in m.tp_matches]
fn_areas = [get_area(match, cocoGt) for match in m.fn_matches]
# Keep only areas below max_area and work with sqrt(area) from here on.
tp_areas = np.sqrt([a for a in tp_areas if a < max_area])
fn_areas = np.sqrt([a for a in fn_areas if a < max_area])

area_bins = np.linspace(0, np.sqrt(max_area), n_bins+1)
tp_hist = np.histogram(tp_areas, bins=area_bins)[0]
fn_hist = np.histogram(fn_areas, bins=area_bins)[0]
# Recall per bin = TP / (TP + FN). Bins without any object stay NaN, exactly
# as the plain 0/0 division produced, but no longer raise NumPy's
# invalid-value RuntimeWarning.
gt_hist = tp_hist + fn_hist
recall = np.divide(tp_hist, gt_hist, out=np.full(gt_hist.shape, np.nan), where=gt_hist > 0)
# Half a bin width, used to place bars at bin centers.
bin_dt = (area_bins[1] - area_bins[0]) / 2

In [None]:
# Recall per area bin; bars sit at the bin centers (left edge + half a bin width)
bin_centers = area_bins[:-1]+bin_dt
fig = px.bar(
    x=bin_centers,
    y=recall,
    title="Recall vs Area (sqrt)",
    labels={'x': 'Area (sqrt)', 'y': 'Recall'},
    color=recall,
    color_continuous_scale='Viridis',
)
fig.show()

In [None]:
# histogram of areas TP vs FN (overlaid, semi-transparent)
fig = go.Figure()
for areas, trace_name, trace_color in (
    (tp_areas, 'TP', '#1fb466'),
    (fn_areas, 'FN', '#dd3f3f'),
):
    fig.add_trace(go.Histogram(
        x=areas,
        name=trace_name,
        marker=dict(color=trace_color),
        opacity=0.5,
        xbins=dict(size=0.01),
    ))
# No fig.show() here: update_layout returns the figure, and leaving that call
# as the cell's last expression is what renders it.
fig.update_layout(
    barmode='overlay',
    title="Histogram of Areas (TP vs FN)",
    width=800,
    height=500,
)

## Per-class (experimental)

In [None]:
# Class inspected by the following cells, and its position in m.cat_names.
class_name = 'book'
class_idx = m.cat_names.index(class_name)

# Same slicing as the per-class AP figure, restricted to this class.
class_precision = m.coco_precision[:, :, class_idx, 0, 2]
ap_score = class_precision.mean(axis=(0, 1))
print(f"AP for {class_name}: {ap_score:.4f}")

In [None]:
# Outcome counts of the selected class at IoU-threshold index 0.
iou_thres = 0
tp_count = m.true_positives[class_idx, iou_thres]
fp_count = m.false_positives[class_idx, iou_thres]
fn_count = m.false_negatives[class_idx, iou_thres]

class_outcome_bars = (
    ('TP', tp_count, '#1fb466'),
    ('FN', fn_count, '#dd3f3f'),
    ('FP', fp_count, '#d5a5a5'),
)

fig = go.Figure()
for outcome_name, outcome_count, bar_color in class_outcome_bars:
    fig.add_trace(go.Bar(
        x=[outcome_count],
        y=["Outcome"],
        name=outcome_name,
        orientation='h',
        marker=dict(color=bar_color),
    ))
fig.update_layout(barmode='stack', title="Outcome Counts: " + class_name, width=600, height=300)
fig.update_xaxes(title_text="Count")
fig.update_yaxes(tickangle=-90)
fig.show()

In [None]:
# First row of the per-class metrics table whose category is the selected class.
class_rows = per_class_metrics_df[per_class_metrics_df["category"] == class_name]
cls_metrics = class_rows.to_dict(orient='records')[0]
print(f"F1-Score for {class_name}: {cls_metrics['f1']:.4f}")

blue_color = '#1f77b4'
orange_color = '#ff7f0e'

fig = go.Figure()
for metric_key, trace_name, trace_color in (
    ("precision", 'Precision', blue_color),
    ("recall", 'Recall', orange_color),
):
    fig.add_trace(go.Bar(
        y=[cls_metrics[metric_key]],
        x=[class_name],
        name=trace_name,
        marker=dict(color=trace_color),
    ))
fig.update_layout(barmode='group', title="Precision and Recall: " + class_name)
fig.update_xaxes(title_text="Category")
fig.update_yaxes(title_text="Value", range=[0, 1])
fig.show()

In [None]:
# Precision-Recall curve per-class, keeping only the selected class' column
all_classes_pr = pd.DataFrame(pr_curve, columns=m.cat_names)
df = all_classes_pr[[class_name]]

fig = px.line(
    df,
    x=m.recThrs,
    y=df.columns,
    title="Precision-Recall Curve: "+ class_name,
    labels={"x": "Recall", "value": "Precision", "variable": "Category"},
    color_discrete_sequence=px.colors.qualitative.Prism,
    width=800,
    height=600,
)

# Both axes are fixed to the unit interval.
for update_axis in (fig.update_yaxes, fig.update_xaxes):
    update_axis(range=[0, 1])
fig.show()

In [None]:
# Classes with a nonzero entry in the selected class' row or column of the
# confusion matrix; the last row/column (labelled "(None)" in the full plot)
# is left out of the search.
y_nz = np.nonzero(confusion_matrix[class_idx, :-1])[0]
x_nz = np.nonzero(confusion_matrix[:-1, class_idx])[0]
idxs = np.union1d(y_nz, x_nz)
# Always keep the selected class itself, even if its own entries are all zero.
if class_idx not in idxs:
    idxs = np.concatenate([idxs, [class_idx]])
idxs = np.sort(idxs)

# get confusion matrix for the selected classes
confusion_matrix_mini = confusion_matrix[idxs][:, idxs].copy()
# Zero the whole diagonal except the selected class' own cell: save that cell,
# clear the diagonal, then write the saved value back.
self_idx = idxs == class_idx
v = confusion_matrix_mini[self_idx, self_idx]
confusion_matrix_mini[np.diag_indices_from(confusion_matrix_mini)] *= 0
confusion_matrix_mini[self_idx, self_idx] = v

cat_names_cls = [m.cat_names[i] for i in idxs]
# Cell colors use log(count). Zero cells (including the cleared diagonal) give
# log(0) = -inf; errstate only silences NumPy's "divide by zero"
# RuntimeWarning for them, the resulting values are unchanged.
with np.errstate(divide='ignore'):
    log_counts_mini = np.log(confusion_matrix_mini)
confusion_matrix_df_mini = pd.DataFrame(log_counts_mini, index=cat_names_cls, columns=cat_names_cls)
fig = px.imshow(confusion_matrix_df_mini, labels=dict(x="Ground Truth", y="Predicted", color="Count"),
                title=f"Confusion Matrix: {class_name} (log-scale)")
                # width=1000, height=1000)

# Hover text shows the raw counts rather than the log values used for color
fig.update_traces(customdata=confusion_matrix_mini,
                  hovertemplate='Count: %{customdata}<br>Predicted: %{y}<br>Ground Truth: %{x}<extra></extra>')

# Text on cells, only when at most 20 classes are shown
if len(idxs) <= 20:
    fig.update_traces(text=confusion_matrix_mini,
                      texttemplate="%{text}")

fig.show()

In [None]:
# TODO: avoid "class_idx + 1"
# NOTE(review): this relies on a match's category_id being the class' position
# in m.cat_names plus one -- confirm against how the ids are assigned.
class_category_id = class_idx + 1
scores_tp = [match['score'] for match in m.tp_matches if match['category_id'] == class_category_id]
scores_fp = [match['score'] for match in m.fp_matches if match['category_id'] == class_category_id]

# 10 equal bins over [0, 1]; the plotly histograms use the matching bin size.
n_bins = 10
xbins_size = 1/n_bins
hist_kwargs = dict(bins=n_bins, range=[0,1])
tp_y, tp_x = np.histogram(scores_tp, **hist_kwargs)
fp_y, fp_x = np.histogram(scores_fp, **hist_kwargs)
dx = (tp_x[1] - tp_x[0])/2

# Top of the y axis and of the threshold marker: 30% above the tallest TP bin.
y_top = tp_y.max()*1.3

fig = go.Figure()
for scores, trace_name, trace_color in (
    (scores_tp, 'TP', '#1fb466'),
    (scores_fp, 'FP', '#dd3f3f'),
):
    fig.add_trace(go.Histogram(
        x=scores,
        name=trace_name,
        marker=dict(color=trace_color),
        opacity=0.5,
        xbins=dict(size=xbins_size),
    ))
# fig.add_trace(go.Scatter(x=tp_x+dx, y=tp_y, mode='lines', name='TP', line=dict(color='#1fb466', width=2)))
# fig.add_trace(go.Scatter(x=fp_x+dx, y=fp_y, mode='lines', name='FP', line=dict(color='#dd3f3f', width=2)))

# Best threshold
fig.add_shape(
    type="line",
    x0=f1_optimal_conf, x1=f1_optimal_conf,
    y0=0, y1=y_top,
    line=dict(color="orange", width=1, dash="dash"),
)
fig.add_annotation(
    x=f1_optimal_conf,
    y=y_top,
    text=f"F1-optimal threshold: {f1_optimal_conf:.2f}",
    showarrow=False,
)

fig.update_layout(
    barmode='overlay',
    title="Histogram of Confidence Scores (TP vs FP)",
    width=800,
    height=500,
)
fig.update_xaxes(title_text="Confidence Score", range=[0, 1])
fig.update_yaxes(title_text="Count", range=[0, y_top])
fig.show()

In [None]:
from matplotlib import cm

# 2-D histogram of true positives against errors (FP + FN), colored by the
# share of errors and dimmed by how many samples fall into each cell.
t = 0
# NOTE(review): other cells index true_positives as [class, iou_thres]; if the
# arrays are 2-D, .sum(0) leaves a 1-D array and `[:, t]` raises IndexError --
# confirm the shape this experimental cell expects.
tp = m.true_positives.sum(0)[:,t]
fp = m.false_positives.sum(0)[:,t]
fn = m.false_negatives.sum(0)[:,t]
errors = fp + fn

# Unit-width bins centered on the integer counts. The first histogram2d
# dimension (rows, y) is TP; the second (columns, x) is FP + FN.
y_edges = np.arange(min(tp) - 0.5, max(tp) + 1.5, 1)
x_edges = np.arange(min(errors) - 0.5, max(errors) + 1.5, 1)
heatmap, y_edges, x_edges = np.histogram2d(tp, errors, bins=(y_edges, x_edges))

z_max = np.max(heatmap)
gamma = 0.95
# log(z_max) is 0 when no cell holds more than one sample. The plain division
# then produced 0/0 = NaN (with a RuntimeWarning), which max() resolved to the
# 0.2 floor; that floor is applied explicitly instead.
log_z_max = np.log(z_max) if z_max > 1 else 0.0

colors = np.zeros((heatmap.shape[0], heatmap.shape[1], 3))  # for RGB channels
colormap_name = 'RdYlGn_r'
# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap is still available and returns the same colormap.
cmap = plt.get_cmap(colormap_name)

for i in range(heatmap.shape[0]):
    for j in range(heatmap.shape[1]):
        # Bin centers: rows follow the TP edges, columns the error edges.
        tp_val = y_edges[i] + 0.5
        error_val = x_edges[j] + 0.5

        intensity = heatmap[i, j]
        # Share of errors among all outcomes of the cell. With the reversed
        # 'RdYlGn_r' colormap a low share is green and a high share is red.
        if tp_val + error_val > 0:
            value = error_val / (tp_val + error_val)
        else:
            value = 0

        color = cmap(value)  # Get a color from a colormap
        # Adjust the color intensity based on the heatmap value
        if intensity > 0:
            if log_z_max > 0:
                brightness = max(0.2, np.log(intensity) / log_z_max)
            else:
                brightness = 0.2
            c = np.array(color[:3]) * brightness
            colors[i, j, :] = c**gamma
        else:
            colors[i, j, :] = np.array(color[:3]) * 0.12

# Plot the colored heatmap
fig = px.imshow(colors, labels=dict(x="Count of Errors", y="Count of True Predictions"), title="TP vs FP+FN", text_auto=True, origin='lower',
                width=1000, height=1000)

# Adding text to each pixel
# for i in range(heatmap.shape[0]):
#     for j in range(heatmap.shape[1]):
#         fig.add_annotation(
#             x=j, 
#             y=i, 
#             text=str(int(heatmap[i, j])),
#             showarrow=False,
#             font=dict(color="#ddd", size=10)
#         )

# fig.show()