In [None]:
import numpy as np
from matplotlib import pyplot as plt
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval, Params
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from collections import defaultdict
import random
import pickle

from model_benchmark import metrics, utils
from model_benchmark.metric_provider import MetricProvider, METRIC_NAMES
from model_benchmark import metric_provider

## Loading data

In [None]:
# Input locations, grouped by what they feed.

# Ground-truth COCO annotations used for both models.
cocoGt_path = "cocoGt_remap.json"

# Model 1 (named "YOLOv8-L" further down): detections + pickled evaluation data.
cocoDt1_path = "data/model-benchmark/COCO 2017 val (YOLOv8-L, conf-0.01)/cocoDt.json"
eval_data1_path = "eval_data_conf-0.01.pkl"

# Model 2 (named "RT-DETR r34" further down): detections + pickled evaluation data.
cocoDt2_path = "data/model-benchmark/COCO 2017 val (RT-DETR r34, conf-0.01)/cocoDt.json"
eval_data2_path = "eval_data_rtdetr_conf-0.01.pkl"

In [None]:
def _read_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code; only point this at trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Ground truth is loaded first; each detection file is then loaded against it via loadRes.
cocoGt = COCO(cocoGt_path)
cocoDt1, cocoDt2 = cocoGt.loadRes(cocoDt1_path), cocoGt.loadRes(cocoDt2_path)

# Pickled evaluation data; later cells read the 'matches', 'coco_metrics' and 'params' keys.
eval_data1 = _read_pickle(eval_data1_path)
eval_data2 = _read_pickle(eval_data2_path)

In [None]:
# Re-execute the metric_provider module before building the provider.
from importlib import reload
reload(metric_provider)

# Provider for model 1 built from the matches exactly as stored in eval_data1
# (the confidence filter is only applied in a later cell).
eval_args1 = [eval_data1[key] for key in ('matches', 'coco_metrics', 'params')]
m_full1 = metric_provider.MetricProvider(*eval_args1, cocoGt, cocoDt1)
m_full1.base_metrics()

In [None]:
# Re-execute the metric_provider module before building the provider.
from importlib import reload
reload(metric_provider)

# Provider for model 2 built from the matches exactly as stored in eval_data2
# (the confidence filter is only applied in a later cell).
eval_args2 = [eval_data2[key] for key in ('matches', 'coco_metrics', 'params')]
m_full2 = metric_provider.MetricProvider(*eval_args2, cocoGt, cocoDt2)
m_full2.base_metrics()

## F1-optimal conf

In [None]:
# Confidence score profile of the full model-1 provider, and the (confidence, F1)
# pair that get_f1_optimal_conf derives from it.
score_profile, f1s = m_full1.confidence_score_profile()
f1_optimal_conf1, best_f1 = m_full1.get_f1_optimal_conf(score_profile)
print(f"F1-Optimal confidence: {f1_optimal_conf1:.4f} with f1: {best_f1:.4f}")

# Rebuild the provider on the matches filtered by that confidence; m1 is what the
# comparison charts below use.
matches_thresholded = metric_provider.filter_by_conf(eval_data1['matches'], f1_optimal_conf1)
m1 = metric_provider.MetricProvider(
    matches_thresholded,
    eval_data1['coco_metrics'],
    eval_data1['params'],
    cocoGt,
    cocoDt1,
)
m1.model_name = "YOLOv8-L"
m1.base_metrics()

In [None]:
# Confidence score profile of the full model-2 provider, and the (confidence, F1)
# pair that get_f1_optimal_conf derives from it.
# Note: this rebinds score_profile / best_f1 / matches_thresholded from the model-1 cell.
score_profile, f1s = m_full2.confidence_score_profile()
f1_optimal_conf2, best_f1 = m_full2.get_f1_optimal_conf(score_profile)
print(f"F1-Optimal confidence: {f1_optimal_conf2:.4f} with f1: {best_f1:.4f}")

# Rebuild the provider on the matches filtered by that confidence; m2 is what the
# comparison charts below use.
matches_thresholded = metric_provider.filter_by_conf(eval_data2['matches'], f1_optimal_conf2)
m2 = metric_provider.MetricProvider(
    matches_thresholded,
    eval_data2['coco_metrics'],
    eval_data2['params'],
    cocoGt,
    cocoDt2,
)
m2.model_name = "RT-DETR r34"
m2.base_metrics()

## Overview

In [None]:
# Overall metrics comparison: one closed radar polygon per model.
base_metrics1 = m1.base_metrics()
base_metrics2 = m2.base_metrics()
r1 = list(base_metrics1.values())
r2 = list(base_metrics2.values())

# Axis labels come from model 1's metric keys, mapped to display names.
theta = [metric_provider.METRIC_NAMES[key] for key in base_metrics1.keys()]
closed_theta = theta + [theta[0]]

fig = go.Figure()
for provider, radii in ((m1, r1), (m2, r2)):
    # Repeat the first point at the end so the polygon is closed.
    fig.add_trace(go.Scatterpolar(
        r=radii + [radii[0]],
        theta=closed_theta,
        name=provider.model_name,
        hovertemplate='%{theta}: %{r:.2f}<extra></extra>',
    ))

polar_layout = dict(
    radialaxis=dict(range=[0., 1.]),
    angularaxis=dict(rotation=90, direction='clockwise'),
)
fig.update_layout(polar=polar_layout, title="Overall Metrics", width=600, height=500)
fig.show()

In [None]:
# Grouped bar chart: one group per metric, one bar per model.
base_metrics1, base_metrics2 = m1.base_metrics(), m2.base_metrics()

metric_labels = [metric_provider.METRIC_NAMES[key] for key in base_metrics1.keys()]
model_columns = [m1.model_name, m2.model_name]

# Wide table: a "metric" column plus one value column named after each model.
df = pd.DataFrame({
    "metric": metric_labels,
    model_columns[0]: list(base_metrics1.values()),
    model_columns[1]: list(base_metrics2.values()),
})

fig = px.bar(df, x='metric', y=model_columns, barmode='group')
fig.update_layout(title="Overall Metrics")
fig.show()

## Outcome Counts

In [None]:
# Outcome counts: one stacked horizontal bar per model, split into TP / FN / FP.
# (legend label, provider attribute holding the count, bar color)
outcome_styles = [
    ('TP', 'TP_count', '#1fb466'),
    ('FN', 'FN_count', '#dd3f3f'),
    ('FP', 'FP_count', '#d5a5a5'),
]

fig = go.Figure()
for model_idx, provider in enumerate((m1, m2)):
    for outcome, count_attr, bar_color in outcome_styles:
        fig.add_trace(go.Bar(
            x=[getattr(provider, count_attr)],
            y=[provider.model_name],
            name=outcome,
            orientation='h',
            marker=dict(color=bar_color),
            # Both models' bars for one outcome share a legend group, and only the
            # first model's trace is listed. Without this the legend shows
            # TP / FN / FP twice (one entry per trace).
            legendgroup=outcome,
            showlegend=(model_idx == 0),
        ))

fig.update_layout(barmode='stack', title="Outcome Counts",
                  width=600, height=350)
fig.update_xaxes(title_text="Count")
fig.update_yaxes(tickangle=0)
fig.show()

## Per-class Precision, Recall and F1

In [None]:
# Named hex colors for the charts (blue first, orange second).
blue_color, orange_color = '#1f77b4', '#ff7f0e'

In [None]:
# Per-class metric tables for the two thresholded providers.
per_class_metrics_df1 = m1.per_class_metrics()
per_class_metrics_df2 = m2.per_class_metrics()


def _sorted_pair_mean(column):
    """Average ``column`` across the two per-class tables and sort the result ascending."""
    pair_sum = per_class_metrics_df1[column] + per_class_metrics_df2[column]
    return (pair_sum / 2).sort_values()


# Mean of the two models per class; the index of each result gives a class ordering.
f1_mean = _sorted_pair_mean("f1")
pr_mean = _sorted_pair_mean("precision")
rc_mean = _sorted_pair_mean("recall")

In [None]:
# Per-class Precision line chart
per_class_metrics_df_sorted1 = per_class_metrics_df1.iloc[f1_mean.index]
per_class_metrics_df_sorted2 = per_class_metrics_df2.iloc[f1_mean.index]

fig = go.Figure()
fig.add_trace(go.Scatter(x=per_class_metrics_df_sorted1["category"], y=per_class_metrics_df_sorted1["precision"],
                            mode='lines', name=m1.model_name, fill='tozeroy', line=dict(width=3)))
fig.add_trace(go.Scatter(x=per_class_metrics_df_sorted2["category"], y=per_class_metrics_df_sorted2["precision"],
                            mode='lines', name=m2.model_name, fill='tozeroy', line=dict(width=3)))

fig.update_layout(title="Per-class Precision", xaxis_title="Category", yaxis_title="Precision",
                    width=None, height=500)
fig.update_xaxes(tickangle=45)
fig.update_yaxes(range=[0., 1.])
fig.show()

In [None]:
# Per-class Recall line chart
per_class_metrics_df_sorted1 = per_class_metrics_df1.iloc[f1_mean.index]
per_class_metrics_df_sorted2 = per_class_metrics_df2.iloc[f1_mean.index]

fig = go.Figure()
fig.add_trace(go.Scatter(x=per_class_metrics_df_sorted1["category"], y=per_class_metrics_df_sorted1["recall"],
                            mode='lines', name=m1.model_name, fill='tozeroy', line=dict(width=3)))
fig.add_trace(go.Scatter(x=per_class_metrics_df_sorted2["category"], y=per_class_metrics_df_sorted2["recall"],
                            mode='lines', name=m2.model_name, fill='tozeroy', line=dict(width=3)))

fig.update_layout(title="Per-class Recall", xaxis_title="Category", yaxis_title="Recall",
                    width=None, height=500)
fig.update_xaxes(tickangle=45)
fig.update_yaxes(range=[0., 1.])
fig.show()

In [None]:
# Per-class F1 bar chart
per_class_metrics_df_sorted1 = per_class_metrics_df1.iloc[f1_mean.index]
per_class_metrics_df_sorted2 = per_class_metrics_df2.iloc[f1_mean.index]

fig = go.Figure()
fig.add_trace(go.Scatter(x=per_class_metrics_df_sorted1["category"], y=per_class_metrics_df_sorted1["f1"],
                            mode='lines', name=m1.model_name, fill='tozeroy', line=dict(width=3)))
fig.add_trace(go.Scatter(x=per_class_metrics_df_sorted2["category"], y=per_class_metrics_df_sorted2["f1"],
                            mode='lines', name=m2.model_name, fill='tozeroy', line=dict(width=3)))

fig.update_layout(title="Per-class F1", xaxis_title="Category", yaxis_title="F1",
                    width=None, height=500)
fig.update_xaxes(tickangle=45)
fig.update_yaxes(range=[0., 1.])
fig.show()

## PR-curve

In [None]:
# PR curves of the two thresholded providers (model 1 first, then model 2).
pr_curve1, pr_curve2 = m1.pr_curve(), m2.pr_curve()

In [None]:
import plotly.colors

# Plotly's default qualitative palette (hex strings) ...
colors = px.colors.qualitative.Plotly

# ... and the same colors as "rgba(r, g, b, 0.35)" strings, used as translucent fills.
colors_a = []
for hex_color in colors:
    red, green, blue = plotly.colors.hex_to_rgb(hex_color)
    colors_a.append(f"rgba({red}, {green}, {blue}, 0.35)")

In [None]:
# Precision-Recall curve
# Each model contributes two traces: a filled area (this one carries the legend entry)
# and the same line again without fill and without a legend entry.
pr_traces = []
for idx, (provider, curve) in enumerate(((m1, pr_curve1), (m2, pr_curve2))):
    mean_precision = curve.mean(-1)  # average over the last axis of the curve array
    outline = dict(width=2, color=colors[idx])
    pr_traces.append(go.Scatter(
        x=provider.recThrs, y=mean_precision, name=provider.model_name,
        line=outline, mode='lines', fill='tozeroy', fillcolor=colors_a[idx],
    ))
    pr_traces.append(go.Scatter(
        x=provider.recThrs, y=mean_precision, name=provider.model_name,
        line=outline, mode='lines', showlegend=False,
    ))

fig = go.Figure()
fig.add_traces(pr_traces)
fig.update_layout(title="Precision-Recall Curve", xaxis_title="Recall", yaxis_title="Precision",
                    width=700, height=600)
fig.update_yaxes(range=[0., 1.01])
fig.update_xaxes(range=[0., 1.])

# Dashed reference line at precision 1 across model 1's recall thresholds.
perfect_line = go.Scatter(
    x=m1.recThrs,
    y=[1]*len(m1.recThrs),
    name="Perfect",
    line=dict(color='orange', dash='dash'),
    showlegend=True
)
fig.add_trace(perfect_line)
fig.show()

## Calibration

In [None]:
# Calibration curve (only positive predictions)
# The curves come from the full providers (m_full1 / m_full2), but the notebook only
# assigns model_name on the thresholded providers m1 / m2, so the legend labels are
# taken from those instead of m_full*.model_name.
fig = go.Figure()
for full_provider, legend_name in ((m_full1, m1.model_name), (m_full2, m2.model_name)):
    true_probs, pred_probs = full_provider.calibration_metrics.calibration_curve()
    fig.add_trace(go.Scatter(x=pred_probs, y=true_probs, mode='lines+markers', name=legend_name))

# Diagonal reference line.
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Perfectly calibrated',
                         line=dict(color='orange', dash='dash')))

fig.update_layout(
    title='Calibration Curve (only positive predictions)',
    xaxis_title='Confidence Score',
    yaxis_title='Fraction of True Positives',
    legend=dict(x=0.6, y=0.1),
    xaxis=dict(range=[0, 1]),
    yaxis=dict(range=[0, 1]),
    width=700, height=500
)

fig.show()

In [None]:
# Confidence score profile: Precision / Recall / F1 plotted against the confidence score.
#
# `score_profile` and `best_f1` are whatever the last-run F1-optimal cell left behind;
# run top to bottom, that is the model-2 (m_full2) cell. The threshold marker therefore
# uses f1_optimal_conf2. The bare name `f1_optimal_conf` was never defined anywhere in
# the notebook, so the cell previously failed with NameError on a fresh kernel.
f1_optimal_conf = f1_optimal_conf2

df = pd.DataFrame(score_profile)
df.columns = ['scores', 'Precision', 'Recall', 'F1']

# downsample: above 5000 rows keep every (len // 1000)-th row, i.e. roughly 1000 points
if len(df) > 5000:
    df_down = df.iloc[::len(df)//1000]
else:
    df_down = df

# Fixed colors for Precision and Recall; F1 is not listed and gets plotly's default.
color_map = {
    'Precision': '#1f77b4',
    'Recall': 'orange',
}
fig = px.line(df_down, x='scores', y=['Precision', 'Recall', 'F1'], title='Confidence Score Profile',
              labels={'value': 'Value', 'variable': 'Metric', 'scores': 'Confidence Score'},
              width=None, height=500,
              color_discrete_map=color_map)
fig.update_layout(yaxis=dict(range=[0, 1]),
                  xaxis=dict(range=[0, 1], tick0=0, dtick=0.1))

# Add vertical line for the best threshold
fig.add_shape(type="line", x0=f1_optimal_conf, x1=f1_optimal_conf, y0=0, y1=best_f1, line=dict(color="gray", width=2, dash="dash"))
fig.add_annotation(x=f1_optimal_conf, y=best_f1+0.04, text=f"F1-optimal threshold: {f1_optimal_conf:.2f}", showarrow=False)
fig.show()