## Evaluation of our FoodDetectionModel pipeline (detection + classification)

In [2]:
import os 
import torch
from ultralytics import YOLO
from FoodDetection import ClassificationModelManager, FoodDetectionModel

In [3]:
# Weights file for each classifier; passed to the pipeline as `classification_config`.
clsModelsDict = {
    "meat": "classification_models/YOLO/meat.pt",
    "vegetable": "classification_models/YOLO/vegetable.pt",
    "fruit": "classification_models/YOLO/fruit.pt",
    "cheese-dairy": "classification_models/YOLO/cheese-dairy.pt",
    "bread-pasta-grains": "classification_models/YOLO/bread-pasta-grains.pt",
    "nuts-seeds": "classification_models/YOLO/nuts-seeds.pt",
    "misc": "classification_models/YOLO/misc.pt",
}

# Build the detection + classification pipeline evaluated in the cells below.
model = FoodDetectionModel(
    det_to_cls_group="pipeline_data/dicts/det_to_cls_groups.json",
    detection_id_to_name="pipeline_data/dicts/detect_classes_v4.json",
    classification_config=clsModelsDict,
    detection_model_path="runs/detect/yolo_det_v4_m/weights/best.pt",
)

In [None]:
import os
import json
import numpy as np
from sklearn.metrics import f1_score, hamming_loss, jaccard_score, precision_score, recall_score
from tqdm import tqdm


def evaluate_multilabel_detection(model, val_img_dir, binary_labels_val_dir, detect_dict_path, conf_threshold=0.3):
    """
    Runs multilabel evaluation for a detection model.

    Every ``.jpg``/``.png`` file (case-insensitive) in ``val_img_dir`` that has a
    matching ``<stem>.txt`` file in ``binary_labels_val_dir`` is passed through
    ``model.run``. The distinct ``pred_class_id`` values of the returned detections
    are turned into a binary presence vector of length ``num_classes`` and compared
    against the whitespace-separated integers read from the label file.

    Args:
        model: Object exposing ``run(image_path=..., conf_threshold=..., det_imgsz=...,
            verbose=...)`` that returns an iterable of dicts with a ``"pred_class_id"`` key.
        val_img_dir: Directory containing the validation images.
        binary_labels_val_dir: Directory containing one ``.txt`` label vector per image.
        detect_dict_path: Path to a JSON file; its number of entries defines ``num_classes``.
        conf_threshold: Confidence threshold forwarded to ``model.run``.

    Returns:
        dict with the keys ``macro_f1``, ``micro_f1``, ``samples_f1``, ``precision``,
        ``Recall``, ``hamming_loss`` and ``jaccard_score`` (precision/recall are
        micro-averaged; Jaccard is sample-averaged).

    Raises:
        ValueError: If a label file does not contain exactly ``num_classes`` values,
            or if no image with a label file could be evaluated.
    """

    with open(detect_dict_path, "r") as f:
        detection_dict = json.load(f)
    num_classes = len(detection_dict)

    # List the directory once; sort so the run does not depend on filesystem order.
    img_files = sorted(
        f for f in os.listdir(val_img_dir) if f.lower().endswith((".jpg", ".png"))
    )

    print(f"Starting evaluation of {len(img_files)} files...")

    all_y_true = []
    all_y_pred = []
    missing_labels = 0

    bar_format = "{l_bar}{bar} {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"

    for file_name in tqdm(
        img_files,
        desc="Evaluating",
        bar_format=bar_format,
        leave=True,
    ):

        # Load ground-truth labels first so no inference is spent on images that
        # end up being skipped. splitext handles any extension casing (".JPG")
        # and never touches a ".jpg"/".png" occurring in the middle of the name.
        y_true_file = os.path.join(
            binary_labels_val_dir,
            os.path.splitext(file_name)[0] + ".txt"
        )

        try:
            with open(y_true_file, "r") as f:
                y_true = [int(x) for x in f.read().split()]
        except FileNotFoundError:
            missing_labels += 1
            continue

        # A ragged label matrix would otherwise fail later with an opaque
        # numpy/sklearn error that does not name the offending file.
        if len(y_true) != num_classes:
            raise ValueError(
                f"Label file {y_true_file} has {len(y_true)} values, expected {num_classes}."
            )

        img_path = os.path.join(val_img_dir, file_name)

        # Run model
        output = model.run(
            image_path=img_path,
            conf_threshold=conf_threshold,
            det_imgsz=800,
            verbose=False
        )

        # Extract unique predicted classes
        unique_pred_classes = {det["pred_class_id"] for det in output}

        # Create binary prediction vector
        y_pred = [1 if i in unique_pred_classes else 0 for i in range(num_classes)]

        all_y_true.append(y_true)
        all_y_pred.append(y_pred)

    if missing_labels:
        # Make silently dropped samples visible; they shrink the evaluated set.
        print(f"Skipped {missing_labels} images without a label file.")

    if not all_y_true:
        raise ValueError(
            f"No images with matching label files found in {val_img_dir} / {binary_labels_val_dir}."
        )

    Y_true_np = np.array(all_y_true)
    Y_pred_np = np.array(all_y_pred)

    print(f"Processed {len(Y_true_np)} samples.")

    macro_f1 = f1_score(Y_true_np, Y_pred_np, average="macro", zero_division=0)
    micro_f1 = f1_score(Y_true_np, Y_pred_np, average="micro", zero_division=0)

    samples_f1 = f1_score(Y_true_np, Y_pred_np, average="samples", zero_division=0)

    micro_prec = precision_score(Y_true_np, Y_pred_np, average="micro", zero_division=0)
    micro_rec = recall_score(Y_true_np, Y_pred_np, average="micro", zero_division=0)
    h_loss = hamming_loss(Y_true_np, Y_pred_np)
    j_score = jaccard_score(Y_true_np, Y_pred_np, average="samples", zero_division=0)

    print("\n--- Multi-label Classification Metrics ---")
    print(f"Macro F1:   {macro_f1:.4f}")
    print(f"Micro F1:   {micro_f1:.4f}")
    print(f"Samples F1:     {samples_f1:.4f}")
    print(f"Precision: {micro_prec:.4f}")
    print(f"Recall:  {micro_rec:.4f}")
    print(f"Hamming:    {h_loss:.4f}")
    print(f"Jaccard:    {j_score:.4f}")

    # NOTE: the "Recall" key is capitalised unlike the others; it is kept unchanged
    # so existing consumers of the returned dict keep working.
    return {
        "macro_f1": macro_f1,
        "micro_f1": micro_f1,
        "samples_f1": samples_f1,
        "precision": micro_prec,
        "Recall": micro_rec,
        "hamming_loss": h_loss,
        "jaccard_score": j_score
    }

In [5]:
# Validation run at confidence threshold 0.3.
print("Evaluation metrics with confidence threshold 0.3:")
metrics = evaluate_multilabel_detection(
    model,
    "data/dataset_v4/val/images",
    "data/dataset_v4/val/binary_labels",
    "pipeline_data/dicts/detect_classes_v4.json",
    conf_threshold=0.3,
)

Evaluation metrics with confidence threshold 0.3:
Starting evaluation of 3599 files...


Evaluating: 100%|██████████ 3599/3599 [03:01<00:00]

Processed 3599 samples.

--- Multi-label Classification Metrics ---
Macro F1:   0.7303
Micro F1:   0.7433
Samples F1:     0.7238
Precision: 0.7591
Recall:  0.7282
Hamming:    0.0073
Jaccard:    0.6883





In [6]:
# Validation run at confidence threshold 0.2.
print("Evaluation metrics with confidence threshold 0.2:")
metrics = evaluate_multilabel_detection(
    model,
    "data/dataset_v4/val/images",
    "data/dataset_v4/val/binary_labels",
    "pipeline_data/dicts/detect_classes_v4.json",
    conf_threshold=0.2,
)

Evaluation metrics with confidence threshold 0.2:
Starting evaluation of 3599 files...


Evaluating: 100%|██████████ 3599/3599 [03:06<00:00]


Processed 3599 samples.

--- Multi-label Classification Metrics ---
Macro F1:   0.7261
Micro F1:   0.7373
Samples F1:     0.7337
Precision: 0.7181
Recall:  0.7575
Hamming:    0.0079
Jaccard:    0.6929


In [7]:
# Validation run at confidence threshold 0.15.
print("Evaluation metrics with confidence threshold 0.15:")
metrics = evaluate_multilabel_detection(
    model,
    "data/dataset_v4/val/images",
    "data/dataset_v4/val/binary_labels",
    "pipeline_data/dicts/detect_classes_v4.json",
    conf_threshold=0.15,
)

Evaluation metrics with confidence threshold 0.15:
Starting evaluation of 3599 files...


Evaluating: 100%|██████████ 3599/3599 [02:50<00:00]


Processed 3599 samples.

--- Multi-label Classification Metrics ---
Macro F1:   0.7181
Micro F1:   0.7288
Samples F1:     0.7339
Precision: 0.6901
Recall:  0.7721
Hamming:    0.0084
Jaccard:    0.6893


In [8]:
# Validation run at confidence threshold 0.4.
print("Evaluation metrics with confidence threshold 0.4:")
metrics = evaluate_multilabel_detection(
    model,
    "data/dataset_v4/val/images",
    "data/dataset_v4/val/binary_labels",
    "pipeline_data/dicts/detect_classes_v4.json",
    conf_threshold=0.4,
)

Evaluation metrics with confidence threshold 0.4:
Starting evaluation of 3599 files...


Evaluating: 100%|██████████ 3599/3599 [02:42<00:00]


Processed 3599 samples.

--- Multi-label Classification Metrics ---
Macro F1:   0.7291
Micro F1:   0.7422
Samples F1:     0.7053
Precision: 0.7911
Recall:  0.6991
Hamming:    0.0071
Jaccard:    0.6735


In [9]:
# Validation run at confidence threshold 0.5.
print("Evaluation metrics with confidence threshold 0.5:")
metrics = evaluate_multilabel_detection(
    model,
    "data/dataset_v4/val/images",
    "data/dataset_v4/val/binary_labels",
    "pipeline_data/dicts/detect_classes_v4.json",
    conf_threshold=0.5,
)

Evaluation metrics with confidence threshold 0.5:
Starting evaluation of 3599 files...


Evaluating: 100%|██████████ 3599/3599 [02:40<00:00]


Processed 3599 samples.

--- Multi-label Classification Metrics ---
Macro F1:   0.7196
Micro F1:   0.7340
Samples F1:     0.6826
Precision: 0.8144
Recall:  0.6681
Hamming:    0.0070
Jaccard:    0.6529
