In [2]:
import os 
import torch
from ultralytics import YOLO
from FoodDetection import ClassificationModelManager, FoodDetectionModel

In [4]:
# Second-stage classifier checkpoints, keyed by food group. Every checkpoint
# lives at classification_models/YOLO/<group>.pt, so the mapping is derived
# from the group names (insertion order matches the listing order below).
clsModelsDict = {
    group: f"classification_models/YOLO/{group}.pt"
    for group in (
        "meat",
        "vegetable",
        "fruit",
        "cheese-dairy",
        "bread-pasta-grains",
        "nuts-seeds",
        "misc",
    )
}

# Assemble the pipeline object from the detector weights, the per-group
# classifier checkpoints and the two JSON lookup tables.
# NOTE(review): presumably det_to_cls_group routes each detected class to one
# of the groups above -- confirm against FoodDetectionModel.
model = FoodDetectionModel(
    detection_model_path="runs/detect/yolo_det_v4_m/weights/best.pt",
    classification_config=clsModelsDict,
    detection_id_to_name="pipeline_data/dicts/detect_classes_v4.json",
    det_to_cls_group="pipeline_data/dicts/det_to_cls_groups.json",
)

In [18]:
import os
import json
import numpy as np
from sklearn.metrics import f1_score, hamming_loss, jaccard_score, precision_score, recall_score
from tqdm import tqdm


def evaluate_multilabel_detection(
    model,
    val_img_dir,
    binary_labels_val_dir,
    detect_dict_path,
    conf_threshold=0.3,
    det_imgsz=800,
):
    """
    Runs multilabel evaluation for a detection model.

    Every ``.jpg``/``.png`` image (extension matched case-insensitively) in
    ``val_img_dir`` is paired with the label file ``<image stem>.txt`` in
    ``binary_labels_val_dir``. All label files are read and validated first,
    so malformed ground truth is reported before any inference time is spent.
    The model is then run on each labelled image and its detections are
    collapsed into a binary "class present in image" vector, which is compared
    with the ground truth.

    Parameters
    ----------
    model : object
        Detection model with .run(...) method returning an iterable of
        detections, each indexable by ``"pred_class_id"``.
    val_img_dir : str
        Path to directory with validation images.
    binary_labels_val_dir : str
        Path to directory with binary multilabel .txt files
        (whitespace-separated 0/1 values, one value per class).
    detect_dict_path : str
        Path to JSON file mapping class names/IDs. Size determines num_classes.
    conf_threshold : float, optional
        Confidence threshold forwarded to ``model.run``. Defaults to 0.3.
    det_imgsz : int, optional
        Detection image size forwarded to ``model.run``. Defaults to 800.

    Returns
    -------
    dict
        Dictionary with metrics: macro_f1, micro_f1, samples_f1, precision
        (micro), Recall (micro; also available under the lowercase key
        ``recall``), hamming_loss, jaccard_score (sample-averaged).

    Raises
    ------
    ValueError
        If no image has a matching label file, or if a label file does not
        contain exactly ``num_classes`` values that are all 0 or 1.
    """

    # Load detection dictionary → determine number of classes
    with open(detect_dict_path, "r", encoding="utf-8") as f:
        num_classes = len(json.load(f))

    # Sorted so that progress and log output are reproducible across runs
    # (os.listdir returns entries in arbitrary order).
    img_files = sorted(
        f for f in os.listdir(val_img_dir) if f.lower().endswith((".jpg", ".png"))
    )

    print(f"Starting evaluation of {len(img_files)} files...")

    # Pass 1 (cheap): resolve and validate ground truth before running the model.
    samples = []
    missing_labels = []
    for file_name in img_files:
        # splitext keeps the lookup consistent with the case-insensitive image
        # filter above and never touches ".jpg"/".png" inside the stem.
        y_true_file = os.path.join(
            binary_labels_val_dir,
            os.path.splitext(file_name)[0] + ".txt",
        )

        try:
            with open(y_true_file, "r") as f:
                y_true = [int(x) for x in f.read().split()]
        except FileNotFoundError:
            # Best-effort: unlabelled images are skipped, but reported below.
            missing_labels.append(file_name)
            continue

        if len(y_true) != num_classes:
            raise ValueError(
                f"Label file {y_true_file} has {len(y_true)} values, "
                f"expected {num_classes} (size of {detect_dict_path})."
            )
        if any(value not in (0, 1) for value in y_true):
            raise ValueError(f"Label file {y_true_file} contains non-binary values.")

        samples.append((file_name, y_true))

    if missing_labels:
        preview = ", ".join(missing_labels[:5])
        print(
            f"Skipping {len(missing_labels)} image(s) without a label file "
            f"(e.g. {preview})."
        )

    if not samples:
        raise ValueError(
            f"No evaluable samples: no image in {val_img_dir} has a matching "
            f"label file in {binary_labels_val_dir}."
        )

    # Pass 2 (expensive): run the model on every labelled image.
    all_y_true = []
    all_y_pred = []

    bar_format = "{l_bar}{bar} {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"

    for file_name, y_true in tqdm(
        samples,
        desc="Evaluating",
        bar_format=bar_format,
        leave=True,
    ):
        output = model.run(
            image_path=os.path.join(val_img_dir, file_name),
            conf_threshold=conf_threshold,
            det_imgsz=det_imgsz,
            verbose=False,
        )

        # Collapse detections to the set of distinct predicted class ids.
        # NOTE(review): assumes pred_class_id is an integer index aligned with
        # the column order of the label files; ids outside range(num_classes)
        # are ignored by the vector below -- confirm against the model.
        unique_pred_classes = {det["pred_class_id"] for det in output}

        # Create binary prediction vector
        y_pred = [1 if i in unique_pred_classes else 0 for i in range(num_classes)]

        all_y_true.append(y_true)
        all_y_pred.append(y_pred)

    # Convert lists to NumPy arrays
    Y_true_np = np.array(all_y_true)
    Y_pred_np = np.array(all_y_pred)

    print(f"Processed {len(Y_true_np)} samples.")

    # Compute metrics
    macro_f1 = f1_score(Y_true_np, Y_pred_np, average="macro", zero_division=0)
    micro_f1 = f1_score(Y_true_np, Y_pred_np, average="micro", zero_division=0)

    # Samples F1: how well a single plate was scored (average per image)
    samples_f1 = f1_score(Y_true_np, Y_pred_np, average="samples", zero_division=0)

    micro_prec = precision_score(Y_true_np, Y_pred_np, average="micro", zero_division=0)
    micro_rec = recall_score(Y_true_np, Y_pred_np, average="micro", zero_division=0)
    h_loss = hamming_loss(Y_true_np, Y_pred_np)
    j_score = jaccard_score(Y_true_np, Y_pred_np, average="samples", zero_division=0)

    print("\n--- Multi-label Classification Metrics ---")
    print(f"Macro F1:   {macro_f1:.4f}")
    print(f"Micro F1:   {micro_f1:.4f}")
    print(f"Samples F1:     {samples_f1:.4f}")
    print(f"Precision: {micro_prec:.4f}")
    print(f"Recall:  {micro_rec:.4f}")
    print(f"Hamming:    {h_loss:.4f}")
    print(f"Jaccard:    {j_score:.4f}")

    return {
        "macro_f1": macro_f1,
        "micro_f1": micro_f1,
        "samples_f1": samples_f1,
        "precision": micro_prec,
        # "Recall" is kept for existing callers; "recall" matches the casing
        # of every other key.
        "Recall": micro_rec,
        "recall": micro_rec,
        "hamming_loss": h_loss,
        "jaccard_score": j_score
    }

In [19]:
# Score the pipeline on the v4 validation split. The class dictionary is the
# same JSON file that was passed to FoodDetectionModel as detection_id_to_name.
metrics = evaluate_multilabel_detection(
    model,
    detect_dict_path="pipeline_data/dicts/detect_classes_v4.json",
    binary_labels_val_dir="data/dataset_v4/val/binary_labels",
    val_img_dir="data/dataset_v4/val/images",
)

Starting evaluation of 3599 files...


Evaluating: 100%|██████████ 3599/3599 [02:59<00:00]


Processed 3599 samples.

--- Multi-label Classification Metrics ---
Macro F1:   0.7303
Micro F1:   0.7433
Samples F1:     0.7238
Precision: 0.7591
Recall:  0.7282
Hamming:    0.0073
Jaccard:    0.6883
