In [None]:
!pip install pandas sentence-transformers scikit-learn


In [3]:
# -------------------------------
# Step 1: Load Required Libraries
# -------------------------------
# Importing necessary libraries for file operations, natural language processing, and data processing
import os
import nltk

# Downloading required NLTK datasets for tokenization and lemmatization
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Importing pandas for data manipulation and JSON handling
import pandas as pd
import json
import ast

# Importing sentence transformers for semantic similarity and cosine similarity calculations
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import torch

# -------------------------------
# Step 2: Initialize Tools
# -------------------------------
# Pre-trained sentence-embedding model used to score label/entity similarity
model = SentenceTransformer('all-MiniLM-L6-v2')

# WordNet lemmatizer used by the text normalization helper
lemmatizer = WordNetLemmatizer()

# -------------------------------
# Step 3: Load MSCOCO Entities
# -------------------------------
# Ground-truth table; the 'image_name' and 'entities' columns are used below
mscoco_df = pd.read_csv('mscoco_captions_entities.csv')

# Map each image name to its entities. The 'entities' cell holds the string
# form of a Python list, so it is parsed with ast.literal_eval; entries are
# trimmed and lowercased into a set for case-insensitive matching.
# If an image name occurs on several rows, the last row wins.
mscoco_entities = {
    image: {entity.strip().lower() for entity in ast.literal_eval(raw_entities)}
    for image, raw_entities in zip(mscoco_df['image_name'], mscoco_df['entities'])
}

# -------------------------------
# Step 4: Load YOLOv8 Detections
# -------------------------------
# Loading YOLOv8 detection results from the JSON file.
# To evaluate another generator (e.g. SDXL_1.0 or SD2), replace the file name
# with that generator's detections file.
# The file is opened explicitly as UTF-8 so decoding does not depend on the
# platform's default text encoding.
with open('Flux-Dev_yolov8_detections.json', 'r', encoding='utf-8') as f:
    detection_data = json.load(f)

# COCO class names; the list index is the numeric label id found in the
# detection boxes, so the order must not change.
coco_labels = [
    # 0-8
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
    "truck", "boat",
    # 9-13
    "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
    # 14-23
    "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear",
    "zebra", "giraffe",
    # 24-28
    "backpack", "umbrella", "handbag", "tie", "suitcase",
    # 29-38
    "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
    "baseball glove", "skateboard", "surfboard", "tennis racket",
    # 39-45
    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
    # 46-55
    "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog",
    "pizza", "donut", "cake",
    # 56-61
    "chair", "couch", "potted plant", "bed", "dining table", "toilet",
    # 62-67
    "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
    # 68-72
    "microwave", "oven", "toaster", "sink", "refrigerator",
    # 73-79
    "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
    "toothbrush",
]

# -------------------------------
# Step 5: Text Normalization Utils
# -------------------------------
# Defining function to normalize text by tokenizing, lowercasing, and lemmatizing
def normalize(text):
    """Return *text* lowercased, tokenized and lemmatized as one string.

    The text is lowercased and split with ``word_tokenize``; tokens that are
    not purely alphabetic (``str.isalpha``) are dropped, the remaining ones
    are lemmatized and joined with single spaces.
    """
    lemmas = []
    for token in word_tokenize(text.lower()):
        if token.isalpha():
            lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

# Defining function to check semantic relationship between two words
def is_semantically_related(word1, word2):
    """Return True if two words/phrases overlap lexically or are WordNet-related.

    Both inputs are normalized and split into token sets. They count as
    related when either:

    * the Jaccard similarity of the token sets is at least 0.5, or
    * some WordNet synset of a token from one side equals, or is a hypernym
      ancestor of, some synset of a token from the other side.

    The WordNet test considers every sense of every token, so it is
    deliberately permissive.

    Args:
        word1: First word or phrase.
        word2: Second word or phrase.

    Returns:
        bool: True when the inputs are related, False otherwise (including
        when either input has no alphabetic tokens after normalization).
    """
    tokens1 = set(normalize(word1).split())
    tokens2 = set(normalize(word2).split())
    if not tokens1 or not tokens2:
        return False

    # Lexical overlap between the two token sets
    jaccard = len(tokens1 & tokens2) / len(tokens1 | tokens2)
    if jaccard >= 0.5:
        return True

    # Look up the synsets once per token instead of once per token pair
    synsets1 = [syn for token in tokens1 for syn in wn.synsets(token)]
    synsets2 = [syn for token in tokens2 for syn in wn.synsets(token)]

    # Hypernym closures are walked at most once per synset and then reused;
    # each cached set also contains the synset itself so that one membership
    # test covers both "same synset" and "is an ancestor".
    ancestor_cache = {}

    def _self_and_ancestors(synset):
        """Return the synset together with all of its transitive hypernyms."""
        cached = ancestor_cache.get(synset)
        if cached is None:
            cached = set(synset.closure(lambda s: s.hypernyms()))
            cached.add(synset)
            ancestor_cache[synset] = cached
        return cached

    for syn1 in synsets1:
        for syn2 in synsets2:
            if syn2 in _self_and_ancestors(syn1):
                return True
            if syn1 in _self_and_ancestors(syn2):
                return True
    return False

# -------------------------------
# Step 6: Compare YOLO Labels to MSCOCO
# -------------------------------
# Per-label comparison rows, and per-image label sets for the CHAIR score
results = []
image_to_labels = {}  # For CHAIRi

# A label is flagged as hallucinated only when it is not semantically related
# to its best-matching entity AND its best cosine similarity is below this value
SIMILARITY_THRESHOLD = 0.45

# Embedding of each distinct YOLO label, computed once and reused across images
label_embedding_cache = {}

# Processing each detection entry from YOLOv8 results
for entry in detection_data:
    # Strip the generator-specific prefix and the extension so the name matches
    # the MSCOCO image names. Change the prefix when evaluating another
    # generator (e.g. SDXL_1.0 or SD2).
    image_name_full = entry['image']
    image_name = image_name_full.replace("Flux-Dev-Pregen-", "").replace(".jpeg", "")

    # Collect the distinct class names detected in this image
    detected_labels = set()
    for box in entry['boxes']:
        label_id = box['label']
        # Negative ids are rejected too: they would silently index from the
        # end of coco_labels and produce a wrong class name
        if 0 <= label_id < len(coco_labels):
            detected_labels.add(coco_labels[label_id].strip().lower())

    # Images without ground-truth entities cannot be evaluated
    ground_truth_entities = mscoco_entities.get(image_name, set())
    if not ground_truth_entities:
        continue

    # Nothing to compare; skip before paying for the entity embeddings
    if not detected_labels:
        continue

    # Sorted so that row order and argmax tie-breaking are reproducible
    # across runs (set iteration order is not)
    entity_list = sorted(ground_truth_entities)
    entity_embeddings = model.encode(entity_list, convert_to_tensor=True)

    # Labels flagged as hallucinated for this image
    hallucinated_labels = set()

    for label in sorted(detected_labels):
        # Encode each distinct label only once over the whole run
        if label not in label_embedding_cache:
            label_embedding_cache[label] = model.encode(label, convert_to_tensor=True)
        label_embedding = label_embedding_cache[label]

        # Best-matching ground-truth entity by cosine similarity
        sims = util.cos_sim(label_embedding, entity_embeddings)[0]
        max_sim = sims.max().item()
        matched_entity = entity_list[sims.argmax().item()]

        # A lexical/WordNet relation to the best match overrides the threshold
        if is_semantically_related(label, matched_entity):
            hallucinated = False
        else:
            hallucinated = max_sim < SIMILARITY_THRESHOLD

        # Recording hallucinated labels for CHAIR score calculation
        if hallucinated:
            hallucinated_labels.add(label)

        # Storing comparison results for this label
        results.append({
            'image_name': image_name,
            'yolo_label': label,
            'matched_entity': matched_entity,
            'similarity_score': round(max_sim, 4),
            'hallucinated': hallucinated
        })

    # Storing hallucination data for CHAIR Score computation
    image_to_labels[image_name] = {
        'hallucinated': hallucinated_labels,
        'all_detected': detected_labels
    }

# -------------------------------
# Step 7: Save Results
# -------------------------------
# Declaring the columns up front keeps the frame well-formed even when no
# label was compared (e.g. no image name matched MSCOCO); without them an
# empty frame has no 'hallucinated' column and the lookup raises KeyError.
results_df = pd.DataFrame(
    results,
    columns=['image_name', 'yolo_label', 'matched_entity', 'similarity_score', 'hallucinated'],
)

# Full per-label comparison table.
# Replace the file names with SDXL_1.0 / SD2 when generating their results.
results_df.to_csv('Flux-Dev_yolov8_vs_mscoco_hallucination_check.csv', index=False)

# Only the rows flagged as hallucinated. The explicit bool cast keeps this a
# boolean mask even when the column is empty (and therefore object-typed).
hallucinated_mask = results_df['hallucinated'].astype(bool)
results_df[hallucinated_mask].to_csv('Flux-Dev_yolov8_hallucinations_only.csv', index=False)

# -------------------------------
# Step 8: Final Metrics
# -------------------------------
# Label-level counts: each row of results_df is one checked YOLO label
total_preds = len(results_df)
hallucinations = results_df['hallucinated'].sum()
correct_detections = total_preds - hallucinations

# Precision and hallucination rate over all checked labels
if total_preds > 0:
    precision = correct_detections / total_preds
    hallucination_rate = hallucinations / total_preds
else:
    precision = 0.0
    hallucination_rate = 0.0

# CHAIR score: per-image share of detected labels that were hallucinated,
# averaged over the images recorded in image_to_labels
chair_scores = [
    len(labels['hallucinated']) / len(labels['all_detected'])
    for labels in image_to_labels.values()
    if len(labels['all_detected']) > 0
]
CHAIR = sum(chair_scores) / len(chair_scores) if chair_scores else 0.0

# Recall: correct detections divided by the number of ground-truth entities.
# NOTE(review): the denominator counts the entities of every image in
# mscoco_entities, and the numerator counts YOLO labels rather than distinct
# matched entities - confirm this is the intended definition.
total_gt_entities = sum(map(len, mscoco_entities.values()))
if total_gt_entities > 0:
    recall = correct_detections / total_gt_entities
else:
    recall = 0.0

# F1 Score: harmonic mean of precision and recall (0.0 when both are zero)
f1_score = (
    2 * (precision * recall) / (precision + recall)
    if precision + recall > 0
    else 0.0
)

# Printing all calculated metrics
print(f" Total YOLO Predictions Checked: {total_preds}")
print(f" Correct Detections (Matched MSCOCO): {correct_detections}")
print(f" Hallucinated Labels (No Match): {hallucinations}")
print(f" Precision: {precision:.4f}")
print(f" Recall: {recall:.4f}")
print(f" F1 Score: {f1_score:.4f}")
print(f" Hallucination Rate: {hallucination_rate:.4f}")
print(f" CHAIR Score (avg over images): {CHAIR:.4f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


 Total YOLO Predictions Checked: 507
 Correct Detections (Matched MSCOCO): 350
 Hallucinated Labels (No Match): 157
 Precision: 0.6903
 Recall: 0.5636
 F1 Score: 0.6206
 Hallucination Rate: 0.3097
 CHAIR Score (avg over images): 0.2378
