# Generating the captions from Transformer models

In [None]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, ViTImageProcessor, VisionEncoderDecoderModel, AutoTokenizer
from PIL import Image
import pandas as pd
import os

# Folder on Google Drive that holds the generated images to be captioned
image_folder = "/content/drive/MyDrive/Fine-Grained-Hallucination-main/sd_2_outputs"

# Destination CSV for the BLIP and ViT-GPT2 captions
csv_output_path = "/content/drive/MyDrive/image_captions.csv"

# Hugging Face checkpoint identifiers, named once so that each processor,
# model and tokenizer is guaranteed to come from the same checkpoint
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"
VIT_GPT2_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

# BLIP: the processor handles both image preprocessing and caption decoding
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# ViT-GPT2: image processor, encoder-decoder model and a separate tokenizer for decoding
vit_processor, vit_model, vit_tokenizer = (
    ViTImageProcessor.from_pretrained(VIT_GPT2_CHECKPOINT),
    VisionEncoderDecoderModel.from_pretrained(VIT_GPT2_CHECKPOINT),
    AutoTokenizer.from_pretrained(VIT_GPT2_CHECKPOINT),
)

# Defining a function that is generating captions for a given image
def generate_caption(image_path, processor, model, tokenizer=None):
    """Generate a caption for one image with either the ViT-GPT2 or the BLIP pipeline.

    Args:
        image_path: Path of the image file to caption.
        processor: A ``ViTImageProcessor`` selects the ViT-GPT2 branch; any
            other processor is treated as a BLIP processor.
        model: Generation model matching ``processor``.
        tokenizer: Tokenizer used to decode ViT-GPT2 output. Required on the
            ViT branch, ignored on the BLIP branch.

    Returns:
        The decoded caption string with special tokens removed.

    Raises:
        ValueError: If the ViT branch is selected without a tokenizer.
    """
    # The context manager closes the underlying file; convert() hands back an
    # independent in-memory RGB image that stays valid after the file is closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    if isinstance(processor, ViTImageProcessor):
        # Failing early with a clear message instead of an AttributeError on None
        if tokenizer is None:
            raise ValueError("tokenizer is required when captioning with a ViTImageProcessor")

        pixel_values = processor(images=image, return_tensors="pt").pixel_values

        # Inference only, so gradient tracking is switched off
        with torch.no_grad():
            output_ids = model.generate(pixel_values)

        # Decoding the first (only) generated sequence
        caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    else:
        # BLIP branch: the processor builds the model inputs and also decodes
        inputs = processor(image, return_tensors="pt")

        with torch.no_grad():
            output = model.generate(**inputs)

        caption = processor.batch_decode(output, skip_special_tokens=True)[0]

    return caption

# Accumulating one record per successfully captioned image
captions_data = []

for image_name in os.listdir(image_folder):
    image_path = os.path.join(image_folder, image_name)

    # Skipping anything that is not a PNG/JPG/JPEG file (case-insensitive)
    if not image_path.lower().endswith((".png", ".jpg", ".jpeg")):
        continue

    try:
        # Captioning the same image with both models
        record = {
            "image_id": image_name,
            "blip_caption": generate_caption(image_path, blip_processor, blip_model),
            "vit_caption": generate_caption(image_path, vit_processor, vit_model, tokenizer=vit_tokenizer)
        }
    except Exception as e:
        # A failing image is reported and left out of the CSV; the loop continues
        print(f"Failed to process {image_name}: {e}")
    else:
        captions_data.append(record)

# Turning the collected records into a table and writing it to disk
df = pd.DataFrame(captions_data)
df.to_csv(csv_output_path, index=False)

print(f"Captions saved to {csv_output_path}")


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_to

Captions saved to /content/drive/MyDrive/image_captions.csv


In [None]:
!pip install --upgrade nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')  # Explicitly download punkt_tab




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=40932ebdb7f28c646098ec774315d89efda413237416ff6c40f89f1e9fe88f4a
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:

# Reading the unsorted BLIP / ViT captions
csv_path = "/content/drive/MyDrive/image_captions.csv"
df = pd.read_csv(csv_path)

# First run of digits in each image_id, cast to int so that rows sort numerically
# rather than lexicographically
df['numeric_id'] = df['image_id'].str.extract(r'(\d+)', expand=False).astype(int)

# Ordering by the numeric key, then discarding the helper column from the result
df_sorted = df.sort_values("numeric_id")
df_sorted = df_sorted.drop(columns=['numeric_id'])

# Writing the ordered table to its own file
sorted_csv_path = "/content/drive/MyDrive/sorted_images_captions.csv"
df_sorted.to_csv(sorted_csv_path, index=False)

print(f"Sorted CSV saved successfully at {sorted_csv_path}")


In [None]:
# Reading the original prompts file; it is re-saved below under a shorter name
# with an added 'image_id' column
csv_path = "/content/drive/MyDrive/Fine-Grained-Hallucination-main/DrawBenchPrompts.csv"
df = pd.read_csv(csv_path)

# One ID per row, derived from the row position: '0.jpg', '1.jpg', ...
image_ids = [f"{i}.jpg" for i in range(len(df))]
df.insert(0, 'image_id', image_ids)

# Writing the augmented table under the new name
new_csv_path = "/content/drive/MyDrive/Fine-Grained-Hallucination-main/Prompts.csv"
df.to_csv(new_csv_path, index=False)

print(f"Updated CSV saved successfully as {new_csv_path}")


Updated CSV saved successfully as /content/drive/MyDrive/Fine-Grained-Hallucination-main/Prompts.csv


In [None]:
#Importing all necessary libraries
import os
import time
import string
import nltk
from ast import literal_eval
from PIL import Image
from tqdm import tqdm
import google.generativeai as genai

# NLTK-related imports
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score

# ROUGE scoring
from rouge_score import rouge_scorer

# Sentence similarity tools
from sentence_transformers import SentenceTransformer, util


# Calculating various metrics

In [None]:
# Loading the sorted caption table and the prompt table; they are merged on 'image_id' further down
captions_df = pd.read_csv('/content/drive/MyDrive/sorted_images_captions.csv')
prompts_df = pd.read_csv('/content/drive/MyDrive/Fine-Grained-Hallucination-main/Prompts.csv')

# ROUGE scorer configured for ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled
# (compute_scores only reads the ROUGE-1 F-measure from it)
# BLEU smoothing helper; compute_scores passes its method1 to sentence_bleu
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
smoothing = SmoothingFunction()

# Defining a function that is computing the BLEU, METEOR, and ROUGE scores for a given prompt and caption
def compute_scores(prompt, caption):
    """Score a generated caption against its reference prompt.

    Args:
        prompt: Reference text. Non-string values are converted with ``str``.
        caption: Candidate text. Non-string values are converted with ``str``.

    Returns:
        A tuple ``(bleu, meteor, rouge1_f)``: sentence-level BLEU smoothed with
        method1, METEOR, and the ROUGE-1 F-measure.
    """
    # pandas yields NaN (a float) for empty CSV cells, which the tokenizer cannot
    # handle; coercing to str keeps this in line with the later compute_scores cells
    if not isinstance(prompt, str): prompt = str(prompt)
    if not isinstance(caption, str): caption = str(caption)

    # BLEU and METEOR operate on token lists
    tokenized_prompt = word_tokenize(prompt)
    tokenized_caption = word_tokenize(caption)

    # The prompt is the single reference, the caption the hypothesis
    bleu = sentence_bleu([tokenized_prompt], tokenized_caption, smoothing_function=smoothing.method1)

    meteor = meteor_score([tokenized_prompt], tokenized_caption)

    # ROUGE works on the raw strings (target first, prediction second)
    rouge_scores = rouge.score(prompt, caption)

    return bleu, meteor, rouge_scores['rouge1'].fmeasure

# Aligning every caption row with its prompt through the shared 'image_id' column
merged_df = captions_df.merge(prompts_df, on='image_id')

# One dictionary of scores per image
results = []

for _, record in merged_df.iterrows():
    reference = record['Prompts']

    # Each call yields (BLEU, METEOR, ROUGE-1 F-measure) for one captioning model
    blip_metrics = compute_scores(reference, record['blip_caption'])
    vit_metrics = compute_scores(reference, record['vit_caption'])

    results.append({
        'image_id': record['image_id'],
        'bleu_blip': blip_metrics[0],
        'bleu_vit': vit_metrics[0],
        'meteor_blip': blip_metrics[1],
        'meteor_vit': vit_metrics[1],
        'rouge_blip': blip_metrics[2],
        'rouge_vit': vit_metrics[2]
    })

# Persisting the per-image scores
final_df = pd.DataFrame(results)
final_df.to_csv('/content/drive/MyDrive/Fine-Grained-Hallucination-main/1final.csv', index=False)

# Column-wise means of the numeric score columns
averages = final_df.mean(numeric_only=True)
print("Average Scores:\n", averages)


Average Scores:
 bleu_blip      0.038710
bleu_vit       0.033498
meteor_blip    0.227122
meteor_vit     0.186605
rouge_blip     0.314312
rouge_vit      0.289201
dtype: float64


In [None]:
# Fetching the NLTK stop-word corpus that stopwords.words('english') reads in later cells
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Calculating similarity scores between the original prompts and the captions generated by the transformer models

In [None]:
# Downloading the NLTK resources used in this cell: 'punkt' (tokenizer), 'stopwords' and 'wordnet'
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Loading the sorted caption table and the prompt table; they are merged on 'image_id' further down
captions_df = pd.read_csv('/content/drive/MyDrive/sorted_images_captions.csv')
prompts_df = pd.read_csv('/content/drive/MyDrive/Fine-Grained-Hallucination-main/Prompts.csv')

# Evaluation tools: ROUGE scorer (ROUGE-1/2/L, stemming on), BLEU smoothing helper,
# and the 'all-MiniLM-L6-v2' sentence-embedding model used for cosine similarity
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
smoothing = SmoothingFunction()
model = SentenceTransformer('all-MiniLM-L6-v2')

# Lemmatizer and English stop-word set
# NOTE(review): neither name is referenced by the functions visible in this notebook — confirm before removing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Defining a function that is computing the semantic similarity between a prompt and generated caption
def compute_semantic_similarity(prompt, caption):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded with the module-level ``model`` (prompt first, then
    caption) and the similarity is returned as a plain Python float.
    """
    embeddings = [model.encode(text, convert_to_tensor=True) for text in (prompt, caption)]
    similarity = util.cos_sim(embeddings[0], embeddings[1])
    # cos_sim gives a 1x1 tensor for a single pair; .item() unwraps it
    return similarity.item()

# Defining a function that is computing various evaluation metrics between a prompt and its corresponding caption
def compute_scores(prompt, caption):
    """Score a generated caption against its reference prompt.

    Non-string inputs are converted with ``str`` first.

    Returns:
        A tuple ``(bleu, meteor, rouge1_f, semantic_similarity)``: sentence-level
        BLEU smoothed with method1, METEOR, the ROUGE-1 F-measure, and the
        embedding cosine similarity.
    """
    # Normalizing both inputs to text before any metric sees them
    prompt = prompt if isinstance(prompt, str) else str(prompt)
    caption = caption if isinstance(caption, str) else str(caption)

    # Token lists for the n-gram based metrics
    reference_tokens = word_tokenize(prompt)
    candidate_tokens = word_tokenize(caption)

    bleu = sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=smoothing.method1,
    )
    meteor = meteor_score([reference_tokens], candidate_tokens)

    # ROUGE and the embedding similarity work on the raw strings
    rouge1_f = rouge.score(prompt, caption)['rouge1'].fmeasure
    semantic_similarity = compute_semantic_similarity(prompt, caption)

    return bleu, meteor, rouge1_f, semantic_similarity

# Aligning every caption row with its prompt through the shared 'image_id' column
merged_df = captions_df.merge(prompts_df, on='image_id')

# One dictionary of scores per image
results = []

for _, record in merged_df.iterrows():
    reference = record['Prompts']

    # Each call yields (BLEU, METEOR, ROUGE-1 F-measure, semantic similarity)
    blip_metrics = compute_scores(reference, record['blip_caption'])
    vit_metrics = compute_scores(reference, record['vit_caption'])

    results.append({
        'image_id': record['image_id'],
        'bleu_blip': blip_metrics[0],
        'bleu_vit': vit_metrics[0],
        'meteor_blip': blip_metrics[1],
        'meteor_vit': vit_metrics[1],
        'rouge_blip': blip_metrics[2],
        'rouge_vit': vit_metrics[2],
        'semantic_blip': blip_metrics[3],
        'semantic_vit': vit_metrics[3]
    })

# Persisting the per-image scores
final_df = pd.DataFrame(results)
final_df.to_csv('/content/drive/MyDrive/Fine-Grained-Hallucination-main/modifies.csv', index=False)

# Column-wise means of the numeric score columns
averages = final_df.mean(numeric_only=True)
print("Average Scores:\n", averages)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Average Scores:
 bleu_blip        0.038710
bleu_vit         0.033498
meteor_blip      0.227122
meteor_vit       0.186605
rouge_blip       0.314312
rouge_vit        0.289201
semantic_blip    0.542584
semantic_vit     0.427803
dtype: float64


# Calculating CHAIR score using a custom function

In [None]:
# Loading the caption table and the prompt table; the prompts act as ground truth in this cell
captions_df = pd.read_csv('/content/drive/MyDrive/sorted_images_captions.csv')  # BLIP / ViT captions, one row per image
prompts_df = pd.read_csv('/content/drive/MyDrive/Fine-Grained-Hallucination-main/Prompts.csv')  # Reference prompts keyed by image_id

# Defining a function that is extracting objects from a given text
def extract_objects(text):
    """Return the lowercase word tokens of ``text`` that are used as "objects".

    The input is converted with ``str`` and lowercased before tokenizing.
    Tokens without any letter or digit (pure punctuation such as "." or ",")
    are dropped, because they are not objects and would otherwise be counted
    as hallucinations whenever the caption and prompt punctuate differently.

    NOTE(review): every remaining word, including function words like "a" or
    "of", still counts as an object; restricting to nouns would be closer to
    the usual CHAIR definition — confirm the intended behavior.
    """
    tokens = nltk.word_tokenize(str(text).lower())
    return [token for token in tokens if any(char.isalnum() for char in token)]

# Defining a function that is identifying hallucinations by comparing caption and ground truth objects
def compute_hallucinations(caption_objects, gt_objects):
    """Return the set of caption objects that do not occur in the ground truth.

    Duplicates in either input are irrelevant: both are reduced to their
    unique members before comparison.
    """
    reference = frozenset(gt_objects)
    return {item for item in set(caption_objects) if item not in reference}

# Merging the datasets on 'image_id' to align generated captions with their respective prompts
merged_df = captions_df.merge(prompts_df, on='image_id')

# Per-image CHAIR_i records and the number of images with any hallucinated object
chair_scores = []
captions_with_hallucinations = 0

for _, row in merged_df.iterrows():
    image_id = row['image_id']

    # Objects mentioned in the ground-truth prompt
    gt_objects = extract_objects(row['Prompts'])

    # Unique objects per caption: the hallucinated objects are a set of unique
    # items, so the denominator has to count unique items as well
    blip_unique = set(extract_objects(row['blip_caption']))
    vit_unique = set(extract_objects(row['vit_caption']))

    # Objects present in a caption but absent from the prompt
    hallucinated_blip = compute_hallucinations(blip_unique, gt_objects)
    hallucinated_vit = compute_hallucinations(vit_unique, gt_objects)

    # CHAIR_i: share of a caption's unique objects that are hallucinated (0 for an empty caption)
    chair_i_blip = len(hallucinated_blip) / len(blip_unique) if blip_unique else 0
    chair_i_vit = len(hallucinated_vit) / len(vit_unique) if vit_unique else 0

    # An image counts once if either model's caption contains a hallucinated object
    if hallucinated_blip or hallucinated_vit:
        captions_with_hallucinations += 1

    chair_scores.append({
        'image_id': image_id,
        'chair_i_blip': chair_i_blip,
        'chair_i_vit': chair_i_vit,
        'hallucinated_objects_blip': list(hallucinated_blip),
        'hallucinated_objects_vit': list(hallucinated_vit)
    })

# CHAIR_s: fraction of evaluated images with at least one hallucination. The
# denominator is the number of rows actually evaluated (merged_df), not the raw
# caption count, so captions without a matching prompt do not dilute the score.
total_captions = len(merged_df)
chair_s = captions_with_hallucinations / total_captions if total_captions else 0.0

# Saving the per-image results
final_df = pd.DataFrame(chair_scores)
final_df.to_csv('/content/drive/MyDrive/Fine-Grained-Hallucination-main/CHAIR_Score_Results.csv', index=False)

print(f"CHAIR_s Score: {chair_s}")


CHAIR_s Score: 1.0


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Calculating a SPICE proxy score (embedding cosine similarity)

In [None]:
# Loading the caption table and the prompt table used as references
captions_df = pd.read_csv('/content/drive/MyDrive/sorted_images_captions.csv')  # BLIP / ViT captions, one row per image
prompts_df = pd.read_csv('/content/drive/MyDrive/Fine-Grained-Hallucination-main/Prompts.csv')  # Reference prompts keyed by image_id

# Inner merge on 'image_id' (the pandas default), pairing each caption row with its prompt
merged_df = captions_df.merge(prompts_df, on='image_id')

# Sentence-embedding model; compute_spice encodes both texts with it
model = SentenceTransformer('all-MiniLM-L6-v2')

# Defining a function that is computing semantic similarity between the reference prompt and the generated caption
def compute_spice(reference, candidate):
    """Return an embedding-based similarity used here as a stand-in for SPICE.

    This is not the scene-graph SPICE metric: both texts are encoded with the
    module-level ``model`` and their cosine similarity is returned.

    Args:
        reference: Reference prompt. Non-string values are converted with ``str``.
        candidate: Generated caption. Non-string values are converted with ``str``.

    Returns:
        The cosine similarity as a Python float.
    """
    # Empty CSV cells arrive as NaN floats; coercing to str mirrors what
    # compute_scores does before computing the same similarity
    if not isinstance(reference, str): reference = str(reference)
    if not isinstance(candidate, str): candidate = str(candidate)

    reference_embedding = model.encode(reference, convert_to_tensor=True)
    candidate_embedding = model.encode(candidate, convert_to_tensor=True)

    # cos_sim gives a 1x1 tensor for a single pair; .item() unwraps it
    return util.cos_sim(reference_embedding, candidate_embedding).item()

# One record of proxy-SPICE scores per image
results = []

for _, record in merged_df.iterrows():
    reference = record['Prompts']

    # Scoring the BLIP caption first, then the ViT caption, against the same prompt
    results.append({
        "image_id": record['image_id'],
        "spice_blip": compute_spice(reference, record['blip_caption']),
        "spice_vit": compute_spice(reference, record['vit_caption'])
    })

# Persisting the per-image scores
final_df = pd.DataFrame(results)
final_df.to_csv('/content/drive/MyDrive/Fine-Grained-Hallucination-main/Custom_SPICE_Scores.csv', index=False)

# Column-wise means of the numeric score columns
averages = final_df.mean(numeric_only=True)
print("Average Custom SPICE Scores:\n", averages)


Average Custom SPICE Scores:
 spice_blip    0.542584
spice_vit     0.427803
dtype: float64


# Generating captions from gemini-1.5-flash

In [None]:
# Configuring the Gemini API. The key is read from the GOOGLE_API_KEY environment
# variable, with an interactive prompt as fallback, so that it is never stored in
# the notebook. Any key that was previously committed here should be revoked.
from getpass import getpass

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Gemini API key: ")
if not GOOGLE_API_KEY:
    raise RuntimeError("No Gemini API key provided; set GOOGLE_API_KEY or enter it when prompted")
genai.configure(api_key=GOOGLE_API_KEY)

# Defining configuration settings for the caption generation process
CONFIG = {
    # NOTE(review): the BLIP/ViT cell reads images from
    # ".../MyDrive/Fine-Grained-Hallucination-main/sd_2_outputs"; confirm this
    # different folder is intended.
    "image_folder": "/content/drive/MyDrive/sd_2_outputs",  # Directory where images are stored
    "csv_output_path": "/content/drive/MyDrive/gemini_gen_captions.csv",  # Output file for captions
    "resize_dim": (512, 512),  # Images are resized to this size before upload
    "retries": 3,  # Number of attempts per image
    "base_delay": 1,  # Base delay in seconds for the exponential retry backoff
    "request_delay": 10,  # Fixed pause in seconds between images to respect rate limits
    "save_interval": 5,  # Progress is written to disk after every N images
    "prompt": "Generate a single line caption describing what's in the image using fewer words. Keep it short and simple, and generate only the caption."
}

# Initializing the Gemini model that is being used for caption generation
model = genai.GenerativeModel("gemini-1.5-flash")

# Defining a function that is loading existing caption data if available
def load_existing_data():
    """Load captions saved by a previous run so that finished images are skipped.

    Rows whose caption starts with "Error:" (the marker generate_caption writes
    after exhausting its retries) are dropped, so those images are attempted
    again instead of keeping the error text as their caption forever.

    Returns:
        A tuple ``(processed_images, captions_data)``: the set of image IDs that
        already have a caption, and the matching rows as a list of dictionaries.
        Both are empty when the CSV does not exist yet.
    """
    if not os.path.exists(CONFIG["csv_output_path"]):
        return set(), []

    df_existing = pd.read_csv(CONFIG["csv_output_path"])

    # Older files without the caption column are kept untouched
    if "gemini_caption" in df_existing.columns:
        failed = df_existing["gemini_caption"].astype(str).str.startswith("Error:")
        df_existing = df_existing[~failed]

    processed_images = set(df_existing["image_id"])
    captions_data = df_existing.to_dict("records")
    return processed_images, captions_data

# Defining a function that is generating a caption for a given image
def generate_caption(image_path):
    """Ask Gemini for a one-line caption of the image at ``image_path``.

    The image is converted to RGB and resized to CONFIG["resize_dim"] before
    being sent together with CONFIG["prompt"]. Up to CONFIG["retries"] attempts
    are made, sleeping CONFIG["base_delay"] * 2**attempt seconds between them.

    Returns:
        The stripped response text, "No caption generated" when the response
        text is empty, or "Error: <message>" once the last attempt has failed.
    """
    for attempt in range(CONFIG["retries"]):
        try:
            with Image.open(image_path) as raw_image:
                # RGB conversion followed by resizing to the configured size
                prepared = raw_image.convert("RGB").resize(CONFIG["resize_dim"])

                response = model.generate_content(
                    [prepared, CONFIG["prompt"]],
                    generation_config={"temperature": 0.2}  # Low temperature for less random output
                )

                text = response.text
                return text.strip() if text else "No caption generated"

        except Exception as e:
            print(f"\nAttempt {attempt + 1} failed: {str(e)}")
            is_last_attempt = attempt >= CONFIG["retries"] - 1
            if is_last_attempt:
                return f"Error: {str(e)}"
            # Exponential backoff before the next attempt
            time.sleep(CONFIG["base_delay"] * (2 ** attempt))

# Defining the main function that is processing images and generating captions
def main():
    """Caption every not-yet-processed image in CONFIG["image_folder"] with Gemini.

    Previously saved captions are loaded first, the remaining images are
    captioned one by one with a fixed pause between requests, and the CSV at
    CONFIG["csv_output_path"] is rewritten every CONFIG["save_interval"] images
    and once more at the end.
    """
    processed_images, captions_data = load_existing_data()
    image_files = [f for f in os.listdir(CONFIG["image_folder"])
                  if f.lower().endswith((".png", ".jpg", ".jpeg"))]

    # The work list is derived from the folder contents, so the counts below stay
    # correct even when the CSV mentions images that are no longer in the folder
    pending_images = [img for img in image_files if img not in processed_images]

    total_images = len(image_files)
    remaining_images = len(pending_images)
    processed_count = total_images - remaining_images

    print(f"Found {total_images} images total")
    print(f"{processed_count} already processed")
    print(f"{remaining_images} remaining to process\n")

    progress_bar = tqdm(
        pending_images,
        desc="Processing images",
        unit="image"
    )

    for i, image_name in enumerate(progress_bar):
        image_path = os.path.join(CONFIG["image_folder"], image_name)

        # Showing (a prefix of) the current file name in the progress bar
        progress_bar.set_description(f"Processing {image_name[:20]}...")

        caption = generate_caption(image_path)

        captions_data.append({
            "image_id": image_name,
            "gemini_caption": caption
        })

        # Periodic checkpoint so an interrupted run can resume
        if (i + 1) % CONFIG["save_interval"] == 0:
            pd.DataFrame(captions_data).to_csv(CONFIG["csv_output_path"], index=False)
            progress_bar.set_postfix({"Saved": "✓"})

        # Fixed pause between requests to respect API rate limits
        time.sleep(CONFIG["request_delay"])

    # Final save covering images processed since the last checkpoint
    pd.DataFrame(captions_data).to_csv(CONFIG["csv_output_path"], index=False)

    print(f"\n✅ Completed processing all images")
    print(f"Total processed: {len(captions_data)}")
    print(f"Saved to: {CONFIG['csv_output_path']}")

# Executing the main function when the script is run
if __name__ == "__main__":
    main()


In [1]:
# Reading the unsorted Gemini captions
csv_path = "/content/drive/MyDrive/gemini_gen_captions.csv"
df = pd.read_csv(csv_path)

# First run of digits in each image_id, cast to int so that rows sort numerically
# rather than lexicographically
df['numeric_id'] = df['image_id'].str.extract(r'(\d+)', expand=False).astype(int)

# Ordering by the numeric key, then discarding the helper column from the result
df_sorted = df.sort_values("numeric_id")
df_sorted = df_sorted.drop(columns=['numeric_id'])

# Writing the ordered table to its own file
sorted_csv_path = "/content/drive/MyDrive/gemini_sorted_images_captions.csv"
df_sorted.to_csv(sorted_csv_path, index=False)

print(f"Sorted CSV saved successfully at {sorted_csv_path}")


Sorted CSV saved successfully at /content/drive/MyDrive/gemini_sorted_images_captions.csv


# Calculating the various metrics between the original prompts and the Gemini-generated captions

In [5]:
# Loading the sorted Gemini caption table and the prompt table; they are merged on 'image_id' further down
captions_df = pd.read_csv('/content/drive/MyDrive/gemini_sorted_images_captions.csv')  # Sorted Gemini captions
prompts_df = pd.read_csv('/content/drive/MyDrive/Fine-Grained-Hallucination-main/Prompts.csv')  # Reference prompts keyed by image_id

# Evaluation tools used by the scoring functions in this cell
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)  # ROUGE-1/2/L with stemming; only ROUGE-1 is read below
smoothing = SmoothingFunction()  # BLEU smoothing helper; method1 is the one used
model = SentenceTransformer('all-MiniLM-L6-v2')  # Rebinds `model` (previously the Gemini client) to the embedding model

# Lemmatizer and English stop-word set
# NOTE(review): neither name is referenced by the functions visible in this notebook — confirm before removing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Defining a function that is computing the semantic similarity between a prompt and generated caption
def compute_semantic_similarity(prompt, caption):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded with the module-level ``model`` (prompt first, then
    caption) and the similarity is returned as a plain Python float.
    """
    embeddings = [model.encode(text, convert_to_tensor=True) for text in (prompt, caption)]
    similarity = util.cos_sim(embeddings[0], embeddings[1])
    # cos_sim gives a 1x1 tensor for a single pair; .item() unwraps it
    return similarity.item()

# Defining a function that is computing evaluation metrics between a prompt and its corresponding caption
def compute_scores(prompt, caption):
    """Score a generated caption against its reference prompt.

    Non-string inputs are converted with ``str`` first.

    Returns:
        A tuple ``(bleu, meteor, rouge1_f, semantic_similarity)``: sentence-level
        BLEU smoothed with method1, METEOR, the ROUGE-1 F-measure, and the
        embedding cosine similarity.
    """
    # Normalizing both inputs to text before any metric sees them
    prompt = prompt if isinstance(prompt, str) else str(prompt)
    caption = caption if isinstance(caption, str) else str(caption)

    # Token lists for the n-gram based metrics
    reference_tokens = word_tokenize(prompt)
    candidate_tokens = word_tokenize(caption)

    bleu = sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=smoothing.method1,
    )
    meteor = meteor_score([reference_tokens], candidate_tokens)

    # ROUGE and the embedding similarity work on the raw strings
    rouge1_f = rouge.score(prompt, caption)['rouge1'].fmeasure
    semantic_similarity = compute_semantic_similarity(prompt, caption)

    return bleu, meteor, rouge1_f, semantic_similarity

# Aligning every Gemini caption with its prompt through the shared 'image_id' column
merged_df = captions_df.merge(prompts_df, on='image_id')

# One dictionary of scores per image
results = []

for _, record in merged_df.iterrows():
    # compute_scores yields (BLEU, METEOR, ROUGE-1 F-measure, semantic similarity)
    gemini_metrics = compute_scores(record['Prompts'], record['gemini_caption'])

    results.append({
        'image_id': record['image_id'],
        'bleu_gemini': gemini_metrics[0],
        'meteor_gemini': gemini_metrics[1],
        'rouge_gemini': gemini_metrics[2],
        'semantic_gemini': gemini_metrics[3]
    })

# Persisting the per-image scores
final_df = pd.DataFrame(results)
final_df.to_csv('/content/drive/MyDrive/Fine-Grained-Hallucination-main/gemini_scores.csv', index=False)

# Column-wise means of the numeric score columns
averages = final_df.mean(numeric_only=True)
print("Average Scores:\n", averages)


Average Scores:
 bleu_gemini        0.048525
meteor_gemini      0.260639
rouge_gemini       0.325015
semantic_gemini    0.578047
dtype: float64
