In [2]:
import os
import json
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

# Initialize the captioning model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Set the device (GPU if available)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(DEVICE)

# Function to generate caption for an image
def generate_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(DEVICE)
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption

# Iterate through the dataset and generate captions
data_path = '/workspaces/finetune/combined_classes'
output_data = []

for class_folder in os.listdir(data_path):
    class_folder_path = os.path.join(data_path, class_folder)
    if os.path.isdir(class_folder_path):
        for image_name in os.listdir(class_folder_path):
            image_path = os.path.join(class_folder_path, image_name)
            try:
                caption = generate_caption(image_path)
                output_data.append({
                    "image_path": image_path,
                    "class": class_folder,
                    "caption": caption
                })
                print(f"Processed {image_path}")
            except Exception as e:
                print(f"Error processing {image_path}: {e}")

# Save the results to a JSON file
output_file = 'image_captions.json'
with open(output_file, 'w') as f:
    json.dump(output_data, f, indent=4)

print(f"Captions saved to {output_file}")


  from .autonotebook import tqdm as notebook_tqdm
2024-05-17 21:30:09.477654: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Processed /workspaces/finetune/combined_classes/3000000033/3000000033(1).jpg
Processed /workspaces/finetune/combined_classes/2997330284/2997330284(1).jpg
Processed /workspaces/finetune/combined_classes/2997330284/2997330284(2).jpg
Processed /workspaces/finetune/combined_classes/3001323629/3001323629(5).jpg
Processed /workspaces/finetune/combined_classes/3001323629/3001323629(4).jpg
Processed /workspaces/finetune/combined_classes/3001323629/3001323629(2).jpg
Processed /workspaces/finetune/combined_classes/2999488571/2999488571(2).jpg
Processed /workspaces/finetune/combined_classes/3001571027/3001571027(2).jpg
Processed /workspaces/finetune/combined_classes/3001571027/3001571027(3).jpg
Processed /workspaces/finetune/combined_classes/3001795688/3001795688(4).jpg
Processed /workspaces/finetune/combined_classes/3001795688/3001795688(2).jpg
Processed /workspaces/finetune/combined_classes/3001766651/3001766651(5).jpg
Processed /workspaces/finetune/combined_classes/3001766651/3001766651(3).jpg