In [None]:
# To Install the required libraries
!pip install transformers torch Pillow panda

In [None]:
#Importing necessary libraries
import os
import json
import random
import pandas as pd
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import ViTImageProcessor, VisionEncoderDecoderModel, AutoTokenizer

# Setting the paths: the image directory and the caption annotation file
IMG_DIR = '/content/drive/MyDrive/MSCOCO/train2014'
CAPTIONS_PATH = '/content/drive/MyDrive/MSCOCO/annotations_trainval2014/annotations/captions_train2014.json'

# Loading MSCOCO annotations.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of being inherited from the platform locale.
with open(CAPTIONS_PATH, 'r', encoding='utf-8') as f:
    coco_data = json.load(f)

# Group every caption under the id of the image it belongs to.
# Captions are kept in the order they appear in the annotation list.
image_id_to_captions = {}
for ann in coco_data['annotations']:
    img_id, text = ann['image_id'], ann['caption']
    image_id_to_captions.setdefault(img_id, []).append(text)

# Keep only the annotated images whose file is actually present in IMG_DIR,
# then derive both the id -> filename lookup and the list of usable ids.
available_files = set(os.listdir(IMG_DIR))
present_images = [
    entry for entry in coco_data['images']
    if entry['file_name'] in available_files
]
image_id_to_filename = {entry['id']: entry['file_name'] for entry in present_images}
valid_image_ids = [entry['id'] for entry in present_images]

print(f"Found {len(valid_image_ids)} available images out of {len(coco_data['images'])}")

# Randomly sampling up to 200 image IDs from the available ones.
# Only images with at least one caption are eligible: the processing loop
# reads the first caption of every sampled image and would fail on an image
# that has none. When every available image has captions, the sample drawn
# is the same as sampling from valid_image_ids directly.
random.seed(42)  # fixed seed so the same images are drawn on every run
captioned_image_ids = [
    candidate_id for candidate_id in valid_image_ids
    if image_id_to_captions.get(candidate_id)
]
sampled_image_ids = random.sample(
    captioned_image_ids, min(200, len(captioned_image_ids))
)

# Initializing models: use the GPU when torch can see one, else the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint identifiers passed to from_pretrained for the two captioners
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
VIT_GPT2_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

# BLIP: input processor and generation model (model moved to the device)
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT).to(device)

# ViT-GPT2: image processor, encoder-decoder model (moved to the device)
# and the tokenizer used to decode generated ids
vit_processor = ViTImageProcessor.from_pretrained(VIT_GPT2_CHECKPOINT)
vit_model = VisionEncoderDecoderModel.from_pretrained(VIT_GPT2_CHECKPOINT).to(device)
vit_tokenizer = AutoTokenizer.from_pretrained(VIT_GPT2_CHECKPOINT)

def generate_blip_caption(image_path):
    """Caption the image stored at ``image_path`` with the BLIP model.

    The image is opened, converted to RGB, run through ``blip_processor`` and
    ``blip_model.generate``, and the first generated sequence is decoded with
    special tokens skipped.

    Returns the caption string. If any step raises, the error is printed and
    the placeholder "BLIP caption generation failed" is returned instead, so
    a single bad image does not stop a batch run.
    """
    try:
        picture = Image.open(image_path)
        rgb_picture = picture.convert('RGB')
        model_inputs = blip_processor(rgb_picture, return_tensors="pt").to(device)
        generated_ids = blip_model.generate(**model_inputs)
        return blip_processor.decode(generated_ids[0], skip_special_tokens=True)
    except Exception as e:
        print(f"BLIP Error processing {image_path}: {str(e)}")
        return "BLIP caption generation failed"

def generate_vit_caption(image_path):
    """Caption the image stored at ``image_path`` with the ViT-GPT2 model.

    The image is opened, converted to RGB and turned into pixel values by
    ``vit_processor``; ``vit_model.generate`` is called with ``max_length=50``
    and the first generated sequence is decoded by ``vit_tokenizer`` with
    special tokens skipped.

    Returns the caption string. If any step raises, the error is printed and
    the placeholder "ViT caption generation failed" is returned instead.
    """
    try:
        picture = Image.open(image_path)
        rgb_picture = picture.convert('RGB')
        encoded = vit_processor(images=rgb_picture, return_tensors="pt")
        pixels = encoded.pixel_values.to(device)
        generated_ids = vit_model.generate(pixels, max_length=50)
        return vit_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    except Exception as e:
        print(f"ViT Error processing {image_path}: {str(e)}")
        return "ViT caption generation failed"

# Caption every sampled image with both models, collecting one record per
# image and echoing the three captions as progress output.
results = []
total_images = len(sampled_image_ids)
for position, image_id in enumerate(sampled_image_ids, start=1):
    filename = image_id_to_filename[image_id]
    image_path = os.path.join(IMG_DIR, filename)

    record = {
        'image_name': filename,
        # Reference caption: the first MSCOCO caption stored for this image
        'mscoco_caption': image_id_to_captions[image_id][0],
        'blip_caption': generate_blip_caption(image_path),
        'vit_caption': generate_vit_caption(image_path),
    }
    results.append(record)

    coco_caption = record['mscoco_caption']
    blip_caption = record['blip_caption']
    vit_caption = record['vit_caption']
    print(
        f"Processed {position}/{total_images}: {filename}",
        f"COCO: {coco_caption}",
        f"BLIP: {blip_caption}",
        f"ViT: {vit_caption}",
        "---",
        sep="\n",
    )

# Creating DataFrame and saving to CSV.
# The column list is passed explicitly so the CSV always carries its header
# row, even when no image was processed and `results` is empty; for a
# non-empty `results` the columns and their order are the same as the keys
# of each record.
CSV_COLUMNS = ['image_name', 'mscoco_caption', 'blip_caption', 'vit_caption']
df = pd.DataFrame(results, columns=CSV_COLUMNS)
output_path = 'mscoco-blip-vit_captions.csv'
df.to_csv(output_path, index=False)
print(f"Done! CSV saved to {output_path}")