In [2]:
import os
import random

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

from transformers import BlipProcessor, BlipForConditionalGeneration

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
nltk.download('punkt')
nltk.download('punkt_tab') 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
from datasets import load_dataset

hf_full = load_dataset("jxie/flickr8k", split="train")  

print("Full HF dataset size:", len(hf_full))

hf_500 = hf_full.shuffle(seed=42).select(range(2500))
print("Subset size:", len(hf_500))

split1 = hf_500.train_test_split(test_size=0.3, seed=42)
hf_train = split1["train"]
hf_temp = split1["test"]

split2 = hf_temp.train_test_split(test_size=0.5, seed=42)
hf_val = split2["train"]
hf_test = split2["test"]

Full HF dataset size: 6000
Subset size: 2500


In [29]:
class HFImageCaptionDataset(Dataset):
    def __init__(self, hf_dataset, transform=None, use_all_captions=False):
        self.hf_dataset = hf_dataset
        self.transform = transform
        self.use_all_captions = use_all_captions

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        example = self.hf_dataset[idx]
        image = example["image"]  

        if self.use_all_captions:
            captions = [example[f"caption_{i}"] for i in range(5)]
        else:
            captions = example["caption_0"]  

        if self.transform is not None:
            image_out = self.transform(image)  
        else:
            image_out = image  

        sample_id = idx
        return image_out, captions, sample_id


baseline_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
])

train_dataset = HFImageCaptionDataset(hf_train, transform=baseline_transform, use_all_captions=True)
val_dataset   = HFImageCaptionDataset(hf_val,   transform=baseline_transform, use_all_captions=True)
test_dataset  = HFImageCaptionDataset(hf_test,  transform=baseline_transform, use_all_captions=True)

test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

print("Train len:", len(train_dataset), "Val len:", len(val_dataset), "Test len:", len(test_dataset))

Train len: 1750 Val len: 375 Test len: 375


In [38]:
smooth_fn = SmoothingFunction().method1
def compute_bleu_scores(reference_texts, predicted_text):
    if isinstance(reference_texts, str):
        reference_texts = [reference_texts]

    refs_tokenized = [
        nltk.word_tokenize(ref.lower()) for ref in reference_texts
    ]
    pred_tokens = nltk.word_tokenize(predicted_text.lower())

    bleu1 = sentence_bleu(
        refs_tokenized,
        pred_tokens,
        weights=(1.0, 0.0, 0.0, 0.0),
        smoothing_function=smooth_fn,
    )
    bleu2 = sentence_bleu(
        refs_tokenized,
        pred_tokens,
        weights=(0.5, 0.5, 0.0, 0.0),
        smoothing_function=smooth_fn,
    )
    return bleu1, bleu2

In [39]:
def extract_simple_features(image_tensor):
    if not torch.is_floating_point(image_tensor):
        image_tensor = image_tensor.float() / 255.0

    C, H, W = image_tensor.shape

    brightness = image_tensor.mean().item()

    channel_means = image_tensor.view(C, -1).mean(dim=1)  
    total = channel_means.sum() + 1e-8
    channel_ratios = channel_means / total

    colors = ["r", "g", "b"]
    dominant_idx = int(torch.argmax(channel_ratios))
    dominant_channel = colors[dominant_idx]

    if channel_ratios[dominant_idx] < 0.4:
        dominant_channel = "none"

    gray = image_tensor.mean(dim=0, keepdim=True) 

    gx = gray[:, :, 1:] - gray[:, :, :-1]  
    gy = gray[:, 1:, :] - gray[:, :-1, :] 

    gx = F.pad(gx, (0, 1, 0, 0))
    gy = F.pad(gy, (0, 0, 0, 1))

    edge_mag = torch.sqrt(gx ** 2 + gy ** 2) 
    edge_density = (edge_mag > 0.1).float().mean().item()

    return {
        "brightness": brightness,
        "dominant_channel": dominant_channel,
        "edge_density": edge_density,
    }


def naive_baseline_caption(image_tensor):
    feats = extract_simple_features(image_tensor)
    b = feats["brightness"]
    d = feats["dominant_channel"]
    e = feats["edge_density"]

    if b < 0.25:
        if e > 0.15:
            return "A dark scene with many shapes and edges."
        else:
            return "A dark scene with a few bright regions."

    elif b > 0.65:
        if d == "b":
            if e < 0.12:
                return "A bright outdoor scene with a lot of blue sky."
            else:
                return "A bright outdoor scene with blue tones and many objects."
        elif d == "g":
            return "A bright outdoor scene with green vegetation."
        elif d == "r":
            return "A bright scene with a warm-colored object in view."
        else:  
            if e > 0.2:
                return "A bright scene with many detailed objects."
            else:
                return "A bright scene with a few large regions."

    else:
        if d == "g":
            return "A moderately lit scene with some greenery."
        elif d == "b":
            if e > 0.18:
                return "A scene with blue tones and several objects."
            else:
                return "A calm scene with some blue regions."
        elif d == "r":
            return "A scene with noticeable warm colors and moderate lighting."
        else:  
            if e > 0.2:
                return "A busy scene with many edges and details."
            else:
                return "A simple scene with a few large regions."

In [40]:

baseline_results = []

for image_tensor, gt_caption, sample_id in test_loader:
    image_tensor = image_tensor[0] 

    pred_caption = naive_baseline_caption(image_tensor)
    bleu1, bleu2 = compute_bleu_scores(gt_caption[0], pred_caption)

    baseline_results.append({
        "sample_id": int(sample_id[0].item()) if isinstance(sample_id[0], torch.Tensor) else int(sample_id[0]),
        "gt_caption": gt_caption[0],
        "pred_caption": pred_caption,
        "bleu1": bleu1,
        "bleu2": bleu2,
    })

baseline_df = pd.DataFrame(baseline_results)
print("Naive baseline mean BLEU-1:", baseline_df["bleu1"].mean())
print("Naive baseline mean BLEU-2:", baseline_df["bleu2"].mean())
baseline_df.head()

Naive baseline mean BLEU-1: 0.39585358993441616
Naive baseline mean BLEU-2: 0.10649451853917476


Unnamed: 0,sample_id,gt_caption,pred_caption,bleu1,bleu2
0,0,(A child holding onto handles sliding across a...,A busy scene with many edges and details.,0.29828,0.057762
1,8,(A bearded man is sitting on a bench wearing a...,A busy scene with many edges and details.,0.333333,0.06455
2,16,"(A girl with black gloves is running ., Two gi...",A simple scene with a few large regions.,0.444444,0.074536
3,24,(A boy in a blue wetsuit is riding a surfboard...,A calm scene with some blue regions.,0.375,0.073193
4,32,(A chubby or buff kid in shorts is holding on ...,A dark scene with a few bright regions.,0.444444,0.074536


In [41]:
@torch.no_grad()
def blip_generate_caption(pil_image, max_length=20, num_beams=3):
    """
    pil_image: PIL.Image in RGB
    Returns: caption string
    """
    inputs = processor(images=pil_image, return_tensors="pt").to(device)

    output_ids = model.generate(
        **inputs,
        max_length=max_length,
        num_beams=num_beams
    )
    caption = processor.decode(output_ids[0], skip_special_tokens=True)
    return caption.strip()

In [45]:
ai_results = []

MODEL_NAME = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(MODEL_NAME)
model = BlipForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
model.eval()

pil_test_dataset = HFImageCaptionDataset(hf_test, transform=None, use_all_captions=True)

pil_test_loader = DataLoader(
    pil_test_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=lambda batch: batch[0], 
)

for pil_image, gt_caption, sample_id in tqdm(pil_test_loader):
    sid = int(sample_id)

    pred_caption = blip_generate_caption(pil_image)
    bleu1, bleu2 = compute_bleu_scores(gt_caption, pred_caption)

    ai_results.append({
        "sample_id": sid,
        "gt_caption": gt_caption,
        "pred_caption": pred_caption,
        "bleu1": bleu1,
        "bleu2": bleu2,
    })

ai_df = pd.DataFrame(ai_results)
print("AI pipeline mean BLEU-1:", ai_df["bleu1"].mean())
print("AI pipeline mean BLEU-2:", ai_df["bleu2"].mean())
ai_df.head()

  0%|          | 0/375 [00:00<?, ?it/s]

AI pipeline mean BLEU-1: 0.5601613623240005
AI pipeline mean BLEU-2: 0.4091202389465279


Unnamed: 0,sample_id,gt_caption,pred_caption,bleu1,bleu2
0,0,[A child holding onto handles sliding across a...,a man jumping in the air,0.256709,0.162357
1,1,[A brown dog is running through tall green gra...,a dog running through a field of tall grass,1.0,0.935414
2,2,[A lonely skier enjoys the slopes on a beautif...,the sky is clear,0.25,0.091287
3,3,[A greyhound in a race wearing a metal muzzle ...,the dog is white,0.286505,0.052308
4,4,[A cyclist is leaning his bicycle up on its fr...,a man doing a trick on a bike,0.778801,0.721029


In [46]:

merged_df = baseline_df.merge(
    ai_df,
    on="sample_id",
    suffixes=("_baseline", "_ai"),
)

print("=== Overall Mean Scores ===")
print("Baseline BLEU-1:", merged_df["bleu1_baseline"].mean())
print("Baseline BLEU-2:", merged_df["bleu2_baseline"].mean())
print("AI BLEU-1:", merged_df["bleu1_ai"].mean())
print("AI BLEU-2:", merged_df["bleu2_ai"].mean())

merged_df.head(10)

=== Overall Mean Scores ===
Baseline BLEU-1: 0.39585358993441616
Baseline BLEU-2: 0.10649451853917476
AI BLEU-1: 0.5554942828753789
AI BLEU-2: 0.3979549488340454


Unnamed: 0,sample_id,gt_caption_baseline,pred_caption_baseline,bleu1_baseline,bleu2_baseline,gt_caption_ai,pred_caption_ai,bleu1_ai,bleu2_ai
0,0,(A child holding onto handles sliding across a...,A busy scene with many edges and details.,0.29828,0.057762,[A child holding onto handles sliding across a...,a man jumping in the air,0.256709,0.162357
1,8,(A bearded man is sitting on a bench wearing a...,A busy scene with many edges and details.,0.333333,0.06455,[A bearded man is sitting on a bench wearing a...,man sitting on a park bench,0.427848,0.296422
2,16,"(A girl with black gloves is running ., Two gi...",A simple scene with a few large regions.,0.444444,0.074536,"[A girl with black gloves is running ., A woma...",a woman wearing a purple shirt,0.833333,0.57735
3,24,(A boy in a blue wetsuit is riding a surfboard...,A calm scene with some blue regions.,0.375,0.073193,[A boy in a blue wetsuit is riding a surfboard...,two people in the water,0.536256,0.423948
4,32,(A chubby or buff kid in shorts is holding on ...,A dark scene with a few bright regions.,0.444444,0.074536,[A chubby or buff kid in shorts is holding on ...,a little boy standing on a rug,0.571429,0.308607
5,40,"(A brown dog digging a hole ., A kayaker goes ...",A busy scene with many edges and details.,0.198853,0.047162,"[A brown dog digging a hole ., A brown dog dig...",a dog digging a hole in the ground,0.875,0.866025
6,48,(A child stoops to pick up a watermelon from a...,A dark scene with a few bright regions.,0.398073,0.267035,[A child stoops to pick up a watermelon from a...,a little girl standing in front of a pile of w...,0.727273,0.603023
7,56,"(many people look over the side of a bridge .,...",A simple scene with a few large regions.,0.444444,0.235702,"[many people look over the side of a bridge .,...",a group of people standing on a bridge over a ...,0.909091,0.797724
8,64,(A black and brown dog is laying on a white sh...,A simple scene with a few large regions.,0.444444,0.235702,[A black and brown dog is laying on a white sh...,the dog is black and brown,0.505442,0.428882
9,72,(A black dog carries a huge stick in its mouth...,A bright scene with a few large regions.,0.333333,0.06455,[A black dog carries a huge stick in its mouth...,a dog running in the snow,0.427848,0.363041


In [55]:
idx = 5
merged_df.iloc[idx]['gt_caption_baseline'], merged_df.iloc[idx]['pred_caption_baseline'], merged_df.iloc[idx]['pred_caption_ai']

(('A brown dog digging a hole .',
  'A kayaker goes through the waves holding his paddle .',
  'The men are climbing .',
  'A child in a red jacket holding up two fingers .',
  'A child in a red jacket jumping off of a rock into sand .',
  'A group of people stand in the snow in a mountain .',
  'A security man guards the door as another one brings his items off .',
  'A little dog carries a small stick in his mouth .'),
 'A busy scene with many edges and details.',
 'a dog digging a hole in the ground')

In [52]:
merged_df.iloc[idx]['pred_caption_ai']

sample_id                                                               16
gt_caption_baseline      (A girl with black gloves is running ., Two gi...
pred_caption_baseline             A simple scene with a few large regions.
bleu1_baseline                                                    0.444444
bleu2_baseline                                                    0.074536
gt_caption_ai            [A girl with black gloves is running ., A woma...
pred_caption_ai                             a woman wearing a purple shirt
bleu1_ai                                                          0.833333
bleu2_ai                                                           0.57735
Name: 2, dtype: object