## __System Information, Packages, and Data Loading__

### __Package Imports__

In [2]:
# Imports
import os
import nltk
import torch
import psutil
import platform
import kagglehub
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
from pathlib import Path
from rouge_score import rouge_scorer
from torch.utils.data import Dataset

from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    Trainer,
    TrainingArguments
)

from nltk.translate.bleu_score import (
    sentence_bleu, 
    corpus_bleu, 
    SmoothingFunction
)

### __Variable Definitions__

In [3]:
# Define Variables
TRAIN_SAVE = True

### __Software and Hardware Specs__

In [3]:
print("=" * 50)
print("HARDWARE INFO")
print("=" * 50)

# System
print(f"OS: {platform.system()} {platform.release()}")
print(f"Python: {platform.python_version()}")
print(f"PyTorch: {torch.__version__}")

# CPU & Memory
print(f"\nCPU: {psutil.cpu_count(logical=True)} cores")
print(f"RAM: {psutil.virtual_memory().total / (1024**3):.1f} GB")

# GPU
if torch.cuda.is_available():
    print(f"\nGPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA: {torch.version.cuda}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / (1024**3):.1f} GB")
else:
    print("\nGPU: None (using CPU)")

print("=" * 50)

HARDWARE INFO
OS: Linux 5.10.0-36-cloud-amd64
Python: 3.10.19
PyTorch: 2.9.1+cu128

CPU: 32 cores
RAM: 125.8 GB

GPU: NVIDIA L4
CUDA: 12.8
GPU Memory: 22.0 GB


### __Download Flickr Dataset and Load Base Model__

In [4]:
# Download latest version
path = kagglehub.dataset_download("hsankesara/flickr-image-dataset")
print("Path to dataset files:", path)

Path to dataset files: /home/jupyter/.cache/kagglehub/datasets/hsankesara/flickr-image-dataset/versions/1


## __Model Selection and Customization__

In [6]:
# Load pre-trained BLIP model
model_name = "Salesforce/blip-image-captioning-base"

model = BlipForConditionalGeneration.from_pretrained(model_name)
processor = BlipProcessor.from_pretrained(model_name, use_fast=True)

## __Dataset Preparation__

In [7]:
# ============================================
# 1. LOAD THE DATASET FROM DISK
# ============================================

class FlickrDataset(Dataset):
    def __init__(self, root_dir, csv_file, processor):
        self.root_dir = root_dir
        self.annotations = pd.read_csv(csv_file, sep="|")
        # Strip whitespace from column names
        self.annotations.columns = self.annotations.columns.str.strip()
        self.processor = processor

        # group captions by image (BLIP expects 1 caption per sample)
        self.data = (
            self.annotations.groupby("image_name")["comment"]
            .apply(list)
            .reset_index()
        )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_name = row["image_name"]
        captions = row["comment"]

        # Use the first caption (simplest for training)
        caption = captions[0]

        # Load image
        image_path = os.path.join(self.root_dir, img_name)
        image = Image.open(image_path).convert("RGB")

        # Preprocess
        encoding = self.processor(
            images=image,
            text=caption,
            padding="max_length",
            truncation=True,
            max_length=40,
            return_tensors="pt",
        )

        # Remove batch dimension and prepare labels
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        
        # BLIP needs labels for training
        encoding["labels"] = encoding["input_ids"].clone()
        
        return encoding

# ============================================
# 2. CUSTOM COLLATOR
# ============================================

def collate_fn(batch):
    """Custom collator to handle BLIP's expected input format"""
    # Stack all tensors
    pixel_values = torch.stack([item["pixel_values"] for item in batch])
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    labels = torch.stack([item["labels"] for item in batch])
    
    return {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


# ============================================
# 3. CREATE DATASET OBJECTS
# ============================================

dataset_root = Path(path) / "flickr30k_images" / "flickr30k_images"
csv_path = Path(path) / "flickr30k_images" / "results.csv"

full_dataset = FlickrDataset(dataset_root, csv_path, processor)

# Split manually
train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42)
)

## __Training and Fine-tuning__

In [6]:
# ============================================
# 1. TRAINING ARGUMENTS
# ============================================

training_args = TrainingArguments(
    output_dir="./blip-finetuned-flickr",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_steps=100,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # Only keep 2 best checkpoints
    load_best_model_at_end=True,
    report_to="none",
    remove_unused_columns=False,
)


# ============================================
# 2. CUSTOM TRAINER TO HANDLE BLIP
# ============================================

class BlipTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Override to remove num_items_in_batch argument that BLIP doesn't expect"""
        # Remove any keys that BLIP doesn't need
        inputs = {k: v for k, v in inputs.items() if k in ["pixel_values", "input_ids", "attention_mask", "labels"]}
        
        outputs = model(**inputs)
        loss = outputs.loss
        
        return (loss, outputs) if return_outputs else loss

if TRAIN_SAVE:
    # Define the model trainer
    trainer = BlipTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=collate_fn,  # custom collator
    )
    # ============================================
    # 3. TRAIN
    # ============================================

    print("Starting training...")
    trainer.train()

    # ============================================
    # 4. SAVE MODEL 
    # ============================================

    trainer.save_model("./blip-finetuned-flickr")
    processor.save_pretrained("./blip-finetuned-flickr")

    print("Training complete and model saved!")

Starting training...


Epoch,Training Loss,Validation Loss
1,1.2696,1.286147
2,1.0387,1.238511
3,0.7043,1.274218


There were missing keys in the checkpoint model loaded: ['text_decoder.cls.predictions.decoder.weight', 'text_decoder.cls.predictions.decoder.bias'].


Training complete and model saved!


## __Model Evaluation and Comparison__

### __Load and Evaluate Finetuned Model__

In [7]:
# ============================================
# 1. LOAD YOUR FINE-TUNED MODEL 
# ============================================

model_path = "./blip-finetuned-flickr"
image_path = Path(path) / "flickr30k_images" / "flickr30k_images"
csv_path = Path(path) / "flickr30k_images" / "results.csv"

print("Loading model and processor...")
processor = BlipProcessor.from_pretrained(model_path)
model = BlipForConditionalGeneration.from_pretrained(model_path)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()  # Set to evaluation mode

print(f"Model loaded on {device}")

# ============================================
# 2. GENERATE CAPTION FOR A SINGLE IMAGE
# ============================================

def generate_caption(image_path, max_length=50, num_beams=5):
    """
    Generate a caption for a single image
    
    Args:
        image_path: Path to the image file
        max_length: Maximum length of generated caption
        num_beams: Number of beams for beam search (higher = better quality, slower)
    
    Returns:
        Generated caption string
    """
    # Load and preprocess image
    image = Image.open(image_path).convert("RGB")
    
    # Process image
    inputs = processor(images=image, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Generate caption
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True
        )
    
    # Decode the generated caption
    caption = processor.decode(output[0], skip_special_tokens=True)
    
    return caption


# ============================================
# 3. EVALUATE ON FLICKR TEST SET
# ============================================

def evaluate_on_flickr(csv_path, images_dir, num_samples=100):
    """Evaluate the model on Flickr30k test images with ground truth captions"""
    
    import pandas as pd
    
    # Load captions
    df = pd.read_csv(csv_path, sep="|")
    df.columns = df.columns.str.strip()
    
    # Group by image
    grouped = df.groupby("image_name")["comment"].apply(list).reset_index()
    
    # Sample some images
    sample_data = grouped.sample(min(num_samples, len(grouped)), random_state=42)
    
    print(f"\nEvaluating on {len(sample_data)} images...\n")
    
    results = []
    for idx, row in sample_data.iterrows():
        img_name = row["image_name"]
        ground_truth_captions = row["comment"]
        
        img_path = Path(images_dir) / img_name
        
        if not img_path.exists():
            continue
        
        try:
            # Generate caption
            generated_caption = generate_caption(img_path)
            
            results.append({
                'image': img_name,
                'generated': generated_caption,
                'ground_truth': ground_truth_captions[0]  # First ground truth
            })
            
            print(f"Image: {img_name}")
            print(f"Generated:    {generated_caption}")
            print(f"Ground truth: {ground_truth_captions[0]}\n")
            
        except Exception as e:
            print(f"Error on {img_name}: {e}\n")
    
    return results

# Evaluate on Flickr test set
results = evaluate_on_flickr(
    csv_path=csv_path,
    images_dir=image_path,
    num_samples=10
)


# ============================================
# 4. SAVE RESULTS TO FILE
# ============================================

def save_results_to_csv(results, output_file="caption_results.csv"):
    """Save caption results to a CSV file"""
    import pandas as pd
    
    df = pd.DataFrame(results)
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")


Loading model and processor...
Model loaded on cuda

Evaluating on 10 images...

Image: 3655176735.jpg
Generated:    a woman in a white shirt is reading a picture book to a man in a military uniform sitting in a chair.
Ground truth:  AN older woman appears to read from a children 's book in an indoor setting , while a seated gentleman in a service uniform looks on .

Image: 7669392800.jpg
Generated:    a group of bicyclists race down a street lined with bushes.
Ground truth:  Numerous bicyclists wearing bicyclist apparel , helmets , goggles , and gloves racing fiercely down a paved road despite the rain .

Image: 4546029322.jpg
Generated:    many people are walking on the street with orange and white traffic cones.
Ground truth:  In this picture we see multiple people crossing a courtyard with a line of traffic cones dissecting it .

Image: 120764850.jpg
Generated:    a little boy and a little girl are laughing while sitting on the floor.
Ground truth:  A baby on the floor laughing at 

### __Model Evaluation with BLEU and ROUGE__

In [8]:
# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# ============================================
# 1. LOAD MODEL
# ============================================

def load_model(model_path):
    """Load the fine-tuned model"""
    print("Loading model...")
    processor = BlipProcessor.from_pretrained(model_path)
    model = BlipForConditionalGeneration.from_pretrained(model_path)
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    
    print(f"Model loaded on {device}")
    return model, processor, device


def generate_caption(model, processor, device, image_path, max_length=50, num_beams=5):
    """Generate caption for an image"""
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        output = model.generate(**inputs, max_length=max_length, num_beams=num_beams)
    
    caption = processor.decode(output[0], skip_special_tokens=True)
    return caption


# ============================================
# 2. BLEU SCORE CALCULATION
# ============================================

def calculate_bleu_scores(reference_captions, generated_caption):
    """
    Calculate BLEU scores (BLEU-1, BLEU-2, BLEU-3, BLEU-4)
    
    Args:
        reference_captions: List of reference captions (ground truth)
        generated_caption: Generated caption string
    
    Returns:
        Dictionary with BLEU-1 through BLEU-4 scores
    """
    # Tokenize
    references = [caption.lower().split() for caption in reference_captions]
    candidate = generated_caption.lower().split()
    
    # Smoothing function to handle edge cases
    smoothing = SmoothingFunction().method1
    
    # Calculate individual BLEU scores
    bleu_scores = {
        'BLEU-1': sentence_bleu(references, candidate, weights=(1, 0, 0, 0), smoothing_function=smoothing),
        'BLEU-2': sentence_bleu(references, candidate, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing),
        'BLEU-3': sentence_bleu(references, candidate, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothing),
        'BLEU-4': sentence_bleu(references, candidate, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing),
    }
    
    return bleu_scores


# ============================================
# 3. ROUGE SCORE CALCULATION
# ============================================

def calculate_rouge_scores(reference_captions, generated_caption):
    """
    Calculate ROUGE scores (ROUGE-1, ROUGE-2, ROUGE-L)
    
    Args:
        reference_captions: List of reference captions
        generated_caption: Generated caption string
    
    Returns:
        Dictionary with ROUGE scores
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    # Calculate ROUGE against each reference and take the maximum
    all_scores = []
    for ref in reference_captions:
        scores = scorer.score(ref, generated_caption)
        all_scores.append(scores)
    
    # Aggregate scores (take max F1 for each metric)
    rouge_scores = {
        'ROUGE-1': max(s['rouge1'].fmeasure for s in all_scores),
        'ROUGE-2': max(s['rouge2'].fmeasure for s in all_scores),
        'ROUGE-L': max(s['rougeL'].fmeasure for s in all_scores),
    }
    
    return rouge_scores


# ============================================
# 4. EVALUATE ON FLICKR DATASET
# ============================================

def evaluate_model(model_path, csv_path, images_dir, num_samples=None, output_file="evaluation_results.csv"):
    """
    Evaluate the model on Flickr30k dataset
    
    Args:
        model_path: Path to fine-tuned model
        csv_path: Path to Flickr30k CSV file
        images_dir: Directory containing images
        num_samples: Number of samples to evaluate (None = all)
        output_file: Where to save detailed results
    """
    
    # Load model
    model, processor, device = load_model(model_path)
    
    # Load dataset
    print("Loading dataset...")
    df = pd.read_csv(csv_path, sep="|")
    df.columns = df.columns.str.strip()
    
    # Group captions by image
    grouped = df.groupby("image_name")["comment"].apply(list).reset_index()
    
    # Sample if needed
    if num_samples:
        grouped = grouped.sample(min(num_samples, len(grouped)), random_state=42)
    
    print(f"Evaluating on {len(grouped)} images...\n")
    
    # Storage for results
    all_results = []
    all_bleu_scores = {f'BLEU-{i}': [] for i in range(1, 5)}
    all_rouge_scores = {'ROUGE-1': [], 'ROUGE-2': [], 'ROUGE-L': []}
    
    # Evaluate each image
    for idx, row in tqdm(grouped.iterrows(), total=len(grouped), desc="Evaluating"):
        img_name = row["image_name"]
        reference_captions = row["comment"]
        
        img_path = Path(images_dir) / img_name
        
        if not img_path.exists():
            continue
        
        try:
            # Generate caption
            generated_caption = generate_caption(model, processor, device, img_path)
            
            # Calculate BLEU scores
            bleu_scores = calculate_bleu_scores(reference_captions, generated_caption)
            
            # Calculate ROUGE scores
            rouge_scores = calculate_rouge_scores(reference_captions, generated_caption)
            
            # Store scores
            for key, value in bleu_scores.items():
                all_bleu_scores[key].append(value)
            
            for key, value in rouge_scores.items():
                all_rouge_scores[key].append(value)
            
            # Store detailed results
            result = {
                'image': img_name,
                'generated_caption': generated_caption,
                'reference_caption_1': reference_captions[0],
                **bleu_scores,
                **rouge_scores
            }
            all_results.append(result)
            
        except Exception as e:
            print(f"Error processing {img_name}: {e}")
            continue
    
    # Calculate average scores
    print("\n" + "="*60)
    print("EVALUATION RESULTS")
    print("="*60)
    print(f"\nNumber of images evaluated: {len(all_results)}\n")
    
    print("BLEU Scores:")
    for key in all_bleu_scores:
        avg_score = np.mean(all_bleu_scores[key])
        print(f"  {key}: {avg_score:.4f}")
    
    print("\nROUGE Scores:")
    for key in all_rouge_scores:
        avg_score = np.mean(all_rouge_scores[key])
        print(f"  {key}: {avg_score:.4f}")
    
    print("="*60)
    
    # Save detailed results
    results_df = pd.DataFrame(all_results)
    results_df.to_csv(output_file, index=False)
    print(f"\nDetailed results saved to {output_file}")
    
    # Return summary statistics
    summary = {
        'num_samples': len(all_results),
        **{f'avg_{key}': np.mean(all_bleu_scores[key]) for key in all_bleu_scores},
        **{f'avg_{key}': np.mean(all_rouge_scores[key]) for key in all_rouge_scores}
    }
    
    return summary, results_df


# ============================================
# 5. COMPARE WITH BASELINE
# ============================================

def compare_with_baseline(finetuned_path, csv_path, images_dir, num_samples=100):
    """Compare fine-tuned model with original BLIP baseline"""
    
    print("Evaluating Fine-tuned Model...")
    ft_summary, _ = evaluate_model(finetuned_path, csv_path, images_dir, num_samples, "finetuned_results.csv")
    
    print("\n\nEvaluating Baseline Model...")
    baseline_summary, _ = evaluate_model("Salesforce/blip-image-captioning-base", csv_path, images_dir, num_samples, "baseline_results.csv")
    
    # Print comparison
    print("\n" + "="*60)
    print("COMPARISON: Fine-tuned vs Baseline")
    print("="*60)
    
    metrics = ['avg_BLEU-1', 'avg_BLEU-2', 'avg_BLEU-3', 'avg_BLEU-4', 
               'avg_ROUGE-1', 'avg_ROUGE-2', 'avg_ROUGE-L']
    
    for metric in metrics:
        ft_score = ft_summary[metric]
        baseline_score = baseline_summary[metric]
        improvement = ((ft_score - baseline_score) / baseline_score) * 100
        
        print(f"\n{metric.replace('avg_', '')}:")
        print(f"  Baseline:    {baseline_score:.4f}")
        print(f"  Fine-tuned:  {ft_score:.4f}")
        print(f"  Improvement: {improvement:+.2f}%")
    
    print("="*60)

# Evaluate fine-tuned model
model_path = "./blip-finetuned-flickr"

compare_with_baseline(
    finetuned_path=model_path,
    csv_path=csv_path,
    images_dir=image_path,
    num_samples=100
)

Evaluating Fine-tuned Model...
Loading model...
Model loaded on cuda
Loading dataset...
Evaluating on 100 images...



Evaluating: 100%|██████████| 100/100 [00:45<00:00,  2.22it/s]



EVALUATION RESULTS

Number of images evaluated: 100

BLEU Scores:
  BLEU-1: 0.6094
  BLEU-2: 0.4473
  BLEU-3: 0.3143
  BLEU-4: 0.2196

ROUGE Scores:
  ROUGE-1: 0.5500
  ROUGE-2: 0.3077
  ROUGE-L: 0.4944

Detailed results saved to finetuned_results.csv


Evaluating Baseline Model...
Loading model...
Model loaded on cuda
Loading dataset...
Evaluating on 100 images...



Evaluating: 100%|██████████| 100/100 [00:23<00:00,  4.33it/s]



EVALUATION RESULTS

Number of images evaluated: 100

BLEU Scores:
  BLEU-1: 0.5696
  BLEU-2: 0.4086
  BLEU-3: 0.2902
  BLEU-4: 0.1982

ROUGE Scores:
  ROUGE-1: 0.4741
  ROUGE-2: 0.2704
  ROUGE-L: 0.4505

Detailed results saved to baseline_results.csv

COMPARISON: Fine-tuned vs Baseline

BLEU-1:
  Baseline:    0.5696
  Fine-tuned:  0.6094
  Improvement: +7.00%

BLEU-2:
  Baseline:    0.4086
  Fine-tuned:  0.4473
  Improvement: +9.47%

BLEU-3:
  Baseline:    0.2902
  Fine-tuned:  0.3143
  Improvement: +8.33%

BLEU-4:
  Baseline:    0.1982
  Fine-tuned:  0.2196
  Improvement: +10.81%

ROUGE-1:
  Baseline:    0.4741
  Fine-tuned:  0.5500
  Improvement: +16.01%

ROUGE-2:
  Baseline:    0.2704
  Fine-tuned:  0.3077
  Improvement: +13.79%

ROUGE-L:
  Baseline:    0.4505
  Fine-tuned:  0.4944
  Improvement: +9.73%
