# Image captioning: template-based captions for speed bump images

In [1]:
import os
import random 
import shutil 
import torch
from pathlib import Path
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
# CONFIG
# Folder that is scanned for *.jpg images; one caption .txt file is written
# next to each image. Set the SPEED_BUMP_IMG_DIR environment variable to use
# a different folder without editing the notebook; otherwise the original
# location below is used.
DEFAULT_IMG_DIR = r"D:\work_space\projects\deep_learning\CAP6415_F25_project-Finding-and-solving-hard-to-generate-examples\Data_set\processed"
IMG_DIR = Path(os.environ.get("SPEED_BUMP_IMG_DIR", DEFAULT_IMG_DIR))
CAPTIONS_PER_IMAGE = 4  # total captions per image: 1 base caption + (N - 1) variations
USE_AI_REFINEMENT = True  # Set to False if you want only the TEMPLATES captions

# Seed Python's RNG so the random template/angle/environment picks repeat
# on a fresh "Restart & Run All". Set SEED = None to get fresh randomness.
SEED = 42
random.seed(SEED)

In [3]:
# CAPTION PHRASE POOLS
# Three lists of hand-written phrases. Items are drawn from them with
# random.choice() when captions are assembled.

# Complete stand-alone captions (each one ends with a full stop).
TEMPLATES = [
    "a raised 3d speed bump on asphalt road.",
    "a curved speed hump used to slow vehicles.",
    "a small elevated speed bump in a residential area.",
    "a realistic photo of a speed bump with visible height.",
    "a concrete speed bump with a curved top.",
    "a highly visible yellow road bump on asphalt.",
    "a worn-out raised speed bump on a city street.",
    "a realistic 3d speed bump used for traffic calming.",
    "a clearly visible speed hump with elevation on the road.",
    "a close-up view of a speed bump with realistic height.",
]

# Camera viewpoint fragments (no trailing punctuation).
ANGLES = [
    "from a low-angle perspective",
    "captured from a side view showing elevation",
    "seen from a driver's perspective",
    "from a close-up ground-level view",
    "with visible depth and height",
]

# Lighting, weather and location fragments (no trailing punctuation).
ENVIRONMENT = [
    "during bright daylight",
    "after light rain on the road",
    "in early morning light",
    "on an overcast cloudy day",
    "at night with streetlights",
    "in a quiet suburban neighborhood",
    "on a busy urban street",
    "during sunset lighting",
]

In [4]:
# OPTIONAL AI REFINEMENT (FLAN-T5)
# Exactly one refine_caption() is defined below, chosen by USE_AI_REFINEMENT,
# so the rest of the notebook can call it without caring which mode is active.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

if USE_AI_REFINEMENT:
    REFINER_CHECKPOINT = "google/flan-t5-base"
    tokenizer = AutoTokenizer.from_pretrained(REFINER_CHECKPOINT)
    model = AutoModelForSeq2SeqLM.from_pretrained(REFINER_CHECKPOINT).to(device)

    def refine_caption(text):
        """Rewrite ``text`` with FLAN-T5 and return it lower-cased and stripped.

        Args:
            text: Caption to rewrite.

        Returns:
            The decoded model output, lower-cased with surrounding
            whitespace removed.
        """
        prompt = f"Rewrite this caption professionally and clearly: {text}"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=40,
                # temperature only takes effect in sampling mode; without
                # do_sample=True generation is greedy and the value is ignored.
                do_sample=True,
                temperature=0.7,
            )
        return tokenizer.decode(output[0], skip_special_tokens=True).lower().strip()
else:
    def refine_caption(text):
        """Return ``text`` lower-cased and stripped; no model is involved."""
        return text.lower().strip()

Using device: cuda


In [5]:
def ensure_speed_bump(text):
    """Return ``text`` if it still names the subject, else a fallback template.

    A caption is kept when it mentions "bump" or "hump" in any letter case.
    "hump" is accepted because TEMPLATES itself contains "speed hump"
    captions, which would otherwise be thrown away by this check.

    Args:
        text: Caption to validate.

    Returns:
        ``text`` unchanged when it mentions the subject, otherwise a randomly
        chosen entry of TEMPLATES.
    """
    lowered = text.lower()
    if "bump" in lowered or "hump" in lowered:
        return text
    # Force-reset caption: the subject was lost, fall back to a known-good template.
    return random.choice(TEMPLATES)

In [6]:
# GENERATE CAPTIONS FOR ALL IMAGES
PHOTO_PREFIX = "a realistic photo of "


def _caption_subject(caption):
    """Turn a finished caption into a phrase that can follow PHOTO_PREFIX.

    Only the trailing full stop and an already-present PHOTO_PREFIX are
    removed. Articles inside the caption are left alone, so the resulting
    variation stays grammatical and has no period in the middle.
    """
    subject = caption.strip().rstrip(" .")
    if subject.startswith(PHOTO_PREFIX):
        subject = subject[len(PHOTO_PREFIX):]
    return subject


# Fail loudly instead of reporting "completed for 0 images" on a wrong path.
if not IMG_DIR.is_dir():
    raise FileNotFoundError(f"Image directory not found: {IMG_DIR}")

# Sorted so images are processed in the same order on every run.
image_paths = sorted(IMG_DIR.glob("*.jpg"))
count = 0

for img_path in tqdm(image_paths, desc="Captioning images"):
    txt_path = img_path.with_suffix(".txt")

    # Base template caption
    base = random.choice(TEMPLATES)
    refined = ensure_speed_bump(refine_caption(base))
    captions = [refined]

    # Additional caption variations built around the base caption's subject
    subject = _caption_subject(refined)
    for _ in range(CAPTIONS_PER_IMAGE - 1):
        aug_caption = (
            f"{PHOTO_PREFIX}{subject} "
            f"{random.choice(ANGLES)} "
            f"{random.choice(ENVIRONMENT)}."
        )
        captions.append(ensure_speed_bump(refine_caption(aug_caption)))

    # Write once, after all captions exist, so a failure while refining
    # never leaves a half-written caption file behind.
    txt_path.write_text("\n".join(captions), encoding="utf-8")
    count += 1

print(f"Captioning completed for {count} images.")

Captioning images:   0%|          | 0/222 [00:00<?, ?it/s]

Captioning images: 100%|██████████| 222/222 [09:07<00:00,  2.47s/it]

Captioning completed for 222 images.



