START

In [None]:
# Mount Google Drive inside the Colab filesystem so the dataset archive is reachable.
from google.colab import drive

drive.mount("/content/drive")


In [None]:
import os, glob

# Recursively collect every .zip archive under the mounted Drive.
candidates = glob.glob("/content/drive/MyDrive/**/*.zip", recursive=True)

# Show at most 20 archives whose path mentions "flickr" (case-insensitive).
# Any path containing "flickr8k" also contains "flickr", so one test covers both.
[path for path in candidates if "flickr" in path.lower()][:20]


In [None]:
ZIP_PATH = "/content/drive/MyDrive/flickr8k.zip"
OUT_DIR = "/content/data/flickr8k"

!mkdir -p "$OUT_DIR"
!unzip -q "$ZIP_PATH" -d "$OUT_DIR"
print("Unzipped to:", OUT_DIR)


In [None]:
import os

OUT_DIR = "/content/data/flickr8k"

# Peek at the extracted layout; the listing is capped at 50 entries.
print("Top-level folders/files inside OUT_DIR:")
top_level_entries = os.listdir(OUT_DIR)
print(top_level_entries[:50])


In [None]:
import os

OUT_DIR = "/content/data/flickr8k"
images_dir = os.path.join(OUT_DIR, "Images")
captions_file = os.path.join(OUT_DIR, "captions.txt")

# Sanity-check the image folder: it must exist, and we count its .jpg files.
print("Images dir exists:", os.path.isdir(images_dir))
jpg_names = [name for name in os.listdir(images_dir) if name.lower().endswith(".jpg")]
print("Num images:", len(jpg_names))

# Preview the first five lines of the captions file.
print("Captions file exists:", os.path.isfile(captions_file))
# Decode explicitly as UTF-8 (the same encoding the caption parser further down
# uses) instead of relying on the platform's default encoding.
with open(captions_file, "r", encoding="utf-8") as f:
    for _ in range(5):
        print(f.readline().strip())


MOST IMPORTANT CELL FOR EXECUTION

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

# Use the GPU when one is available; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Processor and model are loaded from the same pretrained checkpoint.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
model = model.to(device)

print("Model loaded on", device)


REBUILDING

In [None]:
import transformers, tokenizers
# Record the library versions this notebook ran against.
for label, module in (("transformers:", transformers), ("tokenizers:", tokenizers)):
    print(label, module.__version__)


In [None]:
import os, random
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import numpy as np

from transformers import BlipProcessor, BlipForConditionalGeneration

# Pick the compute device once; later cells move batches and inputs to it.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# (Re)load the processor and the captioning model from the pretrained checkpoint.
checkpoint_name = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(checkpoint_name)
model = BlipForConditionalGeneration.from_pretrained(checkpoint_name)
model = model.to(device)

print("Model loaded")


In [None]:
# Dataset locations used by every later cell.
OUT_DIR = "/content/data/flickr8k"
images_dir = os.path.join(OUT_DIR, "Images")
captions_file = os.path.join(OUT_DIR, "captions.txt")

# Quick check that the extracted data is where we expect it.
num_jpgs = sum(1 for name in os.listdir(images_dir) if name.lower().endswith(".jpg"))
print("Images:", num_jpgs)
print("Captions file exists:", os.path.isfile(captions_file))


In [None]:
import pandas as pd
import numpy as np

# Parse the captions file into one record per (image, caption) pair.
# Only the first comma on a line separates the two fields, so commas inside
# the caption text are preserved.
# NOTE(review): if the file quotes captions CSV-style, the quote characters are
# kept verbatim in `caption` here — confirm against the data.
rows = []
with open(captions_file, "r", encoding="utf-8") as f:
    for raw_line in f:
        img, sep, cap = raw_line.strip().partition(",")
        if not sep:
            # Blank lines and lines without a comma carry no pair.
            continue
        img, cap = img.strip(), cap.strip()
        is_header = img.lower() == "image" and cap.lower() == "caption"
        if not is_header:
            rows.append({"image": img, "caption": cap})

df = pd.DataFrame(rows)

# Split at the image level (80% train / 10% val / 10% test) so that all
# captions of one image end up in the same split.
unique_images = df["image"].unique()
rng = np.random.default_rng(42)  # seeded generator for a repeatable shuffle
rng.shuffle(unique_images)       # shuffles in place

n = len(unique_images)
train_end, val_end = int(0.8 * n), int(0.9 * n)

# Consecutive slices of the shuffled names, stored as sets for fast lookup.
train_imgs, val_imgs, test_imgs = (
    set(chunk)
    for chunk in (
        unique_images[:train_end],
        unique_images[train_end:val_end],
        unique_images[val_end:],
    )
)

def assign_split(img):
    """Return the split name for image `img`: "train", "val" or "test".

    Membership is checked against the module-level `train_imgs` and `val_imgs`
    sets; an image found in neither is labelled "test".
    """
    if img in train_imgs:
        return "train"
    return "val" if img in val_imgs else "test"

# Tag every caption row with the split of its image.
df["split"] = df["image"].apply(assign_split)

# Report caption rows and distinct images per split.
rows_per_split = df["split"].value_counts()
images_per_split = df.groupby("split")["image"].nunique()

print("Rows per split:")
print(rows_per_split)
print("\nUnique images per split:")
print(images_per_split)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import random
from PIL import Image

MAX_TEXT_LEN = 32  # captions are padded/truncated to this many tokens


class FlickrCaptionDataset(Dataset):
    """One item per image of a split, paired with one of its captions.

    Rows of `df` belonging to `split` are grouped by image, so each index
    addresses one image together with the list of its captions.

    Args:
        df: DataFrame with "image", "caption" and "split" columns.
        split: value of the "split" column to keep (e.g. "train", "val", "test").
        random_caption: whether `__getitem__` draws a random caption for the
            image. Defaults to None, which means random for the "train" split
            only. Other splits then always use the image's first caption, so
            evaluation losses are repeatable instead of depending on the
            unseeded `random` module.

    Relies on the module-level `processor` and `images_dir`.
    """

    def __init__(self, df, split, random_caption=None):
        split_rows = df[df["split"] == split].reset_index(drop=True)
        # One row per image; the "caption" column holds that image's captions as a list.
        self.grouped = split_rows.groupby("image")["caption"].apply(list).reset_index()
        if random_caption is None:
            random_caption = (split == "train")
        self.random_caption = bool(random_caption)

    def __len__(self):
        """Number of distinct images in this split."""
        return len(self.grouped)

    def _select_caption(self, captions):
        """Pick the caption to train/evaluate on from an image's caption list."""
        if self.random_caption:
            return random.choice(captions)
        return captions[0]

    def __getitem__(self, idx):
        """Return the processor's tensors for image `idx` plus a "labels" tensor.

        "labels" is a copy of "input_ids" with padding positions set to -100,
        the value PyTorch's cross-entropy loss ignores by default.
        """
        row = self.grouped.iloc[idx]
        img_name = row["image"]
        caption = self._select_caption(row["caption"])

        img_path = os.path.join(images_dir, img_name)
        image = Image.open(img_path).convert("RGB")

        enc = processor(
            images=image,
            text=caption,
            padding="max_length",
            truncation=True,
            max_length=MAX_TEXT_LEN,
            return_tensors="pt",
        )

        # The processor returns tensors with a leading batch dimension of 1; drop it
        # so the DataLoader can stack items into a batch.
        item = {k: v.squeeze(0) for k, v in enc.items()}

        labels = item["input_ids"].clone()
        labels[labels == processor.tokenizer.pad_token_id] = -100
        item["labels"] = labels
        return item

def collate_fn(batch):
    """Stack a list of per-sample dicts into one dict of batched tensors.

    The field names are taken from the first sample; every sample must provide
    a tensor of the same shape under each of those names.
    """
    stacked = {}
    for field in batch[0].keys():
        stacked[field] = torch.stack([sample[field] for sample in batch])
    return stacked

# One dataset per split.
train_dataset, val_dataset, test_dataset = (
    FlickrCaptionDataset(df, split_name) for split_name in ("train", "val", "test")
)

# Only the training loader shuffles; everything else is shared between loaders.
shared_loader_args = dict(batch_size=4, collate_fn=collate_fn, num_workers=0)
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_args)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_args)

print("Train batches:", len(train_loader), "Val batches:", len(val_loader))


In [None]:
from torch.optim import AdamW

# Optimize every model parameter with AdamW at a learning rate of 5e-5.
optimizer = AdamW(params=model.parameters(), lr=5e-5)
# Start in training mode (run_epoch sets the mode again on every call).
model.train()

def run_epoch(loader, train=True):
    """Run one pass over `loader` and return the mean per-batch loss.

    Uses the module-level `model`, `optimizer` and `device`.

    Args:
        loader: iterable of batches. Each batch is a dict whose values support
            `.to(device)`; its entries are passed to the model as keyword
            arguments.
        train: when true, the model is switched to training mode and the
            optimizer steps once per batch. Otherwise the model is switched to
            eval mode and gradients are disabled.

    Returns:
        The sum of the per-batch `loss.item()` values divided by the number of
        batches (0.0 when the loader yields no batches).
    """
    set_mode = model.train if train else model.eval
    set_mode()

    batch_losses = []

    for step, batch in enumerate(loader):
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}

        with torch.set_grad_enabled(train):
            loss = model(**on_device).loss

            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        loss_value = loss.item()
        batch_losses.append(loss_value)

        # Progress line every 100 training steps (including step 0).
        if train and step % 100 == 0:
            print(f"step {step}/{len(loader)}  loss={loss_value:.4f}")

    return sum(batch_losses) / max(len(batch_losses), 1)

# Epoch 1: one pass over the training set, then a gradient-free pass over validation.
train_loss = run_epoch(train_loader)
val_loss = run_epoch(val_loader, train=False)
print(f"\nDONE   train_loss={train_loss:.4f}  val_loss={val_loss:.4f}")


In [None]:
# Save the model and then its processor into the same directory.
SAVE_DIR = "/content/blip_flickr8k_finetuned"
for artifact in (model, processor):
    artifact.save_pretrained(SAVE_DIR)
print("Saved to:", SAVE_DIR)


Saved to: /content/blip_flickr8k_finetuned


In [None]:
import pandas as pd
import random
from PIL import Image

# Inference mode for caption generation.
model.eval()

# One row per test image with the list of its reference captions; draw 10 of
# them with a fixed seed.
test_captions = df[df["split"] == "test"]
test_grouped = test_captions.groupby("image")["caption"].apply(list).reset_index()
sample_rows = test_grouped.sample(10, random_state=42).reset_index(drop=True)

results_samples = []

for img_name, all_refs in zip(sample_rows["image"], sample_rows["caption"]):
    refs = all_refs[:3]  # show 3 reference captions
    img_path = os.path.join(images_dir, img_name)
    image = Image.open(img_path).convert("RGB")

    # Image-only input: the model generates the caption with beam search.
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model.generate(**inputs, max_length=20, num_beams=5)
    pred = processor.decode(out[0], skip_special_tokens=True)

    # Pad with empty strings when an image has fewer than three references.
    ref1, ref2, ref3 = (refs + ["", ""])[:3]
    results_samples.append({
        "image": img_name,
        "prediction": pred,
        "ref1": ref1,
        "ref2": ref2,
        "ref3": ref3,
    })

samples_df = pd.DataFrame(results_samples)
samples_df


In [None]:
!pip -q install nltk


In [None]:
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# NLTK smoothing method 1, passed to corpus_bleu below.
smooth = SmoothingFunction().method1
model.eval()

# Evaluate on 200 test images for speed (fewer if the test split is smaller).
subset_size = min(200, len(test_grouped))
test_subset = test_grouped.sample(n=subset_size, random_state=42).reset_index(drop=True)

predictions = []
references = []

for img_name, ref_caps in zip(test_subset["image"], test_subset["caption"]):
    img_path = os.path.join(images_dir, img_name)
    image = Image.open(img_path).convert("RGB")

    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model.generate(**inputs, max_length=20, num_beams=5)
    pred = processor.decode(out[0], skip_special_tokens=True)

    # corpus_bleu takes a token list per hypothesis and, for each hypothesis,
    # a list of reference token lists. Tokens are split on whitespace.
    # NOTE(review): tokens are compared exactly as written; if predictions and
    # references differ in casing or punctuation spacing the score drops — confirm
    # whether normalization is wanted.
    predictions.append(pred.split())
    references.append([ref.split() for ref in ref_caps])

bleu_score = corpus_bleu(references, predictions, smoothing_function=smooth)
print("BLEU (200 test images):", bleu_score)


In [None]:
# Epoch 2: continue training the same model, then re-measure validation loss.
train_loss2 = run_epoch(train_loader)
val_loss2 = run_epoch(val_loader, train=False)
print(f"IMPROVED  train_loss={train_loss2:.4f}  val_loss={val_loss2:.4f}")


In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from PIL import Image
import os
import numpy as np

smooth = SmoothingFunction().method1
model.eval()

# Recreate grouped test set (if not already): one row per test image with the
# list of its captions.
test_grouped = (
    df[df["split"] == "test"]
    .groupby("image")["caption"]
    .apply(list)
    .reset_index()
)

# Evaluate on 200 test images for speed; the fixed seed selects the same
# subset on every run.
test_subset = (
    test_grouped
    .sample(n=min(200, len(test_grouped)), random_state=42)
    .reset_index(drop=True)
)

predictions = []
references = []

for _, row in test_subset.iterrows():
    img_name, ref_caps = row["image"], row["caption"]
    img_path = os.path.join(images_dir, img_name)
    image = Image.open(img_path).convert("RGB")

    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model.generate(**inputs, max_length=20, num_beams=5)
    pred = processor.decode(out[0], skip_special_tokens=True)

    # Whitespace-tokenize the hypothesis and each of its references.
    # NOTE(review): no case/punctuation normalization is applied — confirm intended.
    predictions.append(pred.split())
    references.append(list(map(str.split, ref_caps)))

bleu_score_2 = corpus_bleu(references, predictions, smoothing_function=smooth)
print("BLEU (200 test images) AFTER 2 epochs:", bleu_score_2)


In [None]:
import pandas as pd

# Build the summary from the metrics computed earlier in this notebook rather
# than from hand-copied numbers, so the table always reflects the current run.
# Losses are rounded to 4 decimals for display; BLEU is kept at full precision.
results_table = pd.DataFrame([
    {
        "Run": "Epoch 1 (baseline)",
        "Train Loss": float(f"{train_loss:.4f}"),
        "Val Loss": float(f"{val_loss:.4f}"),
        "BLEU@200": float(bleu_score),
    },
    {
        "Run": "Epoch 2 (improved)",
        "Train Loss": float(f"{train_loss2:.4f}"),
        "Val Loss": float(f"{val_loss2:.4f}"),
        "BLEU@200": float(bleu_score_2),
    },
])

results_table


In [None]:
# Persist the epoch-2 summary table to Drive (no index column).
results_csv_path = "/content/drive/MyDrive/blip_results_table_epoch2.csv"
results_table.to_csv(path_or_buf=results_csv_path, index=False)
print(" Saved:", results_csv_path)


In [None]:
import pandas as pd
from PIL import Image
import os
import torch

model.eval()

# One row per test image with its caption list; draw 10 with a fixed seed.
test_grouped = (
    df[df["split"] == "test"]
    .groupby("image")["caption"]
    .apply(list)
    .reset_index()
)
sample_rows = test_grouped.sample(10, random_state=7).reset_index(drop=True)

qual_rows = []

for _, row in sample_rows.iterrows():
    img_name, refs = row["image"], row["caption"][:3]
    img_path = os.path.join(images_dir, img_name)
    image = Image.open(img_path).convert("RGB")

    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model.generate(**inputs, max_length=20, num_beams=5)
    pred = processor.decode(out[0], skip_special_tokens=True)

    # Up to three references per image; missing ones are left as empty strings.
    record = {"image": img_name, "prediction_epoch2": pred, "ref1": refs[0]}
    record["ref2"] = refs[1] if len(refs) > 1 else ""
    record["ref3"] = refs[2] if len(refs) > 2 else ""
    qual_rows.append(record)

# Write the qualitative comparison to Drive and show the first rows.
qual_epoch2 = pd.DataFrame(qual_rows)
qual_path = "/content/drive/MyDrive/blip_qualitative_epoch2.csv"
qual_epoch2.to_csv(qual_path, index=False)
print(" Saved:", qual_path)

qual_epoch2.head(5)


In [None]:
# Epoch 3: one more training pass, followed by a validation pass.
train_loss3 = run_epoch(train_loader)
val_loss3 = run_epoch(val_loader, train=False)
print(f"EPOCH 3  train_loss={train_loss3:.4f}  val_loss={val_loss3:.4f}")


In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from PIL import Image
import os
import torch

smooth = SmoothingFunction().method1
model.eval()

# Group the test captions per image and draw the seeded evaluation subset
# (at most 200 images).
test_rows = df[df["split"] == "test"]
test_grouped = test_rows.groupby("image")["caption"].apply(list).reset_index()
eval_count = min(200, len(test_grouped))
test_subset = test_grouped.sample(n=eval_count, random_state=42).reset_index(drop=True)

predictions = []
references = []

for img_name, ref_caps in zip(test_subset["image"], test_subset["caption"]):
    img_path = os.path.join(images_dir, img_name)
    image = Image.open(img_path).convert("RGB")

    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model.generate(**inputs, max_length=20, num_beams=5)
    pred = processor.decode(out[0], skip_special_tokens=True)

    # Whitespace tokens for the hypothesis and for each reference caption.
    # NOTE(review): no case/punctuation normalization is applied — confirm intended.
    hypothesis_tokens = pred.split()
    reference_tokens = [ref.split() for ref in ref_caps]
    predictions.append(hypothesis_tokens)
    references.append(reference_tokens)

bleu_score_3 = corpus_bleu(references, predictions, smoothing_function=smooth)
print("BLEU (200 test images) AFTER 3 epochs:", bleu_score_3)


In [None]:
import pandas as pd

# All three rows come from the metrics computed earlier in this notebook, so
# the table cannot drift from the actual run the way hand-copied numbers do.
# Losses are rounded to 4 decimals for display; BLEU is kept at full precision.
results_table_3 = pd.DataFrame([
    {
        "Run": "Epoch 1 (baseline)",
        "Train Loss": float(f"{train_loss:.4f}"),
        "Val Loss": float(f"{val_loss:.4f}"),
        "BLEU@200": float(bleu_score),
    },
    {
        "Run": "Epoch 2 (improved)",
        "Train Loss": float(f"{train_loss2:.4f}"),
        "Val Loss": float(f"{val_loss2:.4f}"),
        "BLEU@200": float(bleu_score_2),
    },
    {
        "Run": "Epoch 3 (final)",
        "Train Loss": float(f"{train_loss3:.4f}"),
        "Val Loss": float(f"{val_loss3:.4f}"),
        "BLEU@200": float(bleu_score_3),
    },
])

results_table_3


In [None]:
# Persist the three-epoch summary table to Drive (no index column).
results_csv_path = "/content/drive/MyDrive/blip_results_table_epoch3.csv"
results_table_3.to_csv(path_or_buf=results_csv_path, index=False)
print(" Saved:", results_csv_path)


In [None]:
# Save the model and then its processor to Drive, in the same directory.
FINAL_DIR = "/content/drive/MyDrive/blip_flickr8k_finetuned_epoch3"
for artifact in (model, processor):
    artifact.save_pretrained(FINAL_DIR)
print(" Saved final model to:", FINAL_DIR)


END OF FILE