<a href="https://colab.research.google.com/github/yagizterzi/ImageCaptionGenerator/blob/main/%C4%B0mageCaption.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the competition archive with the Kaggle command-line client.
# NOTE(review): presumably requires Kaggle API credentials to be configured in this runtime,
# and the extraction cell below expects the archive at
# /content/obss-intern-competition-2025.zip — confirm the working directory is /content.
!kaggle competitions download -c obss-intern-competition-2025

In [None]:
import zipfile
import os

# Archive fetched by the Kaggle CLI cell, and the folder its contents are unpacked into
zip_file_path = '/content/obss-intern-competition-2025.zip' # Replace with the actual path
extraction_dir = '/content/' # Replace with your desired directory name

# exist_ok=True makes this a no-op when the folder already exists and removes the
# check-then-create race of a separate os.path.exists() test
os.makedirs(extraction_dir, exist_ok=True)

# Unpack every member of the archive; the context manager closes the zip afterwards
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extraction_dir)

print(f"Zip file extracted to: {extraction_dir}")


**Image caption generator using transformers**

Before starting, import all the necessary libraries.

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset
import os
from torch.utils.data import DataLoader
import torch
from sentence_transformers import SentenceTransformer
import pandas.api.types
import numpy as np
from scipy.linalg import sqrtm
from typing import List
import gc
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import torch.optim as optim


# 1-) Download the BLIP model
BLIP is a VLP (Vision-Language Pre-training) framework created by Salesforce Research. It is designed for image captioning and visual question answering, and it generates natural-language descriptions from input images using a transformer-based architecture.

**Processor** turns the image into a tensor using a feature extractor and tokenizes the text.

**Model** captions the image by relating the tokenized text to the image tensor using a Transformer-based architecture.

In [None]:
# The preprocessing pipeline and the captioning network are both loaded from one checkpoint id
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# 2-) Create a dataset class
The Dataset class is designed to load image data and the corresponding captions for use in training or evaluating a model.

In [None]:
max_length = 128


class Dataset(Dataset):
    """Image/caption dataset backed by a CSV file and a folder of ``<image_id>.jpg`` images.

    The class keeps the name ``Dataset`` (shadowing the imported base class it derives
    from) because the data-loader cell instantiates it by that name.

    Args:
        csv_path: CSV file read with ``pandas.read_csv``. It must contain an ``image_id``
            column and, when ``train`` is true, a ``caption`` column.
        image_folder: Directory that contains the image files.
        processor: Callable invoked as ``processor(images=..., text=..., ...)`` that
            returns a mapping of tensors with a leading batch dimension of size 1.
        train: When true each sample also carries the tokenized caption and ``labels``;
            otherwise only the processed image is returned.
        max_length: Length captions are padded/truncated to (defaults to the
            module-level ``max_length``).
    """

    def __init__(self, csv_path, image_folder, processor, train=True, max_length=max_length):
        self.data = pd.read_csv(csv_path)
        self.image_folder = image_folder
        self.processor = processor
        self.train = train
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (samples) in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the processed sample at position ``idx`` as a dict of tensors."""
        item = self.data.iloc[idx]
        image_path = os.path.join(self.image_folder, str(item['image_id']) + '.jpg')
        # The context manager releases the file handle; convert() yields an RGB image
        # so every sample has the same channel layout
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        if not self.train:
            # Image only: padding/truncation/max_length are text options and are not
            # needed when no caption is passed
            encoding = self.processor(images=image, return_tensors="pt")
            return {k: v.squeeze(0) for k, v in encoding.items()}

        encoding = self.processor(
            images=image,
            text=item['caption'],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        # Drop the batch dimension of size 1 added by return_tensors="pt"
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}

        # Labels are a separate tensor (not an alias of input_ids) in which padded
        # positions are set to -100, the value the loss ignores. Without this the loss
        # is dominated by predicting padding up to max_length.
        labels = encoding['input_ids'].clone()
        attention_mask = encoding.get('attention_mask')
        if attention_mask is not None:
            labels = labels.masked_fill(attention_mask == 0, -100)
        encoding['labels'] = labels

        return encoding

# 3-) Create a data loader
Create the data loaders by first instantiating our dataset class and then wrapping it in PyTorch's **DataLoader**.

Load the training and test sets separately so that we can process both text and images, or only images, depending on our goal.



In [None]:
# Training split: images paired with captions, reshuffled on every pass
train_dataset = Dataset(
    csv_path="/content/train.csv",
    image_folder="/content/train/train",
    processor=processor,
    train=True,
)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Test split: images only, kept in file order so predictions line up with the CSV rows
test_dataset = Dataset(
    csv_path="/content/test.csv",
    image_folder="/content/test/test",
    processor=processor,
    train=False,
)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


# 4-) Train the model
During training, the model receives images and captions from the DataLoader.
Images are converted to tensors using the processor and passed to the model.
The predicted captions are compared to the ground truth to calculate the loss.
Based on the loss, model weights are updated and the process repeats for each batch.

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gc.collect()
# CUDA bookkeeping is guarded so the cell also runs on a CPU-only runtime
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
epochs = 10
# One average loss per epoch. Created once, before the loop, so the whole history is
# kept (re-creating it inside the loop left only the last epoch in the list).
epoch_losses = []
model.train()

for epoch in range(epochs):
    total_loss = 0

    # Release cached memory between epochs
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        # Move every tensor of the batch to the training device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass: the model returns the loss because the batch carries `labels`
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass and parameter update; gradients are cleared for the next step
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    # Mean batch loss of this epoch
    avg_loss = total_loss / len(train_loader)
    epoch_losses.append(avg_loss)

    print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f}")

# Checkpoint consumed by the staged fine-tuning cell
torch.save(model.state_dict(), f"blip_epoch{epochs}.pth")

# Create a loss chart for the trained model

In [None]:

# Hard-coded per-epoch loss values for the 10-epoch run.
# NOTE(review): presumably transcribed from the printed output of the training cell — confirm.
# Dedicated names are used so this cell does not rebind `epochs` (the integer epoch count
# used by the training cell) or `loss` (the last batch loss) to lists.
plot_epochs = list(range(1, 11))
plot_losses = [0.8526, 0.2746, 0.2040, 0.1496, 0.1063, 0.0729, 0.0502, 0.0355, 0.0269, 0.0225]

# Plotting
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(plot_epochs, plot_losses, marker='o', linestyle='-', color='blue')
ax.set_title('Training Loss Over Epochs')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.grid(True)
ax.set_xticks(plot_epochs)
fig.tight_layout()
plt.show()

# Load the trained model's weights and fine-tune the text decoder for better results

In [None]:
def _set_trainable(model, name_fragments):
    """Enable gradients only for parameters whose name contains one of ``name_fragments``."""
    for name, param in model.named_parameters():
        param.requires_grad = any(fragment in name for fragment in name_fragments)


def _run_stage(model, loader, optimizer, device, num_epochs, label):
    """Train ``model`` for ``num_epochs`` passes over ``loader`` and print the mean loss.

    The whole batch is forwarded with ``model(**batch)``, exactly like the first training
    loop, so the attention mask produced by the dataset reaches the model instead of
    being dropped. The value printed per epoch is the mean over all batches rather than
    the loss of the last batch only.
    """
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch in loader:
            optimizer.zero_grad()
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        avg_loss = running_loss / len(loader)
        print(f"{label} Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")


# Start from the checkpoint written by the first training cell; map_location keeps the
# load working when the runtime's device differs from the one that saved the file
state_dict = torch.load("blip_epoch10.pth", map_location=device)
model.load_state_dict(state_dict)
gc.collect()
# CUDA bookkeeping is guarded so the cell also runs on a CPU-only runtime
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

# Stage 1: Fine-tune the text decoder (everything else frozen)
_set_trainable(model, ["text_decoder"])
optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-6)

num_epochs = 2
_run_stage(model, train_loader, optimizer, device, num_epochs, "Decoder")

# Stage 2: Fine-tune the last layers of the visual backbone (layer11, layer12);
# note that this freezes the text decoder again
_set_trainable(model, ["vision_model.encoder.layers.10", "vision_model.encoder.layers.11"])

learnable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(learnable_params, lr=1e-6)

num_epochs_backbone = 3
_run_stage(model, train_loader, optimizer, device, num_epochs_backbone, "Backbone")

# Save the fully fine-tuned model
torch.save(model.state_dict(), "blip_finetuned.pth")

In [None]:
# Resume from the staged fine-tuning checkpoint; map_location keeps the load working
# when the runtime's device differs from the one that saved the file
model.load_state_dict(torch.load("blip_finetuned.pth", map_location=device))
# Free up memory (CUDA calls guarded so the cell also runs on a CPU-only runtime)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

# Move model to device and enable all parameters for training
model.to(device)
for param in model.parameters():
    param.requires_grad = True

# Define optimizer for all parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)

# Final full model training
final_epochs = 5
# One average loss per epoch. Created once, before the loop, so the whole history is
# kept (re-creating it inside the loop left only the last epoch in the list).
each_loss = []
model.train()

for epoch in range(final_epochs):
    total_loss = 0.0

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    for batch in tqdm(train_loader, desc=f"Final Training Epoch {epoch+1}/{final_epochs}"):
        # Move the whole batch to the device and forward all of it, as the first
        # training loop does, so the attention mask is not dropped
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward and backward pass
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean batch loss of this epoch
    avg_loss = total_loss / len(train_loader)
    each_loss.append(avg_loss)
    print(f"Final Training Epoch {epoch+1}/{final_epochs}, Loss: {avg_loss:.4f}")

# Save final model after all fine-tuning steps
torch.save(model.state_dict(), "blip_final.pth")

In [None]:
# Hard-coded per-epoch loss values for the 5-epoch final training run.
# NOTE(review): presumably transcribed from the printed output of the final training
# cell — confirm.
# Dedicated names are used so this cell does not rebind `epochs` (the integer epoch count
# used by the first training cell) or `loss` (the last batch loss) to lists.
final_plot_epochs = list(range(1, 6))
final_plot_losses = [0.0035, 0.0027, 0.0022, 0.0019, 0.0017]

# Plotting
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(final_plot_epochs, final_plot_losses, marker='o', linestyle='-', color='blue')
ax.set_title('Final Training Loss Over Epochs')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.grid(True)
ax.set_xticks(final_plot_epochs)
fig.tight_layout()
plt.show()

In [None]:
# Generate a caption for every test image and write them to results.csv

# Load the trained model state dictionary; map_location keeps the load working when the
# runtime's device differs from the one that saved the file
model.load_state_dict(torch.load("blip_final.pth", map_location=device))

model.eval()
predicted_captions = []

# Disable gradient calculation to save memory
with torch.no_grad():
    # test_loader is not shuffled, so captions are collected in CSV row order
    for batch in test_loader:
        pixel_values = batch["pixel_values"].to(device)
        output_ids = model.generate(pixel_values=pixel_values, max_length=128)
        # Turn the generated token ids back into plain text
        captions = processor.batch_decode(output_ids, skip_special_tokens=True)
        predicted_captions.extend(captions)

# Reuse the exact frame the loader iterated over instead of re-reading a path that is
# relative to the current working directory; this keeps rows and captions aligned
test_df = test_loader.dataset.data.copy()
if len(predicted_captions) != len(test_df):
    raise ValueError(
        f"Generated {len(predicted_captions)} captions for {len(test_df)} test rows"
    )

# Save the captions into a .csv file
test_df["caption"] = predicted_captions
test_df.to_csv("results.csv", index=False)
