In [9]:
import os
import pandas as pd
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering, Trainer, TrainingArguments
import torch
from accelerate import Accelerator
from sklearn.model_selection import train_test_split
from peft import LoraConfig, get_peft_model
from transformers.data.data_collator import default_data_collator

# Initialize Accelerator for efficient multi-GPU training
accelerator = Accelerator()

# Load BLIP-1 VQA processor and model with fast image processing
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large", use_fast=True)
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large")

# Define dataset paths (adjust these to your Kaggle environment)
VQA_DATASET_PATH = '/kaggle/input/vqa-dataset/VQA_Dataset.csv'
ABO_METADATA_PATH = '/kaggle/input/abo-small/metadata/images.csv'
ABO_IMAGE_BASE_PATH = '/kaggle/input/abo-small/small'

# Load VQA dataset
vqa_df = pd.read_csv(VQA_DATASET_PATH)
print(f"Loaded VQA dataset with {len(vqa_df)} entries")

# Load ABO metadata and merge to get image paths
abo_metadata = pd.read_csv(ABO_METADATA_PATH)
vqa_df = pd.merge(vqa_df, abo_metadata[['image_id', 'path']], on='image_id', how='left')

# Handle missing values and ensure answers are strings
vqa_df['answer'] = vqa_df['answer'].fillna('unknown').astype(str)

# Split data into train and test sets
train_df, test_df = train_test_split(vqa_df, test_size=0.2, random_state=51)
print(f"Training set size: {len(train_df)}, Test set size: {len(test_df)}")

Loaded VQA dataset with 64406 entries
Training set size: 51524, Test set size: 12882


In [10]:
# Define a custom VQA Dataset class
class VQADataset(torch.utils.data.Dataset):
    def __init__(self, df, processor, image_base_path):
        self.df = df
        self.processor = processor
        self.image_base_path = image_base_path

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.image_base_path, row['path'])
        try:
            image = Image.open(image_path).convert("RGB")
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
            image = Image.new("RGB", (224, 224), (0, 0, 0))
        
        question = row['question']
        answer = row['answer']

        # Process image and question with attention mask
        encoding = self.processor(
            images=image,
            text=question,
            padding="max_length",
            max_length=128,  # Fixed length for input_ids and attention_mask
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True  # Explicitly include attention_mask
        )
        
        # Tokenize answer as labels with fixed length
        labels = self.processor.tokenizer(
            answer,
            padding="max_length",
            truncation=True,
            max_length=32,  # Fixed length for labels
            return_tensors="pt"
        )["input_ids"]

        # Remove batch dimension from tensors
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        encoding["labels"] = labels.squeeze(0)

        return encoding

# Create training dataset
train_dataset = VQADataset(train_df, processor, ABO_IMAGE_BASE_PATH)

In [11]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank of low-rank matrices
    lora_alpha=32,  # Scaling factor
    target_modules=["query", "value"],  # Target attention layers in BLIP-1
    lora_dropout=0.05,  # Dropout for regularization
    bias="none"  # No bias adaptation
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
print("LoRA applied to the model")

# Prepare model with Accelerator
model = accelerator.prepare(model)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    run_name="blip_vqa_lora_finetune",  # Unique run name
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Reduced for memory stability
    gradient_accumulation_steps=4,  # Simulate larger batch size (effective batch size = 16)
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,  # Mixed precision for efficiency
    remove_unused_columns=False,  # Keep all dataset columns
    report_to="none"  # Disable W&B and other logging integrations
)

# Create Trainer instance with default data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=default_data_collator,  # Handle tensor stacking
)

LoRA applied to the model


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [12]:
# Check GPU memory usage before training
if torch.cuda.is_available():
    print("GPU Memory Usage Before Training:")
    print(torch.cuda.memory_summary())

# Start fine-tuning with LoRA
trainer.train()

# Save the fine-tuned model
trainer.save_model("./fine_tuned_blip_vqa_lora")
print("Model saved to './fine_tuned_blip_vqa_lora'")

GPU Memory Usage Before Training:
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   3040 MiB |   4199 MiB |  11662 GiB |  11659 GiB |
|       from large pool |   3009 MiB |   4008 MiB |   9745 GiB |   9742 GiB |
|       from small pool |     30 MiB |    195 MiB |   1917 GiB |   1917 GiB |
|---------------------------------------------------------------------------|
| Active memory         |   3040 MiB |   4199 MiB |  11662 GiB |  11659 GiB |
|       from large pool |   3009 MiB |   4008 MiB |   9745 GiB |   9742 GiB |
|       from small pool |     30 MiB |    195 MiB |   1917 GiB |   1917 GiB |
|-----------------------------

Step,Training Loss
10,10.2969
20,10.0815
30,9.9105
40,9.7026
50,9.5757
60,9.4385
70,9.3479
80,9.1974
90,9.0906
100,9.0311


Model saved to './fine_tuned_blip_vqa_lora'


In [13]:
import os
import zipfile
from IPython.display import FileLink

# Define the directories to include in the zip file
output_dirs = ['./fine_tuned_blip_vqa_lora', './results', './logs']

# Define the output zip file name
zip_filename = 'fine_tuning_outputs.zip'

# Create a zip file
with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for output_dir in output_dirs:
        if os.path.exists(output_dir):
            for root, _, files in os.walk(output_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Add file to zip with a relative path
                    zipf.write(file_path, os.path.relpath(file_path, '.'))
            print(f"Added {output_dir} to {zip_filename}")
        else:
            print(f"Directory {output_dir} does not exist, skipping")

# Generate a download link for the zip file
display(FileLink(zip_filename))

# Print instructions for downloading
print(f"Click the link above to download '{zip_filename}'.")
print("If the link doesn't work, go to the 'Output' tab in Kaggle, locate the file, and download it manually.")

Added ./fine_tuned_blip_vqa_lora to fine_tuning_outputs.zip
Added ./results to fine_tuning_outputs.zip
Directory ./logs does not exist, skipping


Click the link above to download 'fine_tuning_outputs.zip'.
If the link doesn't work, go to the 'Output' tab in Kaggle, locate the file, and download it manually.
