In [None]:
!pip install transformers peft accelerate bitsandbytes -q

In [None]:
# Importing required libraries

import os
import torch
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm
from peft import LoraConfig, get_peft_model
from sklearn.model_selection import train_test_split
from accelerate.utils import DataLoaderConfiguration
from accelerate import Accelerator
from transformers import (
    BlipProcessor,
    BlipForQuestionAnswering,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)


In [None]:
#Config

# --- Input data and pretrained checkpoint ---
MODEL_NAME = "Salesforce/blip-vqa-base"
DATASET_CSV = '/kaggle/input/lora-finetune/qna_2.csv'
IMAGE_BASE_DIR = '/kaggle/input/lora-finetune/images/images'

# --- Output locations ---
OUTPUT_DIR = "/kaggle/working"
LORA_ADAPTER_DIR = os.path.join(OUTPUT_DIR, "lora_adapters")

# --- LoRA hyperparameters ---
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

# --- Training / evaluation hyperparameters ---
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5
BATCH_SIZE = 16
EVALUATION_BATCH_SIZE = 32
MAX_LENGTH = 128  # passed to the processor as max_length for padding/truncation

# --- Compute device: use the GPU when one is visible, otherwise the CPU ---
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Device: {DEVICE}")

In [None]:
# A custom dataset class for VQA — connects questions, answers, and images
class VQADataset(torch.utils.data.Dataset):
    """Map-style dataset that turns (image, question, answer) rows into model inputs.

    Rows whose image file does not exist under ``image_dir`` are dropped at
    construction time, so indexing only ever touches rows whose image path
    existed when the dataset was built.

    Args:
        df: DataFrame with ``filename``, ``question`` and ``answer`` columns.
        processor: Callable invoked with ``images``, ``text``, ``text_target``
            and padding/truncation options; must return a mapping of tensors
            carrying a leading batch dimension of size 1.
        image_dir: Directory that ``filename`` values are resolved against.
        max_length: Length passed to the processor for padding/truncation.

    Raises:
        RuntimeError: If none of the rows points at an existing image file.
    """

    def __init__(self, df, processor, image_dir, max_length):
        self.processor = processor
        self.image_dir = image_dir
        self.valid_entries = []
        self.max_length = max_length

        # Go through each row and make sure the image file actually exists
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Verifying image paths"):
            img_path = os.path.join(image_dir, str(row['filename']))
            if os.path.exists(img_path):
                self.valid_entries.append(row)
            else:
                print(f"Missing image at {img_path}. Skipping.")

        if not self.valid_entries:
            raise RuntimeError("No valid image paths found. Check your input paths.")

        # Keep only the rows that had working image paths
        self.df = pd.DataFrame(self.valid_entries)

    def __len__(self):
        """Return the number of rows whose image file was found."""
        return len(self.df)

    def __getitem__(self, index):
        """Load and encode the sample at positional ``index``.

        Returns:
            dict mapping each key produced by the processor to its tensor with
            the leading batch dimension squeezed away.

        Raises:
            Exception: Any error raised while opening/decoding the image is
                logged and re-raised unchanged.
        """
        row = self.df.iloc[index]
        question = str(row['question'])
        answer = str(row['answer'])
        image_path = os.path.join(self.image_dir, str(row['filename']))

        try:
            # Open inside a context manager so the underlying file handle is
            # released deterministically; convert() returns an independent
            # in-memory image that stays valid after the file is closed.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
            raise

        # Send everything (image + question + answer) to the processor
        # The processor handles both image encoding and tokenization
        encoding = self.processor(
            images=image,
            text=question,
            text_target=answer,
            padding="max_length",  # Pad everything to the same size
            truncation=True,  # Truncate everything to max_length
            max_length=self.max_length,
            return_tensors="pt"
        )
        # The processor returns batched tensors (batch size 1); drop that axis
        # so the DataLoader's collate step can add the real batch dimension.
        return {k: v.squeeze(0) for k, v in encoding.items()}


In [None]:
# Loading model and processor

# Both objects are pulled from the same pretrained checkpoint (MODEL_NAME):
# the processor prepares images/text, the model is the VQA network to adapt.
processor = BlipProcessor.from_pretrained(MODEL_NAME, use_fast=True)
model = BlipForQuestionAnswering.from_pretrained(MODEL_NAME)

print("Finished loading model and processor.")


In [None]:
# Setting up Lora
# Names of the submodules that LoRA adapters are requested for.
# NOTE(review): which of these names actually exist in this checkpoint is not
# visible here — check the print_trainable_parameters() output below.
target_modules = [
    "q_proj", "k_proj", "v_proj",
    "query", "key", "value",
]  # These are the common target_modules

lora_config = LoraConfig(
    target_modules=target_modules,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

# Rebinds `model` to the PEFT-wrapped version, so re-running this cell would
# wrap the already-wrapped model again.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


In [None]:
# Generating train and validation data with 80-20 split (fixed seed for reproducibility)
df_full = pd.read_csv(DATASET_CSV)
train_df, val_df = train_test_split(df_full, test_size=0.2, random_state=7)

# Converting the train and validation splits to the required format.
# Failures are logged and then re-raised: calling exit() inside a notebook
# shuts the kernel down and hides the traceback, whereas a raised exception
# stops "Run All" at this cell with the full error visible.
try:
    train_dataset = VQADataset(train_df, processor, IMAGE_BASE_DIR, MAX_LENGTH)
    val_dataset = VQADataset(val_df, processor, IMAGE_BASE_DIR, MAX_LENGTH)
    print(f"Datasets loaded: train={len(train_dataset)}, val={len(val_dataset)}")
except Exception as e:
    print(f"Error: {e}")
    raise

# Defensive check kept from the original flow: training cannot proceed
# without at least one usable sample.
if len(train_dataset) == 0:
    raise RuntimeError("train dataset empty")


In [None]:
# Collates individual dataset samples into batches using the processor's tokenizer
data_collator = DataCollatorWithPadding(tokenizer=processor.tokenizer)

# Total number of optimizer steps across the run. The last batch of an epoch
# may be partial but still counts as a step, so round up per epoch before
# multiplying by the epoch count (flooring N * epochs / batch undercounts).
steps_per_epoch = (len(train_dataset) + BATCH_SIZE - 1) // BATCH_SIZE
total_steps = steps_per_epoch * NUM_EPOCHS

# NOTE(review): `accelerator` is created here but is not referenced by the
# training cell visible in this notebook — confirm whether it is still needed.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
accelerator = Accelerator(dataloader_config=dataloader_config)


In [None]:
# Training model with AdamW optimizer and saving model after every epoch.
# Early stopping is driven by the loss on the held-out validation split.
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
import time

# Mixed precision (autocast + gradient scaling) is only enabled on CUDA;
# on CPU both become no-ops and training runs in full precision.
USE_AMP = DEVICE == "cuda"

model.to(DEVICE)
model.train()

# Only parameters that require gradients are handed to the optimizer;
# frozen parameters never receive gradients, so they need no optimizer state.
optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=LEARNING_RATE,
)
scaler = GradScaler(enabled=USE_AMP)

best_loss = float("inf")  # best validation loss seen so far
best_epoch = None         # 1-based epoch that produced best_loss
patience = 2              # epochs without improvement tolerated before stopping
epochs_no_improve = 0

save_dir = os.path.join(OUTPUT_DIR, "lora_output")
os.makedirs(save_dir, exist_ok=True)

train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator
)

# Validation batches are never shuffled so the metric is comparable across epochs
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=EVALUATION_BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator
)


def _batch_loss(batch):
    """Move one collated batch to DEVICE, run the model, and return its loss tensor."""
    attention_mask = batch.get("attention_mask", None)
    if attention_mask is not None:
        attention_mask = attention_mask.to(DEVICE)

    with autocast(enabled=USE_AMP):
        outputs = model(
            input_ids=batch["input_ids"].to(DEVICE),
            pixel_values=batch["pixel_values"].to(DEVICE),
            attention_mask=attention_mask,
            labels=batch["labels"].to(DEVICE)
        )
    return outputs.loss


def _validation_loss(dataloader):
    """Return the mean per-batch loss over `dataloader` without updating weights."""
    model.eval()
    total = 0.0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validating", leave=False):
            total += _batch_loss(batch).item()
    model.train()  # restore training mode for the next epoch
    return total / max(len(dataloader), 1)


print("Starting manual training...")

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS}")
    start_time = time.time()

    running_loss = 0.0
    loop = tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}", leave=False)

    for batch in loop:
        optimizer.zero_grad()

        loss = _batch_loss(batch)

        # Scale the loss before backward, then let the scaler drive the
        # optimizer step and update its scale factor.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Read the scalar once per step (each .item() forces a device sync)
        loss_value = loss.item()
        running_loss += loss_value
        loop.set_postfix(loss=loss_value)

    avg_loss = running_loss / len(train_dataloader)
    duration = time.time() - start_time
    print(f"Avg Loss for Epoch {epoch+1}: {avg_loss:.4f} | Time taken: {duration:.2f} sec")

    val_loss = _validation_loss(val_dataloader)
    print(f"Validation Loss for Epoch {epoch+1}: {val_loss:.4f}")

    # Save model after each epoch
    epoch_dir = os.path.join(save_dir, f"epoch_{epoch+1}")
    model.save_pretrained(epoch_dir)
    processor.save_pretrained(epoch_dir)
    print(f"Saved model to: {epoch_dir}")

    # Early stopping check (on validation loss, not training loss)
    if val_loss < best_loss:
        best_loss = val_loss
        best_epoch = epoch + 1
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping: No improvement for {patience} consecutive epochs.")
            break

print(f"Best validation loss: {best_loss:.4f} (epoch {best_epoch})")


In [None]:
# Archive the epoch_3 checkpoint directory under lora_output into a single zip.
# The path is hard-coded, so this only succeeds if training reached and saved epoch 3.
!zip -r /kaggle/working/epoch_3.zip /kaggle/working/lora_output/epoch_3
