In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import zipfile
with zipfile.ZipFile("/content/drive/MyDrive/filtered_images_split1.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/drive/MyDrive/VR_Dataset/")


##Blip

In [2]:
!pip install rouge_score
!pip install -q transformers datasets accelerate timm peft bert-score evaluate nltk

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=faa2f8f8bb224392c427c4ecc3d682914085698c41df7c79000b8e100e6dd219
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━

In [3]:
import os
import json
import pandas as pd
from PIL import Image
from tqdm import tqdm
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import BlipProcessor, BlipForQuestionAnswering, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, TaskType
import torch
import evaluate


In [4]:
MODEL_NAME = "Salesforce/blip-vqa-base"
CSV_PATH = "/content/drive/MyDrive/VR_Dataset/split1.csv"
IMAGE_ROOT = "/content/drive/MyDrive/VR_Dataset/filtered_images_split1"
JSON_PATH = "/content/drive/MyDrive/VR_Dataset/qna_fast.json"
BATCH_SIZE = 4
EPOCHS = 5

device = "cuda" if torch.cuda.is_available() else "cpu"

df = pd.read_csv(CSV_PATH)
with open(JSON_PATH, "r") as f:
    qna_dict = json.load(f)

vqa_data = []
for _, row in df.iterrows():
    image_id = str(row["image_id"])
    image_path = os.path.join(IMAGE_ROOT, row["path"])
    if not os.path.exists(image_path): continue
    for qa in qna_dict.get(image_id, []):
        q = qa["question"].strip("?").strip() + "?"
        a = qa["answer"].strip()
        if a:
            vqa_data.append({
                "image_path": image_path,
                "question": q,
                "answer": a
            })

print(f"Collected {len(vqa_data)} question-answer pairs.")
train_data, eval_data = train_test_split(vqa_data, test_size=0.2, random_state=42)


Collected 2466 question-answer pairs.


In [5]:
# Load model and processor
processor = BlipProcessor.from_pretrained(MODEL_NAME)
model = BlipForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)
# model.gradient_checkpointing_enable()

# Prepare for LoRA
model = prepare_model_for_kbit_training(model)

# Apply LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "value"],  # Update based on model internals if needed
    lora_dropout=0.05,
    bias="none",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

trainable params: 1,179,648 || all params: 385,852,220 || trainable%: 0.3057


In [6]:
def preprocess(example):
    image = Image.open(example["image_path"]).convert("RGB")
    inputs = processor(image, example["question"], return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    labels = processor.tokenizer(example["answer"], return_tensors="pt", padding="max_length", truncation=True, max_length=20)
    inputs["labels"] = labels["input_ids"]
    return {k: v.squeeze() for k, v in inputs.items()}

train_dataset = Dataset.from_list(train_data).map(preprocess)
eval_dataset = Dataset.from_list(eval_data).map(preprocess)


Map:   0%|          | 0/1972 [00:00<?, ? examples/s]

Map:   0%|          | 0/494 [00:00<?, ? examples/s]

In [7]:
def predict(model, entries, label="prediction"):
    preds = []
    for ex in tqdm(entries, desc=f"{label}"):
        try:
            image = Image.open(ex["image_path"]).convert("RGB")
            inputs = processor(image, ex["question"], return_tensors="pt").to(device)
            output = model.generate(**inputs, max_new_tokens=15)
            pred = processor.tokenizer.decode(output[0], skip_special_tokens=True).strip()
            preds.append(pred)
        except Exception as e:
            preds.append("ERROR")
            print(f"[ERROR] {ex['image_path']} - {e}")
    return preds


In [8]:
def evaluate_metrics(preds, refs):
    bertscore = evaluate.load("bertscore")
    bleu = evaluate.load("bleu")
    rouge = evaluate.load("rouge")
    meteor = evaluate.load("meteor")

    metrics = {}

    bert = bertscore.compute(predictions=preds, references=refs, lang="en", rescale_with_baseline=True)
    metrics["bertscore_f1"] = sum(bert["f1"]) / len(bert["f1"])

    metrics["bleu"] = bleu.compute(predictions=preds, references=[[r] for r in refs])["bleu"]
    metrics["meteor"] = meteor.compute(predictions=preds, references=refs)["meteor"]
    metrics["rougeL"] = rouge.compute(predictions=preds, references=refs)["rougeL"]

    return metrics


In [9]:
model_before = BlipForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)
preds_before = predict(model_before, eval_data, label="Before Finetuning")




Before Finetuning: 100%|██████████| 494/494 [00:46<00:00, 10.51it/s]


In [10]:
refs = [ex["answer"] for ex in eval_data]
metrics_before = evaluate_metrics(preds_before, refs)

print("Evaluation BEFORE fine-tuning:")
for k, v in metrics_before.items():
    print(f"{k}: {v:.4f}")

pd.DataFrame({
    "image_path": [ex["image_path"] for ex in eval_data],
    "question": [ex["question"] for ex in eval_data],
    "true_answer": refs,
    "predicted_answer": preds_before
}).to_csv("/content/drive/MyDrive/VR_Dataset/blip_vqa_before_finetune.csv", index=False)

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


📉 Evaluation BEFORE fine-tuning:
bertscore_f1: 0.7386
bleu: 0.0000
meteor: 0.2378
rougeL: 0.4106


In [11]:
training_args = TrainingArguments(
    output_dir="/content/blip_lora_finetuned",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    logging_dir="/content/logs",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True,
    report_to="none",
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

trainer.train()
trainer.save_model("/content/blip_lora_finetuned")
print("LoRA fine-tuned model saved.")


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch,Training Loss,Validation Loss
1,8.0716,No log
2,7.9244,No log
3,7.889,No log
4,7.8998,No log
5,7.873,No log


✅ LoRA fine-tuned model saved.


In [12]:
# Save the fine-tuned model and tokenizer
trainer.save_model("/content/drive/MyDrive/VR_Dataset/blip_finetuned_model")

# Also save the processor/tokenizer explicitly if needed
processor.save_pretrained("/content/drive/MyDrive/VR_Dataset/blip_finetuned_model")

[]

In [13]:
from bert_score import score as bertscore

preds = predict(model, eval_data, label="Eval Predictions")
refs = [ex["answer"] for ex in eval_data]
P, R, F1 = bertscore(preds, refs, lang="en", rescale_with_baseline=True)
avg_f1 = F1.mean().item()
print(f"BERTScore F1 (Eval Set): {avg_f1:.4f}")

results_df = pd.DataFrame({
    "image_path": [ex["image_path"] for ex in eval_data],
    "question": [ex["question"] for ex in eval_data],
    "true_answer": refs,
    "predicted_answer": preds,
    "bertscore_f1": F1.tolist()
})

results_df.to_csv("/content/drive/MyDrive/VR_Dataset/blip_vqa_eval_results.csv", index=False)
print("📄 Evaluation results saved.")


Eval Predictions: 100%|██████████| 494/494 [00:56<00:00,  8.79it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔍 BERTScore F1 (Eval Set): 0.7739
📄 Evaluation results saved.


In [26]:
processor = BlipProcessor.from_pretrained(MODEL_NAME)
model = BlipForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)
# model.gradient_checkpointing_enable()

# Freeze some layers to reduce GPU load and speed up training
for name, param in model.named_parameters():
    if "text_encoder.encoder.layer." in name and not name.startswith("text_encoder.encoder.layer.11"):
        param.requires_grad = False
    if "vision_model.encoder" in name:
        param.requires_grad = False


from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "key", "value", "q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, lora_config)

In [27]:
training_args = TrainingArguments(
    output_dir="/content/blip_lora_finetuned",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    logging_dir="/content/logs",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True,
    report_to="none",
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [21]:
preds_before = predict(model, eval_data, label="Before Fine-Tuning")

Before Fine-Tuning: 100%|██████████| 494/494 [01:08<00:00,  7.21it/s]


In [28]:
trainer.train()
trainer.save_model("/content/blip_lora_finetuned")

Epoch,Training Loss,Validation Loss
1,8.0602,No log
2,7.9149,No log
3,7.8825,No log
4,7.8897,No log
5,7.8662,No log


In [29]:
#Saving model weights
trainer.save_model("/content/drive/MyDrive/VR_Dataset/blip_finetuned_freezing_model")

processor.save_pretrained("/content/drive/MyDrive/VR_Dataset/blip_finetuned_freezing_model")

[]

In [31]:
preds_after = predict(model, eval_data, label="After Fine-Tuning")

After Fine-Tuning: 100%|██████████| 494/494 [01:04<00:00,  7.72it/s]


In [32]:
refs = [ex["answer"] for ex in eval_data]
P_before, R_before, F1_before = bertscore(preds_before, refs, lang="en", rescale_with_baseline=True)
P_after, R_after, F1_after = bertscore(preds_after, refs, lang="en", rescale_with_baseline=True)

print(f"BERTScore F1 Before Fine-Tuning: {F1_before.mean().item():.4f}")
print(f"BERTScore F1 After Fine-Tuning:  {F1_after.mean().item():.4f}")

results_df = pd.DataFrame({
    "image_path": [ex["image_path"] for ex in eval_data],
    "question": [ex["question"] for ex in eval_data],
    "true_answer": refs,
    "pred_before": preds_before,
    "pred_after": preds_after,
    "bertscore_f1_before": F1_before.tolist(),
    "bertscore_f1_after": F1_after.tolist()
})

results_df.to_csv("/content/drive/MyDrive/VR_Dataset/blip_vqa_lora_results.csv", index=False)
print("📄 Evaluation saved to Google Drive.")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore F1 Before Fine-Tuning: 0.7386
BERTScore F1 After Fine-Tuning:  0.7697
📄 Evaluation saved to Google Drive.


In [14]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [15]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bertscore

def evaluate_metrics(csv_path, pred_col="predicted_answer", ref_col="true_answer"):
    """
    Evaluates Exact Match, BLEU-1, and BERTScore F1 from a CSV.

    Args:
        csv_path (str): Path to the CSV file.
        pred_col (str): Column name for predicted answers.
        ref_col (str): Column name for true answers.

    Returns:
        dict: Dictionary containing 'exact_match', 'bleu1', and 'bertscore_f1'.
    """
    df = pd.read_csv(csv_path)

    preds = df[pred_col].astype(str).str.strip().str.lower().tolist()
    refs = df[ref_col].astype(str).str.strip().str.lower().tolist()

    # Exact Match
    exact_matches = [pred == ref for pred, ref in zip(preds, refs)]
    exact_match_score = sum(exact_matches) / len(exact_matches)

    # BLEU-1 Score
    smoothie = SmoothingFunction().method1
    bleu1_scores = [
        sentence_bleu([[ref]], pred, weights=(1.0, 0, 0, 0), smoothing_function=smoothie)
        for pred, ref in zip(preds, refs)
    ]
    avg_bleu1 = sum(bleu1_scores) / len(bleu1_scores)

    # BERTScore F1
    _, _, F1 = bertscore(preds, refs, lang="en", rescale_with_baseline=True)
    avg_bertscore_f1 = F1.mean().item()

    print(f"Exact Match Accuracy: {exact_match_score:.4f}")
    print(f"BLEU-1 Score: {avg_bleu1:.4f}")
    print(f"BERTScore F1: {avg_bertscore_f1:.4f}")

    return {
        "exact_match": exact_match_score,
        "bleu1": avg_bleu1,
        "bertscore_f1": avg_bertscore_f1
    }


In [18]:
evaluate_metrics("/content/drive/MyDrive/VR_Dataset/blip_vqa_eval_results.csv")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Exact Match Accuracy: 0.5344
🟦 BLEU-1 Score: 0.0020
🔍 BERTScore F1: 0.8721


{'exact_match': 0.5344129554655871,
 'bleu1': 0.0020242914979757085,
 'bertscore_f1': 0.8721415400505066}

In [19]:
evaluate_metrics("/content/drive/MyDrive/VR_Dataset/blip_vqa_before_finetune.csv")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Exact Match Accuracy: 0.4028
🟦 BLEU-1 Score: 0.0020
🔍 BERTScore F1: 0.8231


{'exact_match': 0.402834008097166,
 'bleu1': 0.0020242914979757085,
 'bertscore_f1': 0.8230882287025452}

In [20]:
evaluate_metrics("/content/drive/MyDrive/VR_Dataset/blip_vqa_lora_results.csv","pred_before","true_answer")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Exact Match Accuracy: 0.4028
🟦 BLEU-1 Score: 0.0020
🔍 BERTScore F1: 0.8231


{'exact_match': 0.402834008097166,
 'bleu1': 0.0020242914979757085,
 'bertscore_f1': 0.8230882287025452}

In [21]:
evaluate_metrics("/content/drive/MyDrive/VR_Dataset/blip_vqa_lora_results.csv","pred_after","true_answer")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Exact Match Accuracy: 0.5344
🟦 BLEU-1 Score: 0.0020
🔍 BERTScore F1: 0.8709


{'exact_match': 0.5344129554655871,
 'bleu1': 0.0020242914979757085,
 'bertscore_f1': 0.8708845973014832}