In [6]:
import os
# ─────────────────────────────────────────────────────────────
# Only GPU 0 visible
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Disable WandB / multi-node
os.environ["WANDB_DISABLED"] = "true"
# ─────────────────────────────────────────────────────────────

import sys
import traceback
import pandas as pd
import torch
from PIL import Image
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split

from transformers import (
    BlipProcessor,
    BlipForQuestionAnswering,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# --- CONFIG ---
DATASET_CSV      = '../VR-mini-Proj-2/fullInput.csv'
IMAGE_BASE_DIR   = '../images/small'
MODEL_NAME       = "Salesforce/blip-vqa-base"
USE_4BIT         = True
BATCH_SIZE       = 8
EVAL_BATCH_SIZE  = 16
NUM_EPOCHS       = 3
LR               = 5e-5
LORA_R           = 16
LORA_ALPHA       = 32
LORA_DROPOUT     = 0.05
MAX_LEN          = 128
OUTPUT_DIR       = "./blip_vqa_lora_q_v8"
LORA_DIR         = os.path.join(OUTPUT_DIR, "lora_adapters")
DEVICE         = "cuda" if torch.cuda.is_available() else "cpu"

print("CUDA available:", torch.cuda.is_available(),
      "Device count:", torch.cuda.device_count())

CUDA available: False Device count: 0


In [7]:
# --- DATASET CLASS ---
class VQADataset(torch.utils.data.Dataset):
    def __init__(self, df, processor, img_dir, max_len):
        self.df = df.reset_index(drop=True)
        self.proc = processor
        self.img_dir = img_dir
        self.max_len = max_len
        valid = []
        for i, r in tqdm(self.df.iterrows(), total=len(self.df), desc="Checking images"):
            if os.path.exists(os.path.join(img_dir, str(r['filename']))):
                valid.append(i)
        if not valid:
            raise RuntimeError("No valid images found.")
        self.df = self.df.loc[valid].reset_index(drop=True)

    def __len__(self): return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img = Image.open(os.path.join(self.img_dir, str(row['filename']))).convert("RGB")
        enc = self.proc(
            images=img,
            text=str(row['question']),
            text_target=str(row['answer']),
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        return {k: v.squeeze(0) for k, v in enc.items()}

In [10]:
# --- LOAD & SPLIT CSV ---
df = pd.read_csv(DATASET_CSV)
if 'filename' not in df.columns:
    sys.exit("ERROR: 'filename' column missing in CSV")
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# --- PROCESSOR & 4-BIT SETUP ---
processor = BlipProcessor.from_pretrained(MODEL_NAME, use_fast=True)
model_kwargs = {}
if USE_4BIT:
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=(
            torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        ),
        bnb_4bit_use_double_quant=True,
    )
    model_kwargs["quantization_config"] = quant_cfg
    model_kwargs["torch_dtype"] = (
        torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    )
    print("4-bit quantization configured (dtype:", model_kwargs["torch_dtype"], ")")

# --- LOAD & PREPARE MODEL ---
model = BlipForQuestionAnswering.from_pretrained(
    MODEL_NAME,
    device_map="auto",      # pins everything to cuda:0
    **model_kwargs
)
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

lora_cfg = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "k_proj", "v_proj", "query", "key", "value"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

4-bit quantization configured (dtype: torch.float16 )
trainable params: 3,538,944 || all params: 388,211,516 || trainable%: 0.9116


In [11]:
# --- Datasets ---
train_ds = VQADataset(train_df, processor, IMAGE_BASE_DIR, MAX_LEN)
val_ds   = VQADataset(val_df,   processor, IMAGE_BASE_DIR, MAX_LEN)

# --- TRAINING ARGS (no best-model) ---
warmup = max(100, int(0.1 * len(train_ds) * NUM_EPOCHS / BATCH_SIZE))
logs   = max(10,  int(len(train_ds) / BATCH_SIZE / 10))
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=warmup,
    learning_rate=LR,
    weight_decay=0.01,
    logging_strategy="steps",
    logging_steps=logs,
    eval_strategy="epoch",
    save_strategy="epoch",
    # drop load_best_model_at_end & metric_for_best_model
    eval_accumulation_steps=1,
    prediction_loss_only=True,
    report_to="tensorboard",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    fp16=torch.cuda.is_available(),
)



Checking images: 100%|██████████| 39708/39708 [00:14<00:00, 2651.60it/s]
Checking images: 100%|██████████| 9928/9928 [00:02<00:00, 3561.05it/s]


In [None]:
# --- CUSTOM TRAINER (match signature) ---
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        global trainer
        trainer = self

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        out = model(**inputs)
        loss = out.loss if hasattr(out, "loss") else out[0]
        return (loss, out) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

# --- RUN TRAINING ---
print("Starting 4-bit LoRA fine-tuning…")
try:
    trainer.train()
    trainer.model.save_pretrained(LORA_DIR)
    processor.save_pretrained(LORA_DIR)
    print("Saved LoRA adapters to", LORA_DIR)
except Exception as e:
    print("Training failed:", e)
    traceback.print_exc()
    sys.exit(1)


CUDA available: True Device count: 1
4-bit quantization ready: torch.bfloat16
trainable params: 3,538,944 || all params: 388,211,516 || trainable%: 0.9116


Checking images:   0%|          | 0/39708 [00:00<?, ?it/s]

Checking images:   0%|          | 0/9928 [00:00<?, ?it/s]

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting 4-bit LoRA fine-tuning…


Epoch,Training Loss,Validation Loss
1,8.4816,No log
2,8.4742,No log
3,8.4651,No log


Saved LoRA adapters to ./blip_vqa_lora_q_final/lora_adapters


In [36]:
print("\n--- Starting Post-Training Evaluation ---")

# --- CONFIG FOR EVALUATION (Mostly inherited or defined above) ---
SAVED_LORA_ADAPTER_DIR_EVAL = LORA_DIR # Adapters just saved
USE_4BIT_FOR_BASE_MODEL_EVAL = USE_4BIT # Match training quantization
# For evaluation, we'll use the same val_df_for_training split,
# or you can define a new df for a dedicated test set.
eval_df_for_final_eval = val_df.copy()


--- Starting Post-Training Evaluation ---


In [27]:
try:
    eval_processor = BlipProcessor.from_pretrained(SAVED_LORA_ADAPTER_DIR_EVAL, use_fast=True)
    print("Evaluation processor loaded from adapter directory.")
except Exception as e_proc:
    print(f"Could not load eval processor from adapter dir: {e_proc}. Using training processor.")
    eval_processor = processor # Fallback to the one used in training

Evaluation processor loaded from adapter directory.


In [28]:
# --- LOAD BASE MODEL AND APPLY BEST LoRA ADAPTER FOR EVALUATION ---
print("Loading base model for final evaluation...")
model_kwargs_eval = {}
if USE_4BIT_FOR_BASE_MODEL_EVAL:
    # Use the same compute_dtype determined during training setup
    compute_dtype_eval = model_kwargs.get("torch_dtype", torch.float16)
    quant_cfg_eval = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype_eval,
        bnb_4bit_use_double_quant=True,
    )
    model_kwargs_eval["quantization_config"] = quant_cfg_eval
    model_kwargs_eval["torch_dtype"] = compute_dtype_eval
    print(f"Base model for final eval will be loaded with 4-bit. Compute dtype: {compute_dtype_eval}")

base_model_eval = BlipForQuestionAnswering.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    **model_kwargs_eval
)

Loading base model for final evaluation...
Base model for final eval will be loaded with 4-bit. Compute dtype: torch.bfloat16


In [29]:
if USE_4BIT_FOR_BASE_MODEL_EVAL and hasattr(base_model_eval, 'is_loaded_in_4bit') and base_model_eval.is_loaded_in_4bit:
    print("Base model for final evaluation successfully loaded in 4-bit.")

Base model for final evaluation successfully loaded in 4-bit.


In [30]:
print(f"Loading best LoRA weights from {SAVED_LORA_ADAPTER_DIR_EVAL} for final evaluation...")
eval_model = PeftModel.from_pretrained(base_model_eval, SAVED_LORA_ADAPTER_DIR_EVAL)
eval_model.eval() # Set to evaluation mode
# Optional: Merge for potentially faster inference
# eval_model = eval_model.merge_and_unload()
# print("LoRA adapters merged into base model for final evaluation.")
print("Best LoRA adapters loaded successfully for final evaluation.")

Loading best LoRA weights from ./blip_vqa_lora_q_final/lora_adapters for final evaluation...
Best LoRA adapters loaded successfully for final evaluation.


In [31]:
eval_model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): BlipForQuestionAnswering(
      (vision_model): BlipVisionModel(
        (embeddings): BlipVisionEmbeddings(
          (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
        )
        (encoder): BlipEncoder(
          (layers): ModuleList(
            (0-11): 12 x BlipEncoderLayer(
              (self_attn): BlipAttention(
                (dropout): Dropout(p=0.0, inplace=False)
                (qkv): Linear4bit(in_features=768, out_features=2304, bias=True)
                (projection): Linear4bit(in_features=768, out_features=768, bias=True)
              )
              (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (mlp): BlipMLP(
                (activation_fn): GELUActivation()
                (fc1): Linear4bit(in_features=768, out_features=3072, bias=True)
                (fc2): Linear4bit(in_features=3072, out_features=768, bias=True)
              )
              (l

In [38]:
# --- CREATE EVALUATION DATASET & DATALOADER FOR FINAL EVAL ---
print("Creating dataset for final evaluation...")
final_eval_dataset = VQADataset(eval_df_for_final_eval, eval_processor, IMAGE_BASE_DIR, MAX_LEN)

Creating dataset for final evaluation...


Checking images:   0%|          | 0/9928 [00:00<?, ?it/s]

In [41]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [48]:
import math

predictions_ft = []
ground_truths_normalized_ft = []
original_indices_ft = []

num_batches_eval = math.ceil(len(eval_df_for_final_eval) / EVAL_BATCH_SIZE)

In [50]:
with torch.no_grad():
    for i in tqdm(range(0, len(eval_df_for_final_eval), EVAL_BATCH_SIZE), total=num_batches_eval, desc="Evaluating Fine-tuned Model"):
        batch_df = eval_df_for_final_eval[i:i+EVAL_BATCH_SIZE]
        
        batch_images_pil = []
        batch_questions = []
        current_batch_ground_truths = [] # Ground truths for this specific batch
        current_batch_original_indices = [] # Original indices for this specific batch

        for idx_in_batch, (original_df_idx, row) in enumerate(batch_df.iterrows()):
            question = str(row['question'])
            true_answer = str(row['answer']).lower().strip()
            # Use the 'filename' column
            image_filename = str(row['filename'])
            img_path = os.path.join(IMAGE_BASE_DIR, image_filename)

            try:
                raw_image = Image.open(img_path).convert('RGB')
                batch_images_pil.append(raw_image)
                batch_questions.append(question)
                current_batch_ground_truths.append(true_answer)
                current_batch_original_indices.append(original_df_idx)
            except FileNotFoundError:
                print(f"Warning (Eval): Image not found at {img_path}. Skipping.")
            except Exception as e:
                print(f"Warning (Eval): Error loading image {img_path}: {e}. Skipping.")

        if not batch_images_pil:
            print(f"Warning (Eval): No valid images for batch starting at {i}. Skipping.")
            continue

        inputs = eval_processor(images=batch_images_pil, text=batch_questions, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
        
        # Generate answers
        # For LoRA model, if it's merged, call is same. If separate, PEFT handles it.
        outputs = eval_model.generate(**inputs, max_new_tokens=10) # Keep consistent with baseline
        
        batch_preds_decoded = eval_processor.batch_decode(outputs, skip_special_tokens=True)

        for pred_idx, decoded_pred in enumerate(batch_preds_decoded):
            predicted_answer = decoded_pred.strip().lower()
            predicted_answer = re.sub(r'[^\w\s]', '', predicted_answer)

            true_answer_normalized = current_batch_ground_truths[pred_idx]
            true_answer_normalized = re.sub(r'[^\w\s]', '', true_answer_normalized)

            predictions_ft.append(predicted_answer)
            ground_truths_normalized_ft.append(true_answer_normalized)
            original_indices_ft.append(current_batch_original_indices[pred_idx])

Evaluating Fine-tuned Model:   0%|          | 0/621 [00:00<?, ?it/s]

In [51]:
results_ft_df = pd.DataFrame({
    'original_index': original_indices_ft,
    'predicted_answer_ft': predictions_ft,
    'ground_truth_normalized': ground_truths_normalized_ft # Name kept same for consistency
})

In [52]:
results_ft_df.head()

Unnamed: 0,original_index,predicted_answer_ft,ground_truth_normalized
0,7055,hard,hard
1,49141,gold,gold
2,40947,three,three
3,12004,yes,yes
4,7905,two,two


In [53]:
eval_df = eval_df_for_final_eval

In [54]:
df_with_ft_results = eval_df.merge(results_ft_df, left_index=True, right_on='original_index', how='right')

results_ft_filename = 'vqa_results_lora_finetuned.csv'
df_with_ft_results.to_csv(results_ft_filename, index=False)
print(f"Fine-tuned results saved to {results_ft_filename}")

Fine-tuned results saved to vqa_results_lora_finetuned.csv


In [55]:
df_with_ft_results.keys()

Index(['id', 'question', 'answer', 'filename', 'original_index',
       'predicted_answer_ft', 'ground_truth_normalized'],
      dtype='object')

In [59]:
# --- Calculate Metrics for Fine-tuned Model ---
from bert_score import score
import evaluate

if not predictions_ft:
    print("Error: No valid fine-tuned predictions available to calculate metrics.")
else:
    correct_predictions_ft = sum(p == gt for p, gt in zip(predictions_ft, ground_truths_normalized_ft))
    total_valid_ft = len(predictions_ft)
    accuracy_ft = correct_predictions_ft / total_valid_ft if total_valid_ft > 0 else 0
    pred= df_with_ft_results['predicted_answer_ft']
    gt = df_with_ft_results['ground_truth_normalized']
    print(f"Fine-tuned Model Accuracy (Exact Match): {accuracy_ft:.4f}")

    # BERTScore (optional, if you want to compare)!
    try:
        bertscore = evaluate.load("bertscore")
        # Ensure predictions and references are lists of strings
        # P, R, F1_bs  = score(pred.tolist(), gt.tolist(), lang='en', verbose=False, model_type='distilbert-base-uncased',rescale_with_baseline=True)
        P, R, F1_bs  = score(pred.tolist(), gt.tolist(), lang='en', verbose=False,rescale_with_baseline=True)
        P1, R1, F1_bs1  = score(pred.tolist(), gt.tolist(), lang='en', verbose=False)
        # avg_f1_bertscore = sum(bertscore_results['f1']) / len(bertscore_results['f1']) if bertscore_results['f1'] else 0
        print(f"▶ BERTScore → P: {P.mean().item():.4f}, R: {R.mean().item():.4f}, F1: {F1_bs.mean().item():.4f}")
        print(f"▶ BERTScore1 → P1: {P1.mean().item():.4f}, R: {R1.mean().item():.4f}, F1: {F1_bs1.mean().item():.4f}")
    except Exception as e:
        print(f"Could not compute BERTScore for fine-tuned model: {e}")

print("LoRA Fine-tuning and Evaluation Script Finished.")

Fine-tuned Model Accuracy (Exact Match): 0.5701


Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


▶ BERTScore → P: 0.8724, R: 0.8459, F1: 0.8572
▶ BERTScore1 → P1: 0.9785, R: 0.9740, F1: 0.9759
LoRA Fine-tuning and Evaluation Script Finished.
