In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('/kaggle/input/qna-final/qna_final.csv')
print(data.columns)

# Split into train (80%) and val (20%) with shuffling and fixed random seed for reproducibility
train_df, val_df = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}")


Index(['Image_ID', 'Item_ID', 'Question', 'Answer', 'Image_Path'], dtype='object')
Train size: 19109, Validation size: 4778


In [2]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import os
import string



class VQADataset(Dataset):
    def __init__(self, data, image_dir, processor):
        self.data = data
        self.image_dir = image_dir
        self.processor = processor
    def normalize(self, text):
        text = text.strip().lower()
        text = text.translate(str.maketrans("", "", string.punctuation))
        return text

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        question = self.normalize(item["Question"])
        answer = self.normalize(item["Answer"])
        image_path = os.path.join(self.image_dir, item["Image_Path"])
        image = Image.open(image_path).convert("RGB")

        # Tokenize inputs
        inputs = self.processor(image, question,padding="max_length", 
                                max_length=64,
                                 return_tensors="pt")
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        inputs["labels"] = self.processor.tokenizer(
                            answer,
                            padding="max_length",       # pad answer to max length
                            max_length=64,  # match input length or set your own max_length
                            
                            return_tensors="pt"
                        )["input_ids"].squeeze(0)

        return inputs


In [3]:
from transformers import BlipProcessor

# Load processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")

# Initialize dataset
train_dataset = VQADataset(
    data = train_df, 
    image_dir="/kaggle/input/vqa-images", 
    processor=processor
)
val_dataset = VQADataset(
    data = val_df, 
    image_dir="/kaggle/input/vqa-images", 
    processor=processor
)
# Test sample
sample = train_dataset[0]
print({k: v.shape for k, v in sample.items()})
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, shuffle=True, batch_size=8)


2025-05-18 04:01:12.938986: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747540873.134526      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747540873.190354      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

{'pixel_values': torch.Size([3, 384, 384]), 'input_ids': torch.Size([64]), 'attention_mask': torch.Size([64]), 'labels': torch.Size([64])}


In [4]:
from transformers import BlipProcessor, BlipForQuestionAnswering
from peft import get_peft_model, LoraConfig, TaskType
import torch

model_id = "Salesforce/blip-vqa-base"
processor = BlipProcessor.from_pretrained(model_id)
model = BlipForQuestionAnswering.from_pretrained(model_id)

# Configure LoRA
lora_config = LoraConfig(
    r=16,               # rank
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "value"]  # common attention projection layers
    #task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)
from transformers import BlipProcessor

processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
tokenizer = processor.tokenizer


# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

PeftModel(
  (base_model): LoraModel(
    (model): BlipForQuestionAnswering(
      (vision_model): BlipVisionModel(
        (embeddings): BlipVisionEmbeddings(
          (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
        )
        (encoder): BlipEncoder(
          (layers): ModuleList(
            (0-11): 12 x BlipEncoderLayer(
              (self_attn): BlipAttention(
                (dropout): Dropout(p=0.0, inplace=False)
                (qkv): Linear(in_features=768, out_features=2304, bias=True)
                (projection): Linear(in_features=768, out_features=768, bias=True)
              )
              (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
              (mlp): BlipMLP(
                (activation_fn): GELUActivation()
                (fc1): Linear(in_features=768, out_features=3072, bias=True)
                (fc2): Linear(in_features=3072, out_features=768, bias=True)
              )
              (layer_norm2): Lay

In [None]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
import random
from tqdm import tqdm

#train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
#val_loader = DataLoader(val_dataset, shuffle=True, batch_size=8)
num_epochs = 20
optimizer = AdamW(model.parameters(), lr=5e-5)

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

best_val_acc = 0.0

def decode_predictions(generated_ids, tokenizer):
    return [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]

def decode_labels(label_ids, tokenizer):
    labels = label_ids.cpu().tolist()
    return [tokenizer.decode(l, skip_special_tokens=True) for l in labels]

for epoch in range(num_epochs):
    model.train()
    print(f"Epoch {epoch+1}/{num_epochs}")
    
    # Training loop with tqdm
    train_iter = tqdm(train_loader, desc="Training", leave=False)
    for batch_idx, batch in enumerate(train_iter):

        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)
        attention_mask = batch.pop("attention_mask").to(device)
        labels = batch.pop("labels").to(device)

        outputs = model(input_ids=input_ids, pixel_values=pixel_values, 
                        attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        train_iter.set_postfix(loss=loss.item())
        """if batch_idx ==10:
            break"""

    model.eval()
    total_correct = 0
    total_count = 0
    
    with torch.no_grad():
        val_iter = tqdm(val_loader, desc="Validation", leave=False)
        for batch_idx, batch in enumerate(val_iter):

            input_ids = batch.pop("input_ids").to(device)
            pixel_values = batch.pop("pixel_values").to(device)
            attention_mask = batch.pop("attention_mask").to(device)
            labels = batch.pop("labels").to(device)
    
            # ⬇️ Use generate instead of logits
            generated_ids = model.generate(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask,
                max_new_tokens=20  # Set as needed
            )
            
            # Decode predictions
            pred_texts = [tokenizer.decode(g, skip_special_tokens=True) for g in generated_ids]
            label_texts = decode_labels(labels, tokenizer)
    
            batch_correct = sum(p.strip().lower() == l.strip().lower()
                                for p, l in zip(pred_texts, label_texts))
            total_correct += batch_correct
            total_count += len(pred_texts)
    
            val_iter.set_postfix(acc=total_correct / total_count)
            """if batch_idx == 10:
                break"""


    val_acc = total_correct / total_count
    print(f"Validation Accuracy: {val_acc:.4f}")

    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model.pt")
        print("Saved best model!")

    """# Print 2 random train samples
    train_samples = random.sample(list(train_dataset), 2)
    print("\nSample train predictions:")
    model.eval()
    with torch.no_grad():
        for sample in train_samples:
            # Prepare inputs
            inputs = {k: v.unsqueeze(0).to(device) for k, v in sample.items() if k != 'labels'}
            labels = sample['labels'].unsqueeze(0).to(device)
    
            # Generate prediction
            generated_ids = model.generate(**inputs, max_new_tokens=20)
            pred_text = decode_predictions(generated_ids, tokenizer)[0]
            label_text = decode_labels(labels, tokenizer)[0]
    
            print(f"Predicted: {pred_text}")
            print(f"Actual:    {label_text}")
    
    # Print 2 random val samples
    val_samples = random.sample(list(val_dataset), 2)
    print("\nSample val predictions:")
    with torch.no_grad():
        for sample in val_samples:
            inputs = {k: v.unsqueeze(0).to(device) for k, v in sample.items() if k != 'labels'}
            labels = sample['labels'].unsqueeze(0).to(device)
    
            generated_ids = model.generate(**inputs, max_new_tokens=20)
            pred_text = decode_predictions(generated_ids, tokenizer)[0]
            label_text = decode_labels(labels, tokenizer)[0]
    
            print(f"Predicted: {pred_text}")
            print(f"Actual:    {label_text}")
    print("\n" + "=" * 40 + "\n")"""



Epoch 1/20


                                                                        

Validation Accuracy: 0.5429
Saved best model!
Epoch 2/20


                                                                        

Validation Accuracy: 0.4774
Epoch 3/20


                                                                        

Validation Accuracy: 0.4077
Epoch 4/20


                                                                        

Validation Accuracy: 0.4494
Epoch 5/20


                                                                        

Validation Accuracy: 0.5795
Saved best model!
Epoch 6/20


                                                                        

Validation Accuracy: 0.5950
Saved best model!
Epoch 7/20


                                                                        

Validation Accuracy: 0.6365
Saved best model!
Epoch 8/20


                                                                        

Validation Accuracy: 0.6141
Epoch 9/20


                                                                        

Validation Accuracy: 0.6373
Saved best model!
Epoch 10/20


                                                                        

Validation Accuracy: 0.6241
Epoch 11/20


                                                                        

Validation Accuracy: 0.6570
Saved best model!
Epoch 12/20


                                                                        

Validation Accuracy: 0.6342
Epoch 13/20


                                                                        

Validation Accuracy: 0.6319
Epoch 14/20


                                                                        

Validation Accuracy: 0.6666
Saved best model!
Epoch 15/20


Training:   3%|▎         | 68/2389 [00:51<29:10,  1.33it/s, loss=8.33]

In [56]:
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import random
from tqdm import tqdm
model.load_state_dict(torch.load("/kaggle/working/best_model.pt"))

# DataLoaders
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader   = DataLoader(val_dataset,   shuffle=False, batch_size=8)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer + Scheduler
optimizer = AdamW(
    model.parameters(),
    lr=5e-5,
    weight_decay=1e-2,           # 🔹 add weight decay
)
total_steps  = len(train_loader) * 20  # 20 epochs
warmup_steps = int(0.1 * total_steps)  # 10% warmup
scheduler    = get_linear_schedule_with_warmup(
    optimizer, warmup_steps, total_steps
)

# For mixed precision
scaler = torch.cuda.amp.GradScaler()

# Early stopping
best_val_acc = 0.0
patience, wait = 3, 0

def decode_predictions(generated_ids, tokenizer):
    return [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids]

def decode_labels(label_ids, tokenizer):
    labels = label_ids.cpu().tolist()
    return [tokenizer.decode(l, skip_special_tokens=True) for l in labels]

for epoch in range(1, 21):
    model.train()
    print(f"\n→ Epoch {epoch}/20")
    train_iter = tqdm(train_loader, desc=" Training", leave=False)

    for batch_idx, batch in enumerate(train_iter):
        input_ids      = batch.pop("input_ids").to(device)
        pixel_values   = batch.pop("pixel_values").to(device)
        attention_mask = batch.pop("attention_mask").to(device)
        labels         = batch.pop("labels").to(device)

        optimizer.zero_grad()
        with torch.cuda.amp.autocast():  # 🔹 mixed precision
            outputs = model(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # 🔹 step LR every batch

        train_iter.set_postfix(batch=batch_idx+1, loss=f"{loss.item():.3f}")

    # —— Validation ——  
    model.eval()
    total_correct, total_count = 0, 0
    val_iter = tqdm(val_loader, desc=" Validation", leave=False)

    with torch.no_grad():
        for batch_idx, batch in enumerate(val_iter):
            input_ids      = batch.pop("input_ids").to(device)
            pixel_values   = batch.pop("pixel_values").to(device)
            attention_mask = batch.pop("attention_mask").to(device)
            labels         = batch.pop("labels").to(device)

            # 🔹 Beam search decoding
            generated_ids = model.generate(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask,
                max_new_tokens=20,
                num_beams=3,
                early_stopping=True
            )

            pred_texts  = decode_predictions(generated_ids, tokenizer)
            label_texts = decode_labels(labels, tokenizer)

            for p, l in zip(pred_texts, label_texts):
                if p.strip().lower() == l.strip().lower():
                    total_correct += 1
                total_count += 1

            val_iter.set_postfix(acc=f"{100*total_correct/total_count:.2f}%")

    val_acc = total_correct / total_count
    print(f"→ Validation Accuracy: {val_acc*100:.2f}%")

    # Early‑stop & checkpoint
    if val_acc > best_val_acc:
        best_val_acc, wait = val_acc, 0
        torch.save(model.state_dict(), "best_model.pt")
        print("✔️  Saved best model!")
    else:
        wait += 1
        if wait >= patience:
            print(f"⏹ Early stopping at epoch {epoch}")
            break

# …after 20 epochs you can reload and continue training:
# model.load_state_dict(torch.load("best_model.pt"))
# then run another loop (epochs 21–40) with the same setup.


Batch keys: dict_keys(['pixel_values', 'input_ids', 'attention_mask', 'labels'])


# Inference

In [4]:
# install dependencies
!pip install bert-score
!git clone https://github.com/neulab/BARTScore.git
# 1. Add BARTScore to path
import sys
sys.path.append("./BARTScore")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.0.0->bert-score)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.0.0->bert-score)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=1.0.0->bert-score)
  

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


remote: Enumerating objects: 220, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 220 (delta 18), reused 14 (delta 14), pack-reused 194 (from 1)[K
Receiving objects: 100% (220/220), 101.98 MiB | 23.74 MiB/s, done.
Resolving deltas: 100% (47/47), done.
Updating files: 100% (192/192), done.


In [6]:
import sys
# 1. Add BARTScore to path
sys.path.append("./BARTScore")

In [5]:
import sys, time, torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from bert_score import score as bert_score
from peft import PeftModel
from transformers import BlipProcessor, BlipForQuestionAnswering
from peft import get_peft_model, LoraConfig
from bart_score import BARTScorer  # assume path already added
import gc

# 1. Device

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print("1 done")

# 2. Load processor & base model (full precision)
model_id = "Salesforce/blip-vqa-base"
processor = BlipProcessor.from_pretrained(model_id)
tokenizer = processor.tokenizer
print("2 done")

model = BlipForQuestionAnswering.from_pretrained(model_id).to(device)
print("2 done")

# 3. Attach LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "value"]
)
model = get_peft_model(model, lora_config)
print("3 done")

# 4. Load adapter weights
adapter_state = torch.load("/kaggle/input/blip_best/transformers/default/1/best_model.pt", map_location="cpu")
model.load_state_dict(adapter_state, strict=False)
print("4 done")

# 5. Build label ↔ ID mapping
label_list = sorted(set(tokenizer.decode(sample["labels"], skip_special_tokens=True).strip().lower()
                        for sample in val_dataset))
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}
print("5 done")

# 6. Evaluation loop
all_preds, all_labels = [], []
all_pred_ids, all_true_ids = [], []
print("6 started")

t0 = time.time()
model.eval()
with torch.no_grad():
    print("6 loop started")
    for batch in tqdm(val_loader, desc="Evaluating", mininterval=2.0):
        input_ids = batch["input_ids"].to(device)
        pixel_values = batch["pixel_values"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"]

        # Generate predictions
        generated_ids = model.generate(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            max_new_tokens=20,
            num_beams=3,
            early_stopping=True
        )

        preds = [tokenizer.decode(g, skip_special_tokens=True).strip().lower()
                 for g in generated_ids]
        trues = [tokenizer.decode(l, skip_special_tokens=True).strip().lower()
                 for l in labels]

        all_preds.extend(preds)
        all_labels.extend(trues)
        all_pred_ids.extend([label2id.get(p, -1) for p in preds])
        all_true_ids.extend([label2id.get(t, -1) for t in trues])

        # Free memory
        del input_ids, pixel_values, attention_mask, generated_ids
        torch.cuda.empty_cache()

t1 = time.time()
print(f"\n✅ Inference completed in {t1 - t0:.1f}s")

# 7. Classification Metrics
mask = [(t >= 0 and p >= 0) for t, p in zip(all_true_ids, all_pred_ids)]
y_true = [t for m, t in zip(mask, all_true_ids) if m]
y_pred = [p for m, p in zip(mask, all_pred_ids) if m]

acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, average="macro", zero_division=0)
rec = recall_score(y_true, y_pred, average="macro", zero_division=0)
f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)

print("\n=== Classification Metrics ===")
print(f"Accuracy       : {acc:.4f}")
print(f"Precision (M)  : {prec:.4f}")
print(f"Recall    (M)  : {rec:.4f}")
print(f"F1 Score  (M)  : {f1:.4f}")

# 8. BERTScore (semantic similarity)
print("\nComputing BERTScore...")
t0 = time.time()
p, r, f = bert_score(
    all_preds, all_labels,
    lang="en", model_type="bert-base-uncased",
    rescale_with_baseline=True
)
t1 = time.time()
print("=== BERTScore ===")
print(f"P: {p.mean().item():.4f}  R: {r.mean().item():.4f}  F1: {f.mean().item():.4f}")
print(f"Computed in {t1 - t0:.1f}s")

# 9. BARTScore (semantic entailment) on CPU
print("\nComputing BARTScore...")
t0 = time.time()
bart_scorer = BARTScorer(device="cpu", checkpoint="facebook/bart-large-cnn")
bart_scores = bart_scorer.score(all_preds, all_labels, batch_size=4)
mean_bart = sum(bart_scores) / len(bart_scores)
print("=== BARTScore ===")
print(f"Mean score: {mean_bart:.4f}")
print(f"Computed in {time.time() - t0:.1f}s")

# 10. Cleanup
del model, processor, tokenizer, bart_scorer
gc.collect()
torch.cuda.empty_cache()


config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Evaluating: 100%|██████████| 598/598 [1:38:53<00:00,  9.92s/it]


✅ Inference completed in 5933.5s

=== Classification Metrics ===
Accuracy       : 0.7564
Precision (M)  : 0.2813
Recall    (M)  : 0.2880
F1 Score  (M)  : 0.2546

Computing BERTScore...





tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

=== BERTScore ===
P: 0.8245  R: 0.8127  F1: 0.8170
Computed in 12.2s

Computing BARTScore...


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

=== BARTScore ===
Mean score: -3.6180
Computed in 378.8s
