In [None]:
import torch, json, random
import numpy as np
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm
from datasets import load_dataset, Image as HfImage
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from PIL import Image
import torch.nn.functional as F
from torch.cuda.amp import autocast


In [None]:
from datasets import load_dataset, Image as HfImage

# Test split of the Kvasir-VQA-x1 dataset; the evaluation subset is derived from it.
ds = load_dataset("SimulaMet/Kvasir-VQA-x1")["test"]

# Build the fixed 1500-example evaluation subset one step at a time:
# keep complexity-1 rows, shuffle deterministically, take the first 1500,
# tag each row with a running `val_id`, drop the columns listed below,
# and decode the `image` column as images.
val_set_task2 = ds.filter(lambda row: row["complexity"] == 1)
val_set_task2 = val_set_task2.shuffle(seed=42)
val_set_task2 = val_set_task2.select(range(1500))
val_set_task2 = val_set_task2.add_column("val_id", list(range(1500)))
val_set_task2 = val_set_task2.remove_columns(
    ["complexity", "answer", "original", "question_class"]
)
val_set_task2 = val_set_task2.cast_column("image", HfImage())

# Prefer the GPU when one is visible to PyTorch.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Interactive Hugging Face Hub login through the `hf` CLI; the flag asks for the
# token to be stored as a git credential as well.
# NOTE(review): the captured output of this cell includes the token name and its
# permission level — consider clearing the output before sharing the notebook.
!hf auth login --add-to-git-credential


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) V
Invalid input. Must be one of ('y', 'yes', '1', 'n', 'no', '0', '')
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
The token `VQA_GI_1` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot au

In [None]:
from peft import PeftModel
# Hub identifiers used below: the base PaliGemma 2 checkpoint and the repo that
# holds the PEFT adapter and the tokenizer.
BASE_CHECKPOINT = "google/paligemma2-3b-pt-224"
ADAPTER_REPO = "vishy395/Kvasir-VQA-x1-paligemma2"

# Base model in float16; device placement is delegated to `device_map="auto"`.
base_load_kwargs = dict(
    dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
base_model = PaliGemmaForConditionalGeneration.from_pretrained(
    BASE_CHECKPOINT, **base_load_kwargs
)

# Wrap the base model with the adapter weights.
model = PeftModel.from_pretrained(base_model, ADAPTER_REPO)

# Processor comes from the base checkpoint; its tokenizer is then replaced by
# the one stored in the adapter repo.
processor = AutoProcessor.from_pretrained(BASE_CHECKPOINT, trust_remote_code=True)
processor.tokenizer = processor.tokenizer.from_pretrained(ADAPTER_REPO)


config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/75.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.1M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/424 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/243k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.6M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/733 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/243k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/34.6M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/733 [00:00<?, ?B/s]

In [None]:
# Show the sub-modules of the vision tower's embedding block; the Grad-CAM hooks
# registered in a later cell attach to its `patch_embedding` layer.
print(model.vision_tower.vision_model.embeddings)


SiglipVisionEmbeddings(
  (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
  (position_embedding): Embedding(256, 1152)
)


In [None]:
import os

# Create the heatmap output directory together with its parent.
# `exist_ok=True` keeps the cell re-runnable: the previous pair of `os.mkdir`
# calls raised FileExistsError as soon as the cell was executed a second time.
os.makedirs("visuals/heatmaps", exist_ok=True)

In [None]:
# Freeze every parameter of the wrapped model, then switch gradients back on for
# the vision tower only (presumably so a backward pass reaches the hooked
# vision layer — TODO confirm this is the only reason).
model.requires_grad_(False)
model.vision_tower.requires_grad_(True)

# Turn on gradient checkpointing for the wrapped model.
model.gradient_checkpointing_enable()

# ---------------- HOOKS ---------------- #
# Module-level buffers that the two hooks below append to.
activations, gradients = [], []


def forward_hook(module, inp, out):
    """Forward hook: remember the output produced by the hooked module."""
    captured_output = out
    activations.append(captured_output)


def backward_hook(module, grad_in, grad_out):
    """Full-backward hook: remember the gradient w.r.t. the module's first output."""
    first_output_grad = grad_out[0]
    gradients.append(first_output_grad)

# Attach hooks to vision patch embeddings
# Grad-CAM target: the patch-embedding layer inside the vision model.
target_layer = model.vision_tower.vision_model.embeddings.patch_embedding

# Remove hook handles stored on the layer by an earlier run of this cell, so
# that re-running it does not stack duplicate hooks.
for handle_attr in ("gradcam_fh", "gradcam_bh"):
    if hasattr(target_layer, handle_attr):
        getattr(target_layer, handle_attr).remove()

# Register fresh hooks and keep their handles on the layer itself.
target_layer.gradcam_fh = target_layer.register_forward_hook(forward_hook)
target_layer.gradcam_bh = target_layer.register_full_backward_hook(backward_hook)

# ---------------- Grad-CAM Overlay ---------------- #
def save_cam_overlay(pil_img, cam, save_path):
    """Blend a Grad-CAM map over an image and write the result to disk.

    Args:
        pil_img: PIL image; it is converted to RGB before blending.
        cam: 2-D NumPy array holding the activation map, or ``None``.
        save_path: Destination path of the blended overlay image.

    Returns:
        ``(save_path, overlay)`` where ``overlay`` is an ``(H, W, 3)`` uint8
        array. When ``cam`` is ``None`` or empty nothing is written and
        ``(None, None)`` is returned, so callers can always unpack two values
        (a bare ``None`` used to make ``_, img = save_cam_overlay(...)`` fail).
    """
    # Validate the CAM first so an unusable map never touches the image.
    if cam is None or cam.size == 0:
        print(f"⚠️ Invalid CAM (empty), skipping overlay for {save_path}.")
        return None, None

    img = np.array(pil_img.convert("RGB"))
    H, W = img.shape[:2]

    # Work in float32: half-precision maps lose accuracy in the interpolation
    # and normalisation below.
    cam = np.asarray(cam, dtype=np.float32)

    # Replace NaN/inf so the min-max normalisation stays well defined
    # (NaN -> smallest finite value, +inf -> largest, -inf -> smallest).
    finite = np.isfinite(cam)
    if not finite.all():
        fill_hi = float(cam[finite].max()) if finite.any() else 0.0
        fill_lo = float(cam[finite].min()) if finite.any() else 0.0
        cam = np.nan_to_num(cam, nan=fill_lo, posinf=fill_hi, neginf=fill_lo)

    # Upsample the CAM to the image size. Indexing with [0, 0] (instead of
    # squeeze()) keeps the result 2-D even for images that are 1 pixel wide/tall.
    cam_tensor = torch.from_numpy(cam).unsqueeze(0).unsqueeze(0)  # (1,1,h,w)
    cam_resized = F.interpolate(
        cam_tensor, size=(H, W), mode="bilinear", align_corners=False
    )[0, 0].numpy()

    # Normalize CAM to the 0–1 range; the epsilon guards a constant map.
    cam_resized = (cam_resized - cam_resized.min()) / (cam_resized.max() - cam_resized.min() + 1e-8)

    # Colour the map with the "jet" colormap. The colormap registry replaces
    # the deprecated `matplotlib.cm.get_cmap`.
    import matplotlib
    colormap = matplotlib.colormaps["jet"]
    heatmap = (colormap(cam_resized)[:, :, :3] * 255).astype(np.uint8)

    # Blend heatmap with image (60 % image, 40 % heatmap).
    overlay = (0.6 * img + 0.4 * heatmap).astype(np.uint8)

    # Save overlay
    Image.fromarray(overlay).save(save_path)
    return save_path, overlay

# ---------------- Generation + Grad-CAM ---------------- #
def generate_with_confidence_and_cam(question, image, val_id):
    """Answer a question about an image and save a Grad-CAM overlay for it.

    The function (1) generates the answer text without gradients, (2) runs one
    gradient-enabled forward pass and back-propagates the highest next-token
    logit at the final prompt position, and (3) combines the hooked
    patch-embedding activations and gradients into a CAM that is blended over
    the image by ``save_cam_overlay``.

    Args:
        question: Question text; it is placed after an ``<image>`` placeholder.
        image: Image given to the processor and used as the overlay background.
        val_id: Identifier used to name the heatmap file.

    Returns:
        ``(answer, confidence, heatmap_path, hm_img)``: the decoded answer, the
        softmax probability of the back-propagated token, the heatmap file
        path, and the overlay array (``None`` when no overlay was produced).

    Raises:
        RuntimeError: If the hooks captured no activation or gradient, which
            means the hooked layer did not take part in the backward pass.
    """
    global activations, gradients
    activations, gradients = [], []
    model.eval()
    prompt = f"<image>\n{question}"
    inputs = processor(text=[prompt], images=[image],
                       return_tensors="pt", padding=True).to(device)
    # Tensors are already on `device`; only drop the keys that are not forwarded.
    inputs = {k: v for k, v in inputs.items()
              if k not in ['labels', 'attention_mask']}

    # Gradients are needed on the vision tower only.
    for p in model.parameters():
        p.requires_grad_(False)
    for p in model.vision_tower.parameters():
        p.requires_grad_(True)

    # Step 1: answer text (no gradients needed).
    with torch.no_grad():
        outputs = model.generate(**inputs)
    answer = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The decoded text still contains the prompt; drop the echoed question line.
    for line in answer.splitlines():
        if line.strip().lower().startswith(question.lower()):
            answer = answer.replace(line, '').strip()

    # generate() also triggered the forward hook; start from clean buffers so
    # the activation used below is the one from the gradient-enabled pass.
    activations, gradients = [], []

    # Step 2: gradient-enabled forward pass.
    outputs = model(**inputs)
    logits = outputs.logits  # (B, seq, vocab)
    last_logits = logits[0, -1]

    # Target = most likely next token at the final prompt position.
    target_token_id = torch.argmax(last_logits)
    target_logit = last_logits[target_token_id]

    # Single backward pass, so the graph does not need to be retained.
    model.zero_grad()
    target_logit.backward()

    if not activations or not gradients:
        raise RuntimeError(
            "Grad-CAM hooks captured no activations/gradients; check that the "
            "hooks are registered and that the hooked layer requires gradients."
        )

    # Step 3: Grad-CAM, computed in float32 to avoid half-precision
    # overflow/rounding in the channel sum.
    act = activations[-1].detach().float()              # (B, C, H, W)
    grad = gradients[-1].detach().float().to(act.device)  # (B, C, H, W)
    weights = grad.mean(dim=(2, 3), keepdim=True)       # global average pooling
    cam = torch.relu((weights * act).sum(dim=1)).squeeze().cpu().numpy()

    # Confidence = softmax probability of the target token.
    probs = F.softmax(last_logits.detach().float(), dim=-1)
    confidence = probs[target_token_id].item()

    heatmap_path = f"visuals/heatmaps/{val_id}.png"
    # The overlay helper may report failure (None or a pair of Nones); accept
    # either form instead of crashing on tuple unpacking.
    overlay_result = save_cam_overlay(image, cam, heatmap_path)
    hm_img = overlay_result[1] if overlay_result else None

    # Release hooked tensors and vision-tower gradients before returning.
    activations, gradients = [], []
    model.zero_grad()

    return answer, confidence, heatmap_path, hm_img

In [None]:
# Ask PyTorch's CUDA caching allocator to release unused cached memory before
# the evaluation loop in the next cell.
torch.cuda.empty_cache()

In [None]:
import re
import json

# Run the full evaluation: for each example produce an answer, a confidence
# score, a Grad-CAM heatmap and a JSON-formatted textual explanation, and
# collect everything in `results`.
results = []
for ex in tqdm(val_set_task2, desc="Subtask 2 Evaluation"):
    q = ex["question"]
    img = ex["image"].convert(
        "RGB") if ex["image"].mode != "RGB" else ex["image"]

    # Step 1: answer + confidence + heatmap
    ans, conf, heatmap_path, hm_img = generate_with_confidence_and_cam(q, img, ex["val_id"])

    # Step 2: explanation text
    expl_prompt = f"""
    <image>
    <image>

    You are a clinical assistant analyzing colonoscopy findings.
    Image 1 = original colonoscopy image.
    Image 2 = Grad-CAM heatmap highlighting the most relevant region.

    ONLY output a JSON object with the following keys:
    - "location": the anatomical location of the highlighted lesion (e.g., sigmoid colon, rectum).
    - "morphology": the lesion’s type, shape, and size (e.g., sessile polyp, round, small).
    - "mucosal_features": mucosal patterns, color, vascular features, or surface irregularities.

    Do not include any explanations, extra text, or formatting.
    Do not repeat the question or the answer.

    Return a single JSON object, like this:
    {{
      "location": "sigmoid colon, central area",
      "morphology": "sessile polyp, round",
      "mucosal_features": "erythematous, whitish patches"
    }}
    """

    # If no heatmap overlay is available, fall back to the original image so
    # the two `<image>` placeholders still receive two images.
    second_image = hm_img if hm_img is not None else img
    inputs = processor(text=[expl_prompt], images=[img, second_image], return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[-1]
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            repetition_penalty=1.1
        )

    # Decode only the newly generated tokens. The prompt itself contains an
    # example JSON object, so searching the full decoded sequence always found
    # a match and reported the example whenever the model produced no JSON.
    explanation_raw = processor.tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
    matches = re.findall(r"\{.*?\}", explanation_raw, re.S)

    # Keep the last JSON-looking block, or an empty object when there is none.
    explanation = matches[-1] if matches else "{}"

    result = {
        "val_id": ex["val_id"],
        "img_id": ex["img_id"],
        "question": q,
        "answer": ans,
        "textual_explanation": explanation,
        "visual_explanation": [{
            "type": "heatmap",
            "data": heatmap_path,
            "description": "Grad-CAM highlighting the region that influenced the prediction."
        }],
        "confidence_score": round(conf, 3)
    }
    results.append(result)

  colormap = cm.get_cmap("jet")
  cam_resized = (cam_resized - cam_resized.min()) / (cam_resized.max() - cam_resized.min() + 1e-8)
Subtask 2 Evaluation: 100%|██████████| 1500/1500 [1:46:05<00:00,  4.24s/it]


In [None]:
import time
import subprocess
import sys

# Record the GPU name and the device string; both fall back to "cpu" when CUDA
# is unavailable.
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    device = "cuda"
else:
    gpu_name = "cpu"
    device = "cpu"


def get_mem():
    """Return the CUDA memory currently allocated on `device` in MiB (0 without CUDA)."""
    if not torch.cuda.is_available():
        return 0
    bytes_per_mib = 1024 ** 2
    return torch.cuda.memory_allocated(device) / bytes_per_mib


# Memory baseline at the moment this cell runs.
# NOTE(review): in this notebook's cell order the model is loaded before this
# cell, so the baseline already includes the model weights — confirm whether
# the cell should run before model loading.
initial_mem = get_mem()

In [None]:
# Start the timer and take a memory reading, then immediately compute the
# elapsed time and the memory delta relative to that reading.
# NOTE(review): nothing runs between these statements and the evaluation loop
# lives in an earlier cell, so `total_time` and `final_mem` come out close to
# zero instead of covering inference. The timer start probably belongs before
# the loop and the elapsed-time computation after it — confirm and reorder.
start_time, post_model_mem = time.time(), get_mem()
total_time, final_mem = round(
    time.time() - start_time, 4), round(get_mem() - post_model_mem, 2)
# Difference between the reading above and the `initial_mem` baseline, in MiB.
model_mem_used = round(post_model_mem - initial_mem, 2)

In [None]:
# Recompute elapsed time and memory deltas from `start_time`/`post_model_mem`
# set in the previous cell.
# NOTE(review): this repeats the computation of the previous cell; because
# `start_time` was set after the evaluation loop had finished, the value only
# reflects the time between the two cells — confirm the intended placement.
total_time, final_mem = round(
    time.time() - start_time, 4), round(get_mem() - post_model_mem, 2)
model_mem_used = round(post_model_mem - initial_mem, 2)

In [None]:
import platform
# Team metadata that is embedded in the submission file.
SUBMISSION_INFO = {
    # 🔹 TODO: PARTICIPANTS MUST ADD PROPER SUBMISSION INFO FOR THE SUBMISSION 🔹
    # This will be visible to the organizers
    # Don't change the keys, only add your info
    "Participant_Names": "Sivasriraman P, Vishnu Murugesh V and Vishwajith L K",
    "Affiliations": "SSNCE",
    "Contact_emails": ["sivasriraman2370066@ssn.edu.in"],
    # Only the first email will be used for correspondence
    "Team_Name": "EndoVision",
    "Country": "India",
    "Notes_to_organizers": '''
         We have finetuned google's paligemma2 model
        '''
}
# Assemble the submission payload: predictions plus timing, memory and
# environment details. `time_per_item` is `total_time` divided by the number of
# evaluated examples; `debug.packages` is the parsed output of
# `pip list --format=json` for the running interpreter.
output_data = {"submission_info": SUBMISSION_INFO,
               "predictions": results, "total_time": total_time, "time_per_item": total_time / len(val_set_task2),
               "memory_used_mb": final_mem, "model_memory_mb": model_mem_used, "gpu_name": gpu_name,
               "debug": {
                   "packages": json.loads(subprocess.check_output([sys.executable, "-m", "pip", "list", "--format=json"])),
                   "system": {
                       "python": platform.python_version(),
                       "os": platform.system(),
                       "platform": platform.platform(),
                       "arch": platform.machine()
                   }}}


# Write the payload to disk as indented JSON.
with open("predictions_2.json", "w") as f:
    json.dump(output_data, f, indent=4)