In [1]:
!pip install -q torch torchvision torchaudio
!pip install -q transformers datasets accelerate Pillow huggingface_hub evaluate bert-score

In [2]:
import os
import json
import torch
import torch.nn as nn
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPVisionModel, AutoImageProcessor
from datasets import load_dataset
from huggingface_hub import snapshot_download

In [3]:
import torch
import matplotlib.pyplot as plt
from transformers import AutoProcessor, AutoModelForImageTextToText
from datasets import load_dataset
import random

# Checkpoint under evaluation and the device it runs on (GPU when one is available).
model_id = "unsloth/medgemma-4b-it"
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading model on {device}...")

# The processor bundles the tokenizer/chat template and the image preprocessing.
processor = AutoProcessor.from_pretrained(model_id)

# Weights are loaded in bfloat16 on GPU and float32 on CPU, then moved to `device`.
# `dtype` replaces the deprecated `torch_dtype` keyword: the recorded run of this
# notebook printed "`torch_dtype` is deprecated! Use `dtype` instead!".
# NOTE(review): this needs a transformers release that accepts `dtype`; switch
# back to `torch_dtype` if an older release is pinned.
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    dtype=torch.bfloat16 if device == "cuda" else torch.float32,
).to(device)


Loading model on cuda...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/151 [00:00<?, ?B/s]

In [5]:
# Load the `test` split of the VQA-RAD dataset as a regular (non-streaming)
# dataset, so it supports `len()` and integer indexing in the cells below.
print("Loading dataset...")
dataset = load_dataset(
    'flaviagiammarino/vqa-rad',
    split='test',
    streaming=False,
)


Loading dataset...


In [24]:
import evaluate
from tqdm import tqdm

# Metric object used further down to compute BERTScore.
bertscore_metric = evaluate.load("bertscore")

# Evaluate on everything that was loaded into `dataset`.
samples_to_evaluate = dataset

# Accumulators for the model's answers and the ground-truth answers; the
# inference loop appends to both in lockstep, so index i refers to the same sample.
predictions, references = [], []


In [25]:

# Run the model over every sample and collect its answer next to the reference.
#
# The result lists are (re)created here so that this cell is idempotent:
# re-running it replaces the previous results instead of appending a second
# copy to lists left over in the kernel from an earlier run.
predictions = []
references = []

print(f"Starting inference on {len(samples_to_evaluate)} samples...")

for item in tqdm(samples_to_evaluate):
    image = item['image'].convert("RGB")
    question = item['question']
    ground_truth = str(item['answer'])

    # --- Build the prompt ---
    # A single user turn holding an image placeholder plus the instruction and
    # the question text.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Answer the given question as short as possible, do not give any explanation just straight answer in no more than 5 words. Question: " + question}
            ]
        }
    ]

    text_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    inputs = processor(
        text=text_prompt,
        images=image,
        return_tensors="pt"
    ).to(device)

    # --- Generation ---
    # `do_sample=False` forces greedy decoding so the evaluation is reproducible
    # regardless of the sampling defaults shipped in the checkpoint's
    # generation config.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=20, do_sample=False)

    # The returned sequence starts with the prompt tokens; slice them off so
    # only the newly generated answer is decoded.
    generated_text = processor.decode(
        outputs[0][inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True
    ).strip()

    predictions.append(generated_text)
    references.append(ground_truth)


Starting inference on 451 samples...


100%|██████████| 451/451 [30:32<00:00,  4.06s/it]


In [39]:

# --- Metric computation ---

def _exact_match_accuracy(preds, refs):
    """Return ``(n_correct, accuracy)`` using case-insensitive exact matching.

    A prediction counts as correct when it equals its reference after both
    strings are lower-cased and stripped of surrounding whitespace (so
    "Yes" matches "yes"). No other normalisation is applied.

    Args:
        preds: sequence of predicted answer strings.
        refs: sequence of reference answer strings, aligned with ``preds``.

    Returns:
        Tuple of the number of matching pairs and the fraction of matching
        pairs. The fraction is 0.0 when there are no predictions.

    Raises:
        ValueError: if ``preds`` and ``refs`` differ in length; ``zip`` would
            otherwise silently drop the unmatched tail.
    """
    total = len(preds)
    if total != len(refs):
        raise ValueError(
            f"predictions ({total}) and references ({len(refs)}) differ in length"
        )

    n_correct = sum(
        pred.lower().strip() == ref.lower().strip()
        for pred, ref in zip(preds, refs)
    )
    # Guard against division by zero when the inference cell has not produced
    # any results yet.
    return n_correct, (n_correct / total if total else 0.0)


# Accuracy (exact match) over all collected predictions.
correct_count, accuracy = _exact_match_accuracy(predictions, references)


In [31]:

# BERTScore between each prediction and its reference answer.
print("Calculating BERTScore (this might take a moment to download the model)...")
bertscore_kwargs = dict(
    predictions=predictions,
    references=references,
    lang="en",
    verbose=True,
)
bertscore_results = bertscore_metric.compute(**bertscore_kwargs)

# Collapse the per-sample F1 values into a single mean BERTScore.
f1_scores = bertscore_results['f1']
mean_bertscore = sum(f1_scores) / len(f1_scores)


Calculating BERTScore (this might take a moment to download the model)...
calculating scores...
computing bert embedding.


  0%|          | 0/6 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/8 [00:00<?, ?it/s]

done in 2523.88 seconds, 0.18 sentences/sec


In [48]:
# --- Report the results ---
banner = "="*30
print("\n" + banner)
print(f"Results for {len(predictions)} samples:")
print(banner)
print(f"Accuracy:   {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"BERTScore (F1 mean): {mean_bertscore:.4f}")

# Show the first few question / ground truth / prediction triples (at most 7).
print("\nSample predictions:")
n_shown = min(7, len(predictions))
for i in range(n_shown):
    print(f"Q: {samples_to_evaluate[i]['question']}")
    print(f"GT: {references[i]} | Pred: {predictions[i]}")
    print("-" * 20)


Results for 451 samples:
Accuracy:   0.1552 (15.52%)
BERTScore (F1 mean): 0.8900

Sample predictions:
Q: is there evidence of an aortic aneurysm?
GT: yes | Pred: No evidence of aortic aneurysm.
--------------------
Q: is there airspace consolidation on the left side?
GT: yes | Pred: Yes
--------------------
Q: is there any intraparenchymal abnormalities in the lung fields?
GT: no | Pred: No intraparenchymal abnormalities.
--------------------
Q: which side of the heart border is obscured?
GT: right | Pred: Right heart border
--------------------
Q: where are the kidney?
GT: not seen here | Pred: Left and right of the image.
--------------------
Q: are the kidneys present in this image?
GT: no | Pred: Yes, kidneys are present.
--------------------
Q: is the colon more prominent on the patient's right or left side?
GT: left | Pred: Left side.
--------------------
