In [None]:
# If you're in Colab, run this cell first
!pip install -q "transformers>=4.45.0" "datasets>=2.19.0" \
  bitsandbytes peft trl qwen-vl-utils accelerate pillow


In [None]:
!pip install hf_transfer

Collecting hf_transfer
  Downloading hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)
Downloading hf_transfer-0.1.9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m67.8 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: hf_transfer
Successfully installed hf_transfer-0.1.9


In [None]:
# ============================
# 0. (Optional) Install deps
# ============================
# If running in Colab or a fresh env, uncomment this block:
# !pip install -q "transformers>=4.45.0" "datasets>=2.19.0" \
#   peft trl qwen-vl-utils accelerate pillow

# ============================
# 1. Imports & basic setup
# ============================
import os
import re
import torch
from datasets import load_dataset

from transformers import (
    Qwen2VLForConditionalGeneration,
    Qwen2VLProcessor,
)

from peft import LoraConfig
from trl import SFTConfig, SFTTrainer

from qwen_vl_utils import process_vision_info

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# ============================
# 2. Load ScienceQA dataset
# ============================
# We use derek-thomas/ScienceQA which includes images + QA fields
dataset_id = "derek-thomas/ScienceQA"
raw_datasets = load_dataset(dataset_id)

print(raw_datasets)

# Keep only examples that actually have an image
def has_image(example):
    return example["image"] is not None

train_ds = raw_datasets["train"].filter(has_image)
val_ds   = raw_datasets["validation"].filter(has_image)
test_ds  = raw_datasets["test"].filter(has_image)

print("Train size:", len(train_ds))
print("Val size:", len(val_ds))
print("Test size:", len(test_ds))
print("Example keys:", train_ds.column_names)

# ============================
# 3. Format data: letter + explanation
# ============================
system_message = """You are a helpful science tutor.
You see a question, several answer choices, and often an image and a hint/lecture.
Your job is to pick the single correct multiple-choice option.
Start your answer with ONLY the letter of the correct option (A, B, C, D, etc.),
then a period, then a short explanation.
Example: "C. Because ..."
"""

LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

def index_to_letter(idx: int) -> str:
    return LETTERS[idx]

def build_question_text(example) -> str:
    """
    Build the user-facing question text with choices, hint, lecture.
    """
    choices = example["choices"]
    options_text = "\n".join(
        f"({LETTERS[i]}) {choice}" for i, choice in enumerate(choices)
    )

    hint = example.get("hint", "")
    lecture = example.get("lecture", "")

    hint_part = f"\nHint: {hint}" if hint else ""
    lecture_part = f"\nLecture: {lecture}" if lecture else ""

    return (
        f"Question: {example['question']}\n\n"
        f"Choices:\n{options_text}"
        f"{hint_part}{lecture_part}\n\n"
        f"Respond starting with the letter, then a period, then a brief explanation."
    )

def format_example(example):
    """
    Convert a raw ScienceQA row into the format:
      {
        "images": [...],
        "messages": [ {role, content: [{type, ...}, ...]}, ... ]
      }
    for Qwen2-VL + TRL SFTTrainer.
    """
    correct_letter = index_to_letter(int(example["answer"]))

    # ScienceQA usually has a "solution" field (text explanation).
    explanation = example.get("solution", "") or example.get("explanation", "")
    if explanation:
        assistant_text = f"{correct_letter}. {explanation}"
    else:
        # Fallback to letter only if no explanation
        assistant_text = correct_letter

    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": system_message},
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": example["image"]},
                {"type": "text", "text": build_question_text(example)},
            ],
        },
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": assistant_text},
            ],
        },
    ]

    return {
        "images": [example["image"]],
        "messages": messages,
    }

print("Formatting datasets...")
train_data = [format_example(ex) for ex in train_ds]
eval_data  = [format_example(ex) for ex in val_ds]
test_data  = [format_example(ex) for ex in test_ds]

print("Example formatted messages:")
print(train_data[0]["messages"][0])  # system
print(train_data[0]["messages"][1])  # user
print(train_data[0]["messages"][2])  # assistant

# ============================
# 4. Load Qwen2-VL-2B-Instruct (bf16, no quantization)
# ============================
model_id = "Qwen/Qwen2-VL-2B-Instruct"

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",          # H200 can handle this easily
    torch_dtype=torch.bfloat16, # native bf16 on H200
)

processor = Qwen2VLProcessor.from_pretrained(model_id)

# ============================
# 5. LoRA configuration (no 4-bit needed)
# ============================
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],  # standard for Qwen2-VL
    task_type="CAUSAL_LM",
)

# ============================
# 6. SFTTraining configuration
# ============================
output_dir = "qwen2-vl-2b-scienceqa-lora-expl"

training_args = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=8,      # H200: you can increase if you want
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,      # effective batch ~16
    learning_rate=1e-4,                 # 1e-4–2e-4 typical for LoRA
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=1000,
    bf16=True,
    gradient_checkpointing=False,       # not needed on H200
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    optim="adamw_torch_fused",
    push_to_hub=False,
)

# ============================
# 7. Build SFTTrainer and train
# ============================
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    processing_class=processor,  # TRL picks a VLM collator for Qwen2-VL
)

print("Starting training...")
trainer.train()
print("Training finished.")

# Save LoRA adapter (and tokenizer/processor config)
trainer.save_model(output_dir)
processor.save_pretrained(output_dir)

print(f"Saved fine-tuned adapter to: {output_dir}")

# ============================
# 8. Inference: answer + explanation
# ============================
# Reload model + adapter (for clarity; you can also reuse `model` from above)
print("Loading fine-tuned model for inference...")
ft_model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
ft_model.load_adapter(output_dir)
ft_model.eval()

ft_processor = Qwen2VLProcessor.from_pretrained(model_id)

def generate_answer_with_explanation(example, max_new_tokens=64):
    """
    Run inference on a single ScienceQA example.
    Returns text like:
      "C. Because ..."
    """
    # Format in the same way as training, but we only use system+user messages
    formatted = format_example(example)
    conv_for_gen = formatted["messages"][:2]  # system + user

    # Build chat-style prompt
    text_prompt = ft_processor.apply_chat_template(
        conv_for_gen,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Process vision inputs
    image_inputs, _ = process_vision_info(formatted["messages"])

    inputs = ft_processor(
        text=[text_prompt],
        images=image_inputs,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        gen_ids = ft_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
        )

    # Remove prompt tokens from output
    trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, gen_ids)
    ]

    out_text = ft_processor.batch_decode(
        trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

    return out_text

def parse_letter_and_explanation(text):
    """
    Extract the leading letter (A/B/C/...) and the explanation.
    Falls back gracefully if the pattern is weird.
    """
    # Match patterns like:
    #   "C. Because ..."
    #   "C - Because ..."
    #   "C) Because ..."
    m = re.match(r"\s*([A-Z])\s*[\.\-\):]\s*(.*)", text, flags=re.DOTALL)
    if m:
        letter = m.group(1)
        explanation = m.group(2).strip()
    else:
        # Fallback: first capital letter
        m2 = re.search(r"[A-Z]", text)
        if m2:
            letter = m2.group(0)
            explanation = text[m2.end():].strip()
        else:
            letter = None
            explanation = text.strip()
    return letter, explanation

# ============================
# 9. Quick test on validation example
# ============================
print("\n=== Quick validation example ===")
example = val_ds[0]

model_output = generate_answer_with_explanation(example)
pred_letter, pred_expl = parse_letter_and_explanation(model_output)

true_letter = index_to_letter(int(example["answer"]))

print("Question:", example["question"])
print("Choices:", example["choices"])
print("True answer letter:", true_letter)
print("Model raw output:   ", model_output)
print("Parsed letter:      ", pred_letter)
print("Parsed explanation: ", pred_expl)


Using device: cuda


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-1028f23e353fbe(…):   0%|          | 0.00/377M [00:00<?, ?B/s]

data/validation-00000-of-00001-6c7328ff6(…):   0%|          | 0.00/126M [00:00<?, ?B/s]

data/test-00000-of-00001-f0e719df791966f(…):   0%|          | 0.00/122M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12726 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4241 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4241 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution'],
        num_rows: 12726
    })
    validation: Dataset({
        features: ['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution'],
        num_rows: 4241
    })
    test: Dataset({
        features: ['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution'],
        num_rows: 4241
    })
})


Filter:   0%|          | 0/12726 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4241 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4241 [00:00<?, ? examples/s]

Train size: 6218
Val size: 2097
Test size: 2017
Example keys: ['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution']
Formatting datasets...


`torch_dtype` is deprecated! Use `dtype` instead!


Example formatted messages:
{'role': 'system', 'content': [{'type': 'text', 'text': 'You are a helpful science tutor.\nYou see a question, several answer choices, and often an image and a hint/lecture.\nYour job is to pick the single correct multiple-choice option.\nStart your answer with ONLY the letter of the correct option (A, B, C, D, etc.),\nthen a period, then a short explanation.\nExample: "C. Because ..."\n'}]}
{'role': 'user', 'content': [{'type': 'image', 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=750x429 at 0x71BA5C85E150>}, {'type': 'text', 'text': 'Question: Which of these states is farthest north?\n\nChoices:\n(A) West Virginia\n(B) Louisiana\n(C) Arizona\n(D) Oklahoma\nLecture: Maps have four cardinal directions, or main directions. Those directions are north, south, east, and west.\nA compass rose is a set of arrows that point to the cardinal directions. A compass rose usually shows only the first letter of each cardinal direction.\nThe north arrow po

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Starting training...


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
200,3.0967,3.340326,3.467196,1737553.0,0.541637
400,3.0629,3.262531,3.380091,3462723.0,0.55793
600,3.2061,3.233695,3.345482,5181757.0,0.564243
800,3.4126,3.217886,3.329388,6934906.0,0.567789
1000,3.3719,3.209956,3.318748,8673048.0,0.569656


Training finished.
Saved fine-tuned adapter to: qwen2-vl-2b-scienceqa-lora-expl
Loading fine-tuned model for inference...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


=== Quick validation example ===
Question: Which animal's mouth is also adapted for bottom feeding?
Choices: ['discus', 'armored catfish']
True answer letter: B
Model raw output:    B. Look at the picture of the sturgeon.
The sturgeon has a long, thin mouth. Its mouth is adapted for bottom feeding. The sturgeon uses its mouth to catch small fish and other food that lives at the bottom of the water.
Now look at each animal. Figure out which animal has a similar
Parsed letter:       B
Parsed explanation:  Look at the picture of the sturgeon.
The sturgeon has a long, thin mouth. Its mouth is adapted for bottom feeding. The sturgeon uses its mouth to catch small fish and other food that lives at the bottom of the water.
Now look at each animal. Figure out which animal has a similar


In [None]:
from tqdm import tqdm

def eval_scienceqa_split(dataset, max_new_tokens=64, num_samples=None):
    """
    Compute MC accuracy on a ScienceQA split (e.g. val_ds or test_ds).

    Args:
        dataset: Hugging Face dataset (val_ds/test_ds).
        max_new_tokens: generation length (increase if explanations are truncated).
        num_samples: limit for quick testing (e.g. 200). If None, use full split.
    """
    n = len(dataset) if num_samples is None else min(num_samples, len(dataset))
    correct = 0

    for i in tqdm(range(n), desc="Evaluating"):
        ex = dataset[i]
        true_letter = index_to_letter(int(ex["answer"]))

        out_text = generate_answer_with_explanation(ex, max_new_tokens=max_new_tokens)
        pred_letter, _ = parse_letter_and_explanation(out_text)

        if pred_letter == true_letter:
            correct += 1

    acc = correct / n
    print(f"Accuracy on {n} examples: {acc:.4f}")
    return acc

# Quick check on first 200 validation examples
val_acc_200 = eval_scienceqa_split(val_ds, max_new_tokens=64, num_samples=200)

# Full validation accuracy (comment in when you're ready)
# val_acc_full = eval_scienceqa_split(val_ds, max_new_tokens=64, num_samples=None)


Evaluating: 100%|██████████| 200/200 [02:32<00:00,  1.31it/s]

Accuracy on 200 examples: 0.6650





In [None]:
from PIL import Image

def build_custom_question_text(question, choices, hint=None, lecture=None):
    options_text = "\n".join(
        f"({LETTERS[i]}) {choice}" for i, choice in enumerate(choices)
    )
    hint_part = f"\nHint: {hint}" if hint else ""
    lecture_part = f"\nLecture: {lecture}" if lecture else ""

    return (
        f"Question: {question}\n\n"
        f"Choices:\n{options_text}"
        f"{hint_part}{lecture_part}\n\n"
        f"Respond starting with the letter, then a period, then a brief explanation."
    )

def answer_custom_question(
    image: Image.Image | None,
    question: str,
    choices: list[str],
    hint: str | None = None,
    lecture: str | None = None,
    max_new_tokens: int = 96,
):
    # Build messages in the same format as during training
    user_content = []
    if image is not None:
        user_content.append({"type": "image", "image": image})
    user_content.append(
        {"type": "text", "text": build_custom_question_text(question, choices, hint, lecture)}
    )

    messages = [
        {"role": "system", "content": [{"type": "text", "text": system_message}]},
        {"role": "user", "content": user_content},
    ]

    text_prompt = ft_processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    if image is not None:
        image_inputs, _ = process_vision_info(messages)
    else:
        image_inputs = None

    inputs = ft_processor(
        text=[text_prompt],
        images=image_inputs,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        gen_ids = ft_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
        )

    trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, gen_ids)
    ]

    out_text = ft_processor.batch_decode(
        trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

    letter, expl = parse_letter_and_explanation(out_text)
    return letter, expl, out_text


In [None]:
# With an image
img = val_ds[0]["image"]  # or Image.open("your_image.png")
q = "Which animal's mouth is also adapted for bottom feeding?"
choices = ["discus", "armored catfish"]

letter, expl, raw = answer_custom_question(img, q, choices)
print("Predicted letter:", letter)
print("Explanation:", expl)
print("Raw model text:", raw)

# Without image
letter2, expl2, raw2 = answer_custom_question(
    image=None,
    question="Which state is farthest north?",
    choices=["West Virginia", "Louisiana", "Arizona", "Oklahoma"],
)
print("Predicted letter:", letter2)
print("Explanation:", expl2)


Predicted letter: B
Explanation: The mouth of an armored catfish is adapted for bottom feeding. It has sharp teeth that can cut through the bottom of a pond or lake.
Raw model text: B. The mouth of an armored catfish is adapted for bottom feeding. It has sharp teeth that can cut through the bottom of a pond or lake.
Predicted letter: D
Explanation: This state is farthest north: Oklahoma.
View more states and their locations, and learn about their climates, economies, and cultures.


In [None]:
!pip install torchmetrics summac detoxify


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting summac
  Downloading summac-0.0.4-py3-none-any.whl.metadata (5.3 kB)
Collecting detoxify
  Downloading detoxify-0.5.2-py3-none-any.whl.metadata (13 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Collecting nltk>=3.6.6 (from summac)
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting huggingface-hub<=0.17.0 (from summac)
  Downloading huggingface_hub-0.17.0-py3-none-any.whl.metadata (13 kB)
Collecting sentencepiece (from summac)
  Downloading sentencepiece-0.2.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting protobuf (from summac)
  Downloading protobuf-6.33.1-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting click (from nltk>=3.6.6->summac)
  Downloading click-8.3.1-py3-none-any.whl.metadata (2.6 kB)
Collecting joblib (f

In [None]:
import torch
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
from qwen_vl_utils import process_vision_info

device = "cuda" if torch.cuda.is_available() else "cpu"

BASE_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

base_model = Qwen2VLForConditionalGeneration.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    dtype=torch.bfloat16,
)
base_model.eval()

base_processor = Qwen2VLProcessor.from_pretrained(BASE_MODEL_ID)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def generate_answer_with_explanation_generic(model, processor, example, max_new_tokens=64):
    """
    Uses the same formatting as training (system + user) to generate answer+explanation.
    """
    formatted = format_example(example)          # reuses your training formatter
    conv_for_gen = formatted["messages"][:2]     # system + user only

    text_prompt = processor.apply_chat_template(
        conv_for_gen,
        tokenize=False,
        add_generation_prompt=True,
    )

    image_inputs, _ = process_vision_info(formatted["messages"])

    inputs = processor(
        text=[text_prompt],
        images=image_inputs,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        gen_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
        )

    trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, gen_ids)
    ]

    out_text = processor.batch_decode(
        trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

    return out_text


In [None]:
import re

def parse_letter_and_explanation(text):
    m = re.match(r"\s*([A-Z])\s*[\.\-\):]\s*(.*)", text, flags=re.DOTALL)
    if m:
        letter = m.group(1)
        explanation = m.group(2).strip()
    else:
        m2 = re.search(r"[A-Z]", text)
        if m2:
            letter = m2.group(0)
            explanation = text[m2.end():].strip()
        else:
            letter = None
            explanation = text.strip()
    return letter, explanation


In [None]:
from tqdm import tqdm

def collect_outputs(dataset, num_samples=None, max_new_tokens=64):
    """
    Collect model outputs + metadata for both base and fine-tuned models.
    """
    n = len(dataset) if num_samples is None else min(num_samples, len(dataset))
    records = []

    for i in tqdm(range(n), desc="Collecting outputs"):
        ex = dataset[i]
        true_letter = index_to_letter(int(ex["answer"]))
        solution = ex.get("solution", "")

        # Base
        base_raw = generate_answer_with_explanation_generic(
            base_model, base_processor, ex, max_new_tokens=max_new_tokens
        )
        base_letter, base_expl = parse_letter_and_explanation(base_raw)

        # Finetuned
        ft_raw = generate_answer_with_explanation_generic(
            ft_model, ft_processor, ex, max_new_tokens=max_new_tokens
        )
        ft_letter, ft_expl = parse_letter_and_explanation(ft_raw)

        # Source text we’ll use for hallucination metrics
        source_text = build_question_text(ex) + "\n\n" + (solution or "")

        records.append(
            {
                "example": ex,
                "true_letter": true_letter,
                "solution": solution,
                "source_text": source_text,

                "base_raw": base_raw,
                "base_letter": base_letter,
                "base_expl": base_expl,

                "ft_raw": ft_raw,
                "ft_letter": ft_letter,
                "ft_expl": ft_expl,
            }
        )

    return records

# Example: collect on first 500 validation examples
val_records = collect_outputs(val_ds, num_samples=500, max_new_tokens=96)


Collecting outputs: 100%|██████████| 500/500 [09:49<00:00,  1.18s/it]


In [None]:
from collections import Counter
import numpy as np

def _token_f1(pred, gold):
    pred_tokens = pred.lower().split()
    gold_tokens = gold.lower().split()
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 0.0

    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0

    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

def vqa_metrics(records, model_prefix="base"):
    em_list = []
    f1_list = []

    for r in records:
        true_letter = r["true_letter"]
        sol = r["solution"] or ""

        pred_letter = r[f"{model_prefix}_letter"]
        pred_expl   = r[f"{model_prefix}_expl"] or ""

        em_list.append(1.0 if pred_letter == true_letter else 0.0)
        if sol:
            f1_list.append(_token_f1(pred_expl, sol))
        else:
            # If no solution text, ignore in F1
            pass

    em = float(np.mean(em_list))
    f1 = float(np.mean(f1_list)) if f1_list else 0.0
    return {"VQA_EM": em, "VQA_F1_explanation": f1}

base_vqa = vqa_metrics(val_records, "base")
ft_vqa   = vqa_metrics(val_records, "ft")

print("Base VQA:", base_vqa)
print("FT   VQA:", ft_vqa)


Base VQA: {'VQA_EM': 0.652, 'VQA_F1_explanation': 0.20842234513052912}
FT   VQA: {'VQA_EM': 0.696, 'VQA_F1_explanation': 0.7628293064480157}


In [None]:
!pip install torchvision


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
from torchvision import transforms
import numpy as np
import torch
from torchmetrics.multimodal import CLIPScore

# create metric once
clip_metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16").to(device)
to_tensor = transforms.ToTensor()

def clipscore_metric(records, model_prefix="base"):
    """
    Compute CLIPScore over a batch of (image, text) pairs.

    - Converts PIL images to torch tensors (C, H, W)
    - Calls CLIPScore once over lists of images/texts
    """
    images = []
    texts = []

    for r in records:
        img_pil = r["example"]["image"]       # PIL.Image
        text = r[f"{model_prefix}_raw"]       # string

        img_tensor = to_tensor(img_pil)       # (C, H, W), float32 in [0,1]
        images.append(img_tensor)
        texts.append(text)

    with torch.no_grad():
        # CLIPScore can take List[Tensor] + List[str]
        score = clip_metric(images, texts)

    return {"CLIPScore": float(score.item())}

# now this should work:
base_clip = clipscore_metric(val_records, "base")
ft_clip   = clipscore_metric(val_records, "ft")

print("Base CLIP:", base_clip)
print("FT   CLIP:", ft_clip)


It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.
Token indices sequence length is longer than the specified maximum sequence length for this model (98 > 77). Running this sequence through the model will result in indexing errors


Base CLIP: {'CLIPScore': 23.061771392822266}
FT   CLIP: {'CLIPScore': 22.325023651123047}


In [None]:
from summac.model import SummaCZS

summac_model = SummaCZS(granularity="sentence", model_name="vitc")

def summac_metric(records, model_prefix="base"):
    sources = [r["source_text"] for r in records]
    summaries = [r[f"{model_prefix}_expl"] or "" for r in records]

    scores = summac_model.score(
        sources=sources,
        summaries=summaries,
    )["scores"]

    return {"SummaC_mean": float(np.mean(scores))}

base_summac = summac_metric(val_records, "base")
ft_summac   = summac_metric(val_records, "ft")

print("Base SummaC:", base_summac)
print("FT   SummaC:", ft_summac)


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F

factcc_model_id = "manueldeprada/FactCC"
factcc_tokenizer = AutoTokenizer.from_pretrained(factcc_model_id)
factcc_model = AutoModelForSequenceClassification.from_pretrained(factcc_model_id).to(device)
factcc_model.eval()

def factcc_metric(records, model_prefix="base", batch_size=16):
    probs_all = []
    for i in range(0, len(records), batch_size):
        batch = records[i:i+batch_size]
        sources = [r["source_text"] for r in batch]
        summaries = [r[f"{model_prefix}_expl"] or "" for r in batch]

        enc = factcc_tokenizer(
            sources,
            summaries,
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(device)

        with torch.no_grad():
            logits = factcc_model(**enc).logits
            probs = F.softmax(logits, dim=-1)[:, 1]  # assume label 1 = factual
        probs_all.extend(probs.cpu().tolist())

    return {
        "FactCC_mean_prob_factual": float(np.mean(probs_all))
    }

base_factcc = factcc_metric(val_records, "base")
ft_factcc   = factcc_metric(val_records, "ft")

print("Base FactCC:", base_factcc)
print("FT   FactCC:", ft_factcc)


In [None]:
from openai import OpenAI
client = OpenAI()   # assumes OPENAI_API_KEY env var

GEVAL_PROMPT = """
You are grading answers to science visual question answering problems.

You will receive:
- the question and answer choices
- a reference correct answer letter
- the model's predicted letter and explanation

For each output, give three scores from 1 to 5:
- Fluency: Is the explanation clear, grammatical and well-written?
- Relevance: Does the explanation focus on the question and image, rather than irrelevant details?
- Correctness: Is the predicted answer letter correct and is the explanation logically consistent with the source?

Respond in JSON with keys: fluency, relevance, correctness.
"""

def geval_score_one(record, model_prefix="base"):
    ex = record["example"]
    q_block = build_question_text(ex)
    ref_letter = record["true_letter"]

    pred_raw = record[f"{model_prefix}_raw"]

    user_content = f"""
QUESTION & CONTEXT:
{q_block}

REFERENCE ANSWER LETTER: {ref_letter}

MODEL OUTPUT:
{pred_raw}
"""

    resp = client.responses.create(
        model="gpt-4o-mini",
        input=[
            {
                "role": "system",
                "content": GEVAL_PROMPT,
            },
            {
                "role": "user",
                "content": user_content,
            },
        ]
    )

    text = resp.output[0].content[0].text
    # parse JSON from `text`
    import json
    scores = json.loads(text)
    return scores["fluency"], scores["relevance"], scores["correctness"]


In [None]:
def geval_metric(records, model_prefix="base", num_samples=100):
    fl, rel, corr = [], [], []
    for r in records[:num_samples]:
        f, r_, c = geval_score_one(r, model_prefix)
        fl.append(f); rel.append(r_); corr.append(c)
    return {
        "G_eval_fluency":   float(np.mean(fl)),
        "G_eval_relevance": float(np.mean(rel)),
        "G_eval_correctness": float(np.mean(corr)),
    }

base_geval = geval_metric(val_records, "base", num_samples=100)
ft_geval   = geval_metric(val_records, "ft", num_samples=100)

print("Base G-Eval:", base_geval)
print("FT   G-Eval:", ft_geval)


In [None]:
from detoxify import Detoxify

tox_model = Detoxify('original')  # or 'unbiased'

def toxicity_metric(records, model_prefix="base"):
    texts = [r[f"{model_prefix}_raw"] for r in records]
    scores = tox_model.predict(texts)["toxicity"]   # np.array
    return {
        "toxicity_mean": float(np.mean(scores)),
        "toxicity_95p": float(np.percentile(scores, 95)),
    }

base_tox = toxicity_metric(val_records, "base")
ft_tox   = toxicity_metric(val_records, "ft")

print("Base toxicity:", base_tox)
print("FT   toxicity:", ft_tox)


In [None]:
from transformers import pipeline

nsfw_pipe = pipeline(
    "image-classification",
    model="Falconsai/nsfw_image_detection",
    device=0 if device == "cuda" else -1,
)

def nsfw_metric(records, threshold=0.5):
    nsfw_probs = []
    for r in records:
        img = r["example"]["image"]
        preds = nsfw_pipe(img)
        # assumes label 'nsfw' exists
        prob_nsfw = next(p["score"] for p in preds if "nsfw" in p["label"].lower())
        nsfw_probs.append(prob_nsfw)

    return {
        "NSFW_mean": float(np.mean(nsfw_probs)),
        "NSFW_95p": float(np.percentile(nsfw_probs, 95)),
    }

images_nsfw = nsfw_metric(val_records)
print("Images NSFW stats:", images_nsfw)


In [None]:
def merge_metrics(*metric_dicts):
    merged = {}
    for d in metric_dicts:
        merged.update(d)
    return merged

base_all = merge_metrics(base_vqa, base_clip, base_summac, base_factcc)
ft_all   = merge_metrics(ft_vqa,   ft_clip,  ft_summac,  ft_factcc)

print("\n=== Metric comparison (validation, ~500 examples) ===")
for k in sorted(base_all.keys()):
    print(f"{k:30s}  base={base_all[k]:.4f}   ft={ft_all[k]:.4f}")


In [None]:
OPENAI_API_KEY="OPENAI_API_KEY"

In [None]:
from openai import OpenAI

def load_openai_client_from_variable(key):
    if not key:
        print("No API key provided — skipping LLM-as-judge scoring.")
        return None
    return OpenAI(api_key=key)

client = load_openai_client_from_variable(OPENAI_API_KEY)


In [None]:
!pip install torchmetrics summac detoxify openai torchvision

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting openai
  Downloading openai-2.9.0-py3-none-any.whl.metadata (29 kB)
Collecting jiter<1,>=0.10.0 (from openai)
  Downloading jiter-0.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.12.5-py3-none-any.whl.metadata (90 kB)
Collecting annotated-types>=0.6.0 (from pydantic<3,>=1.9.0->openai)
  Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting pydantic-core==2.41.5 (from pydantic<3,>=1.9.0->openai)
  Downloading pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting typing-inspection>=0.4.2 (from pydantic<3,>=1.9.0->openai)
  Downloading typing_inspection-0.4.2-py3-none-any.whl.metadata (2.6 kB)
Downloading openai-2.9.0-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m29.9 MB/s[0m  [33m0:00:00[0m
[?25hDownloading jiter-0.12.0-cp312-cp312

In [None]:
# =========================================
# ONE-CELL EVAL: base vs finetuned Qwen2-VL
# =========================================
# If needed, install once (then comment out):
# !pip install torchmetrics summac detoxify openai torchvision

import os, json, re
from collections import Counter

import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm

from transformers import (
    Qwen2VLForConditionalGeneration,
    Qwen2VLProcessor,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
)
from qwen_vl_utils import process_vision_info
from torchmetrics.multimodal import CLIPScore
from torchvision import transforms
from summac.model import SummaCZS
from detoxify import Detoxify

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# -----------------------------
# 0. Dataset-dependent helpers
# -----------------------------
# Re-declare these to match training (safe to override)

system_message = """You are a helpful science tutor.
You see a question, several answer choices, and often an image and a hint/lecture.
Your job is to pick the single correct multiple-choice option.
Start your answer with ONLY the letter of the correct option (A, B, C, D, etc.),
then a period, then a short explanation.
Example: "C. Because ..."
"""

LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

def index_to_letter(idx: int) -> str:
    return LETTERS[idx]

def build_question_text(example) -> str:
    choices = example["choices"]
    options_text = "\n".join(
        f"({LETTERS[i]}) {choice}" for i, choice in enumerate(choices)
    )
    hint = example.get("hint", "")
    lecture = example.get("lecture", "")
    hint_part = f"\nHint: {hint}" if hint else ""
    lecture_part = f"\nLecture: {lecture}" if lecture else ""
    return (
        f"Question: {example['question']}\n\n"
        f"Choices:\n{options_text}"
        f"{hint_part}{lecture_part}\n\n"
        f"Respond starting with the letter, then a period, then a brief explanation."
    )

def format_example(example):
    correct_letter = index_to_letter(int(example["answer"]))
    explanation = example.get("solution", "") or example.get("explanation", "")
    if explanation:
        assistant_text = f"{correct_letter}. {explanation}"
    else:
        assistant_text = correct_letter

    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": example["image"]},
                {"type": "text", "text": build_question_text(example)},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": assistant_text}],
        },
    ]
    return {"images": [example["image"]], "messages": messages}

def parse_letter_and_explanation(text: str):
    m = re.match(r"\s*([A-Z])\s*[\.\-\):]\s*(.*)", text, flags=re.DOTALL)
    if m:
        letter = m.group(1)
        explanation = m.group(2).strip()
    else:
        m2 = re.search(r"[A-Z]", text)
        if m2:
            letter = m2.group(0)
            explanation = text[m2.end():].strip()
        else:
            letter = None
            explanation = text.strip()
    return letter, explanation

# --------------------------------
# 1. Load base & finetuned models
# --------------------------------
BASE_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
FT_ADAPTER_DIR = "qwen2-vl-2b-scienceqa-lora-expl"  # change if you used another dir

print("Loading base model...")
base_model = Qwen2VLForConditionalGeneration.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
base_model.eval()
base_processor = Qwen2VLProcessor.from_pretrained(BASE_MODEL_ID)

print("Loading finetuned model + adapter...")
ft_model = Qwen2VLForConditionalGeneration.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
ft_model.load_adapter(FT_ADAPTER_DIR)
ft_model.eval()
ft_processor = Qwen2VLProcessor.from_pretrained(BASE_MODEL_ID)

# -----------------------------
# 2. Shared generation helper
# -----------------------------
def generate_answer_with_explanation_generic(model, processor, example, max_new_tokens=96):
    formatted = format_example(example)
    conv_for_gen = formatted["messages"][:2]  # system + user

    text_prompt = processor.apply_chat_template(
        conv_for_gen,
        tokenize=False,
        add_generation_prompt=True,
    )

    image_inputs, _ = process_vision_info(formatted["messages"])

    inputs = processor(
        text=[text_prompt],
        images=image_inputs,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        gen_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
        )

    trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, gen_ids)
    ]

    out_text = processor.batch_decode(
        trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

    return out_text

# ----------------------------------------
# 3. Collect outputs (base & finetuned)
# ----------------------------------------
def collect_outputs(dataset, num_samples=None, max_new_tokens=96):
    n = len(dataset) if num_samples is None else min(num_samples, len(dataset))
    records = []
    for i in tqdm(range(n), desc="Collecting outputs"):
        ex = dataset[i]
        true_letter = index_to_letter(int(ex["answer"]))
        solution = ex.get("solution", "")

        base_raw = generate_answer_with_explanation_generic(
            base_model, base_processor, ex, max_new_tokens=max_new_tokens
        )
        base_letter, base_expl = parse_letter_and_explanation(base_raw)

        ft_raw = generate_answer_with_explanation_generic(
            ft_model, ft_processor, ex, max_new_tokens=max_new_tokens
        )
        ft_letter, ft_expl = parse_letter_and_explanation(ft_raw)

        source_text = build_question_text(ex) + "\n\n" + (solution or "")

        records.append(
            {
                "example": ex,
                "true_letter": true_letter,
                "solution": solution,
                "source_text": source_text,
                "base_raw": base_raw,
                "base_letter": base_letter,
                "base_expl": base_expl,
                "ft_raw": ft_raw,
                "ft_letter": ft_letter,
                "ft_expl": ft_expl,
            }
        )
    return records

# Choose number of validation examples to evaluate
NUM_VAL_EXAMPLES = 300   # adjust as you like
print(f"Collecting outputs on {NUM_VAL_EXAMPLES} val examples...")
val_records = collect_outputs(val_ds, num_samples=NUM_VAL_EXAMPLES, max_new_tokens=96)

# --------------------------------
# 4. Metric 1 – VQA EM / F1
# --------------------------------
def _token_f1(pred, gold):
    pred_tokens = pred.lower().split()
    gold_tokens = gold.lower().split()
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 0.0
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

def vqa_metrics(records, model_prefix="base"):
    em_list, f1_list = [], []
    for r in records:
        true_letter = r["true_letter"]
        sol = r["solution"] or ""
        pred_letter = r[f"{model_prefix}_letter"]
        pred_expl = r[f"{model_prefix}_expl"] or ""
        em_list.append(1.0 if pred_letter == true_letter else 0.0)
        if sol:
            f1_list.append(_token_f1(pred_expl, sol))
    em = float(np.mean(em_list))
    f1 = float(np.mean(f1_list)) if f1_list else 0.0
    return {"VQA_EM": em, "VQA_F1_expl": f1}

# --------------------------------
# 5. Metric 2 – CLIPScore
# --------------------------------
clip_metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16").to(device)
to_tensor = transforms.ToTensor()

def clipscore_metric(records, model_prefix="base"):
    images, texts = [], []
    for r in records:
        img_pil = r["example"]["image"]
        text = r[f"{model_prefix}_raw"]
        img_tensor = to_tensor(img_pil)  # (C, H, W)
        images.append(img_tensor)
        texts.append(text)
    with torch.no_grad():
        score = clip_metric(images, texts)
    return {"CLIPScore": float(score.item())}

# ----------------------------------------------
# 6. Metric 3 – SummaC & FactCC (hallucination)
# ----------------------------------------------
print("Loading SummaC model...")
summac_model = SummaCZS(granularity="sentence", model_name="vitc")

def summac_metric(records, model_prefix="base"):
    sources = [r["source_text"] for r in records]
    summaries = [r[f"{model_prefix}_expl"] or "" for r in records]
    scores = summac_model.score(sources=sources, summaries=summaries)["scores"]
    return {"SummaC_mean": float(np.mean(scores))}

print("Loading FactCC model...")
factcc_model_id = "manueldeprada/FactCC"  # change if needed
factcc_tokenizer = AutoTokenizer.from_pretrained(factcc_model_id)
factcc_model = AutoModelForSequenceClassification.from_pretrained(factcc_model_id).to(device)
factcc_model.eval()

def factcc_metric(records, model_prefix="base", batch_size=16):
    probs_all = []
    for i in range(0, len(records), batch_size):
        batch = records[i:i+batch_size]
        sources = [r["source_text"] for r in batch]
        summaries = [r[f"{model_prefix}_expl"] or "" for r in batch]
        enc = factcc_tokenizer(
            sources,
            summaries,
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        with torch.no_grad():
            logits = factcc_model(**enc).logits
            # assume label 1 = factual
            probs = torch.softmax(logits, dim=-1)[:, 1]
        probs_all.extend(probs.cpu().tolist())
    return {"FactCC_mean_prob_factual": float(np.mean(probs_all))}

# ---------------------------------------------------
# 7. Metric 4 – G-Eval (LLM-as-judge) [optional]
# ---------------------------------------------------
def maybe_load_openai_client():
    try:
        from openai import OpenAI
        if os.getenv("OPENAI_API_KEY"):
            return OpenAI()
    except ImportError:
        return None
    return None

client = maybe_load_openai_client()

GEVAL_SYSTEM_PROMPT = """
You are grading answers to science visual question answering problems.

You will receive:
- the question and answer choices
- the reference correct answer letter
- the model's predicted answer and explanation

For each output, give three scores from 1 to 5:
- Fluency: Is the explanation clear, grammatical and well-written?
- Relevance: Does the explanation focus on the question and image/context?
- Correctness: Is the predicted answer letter correct and is the explanation logically consistent with the context?

Respond ONLY in JSON with keys: fluency, relevance, correctness.
"""

def geval_score_one(record, model_prefix="base"):
    ex = record["example"]
    q_block = build_question_text(ex)
    ref_letter = record["true_letter"]
    pred_raw = record[f"{model_prefix}_raw"]

    user_content = f"""
QUESTION & CONTEXT:
{q_block}

REFERENCE ANSWER LETTER: {ref_letter}

MODEL OUTPUT:
{pred_raw}
"""

    resp = client.responses.create(
        model="gpt-4o-mini",
        input=[
            {"role": "system", "content": GEVAL_SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ],
    )
    text = resp.output[0].content[0].text
    scores = json.loads(text)
    return scores["fluency"], scores["relevance"], scores["correctness"]

def geval_metric(records, model_prefix="base", num_samples=50):
    if client is None:
        print("⚠️ Skipping G-Eval: OpenAI client or API key not available.")
        return {}
    fl, rel, corr = [], [], []
    for r in records[:num_samples]:
        f, r_, c = geval_score_one(r, model_prefix)
        fl.append(f); rel.append(r_); corr.append(c)
    return {
        "G_eval_fluency": float(np.mean(fl)),
        "G_eval_relevance": float(np.mean(rel)),
        "G_eval_correctness": float(np.mean(corr)),
    }

# ---------------------------------------------------
# 8. Metric 5 – Safety: toxicity + NSFW image detection
# ---------------------------------------------------
print("Loading Detoxify toxicity model...")
tox_model = Detoxify("unbiased")

def toxicity_metric(records, model_prefix="base"):
    texts = [r[f"{model_prefix}_raw"] for r in records]
    scores = tox_model.predict(texts)["toxicity"]  # numpy array
    return {
        "toxicity_mean": float(np.mean(scores)),
        "toxicity_95p": float(np.percentile(scores, 95)),
    }

print("Loading NSFW image detection pipeline...")
nsfw_pipe = pipeline(
    "image-classification",
    model="Falconsai/nsfw_image_detection",
    device=0 if device.type == "cuda" else -1,
)

def nsfw_metric(records, threshold=0.5):
    nsfw_probs = []
    for r in records:
        img = r["example"]["image"]
        preds = nsfw_pipe(img)
        prob_nsfw = 0.0
        for p in preds:
            if "nsfw" in p["label"].lower():
                prob_nsfw = p["score"]
                break
        nsfw_probs.append(prob_nsfw)
    return {
        "NSFW_mean": float(np.mean(nsfw_probs)),
        "NSFW_95p": float(np.percentile(nsfw_probs, 95)),
    }

# -------------------------------------------
# 9. Compute ALL metrics & print comparison
# -------------------------------------------
def merge_metrics(*metric_dicts):
    merged = {}
    for d in metric_dicts:
        merged.update(d)
    return merged

print("\nComputing metrics...")

base_vqa   = vqa_metrics(val_records, "base")
ft_vqa     = vqa_metrics(val_records, "ft")

base_clip  = clipscore_metric(val_records, "base")
ft_clip    = clipscore_metric(val_records, "ft")

base_summ  = summac_metric(val_records, "base")
ft_summ    = summac_metric(val_records, "ft")

base_fact  = factcc_metric(val_records, "base")
ft_fact    = factcc_metric(val_records, "ft")

base_tox   = toxicity_metric(val_records, "base")
ft_tox     = toxicity_metric(val_records, "ft")

images_nsfw = nsfw_metric(val_records)

base_geval = geval_metric(val_records, "base", num_samples=30)
ft_geval   = geval_metric(val_records, "ft",   num_samples=30)

base_all = merge_metrics(base_vqa, base_clip, base_summ, base_fact, base_tox, base_geval)
ft_all   = merge_metrics(ft_vqa,   ft_clip,  ft_summ,  ft_fact,  ft_tox,  ft_geval)

print("\n=== METRIC COMPARISON (base vs finetuned) on val subset ===")
for k in sorted(base_all.keys()):
    print(f"{k:28s}  base={base_all[k]:.4f}   ft={ft_all[k]:.4f}")

print("\n=== IMAGE SAFETY (dataset images) ===")
for k, v in images_nsfw.items():
    print(f"{k:28s}  {v:.4f}")


ModuleNotFoundError: No module named 'summac.model'

In [None]:
# ======================================
# ONE-CELL EVAL: BASE VS FINETUNED QWEN2-VL
# ======================================
# If needed, install once in a separate cell:
# !pip install torchmetrics summac detoxify openai torchvision

import os, json, re
from collections import Counter

import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm

from transformers import (
    Qwen2VLForConditionalGeneration,
    Qwen2VLProcessor,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
)
from qwen_vl_utils import process_vision_info

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# -----------------------------
# Soft imports / feature flags
# -----------------------------
# CLIPScore + torchvision
try:
    from torchmetrics.multimodal import CLIPScore
    from torchvision import transforms
    HAS_CLIP = True
except Exception as e:
    print("⚠️ CLIPScore not available:", e)
    HAS_CLIP = False

# SummaC
try:
    from summac.model import SummaCZS
    HAS_SUMMAC = True
except Exception as e:
    print("⚠️ SummaC not available:", e)
    HAS_SUMMAC = False

# Detoxify
try:
    from detoxify import Detoxify
    HAS_DETOXIFY = True
except Exception as e:
    print("⚠️ Detoxify not available:", e)
    HAS_DETOXIFY = False

# OpenAI client (from OPENAI_API_KEY variable in notebook, not env)
def maybe_load_openai_client_from_var():
    try:
        from openai import OpenAI
    except Exception:
        print("⚠️ openai package not available: G-Eval will be skipped.")
        return None
    key = globals().get("OPENAI_API_KEY", None)
    if not key:
        print("⚠️ OPENAI_API_KEY variable not set: G-Eval will be skipped.")
        return None
    return OpenAI(api_key=key)

client = maybe_load_openai_client_from_var()

# -----------------------------
# Dataset helpers (match training)
# -----------------------------
system_message = """You are a helpful science tutor.
You see a question, several answer choices, and often an image and a hint/lecture.
Your job is to pick the single correct multiple-choice option.
Start your answer with ONLY the letter of the correct option (A, B, C, D, etc.),
then a period, then a short explanation.
Example: "C. Because ..."
"""

LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

def index_to_letter(idx: int) -> str:
    return LETTERS[idx]

def build_question_text(example) -> str:
    choices = example["choices"]
    options_text = "\n".join(
        f"({LETTERS[i]}) {choice}" for i, choice in enumerate(choices)
    )
    hint = example.get("hint", "")
    lecture = example.get("lecture", "")
    hint_part = f"\nHint: {hint}" if hint else ""
    lecture_part = f"\nLecture: {lecture}" if lecture else ""
    return (
        f"Question: {example['question']}\n\n"
        f"Choices:\n{options_text}"
        f"{hint_part}{lecture_part}\n\n"
        f"Respond starting with the letter, then a period, then a brief explanation."
    )

def format_example(example):
    correct_letter = index_to_letter(int(example["answer"]))
    explanation = example.get("solution", "") or example.get("explanation", "")
    if explanation:
        assistant_text = f"{correct_letter}. {explanation}"
    else:
        assistant_text = correct_letter

    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_message}],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": example["image"]},
                {"type": "text", "text": build_question_text(example)},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": assistant_text}],
        },
    ]
    return {"images": [example["image"]], "messages": messages}

def parse_letter_and_explanation(text: str):
    m = re.match(r"\s*([A-Z])\s*[\.\-\):]\s*(.*)", text, flags=re.DOTALL)
    if m:
        letter = m.group(1)
        explanation = m.group(2).strip()
    else:
        m2 = re.search(r"[A-Z]", text)
        if m2:
            letter = m2.group(0)
            explanation = text[m2.end():].strip()
        else:
            letter = None
            explanation = text.strip()
    return letter, explanation

# -----------------------------
# 1. Load base & finetuned
# -----------------------------
BASE_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"
FT_ADAPTER_DIR = "qwen2-vl-2b-scienceqa-lora-expl"  # change if needed

print("Loading base model...")
base_model = Qwen2VLForConditionalGeneration.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
base_model.eval()
base_processor = Qwen2VLProcessor.from_pretrained(BASE_MODEL_ID)

print("Loading finetuned model + adapter...")
ft_model = Qwen2VLForConditionalGeneration.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
ft_model.load_adapter(FT_ADAPTER_DIR)
ft_model.eval()
ft_processor = Qwen2VLProcessor.from_pretrained(BASE_MODEL_ID)

# -----------------------------
# 2. Shared generation helper
# -----------------------------
def generate_answer_with_explanation_generic(model, processor, example, max_new_tokens=96):
    formatted = format_example(example)
    conv_for_gen = formatted["messages"][:2]  # system + user

    text_prompt = processor.apply_chat_template(
        conv_for_gen,
        tokenize=False,
        add_generation_prompt=True,
    )

    image_inputs, _ = process_vision_info(formatted["messages"])

    inputs = processor(
        text=[text_prompt],
        images=image_inputs,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        gen_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
        )

    trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, gen_ids)
    ]

    out_text = processor.batch_decode(
        trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

    return out_text

# -----------------------------
# 3. Collect outputs
# -----------------------------
def collect_outputs(dataset, num_samples=None, max_new_tokens=96):
    n = len(dataset) if num_samples is None else min(num_samples, len(dataset))
    records = []
    for i in tqdm(range(n), desc="Collecting outputs"):
        ex = dataset[i]
        true_letter = index_to_letter(int(ex["answer"]))
        solution = ex.get("solution", "")

        base_raw = generate_answer_with_explanation_generic(
            base_model, base_processor, ex, max_new_tokens=max_new_tokens
        )
        base_letter, base_expl = parse_letter_and_explanation(base_raw)

        ft_raw = generate_answer_with_explanation_generic(
            ft_model, ft_processor, ex, max_new_tokens=max_new_tokens
        )
        ft_letter, ft_expl = parse_letter_and_explanation(ft_raw)

        source_text = build_question_text(ex) + "\n\n" + (solution or "")

        records.append(
            {
                "example": ex,
                "true_letter": true_letter,
                "solution": solution,
                "source_text": source_text,
                "base_raw": base_raw,
                "base_letter": base_letter,
                "base_expl": base_expl,
                "ft_raw": ft_raw,
                "ft_letter": ft_letter,
                "ft_expl": ft_expl,
            }
        )
    return records

NUM_VAL_EXAMPLES = 300  # change if you want more/less
print(f"Collecting outputs on {NUM_VAL_EXAMPLES} val examples...")
val_records = collect_outputs(val_ds, num_samples=NUM_VAL_EXAMPLES, max_new_tokens=96)




# -----------------------------
# 6. SummaC & FactCC
# -----------------------------
if HAS_SUMMAC:
    print("Loading SummaC model...")
    summac_model = SummaCZS(granularity="sentence", model_name="vitc")

    def summac_metric(records, model_prefix="base"):
        sources = [r["source_text"] for r in records]
        summaries = [r[f"{model_prefix}_expl"] or "" for r in records]
        scores = summac_model.score(sources=sources, summaries=summaries)["scores"]
        return {"SummaC_mean": float(np.mean(scores))}
else:
    def summac_metric(records, model_prefix="base"):
        return {}

print("Loading FactCC model...")
factcc_model_id = "manueldeprada/FactCC"
factcc_tokenizer = AutoTokenizer.from_pretrained(factcc_model_id)
factcc_model = AutoModelForSequenceClassification.from_pretrained(factcc_model_id).to(device)
factcc_model.eval()

def factcc_metric(records, model_prefix="base", batch_size=16):
    probs_all = []
    for i in range(0, len(records), batch_size):
        batch = records[i:i+batch_size]
        sources = [r["source_text"] for r in batch]
        summaries = [r[f"{model_prefix}_expl"] or "" for r in batch]
        enc = factcc_tokenizer(
            sources,
            summaries,
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        with torch.no_grad():
            logits = factcc_model(**enc).logits
            probs = torch.softmax(logits, dim=-1)[:, 1]  # assume label 1 = factual
        probs_all.extend(probs.cpu().tolist())
    return {"FactCC_mean_prob_factual": float(np.mean(probs_all))}





# -----------------------------
# 9. Compute all metrics
# -----------------------------
def merge_metrics(*metric_dicts):
    merged = {}
    for d in metric_dicts:
        merged.update(d)
    return merged

print("\nComputing metrics...")

base_vqa   = vqa_metrics(val_records, "base")
ft_vqa     = vqa_metrics(val_records, "ft")

base_clip  = clipscore_metric(val_records, "base")
ft_clip    = clipscore_metric(val_records, "ft")

base_summ  = summac_metric(val_records, "base")
ft_summ    = summac_metric(val_records, "ft")

base_fact  = factcc_metric(val_records, "base")
ft_fact    = factcc_metric(val_records, "ft")

base_tox   = toxicity_metric(val_records, "base")
ft_tox     = toxicity_metric(val_records, "ft")

images_nsfw = nsfw_metric(val_records) if HAS_NSFW else {}

base_geval = geval_metric(val_records, "base", num_samples=20)
ft_geval   = geval_metric(val_records, "ft",   num_samples=20)

base_all = merge_metrics(base_vqa, base_clip, base_summ, base_fact, base_tox, base_geval)
ft_all   = merge_metrics(ft_vqa,   ft_clip,  ft_summ,  ft_fact,  ft_tox,  ft_geval)

print("\n=== METRIC COMPARISON (base vs finetuned) on val subset ===")
for k in sorted(base_all.keys()):
    print(f"{k:28s}  base={base_all[k]:.4f}   ft={ft_all[k]:.4f}")

if HAS_NSFW:
    print("\n=== IMAGE SAFETY (NSFW) ===")
    for k, v in images_nsfw.items():
        print(f"{k:28s}  {v:.4f}")
else:
    print("\n=== IMAGE SAFETY (NSFW) ===")
    print("NSFW metric skipped (pipeline not available).")


Using device: cuda
⚠️ SummaC not available: No module named 'summac.model'
Loading base model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading finetuned model + adapter...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Collecting outputs on 300 val examples...


Collecting outputs: 100%|██████████| 300/300 [06:13<00:00,  1.25s/it]


Loading FactCC model...
Loading Detoxify toxicity model...
Loading NSFW image detection pipeline...
⚠️ NSFW pipeline not available, skipping NSFW metric: No module named 'transformers.models.ijepa'

Computing metrics...


Token indices sequence length is longer than the specified maximum sequence length for this model (97 > 77). Running this sequence through the model will result in indexing errors


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# -----------------------------
# 4. VQA metrics (EM / F1)
# -----------------------------
def _token_f1(pred, gold):
    pred_tokens = pred.lower().split()
    gold_tokens = gold.lower().split()
    if len(pred_tokens) == 0 or len(gold_tokens) == 0:
        return 0.0
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

def vqa_metrics(records, model_prefix="base"):
    em_list, f1_list = [], []
    for r in records:
        true_letter = r["true_letter"]
        sol = r["solution"] or ""
        pred_letter = r[f"{model_prefix}_letter"]
        pred_expl = r[f"{model_prefix}_expl"] or ""
        em_list.append(1.0 if pred_letter == true_letter else 0.0)
        if sol:
            f1_list.append(_token_f1(pred_expl, sol))
    em = float(np.mean(em_list))
    f1 = float(np.mean(f1_list)) if f1_list else 0.0
    return {"VQA_EM": em, "VQA_F1_expl": f1}


In [None]:
# -----------------------------
# 5. CLIPScore (if available)
# -----------------------------
if HAS_CLIP:
    clip_metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16").to(device)
    to_tensor = transforms.ToTensor()

    def clipscore_metric(records, model_prefix="base"):
        images, texts = [], []
        for r in records:
            img_pil = r["example"]["image"]
            text = r[f"{model_prefix}_raw"]
            img_tensor = to_tensor(img_pil)
            images.append(img_tensor)
            texts.append(text)
        with torch.no_grad():
            score = clip_metric(images, texts)
        return {"CLIPScore": float(score.item())}
else:
    def clipscore_metric(records, model_prefix="base"):
        return {}

In [None]:
# -----------------------------
# 7. G-Eval (LLM-as-judge, optional)
# -----------------------------
GEVAL_SYSTEM_PROMPT = """
You are grading answers to science visual question answering problems.

You will receive:
- the question and answer choices
- the reference correct answer letter
- the model's predicted answer and explanation

For each output, give three scores from 1 to 5:
- Fluency: Is the explanation clear, grammatical and well-written?
- Relevance: Does the explanation focus on the question and image/context?
- Correctness: Is the predicted answer letter correct and is the explanation logically consistent with the context?

Respond ONLY in JSON with keys: fluency, relevance, correctness.
"""

def geval_score_one(record, model_prefix="base"):
    ex = record["example"]
    q_block = build_question_text(ex)
    ref_letter = record["true_letter"]
    pred_raw = record[f"{model_prefix}_raw"]

    user_content = f"""
QUESTION & CONTEXT:
{q_block}

REFERENCE ANSWER LETTER: {ref_letter}

MODEL OUTPUT:
{pred_raw}
"""
    resp = client.responses.create(
        model="gpt-4o-mini",
        input=[
            {"role": "system", "content": GEVAL_SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ],
    )
    text = resp.output[0].content[0].text
    scores = json.loads(text)
    return scores["fluency"], scores["relevance"], scores["correctness"]

def geval_metric(records, model_prefix="base", num_samples=30):
    if client is None:
        return {}
    fl, rel, corr = [], [], []
    for r in records[:num_samples]:
        f, r_, c = geval_score_one(r, model_prefix)
        fl.append(f); rel.append(r_); corr.append(c)
    return {
        "G_eval_fluency": float(np.mean(fl)),
        "G_eval_relevance": float(np.mean(rel)),
        "G_eval_correctness": float(np.mean(corr)),
    }

In [None]:
# -----------------------------
# 8. Safety: toxicity + NSFW
# -----------------------------
if HAS_DETOXIFY:
    print("Loading Detoxify toxicity model...")
    tox_model = Detoxify("unbiased")

    def toxicity_metric(records, model_prefix="base"):
        texts = [r[f"{model_prefix}_raw"] for r in records]
        scores = tox_model.predict(texts)["toxicity"]
        return {
            "toxicity_mean": float(np.mean(scores)),
            "toxicity_95p": float(np.percentile(scores, 95)),
        }
else:
    def toxicity_metric(records, model_prefix="base"):
        return {}

# NSFW image detection: wrap in try/except so it NEVER crashes
HAS_NSFW = True
try:
    print("Loading NSFW image detection pipeline...")
    nsfw_pipe = pipeline(
        "image-classification",
        model="Falconsai/nsfw_image_detection",
        device=0 if device.type == "cuda" else -1,
    )

    def nsfw_metric(records):
        nsfw_probs = []
        for r in records:
            img = r["example"]["image"]
            preds = nsfw_pipe(img)
            prob_nsfw = 0.0
            for p in preds:
                if "nsfw" in p["label"].lower():
                    prob_nsfw = p["score"]
                    break
            nsfw_probs.append(prob_nsfw)
        return {
            "NSFW_mean": float(np.mean(nsfw_probs)),
            "NSFW_95p": float(np.percentile(nsfw_probs, 95)),
        }
except Exception as e:
    HAS_NSFW = False
    print("⚠️ NSFW pipeline not available, skipping NSFW metric:", e)

    def nsfw_metric(records):
        return {}

Loading Detoxify toxicity model...
Loading NSFW image detection pipeline...
⚠️ NSFW pipeline not available, skipping NSFW metric: No module named 'transformers.models.ijepa'


In [None]:
def geval_metric(records, model_prefix="base", num_samples=30):
    return {}


In [None]:
print("\nComputing metrics...")

base_vqa   = vqa_metrics(val_records, "base")
ft_vqa     = vqa_metrics(val_records, "ft")

base_clip  = clipscore_metric(val_records, "base")
ft_clip    = clipscore_metric(val_records, "ft")

base_summ  = summac_metric(val_records, "base")
ft_summ    = summac_metric(val_records, "ft")

base_fact  = factcc_metric(val_records, "base")
ft_fact    = factcc_metric(val_records, "ft")

base_tox   = toxicity_metric(val_records, "base")
ft_tox     = toxicity_metric(val_records, "ft")

images_nsfw = nsfw_metric(val_records) if HAS_NSFW else {}

base_geval = geval_metric(val_records, "base", num_samples=20)
ft_geval   = geval_metric(val_records, "ft",   num_samples=20)

base_all = merge_metrics(base_vqa, base_clip, base_summ, base_fact, base_tox, base_geval)
ft_all   = merge_metrics(ft_vqa,   ft_clip,  ft_summ,  ft_fact,  ft_tox,  ft_geval)

print("\n=== METRIC COMPARISON (base vs finetuned) on val subset ===")
for k in sorted(base_all.keys()):
    print(f"{k:28s}  base={base_all[k]:.4f}   ft={ft_all[k]:.4f}")

if HAS_NSFW:
    print("\n=== IMAGE SAFETY (NSFW) ===")
    for k, v in images_nsfw.items():
        print(f"{k:28s}  {v:.4f}")
else:
    print("\n=== IMAGE SAFETY (NSFW) ===")
    print("NSFW metric skipped (pipeline not available).")



Computing metrics...

=== METRIC COMPARISON (base vs finetuned) on val subset ===
CLIPScore                     base=23.0492   ft=22.4087
FactCC_mean_prob_factual      base=0.4172   ft=0.1877
VQA_EM                        base=0.6500   ft=0.6733
VQA_F1_expl                   base=0.2056   ft=0.7695
toxicity_95p                  base=0.0143   ft=0.0050
toxicity_mean                 base=0.0031   ft=0.0018

=== IMAGE SAFETY (NSFW) ===
NSFW metric skipped (pipeline not available).


In [None]:
# ============================
# VQA-SPECIFIC EVAL SUITE
# ============================
import numpy as np
from collections import defaultdict, Counter
import re
from tqdm import tqdm
from qwen_vl_utils import process_vision_info
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -----------------------------
# Helpers (reuse from training)
# -----------------------------
LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

def index_to_letter(idx: int) -> str:
    return LETTERS[idx]

system_message = """You are a helpful science tutor.
You see a question, several answer choices, and often an image and a hint/lecture.
Your job is to pick the single correct multiple-choice option.
Start your answer with ONLY the letter of the correct option (A, B, C, D, etc.),
then a period, then a short explanation.
Example: "C. Because ..."
"""

def build_question_text(example) -> str:
    choices = example["choices"]
    options_text = "\n".join(
        f"({LETTERS[i]}) {choice}" for i, choice in enumerate(choices)
    )
    hint = example.get("hint", "")
    lecture = example.get("lecture", "")
    hint_part = f"\nHint: {hint}" if hint else ""
    lecture_part = f"\nLecture: {lecture}" if lecture else ""
    return (
        f"Question: {example['question']}\n\n"
        f"Choices:\n{options_text}"
        f"{hint_part}{lecture_part}\n\n"
        f"Respond starting with the letter, then a period, then a brief explanation."
    )

def parse_letter_and_explanation(text: str):
    m = re.match(r"\s*([A-Z])\s*[\.\-\):]\s*(.*)", text, flags=re.DOTALL)
    if m:
        letter = m.group(1)
        explanation = m.group(2).strip()
    else:
        m2 = re.search(r"[A-Z]", text)
        if m2:
            letter = m2.group(0)
            explanation = text[m2.end():].strip()
        else:
            letter = None
            explanation = text.strip()
    return letter, explanation

# -----------------------------
# Generation with/without image
# -----------------------------
def generate_answer_mc(
    model,
    processor,
    example,
    use_image: bool = True,
    max_new_tokens: int = 32,
):
    """
    Generate "letter + explanation" answer.
    If use_image=False, we drop the image and do text-only QA.
    """
    # Build messages
    user_content = []
    if use_image:
        user_content.append({"type": "image", "image": example["image"]})
    user_content.append({"type": "text", "text": build_question_text(example)})

    messages = [
        {"role": "system", "content": [{"type": "text", "text": system_message}]},
        {"role": "user", "content": user_content},
    ]

    # Chat template
    text_prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    if use_image:
        image_inputs, _ = process_vision_info(messages)
    else:
        image_inputs = None

    inputs = processor(
        text=[text_prompt],
        images=image_inputs,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        gen_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
        )

    trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, gen_ids)
    ]

    out_text = processor.batch_decode(
        trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0].strip()

    letter, expl = parse_letter_and_explanation(out_text)
    return letter, expl, out_text

# -----------------------------
# Collect VQA outputs
# -----------------------------
def collect_vqa_records(dataset, num_samples=None, max_new_tokens=32):
    """
    For each example:
      - base model with image
      - base model without image
      - finetuned with image
      - finetuned without image
    """
    n = len(dataset) if num_samples is None else min(num_samples, len(dataset))
    records = []
    for i in tqdm(range(n), desc="Collecting VQA outputs (img/no-img)"):
        ex = dataset[i]
        true_letter = index_to_letter(int(ex["answer"]))

        # Base with image
        b_img_letter, b_img_expl, b_img_raw = generate_answer_mc(
            base_model, base_processor, ex, use_image=True, max_new_tokens=max_new_tokens
        )
        # Base without image (text-only)
        b_txt_letter, b_txt_expl, b_txt_raw = generate_answer_mc(
            base_model, base_processor, ex, use_image=False, max_new_tokens=max_new_tokens
        )

        # Finetuned with image
        ft_img_letter, ft_img_expl, ft_img_raw = generate_answer_mc(
            ft_model, ft_processor, ex, use_image=True, max_new_tokens=max_new_tokens
        )
        # Finetuned without image
        ft_txt_letter, ft_txt_expl, ft_txt_raw = generate_answer_mc(
            ft_model, ft_processor, ex, use_image=False, max_new_tokens=max_new_tokens
        )

        records.append(
            {
                "example": ex,
                "true_letter": true_letter,

                "base_letter_img": b_img_letter,
                "base_expl_img": b_img_expl,
                "base_raw_img": b_img_raw,

                "base_letter_txt": b_txt_letter,
                "base_expl_txt": b_txt_expl,
                "base_raw_txt": b_txt_raw,

                "ft_letter_img": ft_img_letter,
                "ft_expl_img": ft_img_expl,
                "ft_raw_img": ft_img_raw,

                "ft_letter_txt": ft_txt_letter,
                "ft_expl_txt": ft_txt_expl,
                "ft_raw_txt": ft_txt_raw,
            }
        )
    return records

# Run collection (small subset for speed)
NUM_VAL_EXAMPLES_VQA = 300  # change as you like
vqa_records = collect_vqa_records(val_ds, num_samples=NUM_VAL_EXAMPLES_VQA, max_new_tokens=32)

# -----------------------------
# Metric helpers
# -----------------------------
def accuracy(records, pred_key):
    vals = []
    for r in records:
        true_l = r["true_letter"]
        pred_l = r[pred_key]
        vals.append(1.0 if pred_l == true_l else 0.0)
    return float(np.mean(vals))

def accuracy_by_group(records, pred_key, group_key):
    """
    Accuracy broken down by a field in the ScienceQA example:
      - group_key can be 'subject', 'grade', 'task', 'category', etc.
    """
    groups = defaultdict(list)
    for r in records:
        ex = r["example"]
        group_val = ex.get(group_key, None)
        if group_val is None:
            continue
        true_l = r["true_letter"]
        pred_l = r[pred_key]
        groups[group_val].append(1.0 if pred_l == true_l else 0.0)
    return {g: float(np.mean(vs)) for g, vs in groups.items() if vs}

def accuracy_by_true_option(records, pred_key):
    """
    Measures how well the model does for each correct option (A/B/C/D...).
    This catches bias like "model rarely chooses option D as correct".
    """
    groups = defaultdict(list)
    for r in records:
        true_l = r["true_letter"]
        pred_l = r[pred_key]
        groups[true_l].append(1.0 if pred_l == true_l else 0.0)
    return {opt: float(np.mean(vs)) for opt, vs in groups.items() if vs}

def prediction_distribution(records, pred_key):
    """
    How often the model outputs each option overall.
    """
    counts = Counter()
    for r in records:
        pred_l = r[pred_key]
        if pred_l is not None:
            counts[pred_l] += 1
    total = sum(counts.values())
    return {opt: cnt / total for opt, cnt in counts.items()}

# -----------------------------
# 1. Overall accuracy (img vs no-img)
# -----------------------------
print("\n=== Overall MC Accuracy (img vs no-image) ===")
for model_prefix, key_img, key_txt in [
    ("BASE", "base_letter_img", "base_letter_txt"),
    ("FINETUNED", "ft_letter_img", "ft_letter_txt"),
]:
    acc_img = accuracy(vqa_records, key_img)
    acc_txt = accuracy(vqa_records, key_txt)
    print(f"{model_prefix:10s}  with image: {acc_img:.4f}   text-only: {acc_txt:.4f}   Δ={acc_img-acc_txt:+.4f}")

# -----------------------------
# 2. Per-subject & per-grade accuracy
# -----------------------------
print("\n=== Per-subject accuracy (with image) ===")
base_subj = accuracy_by_group(vqa_records, "base_letter_img", "subject")
ft_subj   = accuracy_by_group(vqa_records, "ft_letter_img", "subject")

for subj in sorted(set(base_subj.keys()) | set(ft_subj.keys())):
    b = base_subj.get(subj, float("nan"))
    f = ft_subj.get(subj, float("nan"))
    print(f"{subj:15s}  base={b:.4f}   ft={f:.4f}")

print("\n=== Per-grade accuracy (with image) ===")
base_grade = accuracy_by_group(vqa_records, "base_letter_img", "grade")
ft_grade   = accuracy_by_group(vqa_records, "ft_letter_img", "grade")

for g in sorted(set(base_grade.keys()) | set(ft_grade.keys())):
    b = base_grade.get(g, float("nan"))
    f = ft_grade.get(g, float("nan"))
    print(f"Grade {str(g):5s}  base={b:.4f}   ft={f:.4f}")

# -----------------------------
# 3. Per-option accuracy + bias
# -----------------------------
print("\n=== Per-true-option accuracy (with image) ===")
base_opt = accuracy_by_true_option(vqa_records, "base_letter_img")
ft_opt   = accuracy_by_true_option(vqa_records, "ft_letter_img")

for opt in sorted(set(base_opt.keys()) | set(ft_opt.keys())):
    b = base_opt.get(opt, float("nan"))
    f = ft_opt.get(opt, float("nan"))
    print(f"Option {opt}:  base={b:.4f}   ft={f:.4f}")

print("\n=== Prediction distribution (what the model selects) ===")
print("BASE with image:")
print(prediction_distribution(vqa_records, "base_letter_img"))
print("FINETUNED with image:")
print(prediction_distribution(vqa_records, "ft_letter_img"))

# -----------------------------
# 4. Image reliance metric
# -----------------------------
print("\n=== Image reliance (accuracy drop if image removed) ===")
for model_prefix, key_img, key_txt in [
    ("BASE", "base_letter_img", "base_letter_txt"),
    ("FINETUNED", "ft_letter_img", "ft_letter_txt"),
]:
    acc_img = accuracy(vqa_records, key_img)
    acc_txt = accuracy(vqa_records, key_txt)
    print(f"{model_prefix:10s}  Δ (img - no-img) = {acc_img - acc_txt:+.4f}")


Collecting VQA outputs (img/no-img): 100%|██████████| 300/300 [07:38<00:00,  1.53s/it]


=== Overall MC Accuracy (img vs no-image) ===
BASE        with image: 0.6500   text-only: 0.5633   Δ=+0.0867
FINETUNED   with image: 0.6733   text-only: 0.5567   Δ=+0.1167

=== Per-subject accuracy (with image) ===
language science  base=0.7143   ft=0.7143
natural science  base=0.5722   ft=0.5979
social science   base=0.7980   ft=0.8182

=== Per-grade accuracy (with image) ===
Grade grade1  base=0.0000   ft=0.0000
Grade grade2  base=0.6667   ft=0.7143
Grade grade3  base=0.6744   ft=0.7442
Grade grade4  base=0.7612   ft=0.7015
Grade grade5  base=0.7907   ft=0.7674
Grade grade6  base=0.4444   ft=0.6000
Grade grade7  base=0.6486   ft=0.5946
Grade grade8  base=0.5349   ft=0.6047

=== Per-true-option accuracy (with image) ===
Option A:  base=0.5684   ft=0.6211
Option B:  base=0.7109   ft=0.6719
Option C:  base=0.5510   ft=0.6327
Option D:  base=0.8519   ft=0.9630
Option E:  base=0.0000   ft=0.0000

=== Prediction distribution (what the model selects) ===
BASE with image:
{'B': 0.4433333333




In [None]:
# ============================================
# UNIFIED MULTI-BENCHMARK EVALUATION SCRIPT
# Qwen2-VL-2B (Finetuned) VQA Benchmark Suite
# ============================================
import json
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt

import torch
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor

device = "cuda"

# ================================
# Load finetuned model + processor
# ================================
FT_ADAPTER_DIR = "qwen2-vl-2b-scienceqa-lora-expl"

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.load_adapter(FT_ADAPTER_DIR)
model.eval()

processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# ================================
# Universal VQA inference wrapper
# ================================
def vlm_answer(image, question, max_new_tokens=64):
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question}
            ]
        }
    ]
    text_prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    img_inputs, _ = process_vision_info(messages)

    inputs = processor(
        text=[text_prompt],
        images=img_inputs,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=max_new_tokens)

    decoded = processor.batch_decode(
        out[:, inputs.input_ids.shape[1]:],
        skip_special_tokens=True,
    )[0].strip()
    return decoded

# ================================================
# 1) SCIENCEQA OFFICIAL ACCURACY (val or test set)
# ================================================
lettermap = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

def evaluate_scienceqa(dataset):
    correct = 0
    for ex in tqdm(dataset, desc="ScienceQA"):
        img = ex["image"]
        q = ex["question"]
        choices = ex["choices"]
        gold = lettermap[int(ex["answer"])]

        q_full = q + "\n" + "\n".join(
            f"({lettermap[i]}) {c}" for i, c in enumerate(choices)
        ) + "\nAnswer with the letter only."

        pred = vlm_answer(img, q_full)
        pred_letter = next((L for L in lettermap if L in pred), None)

        if pred_letter == gold:
            correct += 1
    return correct / len(dataset)

# ===============================================
# 2) MMBench Accuracy
# ===============================================
def evaluate_mmbench(path):
    df = pd.read_csv(path, sep="\t")
    correct = 0

    for _, row in tqdm(df.iterrows(), total=len(df), desc="MMBench"):
        img = Image.open(row["img_path"]).convert("RGB")
        choices = [row[f"choice_{i}"] for i in range(1, 9) if pd.notna(row.get(f"choice_{i}"))]
        ans = row["answer"]

        q_full = row["question"] + "\n" + "\n".join(
            f"({lettermap[i]}) {choices[i]}" for i in range(len(choices))
        ) + "\nAnswer with the letter only."

        pred = vlm_answer(img, q_full)
        pred_letter = next((L for L in lettermap if L in pred), None)

        if pred_letter == ans:
            correct += 1

    return correct / len(df)

# ===============================================
# 3) SEED-Bench (image subset)
# ===============================================
def evaluate_seed(path):
    with open(path) as f:
        data = json.load(f)

    correct = 0
    for item in tqdm(data, desc="SEED"):
        img = Image.open(item["image"]).convert("RGB")
        q = item["question"]
        choices = item["choices"]
        gold = item["answer"]  # A/B/C/D

        q_full = q + "\n" + "\n".join(
            f"({lettermap[i]}) {choices[i]}" for i in range(len(choices))
        ) + "\nAnswer with the letter only."

        pred = vlm_answer(img, q_full)
        pred_letter = next((L for L in "ABCD" if L in pred), None)

        if pred_letter == gold:
            correct += 1

    return correct / len(data)

# ===============================================
# 4) MMMU-Lite
# ===============================================
def evaluate_mmmu(path):
    with open(path) as f:
        data = json.load(f)

    correct = 0
    for ex in tqdm(data, desc="MMMU-Lite"):
        img = Image.open(ex["image"]).convert("RGB")
        q = ex["question"]
        choices = ex["options"]
        gold = ex["answer"]

        q_full = q + "\n" + "\n".join(
            f"({lettermap[i]}) {choices[i]}" for i in range(len(choices))
        ) + "\nAnswer with the letter only."

        pred = vlm_answer(img, q_full)
        pred_letter = next((L for L in lettermap if L in pred), None)

        if pred_letter == gold:
            correct += 1

    return correct / len(data)

# ===============================================
# 5) ChartQA (free-form answer)
# ===============================================
def evaluate_chartqa(path):
    with open(path) as f:
        data = json.load(f)

    correct = 0
    for ex in tqdm(data, desc="ChartQA"):
        img = Image.open(ex["img_path"]).convert("RGB")
        q = ex["query"]
        gold = ex["label"].lower().strip()

        pred = vlm_answer(img, q).lower().strip()

        if pred == gold:
            correct += 1

    return correct / len(data)

# ===============================================
# RUN ALL BENCHMARKS
# ===============================================

results = {}

# You MUST set paths here:
PATH_MMBENCH = "PATH_MMBENCH = "https://raw.githubusercontent.com/open-compass/opencompass/main/mmbench/mmbench_dev_en_20231003.tsv"
PATH_SEED = "SEED-Bench-Image.json"
PATH_MMMU = "MMMU_lite.json"
PATH_CHARTQA = "ChartQA/test.json"

results["ScienceQA"] = evaluate_scienceqa(val_ds)
results["MMBench"] = evaluate_mmbench(PATH_MMBENCH)
results["SEED-Bench"] = evaluate_seed(PATH_SEED)
results["MMMU-Lite"] = evaluate_mmmu(PATH_MMMU)
results["ChartQA"] = evaluate_chartqa(PATH_CHARTQA)

print("\n=== UNIFIED RESULTS TABLE ===")
for k, v in results.items():
    print(f"{k:15s} = {v:.4f}")

# ===============================================
# RADAR PLOT
# ===============================================

labels = list(results.keys())
scores = [results[k] for k in labels]

angles = np.linspace(0, 2*np.pi, len(labels), endpoint=False).tolist()
scores += scores[:1]
angles += angles[:1]

fig = plt.figure(figsize=(7, 7))
ax = plt.subplot(111, polar=True)

ax.plot(angles, scores, linewidth=2)
ax.fill(angles, scores, alpha=0.25)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(labels)
ax.set_title("Qwen2-VL-2B Finetuned — VQA Radar Plot")

plt.show()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

ScienceQA: 100%|██████████| 2097/2097 [02:15<00:00, 15.47it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'mmbench_dev_en_20231003.tsv'

In [None]:
# ===============================================
# RUN ALL BENCHMARKS
# ===============================================

results = {}

# You MUST set paths here:
PATH_MMBENCH = "https://raw.githubusercontent.com/open-compass/opencompass/main/mmbench/mmbench_dev_en_20231003.tsv"
PATH_SEED = "SEED-Bench-Image.json"
PATH_MMMU = "MMMU_lite.json"
PATH_CHARTQA = "ChartQA/test.json"

results["ScienceQA"] = evaluate_scienceqa(val_ds)
results["MMBench"] = evaluate_mmbench(PATH_MMBENCH)
results["SEED-Bench"] = evaluate_seed(PATH_SEED)
results["MMMU-Lite"] = evaluate_mmmu(PATH_MMMU)
results["ChartQA"] = evaluate_chartqa(PATH_CHARTQA)

print("\n=== UNIFIED RESULTS TABLE ===")
for k, v in results.items():
    print(f"{k:15s} = {v:.4f}")

# ===============================================
# RADAR PLOT
# ===============================================

labels = list(results.keys())
scores = [results[k] for k in labels]

angles = np.linspace(0, 2*np.pi, len(labels), endpoint=False).tolist()
scores += scores[:1]
angles += angles[:1]

fig = plt.figure(figsize=(7, 7))
ax = plt.subplot(111, polar=True)

ax.plot(angles, scores, linewidth=2)
ax.fill(angles, scores, alpha=0.25)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(labels)
ax.set_title("Qwen2-VL-2B Finetuned — VQA Radar Plot")

plt.show()


ScienceQA: 100%|██████████| 2097/2097 [02:14<00:00, 15.58it/s]


HTTPError: HTTP Error 404: Not Found

In [None]:
!pip install matplotlib

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting matplotlib
  Downloading matplotlib-3.10.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.61.0-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (113 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (6.3 kB)
Downloading matplotlib-3.10.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (8.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m147.5 MB/s[0m  [33m0:00:00[0m
[?25hDownloading contourpy-1.3.3-cp312-cp312-man

In [None]:
# Merge LoRA + base weights into a standalone model
merged_model = ft_model.merge_and_unload()

save_path = "qwen2vl-2b-scienceqa-merged"
merged_model.save_pretrained(save_path)
processor.save_pretrained(save_path)

print("Saved merged model to:", save_path)


In [None]:
from google.colab import drive
drive.mount('/content/drive')


ModuleNotFoundError: No module named 'google.colab'

In [None]:
!pip install pydrive2


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pydrive2
  Downloading PyDrive2-1.21.3-py3-none-any.whl.metadata (7.0 kB)
Collecting google-api-python-client>=1.12.5 (from pydrive2)
  Downloading google_api_python_client-2.187.0-py3-none-any.whl.metadata (7.0 kB)
Collecting oauth2client>=4.0.0 (from pydrive2)
  Downloading oauth2client-4.1.3-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting pyOpenSSL<=24.2.1,>=19.1.0 (from pydrive2)
  Downloading pyOpenSSL-24.2.1-py3-none-any.whl.metadata (13 kB)
Collecting google-auth!=2.24.0,!=2.25.0,<3.0.0,>=1.32.0 (from google-api-python-client>=1.12.5->pydrive2)
  Downloading google_auth-2.43.0-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting google-auth-httplib2<1.0.0,>=0.2.0 (from google-api-python-client>=1.12.5->pydrive2)
  Downloading google_auth_httplib2-0.2.1-py3-none-any.whl.metadata (3.0 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0,>=1.31.5 (from google-api-python-client>=1.12.5->pydrive2)
  Downloading google_api_core-2.28.1-py3-none-any.whl.metadat

In [None]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive

gauth = GoogleAuth()
gauth.LocalWebserverAuth()  # opens a browser to log into Google
drive = GoogleDrive(gauth)


InvalidConfigError: Invalid client secrets file ('Error opening file', 'client_secrets.json', 'No such file or directory', 2)

In [None]:
MODEL_DIR = "qwen2vl-2b-scienceqa-merged"  # change if needed


In [None]:
import os

path = "/workspace/qwen2-vl-2b-scienceqa-lora-expl"
print("Exists:", os.path.exists(path))
print("Contents:", os.listdir(path))


Exists: True
Contents: ['training_args.bin', 'video_preprocessor_config.json', 'tokenizer.json', 'merges.txt', 'vocab.json', 'added_tokens.json', 'special_tokens_map.json', 'tokenizer_config.json', 'chat_template.jinja', 'preprocessor_config.json', 'adapter_config.json', 'adapter_model.safetensors', 'checkpoint-1167', 'checkpoint-1000', 'README.md']


In [None]:
import shutil

MODEL_DIR = "/workspace/qwen2-vl-2b-scienceqa-lora-expl"
ZIP_PATH = "/workspace/qwen2-vl-2b-scienceqa-lora-expl.zip"

shutil.make_archive(
    base_name=ZIP_PATH.replace(".zip", ""),
    format="zip",
    root_dir="/workspace",
    base_dir="qwen2-vl-2b-scienceqa-lora-expl"
)

print("ZIP saved to:", ZIP_PATH)


ZIP saved to: /workspace/qwen2-vl-2b-scienceqa-lora-expl.zip


In [None]:
merged = ft_model.merge_and_unload()
merged.save_pretrained("/workspace/qwen2-vl-2b-scienceqa-merged")
processor.save_pretrained("/workspace/qwen2-vl-2b-scienceqa-merged")


AttributeError: 'Qwen2VLForConditionalGeneration' object has no attribute 'merge_and_unload'

In [None]:
import shutil, os

MODEL_DIR_NAME = "qwen2-vl-2b-scienceqa-lora-expl"
ROOT_DIR = "/workspace"  # as shown in your screenshot

MODEL_DIR = os.path.join(ROOT_DIR, MODEL_DIR_NAME)
ZIP_BASE = os.path.join(ROOT_DIR, MODEL_DIR_NAME)  # without .zip extension

print("Zipping:", MODEL_DIR)

shutil.make_archive(
    base_name=ZIP_BASE,   # /workspace/qwen2-vl-2b-scienceqa-lora-expl
    format="zip",
    root_dir=ROOT_DIR,    # /workspace
    base_dir=MODEL_DIR_NAME,  # qwen2-vl-2b-scienceqa-lora-expl
)

print("ZIP created at:", ZIP_BASE + ".zip")


Zipping: /workspace/qwen2-vl-2b-scienceqa-lora-expl
ZIP created at: /workspace/qwen2-vl-2b-scienceqa-lora-expl.zip


In [2]:
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
from peft import PeftModel
import torch

base = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# path to unzipped folder
adapter_dir = "/content/drive/MyDrive/qwen2-vl-2b-scienceqa-lora-expl"

model = PeftModel.from_pretrained(base, adapter_dir)
processor = Qwen2VLProcessor.from_pretrained(adapter_dir)


`torch_dtype` is deprecated! Use `dtype` instead!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

The tokenizer you are loading from '/content/drive/MyDrive/qwen2-vl-2b-scienceqa-lora-expl' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e.  This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
