In [None]:
import os
import glob
import math
import re
import random
import logging
import warnings
from typing import List, Tuple

import numpy as np
import pandas as pd
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from PIL import Image
import torchvision.transforms as transforms
from tqdm import tqdm

from peft import LoraConfig, get_peft_model, PeftModel
from transformers import (
    AutoProcessor,
    LlavaForConditionalGeneration,
    get_linear_schedule_with_warmup,
)
from torch.cuda.amp import GradScaler, autocast

warnings.filterwarnings("ignore")
logging.getLogger("transformers").setLevel(logging.ERROR)

# Silence tokenizer parallelism warning after fork/spawn
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

torch.backends.cuda.matmul.allow_tf32 = True


# Dataset
class VLM_QADataset(Dataset):
    def __init__(self, image_paths: List[str], metadata_df: pd.DataFrame, is_train: bool = True):
        self.image_paths: List[str] = []
        self.questions: List[str] = []
        self.answers: List[str] = []

        if is_train:
            self.transform = transforms.Compose([
                transforms.Resize((336, 336)),

            ])
        else:
            self.transform = transforms.Compose([transforms.Resize((336, 336))])

        mdx = metadata_df.set_index("Patient")
        for img_path in image_paths:
            pid_folder = os.path.basename(os.path.dirname(img_path))
            pid_key = "_".join(pid_folder.split("_")[0:3])
            if pid_key in mdx.index:
                row = mdx.loc[[pid_key]].iloc[0]
                grade = row.get("neoplasm_histologic_grade")
                if pd.notna(grade) and int(grade) in [1, 2]:
                    self.image_paths.append(img_path)
                    q = "What is the histologic grade of the brain tumor in the MRI: one or two?"
                    a = f"The grade of the tumor is {'two' if int(grade) == 2 else 'one'}."
                    self.questions.append(q)
                    self.answers.append(a)

    def __len__(self) -> int:
        return len(self.image_paths)

    def __getitem__(self, idx: int):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        image = self.transform(image)
        return image, self.questions[idx], self.answers[idx]

def vlm_collate_fn_for_training(batch):
    images, questions, answers = zip(*batch)
    return list(images), list(questions), list(answers)


def vlm_collate_fn_for_evaluation(batch):
    images, questions, answers = zip(*batch)
    return list(images), list(questions), list(answers)

# Batch build on MAIN process

def build_training_batch_cpu_main(images, questions, answers, processor: AutoProcessor):
    """Tokenize WITH images on CPU in the main process. Returns CPU tensors; move to GPU later."""
    prompts = [f"USER: <image>\n{q}\nASSISTANT:" for q in questions]
    full_texts = [
        f"USER: <image>\n{q}\nASSISTANT: {a}{processor.tokenizer.eos_token}"
        for q, a in zip(questions, answers)
    ]

    toks_prompt = processor(text=prompts, images=images, return_tensors="pt", padding=True)
    toks_full = processor(text=full_texts, images=images, return_tensors="pt", padding=True)

    labels = toks_full.input_ids.clone()
    prompt_lens = torch.sum(toks_prompt.attention_mask, dim=1)
    for i in range(labels.size(0)):
        labels[i, : prompt_lens[i]] = -100
    labels[labels == processor.tokenizer.pad_token_id] = -100

    batch_cpu = {
        "input_ids": toks_full.input_ids,
        "pixel_values": toks_full.pixel_values,
        "attention_mask": toks_full.attention_mask,
        "labels": labels,
    }
    return batch_cpu

# Eval helpers & metrics


def _assistant_span(text: str) -> str:
    if not isinstance(text, str):
        return ""
    parts = text.split("ASSISTANT:")
    span = parts[-1] if parts else text
    return span.strip().lower()


def _has_one_two_flags(answer_text: str) -> Tuple[bool, bool]:
    answer_text = answer_text.replace("\u2019", "'")
    tokens = set(re.findall(r"\b(one|two|1|2)\b", answer_text))
    has_one = ("one" in tokens) or ("1" in tokens)
    has_two = ("two" in tokens) or ("2" in tokens)
    return has_one, has_two


def compute_token_accuracy_shifted(logits: torch.Tensor, labels: torch.Tensor, eos_id: int = None) -> tuple:
    with torch.no_grad():
        logits = logits[:, :-1, :]
        labels = labels[:, 1:]
        if eos_id is not None:
            labels = labels.clone()
            labels[labels == eos_id] = -100
        preds = torch.argmax(logits, dim=-1)
        mask = labels != -100
        correct = (preds[mask] == labels[mask]).sum().item()
        total = mask.sum().item()
        return correct, total


def run_evaluation(model, processor, data_loader: DataLoader, device, description="Evaluating"):
    model.eval()
    vlm_correct = 0
    total_samples = 0
    total_loss_sum = 0.0
    total_loss_count = 0
    total_tok_correct = 0
    total_tok_count = 0

    debug_printed = False

    with torch.no_grad():
        for batch in tqdm(data_loader, desc=description):
            images, questions, answers = batch

            # Generation accuracy (ASSISTANT-only span)
            prompts = [f"USER: <image>\n{q}\nASSISTANT:" for q in questions]
            with autocast():
                gen_inputs = processor(text=prompts, images=images, return_tensors="pt", padding=True).to(device)
                generated_ids = model.generate(
                    **gen_inputs,
                    max_new_tokens=20,
                    pad_token_id=processor.tokenizer.pad_token_id,
                )
            decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)

            for i in range(len(decoded)):
                pred_span = _assistant_span(decoded[i])
                true_span = _assistant_span(f"ASSISTANT: {answers[i]}")
                want_two = (" two" in true_span) or true_span.strip().endswith("two.")
                has_one, has_two = _has_one_two_flags(pred_span)
                ok = (want_two and has_two and not has_one) or ((not want_two) and has_one and not has_two)
                if not debug_printed:
                    print(f"\n[DEBUG]\n  pred_raw=\n{decoded[i]}\n  pred_span=\n{pred_span}\n  true=\n{answers[i]}\n  want_two={want_two}  -> ok={ok}")
                if ok:
                    vlm_correct += 1

            # PPL + token-acc on teacher-forced labels (answer-only tokens contribute)
            full_texts = [
                f"USER: <image>\n{q}\nASSISTANT: {a}{processor.tokenizer.eos_token}"
                for q, a in zip(questions, answers)
            ]
            toks_prompt = processor(text=prompts, images=images, return_tensors="pt", padding=True)
            toks_full = processor(text=full_texts, images=images, return_tensors="pt", padding=True)
            labels = toks_full.input_ids.clone()
            prompt_lens = torch.sum(toks_prompt.attention_mask, dim=1)
            for i in range(labels.size(0)):
                labels[i, : prompt_lens[i]] = -100
            labels[labels == processor.tokenizer.pad_token_id] = -100

            with autocast():
                ce_inputs = {
                    "input_ids": toks_full.input_ids.to(device),
                    "pixel_values": toks_full.pixel_values.to(device, dtype=torch.float16),
                    "attention_mask": toks_full.attention_mask.to(device),
                    "labels": labels.to(device),
                }
                out = model(**ce_inputs, return_dict=True)
                loss = out.loss
                logits = out.logits

            total_loss_sum += loss.item()
            total_loss_count += 1
            c, n = compute_token_accuracy_shifted(logits.detach(), labels.to(logits.device), eos_id=processor.tokenizer.eos_token_id)
            total_tok_correct += c
            total_tok_count += n

            total_samples += len(answers)
            debug_printed = True

    vlm_acc = (vlm_correct / total_samples) * 100 if total_samples else 0.0
    avg_loss = (total_loss_sum / total_loss_count) if total_loss_count else float("inf")
    ppl = math.exp(avg_loss) if avg_loss < 50 else float("inf")
    tok_acc = (total_tok_correct / total_tok_count) * 100 if total_tok_count else 0.0

    print("\n--- Results for {} ---".format(description))
    print(f"  - VLM Grade Accuracy (QA):       {vlm_acc:.2f}%")
    print(f"  - Perplexity (teacher-forced):   {ppl:.4f}")
    print(f"  - Token Accuracy (answer-only):  {tok_acc:.2f}%")
    print("-" * 40)
    return vlm_acc, ppl, tok_acc

# LoRA targeting across LANGUAGE + PROJECTOR +  VISION


def discover_lora_targets(llava_model, include_vision: bool = True) -> List[str]:
    text_keys = {"q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"}
    projector_keys = {"multi_modal_projector"}
    vision_keys = {"q_proj", "k_proj", "v_proj", "out_proj"}  # CLIP-style

    target_suffixes: set[str] = set()

    for name, module in llava_model.named_modules():
        if any(k in name for k in text_keys):
            target_suffixes.add(name.split(".")[-1])
        if any(k in name for k in projector_keys):
            if hasattr(module, "weight") and getattr(module, "weight", None) is not None:
                target_suffixes.add(name.split(".")[-1])
        if include_vision and ("vision_tower" in name) and any(k in name for k in vision_keys):
            target_suffixes.add(name.split(".")[-1])

    if not target_suffixes:
        target_suffixes = text_keys

    return sorted(target_suffixes)


# Main

if __name__ == "__main__":

    config = {
        "device": "cuda:1" if torch.cuda.is_available() else "cpu",
        "base_path": "/home/ealam/Downloads/LGG dataset Cameron/lgg-mri-segmentation/kaggle_3m",
        "local_llava_path": "/home/ealam/Desktop/llava-1.5-7b-local",
        "save_path": "./llava-lora-allparts",
        "csv_path": "/home/ealam/Downloads/LGG dataset Cameron/lgg-mri-segmentation/kaggle_3m/data.csv",
        "learning_rate": 1e-4,
        "batch_size": 4,
        "num_epochs": 25,
        "early_stopping_patience": 5,
        "seed": 42,
        "include_vision_lora": True,
        "num_workers": 0,
    }

    # Seeds
    torch.manual_seed(config["seed"])
    np.random.seed(config["seed"])
    random.seed(config["seed"])
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config["seed"])

    # Gather
    print("Step 1: Gathering and splitting data...")
    all_image_paths = [p.replace("_mask.tif", ".tif") for p in glob.glob(os.path.join(config["base_path"], "*", "*_mask.tif"))]
    all_image_paths = [p for p in all_image_paths if os.path.exists(p)]
    print(f"Found {len(all_image_paths)} total images.")

    usable_paths, unused_paths = train_test_split(all_image_paths, test_size=0.40, random_state=config["seed"])
    print(f"Setting aside {len(unused_paths)} images. Using the remaining {len(usable_paths)} for this experiment.")

    train_val_paths, test_paths = train_test_split(usable_paths, test_size=0.20, random_state=config["seed"])
    train_paths, val_paths = train_test_split(train_val_paths, test_size=0.20, random_state=config["seed"])
    print(f"Splitting usable data into {len(train_paths)} training, {len(val_paths)} validation, and {len(test_paths)} test samples.")

    # Model / processor
    print("\nStep 2: Setting up model and processor...")
    DEVICE = config["device"]
    base_model = LlavaForConditionalGeneration.from_pretrained(
        config["local_llava_path"], torch_dtype=torch.float16, low_cpu_mem_usage=True
    )
    processor = AutoProcessor.from_pretrained(config["local_llava_path"])

    if processor.tokenizer.pad_token is None:
        processor.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        base_model.resize_token_embeddings(len(processor.tokenizer))

    target_modules = discover_lora_targets(base_model, include_vision=config["include_vision_lora"])
    print("LoRA target modules:", target_modules)

    lora_cfg = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=target_modules,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    peft_model = get_peft_model(base_model, lora_cfg).to(DEVICE)

    # Ensure projector path is trainable (helps reduce randomness at image-text interface)
    for name, p in peft_model.named_parameters():
        if "multi_modal_projector" in name:
            p.requires_grad = True
            if p.dtype != torch.float32:
                p.data = p.data.to(torch.float32)

    peft_model.print_trainable_parameters()

    # Dataloaders
    print("\nStep 3: Preparing DataLoaders...")
    metadata_df = pd.read_csv(config["csv_path"])
    train_ds = VLM_QADataset(train_paths, metadata_df, is_train=True)
    val_ds = VLM_QADataset(val_paths, metadata_df, is_train=False)
    test_ds = VLM_QADataset(test_paths, metadata_df, is_train=False)

    train_loader = DataLoader(
        train_ds,
        batch_size=config["batch_size"],
        shuffle=True,
        num_workers=config["num_workers"],
        pin_memory=True,
        collate_fn=vlm_collate_fn_for_training,
    )
    val_loader = DataLoader(
        val_ds,
        batch_size=config["batch_size"],
        shuffle=False,
        num_workers=config["num_workers"],
        pin_memory=True,
        collate_fn=vlm_collate_fn_for_evaluation,
    )
    test_loader = DataLoader(
        test_ds,
        batch_size=config["batch_size"],
        shuffle=False,
        num_workers=config["num_workers"],
        pin_memory=True,
        collate_fn=vlm_collate_fn_for_evaluation,
    )

    # Train
    print("\nStep 4: Starting fine-tuning with LoRA on language + projector + vision...")
    optimizer = AdamW((p for p in peft_model.parameters() if p.requires_grad), lr=config["learning_rate"])
    scaler = GradScaler()

    num_training_steps = len(train_loader) * config["num_epochs"]
    num_warmup_steps = int(num_training_steps * 0.1)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

    best_val_acc = 0.0
    patience = 0

    def _to_device(batch_cpu):
        out = {}
        for k, v in batch_cpu.items():
            if k == "pixel_values":
                out[k] = v.to(DEVICE, dtype=torch.float16, non_blocking=True)
            elif torch.is_tensor(v):
                out[k] = v.to(DEVICE, non_blocking=True)
            else:
                out[k] = v
        return out

    for epoch in range(config["num_epochs"]):
        peft_model.train()
        total_loss = 0.0

        for images, questions, answers in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            batch_cpu = build_training_batch_cpu_main(images, questions, answers, processor)
            batch = _to_device(batch_cpu)  # move to GPU in main process
            optimizer.zero_grad(set_to_none=True)
            with autocast():
                out = peft_model(**batch, return_dict=True)
                loss = out.loss
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            total_loss += loss.item()

        avg_loss = total_loss / max(1, len(train_loader))
        print(f"\nEpoch {epoch+1} Avg Loss -> {avg_loss:.4f}")

        # Validation (runs generation + PPL + token-acc)
        val_acc, val_ppl, val_tok = run_evaluation(peft_model, processor, val_loader, DEVICE, description="Validation Set Eval")

        if val_acc > best_val_acc:
            print(f"  -> New best validation accuracy ({val_acc:.2f}%). Saving adapters...")
            best_val_acc = val_acc
            patience = 0
            peft_model.save_pretrained(config["save_path"])
            processor.save_pretrained(config["save_path"])
        else:
            patience += 1
            print(f"  -> No improvement for {patience} epoch(s).")
            if patience >= config["early_stopping_patience"]:
                print("\n--- Early stopping triggered. ---")
                break
        print("=" * 80)

    # Final eval
    print("\nStep 5: Loading best adapters for final evaluation...")
    save_path = config["save_path"]
    if os.path.exists(save_path):
        base = LlavaForConditionalGeneration.from_pretrained(config["local_llava_path"], torch_dtype=torch.float16, low_cpu_mem_usage=True)
        final_peft = PeftModel.from_pretrained(base, save_path).to(DEVICE)
        run_evaluation(final_peft, processor, test_loader, DEVICE, description="Final Test Evaluation")
    else:
        print("No adapters were saved.")


Step 1: Gathering and splitting data...
Found 3929 total images.
Setting aside 1572 images. Using the remaining 2357 for this experiment.
Splitting usable data into 1508 training, 377 validation, and 472 test samples.

Step 2: Setting up model and processor...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

LoRA target modules: ['down_proj', 'gate_proj', 'k_proj', 'linear_1', 'linear_2', 'o_proj', 'out_proj', 'q_proj', 'up_proj', 'v_proj']
trainable params: 107,651,072 || all params: 7,150,098,432 || trainable%: 1.5056

Step 3: Preparing DataLoaders...

Step 4: Starting fine-tuning with LoRA on language + projector + vision...


Training Epoch 1: 100%|███████████████████████| 368/368 [06:13<00:00,  1.02s/it]



Epoch 1 Avg Loss -> 0.2522


Validation Set Eval:   0%|                               | 0/93 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=False

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=False

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is two.
  want_tw

Validation Set Eval: 100%|██████████████████████| 93/93 [01:50<00:00,  1.19s/it]



--- Results for Validation Set Eval ---
  - VLM Grade Accuracy (QA):       52.56%
  - Perplexity (teacher-forced):   1.0729
  - Token Accuracy (answer-only):  94.73%
----------------------------------------
  -> New best validation accuracy (52.56%). Saving adapters...


Training Epoch 2: 100%|███████████████████████| 368/368 [06:11<00:00,  1.01s/it]



Epoch 2 Avg Loss -> 0.0702


Validation Set Eval:   0%|                               | 0/93 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=False

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two

Validation Set Eval: 100%|██████████████████████| 93/93 [01:49<00:00,  1.18s/it]



--- Results for Validation Set Eval ---
  - VLM Grade Accuracy (QA):       47.98%
  - Perplexity (teacher-forced):   1.0724
  - Token Accuracy (answer-only):  94.25%
----------------------------------------
  -> No improvement for 1 epoch(s).


Training Epoch 3: 100%|███████████████████████| 368/368 [06:09<00:00,  1.00s/it]



Epoch 3 Avg Loss -> 0.0692


Validation Set Eval:   0%|                               | 0/93 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is two.
  want_two=

Validation Set Eval: 100%|██████████████████████| 93/93 [01:49<00:00,  1.18s/it]



--- Results for Validation Set Eval ---
  - VLM Grade Accuracy (QA):       67.39%
  - Perplexity (teacher-forced):   1.0666
  - Token Accuracy (answer-only):  96.32%
----------------------------------------
  -> New best validation accuracy (67.39%). Saving adapters...


Training Epoch 4: 100%|███████████████████████| 368/368 [06:09<00:00,  1.00s/it]



Epoch 4 Avg Loss -> 0.0636


Validation Set Eval:   0%|                               | 0/93 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=

Validation Set Eval: 100%|██████████████████████| 93/93 [01:48<00:00,  1.17s/it]



--- Results for Validation Set Eval ---
  - VLM Grade Accuracy (QA):       64.96%
  - Perplexity (teacher-forced):   1.0626
  - Token Accuracy (answer-only):  96.14%
----------------------------------------
  -> No improvement for 1 epoch(s).


Training Epoch 5: 100%|███████████████████████| 368/368 [06:06<00:00,  1.00it/s]



Epoch 5 Avg Loss -> 0.0494


Validation Set Eval:   0%|                               | 0/93 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=False

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=False

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is two.
  want_tw

Validation Set Eval: 100%|██████████████████████| 93/93 [01:49<00:00,  1.17s/it]



--- Results for Validation Set Eval ---
  - VLM Grade Accuracy (QA):       65.77%
  - Perplexity (teacher-forced):   1.0643
  - Token Accuracy (answer-only):  96.20%
----------------------------------------
  -> No improvement for 2 epoch(s).


Training Epoch 6: 100%|███████████████████████| 368/368 [06:07<00:00,  1.00it/s]



Epoch 6 Avg Loss -> 0.0405


Validation Set Eval:   0%|                               | 0/93 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=

Validation Set Eval: 100%|██████████████████████| 93/93 [01:49<00:00,  1.17s/it]



--- Results for Validation Set Eval ---
  - VLM Grade Accuracy (QA):       83.02%
  - Perplexity (teacher-forced):   1.0401
  - Token Accuracy (answer-only):  98.14%
----------------------------------------
  -> New best validation accuracy (83.02%). Saving adapters...


Training Epoch 7: 100%|███████████████████████| 368/368 [06:04<00:00,  1.01it/s]



Epoch 7 Avg Loss -> 0.0219


Validation Set Eval:   0%|                               | 0/93 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=

Validation Set Eval: 100%|██████████████████████| 93/93 [01:49<00:00,  1.18s/it]



--- Results for Validation Set Eval ---
  - VLM Grade Accuracy (QA):       80.86%
  - Perplexity (teacher-forced):   1.0457
  - Token Accuracy (answer-only):  97.87%
----------------------------------------
  -> No improvement for 1 epoch(s).


Training Epoch 8: 100%|███████████████████████| 368/368 [06:00<00:00,  1.02it/s]



Epoch 8 Avg Loss -> 0.0150


Validation Set Eval:   0%|                               | 0/93 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=

Validation Set Eval: 100%|██████████████████████| 93/93 [01:50<00:00,  1.18s/it]



--- Results for Validation Set Eval ---
  - VLM Grade Accuracy (QA):       84.37%
  - Perplexity (teacher-forced):   1.0448
  - Token Accuracy (answer-only):  98.26%
----------------------------------------
  -> New best validation accuracy (84.37%). Saving adapters...


Training Epoch 9: 100%|███████████████████████| 368/368 [05:55<00:00,  1.03it/s]



Epoch 9 Avg Loss -> 0.0070


Validation Set Eval:   0%|                               | 0/93 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=

Validation Set Eval: 100%|██████████████████████| 93/93 [01:49<00:00,  1.18s/it]



--- Results for Validation Set Eval ---
  - VLM Grade Accuracy (QA):       87.60%
  - Perplexity (teacher-forced):   1.0379
  - Token Accuracy (answer-only):  98.62%
----------------------------------------
  -> New best validation accuracy (87.60%). Saving adapters...


Training Epoch 10: 100%|██████████████████████| 368/368 [05:53<00:00,  1.04it/s]



Epoch 10 Avg Loss -> 0.0068


Validation Set Eval:   0%|                               | 0/93 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=

Validation Set Eval: 100%|██████████████████████| 93/93 [01:49<00:00,  1.18s/it]



--- Results for Validation Set Eval ---
  - VLM Grade Accuracy (QA):       89.22%
  - Perplexity (teacher-forced):   1.0341
  - Token Accuracy (answer-only):  98.80%
----------------------------------------
  -> New best validation accuracy (89.22%). Saving adapters...


Training Epoch 11: 100%|██████████████████████| 368/368 [05:52<00:00,  1.04it/s]



Epoch 11 Avg Loss -> 0.0040


Validation Set Eval:   0%|                               | 0/93 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=

Validation Set Eval: 100%|██████████████████████| 93/93 [01:49<00:00,  1.18s/it]



--- Results for Validation Set Eval ---
  - VLM Grade Accuracy (QA):       87.33%
  - Perplexity (teacher-forced):   1.0436
  - Token Accuracy (answer-only):  98.59%
----------------------------------------
  -> No improvement for 1 epoch(s).


Training Epoch 12: 100%|██████████████████████| 368/368 [05:49<00:00,  1.05it/s]



Epoch 12 Avg Loss -> 0.0014


Validation Set Eval:   0%|                               | 0/93 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=

Validation Set Eval: 100%|██████████████████████| 93/93 [01:49<00:00,  1.18s/it]



--- Results for Validation Set Eval ---
  - VLM Grade Accuracy (QA):       86.79%
  - Perplexity (teacher-forced):   1.0629
  - Token Accuracy (answer-only):  98.53%
----------------------------------------
  -> No improvement for 2 epoch(s).


Training Epoch 13: 100%|██████████████████████| 368/368 [05:45<00:00,  1.06it/s]



Epoch 13 Avg Loss -> 0.0009


Validation Set Eval:   0%|                               | 0/93 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=

Validation Set Eval: 100%|██████████████████████| 93/93 [01:49<00:00,  1.18s/it]



--- Results for Validation Set Eval ---
  - VLM Grade Accuracy (QA):       87.33%
  - Perplexity (teacher-forced):   1.0667
  - Token Accuracy (answer-only):  98.59%
----------------------------------------
  -> No improvement for 3 epoch(s).


Training Epoch 14: 100%|██████████████████████| 368/368 [05:52<00:00,  1.04it/s]



Epoch 14 Avg Loss -> 0.0073


Validation Set Eval:   0%|                               | 0/93 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=

Validation Set Eval: 100%|██████████████████████| 93/93 [01:49<00:00,  1.18s/it]



--- Results for Validation Set Eval ---
  - VLM Grade Accuracy (QA):       85.71%
  - Perplexity (teacher-forced):   1.0637
  - Token Accuracy (answer-only):  98.41%
----------------------------------------
  -> No improvement for 4 epoch(s).


Training Epoch 15: 100%|██████████████████████| 368/368 [05:43<00:00,  1.07it/s]



Epoch 15 Avg Loss -> 0.0002


Validation Set Eval:   0%|                               | 0/93 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=

Validation Set Eval: 100%|██████████████████████| 93/93 [01:49<00:00,  1.18s/it]


--- Results for Validation Set Eval ---
  - VLM Grade Accuracy (QA):       85.98%
  - Perplexity (teacher-forced):   1.0857
  - Token Accuracy (answer-only):  98.44%
----------------------------------------
  -> No improvement for 5 epoch(s).

--- Early stopping triggered. ---

Step 5: Loading best adapters for final evaluation...





Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Final Test Evaluation:   0%|                            | 0/116 [00:00<?, ?it/s]


[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is two.
  pred_span=
the grade of the tumor is two.
  true=
The grade of the tumor is two.
  want_two=True  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two=False  -> ok=True

[DEBUG]
  pred_raw=
USER:  
What is the histologic grade of the brain tumor in the MRI: one or two?
ASSISTANT: The grade of the tumor is one.
  pred_span=
the grade of the tumor is one.
  true=
The grade of the tumor is one.
  want_two

Final Test Evaluation: 100%|██████████████████| 116/116 [02:17<00:00,  1.19s/it]


--- Results for Final Test Evaluation ---
  - VLM Grade Accuracy (QA):       84.45%
  - Perplexity (teacher-forced):   1.0385
  - Token Accuracy (answer-only):  98.27%
----------------------------------------



