# Mind2Web Multimodal Baselines
## Baselines: Text DOM, Image, Multimodal, AXTree, CoT variants
## Models: Qwen2-VL-7B, InternVL2-8B

**Prerequisites:** Your teammate's dataloader should expose a `Mind2WebDataset` class.  
This notebook handles model loading, prompting, inference, and evaluation.

## 0. Imports & Config

In [1]:
import os
import json
import torch
import numpy as np
from pathlib import Path
from tqdm import tqdm
from PIL import Image
from dataclasses import dataclass
from typing import Optional

# HuggingFace
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, AutoModel, AutoTokenizer

# W&B (optional)
# import wandb

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

  import pynvml  # type: ignore[import]


PyTorch version: 2.9.1+cu130
CUDA available: True
GPU: NVIDIA A10G
VRAM: 23.7 GB


In [20]:
# ============================================================
# CONFIGURATION — edit these paths
# ============================================================

CONFIG = {
    # Paths
    "data_root": "/home/ubuntu/mind2web_data",       # your teammate's dataset path
    "results_dir": "/home/ubuntu/results",
    "hf_cache": "/home/ubuntu/hf_cache",

    # Model IDs
    "model_a": "Qwen/Qwen2-VL-7B-Instruct",
    "model_b": "OpenGVLab/InternVL2-8B",

    # Inference
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "dtype": torch.bfloat16,
    "max_new_tokens": 256,          # increase to 512+ for CoT baselines
    "batch_size": 1,                # keep at 1 for simplicity; increase if VRAM allows

    # Experiment
    "max_samples": 100,            # set to e.g. 100 for quick debugging; None = full dataset
    "seed": 42,
}

os.environ["HF_HOME"] = CONFIG["hf_cache"]
Path(CONFIG["results_dir"]).mkdir(parents=True, exist_ok=True)
print("Config loaded.")

Config loaded.


## 1. Dataset Interface
Replace the stub below with your teammate's actual dataloader import once ready.

In [3]:
from datasets import load_dataset
from PIL import Image
import io

class Mind2WebDataset:
    """
    Real loader for osunlp/Multimodal-Mind2Web.
    Available splits: 'train', 'test_task', 'test_website', 'test_domain'
    """
    def __init__(self, split="test_website", streaming=True):
        # We use streaming=True to pull data on-the-fly from AWS/HF
        self.ds = load_dataset("osunlp/Multimodal-Mind2Web", split=split, streaming=streaming)
        self.iter_ds = iter(self.ds)

    def __iter__(self):
        for row in self.ds:
            yield self._process_row(row)

    def _process_row(self, row):
        """Maps HuggingFace row schema to your project's expected interface."""
        # Note: Mind2Web provides 'cleaned_html'. 
        # For AXTree, you'll need a conversion step (see note below).
        
        return {
            "action_uid": row.get("action_uid"),
            "instruction": row.get("confirmed_task"),
            "dom_text": row.get("cleaned_html"),  
            "axtree_text": row.get("axtree", ""), # Check if your version has axtree; otherwise requires conversion
            "screenshot": row.get("image"),       # This is already a PIL Image object
            "candidate_elements": row.get("pos_candidates", []) + row.get("neg_candidates", []),
            "label_element_id": row.get("pos_candidates")[0]["backend_node_id"] if row.get("pos_candidates") else None,
            "label_action": row.get("operation", {}).get("op"),
            "label_value": row.get("operation", {}).get("value"),
        }

    def get_sample(self):
        """Helper to get just one sample for testing."""
        return self._process_row(next(self.iter_ds))

# --- Usage in your Notebook ---
# Since it's an iterable dataset (streaming), you loop through it or use next()
dataset = Mind2WebDataset(split="test_website", streaming=True)
sample = dataset.get_sample()

print(f"Task: {sample['instruction']}")
print(f"Action: {sample['label_action']} on ID {sample['label_element_id']}")
# display(sample['screenshot']) # Use this in Jupyter to see the image

#dataset = Mind2WebDataset(split="test")
#print(f"Dataset size: {len(dataset)} samples")
#print("Sample keys:", list(dataset[0].keys()))

README.md: 0.00B [00:00, ?B/s]



Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

TypeError: string indices must be integers, not 'str'

In [4]:
from datasets import load_dataset
from PIL import Image
import json
import io

# ── Step 1: Inspect the raw schema first ──────────────────────────────────────
# Run this cell alone to see exactly what fields + types HF gives you
_ds = load_dataset("osunlp/Multimodal-Mind2Web", split="test_website", streaming=True)
_raw = next(iter(_ds))

print("=== RAW ROW KEYS & TYPES ===")
for k, v in _raw.items():
    if isinstance(v, list):
        print(f"  {k}: list[{len(v)}]  first item type: {type(v[0]) if v else 'empty'}")
        if v and isinstance(v[0], str):
            print(f"    first item preview: {v[0][:120]}")
    else:
        print(f"  {k}: {type(v).__name__}  =  {str(v)[:120]}")

# ── Step 2: Fixed dataset class ───────────────────────────────────────────────

def _parse_candidate(c):
    """Candidates may be raw dicts or JSON strings depending on HF version."""
    if isinstance(c, str):
        try:
            return json.loads(c)
        except json.JSONDecodeError:
            return {"backend_node_id": c, "tag": "", "text": c}
    return c  # already a dict


def _parse_operation(op):
    """Operation may be a dict or a JSON string."""
    if isinstance(op, str):
        try:
            return json.loads(op)
        except json.JSONDecodeError:
            return {"op": op, "value": ""}
    if op is None:
        return {"op": None, "value": ""}
    return op


class Mind2WebDataset:
    """
    Loader for osunlp/Multimodal-Mind2Web.
    Splits: 'train', 'test_task', 'test_website', 'test_domain'
    """

    def __init__(self, split="test_website", streaming=True):
        self.ds = load_dataset(
            "osunlp/Multimodal-Mind2Web", split=split, streaming=streaming
        )
        self.iter_ds = iter(self.ds)

    def __iter__(self):
        for row in self.ds:
            yield self._process_row(row)

    def _process_row(self, row):
        # Parse candidates (may be JSON strings or dicts)
        pos_raw = row.get("pos_candidates") or []
        neg_raw = row.get("neg_candidates") or []
        pos_candidates = [_parse_candidate(c) for c in pos_raw]
        neg_candidates = [_parse_candidate(c) for c in neg_raw]

        # Normalise candidate schema for the rest of the notebook
        def normalise(c):
            return {
                "element_id": c.get("backend_node_id", ""),
                "tag":        c.get("tag_name", c.get("tag", "")),
                "text":       c.get("text", ""),
            }

        all_candidates = [normalise(c) for c in pos_candidates + neg_candidates]

        # Parse operation
        op = _parse_operation(row.get("operation"))

        # Label element id
        label_element_id = pos_candidates[0].get("backend_node_id") if pos_candidates else None

        # Screenshot — HF returns a PIL Image directly for image columns
        screenshot = row.get("image")
        if isinstance(screenshot, dict) and "bytes" in screenshot:
            screenshot = Image.open(io.BytesIO(screenshot["bytes"])).convert("RGB")

        return {
            "action_uid":        row.get("action_uid", ""),
            "instruction":       row.get("confirmed_task", ""),
            "dom_text":          row.get("cleaned_html", ""),
            "axtree_text":       row.get("axtree", ""),        # empty string if not present
            "screenshot":        screenshot,
            "candidate_elements": all_candidates,
            "label_element_id":  label_element_id,
            "label_action":      op.get("op"),
            "label_value":       op.get("value", ""),
        }

    def get_sample(self):
        """Get one sample for quick testing."""
        return self._process_row(next(self.iter_ds))


# ── Step 3: Test it ───────────────────────────────────────────────────────────
dataset = Mind2WebDataset(split="test_website", streaming=True)
sample = dataset.get_sample()

print("\n=== PROCESSED SAMPLE ===")
print(f"Task       : {sample['instruction']}")
print(f"Action     : {sample['label_action']}")
print(f"Label ID   : {sample['label_element_id']}")
print(f"# candidates: {len(sample['candidate_elements'])}")
print(f"Screenshot : {type(sample['screenshot'])} {getattr(sample['screenshot'], 'size', '')}")
print(f"DOM length : {len(sample['dom_text'])} chars")
print(f"AXTree     : {sample['axtree_text'][:80] if sample['axtree_text'] else '(empty — not in this dataset version)'}")
print(f"\nFirst 3 candidates:")
for c in sample['candidate_elements'][:3]:
    print(f"  {c}")

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

=== RAW ROW KEYS & TYPES ===
  action_uid: str  =  79c4a963-4aa9-49c1-9257-6b0d5069c551
  raw_html: str  =  <html backend_node_id="113">
  <body backend_node_id="188">
    <div backend_node_id="189">
      <div backend_node_id="
  cleaned_html: str  =  <html backend_node_id="113">
  <body backend_node_id="188">
    <div backend_node_id="189">
      <div backend_node_id="
  operation: str  =  {"original_op": "CLICK", "value": "", "op": "CLICK"}
  pos_candidates: list[2]  first item type: <class 'str'>
    first item preview: {"tag": "label", "attributes": "{\"backend_node_id\": \"110\", \"bounding_box_rect\": \"356,461,320,34\", \"class\": \"b
  neg_candidates: list[491]  first item type: <class 'str'>
    first item preview: {"tag": "div", "attributes": "{\"backend_node_id\": \"189\", \"bounding_box_rect\": \"0,0,1280,1080\", \"id\": \"__next\
  website: str  =  tiktok.music
  domain: str  =  Entertainment
  subdomain: str  =  Music
  annotation_id: str  =  013781df-4391-4533-bcb1-15f6

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]


=== PROCESSED SAMPLE ===
Task       : What are the romantic reggae musics from BCD Studio that can be used in tik tok series in andorra
Action     : CLICK
Label ID   : 110
# candidates: 493
Screenshot : <class 'NoneType'> 
DOM length : 85102 chars
AXTree     : (empty — not in this dataset version)

First 3 candidates:
  {'element_id': '110', 'tag': 'label', 'text': ''}
  {'element_id': '828', 'tag': 'input', 'text': ''}
  {'element_id': '189', 'tag': 'div', 'text': ''}


## 2. Prompt Templates
One template per baseline type. Each returns a string prompt (and optionally an image).

In [21]:
def format_candidates(candidate_elements):
    """Format candidate elements into a numbered list for the prompt."""
    lines = []
    for i, el in enumerate(candidate_elements):
        lines.append(f"[{i}] id={el['element_id']} tag={el['tag']} text=\"{el['text']}\"")
    return "\n".join(lines)


SYSTEM_PROMPT = (
    "You are a web agent. Given a task and webpage information, predict the next action.\n"
    "Respond ONLY in JSON format: {\"element_id\": \"<id>\", \"action\": \"<click|type|select>\", \"value\": \"<optional>\"}\n"
    "Do not include any explanation outside the JSON."
)

SYSTEM_PROMPT_COT = (
    "You are a web agent. Given a task and webpage information, predict the next action.\n"
    "First, reason step by step about which element to interact with and why.\n"
    "Then respond with a JSON on the final line: {\"element_id\": \"<id>\", \"action\": \"<click|type|select>\", \"value\": \"<optional>\"}"
)

MAX_DOM_CHARS = 8000   # ~2000 tokens, leaves room for model weights
MAX_AX_CHARS  = 6000

def make_prompt(sample, baseline_type, use_cot=False):
    """
    Returns: (text_prompt: str, image: PIL.Image or None)

    baseline_type options:
        'text_dom'       -> Baseline 1: DOM text only
        'image_only'     -> Baseline 2: Image only
        'multimodal_dom' -> Baselines 3/4: Image + DOM
        'axtree_only'    -> Baseline 5: AXTree text only
        'multimodal_ax'  -> Baselines 6/7: Image + AXTree
    """
    instruction = sample["instruction"]
    candidates  = format_candidates(sample["candidate_elements"])
    dom_text    = sample["dom_text"][:MAX_DOM_CHARS] 
    axtree_text = sample["axtree_text"][:MAX_AX_CHARS] 
    image       = sample["screenshot"]

    sys = SYSTEM_PROMPT_COT if use_cot else SYSTEM_PROMPT

    if baseline_type == "text_dom":
        prompt = (
            f"{sys}\n\n"
            f"Task: {instruction}\n\n"
            f"DOM:\n{dom_text}\n\n"
            f"Candidate elements:\n{candidates}\n\n"
            f"Next action:"
        )
        return prompt, None

    elif baseline_type == "image_only":
        prompt = (
            f"{sys}\n\n"
            f"Task: {instruction}\n\n"
            f"Candidate elements:\n{candidates}\n\n"
            "The screenshot of the current webpage is provided.\n"
            f"Next action:"
        )
        return prompt, image

    elif baseline_type == "multimodal_dom":
        prompt = (
            f"{sys}\n\n"
            f"Task: {instruction}\n\n"
            f"DOM:\n{dom_text}\n\n"
            f"Candidate elements:\n{candidates}\n\n"
            "The screenshot of the current webpage is provided.\n"
            f"Next action:"
        )
        return prompt, image

    elif baseline_type == "axtree_only":
        prompt = (
            f"{sys}\n\n"
            f"Task: {instruction}\n\n"
            f"Accessibility Tree:\n{axtree_text}\n\n"
            f"Candidate elements:\n{candidates}\n\n"
            f"Next action:"
        )
        return prompt, None

    elif baseline_type == "multimodal_ax":
        prompt = (
            f"{sys}\n\n"
            f"Task: {instruction}\n\n"
            f"Accessibility Tree:\n{axtree_text}\n\n"
            f"Candidate elements:\n{candidates}\n\n"
            "The screenshot of the current webpage is provided.\n"
            f"Next action:"
        )
        return prompt, image

    else:
        raise ValueError(f"Unknown baseline_type: {baseline_type}")


print("Prompt templates defined.")

Prompt templates defined.


## 3. Model Wrappers
Unified `predict()` interface for both Qwen2-VL and InternVL2.

In [22]:
class Qwen2VLWrapper:
    """Wrapper for Qwen2-VL-7B-Instruct"""

    def __init__(self, model_id=CONFIG["model_a"]):
        print(f"Loading Qwen2-VL from {model_id}...")
        from qwen_vl_utils import process_vision_info
        self.process_vision_info = process_vision_info

        self.processor = AutoProcessor.from_pretrained(model_id)
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=CONFIG["dtype"],
            device_map="auto",
        )
        self.model.eval()
        print("Qwen2-VL loaded.")

    @torch.inference_mode()
    def predict(self, text_prompt, image=None, max_new_tokens=CONFIG["max_new_tokens"]):
        messages = []
        content = []
        if image is not None:
            content.append({"type": "image", "image": image})
        content.append({"type": "text", "text": text_prompt})
        messages.append({"role": "user", "content": content})

        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = self.process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            return_tensors="pt",
        ).to(CONFIG["device"])

        generated = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
        out = self.processor.decode(
            generated[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
        )
        return out.strip()


# class InternVL2Wrapper:
#     """Wrapper for InternVL2-8B"""

#     def __init__(self, model_id=CONFIG["model_b"]):
#         print(f"Loading InternVL2 from {model_id}...")
#         self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
#         self.model = AutoModel.from_pretrained(
#             model_id,
#             torch_dtype=CONFIG["dtype"],
#             device_map="auto",
#             trust_remote_code=True,
#         )
#         self.model.eval()
#         print("InternVL2 loaded.")

#     def _load_image(self, image, max_num=6):
#         """Convert PIL image to InternVL2 pixel_values format."""
#         import torchvision.transforms as T
#         from torchvision.transforms.functional import InterpolationMode
#         IMAGENET_MEAN = (0.485, 0.456, 0.406)
#         IMAGENET_STD  = (0.229, 0.224, 0.225)
#         transform = T.Compose([
#             T.Resize((448, 448), interpolation=InterpolationMode.BICUBIC),
#             T.ToTensor(),
#             T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
#         ])
#         pixel_values = transform(image.convert("RGB")).unsqueeze(0)
#         return pixel_values.to(CONFIG["dtype"]).to(CONFIG["device"])

#     @torch.inference_mode()
#     def predict(self, text_prompt, image=None, max_new_tokens=CONFIG["max_new_tokens"]):
#         gen_config = {"max_new_tokens": max_new_tokens, "do_sample": False}

#         if image is not None:
#             pixel_values = self._load_image(image)
#             prompt = f"<image>\n{text_prompt}"
#             response = self.model.chat(
#                 self.tokenizer, pixel_values, prompt, gen_config
#             )
#         else:
#             pixel_values = None
#             response = self.model.chat(
#                 self.tokenizer, pixel_values, text_prompt, gen_config
#             )
#         return response.strip()


print("Model wrappers defined. Run the next cell to load a model.")

Model wrappers defined. Run the next cell to load a model.


## 4. Output Parsing & Evaluation

In [23]:
import re

def parse_prediction(raw_output):
    """
    Extract JSON from model output.
    Returns dict with keys: element_id, action, value
    Returns None on parse failure.
    """
    # Try to find JSON in the output (handles CoT where JSON is at the end)
    json_pattern = r'\{[^{}]*\}'
    matches = re.findall(json_pattern, raw_output, re.DOTALL)

    for match in reversed(matches):  # take last JSON block (CoT reasoning first, JSON last)
        try:
            parsed = json.loads(match)
            if "element_id" in parsed and "action" in parsed:
                parsed.setdefault("value", "")
                return parsed
        except json.JSONDecodeError:
            continue
    return None


def evaluate_prediction(pred, sample):
    """
    Returns dict of per-sample metrics.
    """
    if pred is None:
        return {"element_acc": 0, "action_acc": 0, "exact_match": 0, "parse_fail": 1}

    element_correct = (str(pred["element_id"]) == str(sample["label_element_id"]))
    action_correct  = (pred["action"].lower() == sample["label_action"].lower())
    exact_match     = element_correct and action_correct

    return {
        "element_acc": int(element_correct),
        "action_acc":  int(action_correct),
        "exact_match": int(exact_match),
        "parse_fail":  0,
    }


def aggregate_metrics(results):
    keys = ["element_acc", "action_acc", "exact_match", "parse_fail"]
    agg = {k: np.mean([r[k] for r in results]) for k in keys}
    agg["n"] = len(results)
    return agg


print("Evaluation functions defined.")

Evaluation functions defined.


## 5. Main Inference Loop

In [8]:
# def run_baseline(
#     model_wrapper,
#     dataset,
#     baseline_type,
#     baseline_name,
#     use_cot=False,
#     max_samples=CONFIG["max_samples"],
#     results_dir=CONFIG["results_dir"],
# ):
#     """
#     Run inference for one baseline and save results.

#     Args:
#         model_wrapper : Qwen2VLWrapper or InternVL2Wrapper instance
#         dataset       : Mind2WebDataset
#         baseline_type : str — one of 'text_dom', 'image_only', 'multimodal_dom',
#                                'axtree_only', 'multimodal_ax'
#         baseline_name : str — used for output file naming (e.g. 'B1_text_dom_qwen')
#         use_cot       : bool — whether to use chain-of-thought prompt
#         max_samples   : int or None — limit for debugging
#     """
#     max_new_tokens = 512 if use_cot else CONFIG["max_new_tokens"]
#     n = min(len(dataset), max_samples) if max_samples else len(dataset)

#     all_results = []
#     raw_outputs = []

#     for i in tqdm(range(n), desc=baseline_name):
#         sample = dataset[i]
#         prompt, image = make_prompt(sample, baseline_type, use_cot=use_cot)

#         try:
#             raw = model_wrapper.predict(prompt, image=image, max_new_tokens=max_new_tokens)
#         except Exception as e:
#             print(f"[{i}] Inference error: {e}")
#             raw = ""

#         pred    = parse_prediction(raw)
#         metrics = evaluate_prediction(pred, sample)

#         all_results.append(metrics)
#         raw_outputs.append({
#             "task_id":   sample["task_id"],
#             "raw":       raw,
#             "parsed":    pred,
#             "gt_element": sample["label_element_id"],
#             "gt_action":  sample["label_action"],
#             **metrics,
#         })

#     # Save raw outputs
#     out_path = Path(results_dir) / f"{baseline_name}_raw.jsonl"
#     with open(out_path, "w") as f:
#         for row in raw_outputs:
#             f.write(json.dumps(row) + "\n")

#     # Aggregate and save summary
#     summary = aggregate_metrics(all_results)
#     summary_path = Path(results_dir) / f"{baseline_name}_summary.json"
#     with open(summary_path, "w") as f:
#         json.dump({"baseline": baseline_name, **summary}, f, indent=2)

#     print(f"\n{'='*50}")
#     print(f"Baseline: {baseline_name}")
#     print(f"  Element Acc : {summary['element_acc']:.3f}")
#     print(f"  Action Acc  : {summary['action_acc']:.3f}")
#     print(f"  Exact Match : {summary['exact_match']:.3f}")
#     print(f"  Parse Fail  : {summary['parse_fail']:.3f}")
#     print(f"  N samples   : {summary['n']}")
#     print(f"{'='*50}\n")

#     return summary


# print("run_baseline() defined.")

run_baseline() defined.


In [24]:
def run_baseline(
    model_wrapper,
    dataset,
    baseline_type,
    baseline_name,
    use_cot=False,
    max_samples=CONFIG["max_samples"],
    results_dir=CONFIG["results_dir"],
):
    max_new_tokens = 512 if use_cot else CONFIG["max_new_tokens"]

    all_results = []
    raw_outputs = []

    # Streaming-compatible loop — use max_samples to cap, or run until exhausted
    dataset_iter = iter(dataset)
    pbar = tqdm(desc=baseline_name, total=max_samples)

    for i, sample in enumerate(dataset_iter):
        if max_samples and i >= max_samples:
            break

        prompt, image = make_prompt(sample, baseline_type, use_cot=use_cot)

        try:
            raw = model_wrapper.predict(prompt, image=image, max_new_tokens=max_new_tokens)
        except Exception as e:
            print(f"[{i}] Inference error: {e}")
            raw = ""
        finally:
            torch.cuda.empty_cache()

        pred    = parse_prediction(raw)
        metrics = evaluate_prediction(pred, sample)

        all_results.append(metrics)
        raw_outputs.append({
            "task_id":    sample.get("action_uid", str(i)),
            "raw":        raw,
            "parsed":     pred,
            "gt_element": sample["label_element_id"],
            "gt_action":  sample["label_action"],
            **metrics,
        })
        pbar.update(1)

    pbar.close()

    # Save raw outputs
    out_path = Path(results_dir) / f"{baseline_name}_raw.jsonl"
    with open(out_path, "w") as f:
        for row in raw_outputs:
            f.write(json.dumps(row) + "\n")

    # Aggregate and save summary
    summary = aggregate_metrics(all_results)
    summary_path = Path(results_dir) / f"{baseline_name}_summary.json"
    with open(summary_path, "w") as f:
        json.dump({"baseline": baseline_name, **summary}, f, indent=2)

    print(f"\n{'='*50}")
    print(f"Baseline: {baseline_name}")
    print(f"  Element Acc : {summary['element_acc']:.3f}")
    print(f"  Action Acc  : {summary['action_acc']:.3f}")
    print(f"  Exact Match : {summary['exact_match']:.3f}")
    print(f"  Parse Fail  : {summary['parse_fail']:.3f}")
    print(f"  N samples   : {summary['n']}")
    print(f"{'='*50}\n")

    return summary

## 6. Run All Baselines

The 10 baselines map as follows:

| # | Description | baseline_type | use_cot | model |
|---|---|---|---|---|
| 1 | Text DOM only | `text_dom` | False | either |
| 2 | Image only | `image_only` | False | either |
| 3 | Multimodal - Model A | `multimodal_dom` | False | Qwen2-VL |
| 4 | Multimodal - Model B | `multimodal_dom` | False | InternVL2 |
| 5 | AXTree only | `axtree_only` | False | either |
| 6 | Multimodal + AXTree - Model A | `multimodal_ax` | False | Qwen2-VL |
| 7 | Multimodal + AXTree - Model B | `multimodal_ax` | False | InternVL2 |
| 8 | 3 + CoT | `multimodal_dom` | True | Qwen2-VL |
| 9 | 5 + CoT - Model A | `multimodal_ax` | True | Qwen2-VL |
| 10 | 5 + CoT - Model B | `multimodal_ax` | True | InternVL2 |

In [9]:
# ============================================================
# STEP 1: Load Model A (Qwen2-VL)
# Run this cell, then run Qwen baselines below.
# Then unload and load Model B to save VRAM.
# ============================================================
model_a = Qwen2VLWrapper()
all_summaries = {}

Loading Qwen2-VL from Qwen/Qwen2-VL-7B-Instruct...


The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/730 [00:00<?, ?it/s]

Qwen2-VL loaded.


In [25]:
model_a

<__main__.Qwen2VLWrapper at 0x7f05b1bd2300>

In [26]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [27]:
# Baseline 1: Text DOM only (model-agnostic, run with model_a)
all_summaries["B1_text_dom"] = run_baseline(
    model_a, dataset,
    baseline_type="text_dom",
    baseline_name="B1_text_dom_qwen"
)

B1_text_dom_qwen:  24%|███████████████▎                                                | 24/100 [01:22<05:16,  4.16s/it]

[24] Inference error: CUDA out of memory. Tried to allocate 1.21 GiB. GPU 0 has a total capacity of 22.06 GiB of which 1.14 GiB is free. Including non-PyTorch memory, this process has 20.91 GiB memory in use. Of the allocated memory 18.95 GiB is allocated by PyTorch, and 1.65 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


B1_text_dom_qwen:  30%|███████████████████▏                                            | 30/100 [01:42<03:30,  3.01s/it]

[29] Inference error: CUDA out of memory. Tried to allocate 1.58 GiB. GPU 0 has a total capacity of 22.06 GiB of which 411.44 MiB is free. Including non-PyTorch memory, this process has 21.65 GiB memory in use. Of the allocated memory 19.94 GiB is allocated by PyTorch, and 1.41 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


B1_text_dom_qwen:  31%|███████████████████▊                                            | 31/100 [01:42<02:40,  2.32s/it]

[30] Inference error: CUDA out of memory. Tried to allocate 1.61 GiB. GPU 0 has a total capacity of 22.06 GiB of which 351.44 MiB is free. Including non-PyTorch memory, this process has 21.71 GiB memory in use. Of the allocated memory 20.02 GiB is allocated by PyTorch, and 1.39 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


B1_text_dom_qwen:  32%|████████████████████▍                                           | 32/100 [01:43<02:04,  1.83s/it]

[31] Inference error: CUDA out of memory. Tried to allocate 1.61 GiB. GPU 0 has a total capacity of 22.06 GiB of which 335.44 MiB is free. Including non-PyTorch memory, this process has 21.72 GiB memory in use. Of the allocated memory 20.04 GiB is allocated by PyTorch, and 1.38 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


B1_text_dom_qwen:  34%|█████████████████████▊                                          | 34/100 [01:53<03:46,  3.44s/it]

[34] Inference error: CUDA out of memory. Tried to allocate 1.06 GiB. GPU 0 has a total capacity of 22.06 GiB of which 369.44 MiB is free. Including non-PyTorch memory, this process has 21.69 GiB memory in use. Of the allocated memory 19.42 GiB is allocated by PyTorch, and 1.97 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


B1_text_dom_qwen:  37%|███████████████████████▋                                        | 37/100 [02:03<03:10,  3.03s/it]

[36] Inference error: CUDA out of memory. Tried to allocate 1.51 GiB. GPU 0 has a total capacity of 22.06 GiB of which 543.44 MiB is free. Including non-PyTorch memory, this process has 21.52 GiB memory in use. Of the allocated memory 19.77 GiB is allocated by PyTorch, and 1.44 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


B1_text_dom_qwen:  66%|██████████████████████████████████████████▏                     | 66/100 [03:57<02:07,  3.75s/it]

[66] Inference error: CUDA out of memory. Tried to allocate 1.11 GiB. GPU 0 has a total capacity of 22.06 GiB of which 219.44 MiB is free. Including non-PyTorch memory, this process has 21.84 GiB memory in use. Of the allocated memory 19.22 GiB is allocated by PyTorch, and 2.32 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


B1_text_dom_qwen:  71%|█████████████████████████████████████████████▍                  | 71/100 [04:14<01:24,  2.91s/it]

[70] Inference error: CUDA out of memory. Tried to allocate 1.55 GiB. GPU 0 has a total capacity of 22.06 GiB of which 467.44 MiB is free. Including non-PyTorch memory, this process has 21.59 GiB memory in use. Of the allocated memory 19.87 GiB is allocated by PyTorch, and 1.42 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


B1_text_dom_qwen:  72%|██████████████████████████████████████████████                  | 72/100 [04:14<01:02,  2.23s/it]

[71] Inference error: CUDA out of memory. Tried to allocate 1.52 GiB. GPU 0 has a total capacity of 22.06 GiB of which 535.44 MiB is free. Including non-PyTorch memory, this process has 21.53 GiB memory in use. Of the allocated memory 19.78 GiB is allocated by PyTorch, and 1.44 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


B1_text_dom_qwen:  77%|█████████████████████████████████████████████████▎              | 77/100 [04:36<01:16,  3.32s/it]

[77] Inference error: CUDA out of memory. Tried to allocate 1.40 GiB. GPU 0 has a total capacity of 22.06 GiB of which 767.44 MiB is free. Including non-PyTorch memory, this process has 21.30 GiB memory in use. Of the allocated memory 19.48 GiB is allocated by PyTorch, and 1.52 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


B1_text_dom_qwen:  78%|█████████████████████████████████████████████████▉              | 78/100 [04:37<00:54,  2.50s/it]

[78] Inference error: CUDA out of memory. Tried to allocate 1.46 GiB. GPU 0 has a total capacity of 22.06 GiB of which 655.44 MiB is free. Including non-PyTorch memory, this process has 21.41 GiB memory in use. Of the allocated memory 19.63 GiB is allocated by PyTorch, and 1.48 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


B1_text_dom_qwen:  84%|█████████████████████████████████████████████████████▊          | 84/100 [04:53<00:39,  2.46s/it]

[83] Inference error: CUDA out of memory. Tried to allocate 1.67 GiB. GPU 0 has a total capacity of 22.06 GiB of which 215.44 MiB is free. Including non-PyTorch memory, this process has 21.84 GiB memory in use. Of the allocated memory 20.20 GiB is allocated by PyTorch, and 1.34 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


B1_text_dom_qwen:  85%|██████████████████████████████████████████████████████▍         | 85/100 [04:53<00:29,  1.95s/it]

[84] Inference error: CUDA out of memory. Tried to allocate 1.73 GiB. GPU 0 has a total capacity of 22.06 GiB of which 99.44 MiB is free. Including non-PyTorch memory, this process has 21.95 GiB memory in use. Of the allocated memory 20.35 GiB is allocated by PyTorch, and 1.30 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


B1_text_dom_qwen:  86%|███████████████████████████████████████████████████████         | 86/100 [04:54<00:22,  1.58s/it]

[85] Inference error: CUDA out of memory. Tried to allocate 1.66 GiB. GPU 0 has a total capacity of 22.06 GiB of which 239.44 MiB is free. Including non-PyTorch memory, this process has 21.82 GiB memory in use. Of the allocated memory 20.17 GiB is allocated by PyTorch, and 1.35 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
[86] Inference error: CUDA out of memory. Tried to allocate 1.37 GiB. GPU 0 has a total capacity of 22.06 GiB of which 835.44 MiB is free. Including non-PyTorch memory, this process has 21.23 GiB memory in use. Of the allocated memory 19.39 GiB is allocated by PyTorch, and 1.54 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avo

B1_text_dom_qwen:  90%|█████████████████████████████████████████████████████████▌      | 90/100 [05:12<00:47,  4.70s/it]

[90] Inference error: CUDA out of memory. Tried to allocate 930.00 MiB. GPU 0 has a total capacity of 22.06 GiB of which 849.44 MiB is free. Including non-PyTorch memory, this process has 21.22 GiB memory in use. Of the allocated memory 19.17 GiB is allocated by PyTorch, and 1.75 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


B1_text_dom_qwen: 100%|███████████████████████████████████████████████████████████████| 100/100 [06:06<00:00,  3.67s/it]


Baseline: B1_text_dom_qwen
  Element Acc : 0.090
  Action Acc  : 0.520
  Exact Match : 0.090
  Parse Fail  : 0.160
  N samples   : 100






In [None]:
import json
from pathlib import Path

summary_path = Path(CONFIG["results_dir"]) / "B1_text_dom_qwen_summary.json"
with open(summary_path) as f:
    summary = json.load(f)

print("="*40)
print(f"Baseline : {summary['baseline']}")
print(f"N samples: {summary['n']}")
print(f"Element Acc : {summary['element_acc']:.3f}  ({int(summary['element_acc']*summary['n'])}/{int(summary['n'])})")
print(f"Action Acc  : {summary['action_acc']:.3f}  ({int(summary['action_acc']*summary['n'])}/{int(summary['n'])})")
print(f"Exact Match : {summary['exact_match']:.3f}  ({int(summary['exact_match']*summary['n'])}/{int(summary['n'])})")
print(f"Parse Fail  : {summary['parse_fail']:.3f}  ({int(summary['parse_fail']*summary['n'])}/{int(summary['n'])})")
print("="*40)

In [None]:
# Baseline 2: Image only
all_summaries["B2_image_only"] = run_baseline(
    model_a, dataset,
    baseline_type="image_only",
    baseline_name="B2_image_only_qwen"
)

In [None]:
# Baseline 3: Multimodal DOM - Model A (Qwen2-VL)
all_summaries["B3_multimodal_dom_qwen"] = run_baseline(
    model_a, dataset,
    baseline_type="multimodal_dom",
    baseline_name="B3_multimodal_dom_qwen"
)

In [None]:
# Baseline 5: AXTree only
all_summaries["B5_axtree_only"] = run_baseline(
    model_a, dataset,
    baseline_type="axtree_only",
    baseline_name="B5_axtree_only_qwen"
)

In [None]:
# Baseline 6: Multimodal AXTree - Model A (Qwen2-VL)
all_summaries["B6_multimodal_ax_qwen"] = run_baseline(
    model_a, dataset,
    baseline_type="multimodal_ax",
    baseline_name="B6_multimodal_ax_qwen"
)

In [None]:
# Baseline 8: Multimodal DOM + CoT (Model A)
all_summaries["B8_multimodal_dom_cot_qwen"] = run_baseline(
    model_a, dataset,
    baseline_type="multimodal_dom",
    baseline_name="B8_multimodal_dom_cot_qwen",
    use_cot=True
)

In [None]:
# Baseline 9: AXTree + CoT - Model A (Qwen2-VL)
all_summaries["B9_axtree_cot_qwen"] = run_baseline(
    model_a, dataset,
    baseline_type="multimodal_ax",
    baseline_name="B9_axtree_cot_qwen",
    use_cot=True
)

In [None]:
# Unload Model A to free VRAM before loading Model B
del model_a
torch.cuda.empty_cache()
print("Model A unloaded. VRAM freed.")
print(f"VRAM used: {torch.cuda.memory_allocated()/1e9:.2f} GB")

In [None]:
# ============================================================
# STEP 2: Load Model B (InternVL2)
# ============================================================
model_b = InternVL2Wrapper()

In [None]:
# Baseline 4: Multimodal DOM - Model B (InternVL2)
all_summaries["B4_multimodal_dom_internvl"] = run_baseline(
    model_b, dataset,
    baseline_type="multimodal_dom",
    baseline_name="B4_multimodal_dom_internvl"
)

In [None]:
# Baseline 7: Multimodal AXTree - Model B (InternVL2)
all_summaries["B7_multimodal_ax_internvl"] = run_baseline(
    model_b, dataset,
    baseline_type="multimodal_ax",
    baseline_name="B7_multimodal_ax_internvl"
)

In [None]:
# Baseline 10: AXTree + CoT - Model B (InternVL2)
all_summaries["B10_axtree_cot_internvl"] = run_baseline(
    model_b, dataset,
    baseline_type="multimodal_ax",
    baseline_name="B10_axtree_cot_internvl",
    use_cot=True
)

## 7. Results Summary Table

In [None]:
import pandas as pd

rows = []
baseline_labels = {
    "B1_text_dom":               "B1: Text DOM only",
    "B2_image_only":             "B2: Image only",
    "B3_multimodal_dom_qwen":    "B3: Multimodal DOM (Qwen2-VL)",
    "B4_multimodal_dom_internvl":"B4: Multimodal DOM (InternVL2)",
    "B5_axtree_only":            "B5: AXTree only",
    "B6_multimodal_ax_qwen":     "B6: Multimodal AXTree (Qwen2-VL)",
    "B7_multimodal_ax_internvl": "B7: Multimodal AXTree (InternVL2)",
    "B8_multimodal_dom_cot_qwen":"B8: Multimodal DOM + CoT (Qwen2-VL)",
    "B9_axtree_cot_qwen":        "B9: AXTree + CoT (Qwen2-VL)",
    "B10_axtree_cot_internvl":   "B10: AXTree + CoT (InternVL2)",
}

for key, label in baseline_labels.items():
    if key in all_summaries:
        s = all_summaries[key]
        rows.append({
            "Baseline": label,
            "Element Acc": f"{s['element_acc']:.3f}",
            "Action Acc":  f"{s['action_acc']:.3f}",
            "Exact Match": f"{s['exact_match']:.3f}",
            "Parse Fail":  f"{s['parse_fail']:.3f}",
            "N": int(s['n']),
        })

df = pd.DataFrame(rows)
print(df.to_string(index=False))

# Save to CSV
df.to_csv(Path(CONFIG["results_dir"]) / "all_baselines_summary.csv", index=False)
print("\nSaved to all_baselines_summary.csv")

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 5))
x = range(len(df))
ax.bar(x, df["Exact Match"].astype(float), color="steelblue", alpha=0.8)
ax.set_xticks(x)
ax.set_xticklabels([f"B{i+1}" for i in x], fontsize=9)
ax.set_ylabel("Exact Match Accuracy")
ax.set_title("Mind2Web Baselines — Exact Match Accuracy")
ax.set_ylim(0, 1)

for i, row in df.iterrows():
    ax.text(i, float(row["Exact Match"]) + 0.01, row["Exact Match"], ha="center", fontsize=8)

plt.tight_layout()
plt.savefig(Path(CONFIG["results_dir"]) / "baseline_results.png", dpi=150)
plt.show()
print("Plot saved.")

## 8. Debugging Utilities

In [None]:
# Inspect a single sample and model output
sample = dataset[0]
print("Instruction:", sample["instruction"])
print("Label element:", sample["label_element_id"])
print("Label action:", sample["label_action"])

prompt, image = make_prompt(sample, "multimodal_ax", use_cot=False)
print("\n--- Prompt (first 800 chars) ---")
print(prompt[:800])

if image:
    display(image.resize((400, 225)))

In [None]:
# Load and inspect a saved results file
results_file = Path(CONFIG["results_dir"]) / "B6_multimodal_ax_qwen_raw.jsonl"
if results_file.exists():
    with open(results_file) as f:
        lines = [json.loads(l) for l in f]
    
    failures = [l for l in lines if l["element_acc"] == 0]
    print(f"Total: {len(lines)}, Failures: {len(failures)}")
    print("\nExample failure:")
    print(json.dumps(failures[0], indent=2))