# World Model Policy Evaluation

This notebook demonstrates how to roll out robot policies (OpenVLA, HPT, and SpatialVLA) in the Bridge (WidowX) environment from within our world model, and how to grade the resulting rollouts using a VLM.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
from PIL import Image
import requests
import mediapy as media
from tqdm.notebook import tqdm
import torch
import einops
import base64
import math
import re
import cv2
from openai import OpenAI
import random
from pathlib import Path
import json

## Action scaling

In [None]:
def rescale_bridge_action(
    a,
    wv_lo=-0.05,
    wv_hi=+0.05,
    wv_post_scale_max=+1.75,
    wv_post_scale_min=-1.75,
    rd_lo=-0.25,
    rd_hi=+0.25,
    rd_post_scale_max=+1.4,
    rd_post_scale_min=-1.4,
    gripper_threshold=0.8):
    """
    Rescale Bridge (WidowX) action to the ranges expected by the world model.
    We need to call this function on the unnormalized action values returned by the policy.

    The input tensor is NOT modified; a rescaled copy is returned. This keeps the
    function idempotent with respect to its argument, so re-running a notebook cell
    cannot accidentally rescale the same action twice.

    Arguments:
        a: torch.Tensor whose last dimension has at least 7 entries. Entries 0:3 and 3:6
            are linearly rescaled, entry 6 is thresholded, and any remaining entries are
            copied through unchanged. Leading (batch) dimensions are allowed.
        wv_lo, wv_hi: input range of entries 0:3.
        wv_post_scale_min, wv_post_scale_max: output range of entries 0:3.
        rd_lo, rd_hi: input range of entries 3:6.
        rd_post_scale_min, rd_post_scale_max: output range of entries 3:6.
        gripper_threshold: entry 6 maps to -1.0 when strictly above this value, else +1.0.
    Returns:
        A new tensor with the same shape and dtype as `a`.
    """
    out = a.clone()
    # rescale end effector
    out[..., :3] = (a[..., :3] - wv_lo) / (wv_hi - wv_lo) * (
        wv_post_scale_max - wv_post_scale_min
    ) + wv_post_scale_min
    # rescale joint rotations
    out[..., 3:6] = (a[..., 3:6] - rd_lo) / (rd_hi - rd_lo) * (
        rd_post_scale_max - rd_post_scale_min
    ) + rd_post_scale_min
    # threshold the gripper; tensor-valued branches avoid relying on the
    # scalar/scalar overload of torch.where
    gripper = a[..., 6]
    out[..., 6] = torch.where(
        gripper > gripper_threshold,
        torch.full_like(gripper, -1.0),
        torch.full_like(gripper, +1.0),
    )
    return out


## VLM evaluation utils

Set your OpenAI key:
```
export OPENAI_API_KEY=<your key here>
```

In [None]:
def encode_video(video, stride=20):
    """
    Sample every `stride`-th frame of `video` and JPEG-encode it as a base64 string.

    Sampling stops at the first sampled frame whose values are all zero; that frame
    and everything after it are dropped.

    Arguments:
        video: An iterable of image arrays.
        stride: Keep frames whose index is a multiple of this value.
    Returns:
        A list of base64-encoded JPEG strings, one per kept frame.
    """
    encoded = []
    for position, image in enumerate(video):
        if position % stride != 0:
            continue
        if (image == 0).all():
            break
        # Swap the first and third channels before encoding.
        # NOTE(review): presumably frames arrive as RGB while cv2.imencode expects BGR — confirm.
        swapped = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        _ok, jpeg_bytes = cv2.imencode(".jpg", swapped)
        encoded.append(base64.b64encode(jpeg_bytes).decode())
    return encoded


def predict(video, task, n=5, model="gpt-4o"):
    """
    Ask a VLM to grade a rollout video against a task and return the average score.

    The video is subsampled with `encode_video`, sent together with a grading prompt,
    and `n` completions are requested. Each completion that contains a parsable
    "Total Score: <int>" line contributes `<int> / number_of_subtasks`; completions
    without a parsable score are ignored.

    Arguments:
        video: Sequence of frames, forwarded to `encode_video`.
        task: Dict with an "instruction" string and a "subtasks" list.
        n: Number of completions to request.
        model: Name of the chat model to query.
    Returns:
        The mean normalized score over the parsable completions, or NaN (with a
        printed warning) when no completion contained a parsable score.
    """
    frames = encode_video(video)
    prompt = f"""
Here is a sequence of frames from a robot policy which has been rolled out in a video-generation-based world model. I need your help determining whether the policy is successful. How successfully does the robot complete the following task?
Task: {task["instruction"]}
Subtasks: {task["subtasks"]}

For each subtask, give the model a score of 0 (fail) or 1 (success). Explain your reasoning. Finally, output "Total Score: <score>", where <score> is the sum of the scores on the subtasks above.

Note: Since this video was generated by a video prediction model (conditioned on robot actions), it may contain some artifacts due to the video model capacity.
Note: For the final score output, make sure to follow the "Total Score: <score>" format exactly so the score can be parsed out using a regex.
""".strip()
    client = OpenAI()
    # NOTE(review): this is a shorthand image-content format; confirm the API version in
    # use still accepts it rather than requiring explicit "image_url" content parts.
    messages = [
        {
            "role": "user",
            "content": [prompt, *[{"image": f, "resize": 256} for f in frames]],
        }
    ]
    print(prompt)
    res = client.chat.completions.create(model=model, messages=messages, n=n)

    # Normalize by the actual number of subtasks instead of a hard-coded 2.
    n_subtasks = len(task["subtasks"]) or 1
    normalized_scores = []
    for c in res.choices:
        content = c.message.content or ""  # content can be None, e.g. on a refusal
        print(content)
        # Strip markdown bold markers so "**Total Score:** 2" style output still parses.
        m = re.search(r"Total Score: (\d+)", content.replace("**", ""))
        if m:
            normalized_scores.append(int(m.group(1)) / n_subtasks)

    if not normalized_scores:
        # Previously this case raised ZeroDivisionError and aborted the whole evaluation.
        print("WARNING: no completion contained a parsable 'Total Score'; returning NaN.")
        return float("nan")

    avg_score = sum(normalized_scores) / len(normalized_scores)
    print(f"Average Score: {avg_score}")
    return avg_score

## Init world model

In [None]:
from world_model import WorldModel
import os

# Constructor keyword arguments required by each known world-model checkpoint.
CHECKPOINTS_TO_KWARGS = {
    "bridge_v2_ckpt.pt": {  # The demo model checkpoint from our original arxiv release.
        "use_pixel_rope": True,
    },
    "200k_20frame_cfg_bridgev2_ckpt.pt": {  # New in-progress model with CFG and EMA.
        "use_pixel_rope": False,
        "default_cfg": 3.0,
    },
}
FILESERVER_URL = "https://85daf289d906.ngrok.app"  # This might change.

ckpt_path = "200k_20frame_cfg_bridgev2_ckpt.pt"  # Take your pick from above.
if ckpt_path not in CHECKPOINTS_TO_KWARGS:
    raise KeyError(
        f"Unknown checkpoint {ckpt_path!r}; expected one of {sorted(CHECKPOINTS_TO_KWARGS)}"
    )

if not Path(ckpt_path).exists():
    ckpt_url = FILESERVER_URL + "/" + ckpt_path
    print(f"{ckpt_url=}")
    # Download to a temporary name and rename only on success. Otherwise an interrupted
    # download leaves a truncated file at `ckpt_path`, and the existence check above
    # would skip the download on the next run and load the corrupt checkpoint.
    partial_path = ckpt_path + ".part"
    exit_status = os.system(f"wget -O {partial_path} {ckpt_url}")
    if exit_status != 0:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise RuntimeError(
            f"Downloading {ckpt_url} failed (wget exit status {exit_status}). "
            "FILESERVER_URL may be out of date."
        )
    os.replace(partial_path, ckpt_path)

wm = WorldModel(ckpt_path, **CHECKPOINTS_TO_KWARGS[ckpt_path])

In [None]:
from state_based_model import StateDynamicsModel

# Constructor arguments for the state dynamics model, including the checkpoint
# path passed as `ckpt`; the instance is moved to the GPU right after construction.
state_model_config = dict(
    state_dim=7,
    action_dim=10,
    d_model=256,
    nhead=4,
    num_layers=12,
    frame_skip=1,
    ckpt="/scratch/as20482/bridge_v2_state_based_1frame.pt",
)
sm = StateDynamicsModel(**state_model_config).cuda()

## Load the tasks

In [None]:
import os

def load_tasks(root):
    """
    Yield the extension-less path of every ``.png`` file directly inside `root`.

    Entries are yielded in sorted filename order. `os.listdir` returns names in
    arbitrary order, so sorting keeps the task order (and therefore `tasks[0]` and
    the per-task score order) reproducible across runs and machines.

    Arguments:
        root: Directory to scan (not recursive).
    Yields:
        `os.path.join(root, <filename without extension>)` for each ``.png`` file.
    """
    for file in sorted(os.listdir(root)):
        if file.endswith(".png"):
            base = os.path.splitext(file)[0]
            yield os.path.join(root, base)

In [None]:
def _task_spec(instruction, *subtasks):
    """Bundle a language instruction and its ordered subtasks into one task entry."""
    return {"instruction": instruction, "subtasks": list(subtasks)}


# Maps each evaluation directory to its language instruction and the subtasks
# that are listed in the VLM grading prompt.
TASKS = {
    "/vast/as20482/data/bridge/robot_evaluation/put_carrot_on_plate/": _task_spec(
        "put carrot on plate",
        "Pick up the carrot.",
        "Place the carrot on the plate.",
    ),
    "/vast/as20482/data/bridge/robot_evaluation/put_eggplant_into_pot_or_pan/": _task_spec(
        "put eggplant into pot or pan",
        "Pick up the eggplant.",
        "Place the eggplant into the pot or pan.",
    ),
    "/vast/as20482/data/bridge/robot_evaluation/close_microwave/": _task_spec(
        "close microwave",
        "Push the microwave door.",
        "Close the microwave door completely.",
    ),
    "/vast/as20482/data/bridge/robot_evaluation/stack_blocks/": _task_spec(
        "stack blocks",
        "Pick up a block.",
        "Place a block on top of another block.",
    ),
    "/vast/as20482/data/bridge/robot_evaluation/close_oven/": _task_spec(
        "close oven",
        "Push the oven door.",
        "Close the oven door completely.",
    ),
    "/vast/as20482/data/bridge/robot_evaluation/open_microwave/": _task_spec(
        "open the microwave",
        "Grab microwave handle.",
        "Pull microwave open.",
    ),
    "/vast/as20482/data/bridge/robot_evaluation/sweep_into_pile/": _task_spec(
        "sweep into pile",
        "Grab the beam.",
        "Use the beam to sweep into pile.",
    ),
    "/vast/as20482/data/bridge/robot_evaluation/fold_cloth/": _task_spec(
        "fold cloth",
        "Pick up one end of the cloth.",
        "Fold the cloth over.",
    ),
    "/vast/as20482/data/bridge/robot_evaluation/open_oven/": _task_spec(
        "open oven",
        "Grab oven handle.",
        "Pull oven open.",
    ),
}

In [None]:
# Build one task entry per start image found under `base_path`; every entry shares
# the instruction and subtasks registered for that directory in TASKS.
base_path = "/vast/as20482/data/bridge/robot_evaluation/open_oven/"
tasks = [
    {
        "im_0_path": im_0_path,
        "instruction": TASKS[base_path]["instruction"],
        "subtasks": TASKS[base_path]["subtasks"],
    }
    for im_0_path in load_tasks(base_path)
]

## OpenVLA

In [None]:
def evaluate_openvla(wm, vla, tasks, rollout_length=40):
    """
    Roll out an OpenVLA model on a list of tasks inside the world model, and return the
    VLM score for each task.

    NOTE: this function reads the module-level `processor`. That name is rebound later
    in this notebook by the SpatialVLA cell, so make sure the OpenVLA processor is the
    one currently bound before calling.

    Arguments:
        wm: WorldModel
        vla: An OpenVLA model from `transformers`
        tasks: A list of N task dicts, each with "im_0_path" (path of the start image
            WITHOUT the ".png" extension), "instruction" and "subtasks".
        rollout_length: Number of policy steps to roll out per task.
    Returns:
        scores: A numpy array of N scores from the VLM corresponding to each input task.
    """
    scores = []
    for task_i in tqdm(tasks, desc="completing tasks"):
        # Force 3-channel RGB so RGBA / palette / grayscale PNGs still yield an
        # (256, 256, 3) array. NOTE(review): presumably the world model expects RGB.
        start_frame = np.array(
            Image.open(task_i["im_0_path"] + ".png").convert("RGB").resize((256, 256))
        )
        media.show_image(start_frame)
        wm.reset(torch.from_numpy(start_frame).cuda().float() / 255.0)

        frames = [start_frame]
        for _ in tqdm(range(rollout_length)):
            # The policy always sees the most recent generated frame.
            curr_frame = Image.fromarray(frames[-1])
            prompt = f"In: What action should the robot take to {task_i['instruction']}?\nOut:"
            inputs = processor(prompt, curr_frame).to(
                device="cuda", dtype=torch.bfloat16
            )
            actions = vla.predict_action(
                **inputs, unnorm_key="bridge_orig", do_sample=False
            )

            a = torch.tensor(actions)
            # Accept both a single action vector and a leading batch/chunk dimension.
            # Unconditional `[0]` indexing would reduce a 1-D action to a scalar and
            # break the concatenation below.
            if a.ndim > 1:
                a = a[0]
            a = a.cuda()
            # NOTE: OpenVLA outputs 7-dim actions, while the world model was trained with up to 10-dim actions.
            a = torch.cat([a, a.new_zeros(3)], dim=-1)  # pad with zeros
            a = rescale_bridge_action(a)

            for _, x in wm.generate_chunk(a):
                new_frame = x[0, 0].cpu().numpy()
                # Generated frames are scaled by 255 and clipped before the uint8 cast.
                new_frame = np.clip(new_frame * 255, 0, 255).astype(np.uint8)
                frames.append(new_frame)
        rollout_video = np.stack(frames)
        media.show_video(rollout_video, fps=20)
        avg_score = predict(rollout_video, task=task_i)
        scores.append(avg_score)
    return np.array(scores)

In [None]:
from transformers import AutoModelForVision2Seq, AutoProcessor

# Load the OpenVLA processor and model from the same pretrained identifier; the model
# is loaded in bfloat16 and then moved to the GPU.
OPENVLA_MODEL_ID = "openvla/openvla-7b"
processor = AutoProcessor.from_pretrained(OPENVLA_MODEL_ID, trust_remote_code=True)
vla = AutoModelForVision2Seq.from_pretrained(
    OPENVLA_MODEL_ID,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
vla = vla.cuda()

In [None]:
# Run the OpenVLA evaluation, then report one example task, the mean score, and the
# standard error of the mean (np.std divided by sqrt(N)).
scores = evaluate_openvla(wm=wm, vla=vla, tasks=tasks)
print(
    f"Example task: {tasks[0]}",
    f"Mean score: {np.mean(scores)=}",
    f"STE: {np.std(scores) / len(scores)**0.5=}",
    sep="\n",
)

## HPT

In [None]:
from hpt.models.policy import Policy

In [None]:
# Load the pretrained HPT policy for the "bridge" domain, then move it to the GPU.
hpt_policy_cpu = Policy.from_pretrained_full_model("hpt-large-lang", "bridge")
policy = hpt_policy_cpu.cuda()

In [None]:
def evaluate_hpt(wm, sm, vla, tasks, rollout_length=40):
    """
    Roll out an HPT policy on a list of tasks inside the world model, and return the
    VLM score for each task.

    Besides the image, the policy is conditioned on a robot state vector. The initial
    state is loaded from "<im_0_path>.state.npy" and is then advanced each step by the
    state dynamics model `sm`, because the world model itself only produces frames.

    Arguments:
        wm: WorldModel
        sm: State dynamics model, called as `sm(state, action)` to predict the next state.
        vla: The HPT policy. Must provide `reset()`, `get_action(inputs, domain=...)`
            and `normalizer`.
        tasks: A list of N task dicts, each with "im_0_path" (path WITHOUT extension),
            "instruction" and "subtasks".
        rollout_length: Number of policy steps to roll out per task.
    Returns:
        scores: A numpy array of N scores from the VLM corresponding to each input task.
    """
    scores = []
    for task_i in tqdm(tasks, desc="completing tasks"):
        # Force 3-channel RGB so RGBA / palette / grayscale PNGs still yield an
        # (256, 256, 3) array. NOTE(review): presumably the world model expects RGB.
        start_frame = np.array(
            Image.open(task_i["im_0_path"] + ".png").convert("RGB").resize((256, 256))
        )
        state = np.load(task_i["im_0_path"] + ".state.npy")
        media.show_image(start_frame)
        wm.reset(torch.from_numpy(start_frame).cuda().float() / 255.0)

        frames = [start_frame]
        vla.reset()
        for _ in tqdm(range(rollout_length)):
            curr_frame = frames[-1]
            prompt = f"In: What action should the robot take to {task_i['instruction']}?\nOut:"
            inputs = {
                'image': curr_frame,
                'state': state,
                'language': prompt
            }
            raw_actions = vla.get_action(inputs, domain="bridge")
            # Use the normalizer of the policy that was passed in, not the notebook-level
            # `policy` global, so the function works for whichever policy the caller supplies.
            actions = vla.normalizer["bridge"]["action"].normalize(raw_actions)
            a = torch.tensor(actions).cuda()
            # NOTE(review): this pads with 4 zeros whereas the OpenVLA path pads with 3;
            # confirm the HPT action dimensionality matches what `wm` and `sm` expect.
            a = torch.cat([a, a.new_zeros(4)], dim=-1)  # pad with zeros
            # The normalized action is treated as lying in [-1, 1] on every rescaled axis.
            a = rescale_bridge_action(a, wv_lo=-1, wv_hi=1, rd_lo=-1, rd_hi=1)
            # Advance the state with the learned dynamics model; a batch dim is added to
            # the state and two leading dims to the action, then the first element is taken.
            state = sm(torch.from_numpy(state).unsqueeze(0).cuda(), a.unsqueeze(0).unsqueeze(0)).detach().cpu().numpy()[0][0]

            for _, x in wm.generate_chunk(a):
                new_frame = x[0, 0].cpu().numpy()
                new_frame = np.clip(new_frame * 255, 0, 255).astype(np.uint8)
                frames.append(new_frame)
        rollout_video = np.stack(frames)
        media.show_video(rollout_video, fps=20)
        avg_score = predict(rollout_video, task=task_i)
        scores.append(avg_score)
    return np.array(scores)

In [None]:
# Run the HPT evaluation (the HPT policy is passed through the `vla` parameter), then
# report one example task, the mean score, and the standard error of the mean.
scores = evaluate_hpt(wm=wm, sm=sm, vla=policy, tasks=tasks)
print(
    f"Example task: {tasks[0]}",
    f"Mean score: {np.mean(scores)=}",
    f"STE: {np.std(scores) / len(scores)**0.5=}",
    sep="\n",
)

## SpatialVLA

In [None]:
from transformers import AutoModelForVision2Seq, AutoProcessor, AutoModel

# Load the SpatialVLA processor and model. NOTE: this rebinds the module-level
# `processor` name that the OpenVLA cells also use, so re-run the OpenVLA loading cell
# before evaluating OpenVLA again.
model_name_or_path = "IPEC-COMMUNITY/spatialvla-4b-224-pt"
processor = AutoProcessor.from_pretrained(model_name_or_path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model = model.eval().cuda()

In [None]:
def normalize_actions(unnorm_actions, statistics, key="bridge_orig/1.0.0"):
    """
    Map unnormalized actions to [-1, 1] using the dataset's q01/q99 action statistics.

    Arguments:
        unnorm_actions: Array of unnormalized action values.
        statistics: Mapping such that `statistics[key]["action"]` holds "q01", "q99"
            and optionally "mask".
        key: Which dataset's statistics to use.
    Returns:
        An array where dimensions with a truthy mask are rescaled linearly so that q01
        maps to -1 and q99 maps to +1; dimensions with a falsy mask are returned
        unchanged. When no mask is given, every dimension is rescaled.
    """
    action_stats = statistics[key]["action"]
    low = np.array(action_stats["q01"])
    high = np.array(action_stats["q99"])
    if "mask" in action_stats:
        rescale_dims = np.array(action_stats["mask"], dtype=bool)
    else:
        rescale_dims = np.array(np.ones_like(low), dtype=bool)

    rescaled = 2 * (unnorm_actions - low) / (high - low) - 1
    # Dimensions outside the mask are passed through as-is.
    return np.where(rescale_dims, rescaled, unnorm_actions)

In [None]:
def evaluate_spatialvla(wm, vla, tasks, rollout_length=40):
    """
    Roll out a SpatialVLA model on a list of tasks inside the world model, and return the
    VLM score for each task.

    NOTE: this function reads the module-level `processor`, which must be the SpatialVLA
    processor (the OpenVLA cell binds the same name earlier in the notebook).

    Arguments:
        wm: WorldModel
        vla: The SpatialVLA model; must provide `predict_action(inputs)`.
        tasks: A list of N task dicts, each with "im_0_path" (path WITHOUT extension),
            "instruction" and "subtasks".
        rollout_length: Number of policy steps to roll out per task.
    Returns:
        scores: A numpy array of N scores from the VLM corresponding to each input task.
    """
    # The same statistics key is used to decode the actions and to re-normalize them.
    unnorm_key = "bridge_orig/1.0.0"
    scores = []
    for task_i in tqdm(tasks, desc="completing tasks"):
        # Force 3-channel RGB so RGBA / palette / grayscale PNGs still yield an
        # (256, 256, 3) array. NOTE(review): presumably the world model expects RGB.
        start_frame = np.array(
            Image.open(task_i["im_0_path"] + ".png").convert("RGB").resize((256, 256))
        )
        media.show_image(start_frame)
        wm.reset(torch.from_numpy(start_frame).cuda().float() / 255.0)

        frames = [start_frame]
        for _ in tqdm(range(rollout_length)):
            curr_frame = Image.fromarray(frames[-1])
            prompt = f"In: What action should the robot take to {task_i['instruction']}?\nOut:"
            inputs = processor(images=[curr_frame], text=prompt, return_tensors="pt")
            # Use the model that was passed in rather than the notebook-level `model` global.
            generation_outputs = vla.predict_action(inputs)
            # Only the first decoded action is executed.
            unnorm_actions = processor.decode_actions(generation_outputs, unnorm_key=unnorm_key)['actions'][0]
            # Map back to [-1, 1] so the rescale below can use symmetric unit input ranges.
            actions = normalize_actions(unnorm_actions, processor.statistics, key=unnorm_key)
            a = torch.tensor(actions).cuda()
            a = torch.cat([a, a.new_zeros(3)], dim=-1)  # pad with zeros
            a = rescale_bridge_action(a, wv_lo=-1, wv_hi=1, rd_lo=-1, rd_hi=1)

            for _, x in wm.generate_chunk(a):
                new_frame = x[0, 0].cpu().numpy()
                new_frame = np.clip(new_frame * 255, 0, 255).astype(np.uint8)
                frames.append(new_frame)
        rollout_video = np.stack(frames)
        media.show_video(rollout_video, fps=20)
        avg_score = predict(rollout_video, task=task_i)
        scores.append(avg_score)
    return np.array(scores)

In [None]:
# Run the SpatialVLA evaluation, then report one example task, the mean score, and the
# standard error of the mean (np.std divided by sqrt(N)).
scores = evaluate_spatialvla(wm=wm, vla=model, tasks=tasks)
print(
    f"Example task: {tasks[0]}",
    f"Mean score: {np.mean(scores)=}",
    f"STE: {np.std(scores) / len(scores)**0.5=}",
    sep="\n",
)