# Prismatic-VLM VQA能力测试

本notebook用于测试VLA-Adapter中原始Prismatic-VLM (Qwen2.5-0.5B) 的视觉问答和推理能力。

**目标**：
1. 建立VLM reasoning能力的基线
2. 了解0.5B模型的能力上限
3. 为后续"VLM能力保留"实验提供对比基准


## 1. 环境准备


In [None]:
import os
import sys
import torch
from PIL import Image
import requests
from pathlib import Path
import matplotlib.pyplot as plt

def find_project_root(start: Path, name: str = "VLA-Adapter") -> Path:
    """Return the directory to treat as the project root for *start*.

    Args:
        start: Directory the search begins from (normally the working directory).
        name: Directory name that identifies the project root.

    Returns:
        The nearest ancestor of *start* (including *start* itself) whose name
        equals *name*. When no ancestor matches, ``start.parent`` is returned,
        which keeps the previous one-level-up assumption for renamed checkouts.
    """
    for candidate in (start, *start.parents):
        if candidate.name == name:
            return candidate
    return start.parent


# Make sure we run from the project root. Searching every ancestor (instead of
# only stepping up one level) keeps this correct when the notebook is started
# from a directory nested more than one level below the root.
# NOTE(review): in a checkout that is not named "VLA-Adapter" the fallback moves
# up one directory each time this cell is re-run - confirm that is acceptable.
_start_dir = Path(os.getcwd())
PROJECT_ROOT = find_project_root(_start_dir)
if PROJECT_ROOT != _start_dir:
    os.chdir(PROJECT_ROOT)
print(f"Project root: {PROJECT_ROOT}")

# Report GPU availability and, when a device is present, its name and memory.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")


## 2. 加载Prismatic-VLM模型

**注意**: 首次运行需要从HuggingFace下载模型。

模型地址: https://huggingface.co/Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b


In [None]:
# Model configuration: checkpoint location relative to the project root.
MODEL_PATH = "pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b"

# The checkpoint counts as downloaded only when the path is a directory that
# holds at least one entry. `or` short-circuits, so `iterdir()` is never called
# on a missing path (or on a stray regular file with the same name).
model_dir = PROJECT_ROOT / MODEL_PATH
if not model_dir.is_dir() or not any(model_dir.iterdir()):
    print("="*60)
    print("模型尚未下载!")
    print("请先运行以下命令下载模型:")
    print("="*60)
    print()
    print("git lfs install")
    print(f"git clone https://huggingface.co/Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b {MODEL_PATH}")
    print()
    print("或者使用huggingface_hub (推荐):")
    print()
    print("from huggingface_hub import snapshot_download")
    print(f'snapshot_download(repo_id="Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b", local_dir="{MODEL_PATH}")')
    print("="*60)
else:
    # Show at most the first ten directory entries as a quick sanity check.
    print(f"模型目录存在: {model_dir}")
    print(f"文件列表: {list(model_dir.iterdir())[:10]}...")


In [None]:
# To download the model, uncomment and run this cell
# from huggingface_hub import snapshot_download
# snapshot_download(
#     repo_id="Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b",
#     local_dir=str(PROJECT_ROOT / MODEL_PATH)
# )


In [None]:
# Load the complete VLM through prismatic's own `load` entry point.
from prismatic.models import load

print("Loading Prismatic-VLM...")
checkpoint_dir = str(PROJECT_ROOT / MODEL_PATH)
vlm = load(checkpoint_dir, hf_token="", load_for_training=False)
print("VLM loaded successfully!")

# Use the GPU when one is visible, otherwise stay on the CPU; then switch the
# model to evaluation mode.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
vlm = vlm.to(device)
vlm.eval()
print(f"Model moved to: {device}")

# Report the total parameter count in millions.
total_params = sum(weight.numel() for weight in vlm.parameters())
print(f"Total parameters: {total_params / 1e6:.1f}M")


## 3. 准备测试图像


In [None]:
def load_test_image(source="url"):
    """Load a test image and return it converted to RGB.

    Args:
        source: ``"url"`` downloads the hosted sample image; ``"libero"`` loads
            the LIBERO bbox visualization under ``PROJECT_ROOT`` and falls back
            to the URL image when that file is missing; any other value is
            treated as the path of a local image file.

    Returns:
        The loaded ``PIL.Image.Image`` in RGB mode.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    import io

    if source == "url":
        # Sample image hosted in the Hugging Face documentation-images dataset.
        img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.png"
        # Bound the wait and fail loudly on HTTP errors, so an error page is
        # never handed to PIL as if it were image data.
        response = requests.get(img_url, timeout=30)
        response.raise_for_status()
        # `content` is the fully read, content-decoded body; buffering it avoids
        # parsing the undecoded `.raw` stream and leaving the connection open.
        image = Image.open(io.BytesIO(response.content)).convert("RGB")
    elif source == "libero":
        # Use the LIBERO data shipped with the project.
        libero_img_path = PROJECT_ROOT / "data_processed" / "bbox_visualization" / "episode_00004_bbox.jpg"
        if libero_img_path.exists():
            image = Image.open(libero_img_path).convert("RGB")
        else:
            print(f"LIBERO image not found: {libero_img_path}")
            print("Using URL image instead...")
            return load_test_image("url")
    else:
        # Anything else is interpreted as a local file path.
        image = Image.open(source).convert("RGB")

    return image

# Load the test image (pass "libero" for the robot scene, or a local file path).
test_image = load_test_image("url")

# Display the image, with its pixel dimensions in the title.
img_width, img_height = test_image.size
plt.figure(figsize=(8, 8))
plt.imshow(test_image)
plt.title(f"Test Image ({img_width}x{img_height})")
plt.axis('off')
plt.show()


## 4. VQA测试函数


In [None]:
def ask_vlm(vlm, image: Image.Image, question: str, max_new_tokens: int = 128) -> str:
    """Ask the VLM a question about an image and return its answer.

    Args:
        vlm: Prismatic VLM model.
        image: PIL image to ask about.
        question: Plain question text; no ``<image>`` marker is needed.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The text returned by ``vlm.generate``, or ``"[Error: ...]"`` describing
        the exception when prompt construction or generation fails.
    """
    with torch.inference_mode():
        # Best effort on purpose: a failing question is reported as text so a
        # batch of questions can keep running.
        try:
            # Prismatic's documented usage wraps the user message in the model's
            # chat template via its prompt builder before calling `generate`;
            # passing the bare question skips that template.
            # NOTE(review): falls back to the raw question when the model does
            # not expose `get_prompt_builder` - confirm against this fork.
            prompt_text = question
            get_prompt_builder = getattr(vlm, "get_prompt_builder", None)
            if callable(get_prompt_builder):
                prompt_builder = get_prompt_builder()
                prompt_builder.add_turn(role="human", message=question)
                prompt_text = prompt_builder.get_prompt()

            # Sampling is disabled via do_sample=False.
            response = vlm.generate(
                image,
                prompt_text,
                do_sample=False,
                max_new_tokens=max_new_tokens,
                temperature=1.0
            )
        except Exception as e:
            response = f"[Error: {e}]"

    return response

print("VQA function defined.")


## 5. 运行VQA测试


In [None]:
# Test questions, grouped by the capability each one probes. The flattened
# list keeps the groups (and the questions inside them) in declaration order.
question_groups = {
    "basic recognition": [
        "What do you see in this image?",
        "Describe the main objects in this image.",
    ],
    "object identification": [
        "What is the main object in the center of the image?",
        "List all objects you can identify.",
    ],
    "colors / attributes": [
        "What colors can you see in this image?",
        "Describe the colors of the objects.",
    ],
    "spatial relations": [
        "Describe the spatial layout of objects in this image.",
        "What is on the left side of the image?",
    ],
    "counting": [
        "How many objects are there in this image?",
    ],
    "simple reasoning": [
        "What is this image about?",
        "What might happen next in this scene?",
    ],
}
test_questions = [
    prompt for prompts in question_groups.values() for prompt in prompts
]

print(f"Prepared {len(test_questions)} test questions.")


In [None]:
# Run every question through the model, echoing each Q/A pair and collecting
# the pairs in `results`.
banner = "=" * 70
print(banner)
print("Prismatic-VLM (Qwen2.5-0.5B) VQA Test Results")
print(banner)
print()

results = []
for idx, question in enumerate(test_questions, start=1):
    print(f"[Q{idx}] {question}")
    print("-" * 50)

    answer = ask_vlm(vlm, test_image, question)
    results.append(dict(question=question, answer=answer))

    print(f"[A{idx}] {answer}")
    print()

print(banner)
print("Test completed!")
