## 0. Load Dataset

In [1]:
import json
from datasets import Dataset, Features, Value, Image, ClassLabel

In [7]:
# Location of the labelled SAT examples (one JSON object per line).
DATA_PATH = "/ocean/projects/cis250208p/shared/datasets/SAT/SAT_labeled.jsonl"

# Column names that the entries of `answer_choices` are spread over, in order.
labels = ["A", "B", "C", "D"]

train_data = []

with open(DATA_PATH, "r") as f:
    for row_idx, line in enumerate(f):
        if not line.strip():
            continue  # tolerate blank / trailing lines instead of crashing in json.loads

        record = json.loads(line)
        record["index"] = row_idx  # position of the record in the source file

        # Remove the list-valued field and replace it with one column per label.
        # A default is supplied so a record without `answer_choices` yields
        # A..D = None rather than raising KeyError.
        choices = record.pop("answer_choices", None) or []
        for pos, label in enumerate(labels):
            record[label] = choices[pos] if pos < len(choices) else None

        train_data.append(record)

# Import under an alias: a later cell executes `from PIL import Image`, which rebinds
# the bare name `Image`; with the alias this cell can be re-run at any time.
from datasets import Image as HFImage

features = Features({
    "index": Value("int32"),
    "image": HFImage(),  # will load image paths or binary image data
    "question": Value("string"),
    "A": Value("string"),
    "B": Value("string"),
    "C": Value("string"),
    "D": Value("string"),
    "correct_answer": Value("string"),
    "category": Value("string"),
})

ds = Dataset.from_list(train_data, features=features)

In [8]:
# Print the dataset summary (feature names and row count) as a quick sanity check.
print(ds)

Dataset({
    features: ['image', 'question', 'correct_answer', 'category', 'index', 'A', 'B', 'C', 'D'],
    num_rows: 6494
})


In [3]:
# NOTE(review): the output saved under this cell shows a DatasetDict with a `test`
# split and different columns than the dataset built above — it looks like a
# leftover from an earlier run; re-execute to confirm.
print(ds)

DatasetDict({
    test: Dataset({
        features: ['index', 'question', 'A', 'B', 'C', 'D', 'answer', 'category', 'image_source', 'image_url'],
        num_rows: 5157
    })
})


## 1. APC Pipeline

In [4]:
# Rendering-backend configuration via environment variables.
# Earlier variant, kept for reference:
# import os
# os.environ["PYOPENGL_PLATFORM"] = "osmesa"  # or "egl" if CUDA drivers support EGL
# os.environ["DISPLAY"] = ":0"

import os
os.environ["PYOPENGL_PLATFORM"] = "egl"   # use EGL instead of OSMesa for NVIDIA drivers
# Remove DISPLAY if it is set. As the last expression of the cell, the removed
# value (or None) is what the notebook shows as the cell output.
# NOTE(review): a later cell assigns os.environ["DISPLAY"] = ':1' again — confirm
# which of the two settings is intended.
os.environ.pop("DISPLAY", None)  

':0'

In [5]:
!nvidia-smi

Tue Oct 21 22:56:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-SXM2-32GB           On  |   00000000:16:00.0 Off |                    0 |
| N/A   28C    P0             40W /  300W |       1MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [6]:
import gc

import torch

# Run a garbage-collection pass first, then ask PyTorch to empty its CUDA cache.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Reload edited project modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2

import os
import sys
# NOTE(review): an earlier cell removes DISPLAY from the environment after selecting
# the EGL platform; this line sets it again — confirm which behaviour is wanted.
os.environ["DISPLAY"] =':1'
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # expose only GPU 0 to this process
# Make the modules under apc/vision_modules importable by their top-level names.
sys.path.append("apc/vision_modules")
import yaml
import re
import requests
from box import Box
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from io import BytesIO
# NOTE(review): this rebinds the name `Image`, which the dataset-loading cell imported
# from `datasets`; re-running that cell after this one will pick up PIL's module instead.
from PIL import Image
# APC pipeline entry point and its visualization helpers
from apc.apc_pipeline import APC
from apc.utils import visualize_conversation, create_image_with_text

# Devices handed to the APC constructor; both are placed on the same GPU.
device_vlm = "cuda:0"
device_vision = "cuda:0"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Parse the pipeline configuration and wrap it in a Box.
# NOTE(review): yaml.FullLoader is kept unchanged; if this file could ever come from
# an untrusted source, yaml.safe_load would be the safer choice.
config_path = "apc/configs/qwenvl2_5_7b_instruct.yaml"
with open(config_path, "r") as config_file:
    raw_config = yaml.load(config_file, Loader=yaml.FullLoader)
config = Box(raw_config)

# Build the APC pipeline with the devices chosen in the setup cell.
apc = APC(
    config,
    device_vlm=device_vlm,
    device_vision=device_vision,
)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
def download_image(image_url, size=(512, 512), timeout=20):
    """Fetch an image over HTTP and return it as a resized RGB PIL image.

    Args:
        image_url: URL of the image to download.
        size: ``(width, height)`` passed to ``resize``; defaults to ``(512, 512)``.
        timeout: Timeout in seconds passed to ``requests.get``; defaults to 20.

    Returns:
        The downloaded image, converted to RGB and resized to ``size``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: for connection problems or timeouts.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail here with a clear HTTP error instead of letting PIL choke on an
    # HTML error page further down.
    response.raise_for_status()
    image = Image.open(BytesIO(response.content)).convert("RGB").resize(size)
    return image

In [None]:
OPTION_LETTERS = "ABCD"
N_EVAL = min(100, len(ds))  # number of examples to evaluate; never exceeds the dataset size


def extract_choice(response_text, valid_letters=OPTION_LETTERS):
    """Pull the chosen option letter out of a free-form model reply.

    Patterns are tried from most to least specific so that the English article
    "a" inside a sentence is not mistaken for option A:
      1. the reply is just a letter, e.g. "b", "(B)", "B."
      2. an explicit "answer is B" / "Answer: (B)" phrase
      3. the first standalone capital letter in the untouched text
      4. the first standalone letter after upper-casing (the previous behaviour)

    Returns the upper-case letter, or None when nothing matches.
    """
    if not response_text or not valid_letters:
        return None
    text = str(response_text).strip()
    upper_only = f"[{re.escape(valid_letters)}]"
    any_case = f"[{re.escape(valid_letters + valid_letters.lower())}]"

    bare = re.fullmatch(rf"\(?({any_case})\)?[.:]?", text)
    if bare:
        return bare.group(1).upper()

    phrased = re.search(rf"(?i:answer(?:\s+is)?)\s*:?\s*\(?({upper_only})\b", text)
    if phrased:
        return phrased.group(1)

    capital = re.search(rf"\b({upper_only})\b", text)
    if capital:
        return capital.group(1)

    fallback = re.search(rf"\b({upper_only})\b", text.upper())
    return fallback.group(1) if fallback else None


results = []

for i, example in enumerate(tqdm(ds.select(range(N_EVAL)), desc="Evaluating SAT")):
    # The dataset schema declares a single image per example, while this loop was
    # written for a sequence of images; accept both and always hand over a list.
    # NOTE(review): confirm whether run_apc expects a list or a single image.
    raw_image = example["image"]
    frames = raw_image if isinstance(raw_image, (list, tuple)) else [raw_image]
    image = [img.convert("RGB").resize((512, 512)) for img in frames]

    question = example["question"]
    correct = example["correct_answer"]
    category = example["category"]

    # Only list options that exist; examples with fewer than four choices carry
    # None in the unused columns and must not be shown to the model as "C. None".
    present = [
        (letter, example[letter])
        for letter in OPTION_LETTERS
        if example[letter] is not None
    ]
    available_letters = "".join(letter for letter, _ in present)
    option_lines = "\n".join(f"{letter}. {text}" for letter, text in present)
    letters_hint = "/".join(available_letters)

    # For a four-option example this is character-for-character the previous prompt.
    prompt = (
        f"From the camera's point of view, {question}\n"
        f"Options:\n{option_lines}\n"
        f"Please answer with only one letter ({letters_hint})."
    )

    # Directory for saving intermediate results
    save_dir = f"outputs/benchmark/{category}_{i}"
    os.makedirs(save_dir, exist_ok=True)

    # Run APC pipeline
    response_text, _ = apc.run_apc(
        image,
        prompt,
        trace_save_dir=save_dir,
        perspective_prompt_type="visual",
        visualize_trace=False,
        visualize_scene_abstraction=False,
        return_conv_history=False,
        logging=False,
    )

    # Restrict the search to letters that were actually offered for this example.
    pred_letter = extract_choice(response_text, available_letters)

    # NOTE(review): the comparison assumes `correct_answer` stores an option letter
    # rather than the answer text — verify against the JSONL.
    results.append({
        "index": example["index"],
        "category": category,
        "question": question,
        "prediction": pred_letter,
        "answer": correct,
        "is_correct": pred_letter == correct,
        "response_text": response_text,
    })


Evaluating 3DSRBench:   0%|                                    | 0/100 [00:00<?, ?it/s]


TypeError: run_apc() missing 1 required positional argument: 'prompt'

In [None]:

# Persist the raw per-example predictions before computing any aggregates.
# NOTE(review): the file name says 3DSRBench although the evaluation loop is
# labelled "Evaluating SAT" — confirm the intended output name.
df = pd.DataFrame(results)
df.to_csv("3DSRBench_raw_predictions.csv", index=False)

# =======================
# Compute metrics
# =======================
correctness = df["is_correct"]

# Mean correctness per category, as a plain {category: accuracy} dict.
cat_acc = df.groupby("category")["is_correct"].mean().to_dict()

metrics = {
    # Fraction of examples answered correctly.
    "overall_accuracy": correctness.mean(),
    # Sample standard deviation of the per-example correctness flags.
    "std_dev_accuracy": correctness.std(),
    # One "acc_<category>" entry per category.
    **{f"acc_{cat}": acc for cat, acc in cat_acc.items()},
}

# ---- Prediction noise ----
# Approximation: frequency of inconsistent answers across paraphrased / symmetric questions
# You can tag similar questions manually or heuristically by identical image_source + similar wording
def estimate_prediction_noise(df, group_col="image_source"):
    """Average within-group standard deviation of the correctness flag.

    Rows are grouped by ``group_col``; for every group with more than one row the
    sample standard deviation of ``is_correct`` is taken, and the mean of those
    values is returned.

    Args:
        df: Frame with an ``is_correct`` column and, ideally, ``group_col``.
        group_col: Column that identifies rows belonging together.

    Returns:
        The mean per-group standard deviation, ``0.0`` when no group has more
        than one row, or ``np.nan`` when a required column is absent (the
        metric is then undefined rather than an error).
    """
    if group_col not in df.columns or "is_correct" not in df.columns:
        return np.nan

    # Sample std (ddof=1) is NaN for single-row groups, so dropna() removes
    # exactly the groups that carry no variability information.
    per_group_std = (
        df["is_correct"].astype(float).groupby(df[group_col]).std().dropna()
    )
    return float(per_group_std.mean()) if len(per_group_std) else 0.0

# Store the per-image-source variability estimate alongside the other metrics.
metrics["prediction_noise"] = estimate_prediction_noise(df)

# ---- Spatial symmetry / opposite consistency ----
# Simple heuristic: if question mentions "left" and another mentions "right" for same image_source
def estimate_spatial_consistency(df, group_col="image_source"):
    """Share of left/right question pairs that received the same prediction.

    Every question mentioning "left" is paired with every question mentioning
    "right" that has the same ``group_col`` value. The result is
    ``1 - differing_pairs / total_pairs``.

    Args:
        df: Frame with ``question``, ``prediction`` and ``group_col`` columns.
        group_col: Column that identifies questions about the same image.

    Returns:
        A value in [0, 1], or ``np.nan`` when no pair exists or a required
        column is absent.
    """
    required = {group_col, "question", "prediction"}
    if not required.issubset(df.columns):
        return np.nan

    left_q = df[df["question"].str.contains("left", case=False, na=False)]
    right_q = df[df["question"].str.contains("right", case=False, na=False)]

    differing_pairs = 0
    total_pairs = 0
    for left_idx, left_row in left_q.iterrows():
        same_source = right_q[right_q[group_col] == left_row[group_col]]
        for right_idx, right_row in same_source.iterrows():
            # A question containing both words appears in both subsets; pairing
            # it with itself would always count as consistent. Assumes index
            # labels are unique per row.
            if right_idx == left_idx:
                continue
            total_pairs += 1
            if left_row["prediction"] != right_row["prediction"]:
                differing_pairs += 1

    if total_pairs == 0:
        return np.nan
    return 1 - (differing_pairs / total_pairs)

# Store the left/right pair agreement estimate alongside the other metrics.
metrics["spatial_consistency"] = estimate_spatial_consistency(df)

# =======================
# Save results
# =======================
# Single-row frame: every metric becomes one column of the CSV summary.
metrics_df = pd.DataFrame([metrics])
metrics_df.to_csv("3DSRBench_metrics_summary.csv", index=False)

# Transposed for display so the metrics are listed one per line.
print("\n✅ Evaluation Complete!")
print(metrics_df.T)
