## 0. Load Dataset

In [1]:
from datasets import load_dataset

In [2]:
cache_dir = "/ocean/projects/cis250208p/shared/datasets"

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("ccvl/3DSRBench", cache_dir=cache_dir)

In [3]:
print(ds)

DatasetDict({
    test: Dataset({
        features: ['index', 'question', 'A', 'B', 'C', 'D', 'answer', 'category', 'image_source', 'image_url'],
        num_rows: 5157
    })
})


In [4]:
ds = ds['test']

## 1.APC Pipeline

In [5]:
# import os
# os.environ["PYOPENGL_PLATFORM"] = "osmesa"  # or "egl" if CUDA drivers support EGL
# os.environ["DISPLAY"] = ":0"

import os
os.environ["PYOPENGL_PLATFORM"] = "egl"   # use EGL instead of OSMesa for NVIDIA drivers
os.environ.pop("DISPLAY", None)  

':0'

In [6]:
!nvidia-smi

Thu Oct 23 11:02:48 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-SXM2-32GB           On  |   00000000:16:00.0 Off |                    0 |
| N/A   27C    P0             39W /  300W |       1MiB /  32768MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [7]:
# !kill -9 98534

In [7]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [8]:
%load_ext autoreload
%autoreload 2

import os
import sys
os.environ["DISPLAY"] =':1'
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # set GPU device
sys.path.append("apc/vision_modules")
import yaml
import re
import requests
from box import Box
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from io import BytesIO
from PIL import Image
# import APC pipeline
from apc.apc_pipeline import APC
from apc.utils import visualize_conversation, create_image_with_text

# set device
device_vlm = "cuda:0"
device_vision = "cuda:0"

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.




In [32]:
# load config
config_path = "apc/configs/qwenvl2_5_7b_instruct.yaml"
with open(config_path, "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
config = Box(config)

# load APC pipeline
apc = APC(config, device_vlm=device_vlm, device_vision=device_vision)

[INFO] Loaded config Qwen/Qwen2.5-VL-7B-Instruct
[INFO] Loaded config hidden_size=3584, model_type=qwen2_5_vl


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

[INFO] Loaded model type: qwen2_5_vl
[INFO] Hidden size: 3584
final text_encoder_type: bert-base-uncased


OutOfMemoryError: CUDA out of memory. Tried to allocate 26.00 MiB. GPU 0 has a total capacity of 31.73 GiB of which 2.44 MiB is free. Including non-PyTorch memory, this process has 31.73 GiB memory in use. Of the allocated memory 31.21 GiB is allocated by PyTorch, and 149.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [10]:
def download_image(image_url):
    response = requests.get(image_url, timeout=20)
    image = Image.open(BytesIO(response.content)).convert("RGB").resize((512, 512))
    return image

In [11]:
import trimesh
from trimesh.viewer import windowed
import io

def headless_render(self, resolution=(256, 256), *args, **kwargs):
    """
    Fully compatible headless fallback for trimesh.Scene.save_image().
    Returns raw PNG bytes (so downstream code expecting bytes won't break).
    """
    # Create a neutral gray placeholder image
    arr = np.ones((resolution[1], resolution[0], 3), dtype=np.uint8) * 127
    img = Image.fromarray(arr)
    
    # Encode it as PNG bytes
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    png_bytes = buf.getvalue()
    buf.close()
    return png_bytes

# Patch trimesh
trimesh.Scene.save_image = headless_render


In [12]:
# override trimesh.Scene.save_image to a safe headless version
def headless_render(self, resolution=(256, 256), *args, **kwargs):
    """
    Fallback headless renderer that just returns a blank PNG.
    It accepts all the same args/kwargs as trimesh.Scene.save_image.
    """
    arr = np.ones((resolution[1], resolution[0], 3), dtype=np.uint8) * 127  # gray image
    img = Image.fromarray(arr)
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return buf.getvalue()

# patch trimesh
trimesh.Scene.save_image = headless_render

In [13]:
import warnings
import torch

# Suppress all general warnings
warnings.filterwarnings("ignore")

# Optionally, suppress only specific warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# For torch checkpoint warning specifically
import torch.utils.checkpoint
torch.utils.checkpoint.use_reentrant = False  # explicitly set new default (if needed)


In [None]:
# import random

# random.seed(42)
# indices = random.sample(range(len(ds)), 100)

In [None]:
# from datasets import concatenate_datasets

# # how many total samples to select
# total_samples = 180

# # unique categories
# categories = list(set(ds["category"]))
# samples_per_cat = total_samples // len(categories)

# balanced_subsets = []

# for cat in categories:
#     subset = ds.filter(lambda x: x["category"] == cat)
#     n = min(samples_per_cat, len(subset))  # handle small categories
#     subset = subset.shuffle(seed=42).select(range(n))
#     balanced_subsets.append(subset)

# # combine them and shuffle the final set
# balanced_ds = concatenate_datasets(balanced_subsets).shuffle(seed=42)

# print(f"✅ Balanced dataset size: {len(balanced_ds)}")
# print(balanced_ds['category'][:20])  # preview the categories


✅ Balanced dataset size: 180
['multi_object_parallel', 'orientation_viewpoint', 'multi_object_viewpoint_towards_object', 'orientation_viewpoint', 'multi_object_same_direction', 'orientation_in_front_of', 'orientation_on_the_left', 'multi_object_closer_to', 'multi_object_parallel', 'height_higher', 'location_above', 'orientation_viewpoint', 'orientation_on_the_left', 'multi_object_closer_to', 'orientation_in_front_of', 'multi_object_same_direction', 'orientation_viewpoint', 'location_closer_to_camera', 'height_higher', 'orientation_viewpoint']


In [None]:
# balanced_ds.save_to_disk("test_dataset_150")

Saving the dataset (0/1 shards):   0%|          | 0/180 [00:00<?, ? examples/s]

In [None]:
from datasets import load_from_disk
ds = load_from_disk("test_dataset_150")

In [None]:
# with open("test_dataset_150.txt", "w") as f:
#     for idx in balanced_ds["index"]:
#         f.write(str(idx) + "\n")

In [None]:
# ds = balanced_ds

In [None]:
results = []

for i, example in enumerate(tqdm(ds, desc="Evaluating 3DSRBench")):
    image_url = example["image_url"]
    question = example["question"]
    options = [example["A"], example["B"], example["C"], example["D"]]
    correct = example["answer"]
    category = example["category"]

    # Download image
    image = download_image(image_url)

    # plt.imshow(image)
    # plt.axis("off")
    # plt.title(f"Index {i} — Category: {example['category']}")
    # plt.show()
    
    # Build prompt
    valid_options = [(chr(65 + j), opt) for j, opt in enumerate(options) if opt and str(opt).strip()]
    options_text = " or ".join([f"{label}. {opt}" for label, opt in valid_options])
    prompt = f"From the camera's point of view, {question.strip()} {options_text}, give the letter of the correct answer."

    # print("Prompt:")
    # print(prompt)

    image_with_text = create_image_with_text(image, "[Q] " + prompt, fontsize=20)
    # image_with_text.save(f"outputs/benchmark/3DSRBench_{i}_prompt.png")

    # Directory for saving intermediate results
    save_dir = f"outputs/benchmark/{category}_{i}"
    os.makedirs(save_dir, exist_ok=True)

    # Run APC pipeline
    response_text, _ = apc.run_apc(
        image,
        prompt,
        trace_save_dir=save_dir,
        perspective_prompt_type="visual",
        visualize_trace=False,
        visualize_scene_abstraction=False,
        return_conv_history=False,
        logging=False,
    )

    # print("Response", response_text)
    # Extract predicted answer (search for 'A', 'B', 'C', 'D')
    match = re.search(r"\b([ABCD])\b", response_text.upper())
    pred_letter = match.group(1) if match else None

    results.append({
        "index": example["index"],
        "category": category,
        "question": question,
        "prediction": pred_letter,
        "answer": correct,
        "is_correct": pred_letter == correct,
        "response_text": response_text,
    })

    # conv_viz = visualize_conversation(
    #     conv_history,
    #     width=900,
    #     row_gap=0,
    #     font_size=13,
    #     image_max_width=180,
    #     output_path=os.path.join(save_dir, "conversation_viz.png")
    # )


Evaluating 3DSRBench:   0%|                                         | 0/180 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating 3DSRBench: 100%|███████████████████████████████| 180/180 [41:50<00:00, 13.95s/it]


In [None]:
df = pd.DataFrame(results)
df.to_csv("3DSRBench_raw_predictions.csv", index=False)

## Compute metrics

In [None]:
df = pd.read_csv("3DSRBench_raw_predictions.csv")

In [30]:
metrics = {}

# ---- Overall accuracy ----
metrics["overall_accuracy"] = df["is_correct"].mean()

# ---- Standard deviation of correctness (robustness) ----
metrics["std_dev_accuracy"] = df["is_correct"].std()

# ---- Category-wise accuracy ----
cat_acc = df.groupby("category")["is_correct"].mean().to_dict()
for cat, acc in cat_acc.items():
    metrics[f"acc_{cat}"] = acc

In [31]:

# =======================
# Save results
# =======================
metrics_df = pd.DataFrame([metrics])
metrics_df.to_csv("3DSRBench_metrics_summary_3b.csv", index=False)

print("\n✅ Evaluation Complete!")
print(metrics_df.T)



✅ Evaluation Complete!
                                                  0
overall_accuracy                           0.333333
std_dev_accuracy                           0.472719
acc_height_higher                          0.400000
acc_location_above                         0.333333
acc_location_closer_to_camera              0.266667
acc_location_next_to                       0.000000
acc_multi_object_closer_to                 0.466667
acc_multi_object_facing                    0.200000
acc_multi_object_parallel                  0.533333
acc_multi_object_same_direction            0.066667
acc_multi_object_viewpoint_towards_object  0.200000
acc_orientation_in_front_of                0.533333
acc_orientation_on_the_left                0.666667
acc_orientation_viewpoint                  0.333333
