In [22]:
# Use a pipeline as a high-level helper
from transformers import pipeline
import json
from PIL import Image
pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)

def load_and_resize_image(image_path, max_size=1000):
    """
    ÂÆâÂÖ®Âä†ËΩΩÂõæÁâáÔºåËá™Âä®Ê£ÄÊü•„ÄÅÈôçÂàÜËæ®Áéá„ÄÇ
    
    Args:
        image_path (str): ÂõæÁâáË∑ØÂæÑ
        max_size (int): Ê®°ÂûãÊîØÊåÅÁöÑÊúÄÂ§ßÂàÜËæ®Áéá (shorter edge)
    
    Returns:
        PIL.Image: RGB ÂõæÁâáÔºåÂ∑≤ resize
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"‚ùå Image not found: {image_path}")

    img = Image.open(image_path).convert("RGB")

    # Â¶ÇÊûúÂõæÁâáÂ§™Â§ßÂàôÁ≠âÊØîÁº©Êîæ
    width, height = img.size
    if max(width, height) > max_size:
        # Á≠âÊØîÁº©ÊîæÔºåÊúÄÂ§ßËæπ= max_size
        scale = max_size / max(width, height)
        new_size = (int(width * scale), int(height * scale))
        img = img.resize(new_size, Image.Resampling.LANCZOS)

    return img

ValueError: The checkpoint you are trying to load has model type `qwen2_5_vl` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.

## EgoOrientBench

In [2]:
image_base_path = "/home/disheng/Spatial_Survey/Datasets/EgoOrientBench/"
json_file = "/home/disheng/Spatial_Survey/Datasets/EgoOrientBench/benchmark.json"
import json
from PIL import Image
from tqdm.auto import tqdm
with open(json_file, "r") as f:
    data = json.load(f)
import os

def evaluate_spaceom(pipe, data):
    acc = 0
    total = len(data)
    results = []
    for item in tqdm(data):
        image_path = f"{image_base_path}/{item['image']}"
        # img = Image.open(image_path).convert("RGB")
        img = load_and_resize_image(image_path, max_size=224)  # Ensure image is resized correctly

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": item["question"]}
                ]
            },
        ]
        response = pipe(text=messages)
        model_answer = response[0]["generated_text"][-1]["content"].strip()
        results.append({
            "image": item["image"],
            "question": item["question"],
            "answer": model_answer,
            "label": item["label"],
            "Accuracy": item["label"] in model_answer
        })
    #save results to a JSON file
    with open("EgoOrientBench_spaceom_results.json", "w") as f:
        json.dump(results, f, indent=4)
    print(f"Accuracy: {sum(1 for r in results if r['Accuracy']) / total * 100:.2f}%")
    print(f"Total: {total}")
    return results

In [None]:
evaluate_spaceom(pipe, data)

In [4]:
from joblib import Parallel, delayed
import torch
import json
import os
from tqdm.auto import tqdm
from PIL import Image

# Êï∞ÊçÆË∑ØÂæÑ
image_base_path = "/home/disheng/Spatial_Survey/Datasets/EgoOrientBench/"
json_file = "/home/disheng/Spatial_Survey/Datasets/EgoOrientBench/benchmark.json"

with open(json_file, "r") as f:
    data = json.load(f)

print(f"Total samples: {len(data)}")

Total samples: 33460


In [7]:
def load_pipe():
    from transformers import pipeline
    pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm",
        device=0  # Ê≥®ÊÑèËøôÈáå‰∏ÄÂÆöÂÜô device=0ÔºåÂõ†‰∏∫ÊØè‰∏™ËøõÁ®ãÈÉΩÂè™ÁúãÂà∞Ëá™Â∑±ÁöÑ GPU
    )
    return pipe

In [8]:
def chunkify(lst, n):
    """ÊääÂàóË°®ÂùáÂàÜ‰∏∫ n ‰ªΩ"""
    return [lst[i::n] for i in range(n)]

def load_and_resize_image(path, max_size=224):
    img = Image.open(path).convert("RGB")
    img.thumbnail((max_size, max_size))
    return img


def worker(gpu_id, data_chunk):
    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    import torch

    # TODO: Âú®ËøôÈáåÈáçÊñ∞Âä†ËΩΩ pipe
    # ‰æãÂ¶ÇÔºö
    # from transformers import pipeline
    # pipe = pipeline("your-task", device=0)
    pipe = load_pipe()  # ‰Ω†ÈúÄË¶ÅËá™Â∑±ÂÆûÁé∞Ëøô‰∏™

    results = []
    for item in tqdm(data_chunk, desc=f"GPU {gpu_id}"):
        image_path = f"{image_base_path}/{item['image']}"
        img = load_and_resize_image(image_path, max_size=224)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": item["question"]}
                ]
            },
        ]
        response = pipe(text=messages)
        model_answer = response[0]["generated_text"][-1]["content"].strip()
        results.append({
            "image": item["image"],
            "question": item["question"],
            "answer": model_answer,
            "label": item["label"],
            "Accuracy": item["label"] in model_answer
        })
    return results

num_gpus = torch.cuda.device_count()
print(f"Detected GPUs: {num_gpus}")

# ÂùáÂåÄÂàÜÂùó
chunks = chunkify(data, num_gpus)

# Âπ∂Ë°åÊâßË°å
all_results = Parallel(n_jobs=num_gpus)(
    delayed(worker)(gpu_id, chunk) for gpu_id, chunk in enumerate(chunks)
)

# Â±ïÂπ≥
flat_results = [item for sublist in all_results for item in sublist]

# ‰øùÂ≠ò
with open("EgoOrientBench_spaceom_results.json", "w") as f:
    json.dump(flat_results, f, indent=4)

acc = sum(1 for r in flat_results if r['Accuracy']) / len(flat_results)
print(f"Accuracy: {acc * 100:.2f}%")
print(f"Total: {len(flat_results)}")

Detected GPUs: 4


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 18.86it/s]
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 18.66it/s]
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 18.75it/s]
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 18.48it/s]
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
GPU 1:   0%|          | 4/8365 [00:01<44:14,  3.15it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
GPU 0:   0%|          | 10/8365 [00:01<16:19,  8.53it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
GPU 1:   0%|          | 10/8365 [00:02<31:01,  4.49it/s]You seem to be using the pipelines

Accuracy: 12.45%
Total: 33460


GPU 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8365/8365 [36:01<00:00,  3.87it/s]


In [12]:
path = "/home/disheng/Spatial_Survey/Spatial_VLM_Survey/code/evaluation/EgoOrientBench_spaceom_results.json"
data = json.load(open(path, "r"))
acc  = 0
for item in data:
    answer = item["answer"].lower()
    label = item["label"].lower()
    if label in answer:
        acc += 1
print(f"Accuracy: {acc / len(data) * 100:.2f}%")

Accuracy: 40.82%


## GeoMeter

### real data

In [119]:
# %%
# üöÄ 1) ÁéØÂ¢ÉÂáÜÂ§á
from pathlib import Path
from PIL import Image
import json
import pandas as pd
from tqdm.auto import tqdm
from transformers import pipeline
import re

# %%
# ‚öôÔ∏è 2) ÂèÇÊï∞ÈÖçÁΩÆ
IMAGE_BASE   = Path("/home/disheng/Spatial_Survey/Datasets/GeoMeter/Real/")
JSONL_FILE   = "/home/disheng/Spatial_Survey/Datasets/GeoMeter/Real/depth_height_1000_realworld.jsonl"
MODEL_NAME   = "remyxai/SpaceOm"
TASK         = "image-text-to-text"
DEVICE_ID    = 2      # ÂçïÂç°Â∞±Áî® 0
BATCH_SIZE   = 1     # Ê†πÊçÆÊòæÂ≠òË∞É

# %%
# üìñ 3) ËØªÊï∞ÊçÆ
data = []
with open(JSONL_FILE, "r") as f:
    for line in f:
        item = json.loads(line)
        assert "images" in item and "query_text" in item and "target_text" in item
        data.append(item)

# %%
# üîß 4) ÂàùÂßãÂåñ pipeline
pipe = pipeline(
    TASK,
    model=MODEL_NAME,
    device=DEVICE_ID,
    batch_size=BATCH_SIZE,
)

# %%
# üèÉ 5) ÊâπÈáèÊé®ÁêÜ + ËøõÂ∫¶Êù°
results = []
for i in tqdm(range(0, len(data), BATCH_SIZE), desc="Inference"):
    batch = data[i : i + BATCH_SIZE]
    messages = []
    for item in batch:
        img = Image.open(IMAGE_BASE / item["images"][0]).convert("RGB")
        messages.append({
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text",  "text":  item["query_text"]}
            ]
        })

    outputs = pipe(text=messages)

    for item, out in zip(batch, outputs):
        # Âèñ generated_text Ëøô‰∏™ listÔºåÁÑ∂ÂêéÊâæÂá∫ assistant ÈÇ£Êù°
        gen_list = out.get("generated_text", [])
        assistant_entry = next(
            (entry for entry in gen_list if entry.get("role")=="assistant"),
            None
        )
        if assistant_entry is None:
            # ‰∏á‰∏ÄÊ≤°ÊâæÂà∞ÔºåÂ∞±ÈôçÁ∫ßÂ§ÑÁêÜ
            raw_text = str(gen_list)
        else:
            raw_text = assistant_entry.get("content", "")

        # Áî®Ê≠£ÂàôÂéªÊéâÊú´Â∞æÂ§ö‰ΩôÁöÑÈÄóÂè∑„ÄÅÂè•Âè∑
        pred = re.sub(r"[Ôºå,\.„ÄÇ]+$", "", raw_text).strip()

        results.append({
            "image": item["images"][0],
            "query": item["query_text"],
            "pred":  pred,
            "gold":  item["target_text"]
        })

# %%
# üìä 6) ËÆ°ÁÆóÂáÜÁ°ÆÁéá & ‰øùÂ≠ò
accuracy = 0.0
df = pd.DataFrame(results)
for pred, gold in zip(df["pred"], df["gold"]):
    if pred.lower() in gold.lower():
        accuracy += 1
accuracy /= len(df)
print(f"‚ñ∂Ô∏è Overall Accuracy: {accuracy:.2%}")
print("number of accurate samples:", int(accuracy * len(df)))
print("number of samples:", len(df))

# df.to_json("geobench_real_results_with_predictions.json",
#            orient="records", indent=2, force_ascii=False)
# print("‚úÖ ÁªìÊûúÂ∑≤‰øùÂ≠òÂà∞ geobench_real_results_with_predictions.{json,csv}")

Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 16.25it/s]
Device set to use cuda:2
Inference: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:18<00:00,  5.50it/s]

‚ñ∂Ô∏è Overall Accuracy: 43.00%
number of accurate samples: 43
number of samples: 100





### synthetic data

### depth

In [None]:

# depth

base_ir = "/home/disheng/Spatial_Survey/Datasets/GeoMeter/Synthetic/depth"
dir_list = os.listdir(base_ir)

depth_results = {}
depth_3_shapes = {}
depth_5_shapes = {}
for each_sub_set in dir_list: # [images-3-shapes, images-5-shapes]
    each_shape_set_results = {}
    
    for each_example in os.listdir(os.path.join(base_ir, each_sub_set, "prompts")):  #each prompt
        prompts_json = os.path.join(base_ir, each_sub_set, "prompts", each_example)
        with open(prompts_json, "r") as f:
            prompts = json.load(f)

        image_name = prompts["filename"]
        image_path = os.path.join(base_ir, each_sub_set, "imgs", image_name)
        image_labelled_path = os.path.join(base_ir, each_sub_set, "labelled", image_name)
        image_labelled_id_path = os.path.join(base_ir, each_sub_set, "labelled_id", image_name)
        image_labelled_id_reverse_path = os.path.join(base_ir, each_sub_set, "labelled_id_reverse", image_name)

        prompts_plain = prompts["prompts"]
        prompts_labelled = prompts["prompts_labelled"]
        prompts_labelled_id = prompts["prompts_labelled_id"]

        plain_results = {}
        labelled_results = {}
        labelled_id_results = {}
        labelled_id_reverse_results = {}

        for index, each_prompt in enumerate(prompts_plain):
            answer = each_prompt["answer"]
            textual_prompt = each_prompt["prompt"] 
            answer_set = ";\n".join(each_prompt["answerSet"])

            full_prompt = textual_prompt + "\nAnswer Set:\n" + answer_set

            messages = []
            img = load_and_resize_image(image_path)
            messages.append({
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text",  "text":  full_prompt}
                ]
            })
            inference_result = pipe(text=messages)
            final_answer = inference_result[0].get("generated_text", [])[-1]["content"]

            plain_results[index] = {
                "image": image_path,
                "prompt": full_prompt,
                "answer": answer,
                "pred": final_answer,
                "accuracy": answer.lower() in final_answer.lower(),
            }
        
        for index, each_prompt in enumerate(prompts_labelled):
            answer = each_prompt["answer"]
            textual_prompt = each_prompt["prompt"] 
            answer_set = ";\n".join(each_prompt["answerSet"])

            full_prompt = textual_prompt + "\nAnswer Set:\n" + answer_set

            messages = []
            img = load_and_resize_image(image_labelled_path)
            messages.append({
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text",  "text":  full_prompt}
                ]
            })
            inference_result = pipe(text=messages)
            final_answer = inference_result[0].get("generated_text", [])[-1]["content"] 

            labelled_results[index] = {
                "image": image_labelled_path,
                "prompt": full_prompt,
                "answer": answer,
                "pred": final_answer,
                "accuracy": answer.lower() in final_answer.lower(),
            }

        for index, each_prompt in enumerate(prompts_labelled_id):
            answer = each_prompt["answer"]
            textual_prompt = each_prompt["prompt"] 
            answer_set = ";\n".join(each_prompt["answerSet"])

            full_prompt = textual_prompt + "\nAnswer Set:\n" + answer_set

            messages = []
            img = load_and_resize_image(image_labelled_id_path)
            messages.append({
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text",  "text":  full_prompt}
                ]
            })
            inference_result = pipe(text=messages)
            final_answer = inference_result[0].get("generated_text", [])[-1]["content"]     
            labelled_id_results[index] = {
                "image": image_labelled_id_path,
                "prompt": full_prompt,
                "answer": answer,
                "pred": final_answer,
                "accuracy": answer.lower() in final_answer.lower(),
            }

        for index, each_prompt in enumerate(prompts_labelled_id):
            answer = each_prompt["answerReverse"]
            textual_prompt = each_prompt["prompt"] 
            answer_set = ";\n".join(each_prompt["answerSet"])

            full_prompt = textual_prompt + "\nAnswer Set:\n" + answer_set

            messages = []
            img = load_and_resize_image(image_labelled_id_reverse_path)
            messages.append({
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text",  "text":  full_prompt}
                ]
            })
            inference_result = pipe(text=messages)
            final_answer = inference_result[0].get("generated_text", [])[-1]["content"]
            labelled_id_reverse_results[index] = {
                "image": image_labelled_id_reverse_path,
                "prompt": full_prompt,
                "answer": answer,
                "pred": final_answer,
                "accuracy": answer.lower() in final_answer.lower(),
            }
        if "3-shapes" in each_sub_set:
            depth_3_shapes[each_sub_set] = {
                "plain": plain_results,
                "labelled": labelled_results,
                "labelled_id": labelled_id_results,
                "labelled_id_reverse": labelled_id_reverse_results
            }
        elif "5-shapes" in each_sub_set:
            depth_5_shapes[each_sub_set] = {
                "plain": plain_results,
                "labelled": labelled_results,
                "labelled_id": labelled_id_results,
                "labelled_id_reverse": labelled_id_reverse_results
            }   

    depth_results[each_sub_set] = {
        "plain": plain_results, 
        "labelled": labelled_results,
        "labelled_id": labelled_id_results,
        "labelled_id_reverse": labelled_id_reverse_results
    }

# Save results          
import json
depth_results_path = "/home/disheng/Spatial_Survey/Datasets/GeoMeter/Synthetic/depth/depth_results.json"
with open(depth_results_path, "w") as f:
    json.dump({
        "depth_3_shapes": depth_3_shapes,
        "depth_5_shapes": depth_5_shapes,
        "depth_results": depth_results
    }, f, indent=4)     


In [68]:
def load_and_resize_image(path, max_size=224):
    img = Image.open(path).convert("RGB")
    img.thumbnail((max_size, max_size))
    return img

def chunkify(lst, n):
    return [lst[i::n] for i in range(n)]

def load_pipe():
    from transformers import pipeline
    pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm",
    device=0  # Ê≥®ÊÑèËøôÈáå‰∏ÄÂÆöÂÜô device=0ÔºåÂõ†‰∏∫ÊØè‰∏™ËøõÁ®ãÈÉΩÂè™ÁúãÂà∞Ëá™Â∑±ÁöÑ GPU
    )
    return pipe

In [71]:
def process_subset(subset_name):
    from tqdm.auto import tqdm

    pipe = load_pipe()  # Âè™Âú®ËøôÈáåÂä†ËΩΩÔºåÊØè‰∏™ GPU ‰∏Ä‰ªΩ

    each_shape_set_results = {}

    prompts_dir = os.path.join(base_ir, subset_name, "prompts")
    examples = os.listdir(prompts_dir)

    for each_example in tqdm(examples, desc=f"Subset {subset_name}"):
        with open(os.path.join(prompts_dir, each_example), "r") as f:
            prompts = json.load(f)

        img_base = os.path.join(base_ir, subset_name)
        paths = {
            "plain": os.path.join(img_base, "imgs", prompts["filename"]),
            "labelled": os.path.join(img_base, "labelled", prompts["filename_labelled"]),
            "labelled_id": os.path.join(img_base, "labelled_id", prompts["filename_labelled"]),
            "labelled_reverse_id": os.path.join(img_base, "labelled_reverse_id", prompts["filename_labelled"])
        }

        def run_prompts(prompts_list, img_path, is_reverse=False):
            output = {}
            for index, p in enumerate(prompts_list):
                answer = p["answerReverse"] if is_reverse else p["answer"]
                full_prompt = p["prompt"] + "\nAnswer Set:\n" + ";\n".join(p["answerSet"])
                img = load_and_resize_image(img_path)

                messages = [{
                    "role": "user",
                    "content": [
                        {"type": "image", "image": img},
                        {"type": "text", "text": full_prompt}
                    ]
                }]
                try:
                    inference_result = pipe(text=messages)
                    final_answer = inference_result[0].get("generated_text", [])[-1]["content"]
                except Exception as e:
                    print(f"Error: {e}")
                    final_answer = "ERROR"

                output[index] = {
                    "image": img_path,
                    "prompt": full_prompt,
                    "answer": answer,
                    "pred": final_answer,
                    "accuracy": answer.lower() in final_answer.lower()
                }
            return output

        plain = run_prompts(prompts["prompts"], paths["plain"])
        labelled = run_prompts(prompts["prompts_labelled"], paths["labelled"])
        labelled_id = run_prompts(prompts["prompts_labelled_id"], paths["labelled_id"])
        labelled_reverse_id = run_prompts(prompts["prompts_labelled_id"], paths["labelled_reverse_id"], is_reverse=True)

        each_shape_set_results[each_example] = {
            "plain": plain,
            "labelled": labelled,
            "labelled_id": labelled_id,
            "labelled_reverse_id": labelled_reverse_id
        }

    return (subset_name, each_shape_set_results)


def main_parallel():
    num_gpus = 4  # ‰Ω†Êúâ 4 Âº† GPU
    print(f"Using {num_gpus} GPUs")

    chunks = chunkify(dir_list, num_gpus)

    def worker(gpu_id, chunk):
        import os
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
        results = {}
        for subset in chunk:
            name, res = process_subset(subset)
            results[name] = res
        return results

    from joblib import Parallel, delayed

    all_results = Parallel(n_jobs=num_gpus)(
        delayed(worker)(gpu_id, chunk) for gpu_id, chunk in enumerate(chunks)
    )

    # ÂêàÂπ∂
    depth_results = {}
    for part in all_results:
        depth_results.update(part)

    depth_3_shapes = {k: v for k, v in depth_results.items() if "3-shapes" in k}
    depth_5_shapes = {k: v for k, v in depth_results.items() if "5-shapes" in k}

    save_path = os.path.join(base_ir, "depth_results.json")
    with open(save_path, "w") as f:
        json.dump({
            "depth_3_shapes": depth_3_shapes,
            "depth_5_shapes": depth_5_shapes,
            "depth_results": depth_results
        }, f, indent=4)

    print(f"Saved to: {save_path}")

main_parallel()

Using 4 GPUs


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 18.59it/s]
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 18.88it/s]
Device set to use cuda:0
Device set to use cuda:0
Subset images-5-shapes:   0%|          | 0/400 [00:00<?, ?it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Subset images-3-shapes: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [09:03<00:00,  5.43s/it] 
Subset images-5-shapes: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 400/400 [1:01:12<00:00,  9.18s/it]


Saved to: /home/disheng/Spatial_Survey/Datasets/GeoMeter/Synthetic/depth/depth_results.json


In [118]:
path = "/home/disheng/Spatial_Survey/Datasets/GeoMeter/Synthetic/depth/depth_results.json"
import json
from pathlib import Path
from tqdm.auto import tqdm
depth_results = json.load(open(path, "r"))
depth_3_shapes = depth_results["depth_3_shapes"]
depth_5_shapes = depth_results["depth_5_shapes"]
depth_results = depth_results["depth_results"]
acc = 0
total = 0
for json in depth_3_shapes["images-3-shapes"]:
    plain = depth_3_shapes["images-3-shapes"][json]["plain"]
    labelled = depth_3_shapes["images-3-shapes"][json]["labelled"]
    labelled_id = depth_3_shapes["images-3-shapes"][json]["labelled_id"]
    labelled_reverse_id = depth_3_shapes["images-3-shapes"][json]["labelled_reverse_id"]
    acc += sum(1 for v in plain.values() if v["accuracy"])
    acc += sum(1 for v in labelled.values() if v["accuracy"])
    acc += sum(1 for v in labelled_id.values() if v["accuracy"])
    acc += sum(1 for v in labelled_reverse_id.values() if v["accuracy"])
    total += len(plain) + len(labelled) + len(labelled_id) + len(labelled_reverse_id)
print(f"Accuracy: {acc / total * 100:.2f}%")

for json in depth_5_shapes["images-5-shapes"]:
    plain = depth_5_shapes["images-5-shapes"][json]["plain"]
    labelled = depth_5_shapes["images-5-shapes"][json]["labelled"]
    labelled_id = depth_5_shapes["images-5-shapes"][json]["labelled_id"]
    labelled_reverse_id = depth_5_shapes["images-5-shapes"][json]["labelled_reverse_id"]
    acc += sum(1 for v in plain.values() if v["accuracy"])
    acc += sum(1 for v in labelled.values() if v["accuracy"])
    acc += sum(1 for v in labelled_id.values() if v["accuracy"])
    acc += sum(1 for v in labelled_reverse_id.values() if v["accuracy"])
    total += len(plain) + len(labelled) + len(labelled_id) + len(labelled_reverse_id)
print(f"Accuracy: {acc / total * 100:.2f}%")
print("number of Accurate: ", acc)
print("number of Total: ", total)
    



Accuracy: 34.85%
Accuracy: 28.84%
number of Accurate:  4811
number of Total:  16684


### heigh

In [None]:
import os
import json
from tqdm.auto import tqdm

base_path = "/home/disheng/Spatial_Survey/Datasets/GeoMeter/Synthetic/height"
sub_classes = os.listdir(base_path)

height_results = {}

print(f"Found {len(sub_classes)} sub-classes: {sub_classes}")

for each_sub_class in sub_classes:
    print(f"Processing {each_sub_class}...")
    sub_class_path = os.path.join(base_path, each_sub_class)
    prompts_dir = os.path.join(sub_class_path, "prompts")
    prompts_files = os.listdir(prompts_dir)

    for each_prompt_file in tqdm(prompts_files, desc=f"{each_sub_class} prompts"):
        prompt_path = os.path.join(prompts_dir, each_prompt_file)
        with open(prompt_path, "r") as f:
            prompt_json = json.load(f)

        image_name = prompt_json["filename"]
        image_path = os.path.join(sub_class_path, "imgs", image_name)
        prompts_list = prompt_json["prompts"]

        for index, prompt in enumerate(prompts_list):
            answer = prompt["answer"]
            answer_set = ";\n".join(prompt["answerSet"])
            full_prompt = prompt["prompt"] + "\nAnswer Set:[\n" + answer_set + "\n]"

            img = load_and_resize_image(image_path)
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": full_prompt}
                ]
            }]

            inference_result = pipe(text=messages)
            final_answer = inference_result[0].get("generated_text", [])[-1]["content"]

            # ËÆ∞ÂΩïÁªìÊûúÔºåÂàÜÂ±ÇÁªìÊûÑ: {sub_class: {image_name: {index: ...}}}
            height_results.setdefault(each_sub_class, {})
            height_results[each_sub_class].setdefault(image_name, {})
            height_results[each_sub_class][image_name][index] = {
                "image": image_path,
                "prompt": full_prompt,
                "answer": answer,
                "pred": final_answer,
                "accuracy": answer.lower() in final_answer.lower(),
            }

# ‰øùÂ≠òÁªìÊûú
height_results_path = os.path.join(base_path, "height_results.json")
with open(height_results_path, "w") as f:
    json.dump(height_results, f, indent=4)

print(f"Results saved to: {height_results_path}")

In [114]:
import os
import json
from tqdm.auto import tqdm
from joblib import Parallel, delayed
import torch

# Ë∑ØÂæÑ
base_path = "/home/disheng/Spatial_Survey/Datasets/GeoMeter/Synthetic/height"
sub_classes = os.listdir(base_path)
print(f"Found {len(sub_classes)} sub-classes: {sub_classes}")

def chunkify(lst, n):
    """Â∞ÜÂàóË°®ÂùáÂåÄÂàÜÂùó"""
    return [lst[i::n] for i in range(n)]

def process_height_sub_class(sub_class):
    from tqdm.auto import tqdm
    import os

    pipe = load_pipe()  # ÊØèÂº† GPU Áã¨Á´ãÂä†ËΩΩ

    sub_class_path = os.path.join(base_path, sub_class)
    prompts_dir = os.path.join(sub_class_path, "prompts")
    prompt_files = os.listdir(prompts_dir)

    results = {}

    for each_prompt_file in tqdm(prompt_files, desc=f"Sub-class: {sub_class}"):
        with open(os.path.join(prompts_dir, each_prompt_file), "r") as f:
            prompt_json = json.load(f)

        image_name = prompt_json["filename"]
        image_path = os.path.join(sub_class_path, "imgs", image_name)
        prompts_list = prompt_json["prompts"]

        for index, prompt in enumerate(prompts_list):
            answer = prompt["answer"]
            answer_set = ";\n".join(prompt["answerSet"])
            full_prompt = prompt["prompt"] + "\nAnswer Set:[\n" + answer_set + "\n]\nJust select the answer from the set without any explanation."

            img = load_and_resize_image(image_path)
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": full_prompt}
                ]
            }]

            try:
                inference_result = pipe(text=messages)
                final_answer = inference_result[0].get("generated_text", [])[-1]["content"]
            except Exception as e:
                print(f"Error: {e}")
                final_answer = "ERROR"

            # ÁªìÊûÑ: {sub_class: {image_name: {index: ...}}}
            results.setdefault(sub_class, {})
            results[sub_class].setdefault(image_name, {})
            results[sub_class][image_name][index] = {
                "image": image_path,
                "prompt": full_prompt,
                "answer": answer,
                "pred": final_answer,
                "accuracy": answer.lower() in final_answer.lower(),
            }

    return results

def worker(gpu_id, chunk):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    gpu_results = {}

    for sub_class in chunk:
        res = process_height_sub_class(sub_class)
        gpu_results.update(res)

    return gpu_results

def run_height_parallel():
    num_gpus = torch.cuda.device_count()
    print(f"Detected GPUs: {num_gpus}")

    chunks = chunkify(sub_classes, num_gpus)

    all_results = Parallel(n_jobs=num_gpus)(
        delayed(worker)(gpu_id, chunk) for gpu_id, chunk in enumerate(chunks)
    )

    # ÂêàÂπ∂
    height_results = {}
    for part in all_results:
        height_results.update(part)

    # ‰øùÂ≠ò
    save_path = os.path.join(base_path, "height_results.json")
    with open(save_path, "w") as f:
        json.dump(height_results, f, indent=4)

    print(f"Results saved to: {save_path}")


run_height_parallel()

Found 8 sub-classes: ['images-3-stacks-colored', 'images-3-stacks-stepped-colored', 'images-3-stacks-stepped', 'images-5-stacks-stepped', 'images-5-stacks-colored', 'images-3-stacks', 'images-5-stacks', 'images-5-stacks-stepped-colored']
Detected GPUs: 4


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 18.36it/s]
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 18.82it/s]
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 18.41it/s]
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 18.71it/s]
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Sub-class: images-3-stacks-stepped:   2%|‚ñè         | 4/200 [00:01<01:09,  2.81it/s].60it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Sub-class: images-5-stacks-stepped:   1%|          | 2/200 [00:01<02:41,  1.22it/s]You seem to be using the 

Results saved to: /home/disheng/Spatial_Survey/Datasets/GeoMeter/Synthetic/height/height_results.json


Sub-class: images-5-stacks-colored: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 250/250 [03:00<00:00,  1.39it/s]


In [117]:
acc = 0
total = 0
height_results_path = os.path.join(base_path, "height_results.json")
height_results = json.load(open(height_results_path, "r"))
for sub_class, images in height_results.items():
    for image_name, prompts in images.items():
        for index, result in prompts.items():
            if result["accuracy"]:
                acc += 1
            total += 1
print(f"Accuracy: {acc / total * 100:.2f}%")
print("number of accurate results:", acc)
print("total number of results:", total)

Accuracy: 24.23%
number of accurate results: 1713
total number of results: 7069


In [120]:
number_of_samples = 7069 + 100 + 16684
number_of_accurate_samples = 1713 + 4811 + 43
print("final Accuracy: ", number_of_accurate_samples / number_of_samples * 100)

final Accuracy:  27.53112815997988


## OmniSpatial

In [None]:
# export HF_ENDPOINT="https://hf-mirror.com"
!mkdir -p dataset
!huggingface-cli download --resume-download qizekun/OmniSpatial --local-dir dataset --repo-type dataset
!find dataset/ -name '*.zip' -exec unzip -o {} -d dataset/ \;
!rm -f dataset/*.zip && rm -rf dataset/__MACOSX

In [4]:
!export CUDA_VISIBLE_DEVICES=3

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [2]:
from PIL import Image
import os

def load_and_resize_image(image_path, max_size=448):
    """
    ÂÆâÂÖ®Âä†ËΩΩÂõæÁâáÔºåËá™Âä®Ê£ÄÊü•„ÄÅÈôçÂàÜËæ®Áéá„ÄÇ
    
    Args:
        image_path (str): ÂõæÁâáË∑ØÂæÑ
        max_size (int): Ê®°ÂûãÊîØÊåÅÁöÑÊúÄÂ§ßÂàÜËæ®Áéá (shorter edge)
    
    Returns:
        PIL.Image: RGB ÂõæÁâáÔºåÂ∑≤ resize
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"‚ùå Image not found: {image_path}")

    img = Image.open(image_path).convert("RGB")

    # Â¶ÇÊûúÂõæÁâáÂ§™Â§ßÂàôÁ≠âÊØîÁº©Êîæ
    width, height = img.size
    if max(width, height) > max_size:
        # Á≠âÊØîÁº©ÊîæÔºåÊúÄÂ§ßËæπ= max_size
        scale = max_size / max(width, height)
        new_size = (int(width * scale), int(height * scale))
        img = img.resize(new_size, Image.Resampling.LANCZOS)

    return img

In [3]:
from tqdm.auto import tqdm
from transformers import pipeline
import json
from PIL import Image
# pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm")
question_types =  {0:"Dynamic_Reasoning", 
                   1:"Spatial_Interaction", 
                   2:"Complex_Logic", 
                   3:"Perspective_Taking"}
annotation = "/home/disheng/Spatial_Survey/Datasets/OmniSpatial/dataset/data.json"
import json
annotation_data = json.load(open(annotation, "r"))
pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm")
record = {}
for item in tqdm(annotation_data):
    iid = item["id"]
    image_id = item["id"].split("_")[0]
    question_type = item["task_type"]

    question = item["question"]
    options = item["options"]
    full_prompt = f"Question: {question}\nOptions: {"; ".join(options)}. Please only retuen a correct option without analysis."
    label = item["answer"]
    if len(options) != 0:
        label = options[label]
    image_path = f"/home/disheng/Spatial_Survey/Datasets/OmniSpatial/dataset/{question_type}/{image_id}.png"
    img = load_and_resize_image(image_path, max_size=448)  # ‰ΩøÁî®ÂÆâÂÖ®Âä†ËΩΩÂáΩÊï∞
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": full_prompt}
            ]
        },
    ]
    try:
        response = pipe(text=messages)
    except Exception as e:
        print(f"Error processing {iid}: {e}")
        print(image_path)
        print(options)
        print(full_prompt)
        print(label)
        print(messages)
        print()
        continue
    model_answer = response[0]["generated_text"][-1]["content"]
    # record all of information for later analysis
    record[iid] = {
        "image_path": image_path,
        "question_type": question_type,
        "question": question,
        "options": options,
        "label": label,
        "model_answer": model_answer,
        "correct": model_answer == label
    }
# ‰øùÂ≠òÁªìÊûú
import json
with open("omnispatial_results.json", "w") as f:
    json.dump(record, f, indent=2, ensure_ascii=False)  


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:00, 13.64it/s]
Device set to use cuda:0
  1%|          | 10/1533 [00:01<03:29,  7.27it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1533/1533 [04:14<00:00,  6.02it/s]


In [4]:
result = "/home/disheng/Spatial_Survey/omnispatial_results.json"
import json
record = json.load(open(result, "r"))
acc = 0
total = len(record)
correct ={}
wrong = {}
for id, item in record.items():
    label = item["label"]
    model_answer = item["model_answer"]
    if label in model_answer:
        acc += 1
        correct[id] = item
    else:
        wrong[id] = item
accuracy = acc / total
print(f"Accuracy: {accuracy:.2%}")
       


Accuracy: 43.70%


## MM-Vet

In [1]:
from datasets import load_dataset
from tqdm.auto import tqdm
# Load a specific split 
dataset = load_dataset("LLDDSS/Awesome_Spatial_VLMs", split="mm_vet")
from transformers import pipeline
import json
from PIL import Image
from torchvision.transforms.functional import to_pil_image
from PIL import Image
from matplotlib import pyplot as plt


In [2]:
dataset = dataset.with_format("torch", columns=['id', 'image', 'question', 'options', 'GT'])
from torch.utils.data import DataLoader

data_loader = DataLoader(dataset,)
records = {}
pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm")

def check_correctness(model_answer, GT):
    GT = GT[0]
    if "<AND>" in GT:
        # Â§ÑÁêÜ AND ÈÄªËæë
        gt_answers = GT.split("<AND>")
        return all(answer.strip().lower() in model_answer.lower() for answer in gt_answers)
    elif "<OR>" in GT:
        # Â§ÑÁêÜ OR ÈÄªËæë
        gt_answers = GT.split("<OR>")
        return any(answer.strip().lower() in model_answer.lower() for answer in gt_answers)
    return GT.strip().lower() in model_answer.lower()

for batch in tqdm(data_loader):
    id = batch['id'][0]
    img = batch['image']  # shape: (B, C, H, W)

    # print("Type:", type(imgs))
    # print("Shape:", imgs.shape)

    # ÂèØËßÜÂåñÁ¨¨ 0 Âº†ÂõæÂÉè
    img_pil = to_pil_image(img[0])  # shape: [4, H, W]


    full_prompt = batch['question']   
    GT = batch['GT']

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img_pil},
                {"type": "text", "text": full_prompt}
            ]
        },
    ]
    try:
        response = pipe(text=messages)
    except Exception as e:
        print(f"Error processing {id}: {e}")
        raise e
    model_answer = response[0]["generated_text"][-1]["content"]
    whether_correct = check_correctness(model_answer, GT)
    # record all of information for later analysis
    records[id] = {
        "question": full_prompt,
        "GT": GT,
        "model_answer": model_answer,
        "correct": whether_correct
    }
# ‰øùÂ≠òÁªìÊûú
print(f"Total records: {len(records)}")
print(f"Correct records: {sum(1 for r in records.values() if r['correct'])}")
print(f"Accuracy: {sum(1 for r in records.values() if r['correct']) / len(records):.2%}")
import json
with open("mm_vet.json", "w") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)
    


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


  0%|          | 0/75 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Total records: 75
Correct records: 36
Accuracy: 48.00%


## what is up

In [5]:
from datasets import load_dataset
from tqdm.auto import tqdm
# Load a specific split 
dataset = load_dataset("LLDDSS/Awesome_Spatial_VLMs", split="whats_up")
from transformers import pipeline
import json
from PIL import Image
from torchvision.transforms.functional import to_pil_image
from PIL import Image
from matplotlib import pyplot as plt

In [6]:
dataset

Dataset({
    features: ['id', 'image', 'question', 'options', 'GT'],
    num_rows: 820
})

In [4]:


dataset = dataset.with_format("torch", columns=['id', 'image', 'question', 'options', 'GT'])
from torch.utils.data import DataLoader

data_loader = DataLoader(dataset,)
records = {}
pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm")

def check_correctness(model_answer, GT):
    if "<AND>" in GT:
        # Â§ÑÁêÜ AND ÈÄªËæë
        gt_answers = GT.split("<AND>")
        return all(answer.strip().lower() in model_answer.lower() for answer in gt_answers)
    elif "<OR>" in GT:
        # Â§ÑÁêÜ OR ÈÄªËæë
        gt_answers = GT.split("<OR>")
        return any(answer.strip().lower() in model_answer.lower() for answer in gt_answers)
    return GT.strip().lower() in model_answer.lower()

for batch in tqdm(data_loader):
    id = batch['id'][0]
    img = batch['image']  # shape: (B, C, H, W)

    img_pil = to_pil_image(img[0])  # shape: [4, H, W]

    question = batch['question'][0]
    options = batch['options'][0]
    full_prompt = f"Question: {question}\nOptions: {options}. Please only retuen a correct option without analysis."
    GT = batch['GT'][0]
    # print()
    # print(full_prompt)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img_pil},
                {"type": "text", "text": full_prompt}
            ]
        },
    ]
    try:
        response = pipe(text=messages)
    except Exception as e:
        print(f"Error processing {id}: {e}")
        raise e
    model_answer = response[0]["generated_text"][-1]["content"]
    whether_correct = check_correctness(model_answer, GT)
    # record all of information for later analysis
    records[id] = {
        "question": full_prompt,
        "GT": GT,
        "model_answer": model_answer,
        "correct": whether_correct
    }
# ‰øùÂ≠òÁªìÊûú
print(f"Total records: {len(records)}")
print(f"Correct records: {sum(1 for r in records.values() if r['correct'])}")
print(f"Accuracy: {sum(1 for r in records.values() if r['correct']) / len(records):.2%}")
import json
with open("whats_up.json", "w") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)
    


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


  0%|          | 0/820 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Total records: 820
Correct records: 779
Accuracy: 95.00%


## CV-Bench

In [18]:
from datasets import load_dataset
from tqdm.auto import tqdm
# Load a specific split 
cv_bench = load_dataset("LLDDSS/Awesome_Spatial_VLMs", split="cv_bench")
from transformers import pipeline
import json
from PIL import Image
from torchvision.transforms.functional import to_pil_image
from PIL import Image
from matplotlib import pyplot as plt
cv_bench


README.md: 0.00B [00:00, ?B/s]

cv_bench-00000-of-00001.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

Generating seed_bench_spatial split:   0%|          | 0/1635 [00:00<?, ? examples/s]

Generating EgoOrientBench split:   0%|          | 0/33460 [00:00<?, ? examples/s]

Generating GeoMeter split:   0%|          | 0/25557 [00:00<?, ? examples/s]

Generating mm_vet split:   0%|          | 0/75 [00:00<?, ? examples/s]

Generating whats_up split:   0%|          | 0/820 [00:00<?, ? examples/s]

Generating cv_bench split:   0%|          | 0/2638 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'image', 'question', 'options', 'GT'],
    num_rows: 2638
})

In [26]:
example["choices"]

['motorcycle', 'bus']

In [27]:
from datasets import Dataset
from tqdm import tqdm


cv_bench = load_dataset("nyu-visionx/CV-Bench")

# ÂÅáËÆæÂéüÂßãÊï∞ÊçÆÈõÜ‰∏∫ raw_dataset
raw_dataset = cv_bench["test"]  # DatasetDict({'test': Dataset(...)})
new_data = []
mapping = {"A":0, "B":1, "C":2, "D":3, "E":4, "F":5, "G":6, "H":7, "I":8, "J":9}

for example in tqdm(raw_dataset):
    answer = example["answer"][1]
    choices = example["choices"]
    GT = choices[mapping[answer]]  # Ëé∑ÂèñÊ≠£Á°ÆÁ≠îÊ°à

    try:
        new_data.append({
            "id": str(example["idx"]),
            "image": example["image"],
            "question": example["prompt"],
            "options": '; '.join(example["choices"]),
            "GT": example["answer"][1]
        })
    except Exception as e:
        print(f"‚ùå Error at idx {example['idx']}: {e}")
        continue

new_dataset = Dataset.from_list(new_data)
print(new_dataset)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2638/2638 [00:08<00:00, 315.57it/s]


Dataset({
    features: ['id', 'image', 'question', 'options', 'GT'],
    num_rows: 2638
})


In [23]:
example["prompt"]

'Estimate the real-world distances between objects in this image. Which object is closer to the traffic cone (highlighted by a red box), the motorcycle (highlighted by a blue box) or the bus (highlighted by a green box)?\n(A) motorcycle\n(B) bus'

In [29]:
# upload to LLDDSS/Awesome_Spatial_VLMs
new_dataset.push_to_hub("LLDDSS/Awesome_Spatial_VLMs", split="cv_bench")   

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/2638 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/LLDDSS/Awesome_Spatial_VLMs/commit/33956dcb9c1ea2ab58849fd8e9341e97ea47cb37', commit_message='Upload dataset', commit_description='', oid='33956dcb9c1ea2ab58849fd8e9341e97ea47cb37', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LLDDSS/Awesome_Spatial_VLMs', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LLDDSS/Awesome_Spatial_VLMs'), pr_revision=None, pr_num=None)

In [30]:

dataset = load_dataset("LLDDSS/Awesome_Spatial_VLMs", split="cv_bench")
dataset = dataset.with_format("torch", columns=['id', 'image', 'question', 'options', 'GT'])
from torch.utils.data import DataLoader

data_loader = DataLoader(dataset,)
records = {}
pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm")

def check_correctness(model_answer, GT):
    if "<AND>" in GT:
        # Â§ÑÁêÜ AND ÈÄªËæë
        gt_answers = GT.split("<AND>")
        return all(answer.strip().lower() in model_answer.lower() for answer in gt_answers)
    elif "<OR>" in GT:
        # Â§ÑÁêÜ OR ÈÄªËæë
        gt_answers = GT.split("<OR>")
        return any(answer.strip().lower() in model_answer.lower() for answer in gt_answers)
    return GT.strip().lower() in model_answer.lower()

for batch in tqdm(data_loader):
    id = batch['id'][0]
    img = batch['image']  # shape: (B, C, H, W)

    img_pil = to_pil_image(img[0])  # shape: [4, H, W]

    question = batch['question'][0]
    full_prompt = f"Question: {question}.\n Please only retuen a correct option without analysis."
    GT = batch['GT'][0]
    # print()
    # print(full_prompt)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img_pil},
                {"type": "text", "text": full_prompt}
            ]
        },
    ]
    try:
        response = pipe(text=messages)
    except Exception as e:
        print(f"Error processing {id}: {e}")
        raise e
    model_answer = response[0]["generated_text"][-1]["content"]
    whether_correct = check_correctness(model_answer, GT)
    # record all of information for later analysis
    records[id] = {
        "question": full_prompt,
        "GT": GT,
        "model_answer": model_answer,
        "correct": whether_correct
    }
# ‰øùÂ≠òÁªìÊûú
print(f"Total records: {len(records)}")
print(f"Correct records: {sum(1 for r in records.values() if r['correct'])}")
print(f"Accuracy: {sum(1 for r in records.values() if r['correct']) / len(records):.2%}")
import json
with open("cv_bench.json", "w") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

README.md: 0.00B [00:00, ?B/s]

cv_bench-00000-of-00001.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

Generating seed_bench_spatial split:   0%|          | 0/1635 [00:00<?, ? examples/s]

Generating EgoOrientBench split:   0%|          | 0/33460 [00:00<?, ? examples/s]

Generating GeoMeter split:   0%|          | 0/25557 [00:00<?, ? examples/s]

Generating mm_vet split:   0%|          | 0/75 [00:00<?, ? examples/s]

Generating whats_up split:   0%|          | 0/820 [00:00<?, ? examples/s]

Generating cv_bench split:   0%|          | 0/2638 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2638/2638 [22:37<00:00,  1.94it/s]

Total records: 2638
Correct records: 2004
Accuracy: 75.97%





## srbench

In [32]:
from datasets import Dataset

dataset = load_dataset("stogian/srbench", split="train")

README.md:   0%|          | 0.00/390 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1800 [00:00<?, ? examples/s]

In [63]:
batch['image'].shape

torch.Size([1, 3, 900, 1600])

In [66]:
batch

{'id': ['2637'],
 'image': tensor([[[[ 55,  56,  56,  ...,  95,  95,  95],
           [ 49,  51,  51,  ...,  85,  85,  85],
           [ 47,  48,  49,  ...,  82,  82,  83],
           ...,
           [ 43,  89,  90,  ...,  94,  94,  94],
           [ 43,  89,  90,  ...,  94,  94,  94],
           [ 43,  89,  91,  ...,  94,  94,  94]],
 
          [[ 60,  61,  61,  ..., 101, 101, 101],
           [ 54,  56,  56,  ...,  91,  90,  91],
           [ 52,  53,  54,  ...,  87,  87,  88],
           ...,
           [ 40,  86,  87,  ...,  98,  98,  98],
           [ 40,  86,  87,  ...,  98,  98,  98],
           [ 40,  86,  88,  ...,  98,  98,  98]],
 
          [[ 64,  65,  65,  ..., 101, 101, 101],
           [ 58,  60,  60,  ...,  91,  93,  91],
           [ 56,  57,  58,  ...,  90,  91,  91],
           ...,
           [ 35,  79,  80,  ...,  97,  97,  97],
           [ 35,  79,  80,  ...,  97,  97,  97],
           [ 35,  79,  81,  ...,  97,  97,  97]]]], dtype=torch.uint8),
 'question': ['

In [70]:
dataset

Dataset({
    features: ['question', 'image', 'answer', 'split'],
    num_rows: 1800
})

In [75]:
batch

{'question': 'This image shows a 3D polycube shape. Which of the options is simply the original shape in a rotated orientation?\nOnly one of the options is correct.\nAvailable options: A. Left, B. Center, C. Right',
 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=286x257>,
 'answer': 'B',
 'split': 'mrt_easy'}

In [77]:
dataset = load_dataset("stogian/srbench", split="train")
from torch.utils.data import DataLoader

data_loader = DataLoader(dataset)
records = {}
pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm")

def process_pil_image(pil_img, max_width=512):
    # ËΩ¨Êç¢‰∏∫ RGB Ê®°ÂºèÔºàÂõ†‰∏∫ SpaceOm Ê®°Âûã‰∏çÊîØÊåÅ RGBAÔºâ
    img = pil_img.convert("RGB")
    # ÈôêÂà∂ÂÆΩÂ∫¶Ôºå‰øùÊåÅÁ∫µÊ®™ÊØîÔºàÂèØÈÄâÔºâ
    if img.width > max_width:
        ratio = img.height / img.width
        img = img.resize((max_width, int(max_width * ratio)), Image.Resampling.LANCZOS)
    return img

def check_correctness(model_answer, GT):
    if "<AND>" in GT:
        # Â§ÑÁêÜ AND ÈÄªËæë
        gt_answers = GT.split("<AND>")
        return all(answer.strip().lower() in model_answer.lower() for answer in gt_answers)
    elif "<OR>" in GT:
        # Â§ÑÁêÜ OR ÈÄªËæë
        gt_answers = GT.split("<OR>")
        return any(answer.strip().lower() in model_answer.lower() for answer in gt_answers)
    return GT.strip().lower() in model_answer.lower()
id = 0
for batch in tqdm(dataset):
    img = batch['image']  # shape: (B, C, H, W)
    img_pil = process_pil_image(img)  # shape: [4, H, W]

    question = batch['question']
    full_prompt = f"Question: {question}.\n Please only retuen a correct option without analysis."
    GT = batch['answer']

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img_pil},
                {"type": "text", "text": full_prompt}
            ]
        },
    ]
    try:
        response = pipe(text=messages)
    except Exception as e:
        print(f"Error processing {id}: {e}")
        raise e
    model_answer = response[0]["generated_text"][-1]["content"]
    whether_correct = check_correctness(model_answer, GT)
    # record all of information for later analysis
    records[id] = {
        "question": full_prompt,
        "GT": GT,
        "model_answer": model_answer,
        "correct": whether_correct
    }
    id += 1
print(f"Total records: {len(records)}")
print(f"Correct records: {sum(1 for r in records.values() if r['correct'])}")
print(f"Accuracy: {sum(1 for r in records.values() if r['correct']) / len(records):.2%}")
import json
with open("srbench.json", "w") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1800/1800 [10:20<00:00,  2.90it/s]

Total records: 1800
Correct records: 947
Accuracy: 52.61%



