In [None]:
#ONLY FOR GEMINI 2.5
import json
import re
from datasets import load_dataset
import numpy as np  

def read_jsonl(path):
    """Load a JSON Lines file into a list of decoded records.

    The file is read as UTF-8. Lines that are empty or contain only
    whitespace are skipped; every other line is parsed with ``json.loads``.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]

# # (Disabled alternative) Precompile the regexes to speed up matching
# boxed_pattern = re.compile(r"\$\s*boxed\{([^}]*)\}")
# letter_pattern = re.compile(r"\b([A-D])\b")

# def clean_answer(ans: str) -> str:
#     m = boxed_pattern.search(ans)
#     if m:
#         return m.group(1).strip()
#     m = letter_pattern.search(ans)
#     if m:
#         return m.group(1).strip()
#     return ans.strip()

def clean_answer(ans: str) -> str:
    """Return the first line of ``ans`` with surrounding whitespace removed.

    Everything from the first newline character onwards is discarded.
    """
    head, _, _ = ans.partition("\n")
    return head.strip()

# ===== Main logic =====
# Score the Gemini 2.5 predictions against the benchmark ground truth (GT).
data = read_jsonl("/home/tuo/Codes/survey_eval/Commercial/gemini_2_5_pro/prediction-model-2025-10-09T20:32:36.788515Z/predictions.jsonl")
print(len(data), "records loaded")

dataset = load_dataset("LLDDSS/Awesome_Spatial_VQA_Benchmarks")
bench = "GeoMeter_Real"

# Build the id -> GT lookup once so no inner loop over the dataset is needed.
id_to_gt = {str(item["id"]): item["GT"] for item in dataset[bench]}

count = 0
for item in data:
    pred = clean_answer(item["response"]["candidates"][0]["content"]["parts"][0]["text"])
    gt = id_to_gt.get(str(item["id"]))

    print(f"{item['id']}: {pred}")

    # A prediction whose id is absent from the benchmark cannot be scored;
    # report it instead of crashing on `None in pred`.
    if gt is None:
        print(f"Missing GT for id {item['id']}")
        continue

    pred = pred.strip()
    gt = gt.strip()

    # An empty string is a substring of every string, so an empty prediction
    # (or empty GT) must never be counted as a match.
    if not pred or not gt:
        continue

    # Lenient match: either string contains the other.
    if gt in pred or pred in gt:
        count += 1
        print(pred, gt)

total = len(data)
if total:
    print(f"Accuracy: {count}/{total} = {count/total:.4f}")
else:
    # Avoid ZeroDivisionError when the predictions file has no records.
    print("Accuracy: 0/0 = n/a (no records loaded)")


1533 records loaded


In [None]:
#ONLY FOR GEMINI 2.5 FLASH
import json
import re
from datasets import load_dataset

def read_jsonl(path):
    """Load a JSON Lines file into a list of decoded records.

    The file is read as UTF-8. Lines that are empty or contain only
    whitespace are skipped; every other line is parsed with ``json.loads``.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]

# Precompile the regexes once to speed up matching.
# The boxed pattern accepts the bare "boxed{...}" / "$boxed{...}" form as well
# as LaTeX "\boxed{...}" (with or without surrounding "$"); the previous
# pattern required a literal "$boxed{" and so never matched "$\boxed{...}$".
boxed_pattern = re.compile(r"\\?boxed\{([^}]*)\}")
# A stand-alone capital option letter A-D.
_letter_pattern = re.compile(r"\b([A-D])\b")

def clean_answer(ans: str) -> str:
    """Extract the final answer from a raw model response.

    Resolution order:
      1. The content of the first ``boxed{...}`` / ``\\boxed{...}`` group.
      2. Otherwise, the last stand-alone capital letter in ``A``-``D``.
      3. Otherwise, the whole response with surrounding whitespace removed.

    Args:
        ans: Raw response text.

    Returns:
        The extracted answer string.
    """
    boxed = boxed_pattern.search(ans)
    if boxed:
        return boxed.group(1).strip()

    # Take the last option letter mentioned in the text.
    letters = _letter_pattern.findall(ans)
    if letters:
        return letters[-1]

    return ans.strip()

# ===== Main logic =====
# Score the Gemini 2.5 Flash predictions against the benchmark ground truth (GT).
data = read_jsonl("/home/tuo/Codes/survey_eval/Commercial/gemini_2_5_flash_results/prediction-model-2025-10-09T18:21:11.855520Z/predictions.jsonl")
print(len(data), "records loaded")

dataset = load_dataset("LLDDSS/Awesome_Spatial_VQA_Benchmarks")
bench = "SRBench"

# Build the id -> GT lookup once so no inner loop over the dataset is needed.
id_to_gt = {str(item["id"]): item["GT"] for item in dataset[bench]}

count = 0
for item in data:
    raw_text = item["response"]["candidates"][0]["content"]["parts"][0]["text"]
    pred = clean_answer(raw_text)
    gt = id_to_gt.get(str(item["id"]))

    print("Predicted:")
    print(f"{item['id']}: {pred}")
    print("Raw response:\n")
    print(raw_text)
    print("GT:\n")
    print(gt)

    # A prediction whose id is absent from the benchmark cannot be scored;
    # report it instead of crashing on `None.strip()`.
    if gt is None:
        print(f"Missing GT for id {item['id']}")
        continue

    pred_text = pred.strip()
    gt_text = gt.strip()

    # An empty string is a substring of every string, so an empty prediction
    # (or empty GT) must never be counted as a match.
    if not pred_text or not gt_text:
        continue

    # Lenient match: either string contains the other.
    if pred_text in gt_text or gt_text in pred_text:
        count += 1
        print(pred, gt, flush=True)

total = len(data)
if total:
    print(f"Accuracy: {count}/{total} = {count/total:.4f}")
else:
    # Avoid ZeroDivisionError when the predictions file has no records.
    print("Accuracy: 0/0 = n/a (no records loaded)")


1800 records loaded
Predicted:
595: A
Raw response:

The problem describes a square piece of paper that is folded and then punched with holes. We need to determine the pattern of holes when the paper is unfolded.

**Step 1: Understand the folding process.**
*   **Unfolded:** We start with a square piece of paper.
*   **Fold 1:** The square is folded in half horizontally. The top half is folded downwards onto the bottom half. This means the fold line is the horizontal midline of the original square. After this fold, the paper becomes a rectangle with double thickness. The "Final view" shows this rectangle. The rectangle corresponds to the bottom half of the original square, but now it's two layers thick (the original bottom half and the original top half folded over it).

**Step 2: Analyze the holes in the "Final view".**
The "Final view" shows the folded rectangle with three holes punched through it. Since the paper is double-layered at this point, each punch will create two holes in t

In [None]:
# ONLY FOR GPT-4o/5
import json
import re
from datasets import load_dataset
import numpy as np  

def read_jsonl(path):
    """Load a JSON Lines file into a list of decoded records.

    The file is read as UTF-8. Lines that are empty or contain only
    whitespace are skipped; every other line is parsed with ``json.loads``.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]

def clean_answer(ans: str) -> str:
    """Extract the answer text from a model response.

    If the response contains an ``<answer>`` tag, the text between the first
    ``<answer>`` and the next ``</answer>`` is returned. When the closing tag
    is missing (e.g. a truncated response), everything after ``<answer>`` is
    returned instead of ``None``, so the function always returns a string.
    Responses without an ``<answer>`` tag are returned whole.

    Args:
        ans: Raw response text.

    Returns:
        The extracted answer with surrounding whitespace removed.
    """
    _, open_tag, tail = ans.partition("<answer>")
    if not open_tag:
        # No <answer> tag at all: the whole response is the answer.
        return ans.strip()

    # With no closing tag, partition leaves the entire tail in `inner`.
    inner, _, _ = tail.partition("</answer>")
    return inner.strip()

# Score the GPT predictions against the benchmark ground truth (GT).
dataset = load_dataset("LLDDSS/Awesome_Spatial_VQA_Benchmarks")
bench = "EgoOrientBench"

data = read_jsonl("/home/tuo/Codes/survey_eval/Commercial/gpt5_results/EgoOrientBench_result.jsonl")
# print(data[3]['response']["body"]["choices"][0]["message"]["content"])

count = 0
# Match predictions to ground truth by id.
id_to_gt = {str(item["id"]): item["GT"] for item in dataset[bench]}

for item in data:
    content = item['response']["body"]["choices"][0]["message"]["content"]
    # Non-string content cannot be parsed; treat it as a missing prediction.
    pred = clean_answer(content) if isinstance(content, str) else None

    # Keep the part after the last "-" (presumably custom_id is "request-<id>").
    # Named sample_id so the builtin `id` is not shadowed in the kernel.
    sample_id = item["custom_id"].split("-")[-1]

    gt = id_to_gt.get(str(sample_id))
    # print(f"{sample_id}: {pred}")

    if pred is None or gt is None:
        print(f"Missing prediction or GT for id {sample_id}")
        continue

    pred = pred.strip()
    pred_lower = pred.lower()
    gt_lower = gt.strip().lower()

    # An empty string is a substring of every string, so an empty prediction
    # (or empty GT) must never be counted as a match.
    if not pred_lower or not gt_lower:
        continue

    # NOTE(review): case-insensitive substring matching is very lenient. A bare
    # option letter such as "C" is contained in a free-text GT like
    # "facing the camera" and is therefore counted as correct. Confirm this is
    # intended, or map option letters to their option text before comparing.
    if gt_lower in pred_lower or pred_lower in gt_lower:
        count += 1
        print(pred, gt)

# Skipped (missing) items stay in the denominator, as before.
total = len(data)
if total:
    print(f"Accuracy: {count}/{total} = {count/total:.4f}")
else:
    # Avoid ZeroDivisionError when the results file has no records.
    print("Accuracy: 0/0 = n/a (no records loaded)")


yes yes
no no
A.front front
C facing the camera
yes yes
no no
A facing right
A facing the camera
yes yes
no no
A facing left
<yes></yes> yes
no no
yes yes
no no
no no
H front right
A facing right while facing the camera
no no
C facing the camera
yes yes
no no
A facing left
yes yes
no no
A facing the camera
no no
A facing the camera
yes yes
no no
C facing the camera
yes yes
no no
H.front left front left
no no
yes yes
F.back left back left
A toward left while facing away the camera
yes yes
no no
A.front front
A facing the camera
yes yes
yes yes
C facing the camera
yes yes
no no
yes yes
no no
G.left left
A facing left
yes yes
A facing left
yes yes
no no
D toward right while facing away the camera
yes yes
yes yes
no no
F back left
yes yes
no no
A facing right while facing the camera
H.front left front
yes yes
A facing right while facing the camera
no no
A facing left
yes yes
no no
A facing left
yes yes
no no
H.front left front
A facing the camera
yes yes
no no
A facing left
yes yes
yes yes

In [None]:
# ONLY for General models
import json
import os
import re

# Calculate the accuracy of every JSON result file in one folder.
# Result folder
result_folder = 'output/llava_1_5'

# os.listdir returns entries in arbitrary order; sort them so the per-file
# report is printed in the same order on every run.
files = sorted(os.listdir(result_folder))

def extract_first_option(text):
    """Extract the first option letter in ``A``-``I`` from a model response.

    A stand-alone letter (one not embedded in a longer word) is preferred, so
    that "Answer: B" yields "B" rather than the "A" of "Answer". If there is
    no stand-alone letter, the first uppercase ``A``-``I`` character anywhere
    in the text is used. If there is none at all, or ``text`` is empty or
    ``None``, ``text`` is returned unchanged.

    Args:
        text: Raw model output.

    Returns:
        A single option letter, or the original ``text`` if none is found.
    """
    if not text:
        return text

    # Some model output formats need extra patterns to match,
    # e.g. <answer>A</answer> or [INST]. Adjust the patterns below as needed.
    standalone = re.search(r'\b[A-I]\b', text)
    if standalone:
        return standalone.group(0)

    # Fallback: first uppercase A-I character anywhere in the text.
    embedded = re.search(r'[A-I]', text)
    if embedded:
        return embedded.group(0)
    return text

# Score every *.json result file in the folder and print its accuracy.
for file in files:
    if not file.endswith('.json'):
        continue
    file_path = os.path.join(result_folder, file)
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    count = 0
    for d in data:
        gt = d['GT'].strip()
        # Extract first option from result
        extracted_answer = extract_first_option(d['result'])

        # A missing/empty result or empty GT cannot be scored. The empty
        # string is a substring of every string, so without this guard such
        # records would be counted as correct (or raise on a None result).
        if not gt or not isinstance(extracted_answer, str) or not extracted_answer:
            continue

        # Lenient match: either string contains the other.
        if gt in extracted_answer or extracted_answer in gt:
            count += 1

    print(file)
    print("Total:", len(data))
    print("Correct:", count)
    # Avoid ZeroDivisionError for a file with no records.
    print("Accuracy:", count/len(data) if len(data) else "n/a")