In [1]:
import json
import os
from pathlib import Path

# Define paths
batch_query_jsonl_file = "/mnt/fast10/brandon/mmr_rollout_data/raw_rollouts/soft_estimation/AI2D/verification/verification_pipeline_outputs/o4-mini/AI2D/verification_batches/batch_0001.jsonl"
batch_verification_result_json_file = "/mnt/fast10/brandon/mmr_rollout_data/raw_rollouts/soft_estimation/AI2D/verification/verification_pipeline_outputs/o4-mini/AI2D/batch_0001_verification_results.json"
output_dir = "/mnt/fast10/brandon/mmr_rollout_data/merged_verification_files"

# Extract dataset name from path: the third component from the end, i.e.
# ".../AI2D/verification_batches/batch_0001.jsonl" -> "AI2D".
dataset_name = batch_query_jsonl_file.split('/')[-3]

# Create output directory (no-op when it already exists)
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Load the JSONL batch queries into a dict keyed by custom_id.
batch_query_data = {}
duplicate_query_ids = []
with open(batch_query_jsonl_file, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of crashing in json.loads.
            continue
        item = json.loads(line)
        if item['custom_id'] in batch_query_data:
            duplicate_query_ids.append(item['custom_id'])
        # On a duplicate custom_id the last occurrence wins.
        batch_query_data[item['custom_id']] = item

# Surface duplicates: the merge is keyed by custom_id, so an overwritten
# query would otherwise disappear without any trace.
if duplicate_query_ids:
    print(f"Warning: {len(duplicate_query_ids)} duplicate custom_ids in JSONL (last occurrence kept): {duplicate_query_ids[:10]}")

In [2]:
# Load the verification results: a list of records, each carrying a custom_id.
with open(batch_verification_result_json_file, 'r', encoding='utf-8') as f:
    batch_verification_data = json.load(f)

# Check that the custom_ids of the queries (JSONL) and the results (JSON) agree.
json_ids = {item['custom_id'] for item in batch_verification_data}
jsonl_ids = set(batch_query_data.keys())

missing_in_json = jsonl_ids - json_ids
missing_in_jsonl = json_ids - jsonl_ids
if missing_in_json:
    print(f"custom_ids in JSONL not found in JSON: {missing_in_json}")
if missing_in_jsonl:
    print(f"custom_ids in JSON not found in JSONL: {missing_in_jsonl}")
if not missing_in_json and not missing_in_jsonl:
    print("All custom_ids and custom_ids match")

# A result with no matching query cannot be merged. Fail here, before the
# output file is opened, rather than with a KeyError halfway through the
# write loop (which would leave a truncated merged file on disk).
if missing_in_jsonl:
    raise ValueError(
        f"{len(missing_in_jsonl)} verification results have no matching query in the JSONL; refusing to write a partial merge."
    )

# Merge and save: one JSON object per line. On a key clash the verification
# result's value overrides the query's value (later unpacking wins).
output_path = os.path.join(output_dir, f"{dataset_name}_verification_merged.jsonl")
with open(output_path, 'w', encoding='utf-8') as f:
    for item in batch_verification_data:
        merged = {**batch_query_data[item['custom_id']], **item}
        f.write(json.dumps(merged) + '\n')

print(f"Merged {len(batch_verification_data)} items to {output_path}")

All custom_ids and custom_ids match
Merged 746 items to /mnt/fast10/brandon/mmr_rollout_data/merged_verification_files/AI2D_verification_merged.jsonl


In [None]:
# Sanity check: pretty-print the first record of the merged file.
with open(output_path, 'r') as merged_fh:
    head = merged_fh.readline().strip()

if not head:
    print("File is empty")
else:
    # Parse before printing the header so a malformed line fails cleanly.
    head_record = json.loads(head)
    print("First line of merged file:")
    print(json.dumps(head_record, indent=2))

# Now extract the Solution as the unique key to match with rollouts

In [4]:
import re

merged_verification_file = "/mnt/fast10/brandon/mmr_rollout_data/merged_verification_files/AI2D_verification_merged.jsonl"

# Non-greedy match of everything between <solution> tags; DOTALL lets the
# body span multiple lines.
solution_pattern = re.compile(r'<solution>(.*?)</solution>', re.DOTALL)

# Extract solutions from merged file. Each entry pairs a custom_id with the
# stripped text of the SECOND <solution> block in the prompt.
# NOTE(review): presumably the first <solution> block is an example inside
# the prompt template -- confirm against the prompt builder.
solutions = []

# Read the file named in this cell rather than `output_path` left over from
# the merge cell, so the cell does not depend on that earlier kernel state.
with open(merged_verification_file, 'r') as f:
    for line_num, line in enumerate(f, 1):
        item = json.loads(line)
        try:
            text = item["body"]["messages"][0]["content"][0]["text"]
            # Find all matches and get the second one
            matches = solution_pattern.findall(text)
            if len(matches) >= 2:
                solution_text = matches[1].strip()  # Get second occurrence
                if solution_text:  # Only add non-empty solutions
                    solutions.append({
                        "custom_id": item.get("custom_id", "ERROR: custom_id not found"),
                        "unique_key": solution_text
                    })
                else:
                    # Previously dropped silently; report it so the extracted
                    # count can be reconciled with the number of input lines.
                    print(f"Warning: Empty second <solution> tag in line {line_num}")
            elif len(matches) == 1:
                print(f"Warning: Only one <solution> tag found in line {line_num}")
            else:
                print(f"Warning: No <solution> tags found in line {line_num}")
        except (KeyError, IndexError, TypeError) as e:
            # The record does not have the expected body/messages/content shape.
            print(f"Error accessing text in line {line_num}: {e}")

print(f"Extracted {len(solutions)} valid solutions")

Extracted 746 valid solutions


In [5]:
# Read the flattened rollout file into memory once, keeping only the two
# fields used later: the response text and the rollout uid.
full_raw_rollout_data_file = "/mnt/fast10/brandon/mmr_rollout_data/flattened_rollout_files/AI2D_flattened.jsonl"
full_raw_rollout_data_array = []

with open(full_raw_rollout_data_file, 'r') as rollout_fh:
    parsed_records = map(json.loads, rollout_fh)
    full_raw_rollout_data_array.extend(
        {"response": record.get("response", ""), "uid": record.get("uid")}
        for record in parsed_records
    )

rollout_count = len(full_raw_rollout_data_array)
print(f"Loaded {rollout_count} items from flattened file")

Loaded 25557 items from flattened file


In [6]:
# Inspect the first loaded rollout record (a dict with "response" and "uid" keys).
full_raw_rollout_data_array[0]

{'response': '[Visual Elements]\n<step_1>\nIdentify all organisms in the food web: short-eared owl, vole, meadow pippit, emperor moth larvae, red grouse, heather, fox, brown hare, red kite or hen harrier.\n</step_1>\n<step_2>\nNote the arrows indicating feeding relationships: arrows point from food to consumer.\n</step_2>\n<step_3>\nObserve that the fox eats: red grouse, brown hare, and vole (arrows pointing from each of these to fox).\n</step_3>\n<step_4>\nDetermine what eats fox: no arrows point to fox, indicating it is a top predator.\n</step_4>\n<step_5>\nDetermine what else eats red grouse: Arrow from red grouse to fox, and red kite or hen harrier. So red grouse is eaten by fox and red kite/hen harrier.\n</step_5>\n<step_6>\nDetermine who eats meadow pippit: Arrow from meadow pippit to fox only.\n</step_6>\n<step_7>\nNote which options are present: more grouse, more pippit, less grouse, less owl.\n</step_7>\n\n[Reasoning]\n<step_1>\nThe question asks what would happen if the fox p

In [7]:
# Show the first extracted solution record ("custom_id" plus its "unique_key" text).
print(solutions[0])

{'custom_id': 'd8086fbb-f598-450c-8e53-15074142e4ab', 'unique_key': '[Visual Elements]\n<step_1>\nThe image shows four different flies, each labeled with a blue box. The labels are A, B, C, and D, positioned above each respective fly.\n</step_1>\n<step_2>\nFly A is on the far left. Its body is robust and the wings are broad. The thorax and abdomen are relatively compact compared to other examples.\n</step_2>\n<step_3>\nFly B is second from the right, smaller and slender compared to A.\n</step_3>\n<step_4>\nFly C appears to have the largest and most robust body, with a pronounced head and broader thorax.\n</step_4>\n<step_5>\nFly D is second from the left, slightly larger than A but not as large as C.\n</step_5>\n<step_6>\nNo direct text is visible associating each fly with its species in the image; identification must be based on morphology.\n</step_6>\n\n[Reasoning]\n<step_1>\nThe question asks to identify the type of fly depicted at letter A, with the options: Stable, Horn, Face, Hou

In [11]:
# Match every extracted solution to rollouts by exact response text and
# validate that each solution maps to exactly one rollout.

# Index rollouts by response text once, instead of rescanning the whole
# rollout list for every solution.
# The solution keys were whitespace-stripped when they were extracted, so the
# responses are stripped the same way here to compare like with like.
# NOTE(review): this may or may not explain previously unmatched solutions --
# confirm on the data.
rollouts_by_response = {}
for rollout in full_raw_rollout_data_array:
    response = rollout["response"]
    lookup_key = response.strip() if isinstance(response, str) else response
    rollouts_by_response.setdefault(lookup_key, []).append(rollout)

collision_errors = []
no_matches_array = []
for sol in solutions:
    unique_key = sol["unique_key"]

    # All rollouts whose response equals this solution (in file order)
    matches = rollouts_by_response.get(unique_key, [])

    if len(matches) > 1:
        collision_errors.append({
            "solution_unique_key": unique_key,
            "solution_custom_id": sol["custom_id"],
            "matches": matches
        })
    elif len(matches) == 0:
        no_matches_array.append({
            "solution_unique_key": unique_key,
            "solution_custom_id": sol["custom_id"],
        })

# Report collision errors
if collision_errors:
    print(f"\n🚨 COLLISION ERRORS FOUND: {len(collision_errors)} unique_keys have multiple matches!")
    for error in collision_errors:
        # The entries carry 'solution_custom_id'; the previous lookup of
        # 'rollout_uid' raised KeyError as soon as a collision was reported.
        print(f"\nCollision for solution_custom_id: {error['solution_custom_id']}")
        print(f"solution_unique_key: {error['solution_unique_key'][:100]}...")
        print(f"Found {len(error['matches'])} matches:")
        for match in error['matches']:
            print(f"  - uid: {match['uid']}, response: {match['response'][:50]}...")
else:
    print(f"\n✅ No collisions found! All {len(solutions)} solutions have at most one match.")

# Report solutions without any matching rollout
if no_matches_array:
    print(f"\n🚨 NO MATCHES FOUND: {len(no_matches_array)} unique_keys have no matches!")
    for error in no_matches_array:
        print(f"\nNo match found for solution_custom_id: {error['solution_custom_id']}")
        print(f"solution_unique_key: {error['solution_unique_key'][:100]}...")
else:
    print(f"\n✅ No no matches found! All {len(solutions)} solutions have at least one match.")

# Fail only after BOTH reports have been printed, so a collision error does
# not hide the no-match details (and vice versa).
if collision_errors:
    raise ValueError(f"{len(collision_errors)} collision errors found. See details above.")
if no_matches_array:
    raise ValueError(f"{len(no_matches_array)} no matches found. See details above.")


✅ No collisions found! All 746 solutions have at most one match.

🚨 NO MATCHES FOUND: 53 unique_keys have no matches!

No match found for solution_custom_id: 488b5663-da97-4ce5-bf92-a5269a4f70de
solution_unique_key: [Visual Elements]
<step_1>
The diagram illustrates a food chain (likely a food web) using rectangula...

No match found for solution_custom_id: fa6e48b6-5edf-4579-a3f0-70b64e9d8016
solution_unique_key: [Visual Elements]
<step_1>
Identify all the species in the diagram: Vegetation, Lizard, Rodents, Wal...

No match found for solution_custom_id: 198d7fa8-8a56-47b3-9694-d60da59750dd
solution_unique_key: [Visual Elements]
<step_1>
Identify the box labeled "Snakes" in the diagram as the species for which...

No match found for solution_custom_id: c2cf9066-5b01-4bfd-85c8-e85fe1c45492
solution_unique_key: [Visual Elements]
<step_1>
There are five labeled diagrams of insects in the image, marked as A, B, ...

No match found for solution_custom_id: 9ad2673d-e7b8-47e2-ac1c-d922fa016

ValueError: 53 no matches found. See details above.