# Merge Verification Query batches with Verification Results

In [22]:
import json
import os
from pathlib import Path
import re

# For each verifier model: merge the flattened verification queries with the
# verification results (joined on custom_id), write the merged JSONL, then parse
# each <conclusion> tag into a "<model>_isVerified" flag and write the processed JSONL.
models = ["gpt-4.1-mini", "gpt-4.1-nano", "o4-mini"]
dataset_name = "AI2D"
output_dir = "/mnt/fast10/brandon/mmr_rollout_data/merged_verification_files"

Path(output_dir).mkdir(parents=True, exist_ok=True)

# Pattern to extract text between <conclusion> tags.
# Compiled once because it is identical for every model.
conclusion_pattern = re.compile(r'<conclusion>(.*?)</conclusion>', re.DOTALL)

for model in models:
    batch_query_jsonl_file = f"/mnt/fast10/brandon/mmr_rollout_data/flattened_verification_query_files/{dataset_name}_{model}_verification_flattened.jsonl"
    batch_verification_result_json_file = f"/mnt/fast10/brandon/mmr_rollout_data/flattened_verification_result_files/{dataset_name}_{model}_verification_flattened.json"

    # Load the JSONL queries into a dict keyed by custom_id
    batch_query_data = {}
    with open(batch_query_jsonl_file, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of crashing in json.loads
            item = json.loads(line)
            batch_query_data[item['custom_id']] = item

    # Load the JSON results (iterated below as a list of dicts that each carry a custom_id)
    with open(batch_verification_result_json_file, 'r') as f:
        batch_verification_data = json.load(f)

    # Report any custom_ids that appear on only one side
    json_ids = {item['custom_id'] for item in batch_verification_data}
    jsonl_ids = set(batch_query_data.keys())

    if jsonl_ids != json_ids:
        missing_in_json = jsonl_ids - json_ids
        missing_in_jsonl = json_ids - jsonl_ids
        if missing_in_json:
            print(f"custom_ids in JSONL not found in JSON: {missing_in_json}")
        if missing_in_jsonl:
            print(f"custom_ids in JSON not found in JSONL: {missing_in_jsonl}")
    else:
        print("All custom_ids and custom_ids match")

    # Merge: result fields override query fields on key clashes.
    # Results without a matching query are skipped (they were reported above)
    # rather than raising KeyError halfway through writing the output file.
    merged_items = []
    skipped_result_count = 0
    for item in batch_verification_data:
        query_item = batch_query_data.get(item['custom_id'])
        if query_item is None:
            skipped_result_count += 1
            continue
        merged_items.append({**query_item, **item})

    output_path = os.path.join(output_dir, f"{dataset_name}_{model}_verification_merged.jsonl")
    with open(output_path, 'w') as f:
        for merged in merged_items:
            f.write(json.dumps(merged) + '\n')

    print(f"Merged {len(merged_items)} items to {output_path}")
    if skipped_result_count:
        print(f"Skipped {skipped_result_count} results with no matching query record")

    model_name = model
    verified_key = f"{model_name}_isVerified"

    # Classify each merged row. The rows are processed in memory (same content and
    # order as the file just written), so line_num still refers to that file's lines.
    processed_data = []
    correct_count = 0
    incorrect_count = 0
    invalid_count = 0

    for line_num, item in enumerate(merged_items, 1):
        # Extract conclusion text
        verification_response = item.get("verification_response", "")
        conclusion_match = conclusion_pattern.search(verification_response)

        if conclusion_match:
            conclusion_text = conclusion_match.group(1).strip()

            # Only the exact strings "Correct" and "Incorrect" are accepted
            if conclusion_text == "Correct":
                item[verified_key] = True
                correct_count += 1
            elif conclusion_text == "Incorrect":
                item[verified_key] = False
                incorrect_count += 1
            else:
                print(f"Warning: Invalid conclusion text on line {line_num}: '{conclusion_text}'")
                invalid_count += 1
                item[verified_key] = None
        else:
            print(f"Warning: No <conclusion> tags found on line {line_num}")
            invalid_count += 1
            item[verified_key] = None

        processed_data.append(item)

    print("Processing complete:")
    print(f"  Correct: {correct_count}")
    print(f"  Incorrect: {incorrect_count}")
    print(f"  Invalid: {invalid_count}")
    print(f"  Total: {len(processed_data)}")

    # Save the processed data to a new file, built from dataset_name/output_dir
    # instead of a hard-coded "AI2D" path so it follows the configuration above.
    output_file = os.path.join(output_dir, f"{dataset_name}_verification_processed_{model_name}.jsonl")
    with open(output_file, 'w') as f:
        for item in processed_data:
            f.write(json.dumps(item) + '\n')

    print(f"Saved processed data to: {output_file}")

custom_ids in JSONL not found in JSON: {'c22922e6-c01e-48fd-962d-731cf528c989', 'df0c263b-2c35-41af-ab0c-ad5154a9aab3', '0607a915-6dab-411c-aeb9-6fc24e8c47c2', 'e9f7c93a-a003-46dc-8364-b3f2ca2b5d0b', 'fd9d7210-85ba-41a4-a63e-b0a61342471a', '3962b469-ce17-472d-8081-7800439ae7b5', '3f48e81d-5d0d-4212-8144-e526b0ee792f', 'a49d9db1-67d8-476b-ba66-2cecbf043405', 'f6fce25c-065c-4007-b1f6-23c56850b80f', '42a4f207-c3ef-4b65-b132-9ac013073e3e', 'f460c09c-1fd8-4954-93c3-33adc6d3411e', 'd286acf6-7cde-42cb-89a1-0f721a6e93e8', '9288ba86-21a7-4a0b-a6d4-6ef6fb3e25fe', '9985cd43-4ae3-4f30-83e8-adca35a19caa', 'f7dd1861-d24e-41d7-bdd4-18c005c4e38a', 'f3316f04-c0ac-4126-920a-2192966ae943', 'cbeb129c-4218-4e07-a3b1-24133d3870c9', '50ed5f9b-4dd3-47b4-ba8d-bbc1ce8e04dc', '0b9cdf11-112a-4519-b8e7-222aa8eef3de', '02d7a026-fa67-49c1-a1b6-7958f81aa0a5', '85b82663-32d2-46cf-ac85-761af1e5559e', '4151b8e5-9e31-4f7b-afb4-02334e39328a', '417667e3-9822-487e-beba-ad88e2296a13', '65e647b4-df92-495d-8f5b-e404967e96c2', 

In [None]:
import json
import os
from pathlib import Path

# Inputs for a single o4-mini batch (queries + verification results) and the output location
batch_query_jsonl_file = "/mnt/fast10/brandon/mmr_rollout_data/raw_rollouts/soft_estimation/AI2D/verification/verification_pipeline_outputs/o4-mini/AI2D/verification_batches/batch_0001.jsonl"
batch_verification_result_json_file = "/mnt/fast10/brandon/mmr_rollout_data/raw_rollouts/soft_estimation/AI2D/verification/verification_pipeline_outputs/o4-mini/AI2D/batch_0001_verification_results.json"
output_dir = "/mnt/fast10/brandon/mmr_rollout_data/merged_verification_files"

# The dataset name is the third path component from the end:
# ".../AI2D/verification_batches/batch_0001.jsonl" -> "AI2D"
dataset_name = batch_query_jsonl_file.rsplit('/', 3)[-3]

# Make sure the output directory exists (no error if it already does)
os.makedirs(output_dir, exist_ok=True)

# Index every query record of the batch by its custom_id
with open(batch_query_jsonl_file, 'r') as query_fh:
    batch_query_data = {
        record['custom_id']: record
        for record in map(json.loads, query_fh)
    }

In [2]:
# Load the verification results for the batch
# (iterated below as a list of dicts that each carry a custom_id)
with open(batch_verification_result_json_file, 'r') as f:
    batch_verification_data = json.load(f)

# Report any custom_ids that appear on only one side
json_ids = {item['custom_id'] for item in batch_verification_data}
jsonl_ids = set(batch_query_data.keys())

if jsonl_ids != json_ids:
    missing_in_json = jsonl_ids - json_ids
    missing_in_jsonl = json_ids - jsonl_ids
    if missing_in_json:
        print(f"custom_ids in JSONL not found in JSON: {missing_in_json}")
    if missing_in_jsonl:
        print(f"custom_ids in JSON not found in JSONL: {missing_in_jsonl}")
else:
    print("All custom_ids and custom_ids match")

# Merge: result fields override query fields on key clashes.
# Results without a matching query are skipped (they were reported above)
# rather than raising KeyError halfway through writing the output file.
merged_items = []
skipped_result_count = 0
for item in batch_verification_data:
    query_item = batch_query_data.get(item['custom_id'])
    if query_item is None:
        skipped_result_count += 1
        continue
    merged_items.append({**query_item, **item})

output_path = os.path.join(output_dir, f"{dataset_name}_verification_merged.jsonl")
with open(output_path, 'w') as f:
    for merged in merged_items:
        f.write(json.dumps(merged) + '\n')

print(f"Merged {len(merged_items)} items to {output_path}")
if skipped_result_count:
    print(f"Skipped {skipped_result_count} results with no matching query record")

All custom_ids and custom_ids match
Merged 746 items to /mnt/fast10/brandon/mmr_rollout_data/merged_verification_files/AI2D_verification_merged.jsonl


In [13]:
# Sanity check: show which keys the first merged record carries
with open(output_path, 'r') as merged_fh:
    first_line = merged_fh.readline().strip()

if first_line == "":
    print("File is empty")
else:
    first_item = json.loads(first_line)
    print("Keys in first item:")
    print(list(first_item.keys()))

Keys in first item:
['custom_id', 'method', 'url', 'body', 'verification_response']


In [14]:
import re

# Verifier model whose conclusions are being parsed; it also names the output column
model_name = "o4-mini"  # Change this as needed

# Merged query + verification-result file produced earlier in the notebook
merged_verification_file = "/mnt/fast10/brandon/mmr_rollout_data/merged_verification_files/AI2D_verification_merged.jsonl"

# Captures whatever sits between <conclusion> and </conclusion>, newlines included
conclusion_pattern = re.compile(r'<conclusion>(.*?)</conclusion>', re.DOTALL)

# Only these two conclusion strings are valid; anything else is recorded as None
verdict_by_conclusion = {"Correct": True, "Incorrect": False}
verified_key = f"{model_name}_isVerified"

processed_data = []
correct_count = incorrect_count = invalid_count = 0

with open(merged_verification_file, 'r') as merged_fh:
    for line_num, raw_line in enumerate(merged_fh, 1):
        item = json.loads(raw_line)
        found = conclusion_pattern.search(item.get("verification_response", ""))

        if found is None:
            print(f"Warning: No <conclusion> tags found on line {line_num}")
            verdict = None
        else:
            conclusion_text = found.group(1).strip()
            verdict = verdict_by_conclusion.get(conclusion_text)
            if verdict is None:
                print(f"Warning: Invalid conclusion text on line {line_num}: '{conclusion_text}'")

        # Tally the outcome and attach the flag to the record
        if verdict is None:
            invalid_count += 1
        elif verdict:
            correct_count += 1
        else:
            incorrect_count += 1
        item[verified_key] = verdict

        processed_data.append(item)

print("Processing complete:")
print(f"  Correct: {correct_count}")
print(f"  Incorrect: {incorrect_count}")
print(f"  Invalid: {invalid_count}")
print(f"  Total: {len(processed_data)}")

# Write the annotated records to a separate JSONL file, one record per line
output_file = f"/mnt/fast10/brandon/mmr_rollout_data/merged_verification_files/AI2D_verification_processed_{model_name}.jsonl"
with open(output_file, 'w') as processed_fh:
    processed_fh.writelines(json.dumps(record) + '\n' for record in processed_data)

print(f"Saved processed data to: {output_file}")

Processing complete:
  Correct: 557
  Incorrect: 189
  Invalid: 0
  Total: 746
Saved processed data to: /mnt/fast10/brandon/mmr_rollout_data/merged_verification_files/AI2D_verification_processed_o4-mini.jsonl


In [16]:
# Sanity check on the processed file: list the keys of its first record
with open(output_file, 'r') as processed_fh:
    first_line = processed_fh.readline().strip()

if not first_line:
    print("File is empty")
else:
    first_item = json.loads(first_line)
    print("Keys in first item:")
    print(list(first_item.keys()))

Keys in first item:
['custom_id', 'method', 'url', 'body', 'verification_response', 'o4-mini_isVerified']


# Now extract the Solution as the unique key to match with rollouts

In [4]:
import re

merged_verification_file = "/mnt/fast10/brandon/mmr_rollout_data/merged_verification_files/AI2D_verification_merged.jsonl"

# Matches the contents of every <solution>...</solution> pair, newlines included
solution_pattern = re.compile(r'<solution>(.*?)</solution>', re.DOTALL)

# Extract solutions from the merged file. Each entry pairs the verification
# custom_id with the solution text, which later serves as the key for matching rollouts.
solutions = []

# Read the path defined in this cell rather than `output_path`, which is whatever
# an earlier cell happened to leave in the kernel.
with open(merged_verification_file, 'r') as f:
    for line_num, line in enumerate(f, 1):
        item = json.loads(line)
        try:
            text = item["body"]["messages"][0]["content"][0]["text"]
            # The second <solution> block is the one used as the key
            # (presumably the first belongs to the prompt itself - TODO confirm).
            matches = solution_pattern.findall(text)
            if len(matches) >= 2:
                solution_text = matches[1].strip()  # Get second occurrence
                if solution_text:  # Only add non-empty solutions
                    solutions.append({
                        "custom_id": item.get("custom_id", "ERROR: custom_id not found"),
                        "unique_key": solution_text
                    })
                else:
                    # Report the drop instead of silently shrinking the result
                    print(f"Warning: Second <solution> tag is empty in line {line_num}")
            elif len(matches) == 1:
                print(f"Warning: Only one <solution> tag found in line {line_num}")
            else:
                print(f"Warning: No <solution> tags found in line {line_num}")
        except (KeyError, IndexError, TypeError) as e:
            print(f"Error accessing text in line {line_num}: {e}")

print(f"Extracted {len(solutions)} valid solutions")

Extracted 746 valid solutions


In [5]:
# Read the flattened rollout file a single time, keeping only the two fields
# needed for matching: the response text and the rollout uid.
full_raw_rollout_data_file = "/mnt/fast10/brandon/mmr_rollout_data/flattened_rollout_files/AI2D_flattened.jsonl"

with open(full_raw_rollout_data_file, 'r') as rollout_fh:
    full_raw_rollout_data_array = [
        {"response": record.get("response", ""), "uid": record.get("uid")}
        for record in map(json.loads, rollout_fh)
    ]

print(f"Loaded {len(full_raw_rollout_data_array)} items from flattened file")

Loaded 25557 items from flattened file


In [6]:
# Display the first loaded rollout record (a dict with "response" and "uid") as a sanity check
full_raw_rollout_data_array[0]

{'response': '[Visual Elements]\n<step_1>\nIdentify all organisms in the food web: short-eared owl, vole, meadow pippit, emperor moth larvae, red grouse, heather, fox, brown hare, red kite or hen harrier.\n</step_1>\n<step_2>\nNote the arrows indicating feeding relationships: arrows point from food to consumer.\n</step_2>\n<step_3>\nObserve that the fox eats: red grouse, brown hare, and vole (arrows pointing from each of these to fox).\n</step_3>\n<step_4>\nDetermine what eats fox: no arrows point to fox, indicating it is a top predator.\n</step_4>\n<step_5>\nDetermine what else eats red grouse: Arrow from red grouse to fox, and red kite or hen harrier. So red grouse is eaten by fox and red kite/hen harrier.\n</step_5>\n<step_6>\nDetermine who eats meadow pippit: Arrow from meadow pippit to fox only.\n</step_6>\n<step_7>\nNote which options are present: more grouse, more pippit, less grouse, less owl.\n</step_7>\n\n[Reasoning]\n<step_1>\nThe question asks what would happen if the fox p

In [7]:
# Print the first extracted solution record (its "custom_id" and the "unique_key" solution text)
print(solutions[0])

{'custom_id': 'd8086fbb-f598-450c-8e53-15074142e4ab', 'unique_key': '[Visual Elements]\n<step_1>\nThe image shows four different flies, each labeled with a blue box. The labels are A, B, C, and D, positioned above each respective fly.\n</step_1>\n<step_2>\nFly A is on the far left. Its body is robust and the wings are broad. The thorax and abdomen are relatively compact compared to other examples.\n</step_2>\n<step_3>\nFly B is second from the right, smaller and slender compared to A.\n</step_3>\n<step_4>\nFly C appears to have the largest and most robust body, with a pronounced head and broader thorax.\n</step_4>\n<step_5>\nFly D is second from the left, slightly larger than A but not as large as C.\n</step_5>\n<step_6>\nNo direct text is visible associating each fly with its species in the image; identification must be based on morphology.\n</step_6>\n\n[Reasoning]\n<step_1>\nThe question asks to identify the type of fly depicted at letter A, with the options: Stable, Horn, Face, Hou

### Initial collision checks (preliminary); see `merge_rollout_and_verification_files.ipynb` for the full version

In [12]:
# Check that every extracted solution matches exactly one rollout response.
# Index the rollouts by stripped response text once, so each solution is a dict
# lookup instead of a scan over the whole rollout list.
rollouts_by_response = {}
for rollout in full_raw_rollout_data_array:
    rollouts_by_response.setdefault(rollout["response"].strip(), []).append(rollout)

collision_errors = []
no_matches_array = []
for sol in solutions:
    unique_key = sol["unique_key"]

    # All rollouts whose stripped response equals the solution text
    matches = rollouts_by_response.get(unique_key, [])

    if len(matches) > 1:
        collision_errors.append({
            "solution_unique_key": unique_key,
            "solution_custom_id": sol["custom_id"],
            "matches": matches
        })
    elif len(matches) == 0:
        no_matches_array.append({
            "solution_unique_key": unique_key,
            "solution_custom_id": sol["custom_id"],
        })

# Report collision errors
if collision_errors:
    print(f"\n🚨 COLLISION ERRORS FOUND: {len(collision_errors)} unique_keys have multiple matches!")
    for error in collision_errors:
        # The entries store 'solution_custom_id'; the former 'rollout_uid' lookup
        # raised KeyError exactly when a collision needed to be reported.
        print(f"\nCollision for solution_custom_id: {error['solution_custom_id']}")
        print(f"solution_unique_key: {error['solution_unique_key'][:100]}...")
        print(f"Found {len(error['matches'])} matches:")
        for match in error['matches']:
            print(f"  - uid: {match['uid']}, response: {match['response'][:50]}...")

    raise ValueError(f"{len(collision_errors)} collision errors found. See details above.")
else:
    print(f"\n✅ No collisions found! All {len(solutions)} solutions have at most one match.")

# Report solutions that match no rollout at all
if no_matches_array:
    print(f"\n🚨 NO MATCHES FOUND: {len(no_matches_array)} unique_keys have no matches!")
    for error in no_matches_array:
        print(f"\nNo match found for solution_custom_id: {error['solution_custom_id']}")
        print(f"solution_unique_key: {error['solution_unique_key'][:100]}...")

    raise ValueError(f"{len(no_matches_array)} no matches found. See details above.")
else:
    print(f"\n✅ No no matches found! All {len(solutions)} solutions have at least one match.")


✅ No collisions found! All 746 solutions have at most one match.

✅ No no matches found! All 746 solutions have at least one match.


In [3]:
# Load the JSONL file as a pandas dataframe
import pandas as pd

# Final processed verification file for o4-mini (JSON Lines, one record per line)
verification_file_path = "/mnt/fast10/brandon/mmr_rollout_data/processed_full_verification_files/AI2D_final_mc_and_verification_merged_o4-mini.jsonl"

df = pd.read_json(verification_file_path, lines=True)

# Every statistic below is derived from this one column
is_verified = df['o4-mini_isVerified']

print("Summary statistics for o4-mini_isVerified column:")
print("=" * 50)

# Raw counts per value, missing values included
print("\nValue counts:")
print(is_verified.value_counts(dropna=False))

# The same breakdown as percentages
print("\nPercentage breakdown:")
print(is_verified.value_counts(dropna=False, normalize=True) * 100)

# Headline numbers
true_total = is_verified.sum()
false_total = (is_verified == False).sum()
missing_total = is_verified.isna().sum()

print("\nSummary:")
print(f"Total rows: {len(df)}")
print(f"True (Correct): {true_total}")
print(f"False (Incorrect): {false_total}")
print(f"None (Invalid/No verification): {missing_total}")

# Column names and leading rows, to confirm the structure of the data
print("\nFirst 5 rows of the dataframe:")
print(df.columns)
print(df.head())


Summary statistics for o4-mini_isVerified column:

Value counts:
o4-mini_isVerified
1.0    17225
0.0     7368
NaN      964
Name: count, dtype: int64

Percentage breakdown:
o4-mini_isVerified
1.0    67.398364
0.0    28.829675
NaN     3.771961
Name: proportion, dtype: float64

Summary:
Total rows: 25557
True (Correct): 17225.0
False (Incorrect): 7368
None (Invalid/No verification): 964

First 5 rows of the dataframe:
Index(['verification_custom_id', 'response_uid', 'rollout_response',
       'rollout_image_path', 'o4-mini_verification_solution',
       'o4-mini_isVerified'],
      dtype='object')
                 verification_custom_id                          response_uid  \
0  1246cedc-7f5b-4409-9535-2d88841ae65f  b6c24365-7b04-4a00-96fb-4ed28c9fcdfc   
1  19b64cd9-14d6-4244-91ad-02fab7061573  b6c24365-7b04-4a00-96fb-4ed28c9fcdfc   
2  2ac3bfad-1a4f-4a8e-ba2d-b870fde562fd  b6c24365-7b04-4a00-96fb-4ed28c9fcdfc   
3  30591da1-e29e-4c02-bdb5-3b827ef41bcf  b6c24365-7b04-4a00-96fb-4ed28c9fc

In [7]:
# Look up the row(s) whose verification_custom_id equals the target id below
target_custom_id = "69107ceb-49c4-48ea-bed3-3e459a60a699"
id_matches = df["verification_custom_id"] == target_custom_id
target_row = df[id_matches]
print(f"Row with custom_id {target_custom_id}:")
print(target_row)

Row with custom_id 69107ceb-49c4-48ea-bed3-3e459a60a699:
Empty DataFrame
Columns: [verification_custom_id, response_uid, rollout_response, rollout_image_path, o4-mini_verification_solution, o4-mini_isVerified]
Index: []


In [9]:
# Filter rows where o4-mini_isVerified is missing (None/NaN)
none_verified_rows = df[df['o4-mini_isVerified'].isna()]

print(f"Rows where o4-mini_isVerified is None (count: {len(none_verified_rows)}):")
print("=" * 60)

# Display the first 3 rows where isVerified is missing
for idx, row in none_verified_rows.head(3).iterrows():
    # The dataframe has no 'verification_response' column, so the old lookup always
    # printed "N/A". Read 'o4-mini_verification_solution' instead (presumably the
    # verifier's response text - TODO confirm). It can be missing/NaN for these rows,
    # so fall back to "N/A" for anything that is not a string before slicing.
    verification_text = row.get('o4-mini_verification_solution', 'N/A')
    if not isinstance(verification_text, str):
        verification_text = 'N/A'

    print(f"\nRow {idx}:")
    print(f"  response_uid: {row.get('response_uid', 'N/A')}")
    print(f"  verification_custom_id: {row.get('verification_custom_id', 'N/A')}")
    print(f"  rollout_response: {row.get('rollout_response', 'N/A')}")
    print(f"  o4-mini_isVerified: {row.get('o4-mini_isVerified', 'N/A')}")
    print(f"  verification_response: {verification_text[:200]}...")  # Truncate long responses
    print("-" * 40)


Rows where o4-mini_isVerified is None (count: 964):

Row 580:
  response_uid: 1f741d15-fd09-49a4-b117-e21f2aaa8fb0
  verification_custom_id: None
  rollout_response: [Visual Elements]
<step_1>
The image shows a labeled diagram of a plant cell, with various organelles labeled with blue tags.
</step_1>
<step_2>
There are mitochondria visible in the diagram, identifiable as the orange organelles with squiggly lines inside, commonly used to represent mitochondria in biology diagrams.
</step_2>
<step_3>
Other visible parts include the cell wall (outermost boundary), nucleus (large round central feature), and cytoplasm (the fluid/jelly-like substance filling the cell).
</step_3>
<step_4>
The question asks which part generates energy, and the options correspond to typical plant cell components: cell wall, mitochondria, nucleus, cytoplasm.
</step_4>

[Reasoning]
<step_1>
The task is to identify which cell component is responsible for generating energy.
</step_1>
<step_2>
The mitochondria are u

In [1]:
# Filter rows where o4-mini_isVerified is True.
# NOTE: requires `df` from the cell that loads the processed verification file;
# run that cell first (on a fresh kernel this cell fails with NameError).
verified_rows = df[df['o4-mini_isVerified'] == True]

print(f"Rows where o4-mini_isVerified is True (count: {len(verified_rows)}):")
print("=" * 60)

# Display the first 3 rows where isVerified is True
for idx, row in verified_rows.head(3).iterrows():
    # The dataframe has no 'verification_response' column, so the old lookup always
    # printed "N/A". Read 'o4-mini_verification_solution' instead (presumably the
    # verifier's response text - TODO confirm), and fall back to "N/A" for anything
    # that is not a string before slicing.
    verification_text = row.get('o4-mini_verification_solution', 'N/A')
    if not isinstance(verification_text, str):
        verification_text = 'N/A'

    print(f"\nRow {idx}:")
    print(f"  response_uid: {row.get('response_uid', 'N/A')}")
    print(f"  verification_custom_id: {row.get('verification_custom_id', 'N/A')}")
    print(f"  rollout_response: {row.get('rollout_response', 'N/A')}")
    print(f"  o4-mini_isVerified: {row.get('o4-mini_isVerified', 'N/A')}")
    print(f"  verification_response: {verification_text[:200]}...")  # Truncate long responses
    print("-" * 40)


NameError: name 'df' is not defined