In [1]:
import json
import os
import glob
from pathlib import Path

In [2]:
# Directory containing the source JSONL batch files.
# NOTE(review): this is an absolute, user-specific path; anyone else running
# the notebook has to edit it first.
jsonl_directory = "/data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/generated_rollouts/soft_estimation/RAVEN/verification/verification_pipeline_outputs/gpt-4.1-nano/mock_batch_for_testing/old_batch_files"

# Collect all JSONL files in the directory. glob.glob() returns matches in
# arbitrary order, so sort them: later cells use jsonl_files[0] as "the first
# file" and iterate the list, and both should be reproducible between runs.
jsonl_files = sorted(glob.glob(os.path.join(jsonl_directory, "*.jsonl")))
print(f"Found {len(jsonl_files)} JSONL files:")
for file in jsonl_files:
    print(f"  - {os.path.basename(file)}")

Found 2 JSONL files:
  - batch_0010.jsonl
  - batch_0011.jsonl


In [8]:
# Peek at the first record of the first JSONL file: list its top-level keys and
# show the current "body.model" and "url" values before anything is rewritten.
if jsonl_files:
    first_file = jsonl_files[0]
    print(f"Loading first file: {os.path.basename(first_file)}")

    # Only the first line is needed, so read it and close the file right away.
    with open(first_file, 'r') as f:
        first_line = f.readline().strip()

    if first_line:
        first_record = json.loads(first_line)
        print("Keys in the first record:")
        for key in first_record.keys():
            print(f"  - {key}")

        # Each field gets its own if/else so that a missing key is reported
        # against the field that is actually missing.
        if "body" in first_record and "model" in first_record["body"]:
            current_model = first_record["body"]["model"]
            print(f"\nCurrent model value: {current_model}")
        else:
            print("\nNo 'body.model' key found in the first record")

        if "url" in first_record:
            print(f"\nCurrent url value: {first_record['url']}")
        else:
            print("\nNo 'url' key found in the first record")
    else:
        print("First line of the file is empty")
else:
    print("No JSONL files found in the directory")

Loading first file: batch_0010.jsonl
Keys in the first record:
  - custom_id
  - method
  - url
  - body

Current model value: gpt-4.1-nano-2

Current url value: /chat/completions


In [4]:
def update_jsonl_model(jsonl_file_path, new_model_value, new_url_value):
    """
    Load a JSONL file and return its records with the model and url replaced.

    The file on disk is not modified; the caller decides where to write the
    returned records.

    Args:
        jsonl_file_path: Path of the JSONL file to read (one JSON value per line).
        new_model_value: Value stored in record["body"]["model"] for every
            record that already has that key.
        new_url_value: Value stored in record["url"] for every record that
            already has that key.

    Returns:
        A list with one entry per non-blank, parseable line, in file order.
        Keys are only overwritten, never added. Lines whose JSON value is not
        an object are returned unchanged.

    Blank lines are skipped. Lines that are not valid JSON are reported on
    stdout with their 1-based line number and skipped.
    """
    updated_records = []

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(jsonl_file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error parsing line {line_num}: {e}")
                continue

            # A valid JSON line may hold a scalar or list, and "body" may be a
            # string; membership tests / item assignment on those would raise
            # TypeError, so only touch real JSON objects.
            if isinstance(record, dict):
                body = record.get("body")
                if isinstance(body, dict) and "model" in body:
                    body["model"] = new_model_value
                if "url" in record:
                    record["url"] = new_url_value

            updated_records.append(record)

    return updated_records

In [9]:
# Get new model value and number of records from user
new_model_value = input("Enter the new model value: ")
# new_url_value = input("Enter the new url value: ")
# Keep the leading slash: the source records use "/chat/completions", and the
# previously verified output files contain "/v1/chat/completions".
new_url_value = "/v1/chat/completions"
num_records_input = input("Enter the number of records to include in updated files (or 'all' for all records): ")

# Parse the record limit once, before any file is read or written, so invalid
# input fails immediately instead of part-way through the batch.
# None means "keep every record".
num_records_text = num_records_input.strip()
if num_records_text.lower() == "all":
    num_records = None
else:
    num_records = int(num_records_text)  # raises ValueError on non-numeric input
    if num_records < 0:
        # A negative slice bound would silently drop records from the end.
        raise ValueError(f"Number of records must be non-negative, got {num_records}")

# Output goes to a sibling "verification_batches" directory; create it if it
# does not exist yet so open(..., 'w') below cannot fail on a missing folder.
output_directory = os.path.normpath(os.path.join(jsonl_directory, "../verification_batches"))
os.makedirs(output_directory, exist_ok=True)

# Process all JSONL files
for jsonl_file in jsonl_files:
    filename = os.path.basename(jsonl_file)
    print(f"\nProcessing: {filename}")

    # Update the model and url values
    updated_records = update_jsonl_model(jsonl_file, new_model_value, new_url_value)

    # Keep everything, or only the first num_records records
    if num_records is None:
        limited_records = updated_records
    else:
        limited_records = updated_records[:num_records]

    # The output keeps the input's base name
    name_without_ext = os.path.splitext(filename)[0]
    new_filename = f"../verification_batches/{name_without_ext}.jsonl"
    new_filepath = os.path.join(output_directory, f"{name_without_ext}.jsonl")

    # Save updated file with limited records, one JSON object per line
    with open(new_filepath, 'w') as f:
        for record in limited_records:
            f.write(json.dumps(record) + '\n')

    print(f"Saved updated file: {new_filename}")
    print(f"Original records: {len(updated_records)}, Saved records: {len(limited_records)}")

print("\nAll files processed successfully!")


Processing: batch_0010.jsonl
Saved updated file: ../verification_batches/batch_0010.jsonl
Original records: 2331, Saved records: 2331

Processing: batch_0011.jsonl
Saved updated file: ../verification_batches/batch_0011.jsonl
Original records: 2331, Saved records: 2331

All files processed successfully!


In [7]:
# Verify the updates: for every file in the output directory, report how many
# non-blank lines it has and show the model / url of its first record.
verification_batches_dir = os.path.normpath(os.path.join(jsonl_directory, "../verification_batches"))

print(verification_batches_dir)

# glob.glob() returns matches in arbitrary order; sort for a stable report.
updated_files = sorted(glob.glob(os.path.join(verification_batches_dir, "*.jsonl")))
print(f"Verifying {len(updated_files)} updated files:")

for updated_file in updated_files:
    filename = os.path.basename(updated_file)
    print(f"\nChecking: {filename}")

    # One pass over the file: count non-blank lines and remember the first
    # physical line for the spot check below.
    record_count = 0
    first_line = ""
    with open(updated_file, 'r') as f:
        for line_index, line in enumerate(f):
            stripped_line = line.strip()
            if line_index == 0:
                first_line = stripped_line
            if stripped_line:
                record_count += 1

    print(f"  Total records: {record_count}")

    # Check first record
    if first_line:
        first_record = json.loads(first_line)

        # Each field gets its own if/else so that a missing key is reported
        # against the field that is actually missing.
        if "body" in first_record and "model" in first_record["body"]:
            model_value = first_record["body"]["model"]
            print(f"  Model value: {model_value}")
        else:
            print("  No 'body.model' key found")

        if "url" in first_record:
            url_value = first_record["url"]
            print(f"  Url value: {url_value}")
        else:
            print("  No 'url' key found")
    else:
        print("  File is empty")

/data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/generated_rollouts/soft_estimation/RAVEN/verification/verification_pipeline_outputs/gpt-4.1-nano/mock_batch_for_testing/verification_batches
Verifying 2 updated files:

Checking: batch_0010.jsonl
  Total records: 100
  Model value: gpt-4.1-nano
  Url value: /v1/chat/completions

Checking: batch_0011.jsonl
  Total records: 100
  Model value: gpt-4.1-nano
  Url value: /v1/chat/completions
