In [None]:
import os
import json

In [None]:
def collect_failed_jailbreaks(PAIR_file: str, output_file: str, rating_thres: int):
    """
    Collect failed jailbreaking objectives from PAIR output json
    where ALL prompts for an objective have rating < rating_thres.
    """
    
    # Read the JSON file
    json_file_path = f"data/{PAIR_file}"
    
    if not os.path.exists(json_file_path):
        print(f"Error: {json_file_path} not found!")
        return
    
    try:
        with open(json_file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
            
            print("opening PAIR file ", json_file_path)
            print(f"getting failed objectives from total of {len(data)} objective")
            
    except Exception as e:
        print(f"Error reading JSON file: {e}")
        return
    
    failed_objectives = []
    
    # Iterate through each objective
    for objective_data in data:
        objective = objective_data.get("Objective", "")
        output = objective_data.get("Output", {})
        
        # Check if output is valid, continue if not
        if not output or not isinstance(output, dict):
            continue
        
        # Check if ALL prompts for this objective have rating < threshold
        all_prompts_below_threshold = True
        
        print("current objective ", objective)
        for category, prompts in output.items():
            if isinstance(prompts, list):
                for prompt_data in prompts:
                    rating = prompt_data.get("Rating", 0)
                    print(rating)
                    
                    # Check if rating is above thres
                    if rating >= rating_thres:
                        all_prompts_below_threshold = False
                        break
                
                if not all_prompts_below_threshold:
                    break
        
        # Only add objective if ALL prompts are below threshold and it has prompts
        if all_prompts_below_threshold :
            failed_objectives.append(objective)
    
    # Print results
    print(f"Found {len(failed_objectives)} objectives where ALL prompts have rating < {rating_thres}\n")
    print("=" * 80)
    
    for i, objective in enumerate(failed_objectives, 1):
        print(f"\n{i}. OBJECTIVE: {objective}")
        print("-" * 80)
    
    # Save to a new JSON file
    output_file = f"data/{output_file}"
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(failed_objectives, file, indent=2, ensure_ascii=False)
    
    print(f"\nResults saved to {output_file}")
    
    # Print summary statistics
    print(f"\nSUMMARY:")
    print(f"Total failed objectives: {len(failed_objectives)}")

In [None]:
collect_failed_jailbreaks(PAIR_file="PAIR_results_target_gpt_oss_20b.json", output_file="agentic/failed_PAIR_prompt_gpt_oss_20b.json", rating_thres=2)