## Table 1

In [None]:
import random
random.seed(0)
import json
from anaysis import analysis, print_analysis, get_correct_problem_ids, get_over_leaf_format, analysis_v2

In [None]:
baselines = {
  "gpt-4o-mini": "BaselineAgent_4o-mini_c71_alert_level_t0_s25_trial1",
  "gpt-4o": "BaselineAgent_gpt-4o_c70_alert_level_t0_s25_trial1",
  "o1-mini": ("BaselineAgent_o1-mini_c92_alert_level_t0_s25_trial1", "v2"),
  "o3-mini": ("BaselineAgent_o3-mini_c99_alert_level_t0_s25_trial1", "v2"),
  "phi-4": ("BaselineAgent_phi4_c469_alert_level_t0_s25_trial1", "v2"),
  "r1": ("BaselineAgent_r1_c468_alert_level_t0_s25_trial1", "v2"),
}

log_path = "../secgym/final_results"

for name, b in baselines.items():

    if type(b) == tuple:
        file_folder, version = b
    else:
        file_folder = b
        version = "v1"
    print(name, version)
    get_over_leaf_format(log_path, file_folder, version)
    print("===================")

4o-mini v1
589
& 0.163 & 0.195 & 0.273 & 0.185 & 0.174 & 0.228 & 0.163 & 0.276 & 0.192 
gpt-4o v1
589
& 0.338 & 0.293 & 0.364 & 0.273 & 0.249 & 0.491 & 0.166 & 0.315 & 0.293 
o1-mini v2
589
& 0.147 & 0.244 & 0.091 & 0.23 & 0.16 & 0.333 & 0.189 & 0.382 & 0.222 
o3-mini v2
No usage summary, skipping 32-51, trial 0
589
& 0.35 & 0.293 & 0.273 & 0.257 & 0.227 & 0.404 & 0.253 & 0.36 & 0.296 
phi-4 v2
589
& 0.086 & 0.037 & 0.182 & 0.082 & 0.066 & 0.13 & 0.085 & 0.125 & 0.085 
r1 v2
589
& 0.106 & 0.09 & 0.0 & 0.169 & 0.043 & 0.025 & 0.074 & 0.054 & 0.084 


In [9]:
mmbaselines = [
    "MultiModelBaselineAgent_master_o1_mini_slave_gpt-4o_c96_alert_level_t0_s25_trial1",
    "MultiModelBaselineAgent_master_o1_slave_gpt-4o_c98_alert_level_t0_s25_trial1",
    "MultiModelBaselineAgent_master_o3_mini_slave_gpt-4o_c100_alert_level_t0_s25_trial1"
]

for b in mmbaselines:
    print(b)
    get_over_leaf_format(log_path, b, version="v2")

MultiModelBaselineAgent_master_o1_mini_slave_gpt-4o_c96_alert_level_t0_s25_trial1
589
& 0.304 & 0.256 & 0.273 & 0.238 & 0.296 & 0.316 & 0.211 & 0.379 & 0.279 
MultiModelBaselineAgent_master_o1_slave_gpt-4o_c98_alert_level_t0_s25_trial1
No usage summary, skipping 109-34, trial 0
589
& 0.398 & 0.317 & 0.091 & 0.265 & 0.297 & 0.474 & 0.228 & 0.391 & 0.323 
MultiModelBaselineAgent_master_o3_mini_slave_gpt-4o_c100_alert_level_t0_s25_trial1
589
& 0.404 & 0.31 & 0.364 & 0.274 & 0.264 & 0.333 & 0.218 & 0.375 & 0.308 


## Table 2

In [3]:
from anaysis import analysis, add_to_usage, print_analysis, get_over_leaf_format, analysis_v2
import sys
from matplotlib import pyplot as plt
import numpy as np
import json

def add_to_usage(usage_summary:dict, total_cost=0, total_prompt_tokens=0, total_completion_tokens=0):
    model = list(usage_summary.keys())[-1]
    total_prompt_tokens += usage_summary[model]['prompt_tokens']
    total_completion_tokens += usage_summary[model]['completion_tokens']
    return total_cost, total_prompt_tokens, total_completion_tokens    

def get_avg_accuracy_and_cost(log_path, file_folder, model_name, version="v1", round_cut=-1):
    d = 1_000_000  # price is per 1M tokens
    prices = {
        "gpt-4o": (2.5 / d, 10 / d),
        "gpt-4o-mini": (0.15 / d, 0.6 / d),
        "o1-mini": (1.1 / d, 4.4 / d),
        "o1": (15 / d, 7.5 / d),
        "o3-mini": (1.1 / d, 4.4 / d),
        "deepseek-r1": (0.55 / d, 0.27 / d),
        "phi4": (0.07/ d, 0.14/ d),
    }

    prompt_price, completion_price = prices[model_name]
    file_template = f"{log_path}/{file_folder}" + "/agent_incident_{0}.json"

    total_count = 0
    total_reward = 0
    total_prompt_tokens = 0
    total_completion_tokens = 0
    total_round = 0

    incidents = [5, 34, 38, 39, 55, 134, 166, 322]
    for i in incidents:
        with open(file_template.format(i), "r") as f:
            data = json.load(f)

        if version == "v2":
            result = analysis_v2(data, round_cut=round_cut)
        else:
            result = analysis(data, round_cut=round_cut)

        total_count += result['total_len']
        total_reward += result['total_reward']
        total_prompt_tokens += result['total_prompt_tokens']
        total_completion_tokens += result['total_completion_tokens']
        total_round += result['total_round']

    avg_accuracy = round(total_reward / total_count, 3)
    avg_cost = round(
        (total_prompt_tokens * prompt_price + total_completion_tokens * completion_price)
        / total_count,
        5,
    )
    avg_round = round(total_round / total_count, 3)

    return avg_accuracy, avg_cost, avg_round

In [6]:

tab2_one_trial = {
  "base-gpt-4o": "BaselineAgent_gpt-4o_c70_alert_level_t0_s25_trial1",
  "strategy-gpt-4o": ("PromptSauceAgent_4o-mini_c34_alert_level_t0_s15_trial1", "v2"),
  "expel-gpt-4o": ("ExpelAgent_gpt-4o_c392_alert_level_t0_s15_trial1", "v2"),
  "react-gpt-4o": ("ReactAgent_gpt-4o_c121_alert_level_t0_s15_trial1", "v2"),

  "base-4o-mini": "BaselineAgent_4o-mini_c71_alert_level_t0_s25_trial1",
  "strategy-4o-mini": ("PromptSauceAgent_gpt-4o_c345_alert_level_t0_s15_trial1", "v2"),
  "expel-4o-mini": ("ExpelAgent_4o-mini_c393_alert_level_t0_s15_trial1", "v2"),
  "react-4o-mini": ("ReactAgent_4o-mini_c131_alert_level_t0_s15_trial1", "v2"),
}

log_path = "../secgym/final_results"

for name, b in tab2_one_trial.items():
    if type(b) == tuple:
        file_folder, version = b
    else:
        file_folder = b
        version = "v1"
    if "gpt-4o" in name:
        model_name = "gpt-4o"
    elif "4o-mini" in name:
        model_name = "gpt-4o-mini"
    else:
        raise ValueError(f"Unknown model name: {name}")
    
    print(name, version)
    avg_accuracy, avg_cost, avg_round = get_avg_accuracy_and_cost(log_path, file_folder, model_name, version=version)
    print(f"Avg Accuracy: {avg_accuracy}, Avg Cost: {avg_cost}", 
          f"Avg Round: {avg_round}")
    print(f"& {avg_accuracy} & {avg_round} & {avg_cost} ")

base-gpt-4o v1
Avg Accuracy: 0.293, Avg Cost: 0.23517 Avg Round: 14.154
& 0.293 & 14.154 & 0.23517 
strategy-gpt-4o v2
Error calculating usage: usage summary None
Avg Accuracy: 0.273, Avg Cost: 0.17797 Avg Round: 10.555
& 0.273 & 10.555 & 0.17797 
expel-gpt-4o v2
Avg Accuracy: 0.39, Avg Cost: 0.37938 Avg Round: 8.684
& 0.39 & 8.684 & 0.37938 
react-gpt-4o v2
Avg Accuracy: 0.354, Avg Cost: 0.23712 Avg Round: 10.16
& 0.354 & 10.16 & 0.23712 
base-4o-mini v1
Avg Accuracy: 0.192, Avg Cost: 0.00922 Avg Round: 12.91
& 0.192 & 12.91 & 0.00922 
strategy-4o-mini v2
Avg Accuracy: 0.29, Avg Cost: 0.01048 Avg Round: 11.221
& 0.29 & 11.221 & 0.01048 
expel-4o-mini v2
Avg Accuracy: 0.311, Avg Cost: 0.02298 Avg Round: 8.43
& 0.311 & 8.43 & 0.02298 
react-4o-mini v2
Avg Accuracy: 0.274, Avg Cost: 0.01551 Avg Round: 9.716
& 0.274 & 9.716 & 0.01551 


In [None]:
def analysis_3_trials(data: dict, verbose: bool = False):
    """
    • Aggregates rounds & tokens over every trial.
    • For each trial, picks the *last* model in `usage_summary`
      when adding prompt/completion tokens.
    • Submission / evaluation flags inspected only on the last trial
      (per earlier request).
    """
    total_len = len(data)
    total_reward = total_round = 0
    success_count = non_zero_reward_count = not_submit_count = 0
    path_count, reward_count, round_count = {}, {}, {}
    eval_error_count = fail_to_run_count = 0
    total_prompt_tokens = total_completion_tokens = total_cost = 0
    empty_result_count = error_query_count = query_count = 0  # placeholders

    for q in data:
        p_len = len(q["question_dict"]["shortest_alert_path"])
        path_count.setdefault(p_len, 0)
        reward_count.setdefault(p_len, 0)
        round_count.setdefault(p_len, 0)

        # reward bookkeeping (once per question)
        reward = q["reward"]
        total_reward += reward
        reward_count[p_len] += reward
        if reward > 0:
            non_zero_reward_count += 1
        if reward == 1:
            success_count += 1
        path_count[p_len] += 1

        # ---- iterate over ALL trials ------------------------------------
        for trial in q["trials"].values():
            msgs = trial.get("messages", [])
            tmp_round = max((len(msgs) - 1) // 2, 0)
            total_round += tmp_round
            round_count[p_len] += tmp_round

            # === NEW token logic: use *last* model =======================
            usage_summary = trial.get("usage_summary", {})
            if usage_summary:                                   # guard empty dict
                try:
                    mdl = list(usage_summary.keys())[-1]        # last‑listed model
                    total_prompt_tokens     += usage_summary[mdl].get("prompt_tokens", 0)
                    total_completion_tokens += usage_summary[mdl].get("completion_tokens", 0)
                except Exception as e:
                    print(f"[usage error] q nodes={q['nodes']} trial={mdl}: {e}")

        # ---- submission / evaluation checks on final trial -------------
        last_trial = list(q["trials"].values())[-1]
        if not last_trial["info"].get("submit"):
            not_submit_count += 1
        elif not (last_trial["info"].get("is_json_success", True)
                  and last_trial["info"].get("is_reflect_success", True)):
            print(f"[eval error] q nodes={q['nodes']}: {last_trial['info']}")
            eval_error_count += 1

    # ---- optional console summary --------------------------------------
    if verbose:
        print(f"Average reward: {total_reward / total_len:.6f}")
        print(f"Average round:  {total_round  / total_len:.6f}")

    return {
        "total_len": total_len,
        "total_reward": total_reward,
        "success_count": success_count,
        "non_zero_reward_count": non_zero_reward_count,
        "not_submit_count": not_submit_count,
        "total_round": total_round,
        "total_prompt_tokens": total_prompt_tokens,
        "total_completion_tokens": total_completion_tokens,
        "total_cost": total_cost,   # still placeholder
        "empty_result_count": empty_result_count,
        "error_query_count": error_query_count,
        "query_count": query_count,
        "eval_error_count": eval_error_count,
        "fail_to_run_count": fail_to_run_count,
    }

def get_avg_accuracy_and_cost_3_trial(log_path, file_folder, model_name, version="v1", round_cut=-1):
    d = 1_000_000  # price is per 1M tokens
    prices = {
        "gpt-4o": (2.5 / d, 10 / d),
        "gpt-4o-mini": (0.15 / d, 0.6 / d),
        "o1-mini": (1.1 / d, 4.4 / d),
        "o1": (15 / d, 7.5 / d),
        "o3-mini": (1.1 / d, 4.4 / d),
        "deepseek-r1": (0.55 / d, 0.27 / d),
        "phi4": (0.07/ d, 0.14/ d),
    }

    prompt_price, completion_price = prices[model_name]
    file_template = f"{log_path}/{file_folder}" + "/agent_incident_{0}.json"

    total_count = 0
    total_reward = 0
    total_prompt_tokens = 0
    total_completion_tokens = 0
    total_round = 0

    incidents = [5, 34, 38, 39, 55, 134, 166, 322]
    for i in incidents:
        with open(file_template.format(i), "r") as f:
            data = json.load(f)

        if version == "v2":
            result = analysis_3_trials(data)
        else:
            raise ValueError("Unknown version: {}".format(version))

        total_count += result['total_len']
        total_reward += result['total_reward']
        total_prompt_tokens += result['total_prompt_tokens']
        total_completion_tokens += result['total_completion_tokens']
        total_round += result['total_round']

    avg_accuracy = round(total_reward / total_count, 3)
    avg_cost = round(
        (total_prompt_tokens * prompt_price + total_completion_tokens * completion_price)
        / total_count,
        3,
    )
    avg_round = round(total_round / total_count, 1)

    return avg_accuracy, avg_cost, avg_round

tab2_3_trial = {
  "strategy-gpt-4o": ("PromptSauceAgent_gpt-4o_c345_alert_level_t0_s15_trial3", "v2"),
  "strategy-reflect-gpt-4o": ("PromptSauceReflexionAgent_gpt-4o_c234_alert_level_t0_s15_trial3", "v2"),
  "react-gpt-4o": ("ReactAgent_gpt-4o_c121_alert_level_t0_s15_trial3", "v2"),
  "react-reflect-gpt-4o": ("ReActReflexionAgent_gpt-4o_c378_alert_level_t0_s15_trial3", "v2"),

  "strategy-4o-mini": ("PromptSauceAgent_4o-mini_c34_alert_level_t0_s15_trial3", "v2"),
  "strategy-reflect-4o-mini": ("PromptSauceReflexionAgent_4o-mini_c234_alert_level_t0_s15_trial3", "v2"),
  "react-reflexion-4o-mini": ("ReActReflexionAgent_4o-mini_c378_alert_level_t0_s15_trial3", "v2"),
  "react-4o-mini": ("ReactAgent_4o-mini_c124_alert_level_t0_s15_trial3", "v2"),
}

log_path = "../secgym/final_results"
for name, b in tab2_3_trial.items():
    if type(b) == tuple:
        file_folder, version = b
    else:
        file_folder = b
        version = "v1"
    if "gpt-4o" in name:
        model_name = "gpt-4o"
    elif "4o-mini" in name:
        model_name = "gpt-4o-mini"
    else:
        raise ValueError(f"Unknown model name: {name}")
    
    print(name, version)
    avg_accuracy, avg_cost, avg_round = get_avg_accuracy_and_cost_3_trial(log_path, file_folder, model_name, version=version)
    print(f"Avg Accuracy: {avg_accuracy}, Avg Cost: {avg_cost}", 
          f"Avg Round: {avg_round}")
    print(f"& {avg_accuracy} & {avg_round} & {avg_cost} ")
    print("===================")

strategy-gpt-4o v2
Avg Accuracy: 0.473, Avg Cost: 0.36471 Avg Round: 26.066
& 0.473 & 26.066 & 0.36471 
strategy-reflect-gpt-4o v2
Avg Accuracy: 0.505, Avg Cost: 0.46978 Avg Round: 24.156
& 0.505 & 24.156 & 0.46978 
react-gpt-4o v2
Avg Accuracy: 0.563, Avg Cost: 0.48833 Avg Round: 20.233
& 0.563 & 20.233 & 0.48833 
react-reflect-gpt-4o v2
Avg Accuracy: 0.563, Avg Cost: 0.49576 Avg Round: 20.1
& 0.563 & 20.1 & 0.49576 
strategy-4o-mini v2
Avg Accuracy: 0.418, Avg Cost: 0.02785 Avg Round: 26.316
& 0.418 & 26.316 & 0.02785 
strategy-reflect-4o-mini v2
Avg Accuracy: 0.44, Avg Cost: 0.02775 Avg Round: 25.866
& 0.44 & 25.866 & 0.02775 
react-reflexion-4o-mini v2
Avg Accuracy: 0.452, Avg Cost: 0.0351 Avg Round: 23.392
& 0.452 & 23.392 & 0.0351 
react-4o-mini v2
Avg Accuracy: 0.423, Avg Cost: 0.03567 Avg Round: 23.757
& 0.423 & 23.757 & 0.03567 


In [None]:
tab2_4o = [
    "BaselineAgent_gpt-4o_c70_alert_level_t0_s25_trial1",
    "PromptSauceAgent_gpt-4o_c83_alert_level_t0_s15_trial2",
    "ReflexionAgent_gpt-4o_c82_alert_level_t0_s15_trial3",
    "ReactAgent_gpt-4o_c121_alert_level_t0_s15_trial1"

]

tab2_4o_mini = [
    "BaselineAgent_4o-mini_c71_alert_level_t0_s25_trial1",
    "PromptSauceAgent_4o-mini_c79_alert_level_t0_s15_trial2",
    "ReflexionAgent_4o-mini_c80_alert_level_t0_s15_trial3"
]

for b in tab2_4o:
    print(b)
    get_over_leaf_format(log_path, b, round_cut=15)

print("-"*40)
for b in tab2_4o_mini:
    print(b)
    get_over_leaf_format(log_path, b, round_cut=15)

BaselineAgent_gpt-4o_c70_alert_level_t0_s25_trial1
589
& 0.304 & 0.268 & 0.364 & 0.238 & 0.218 & 0.439 & 0.126 & 0.304 & 0.261 
PromptSauceAgent_gpt-4o_c83_alert_level_t0_s15_trial2
589
& 0.257 & 0.339 & 0.273 & 0.363 & 0.274 & 0.246 & 0.303 & 0.375 & 0.306 
ReflexionAgent_gpt-4o_c82_alert_level_t0_s15_trial3
589
& 0.469 & 0.537 & 0.455 & 0.417 & 0.344 & 0.463 & 0.379 & 0.606 & 0.447 
ReactAgent_gpt-4o_c121_alert_level_t0_s15_trial1
589
& 0.365 & 0.383 & 0.091 & 0.308 & 0.324 & 0.526 & 0.264 & 0.446 & 0.354 
----------------------------------------
BaselineAgent_4o-mini_c71_alert_level_t0_s25_trial1
589
& 0.132 & 0.188 & 0.273 & 0.169 & 0.164 & 0.218 & 0.12 & 0.257 & 0.172 
PromptSauceAgent_4o-mini_c79_alert_level_t0_s15_trial2
589
& 0.294 & 0.39 & 0.182 & 0.196 & 0.185 & 0.193 & 0.246 & 0.268 & 0.251 
ReflexionAgent_4o-mini_c80_alert_level_t0_s15_trial3
589
& 0.433 & 0.488 & 0.182 & 0.459 & 0.26 & 0.491 & 0.492 & 0.539 & 0.435 


In [4]:

# PromptSauceAgent_gpt-4o_c345_alert_level_t0_s15_trial1
# PromptSauceAgent_gpt-4o_c345_alert_level_t0_s15_trial3
# PromptSauceReflexionAgent_gpt-4o_c234_alert_level_t0_s15_trial3
# ReactAgent_gpt-4o_c121_alert_level_t0_s15_trial1
# ReactAgent_gpt-4o_c121_alert_level_t0_s15_trial3
# ReActReflexionAgent_gpt-4o_c378_alert_level_t0_s15_trial3
tab2_4o = {
    "Base+Strategy Prompt": "PromptSauceAgent_gpt-4o_c345_alert_level_t0_s15_trial1",
    "ReAct": "ReactAgent_gpt-4o_c121_alert_level_t0_s15_trial1",
    "Base+Strategy Prompt x3": "PromptSauceAgent_gpt-4o_c345_alert_level_t0_s15_trial3",
    "ReAct x3": "ReactAgent_gpt-4o_c121_alert_level_t0_s15_trial3",
    "ReAct+Reflexion": "ReActReflexionAgent_gpt-4o_c378_alert_level_t0_s15_trial3",
    "Base+Strategy Prompt+Reflexion": "PromptSauceReflexionAgent_gpt-4o_c234_alert_level_t0_s15_trial3",
    "Expel" : "ExpelAgent_4o-mini_c393_alert_level_t0_s15_trial1"
}

# [
#     "BaselineAgent_gpt-4o_c70_alert_level_t0_s25_trial1",
#     "PromptSauceAgent_gpt-4o_c83_alert_level_t0_s15_trial2",
#     "ReflexionAgent_gpt-4o_c82_alert_level_t0_s15_trial3",
#     "ReactAgent_gpt-4o_c121_alert_level_t0_s15_trial1"
# ]

# PromptSauceAgent_4o-mini_c34_alert_level_t0_s15_trial1
# PromptSauceAgent_4o-mini_c34_alert_level_t0_s15_trial3
# ReactAgent_4o-mini_c124_alert_level_t0_s15_trial3
# ReactAgent_4o-mini_c131_alert_level_t0_s15_trial1
# ReActReflexionAgent_4o-mini_c378_alert_level_t0_s15_trial3
# PromptSauceReflexionAgent_4o-mini_c234_alert_level_t0_s15_trial3

tab2_4o_mini = {
    "Base+Strategy Prompt": "PromptSauceAgent_4o-mini_c34_alert_level_t0_s15_trial1",
    "ReAct": "ReactAgent_4o-mini_c131_alert_level_t0_s15_trial1",
    "Base+Strategy Prompt x3": "PromptSauceAgent_4o-mini_c34_alert_level_t0_s15_trial3",
    "ReAct x3": "ReactAgent_4o-mini_c124_alert_level_t0_s15_trial3",
    "ReAct+Reflexion": "ReActReflexionAgent_4o-mini_c378_alert_level_t0_s15_trial3",
    "Base+Strategy Prompt+Reflexion": "PromptSauceReflexionAgent_4o-mini_c234_alert_level_t0_s15_trial3",
    "Expel" : "ExpelAgent_gpt-4o_c392_alert_level_t0_s15_trial1"
}

log_path = "../secgym/final_results"    

# get_over_leaf_format(log_path, "PromptSauceAgent_4o-mini_c34_alert_level_t0_s15_trial1")

for k, v in tab2_4o.items():
    print(k)
    get_over_leaf_format(log_path, v, round_cut=15)

print("-"*40)
for k, v in tab2_4o_mini.items():
    print(k)
    get_over_leaf_format(log_path, v, round_cut=15)

Base+Strategy Prompt
589
& 0.324 & 0.334 & 0.455 & 0.298 & 0.224 & 0.27 & 0.257 & 0.311 & 0.29 
ReAct
589
& 0.365 & 0.383 & 0.091 & 0.308 & 0.324 & 0.526 & 0.264 & 0.446 & 0.354 
Base+Strategy Prompt x3
589
& 0.457 & 0.515 & 0.364 & 0.469 & 0.414 & 0.526 & 0.46 & 0.543 & 0.473 
ReAct x3
589
& 0.614 & 0.554 & 0.455 & 0.473 & 0.554 & 0.737 & 0.517 & 0.574 & 0.563 
ReAct+Reflexion
No usage summary, skipping 82-82, trial 1
No usage summary, skipping 82-82, trial 2
No usage summary, skipping 156-138, trial 0
No usage summary, skipping 156-138, trial 1
No usage summary, skipping 156-138, trial 2
589
& 0.581 & 0.573 & 0.818 & 0.449 & 0.53 & 0.737 & 0.538 & 0.585 & 0.563 
Base+Strategy Prompt+Reflexion
No usage summary, skipping 137-136, trial 0
589
& 0.48 & 0.524 & 0.545 & 0.491 & 0.474 & 0.579 & 0.483 & 0.554 & 0.505 
Expel
589
& 0.371 & 0.341 & 0.182 & 0.293 & 0.26 & 0.298 & 0.269 & 0.382 & 0.311 
----------------------------------------
Base+Strategy Prompt
No usage summary, skipping 120-1

## Combine trials to get Best of 3 trials for baseline

In [15]:

def combine_results(t2step15_data, t1step25data, round_cut=-1):
    total_count = 0
    total_reward = 0

    for i in range(len(t2step15_data)):
        d1 = t2step15_data[i]
        d2 = t1step25data[i]
        if d1.get("usage_summary") is None or d2.get("usage_summary") is None:
            print(f"Missing usage summary for {i}")
            continue
        if round_cut != -1 and (len(d2["messages"]) - 1) // 2 > round_cut:
            # print(f"Cutting off at round {round_cut}")
            d2['reward'] = 0
        total_reward += max(d1['reward'], d2['reward']) # best of 3
        total_count += 1

    return {
        "total_len": total_count,
        "total_reward": total_reward
    }


def get_over_leaf_format_for_2(log_path, t2step15_file, t1step25_file, round_cut=-1):
    t2step15_file_template = f"{log_path}/{t2step15_file}" + "/agent_incident_{0}.json"
    t1step25_file_template = f"{log_path}/{t1step25_file}" + "/agent_incident_{0}.json"

    total_count = 0
    total_reward = 0
    accs_str = ""

    incidents = [5, 34, 38, 39, 55, 134, 166, 322]
    for i in incidents:
        # print(f"Analysis for incident {i}")
        with open(t2step15_file_template.format(i), "r") as f:
            data1 = json.load(f)
        with open(t1step25_file_template.format(i), "r") as f:
            data2 = json.load(f)

        result = combine_results(data1, data2, round_cut=round_cut)
        accs_str += "& " + str(round(result['total_reward']/result['total_len'], 3)) + " "

        total_count += result['total_len']
        total_reward += result['total_reward']
    
    accs_str += "& " + str(round(total_reward/total_count, 3)) + " "
    print(accs_str)


get_over_leaf_format_for_2(log_path, 
                           "PromptSauceAgent_4o-mini_c79_alert_level_t0_s15_trial2",
                           "PromptSauceAgent_4o-mini_c73_alert_level_t0_s25_trial1",
                           round_cut=15)
    

& 0.359 & 0.51 & 0.4 & 0.301 & 0.237 & 0.333 & 0.366 & 0.382 & 0.351 


In [16]:
get_over_leaf_format_for_2(log_path, 
                           "PromptSauceAgent_gpt-4o_c83_alert_level_t0_s15_trial2",
                           "PromptSauceAgent_gpt-4o_c72_alert_level_t0_s25_trial1",
                           round_cut=15)

& 0.353 & 0.48 & 0.364 & 0.455 & 0.348 & 0.368 & 0.407 & 0.471 & 0.408 


## GPT-4o and GPT-4o-mini with responding to steps

In [64]:
gpt4omini_results = []
for r in [5, 10, 15, 20, 25]:
    avg_reward = get_over_leaf_format(log_path, "BaselineAgent_4o-mini_c71_alert_level_t0_s25_trial1", round_cut=r)
    gpt4omini_results.append(avg_reward)

& 0.02 & 0.024 & 0.083 & 0.02 & 0.014 & 0.017 & 0.023 & 0.035 & 0.022 & 0.009 
& 0.074 & 0.174 & 0.083 & 0.12 & 0.074 & 0.138 & 0.101 & 0.236 & 0.122 & 0.009 
& 0.144 & 0.198 & 0.25 & 0.157 & 0.144 & 0.224 & 0.129 & 0.323 & 0.179 & 0.009 
& 0.194 & 0.209 & 0.25 & 0.181 & 0.184 & 0.241 & 0.179 & 0.376 & 0.213 & 0.009 
& 0.194 & 0.209 & 0.25 & 0.197 & 0.184 & 0.241 & 0.179 & 0.386 & 0.217 & 0.009 


In [65]:
gpt40_results = []
for r in [5, 10, 15, 20, 25]:
    avg_reward = get_over_leaf_format(log_path, "BaselineAgent_gpt-4o_c70_alert_level_t0_s25_trial1", round_cut=r)
    gpt40_results.append(avg_reward)

& 0.1 & 0.129 & 0.0 & 0.112 & 0.03 & 0.069 & 0.034 & 0.105 & 0.08 & 0.464 
& 0.202 & 0.24 & 0.083 & 0.182 & 0.182 & 0.31 & 0.103 & 0.228 & 0.197 & 0.464 
& 0.312 & 0.311 & 0.25 & 0.233 & 0.212 & 0.414 & 0.126 & 0.298 & 0.262 & 0.464 
& 0.35 & 0.311 & 0.25 & 0.243 & 0.212 & 0.483 & 0.161 & 0.333 & 0.285 & 0.464 
& 0.37 & 0.334 & 0.25 & 0.273 & 0.215 & 0.483 & 0.172 & 0.333 & 0.299 & 0.464 


In [None]:
import matplotlib.pyplot as plt

import matplotlib.pyplot as plt

def plot_turns_vs_reward(model_rewards: dict, steps: list, output_filename: str):
    """
    Plots the relation between the number of interaction turns and rewards for different models.

    Parameters:
    - model_rewards (dict): A dictionary where keys are model names and values are lists of rewards.
    - steps (list): A list of interaction turns (e.g., [5, 10, 15, 20, 25]).
    - output_filename (str): The filename for saving the output plot.
    """
    plt.figure(figsize=(7, 5), dpi=300)  # High-resolution figure

    markers = ['o', 's', 'D', '^', 'v', '*', 'P', 'X']  # Different markers for distinction
    linestyles = ['-', '--', '-.', ':']  # Variety of line styles
    colors = plt.get_cmap("tab10").colors  # Professional color palette

    for i, (model, rewards) in enumerate(model_rewards.items()):
        plt.plot(
            steps,
            rewards,
            marker=markers[i % len(markers)], 
            linestyle=linestyles[i % len(linestyles)], 
            color=colors[i % len(colors)],
            linewidth=2,
            markersize=6,
            label=model
        )

    plt.xlabel("Number of Interaction Turns", fontsize=18)
    plt.ylabel("Reward", fontsize=18)
    plt.xticks(steps, fontsize=15)  # Explicitly set x-ticks to given steps
    plt.yticks(fontsize=15)
    plt.legend(fontsize=17, loc="best", frameon=True)
    plt.grid(True, linestyle="--", alpha=0.6)

    plt.tight_layout()
    plt.savefig(output_filename, format="png", bbox_inches="tight", dpi=300)
    plt.show()
# Example usage:
model_rewards = {
    "GPT-4o-mini": gpt4omini_results,
    "GPT-4o": gpt40_results
}
steps = [5, 10, 15, 20, 25]
plot_turns_vs_reward(model_rewards, steps, "reward_vs_turns.png")

## Reward with respect to steps

In [80]:
def get_over_leaf_format_path_wise(log_path, file_folder, round_cut=-1):
    file_template = f"{log_path}/{file_folder}" + "/agent_incident_{0}.json"

    total_count = 0
    total_reward = 0
    total_success_count = 0
    total_cost = 0
    total_prompt_tokens = 0
    total_completion_tokens = 0

    accs_str = ""

    data_by_path_count = {}
    incidents = [5, 34, 38, 39, 55, 134, 166, 322]
    for i in incidents:
        # print(f"Analysis for incident {i}")
        with open(file_template.format(i), "r") as f:
            data = json.load(f)
        for d in data:
            p = len(d['question_dict']['shortest_alert_path'])
            if p not in data_by_path_count:
                data_by_path_count[p] = []
            data_by_path_count[p].append(d)

    keys = list(data_by_path_count.keys())
    keys.sort()

    count_str = ""
    for p in keys:
        data = data_by_path_count[p]
        count_str += f"& {len(data)} "
        result = analysis(data, False, round_cut=round_cut)
        accs_str += "& " + str(round(result['total_reward']/result['total_len'], 3)) + " "
        total_count += result['total_len']
        total_reward += result['total_reward']
        total_success_count += result['success_count']
        total_cost += result['total_cost']
        total_prompt_tokens += result['total_prompt_tokens']
        total_completion_tokens += result['total_completion_tokens']
    
    accs_str += "& " + str(round(total_reward/total_count, 3)) + " "
    accs_str += "& " + str(round(total_cost/total_count, 3)) + " "

    pl_str = ""
    for k in keys:
        pl_str += f"& {k} "
    
    print(pl_str)
    print(count_str)    

    print(accs_str)
    return round(total_reward/total_count, 3)

In [1]:
import os
import json
import shutil

# Define base directories
base = "/Users/kevin/Downloads/SecRL/secgym/env/questions/min_overlap/test"
new_base = "/Users/kevin/Downloads/SecRL/secgym/env/questions/min_overlap/test_filtered"

# Ensure new folder exists
os.makedirs(new_base, exist_ok=True)

removed_count = 0

for file in os.listdir(base):
    if file.endswith(".json"):
        file_path = os.path.join(base, file)

        with open(file_path, "r") as f:
            data = json.load(f)

        # Filter out questions with context < 50
        filtered_data = [q for i, q in enumerate(data) if len(q['context']) >= 50]

        # Remove question 9 if it's incident_9.json
        if file == "incident_9.json" and len(filtered_data) > 9:
            del filtered_data[9]  # Question 9 is index 9

        removed_count += len(data) - len(filtered_data)

        # Save to new folder
        new_file_path = os.path.join(new_base, file)
        with open(new_file_path, "w") as f:
            json.dump(filtered_data, f, indent=4)

print(f"Total questions removed: {removed_count}")
print(f"Filtered files saved to: {new_base}")

Total questions removed: 0
Filtered files saved to: /Users/kevin/Downloads/SecRL/secgym/env/questions/min_overlap/test_filtered


In [85]:
get_over_leaf_format_path_wise(log_path, "BaselineAgent_4o-mini_c71_alert_level_t0_s25_trial1", round_cut=25)

& 1 & 3 & 5 & 7 & 9 
& 46 & 423 & 98 & 31 & 1 
& 0.391 & 0.217 & 0.158 & 0.117 & 1.0 & 0.217 & 0.009 


0.217

In [86]:
get_over_leaf_format_path_wise(log_path, "BaselineAgent_gpt-4o_c70_alert_level_t0_s25_trial1", round_cut=25)

& 1 & 3 & 5 & 7 & 9 
& 46 & 423 & 98 & 31 & 1 
& 0.304 & 0.294 & 0.284 & 0.392 & 1.0 & 0.299 & 0.464 


0.299

In [83]:
get_over_leaf_format_path_wise(log_path, "PromptSauceAgent_4o-mini_c73_alert_level_t0_s25_trial1", round_cut=15)

& 1 & 3 & 5 & 7 & 9 
& 46 & 423 & 98 & 31 & 1 
& 0.217 & 0.253 & 0.228 & 0.196 & 1.0 & 0.244 & 0.016 


0.244

In [84]:
get_over_leaf_format_path_wise(log_path, "PromptSauceAgent_4o-mini_c79_alert_level_t0_s15_trial2", round_cut=15)

& 1 & 3 & 5 & 7 & 9 
& 46 & 423 & 98 & 31 & 1 
& 0.37 & 0.261 & 0.196 & 0.228 & 1.0 & 0.258 & 0.02 


0.258

In [2]:
get_over_leaf_format_path_wise(log_path, "ReactAgent_gpt-4o_c121_alert_level_t0_s15_trial1", round_cut=15)

NameError: name 'get_over_leaf_format_path_wise' is not defined