In [12]:
import random
random.seed(0)
import json

def analysis(data:dict, verbose:bool=False):
    """Analyze the data and print out the results

    Args:
        data (dict): The data to be analyzed, load an "agent log" json file
        
    """
    total_len = len(data)
    total_reward = 0
    total_round = 0
    
    success_count = 0 # reward == 1
    non_zero_reward_count = 0 # reward > 0
    submit_count = 0

    path_count = {}
    reward_count = {} # reward for each difficulty
    round_count = {}
    eval_error_count = 0

    # cost
    total_cost = 0
    total_prompt_tokens = 0
    total_completion_tokens = 0

    # query efficiency
    empty_result_count = 0
    error_query_count = 0
    query_count = 0

    success_non_empty_query_count = 0
    potential_good_example_count = 0

    fail_to_run_count = 0
    max_round = 0
    min_round = 1000


    for ii, k in enumerate(data):
        if k.get("usage_summary") is None:
            print("no usage summary", ii)
            fail_to_run_count += 1
            continue

        p = len(k['question_dict']['shortest_alert_path'])
        if p not in path_count:
            path_count[p] = 0
            reward_count[p] = 0
            round_count[p] = 0

        if k['reward'] > 0:
            non_zero_reward_count += 1
        if k['reward'] == 1:
            success_count += 1
        
        # total
        total_reward += k['reward']
        total_round += (len(k["messages"]) - 1) // 2

        path_count[p] += 1
        reward_count[p] += k['reward']
        round_count[p] += (len(k["messages"]) - 1) // 2
        max_round = max(max_round, (len(k["messages"]) - 1) // 2)
        min_round = min(min_round, (len(k["messages"]) - 1) // 2)
        submit_count += k['info'].get("submit", 0)

        if "error" in k['info'] or k['info'].get("is_json_success") == False or k['info'].get("is_reflect_success") == False:
            eval_error_count += 1

        try:
            total_cost += k['usage_summary']['total_cost']
        except Exception as e:
            print(k)
            raise e

        model = list(k['usage_summary'].keys())[-1]
        total_prompt_tokens += k['usage_summary'][model]['prompt_tokens']
        total_completion_tokens += k['usage_summary'][model]['completion_tokens']

        success_select_count_per_problem = 0
        for i, m in enumerate(k['messages']):
            content = m['content']
            if m['role'] == "user":
                if "ProgrammingError" in content or "DataError" in content:
                    error_query_count += 1
                elif content == "[]" or content == "":
                    empty_result_count += 1
                elif i > 0 and k['messages'][i-1]['role'] == "assistant" and  "execute[SELECT" in k['messages'][i-1]['content']: 
                    success_select_count_per_problem += 1
            if m['role'] == "assistant" and "Action: execute[" in content:
                query_count += 1
            
        if success_select_count_per_problem >= 4:
            potential_good_example_count += 1
    print(f"Max round: {max_round}, Min round: {min_round}")
                
    if verbose:
        print(f"Average reward: {total_reward}/{total_len} = {round(total_reward/total_len,6)}")
        print(f"Average round: {total_round}/{total_len} = {round(total_round/total_len,6)}")

        # sorted_keys = sorted(path_count.keys())
        # for k in sorted_keys:
        #     print(f"Difficulty {k}: {round(reward_count[k], 2)}/{path_count[k]} = {round(reward_count[k]/path_count[k],6)} | Avg round: {round(round_count[k]/path_count[k], 2)}")


    return {
        "total_len": total_len,
        "potential_good_example_count": potential_good_example_count,

        # Performance Analysis
        "total_reward": total_reward,
        "success_count": success_count, # reward==1
        "non_zero_reward_count": non_zero_reward_count, # for evaluator usefullness
        "submit_count": submit_count,

        # Efficiency / Computational Analysis
        "total_cost": total_cost,
        "total_prompt_tokens": total_prompt_tokens,
        "total_completion_tokens": total_completion_tokens,
        "total_round": total_round,

        # Query Efficiency
        "empty_result_count": empty_result_count,
        "error_query_count": error_query_count,
        "query_count": query_count,

        # For difficulty
        "path_count": path_count,
        "round_count": round_count,
        "reward_count": reward_count,

        # Evaluation Analysis
        "eval_error_count": eval_error_count,

        "fail_to_run_count": fail_to_run_count
    }


def print_analysis(result_dict:dict, head:str=None):
    print("*"*40)
    if head:
        print(f"{head} analysis")
    problem_count = result_dict['total_len']

    print(f"Average reward: {result_dict['total_reward']}/{problem_count} = {round(result_dict['total_reward']/problem_count,4)}")
    print(f"Success rate: {result_dict['success_count']}/{problem_count} = {round(result_dict['success_count']/problem_count * 100,2)}%")
    
    # Computational Analysis
    print(f"** Computational Analysis:")
    print(f"Average round: {result_dict['total_round']}/{problem_count} = {round(result_dict['total_round']/problem_count,4)}")
    print(f"Average cost: {result_dict['total_cost']}/{problem_count} = {round(result_dict['total_cost']/problem_count,4)}")
    print(f"Average prompt tokens: {result_dict['total_prompt_tokens']}/{problem_count} = {round(result_dict['total_prompt_tokens']/problem_count,4)}")
    print(f"Average completion tokens: {result_dict['total_completion_tokens']}/{problem_count} = {round(result_dict['total_completion_tokens']/problem_count,4)}")

    print("** Query Efficiency: ")
    success_query_count = result_dict['query_count'] - result_dict['error_query_count']
    print(f'Success Query rate: {success_query_count}/{result_dict["query_count"]} = {round(success_query_count/result_dict["query_count"] * 100,2)}%')
    print(f'Success Non-Empty Query rate: {success_query_count-result_dict["empty_result_count"]}/{result_dict["query_count"]} = {round((success_query_count-result_dict["empty_result_count"])/result_dict["query_count"] * 100,2)}%')
    
    print(f"Potential Good Example Count: {result_dict['potential_good_example_count']} / {problem_count}")
    print(f"Fail to run count: {result_dict['fail_to_run_count']}")

def analysis_one_run(log_path, file_template, print_total=False, sample_size=-1):

    incidents = [5, 34, 38, 39, 55, 134, 166, 322]

    total_len = 0
    total_reward = 0
    total_round = 0
    path_count = {}
    reward_count = {}

    all_data = []
    for i in incidents:
        if not print_total:
            print("*"*20)
            print(f"Analysis for incident {i}")

        a = open(f"{log_path}/{file_template.format(i)}", "r")
        b = json.load(a)
        all_data.extend(b)
    print(len(all_data))
        
    random.shuffle(all_data)
    if sample_size > 0:
        # sample_size is the size for each difficulty, For difficulty 1 3 5 7 9, we will sample sample_size number of incidents
        # go through the data and sample sample_size number for each difficulty
        # go through each problem, if the difficulty reaches sample_size, won't append it
        sampled_data = []
        sampled_count = {}
        for k in all_data:
            p = len(k['question_dict']['shortest_alert_path'])
            if p not in sampled_count:
                sampled_count[p] = 0
            if sampled_count[p] < sample_size:
                sampled_data.append(k)
                sampled_count[p] += 1
        all_data = sampled_data
        
    result_dict = analysis(all_data, not print_total)

    total_len += result_dict['total_len']
    total_reward += result_dict['total_reward']
    total_round += result_dict['total_round']
    for k in result_dict['path_count']:
        if k not in path_count:
            path_count[k] = 0
            reward_count[k] = 0

        path_count[k] += result_dict['path_count'][k]
        reward_count[k] += result_dict['reward_count'][k]
    if not print_total:
        print("*"*20)
        
    if print_total:
        print("*"*40)
        print("*"*40)
        print("Total analysis")
        print(f"Total length: {total_len}")
        print(f"Total reward: {total_reward}")
        print(f"Total round: {total_round}")
        print(f"Average reward: {total_reward}/{total_len} = {round(total_reward/total_len,6)}")
        print(f"Average success rate: {result_dict['success_count']}/{total_len} = {round(result_dict['success_count']/total_len,6)}")
        print(f"Average round: {total_round}/{total_len} = {round(total_round/total_len,6)}")

        sorted_keys = sorted(path_count.keys())
        for k in sorted_keys:
            print(f"Difficulty {k}: {round(reward_count[k], 2)}/{path_count[k]} = {round(reward_count[k]/path_count[k],6)}")


def get_correct_problem_ids(log_path, file_template, incident_id):
    a = open(f"{log_path}/{file_template.format(incident_id)}", "r")
    b = json.load(a)
    correct_ids = []
    for k in b:
        if k['reward'] == 1:
            correct_ids.append(k['question_id'])
    return correct_ids


In [13]:

def get_over_leaf_format(log_path, file_folder):
    file_template = f"{log_path}/{file_folder}" + "/agent_incident_{0}.json"

    total_count = 0
    total_reward = 0
    total_success_count = 0
    total_cost = 0
    total_prompt_tokens = 0
    total_completion_tokens = 0
    total_round = 0

    accs_str = ""

    incidents = [5, 34, 38, 39, 55, 134, 166, 322]
    for i in incidents:
        print(f"Analysis for incident {i}")
        with open(file_template.format(i), "r") as f:
            data = json.load(f)
        result = analysis(data, False)
        accs_str += "& " + str(round(result['total_reward']/result['total_len'], 3)) + " "

        total_count += result['total_len']
        total_reward += result['total_reward']
        total_success_count += result['success_count']
        total_cost += result['total_cost']
        total_prompt_tokens += result['total_prompt_tokens']
        total_completion_tokens += result['total_completion_tokens']
        total_round += result['total_round']
    
    accs_str += "& " + str(round(total_reward/total_count, 3)) + " "
    accs_str += "& " + str(round(total_cost/total_count, 3)) + " "
    accs_str += "& " + str(round(total_round/total_count)) + " //"
    print(accs_str)

In [55]:
import os 

log_path = "../secgym/final_results"
dirs = os.listdir("../secgym/final_results")
print(dirs)
for d in dirs:
    # check if directory
    if not "alert_level_" in d:
        continue
    print(d)
    get_over_leaf_format(log_path, d)
    print("*"*40)

['BaselineAgent_gpt-4o_c77_log_level_t0', 'PromptSauceAgent_gpt-4o_c72_alert_level_t0', 'BaselineAgent_4o-mini_c78_log_level_t0', 'PromptSauceAgent_gpt-4o_c75_log_level_t0', 'PromptSauceAgent_4o-mini_c74_log_level_t0', 'BaselineAgent_4o-mini_c71_alert_level_t0', 'PromptSauceAgent_4o-mini_c73_alert_level_t0', 'BaselineAgent_gpt-4o_c70_alert_level_t0']
PromptSauceAgent_gpt-4o_c72_alert_level_t0
Analysis for incident 5
Max round: 25, Min round: 4
Analysis for incident 34
Max round: 25, Min round: 1
Analysis for incident 38
Max round: 25, Min round: 6
Analysis for incident 39
Max round: 25, Min round: 4
Analysis for incident 55
Max round: 25, Min round: 4
Analysis for incident 134
Max round: 25, Min round: 5
Analysis for incident 166
Max round: 25, Min round: 3
Analysis for incident 322
Max round: 25, Min round: 5
& 0.364 & 0.506 & 0.333 & 0.366 & 0.222 & 0.328 & 0.31 & 0.385 & 0.351 & 0.553 & 14 //
****************************************
BaselineAgent_4o-mini_c71_alert_level_t0
Analysis 

In [8]:
incidents = [5, 34, 38, 39, 55, 134, 166, 322]

log_path = "../secgym/final_results"

file_name = "BaselineAgent_gpt-4o_c70_alert_level_t0/agent_incident_{0}.json"

for i in incidents:
    file_path = f"{log_path}/{file_name.format(i)}"
    with open(file_path, "r") as f:
        data = json.load(f)
    result = analysis(data, verbose=False)
    print_analysis(result, head=f"Analysis for incident {i}")

# analysis_one_run(log_path, file_name, print_total=True)

FileNotFoundError: [Errno 2] No such file or directory: '../secgym/final_results/BaselineAgent_gpt-4o_c70_alert_level_t0/agent_incident_5.json'

In [14]:
incidents = [55, 5, 34, 38, 134, 166, 39, 322]

log_path = "../secgym/final_results"


file_name = "PromptSauceAgent_gpt-4o_c72_alert_level_t0_s25_trial1/agent_incident_{0}.json"

for i in incidents:
    file_path = f"{log_path}/{file_name.format(i)}"
    with open(file_path, "r") as f:
        data = json.load(f)
    result = analysis(data, verbose=False)
    print_analysis(result, head=f"Analysis for incident {i}")

analysis_one_run(log_path, file_name, print_total=True)

Max round: 25, Min round: 4
****************************************
Analysis for incident 55 analysis
Average reward: 22.199999999999996/100 = 0.222
Success rate: 21/100 = 21.0%
** Computational Analysis:
Average round: 1403/100 = 14.03
Average cost: 71.18461499999998/100 = 0.7118
Average prompt tokens: 13779234/100 = 137792.34
Average completion tokens: 152563/100 = 1525.63
** Query Efficiency: 
Success Query rate: 1037/1327 = 78.15%
Success Non-Empty Query rate: 601/1327 = 45.29%
Potential Good Example Count: 10 / 100
Fail to run count: 0
Max round: 25, Min round: 4
****************************************
Analysis for incident 5 analysis
Average reward: 36.36/100 = 0.3636
Success rate: 35/100 = 35.0%
** Computational Analysis:
Average round: 1417/100 = 14.17
Average cost: 51.488265/100 = 0.5149
Average prompt tokens: 9842961/100 = 98429.61
Average completion tokens: 151564/100 = 1515.64
** Query Efficiency: 
Success Query rate: 1089/1344 = 81.03%
Success Non-Empty Query rate: 531/1

In [None]:
{'nodes': '48-108', 'reward': 0, 'question_dict': {'context': "In the context of a security incident, a new user account 'bdadmin' with specific details was created using a file `net.exe` to maintain access to the device or organization. The command was executed using a `powershell.exe` script located at `c:\\users\\tgs2z\\downloads\\domaindominance27\\domaindominance27.ps1`. This incident highlights that attackers might be attempting to sustain their access.", 'question': 'In the same device and timeframe, there was a detection of a credential theft tool. Can you identify the name of the file used for this credential theft?', 'answer': 'mimikatz.exe', 'solution': ['A suspicious new user account creation was initiated by `powershell.exe`.', 'The Mimikatz credential theft tool `mimikatz.exe` was detected, potentially being used to harvest credentials on the device.'], 'start_alert': 48, 'end_alert': 108, 'start_entities': [18, 49, 50, 51], 'end_entities': [64], 'shortest_alert_path': [48, 17, 108]}, 'trial': 0, 'info': {}, 'messages': [{'role': 'system', 'content': 'You are a security analyst. \nYou need to answer a given security question by querying the database.\nThe logs are stored in a MySQL database, you can use SQL queries to retrieve entries as needed.\nNote there are more than 20 tables in the database, so you may need to explore the schema or check example entries to understand the database structure.\n\nYour response should always be a thought-action pair:\nThought: <your reasoning>\nAction: <your SQL query>\n\nIn Thought, you can analyse and reason about the current situation, \nAction can be one of the following: \n(1) execute[<your query>], which executes the SQL query\n(2) submit[<your answer>], which is the final answer to the question\n'}, {'role': 'user', 'content': "In the context of a security incident, a new user account 'bdadmin' with specific details was created using a file `net.exe` to maintain access to the device or organization. The command was executed using a `powershell.exe` script located at `c:\\users\\tgs2z\\downloads\\domaindominance27\\domaindominance27.ps1`. This incident highlights that attackers might be attempting to sustain their access. In the same device and timeframe, there was a detection of a credential theft tool. Can you identify the name of the file used for this credential theft?"}], 'usage_summary': None}


In [51]:
import json


log_path = "C:/Users/amudgerikar/source/repos/SecRL/secgym/results/"
file_template = "PromptSauceAgent_incident_166_agent_log_gpt-4o_111_alert.json"

incidents = [55, 5, 34, 38, 134, 166, 39, 322]
#incidents = [5]


In [3]:
# baseline
path = "/Users/kevin/Downloads/SecRL/secgym/agent_experiment_logs/base_agent_experiments_4o/alert_level"
file_template = "incident_{}_agent_log_gpt-4o_46_alert.json"

analysis_one_run(path, file_template, True)

676
****************************************
****************************************
Total analysis
Total length: 676
Total reward: 207.32000000000005
Total round: 7013
Average reward: 207.32000000000005/676 = 0.306686
Average round: 7013/676 = 10.37426
Difficulty 1: 27.16/66 = 0.411515
Difficulty 3: 115.6/402 = 0.287562
Difficulty 5: 42.16/127 = 0.331969
Difficulty 7: 17.4/71 = 0.24507
Difficulty 9: 5/10 = 0.5


In [None]:
# reflexion + GPT-4o
path = "/Users/kevin/Downloads/SecRL/secgym/agent_experiment_logs/reflexion_agent_experiments_4o/alert_level/steps=15"
file_template = "PromptSauceAgent_incident_{}_agent_log_gpt-4o_111_alert.json"

analysis_one_run(path, file_template, True)

676
****************************************
****************************************
Total analysis
Total length: 676
Total reward: 344.864
Total round: 5846
Average reward: 344.864/676 = 0.510154
Average round: 5846/676 = 8.647929
Difficulty 1: 39/66 = 0.590909
Difficulty 3: 221.4/402 = 0.550746
Difficulty 5: 57/127 = 0.448819
Difficulty 7: 26.46/71 = 0.372732
Difficulty 9: 1/10 = 0.1


In [55]:
print(f"Prompt Sauce Agent GPT-4o")

print("Step = 15")
path = "/Users/kevin/Downloads/SecRL/secgym/agent_experiment_logs/prompt_sauce_agent_experiments_4o/alert_level/steps=15"
file_template = "PromptSauceAgent_incident_{}_agent_log_gpt-4o_46_alert_sum.json"
analysis_one_run(path, file_template, True)


print("Step = 50")
path = "/Users/kevin/Downloads/SecRL/secgym/agent_experiment_logs/prompt_sauce_agent_experiments_4o/alert_level/steps=50"
file_template = "PromptSauceAgent_incident_{}_agent_log_gpt-4o_46_alert_sum.json"
analysis_one_run(path, file_template, True)

Prompt Sauce Agent GPT-4o
Step = 15
676
****************************************
****************************************
Total analysis
Total length: 676
Total reward: 270.18399999999997
Total round: 6978
Average reward: 270.18399999999997/676 = 0.39968
Average round: 6978/676 = 10.322485
Difficulty 1: 26/66 = 0.393939
Difficulty 3: 161.6/402 = 0.40199
Difficulty 5: 51.12/127 = 0.40252
Difficulty 7: 28.46/71 = 0.400901
Difficulty 9: 3/10 = 0.3
Step = 50
654
****************************************
****************************************
Total analysis
Total length: 654
Total reward: 321.66399999999993
Total round: 10638
Average reward: 321.66399999999993/654 = 0.491841
Average round: 10638/654 = 16.266055
Difficulty 1: 33.72/66 = 0.510909
Difficulty 3: 196.36/380 = 0.516737
Difficulty 5: 51.72/127 = 0.407244
Difficulty 7: 34.86/71 = 0.491042
Difficulty 9: 5/10 = 0.5


| Method                   | Accuracy (%) |
|--------------------------|----------|
| Baseline                | 30.7     |
| Refined Prompt          | 40.0     |
| Refined Prompt (step=50)| 49.2     |
| Reflexion               | 51.0      |


## Overlap of correct questions with different methods

In [41]:
import json

def get_correct_problem_ids(log_path, file_template, incident_id):
    with open(f"{log_path}/{file_template.format(incident_id)}", "r") as file:
        data = json.load(file)
    correct_ids = []
    for i, entry in enumerate(data):
        if entry['reward'] == 1:
            correct_ids.append(i)
    return correct_ids

def jaccard_similarity(path_1, template_1, path_2, template_2, incidents):
    """
    Calculate the Jaccard similarity between two agents across multiple incidents.

    Args:
        path_1 (str): Path to logs for Agent 1.
        template_1 (str): File template for Agent 1's logs.
        path_2 (str): Path to logs for Agent 2.
        template_2 (str): File template for Agent 2's logs.
        incidents (list of int): List of incident IDs to compare.

    Returns:
        float: The overall Jaccard similarity across all incidents.
    """
    union_set = set()
    intersection_set = set()

    offset = 0  # Initialize offset to ensure unique IDs across incidents

    for incident_id in incidents:
        # Retrieve correct question IDs for each agent and apply offset
        correct_1 = {qid + offset for qid in get_correct_problem_ids(path_1, template_1, incident_id)}
        correct_2 = {qid + offset for qid in get_correct_problem_ids(path_2, template_2, incident_id)}

        print(f"Incident {incident_id}")
        print(f"Correct 1: {correct_1}")
        print(f"Correct 2: {correct_2}")
        print()
        
        # Update union and intersection sets
        union_set.update(correct_1 | correct_2)
        intersection_set.update(correct_1 & correct_2)
        
        # Increment offset to avoid ID conflicts for the next incident
        offset += max(len(correct_1), len(correct_2)) + 1

    # Calculate Jaccard similarity
    if not union_set:  # To handle cases with no data
        return 0.0
    return len(intersection_set) / len(union_set)

In [42]:
# Baseline
path_1 = "/Users/kevin/Downloads/SecRL/secgym/agent_experiment_logs/base_agent_experiments_4o/alert_level"
template_1 = "incident_{}_agent_log_gpt-4o_46_alert.json"

# Reflexion
path_2 = "/Users/kevin/Downloads/SecRL/secgym/agent_experiment_logs/reflexion_agent_experiments_4o/alert_level/steps=15"
template_2 = "PromptSauceAgent_incident_{}_agent_log_gpt-4o_111_alert.json"

incidents = [55, 5, 34, 38, 134, 166, 39, 322]

jaccard_similarity(path_1=path_1, template_1=template_1, path_2=path_2, template_2=template_2, incidents=incidents)


Incident 55
Correct 1: {0, 4, 8, 14, 21, 24, 26, 29, 30, 31, 32, 38, 39, 41, 46, 47, 51, 54, 57, 58, 59, 61, 70, 78, 79, 86, 93, 94, 96, 98, 99}
Correct 2: {0, 4, 7, 8, 11, 13, 14, 22, 23, 24, 28, 30, 31, 32, 34, 35, 38, 43, 44, 47, 48, 51, 53, 56, 57, 62, 67, 69, 75, 76, 78, 81, 83, 98}

Incident 5
Correct 1: {133, 134, 36, 37, 38, 39, 52, 54, 58, 60, 62, 66, 67, 68, 69, 73, 74, 76, 87, 94, 95, 97, 99, 100, 103, 112, 114, 117, 119, 123, 124, 125}
Correct 2: {130, 131, 133, 35, 36, 37, 38, 39, 40, 43, 45, 47, 52, 53, 54, 56, 62, 65, 67, 68, 73, 75, 89, 90, 91, 92, 93, 94, 95, 98, 99, 103, 110, 111, 112, 113, 114, 118, 119, 124, 125}

Incident 34
Correct 1: {129, 131, 132, 133, 143, 163, 166, 169, 79, 80, 81, 82, 84, 91, 92, 94, 103, 104, 112, 114, 116, 125}
Correct 2: {129, 130, 131, 132, 133, 135, 136, 139, 140, 143, 144, 146, 147, 148, 152, 153, 154, 160, 163, 164, 165, 166, 168, 169, 170, 172, 175, 176, 77, 78, 79, 81, 84, 91, 92, 93, 94, 96, 97, 98, 105, 106, 108, 110, 111, 113, 11

0.4755244755244755

## History




### GPT-4o-mini

| Incident Number  | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|--------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| Alert Counted (Sorted by) | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       |
| Log              | 0.081   | 0.137   | 0.099  | 0.278   | 0.254   | 0.215   | 0.318    | 0.025   |
| Alert            | 0.111   | 0.218   | 0.150  | 0.264   | 0.236   | 0.252   | 0.316    | 0.088   |
| Incident         | 0.094   | 0.144   | 0.214  | 0.318   | 0.206   | 0.254   | 0.253    | 0.063   |


<!-- 
### GPT-4o Log Level
| **Metric**         | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|---------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| **Alert Number**    | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       |
| **Average Reward**  | 0.070   | 0.186   | 0.172  | 0.194   | 0.190   | 0.257   | 0.385    | 0.125   |
| **Average Round**   | 12.08   | 10.88   | 11.6   | 11.52   | 11.13   | 10.667  | 9.987    | 11.375  |

    total_len += result_dict['total_len']
    total_reward += result_dict['total_reward']
    total_round += result_dict['total_round']
    
    for k in result_dict['path_count']:
        if k not in path_count:
            path_count[k] = 0
            reward_count[k] = 0

### GPT-4o Incident Level
| **Metric**         | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|---------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| **Average Reward**  | 0.234   | 0.287   | 0.262  | 0.218   | 0.314   | 0.331   | 0.435    | 0.188   |
| **Average Round**   | 10.05   | 9.98    | 10.27  | 10.87   | 10.39   | 10.370  | 9.684    | 11.688  |
| **Alert Number**    | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       |


### GPT-4o-mini Log Level
| **Metric**         | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|---------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| **Average Reward**  | 0.081   | 0.137   | 0.099  | 0.278   | 0.254   | 0.215   | 0.318    | 0.025   |
| **Average Round**   | 12.24   | 12.28   | 11.7   | 10.83   | 10.85   | 11.469  | 11.405   | 11.438  |
| **Alert Number**    | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       |

### GPT-4o-mini Alert Level
| **Metric**         | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|---------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| **Average Reward**  | 0.111   | 0.218   | 0.150  | 0.264   | 0.236   | 0.252   | 0.316    | 0.088   |
| **Average Round**   | 11.41   | 11.92   | 11.84  | 10.82   | 10.9    | 11.123  | 10.937   | 11.188  |
| **Alert Number**    | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       |

### GPT-4o-mini Incident Level

| **Metric**         | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|---------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| **Average Reward**  | 0.094   | 0.144   | 0.214  | 0.318   | 0.206   | 0.254   | 0.253    | 0.063   |
| **Average Round**   | 11.58   | 11.92   | 11.54  | 10.84   | 10.89   | 11.309  | 11.025   | 10.313  |
| **Alert Number**    | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       | -->

# Difficulty Reward Change Table

<!-- | **Model**    | **Level**       | **Reward Change (1 → 3 → 5 → 7 → 9)**      | **Avg Reward** | **Avg Round** |
|--------------|-----------------|---------------------------------------------|---------------|---------------|
| GPT-4o       | Log Level       | 0.326667 → 0.192438 → 0.181102 → 0.132169 → 0.3 | 0.198675      | 11.177515     |
| GPT-4o-mini  | Log Level       | 0.35697 → 0.201493 → 0.143307 → 0.071324 → 0.0  | 0.189089      | 11.542899     |
| GPT-4o       | Alert Level     | 0.411515 → 0.287562 → 0.331969 → 0.24507 → 0.5  | 0.306686      | 10.37426      |
| GPT-4o-mini  | Alert Level     | 0.341818 → 0.210945 → 0.205669 → 0.155831 → 0.0 | 0.213822      | 11.29142      |
| GPT-4o       | Incident Level  | 0.366061 → 0.273632 → 0.308346 → 0.258028 → 0.4 | 0.289408      | 10.278107     |
| GPT-4o-mini  | Incident Level  | 0.320606 → 0.20995 → 0.154016 → 0.169014 → 0.2  | 0.205799      | 11.285503     | -->

| **Model**    | **Level**       | **Reward Change (1 → 3 → 5 → 7 → 9)**      | **Avg Reward** | **Avg Round** |
|--------------|-----------------|---------------------------------------------|---------------|---------------|
| GPT-4o       | Log Level       | 0.327 → 0.192 → 0.181 → 0.132 → 0.300      | 0.199         | 11.178        |
| GPT-4o-mini  | Log Level       | 0.357 → 0.201 → 0.143 → 0.071 → 0.000      | 0.189         | 11.543        |
| GPT-4o       | Alert Level     | 0.412 → 0.288 → 0.332 → 0.245 → 0.500      | 0.307         | 10.374        |
| GPT-4o-mini  | Alert Level     | 0.342 → 0.211 → 0.206 → 0.156 → 0.000      | 0.214         | 11.291        |
| GPT-4o       | Incident Level  | 0.366 → 0.274 → 0.308 → 0.258 → 0.400      | 0.289         | 10.278        |
| GPT-4o-mini  | Incident Level  | 0.321 → 0.210 → 0.154 → 0.169 → 0.200      | 0.206         | 11.286        |

# QA Validation test

1. With higher difficulty, the reward should be lower.
2. More advanced model should have higher reward.
3. Are the correct questions always the same?

In [3]:
from textwrap import dedent

prompt = dedent("""hello {nmae}, how are you doing today?""")
print(prompt)

print(prompt.format("Kevin"))

hello {nmae}, how are you doing today?


KeyError: 'nmae'