In [2]:
import random
random.seed(0)

def analysis(data:dict, verbose:bool=False):
    """Analyze the data and print out the results

    Args:
        data (dict): The data to be analyzed, load an "agent log" json file
        
    """
    total_len = len(data)
    total_reward = 0
    total_round = 0
    non_zero_reward_count = 0
    non_success_count = 0

    path_count = {}
    reward_count = {}
    round_count = {}
    for k in data:
        p = len(k['question_dict']['shortest_alert_path'])
        if p not in path_count:
            path_count[p] = 0
            reward_count[p] = 0
            round_count[p] = 0

        if k['reward'] > 0 and k['reward'] < 1:
            non_success_count += 1
        if k['reward'] > 0:
            non_zero_reward_count += 1
        
        # total
        total_reward += k['reward']
        total_round += (len(k["messages"]) - 1) // 2

        path_count[p] += 1
        reward_count[p] += k['reward']
        round_count[p] += (len(k["messages"]) - 1) // 2

    if verbose:
        print(f"Average reward: {total_reward}/{total_len} = {round(total_reward/total_len,6)}")
        print(f"Average round: {total_round}/{total_len} = {round(total_round/total_len,6)}")
        # print(f"Non success / non-zero reward count: {non_success_count}/{non_zero_reward_count}")

        # sorted_keys = sorted(path_count.keys())
        # for k in sorted_keys:
        #     print(f"Difficulty {k}: {round(reward_count[k], 2)}/{path_count[k]} = {round(reward_count[k]/path_count[k],6)} | Avg round: {round(round_count[k]/path_count[k], 2)}")


    return {
        "total_len": total_len,
        "total_reward": total_reward,
        "total_round": total_round,
        "non_zero_reward_count": non_zero_reward_count,
        "non_success_count": non_success_count,
        "path_count": path_count,
        "reward_count": reward_count,
        "round_count": round_count
    }

import json

def analysis_one_run(log_path, file_template, print_total=False, sample_size=-1):

    incidents = [55, 5, 34, 38, 134, 166, 39, 322]

    total_len = 0
    total_reward = 0
    total_round = 0
    path_count = {}
    reward_count = {}

    all_data = []
    for i in incidents:
        if not print_total:
            print("*"*20)
            print(f"Analysis for incident {i}")

        a = open(f"{log_path}/{file_template.format(i)}", "r")
        b = json.load(a)
        all_data.extend(b)
    print(len(all_data))
        
    random.shuffle(all_data)
    if sample_size > 0:
        # sample_size is the size for each difficulty, For difficulty 1 3 5 7 9, we will sample sample_size number of incidents
        # go through the data and sample sample_size number for each difficulty
        # go through each problem, if the difficulty reaches sample_size, won't append it
        sampled_data = []
        sampled_count = {}
        for k in all_data:
            p = len(k['question_dict']['shortest_alert_path'])
            if p not in sampled_count:
                sampled_count[p] = 0
            if sampled_count[p] < sample_size:
                sampled_data.append(k)
                sampled_count[p] += 1
        all_data = sampled_data
        
    result_dict = analysis(all_data, not print_total)

    total_len += result_dict['total_len']
    total_reward += result_dict['total_reward']
    total_round += result_dict['total_round']
    for k in result_dict['path_count']:
        if k not in path_count:
            path_count[k] = 0
            reward_count[k] = 0

        path_count[k] += result_dict['path_count'][k]
        reward_count[k] += result_dict['reward_count'][k]
    if not print_total:
        print("*"*20)
        
    if print_total:
        print("*"*40)
        print("*"*40)
        print("Total analysis")
        print(f"Total length: {total_len}")
        print(f"Total reward: {total_reward}")
        print(f"Total round: {total_round}")
        print(f"Average reward: {total_reward}/{total_len} = {round(total_reward/total_len,6)}")
        print(f"Average round: {total_round}/{total_len} = {round(total_round/total_len,6)}")

        sorted_keys = sorted(path_count.keys())
        for k in sorted_keys:
            print(f"Difficulty {k}: {round(reward_count[k], 2)}/{path_count[k]} = {round(reward_count[k]/path_count[k],6)}")


def get_correct_problem_ids(log_path, file_template, incident_id):
    a = open(f"{log_path}/{file_template.format(incident_id)}", "r")
    b = json.load(a)
    correct_ids = []
    for k in b:
        if k['reward'] == 1:
            correct_ids.append(k['question_id'])
    return correct_ids


In [51]:
import json


log_path = "C:/Users/amudgerikar/source/repos/SecRL/secgym/results/"
file_template = "PromptSauceAgent_incident_166_agent_log_gpt-4o_111_alert.json"

incidents = [55, 5, 34, 38, 134, 166, 39, 322]
#incidents = [5]


In [3]:
# baseline
path = "/Users/kevin/Downloads/SecRL/secgym/agent_experiment_logs/base_agent_experiments_4o/alert_level"
file_template = "incident_{}_agent_log_gpt-4o_46_alert.json"

analysis_one_run(path, file_template, True)

676
****************************************
****************************************
Total analysis
Total length: 676
Total reward: 207.32000000000005
Total round: 7013
Average reward: 207.32000000000005/676 = 0.306686
Average round: 7013/676 = 10.37426
Difficulty 1: 27.16/66 = 0.411515
Difficulty 3: 115.6/402 = 0.287562
Difficulty 5: 42.16/127 = 0.331969
Difficulty 7: 17.4/71 = 0.24507
Difficulty 9: 5/10 = 0.5


method

baseline | 30.7
refined_prompt | 40.0
refined_prompt (step=50) | 49.2
reflexion | 51


In [None]:
# reflexion + GPT-4o
path = "/Users/kevin/Downloads/SecRL/secgym/agent_experiment_logs/reflexion_agent_experiments_4o/alert_level/steps=15"
file_template = "PromptSauceAgent_incident_{}_agent_log_gpt-4o_111_alert.json"

analysis_one_run(path, file_template, True)

676
****************************************
****************************************
Total analysis
Total length: 676
Total reward: 344.864
Total round: 5846
Average reward: 344.864/676 = 0.510154
Average round: 5846/676 = 8.647929
Difficulty 1: 39/66 = 0.590909
Difficulty 3: 221.4/402 = 0.550746
Difficulty 5: 57/127 = 0.448819
Difficulty 7: 26.46/71 = 0.372732
Difficulty 9: 1/10 = 0.1


In [55]:
print(f"Prompt Sauce Agent GPT-4o")

print("Step = 15")
path = "/Users/kevin/Downloads/SecRL/secgym/agent_experiment_logs/prompt_sauce_agent_experiments_4o/alert_level/steps=15"
file_template = "PromptSauceAgent_incident_{}_agent_log_gpt-4o_46_alert_sum.json"
analysis_one_run(path, file_template, True)


print("Step = 50")
path = "/Users/kevin/Downloads/SecRL/secgym/agent_experiment_logs/prompt_sauce_agent_experiments_4o/alert_level/steps=50"
file_template = "PromptSauceAgent_incident_{}_agent_log_gpt-4o_46_alert_sum.json"
analysis_one_run(path, file_template, True)

Prompt Sauce Agent GPT-4o
Step = 15
676
****************************************
****************************************
Total analysis
Total length: 676
Total reward: 270.18399999999997
Total round: 6978
Average reward: 270.18399999999997/676 = 0.39968
Average round: 6978/676 = 10.322485
Difficulty 1: 26/66 = 0.393939
Difficulty 3: 161.6/402 = 0.40199
Difficulty 5: 51.12/127 = 0.40252
Difficulty 7: 28.46/71 = 0.400901
Difficulty 9: 3/10 = 0.3
Step = 50
654
****************************************
****************************************
Total analysis
Total length: 654
Total reward: 321.66399999999993
Total round: 10638
Average reward: 321.66399999999993/654 = 0.491841
Average round: 10638/654 = 16.266055
Difficulty 1: 33.72/66 = 0.510909
Difficulty 3: 196.36/380 = 0.516737
Difficulty 5: 51.72/127 = 0.407244
Difficulty 7: 34.86/71 = 0.491042
Difficulty 9: 5/10 = 0.5


## Overlap of correct questions with different methods

In [41]:
import json

def get_correct_problem_ids(log_path, file_template, incident_id):
    with open(f"{log_path}/{file_template.format(incident_id)}", "r") as file:
        data = json.load(file)
    correct_ids = []
    for i, entry in enumerate(data):
        if entry['reward'] == 1:
            correct_ids.append(i)
    return correct_ids

def jaccard_similarity(path_1, template_1, path_2, template_2, incidents):
    """
    Calculate the Jaccard similarity between two agents across multiple incidents.

    Args:
        path_1 (str): Path to logs for Agent 1.
        template_1 (str): File template for Agent 1's logs.
        path_2 (str): Path to logs for Agent 2.
        template_2 (str): File template for Agent 2's logs.
        incidents (list of int): List of incident IDs to compare.

    Returns:
        float: The overall Jaccard similarity across all incidents.
    """
    union_set = set()
    intersection_set = set()

    offset = 0  # Initialize offset to ensure unique IDs across incidents

    for incident_id in incidents:
        # Retrieve correct question IDs for each agent and apply offset
        correct_1 = {qid + offset for qid in get_correct_problem_ids(path_1, template_1, incident_id)}
        correct_2 = {qid + offset for qid in get_correct_problem_ids(path_2, template_2, incident_id)}

        print(f"Incident {incident_id}")
        print(f"Correct 1: {correct_1}")
        print(f"Correct 2: {correct_2}")
        print()
        
        # Update union and intersection sets
        union_set.update(correct_1 | correct_2)
        intersection_set.update(correct_1 & correct_2)
        
        # Increment offset to avoid ID conflicts for the next incident
        offset += max(len(correct_1), len(correct_2)) + 1

    # Calculate Jaccard similarity
    if not union_set:  # To handle cases with no data
        return 0.0
    return len(intersection_set) / len(union_set)

In [42]:
# Baseline
path_1 = "/Users/kevin/Downloads/SecRL/secgym/agent_experiment_logs/base_agent_experiments_4o/alert_level"
template_1 = "incident_{}_agent_log_gpt-4o_46_alert.json"

# Reflexion
path_2 = "/Users/kevin/Downloads/SecRL/secgym/agent_experiment_logs/reflexion_agent_experiments_4o/alert_level/steps=15"
template_2 = "PromptSauceAgent_incident_{}_agent_log_gpt-4o_111_alert.json"

incidents = [55, 5, 34, 38, 134, 166, 39, 322]

jaccard_similarity(path_1=path_1, template_1=template_1, path_2=path_2, template_2=template_2, incidents=incidents)


Incident 55
Correct 1: {0, 4, 8, 14, 21, 24, 26, 29, 30, 31, 32, 38, 39, 41, 46, 47, 51, 54, 57, 58, 59, 61, 70, 78, 79, 86, 93, 94, 96, 98, 99}
Correct 2: {0, 4, 7, 8, 11, 13, 14, 22, 23, 24, 28, 30, 31, 32, 34, 35, 38, 43, 44, 47, 48, 51, 53, 56, 57, 62, 67, 69, 75, 76, 78, 81, 83, 98}

Incident 5
Correct 1: {133, 134, 36, 37, 38, 39, 52, 54, 58, 60, 62, 66, 67, 68, 69, 73, 74, 76, 87, 94, 95, 97, 99, 100, 103, 112, 114, 117, 119, 123, 124, 125}
Correct 2: {130, 131, 133, 35, 36, 37, 38, 39, 40, 43, 45, 47, 52, 53, 54, 56, 62, 65, 67, 68, 73, 75, 89, 90, 91, 92, 93, 94, 95, 98, 99, 103, 110, 111, 112, 113, 114, 118, 119, 124, 125}

Incident 34
Correct 1: {129, 131, 132, 133, 143, 163, 166, 169, 79, 80, 81, 82, 84, 91, 92, 94, 103, 104, 112, 114, 116, 125}
Correct 2: {129, 130, 131, 132, 133, 135, 136, 139, 140, 143, 144, 146, 147, 148, 152, 153, 154, 160, 163, 164, 165, 166, 168, 169, 170, 172, 175, 176, 77, 78, 79, 81, 84, 91, 92, 93, 94, 96, 97, 98, 105, 106, 108, 110, 111, 113, 11

0.4755244755244755

## History




### GPT-4o-mini

| Incident Number  | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|--------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| Alert Counted (Sorted by) | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       |
| Log              | 0.081   | 0.137   | 0.099  | 0.278   | 0.254   | 0.215   | 0.318    | 0.025   |
| Alert            | 0.111   | 0.218   | 0.150  | 0.264   | 0.236   | 0.252   | 0.316    | 0.088   |
| Incident         | 0.094   | 0.144   | 0.214  | 0.318   | 0.206   | 0.254   | 0.253    | 0.063   |


<!-- 
### GPT-4o Log Level
| **Metric**         | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|---------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| **Alert Number**    | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       |
| **Average Reward**  | 0.070   | 0.186   | 0.172  | 0.194   | 0.190   | 0.257   | 0.385    | 0.125   |
| **Average Round**   | 12.08   | 10.88   | 11.6   | 11.52   | 11.13   | 10.667  | 9.987    | 11.375  |

    total_len += result_dict['total_len']
    total_reward += result_dict['total_reward']
    total_round += result_dict['total_round']
    
    for k in result_dict['path_count']:
        if k not in path_count:
            path_count[k] = 0
            reward_count[k] = 0

### GPT-4o Incident Level
| **Metric**         | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|---------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| **Average Reward**  | 0.234   | 0.287   | 0.262  | 0.218   | 0.314   | 0.331   | 0.435    | 0.188   |
| **Average Round**   | 10.05   | 9.98    | 10.27  | 10.87   | 10.39   | 10.370  | 9.684    | 11.688  |
| **Alert Number**    | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       |


### GPT-4o-mini Log Level
| **Metric**         | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|---------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| **Average Reward**  | 0.081   | 0.137   | 0.099  | 0.278   | 0.254   | 0.215   | 0.318    | 0.025   |
| **Average Round**   | 12.24   | 12.28   | 11.7   | 10.83   | 10.85   | 11.469  | 11.405   | 11.438  |
| **Alert Number**    | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       |

### GPT-4o-mini Alert Level
| **Metric**         | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|---------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| **Average Reward**  | 0.111   | 0.218   | 0.150  | 0.264   | 0.236   | 0.252   | 0.316    | 0.088   |
| **Average Round**   | 11.41   | 11.92   | 11.84  | 10.82   | 10.9    | 11.123  | 10.937   | 11.188  |
| **Alert Number**    | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       |

### GPT-4o-mini Incident Level

| **Metric**         | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|---------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| **Average Reward**  | 0.094   | 0.144   | 0.214  | 0.318   | 0.206   | 0.254   | 0.253    | 0.063   |
| **Average Round**   | 11.58   | 11.92   | 11.54  | 10.84   | 10.89   | 11.309  | 11.025   | 10.313  |
| **Alert Number**    | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       | -->

# Difficulty Reward Change Table

<!-- | **Model**    | **Level**       | **Reward Change (1 → 3 → 5 → 7 → 9)**      | **Avg Reward** | **Avg Round** |
|--------------|-----------------|---------------------------------------------|---------------|---------------|
| GPT-4o       | Log Level       | 0.326667 → 0.192438 → 0.181102 → 0.132169 → 0.3 | 0.198675      | 11.177515     |
| GPT-4o-mini  | Log Level       | 0.35697 → 0.201493 → 0.143307 → 0.071324 → 0.0  | 0.189089      | 11.542899     |
| GPT-4o       | Alert Level     | 0.411515 → 0.287562 → 0.331969 → 0.24507 → 0.5  | 0.306686      | 10.37426      |
| GPT-4o-mini  | Alert Level     | 0.341818 → 0.210945 → 0.205669 → 0.155831 → 0.0 | 0.213822      | 11.29142      |
| GPT-4o       | Incident Level  | 0.366061 → 0.273632 → 0.308346 → 0.258028 → 0.4 | 0.289408      | 10.278107     |
| GPT-4o-mini  | Incident Level  | 0.320606 → 0.20995 → 0.154016 → 0.169014 → 0.2  | 0.205799      | 11.285503     | -->

| **Model**    | **Level**       | **Reward Change (1 → 3 → 5 → 7 → 9)**      | **Avg Reward** | **Avg Round** |
|--------------|-----------------|---------------------------------------------|---------------|---------------|
| GPT-4o       | Log Level       | 0.327 → 0.192 → 0.181 → 0.132 → 0.300      | 0.199         | 11.178        |
| GPT-4o-mini  | Log Level       | 0.357 → 0.201 → 0.143 → 0.071 → 0.000      | 0.189         | 11.543        |
| GPT-4o       | Alert Level     | 0.412 → 0.288 → 0.332 → 0.245 → 0.500      | 0.307         | 10.374        |
| GPT-4o-mini  | Alert Level     | 0.342 → 0.211 → 0.206 → 0.156 → 0.000      | 0.214         | 11.291        |
| GPT-4o       | Incident Level  | 0.366 → 0.274 → 0.308 → 0.258 → 0.400      | 0.289         | 10.278        |
| GPT-4o-mini  | Incident Level  | 0.321 → 0.210 → 0.154 → 0.169 → 0.200      | 0.206         | 11.286        |

# QA Validation test

1. With higher difficulty, the reward should be lower.
2. More advanced model should have higher reward.
3. Are the correct questions always the same?