In [5]:
def analysis(data: list, verbose: bool = False) -> dict:
    """Aggregate reward and round statistics over one agent log.

    Args:
        data (list): Records loaded from an "agent log" json file. Each
            record is indexed as ``record['question_dict']['shortest_alert_path']``,
            ``record['reward']`` and ``record['messages']``.
        verbose (bool): When True, print the average reward and the average
            number of rounds.

    Returns:
        dict: Overall totals (``total_len``, ``total_reward``,
        ``total_round``, ``non_zero_reward_count``, ``non_success_count``)
        plus per-difficulty dicts (``path_count``, ``reward_count``,
        ``round_count``) keyed by the length of ``shortest_alert_path``.
    """
    total_len = len(data)
    total_reward = 0
    total_round = 0
    non_zero_reward_count = 0
    non_success_count = 0

    path_count = {}
    reward_count = {}
    round_count = {}
    for record in data:
        # Difficulty is the length of the shortest alert path.
        difficulty = len(record['question_dict']['shortest_alert_path'])
        reward = record['reward']
        # Rounds are derived from the message count: (len - 1) // 2.
        rounds = (len(record["messages"]) - 1) // 2

        if reward > 0:
            non_zero_reward_count += 1
            # Partial credit: rewarded, but below a full reward of 1.
            if reward < 1:
                non_success_count += 1

        total_reward += reward
        total_round += rounds

        path_count[difficulty] = path_count.get(difficulty, 0) + 1
        reward_count[difficulty] = reward_count.get(difficulty, 0) + reward
        round_count[difficulty] = round_count.get(difficulty, 0) + rounds

    if verbose:
        if total_len:
            print(f"Average reward: {total_reward}/{total_len} = {round(total_reward/total_len,6)}")
            print(f"Average round: {total_round}/{total_len} = {round(total_round/total_len,6)}")
        else:
            # Averages are undefined for an empty log; avoid ZeroDivisionError.
            print("No records to analyze")

    return {
        "total_len": total_len,
        "total_reward": total_reward,
        "total_round": total_round,
        "non_zero_reward_count": non_zero_reward_count,
        "non_success_count": non_success_count,
        "path_count": path_count,
        "reward_count": reward_count,
        "round_count": round_count
    }

import json

def analysis_one_run(log_path, file_template, print_total=False, incidents=None):
    """Run ``analysis`` on every incident log of one run and print results.

    Args:
        log_path: Directory containing the agent log json files.
        file_template: File-name template; ``file_template.format(i)`` is
            called with each incident number to obtain the file name.
        print_total: When False, print a per-incident report. When True,
            print only the totals aggregated over all incidents, including
            the per-difficulty average reward.
        incidents: Incident numbers to analyze. Defaults to
            ``[55, 5, 34, 38, 134, 166, 39, 322]``.

    Returns:
        None. Results are printed.
    """
    if incidents is None:
        incidents = [55, 5, 34, 38, 134, 166, 39, 322]

    total_len = 0
    total_reward = 0
    total_round = 0
    path_count = {}
    reward_count = {}

    for i in incidents:
        if not print_total:
            print("*"*20)
            print(f"Analysis for incident {i}")

        # Context manager so the file handle is closed after loading.
        with open(f"{log_path}/{file_template.format(i)}", "r", encoding="utf-8") as log_file:
            records = json.load(log_file)
        result_dict = analysis(records, not print_total)

        total_len += result_dict['total_len']
        total_reward += result_dict['total_reward']
        total_round += result_dict['total_round']
        for k, count in result_dict['path_count'].items():
            path_count[k] = path_count.get(k, 0) + count
            reward_count[k] = reward_count.get(k, 0) + result_dict['reward_count'][k]
        if not print_total:
            print("*"*20)

    if print_total:
        print("*"*40)
        print("*"*40)
        print("Total analysis")
        print(f"Total length: {total_len}")
        print(f"Total reward: {total_reward}")
        print(f"Total round: {total_round}")
        if total_len:
            print(f"Average reward: {total_reward}/{total_len} = {round(total_reward/total_len,6)}")
            print(f"Average round: {total_round}/{total_len} = {round(total_round/total_len,6)}")
        else:
            # Averages are undefined when no records were loaded.
            print("No records to analyze")

        for k in sorted(path_count):
            print(f"Difficulty {k}: {round(reward_count[k], 2)}/{path_count[k]} = {round(reward_count[k]/path_count[k],6)}")

In [29]:
import json


# Directory that holds the agent log json files (machine-specific path).
log_path = "C:/Users/amudgerikar/source/repos/SecRL/secgym/results/"
# analysis_one_run fills "{}" with the incident number via str.format;
# without a placeholder every incident would resolve to the same file.
file_template = "PromptSauceAgent_incident_{}_agent_log_gpt-4o_111_alert.json"

incidents = [55, 5, 34, 38, 134, 166, 39, 322]
#incidents = [5]



### GPT-4o-mini

| Incident Number  | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|--------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| Alert Count (sorted by)   | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       |
| Log              | 0.081   | 0.137   | 0.099  | 0.278   | 0.254   | 0.215   | 0.318    | 0.025   |
| Alert            | 0.111   | 0.218   | 0.150  | 0.264   | 0.236   | 0.252   | 0.316    | 0.088   |
| Incident         | 0.094   | 0.144   | 0.214  | 0.318   | 0.206   | 0.254   | 0.253    | 0.063   |


<!-- 
### GPT-4o Log Level
| **Metric**         | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|---------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| **Alert Number**    | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       |
| **Average Reward**  | 0.070   | 0.186   | 0.172  | 0.194   | 0.190   | 0.257   | 0.385    | 0.125   |
| **Average Round**   | 12.08   | 10.88   | 11.6   | 11.52   | 11.13   | 10.667  | 9.987    | 11.375  |

    total_len += result_dict['total_len']
    total_reward += result_dict['total_reward']
    total_round += result_dict['total_round']
    
    for k in result_dict['path_count']:
        if k not in path_count:
            path_count[k] = 0
            reward_count[k] = 0

### GPT-4o Incident Level
| **Metric**         | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|---------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| **Average Reward**  | 0.234   | 0.287   | 0.262  | 0.218   | 0.314   | 0.331   | 0.435    | 0.188   |
| **Average Round**   | 10.05   | 9.98    | 10.27  | 10.87   | 10.39   | 10.370  | 9.684    | 11.688  |
| **Alert Number**    | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       |


### GPT-4o-mini Log Level
| **Metric**         | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|---------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| **Average Reward**  | 0.081   | 0.137   | 0.099  | 0.278   | 0.254   | 0.215   | 0.318    | 0.025   |
| **Average Round**   | 12.24   | 12.28   | 11.7   | 10.83   | 10.85   | 11.469  | 11.405   | 11.438  |
| **Alert Number**    | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       |

### GPT-4o-mini Alert Level
| **Metric**         | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|---------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| **Average Reward**  | 0.111   | 0.218   | 0.150  | 0.264   | 0.236   | 0.252   | 0.316    | 0.088   |
| **Average Round**   | 11.41   | 11.92   | 11.84  | 10.82   | 10.9    | 11.123  | 10.937   | 11.188  |
| **Alert Number**    | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       |

### GPT-4o-mini Incident Level

| **Metric**         | **5**   | **39**  | **55** | **34**  | **166** | **134** | **322**  | **38**  |
|---------------------|---------|---------|--------|---------|---------|---------|----------|---------|
| **Average Reward**  | 0.094   | 0.144   | 0.214  | 0.318   | 0.206   | 0.254   | 0.253    | 0.063   |
| **Average Round**   | 11.58   | 11.92   | 11.54  | 10.84   | 10.89   | 11.309  | 11.025   | 10.313  |
| **Alert Number**    | 68      | 47      | 17     | 11      | 11      | 9       | 9        | 4       | -->

# Difficulty Reward Change Table

<!-- | **Model**    | **Level**       | **Reward Change (1 → 3 → 5 → 7 → 9)**      | **Avg Reward** | **Avg Round** |
|--------------|-----------------|---------------------------------------------|---------------|---------------|
| GPT-4o       | Log Level       | 0.326667 → 0.192438 → 0.181102 → 0.132169 → 0.3 | 0.198675      | 11.177515     |
| GPT-4o-mini  | Log Level       | 0.35697 → 0.201493 → 0.143307 → 0.071324 → 0.0  | 0.189089      | 11.542899     |
| GPT-4o       | Alert Level     | 0.411515 → 0.287562 → 0.331969 → 0.24507 → 0.5  | 0.306686      | 10.37426      |
| GPT-4o-mini  | Alert Level     | 0.341818 → 0.210945 → 0.205669 → 0.155831 → 0.0 | 0.213822      | 11.29142      |
| GPT-4o       | Incident Level  | 0.366061 → 0.273632 → 0.308346 → 0.258028 → 0.4 | 0.289408      | 10.278107     |
| GPT-4o-mini  | Incident Level  | 0.320606 → 0.20995 → 0.154016 → 0.169014 → 0.2  | 0.205799      | 11.285503     | -->

| **Model**    | **Level**       | **Reward Change (1 → 3 → 5 → 7 → 9)**      | **Avg Reward** | **Avg Round** |
|--------------|-----------------|---------------------------------------------|---------------|---------------|
| GPT-4o       | Log Level       | 0.327 → 0.192 → 0.181 → 0.132 → 0.300      | 0.199         | 11.178        |
| GPT-4o-mini  | Log Level       | 0.357 → 0.201 → 0.143 → 0.071 → 0.000      | 0.189         | 11.543        |
| GPT-4o       | Alert Level     | 0.412 → 0.288 → 0.332 → 0.245 → 0.500      | 0.307         | 10.374        |
| GPT-4o-mini  | Alert Level     | 0.342 → 0.211 → 0.206 → 0.156 → 0.000      | 0.214         | 11.291        |
| GPT-4o       | Incident Level  | 0.366 → 0.274 → 0.308 → 0.258 → 0.400      | 0.289         | 10.278        |
| GPT-4o-mini  | Incident Level  | 0.321 → 0.210 → 0.154 → 0.169 → 0.200      | 0.206         | 11.286        |

GPT-4o
Log level
********************
Analysis for incident 55
Average reward: 17.224/100 = 0.17224
Average round: 1160/100 = 11.6
Average reward: 52.4/100 = 0.524
Average round: 999/100 = 9.99
Non success / non-zero reward count: 1/53
Difficulty 1: 5/11 = 0.454545 | Avg round: 11.36
Difficulty 3: 47.4/89 = 0.532584 | Avg round: 9.82
********************
********************
Analysis for incident 5
Average reward: 52.4/100 = 0.524
Average round: 999/100 = 9.99
Non success / non-zero reward count: 1/53
Difficulty 1: 5/11 = 0.454545 | Avg round: 11.36
Difficulty 3: 47.4/89 = 0.532584 | Avg round: 9.82
********************
********************
Analysis for incident 34
Average reward: 19.36/100 = 0.1936
Average round: 1152/100 = 11.52
Average reward: 52.4/100 = 0.524
Average round: 999/100 = 9.99
Non success / non-zero reward count: 1/53
Difficulty 1: 5/11 = 0.454545 | Avg round: 11.36
Difficulty 3: 47.4/89 = 0.532584 | Avg round: 9.82
********************
********************
Analysis for