In [26]:
import json
import os
def extract_from_one_file(data_folder, finetune_raw_folder):
    """
    Save correct questions
    """
    incidents = [5, 34, 38, 39, 55, 134, 166, 322]

    data_key = data_folder.split("/")[-1]
    os.makedirs(finetune_raw_folder, exist_ok=True)
    for iid in incidents:

        # check if save path exists
        save_path = f"{finetune_raw_folder}/incident_{iid}.json"
        if os.path.exists(save_path):
            # load
            with open(save_path, "r") as f:
                loaded_data = json.load(f)
        else:
            loaded_data = []

        existed_data_keys = [f"{data_folder}-{d['nodes']}" for d in loaded_data]


        file_name = f"../secgym/final_results/{data_folder}/agent_incident_{iid}.json"
        with open(file_name, "r") as f:
            data = json.load(f)

        selected_data = []
        for d in data:
            if d['reward'] != 1:
                continue
            
            if f"{data_key}-{d['nodes']}" in existed_data_keys:
                continue

            selected_data.append(
                {
                    "messages": d['messages'],
                    "question_dict": d['question_dict'],
                    "nodes": d['nodes'],
                    "data_folder": data_folder,
                }
            )
        loaded_data += selected_data
        print(f"incident {iid} has {len(loaded_data)} data")
        with open(save_path, "w") as f:
            json.dump(loaded_data, f, indent=4)


def prepare_finetune_data(
    save_folder: str,
    finetune_raw_folder: str,
    train_incidents: list, # rest is test
    pre_name: str = ""
):
    incidents = [5, 34, 38, 39, 55, 134, 166, 322]
    train_data = []
    test_incidents = [iid for iid in incidents if iid not in train_incidents]

    for iid in incidents:
        if iid in train_incidents:
            with open(f"{finetune_raw_folder}/incident_{iid}.json", "r") as f:
                data = json.load(f)
            # only get the messages
            data = [{"messages": d["messages"]} for d in data]
            train_data += data
    
    os.makedirs(save_folder, exist_ok=True)
    with open(f"{save_folder}/{pre_name}_train.jsonl", "w") as f:
        for d in train_data:
            f.write(json.dumps(d) + "\n")
        
    print(f"train data has {len(train_data)} data")
    with open(f"{save_folder}/test_incident_ids.json", "w") as f:
        json.dump(test_incidents, f)


In [24]:
baselines = [
    "BaselineAgent_gpt-4o_c70_alert_level_t0_s25_trial1",
    "BaselineAgent_4o-mini_c71_alert_level_t0_s25_trial1",
    "BaselineAgent_o1-mini_c92_alert_level_t0_s25_trial1",
    "BaselineAgent_o3-mini_c99_alert_level_t0_s25_trial1"
]

for b in baselines:
    print(f"Extracting {b}")
    extract_from_one_file(b, "finetune_raw")


Extracting BaselineAgent_gpt-4o_c70_alert_level_t0_s25_trial1
incident 5 has 52 data
incident 34 has 44 data
incident 38 has 5 data
incident 39 has 44 data
incident 55 has 48 data
incident 134 has 39 data
incident 166 has 32 data
incident 322 has 38 data
Extracting BaselineAgent_4o-mini_c71_alert_level_t0_s25_trial1
incident 5 has 52 data
incident 34 has 44 data
incident 38 has 5 data
incident 39 has 44 data
incident 55 has 48 data
incident 134 has 39 data
incident 166 has 32 data
incident 322 has 38 data
Extracting BaselineAgent_o1-mini_c92_alert_level_t0_s25_trial1
incident 5 has 52 data
incident 34 has 44 data
incident 38 has 5 data
incident 39 has 44 data
incident 55 has 48 data
incident 134 has 39 data
incident 166 has 32 data
incident 322 has 38 data
Extracting BaselineAgent_o3-mini_c99_alert_level_t0_s25_trial1
incident 5 has 52 data
incident 34 has 44 data
incident 38 has 5 data
incident 39 has 44 data
incident 55 has 48 data
incident 134 has 39 data
incident 166 has 32 data
in

In [28]:
prepare_finetune_data("finetune_data", "finetune_raw", [5, 39, 55, 134, 166, 322], "no34_38")

cv_sets = [
    [5, 39, 55, 134, 166, 322], # remove 34, 38
    [34, 38, 39, 55, 134, 166, 322], # remove 5
    [5, 34, 38, 55, 134, 166, 322], # remove 39
    [5, 34, 38, 39, 55, 166], # remove 134, 322
    [5, 34, 38, 39, 55, 134, 322], # remove 166
    [5, 34, 38, 39, 134, 166, 322], # remove 55
]

for i, cv in enumerate(cv_sets):
    prepare_finetune_data("finetune_data", "finetune_raw", cv, f"cv{i+1}")


train data has 253 data
train data has 253 data
train data has 250 data
train data has 258 data
train data has 225 data
train data has 270 data
train data has 254 data


In [2]:
qpath = "/Users/kevin/Downloads/SecRL/secgym/env/questions/min_overlap/test"

import os
import json

for file in os.listdir(qpath):
    if file.endswith(".json"):
        with open(f"{qpath}/{file}", "r") as f:
            data = json.load(f)
            print(file, len(data))

incident_38_qa_incident_o1-ga_c42.json 11
incident_34_qa_incident_o1-ga_c42.json 82
incident_5_qa_incident_o1-ga_c42.json 98
incident_39_qa_incident_o1-ga_c42.json 98
incident_134_qa_incident_o1-ga_c42.json 57
incident_322_qa_incident_o1-ga_c42.json 56
incident_166_qa_incident_o1-ga_c42.json 87
incident_55_qa_incident_o1-ga_c42.json 100
