# This is a dataset generator and validator for the MITRE ATT&CK framework using OpenAI's API models.

In [1]:
from distutils.command.clean import clean

import openai
import json

from scipy.io.arff.tests.test_arffread import missing
from tqdm.auto import tqdm
import uuid
import time
import os
import re
from datetime import datetime
import os
from dotenv import load_dotenv

# --- Credentials and client configuration ------------------------------------
# Pull variables from a local .env file (if any) into the process environment,
# then hand the OpenAI key to the SDK.
load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")
openai.api_key = openai_key

# NOTE(review): "GPT-4o mini" reads like a display name rather than an API model
# id (the ids used elsewhere in this file are lowercase, e.g. "gpt-4-turbo").
# Confirm before passing this value as `model=` to the API.
default_model_name = "GPT-4o mini"  # "gpt-4-turbo"

# Small hand-picked set of (technique_id, technique_name) pairs, kept for quick
# demo runs. generate_incident() unpacks each entry in exactly this order.
MITRE_TECHNIQUES_DEMO = [
    ("T1566.001", "Phishing: Spearphishing Attachment"),
    ("T1203", "Exploitation for Client Execution"),
    ("T1486", "Data Encrypted for Impact"),
    ("T1021.001", "Remote Desktop Protocol"),
    ("T1003.001", "OS Credential Dumping"),
    ("T1059.003", "Windows Command Shell"),
    ("T1071.001", "Application Layer Protocol: Web Protocols"),
    ("T1053.005", "Scheduled Task/Job: Scheduled Task"),
    ("T1218.010", "Signed Binary Proxy Execution: Regsvr32"),
    ("T1490", "Inhibit System Recovery"),
    # Add more as needed
]

# To run against the demo subset instead of the full file, uncomment:
#MITRE_TECHNIQUES = MITRE_TECHNIQUES_DEMO

# Load the distributed MITRE technique list from disk.
# Presumably a JSON array of [technique_id, technique_name] pairs matching the
# demo list above -- verify against the file.
# The encoding is pinned so the result does not depend on the platform's
# default locale encoding.
with open("Samples/mitre_techniques_distributed.json", "r", encoding="utf-8") as f:
    MITRE_TECHNIQUES = json.load(f)

# Shared API client used by the generation code below.
client = openai.OpenAI(api_key=openai.api_key)

In [8]:
#def generate_incident(technique_tuple):
from pprint import pprint
def _extract_json_object(text):
    """Return the JSON object embedded in *text*, or ``None`` if there is none.

    The span from the first ``{`` to the last ``}`` is parsed first. If that
    fails (for example because the model appended text containing a ``}``),
    only the first complete JSON value at the start of the span is decoded.

    Raises:
        json.JSONDecodeError: a brace-delimited span exists but no JSON
            object can be decoded from it.
    """
    match = re.search(r'\{.*\}', text, re.DOTALL)
    if match is None:
        return None

    candidate = match.group(0)
    try:
        return json.loads(candidate)
    except json.JSONDecodeError as span_error:
        # The greedy span may include trailing non-JSON text; decode just the
        # leading object and ignore whatever follows it.
        try:
            parsed, _end = json.JSONDecoder().raw_decode(candidate)
        except json.JSONDecodeError:
            # Report the error for the full span, which is what callers saw
            # before the fallback existed.
            raise span_error from None
        return parsed


def generate_incident(technique_tuple, model_name="gpt-4-turbo"):
    """Generate one synthetic incident for a MITRE ATT&CK technique.

    Sends a single user prompt to the chat-completions endpoint of the
    module-level ``client`` and extracts the JSON object from the reply.

    Args:
        technique_tuple: two-item sequence ``(technique_id, technique_desc)``.
        model_name: model identifier passed as ``model=`` to the API.

    Returns:
        A one-entry dict ``{"<technique_id>_<UTC timestamp>": incident}``,
        where the timestamp is formatted ``%Y%m%dT%H%M%S``. ``incident`` is
        the parsed JSON object, or the raw reply text when the reply contains
        no ``{...}`` span at all.

    Raises:
        ValueError: the reply has no text content.
        json.JSONDecodeError: the reply contains a ``{...}`` span that is not
            valid JSON.
        Any exception raised by the API client call propagates unchanged.
    """
    # Local import: the file's top-level import only brings in `datetime`.
    from datetime import timezone

    technique_id, technique_desc = technique_tuple

    incident_prompt_template = f"""
    You are a cybersecurity simulation assistant trained to generate structured synthetic incident data for automation workflows.

    Your task is to generate a JSON object representing a cybersecurity incident that is aligned with a given MITRE ATT&CK technique which is described in the following link: https://attack.mitre.org/techniques/{technique_id}/

    The JSON must include the following fields:

    1. `incident_id` – a unique identifier (UUID format).
    2. `technique_id` – e.g., T1059
    3. `technique_desc` – short description of the MITRE technique (e.g., Command and Scripting Interpreter).
    4. `incident_description` – a 2-sentence narrative of how the attack unfolded.
    5. `attack_logs` – a list of exactly 3 logs, each containing:
       - `timestamp` (ISO 8601 format)
       - `host` (e.g., host-22)
       - `action` (e.g., “File Dropped”)
       - `details` (e.g., suspicious behavior)
    6. `ground_truth_mitigations` – a list of 3 to 6 mitigation steps. Each mitigation should include:
       - `step`: description of the action (e.g., “Kill malicious process”)
       - `uuid`: a unique UUID
       - `agent`: organization ID responsible (e.g., organization--abc)
       - `command`: executable bash-style command to perform the step
       - optionally:
         - `condition`: for if/else branching
         - `loop`: for iterative steps
         - `variables`: any variable set or used

    You are encouraged to add structural diversity:
    - Some mitigations should include loop or conditional logic (e.g., “repeat until scan is clean”).
    - Some should be parallelizable or dependent on others using variable links.

    The final output must be valid JSON.
    Only return the JSON object.

    Target MITRE Technique:
    {technique_id} - {technique_desc}
    """

    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": incident_prompt_template}],
        #temperature=0.6,
        #max_tokens=4096,
    )

    output_json_str = response.choices[0].message.content
    if output_json_str is None:
        # Without this check re.search() would fail with an opaque TypeError.
        raise ValueError(
            f"Model {model_name} returned no text content for technique {technique_id}."
        )

    clean_json = _extract_json_object(output_json_str)
    if clean_json is None:
        # Keep the raw reply so the caller can still inspect what came back.
        print("No JSON-like structure found in the response.")
        clean_json = output_json_str

    # Aware UTC "now"; formats identically to the deprecated datetime.utcnow().
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S")

    key = f"{technique_id}_{timestamp}"
    return {key: clean_json}




In [7]:
import random
from tqdm.auto import tqdm

# Seed the RNG so the technique shuffle order is reproducible.
random.seed(42)

# Models to generate a dataset with; each one gets its own
# Samples/<model>/dataset.json file.
# Other ids used in earlier runs: "gpt-4o-mini", "o3", "o3-mini",
# "gpt-4.5-preview", "gpt-4-turbo", "gpt-3.5-turbo", "gpt-4".
models_to_try = ["gpt-4o"]

NUM_SAMPLES_PER_TECHNIQUE = 1
DEMO_MODE = True

# Number of incidents to collect per model: a fixed 30 in demo mode, otherwise
# NUM_SAMPLES_PER_TECHNIQUE for every loaded technique.
LIMIT = NUM_SAMPLES_PER_TECHNIQUE * len(MITRE_TECHNIQUES) if not DEMO_MODE else 30

# Iterate over the models
for model_iter in tqdm(models_to_try, desc="Models", total=len(models_to_try), leave=False):
    # Incidents generated for this model in the current run.
    model_outputs = []
    round_robin_counter = 0

    with tqdm(total=NUM_SAMPLES_PER_TECHNIQUE, desc="Round robin iteration", leave=False) as pbar:
        # Each pass of this loop is one round over all techniques in random order.
        while len(model_outputs) < LIMIT:
            round_robin_counter += 1
            pbar.update(1)
            outputs_before_round = len(model_outputs)

            # Shuffle a copy so the shared MITRE_TECHNIQUES list keeps its order.
            techniques = list(MITRE_TECHNIQUES)
            random.shuffle(techniques)

            # One progress bar per round; the context manager closes it even
            # when the round ends early because LIMIT was reached.
            with tqdm(techniques, desc="Techniques", total=len(techniques), leave=False) as technique_iterator:
                for technique_tuple in technique_iterator:
                    try:
                        response = generate_incident(technique_tuple, model_name=model_iter)
                    except Exception as e:
                        # Best effort: report the failure and move on to the next technique.
                        print(f"Error generating incident for model {model_iter} and technique {technique_tuple}: {e}")
                        continue

                    # The response is a dict keyed by "<technique_id>_<timestamp>".
                    if response:
                        model_outputs.append(response)

                        if len(model_outputs) >= LIMIT:
                            break

            # A full round with no new incident means every call failed (or the
            # technique list is empty); retrying would loop forever.
            if len(model_outputs) == outputs_before_round:
                print(f"No incidents generated for model {model_iter} in round {round_robin_counter}; stopping.")
                break

    # Save the generated incidents, appending to any dataset already on disk.
    model_output_folder = f"Samples/{model_iter}"
    os.makedirs(model_output_folder, exist_ok=True)
    dataset_path = f"{model_output_folder}/dataset.json"
    if os.path.exists(dataset_path):
        with open(dataset_path, "r", encoding="utf-8") as f:
            existing_data = json.load(f)
        # After this, model_outputs holds the old entries followed by the new ones.
        existing_data.extend(model_outputs)
        model_outputs = existing_data
    with open(dataset_path, "w", encoding="utf-8") as f:
        json.dump(model_outputs, f, indent=4)



Models:   0%|          | 0/1 [00:00<?, ?it/s]
Round robin iteration:   0%|          | 0/1 [00:00<?, ?it/s][A
                                                            [A

NameError: name 'MITRE_TECHNIQUES' is not defined

In [21]:
# Report how many incidents model_outputs currently holds. After the generation
# loop's save step this list also contains any entries that were already in
# dataset.json, not just the ones generated in this run.
print(len(model_outputs))
# Disabled manual re-save of model_outputs for the first model in models_to_try.
# It repeats the load/extend/dump sequence performed at the end of the
# generation loop.
# # Save the generated incident to a JSON file
# model_output_folder = f"Samples/{models_to_try[0]}"
# os.makedirs(model_output_folder, exist_ok=True)
# # Append the response to a JSON file
# # Check if the file already exists
# if os.path.exists(f"{model_output_folder}/dataset.json"):
#     # If it exists, load the existing data
#     with open(f"{model_output_folder}/dataset.json", "r") as f:
#         existing_data = json.load(f)
#         # Append the new data to the existing data
#         existing_data.extend(model_outputs)
#         model_outputs = existing_data
# with open(f"{model_output_folder}/dataset.json", "w") as f:
#     json.dump(model_outputs, f, indent=4)

1211
