#Install

SDK: https://github.com/anthropics/anthropic-sdk-python

In [None]:
!pip install anthropic

import os
import json
import pandas as pd
import time
import re
import csv

Collecting anthropic
  Downloading anthropic-0.28.0-py3-none-any.whl (862 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m862.7/862.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from anthropic)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jiter<1,>=0.4.0 (from anthropic)
  Downloading jiter-0.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (327 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.6/327.6 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->anthropic)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->anthropic)


# This is formatted as code


### Import packages

In [None]:
import pathlib
import textwrap


from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Wrap *text* in a Markdown blockquote suitable for notebook display.

  Bullet characters ('•') are rewritten as indented '*' list markers, then
  every line -- blank ones included -- is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

  # Used to securely store your API key
from google.colab import userdata

### Setup your API key

Before you can use the Claude API, you must first obtain an API key. If you don't already have one, create a key in the Anthropic Console.

<a class="button button-primary" href="https://console.anthropic.com/settings/keys" target="_blank" rel="noopener noreferrer">Get an API key</a>


In Colab, add the key to the secrets manager under the "🔑" in the left panel. Give it the name `ANTHROPIC_API_KEY`.

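Once you have the API key, pass it to the SDK. You can do this in two ways:

* Put the key in the `ANTHROPIC_API_KEY` environment variable (the SDK will automatically pick it up from there).
* Pass the key explicitly through the `api_key` argument of `Anthropic(...)`, as the next cell does.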


In [None]:
import os
from anthropic import Anthropic

# Shared API client; the request functions in this notebook call `client.messages.create`.
client = Anthropic(
    # The key is read from the Colab secret named "ANTHROPIC_API_KEY".
    # NOTE(review): outside Colab, `os.getenv("ANTHROPIC_API_KEY")` would be the
    # usual replacement for `userdata.get(...)` -- confirm before relying on it.
    api_key=userdata.get("ANTHROPIC_API_KEY"),
)


### Demo test for us

In [None]:
# Sample rule (sid 30781, OpenSSL heartbeat response) used to try the prompt on a single input.
snort_rule = '''alert tcp $HOME_NET [21,25,443,465,636,992,993,995,2484] -> $EXTERNAL_NET any ( msg:"SERVER-OTHER OpenSSL TLSv1.1 large heartbeat response - possible ssl heartbleed attempt"; flow:to_client,established; content:"|16 03 02|"; byte_jump:2,0,relative; content:"|18 03 02|",within 3,fast_pattern; byte_test:2,>,128,0,relative; metadata:policy max-detect-ips drop,policy security-ips drop,ruleset community; service:ssl; reference:cve,2014-0160; classtype:attempted-recon; sid:30781; rev:5; )'''

# Single user turn: the rule is embedded in <snort_rule> tags and the expected
# output layout is shown inside <examples> tags. The prompt's leading
# indentation is part of the string that is sent.
message = client.messages.create(
    max_tokens=1024,
    messages=[
        {
            "role": "user",
            "content": f"""
                          I'm going to give you a Snort rule. Read the Snort rule carefully, because I'm going to given you a task about it. Here is the Snort rule: <snort_rule>{snort_rule}</snort_rule>

                          First, find the techniques from MITRE ATT&CK that are most relevant to the Snort rule.

                          Then, answer the task, for each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.

                          Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITRE technique ID.

                          Thus, the format of your overall response should look like what's shown between the <examples></examples> tags. Make sure to follow the formatting and spacing exactly.


                          <examples>
                          [
                            "sid": "2274",
                            "Technique ID": "T1110",
                            "Technique name": "Brute Force",
                            "Quotes": [
                              "\"PROTOCOL-POP login brute force attempt\"",
                              "track by_dst,count 30,seconds 30"
                            ],
                            "Explanation": "The rule is looking for excessive \"USER\" commands within a short period of time, which are common indicators of brute-force attacks targeting the POP3 service."
                          ]
                          </examples>

                          Do not include anything besides write the JSON.
                          """,
        }
    ],
    # This demo cell uses claude-2.1; the experiment functions below use a different model.
    model="claude-2.1",
    temperature=0,
)


#print(message.content) # list

In [None]:
# message.content is a list of content blocks; str() gives the repr of that
# list, and the regex below is run over that repr.
text = str(message.content)
#text = str(message.content).split("\\n\\n")[1].split("', type")[0]
print(type(text))
print(text)

# Capture every technique ID -- "T" followed by digits, with an optional
# ".digits" suffix -- that appears immediately after a single or double quote.
t_numbers = re.findall(r'[\'\"](T\d+(?:\.\d+)?)', text)

print(t_numbers)

<class 'str'>
[ContentBlock(text='{\n    "Sid": "30781",\n    "Technique ID": "T1529",\n    "Technique Name": "System Services Discovery",\n    "Quotes": "\\"This rule detects attackers attempting to exploit the Heartbleed vulnerability in OpenSSL to read sensitive information from the server\'s memory.\\"",\n    "Explanation": "The rule is looking for signs of attackers trying to exploit the Heartbleed vulnerability, which allows reading sensitive data from memory. This aligns with the System Services Discovery technique in MITRE ATT&CK, which involves gathering information about services running on remote systems. The attempt to exploit Heartbleed to gather data is a form of system services discovery."\n}', type='text')]
['T1529']


### Data:
Our data is taken from 162 Snort rules that have already been manually labeled with techniques from MITRE ATT&CK.

In [None]:
!git clone https://github.com/trickdeath0/Labeling_IDS_to_MITRE.git

Cloning into 'Labeling_IDS_to_MITRE'...
remote: Enumerating objects: 413, done.[K
remote: Counting objects: 100% (413/413), done.[K
remote: Compressing objects: 100% (287/287), done.[K
remote: Total 413 (delta 216), reused 307 (delta 111), pack-reused 0[K
Receiving objects: 100% (413/413), 9.19 MiB | 10.44 MiB/s, done.
Resolving deltas: 100% (216/216), done.


In [None]:
# data = pd.read_csv('/content/Labeling_IDS_to_MITRE/ground_truth.csv') # Nir experiment
# Load the labeled rules from the cloned Labeling_IDS_to_MITRE repository.
data = pd.read_csv('/content/Labeling_IDS_to_MITRE/Semester_B/01 stratification/test_data_fix.csv') # Our experiment
print(data.head())
# 'Rule' holds the Snort rule text; 'technique ids' holds the labeled technique IDs for that rule.
rules_list = data['Rule']
true_labels = data['technique ids']

#print(data['Sid'][0+41])
# Report the number of rows that were loaded.
print(f"\n{len(data)=}")

     Sid                                  URL       technique ids  \
0  50094  https://snort.org/rule_docs/1-50094           ['T1187']   
1  38563  https://snort.org/rule_docs/1-38563           ['T1056']   
2    976    https://snort.org/rule_docs/1-976           ['T1204']   
3   1129   https://snort.org/rule_docs/1-1129           ['T1218']   
4  27967  https://snort.org/rule_docs/1-27967  ['T1505', 'T1219']   

                                                Rule  
0  alert tcp any $HTTP_PORTS -> any any ( msg:"IN...  
1  alert tcp $EXTERNAL_NET $HTTP_PORTS -> $HOME_N...  
2  alert tcp $EXTERNAL_NET any -> $HTTP_SERVERS $...  
3  alert tcp $EXTERNAL_NET any -> $HTTP_SERVERS $...  
4  alert tcp $EXTERNAL_NET any -> $HOME_NET $HTTP...  

len(data)=300


In [None]:
def clean_response(text):
    """Strip Markdown quote markers and code fences from a model response.

    Args:
        text: Either a plain string, or an object that exposes the response
            text through a ``data`` attribute.

    Returns:
        The text with every ">" removed, surrounding whitespace stripped, and
        any "```json" / "```" fence markers deleted. Whitespace left behind by
        a removed fence is not stripped again.
    """
    # A plain string has no ``.data``; fall back to the argument itself so both
    # call styles work instead of raising AttributeError.
    raw = getattr(text, "data", text)
    cleaned = raw.replace(">", "").strip()
    # Remove the longer marker first so "```json" does not leave "json" behind.
    return cleaned.replace("```json", "").replace("```", "")


# **Zero Shot (ZS):**
At this stage, the LLM receives a prompt that does not include the list of MITRE ATT&CK techniques, so the results reflect only the knowledge the model acquired during training. Following our request, the LLM classifies each rule into techniques based on the content of the rule.

In [None]:
def ZS(snort_rule):
  """Send the zero-shot prompt for one Snort rule and return the API message.

  The prompt carries only the rule text and a one-line request for a MITRE
  technique ID; no technique list and no worked example are supplied. The
  object returned by `client.messages.create` is passed back unchanged.
  """
  zero_shot_prompt = f"""Rule: {snort_rule}
  Return a MITRE technique ID (with quotation marks) that related to the rule"""

  user_turn = {"role": "user", "content": zero_shot_prompt}

  return client.messages.create(
      model="claude-3-sonnet-20240229",
      max_tokens=1024,
      temperature=0,
      messages=[user_turn],
  )

In [None]:
# One-off check of ZS on a single rule (sid 6386). The rule text is pasted with
# its surrounding and doubled quotes intact, so the model receives them as-is.
a = '''
"alert udp $HOME_NET any -> $EXTERNAL_NET 15165 ( msg:""MALWARE-OTHER Keylogger stealthwatcher 2000 runtime detection - agent up notification""; content:""|00 00 00 00 0A 02 08 A6|"",depth 8; content:""|02 00 00|v"",distance 0; classtype:successful-recon-limited; sid:6386; rev:8; )"
'''

print(ZS(a))

Message(id='msg_01F71A7QyT4ZLYF1PoydRxio', content=[TextBlock(text='Based on the rule description "MALWARE-OTHER Keylogger stealthwatcher 2000 runtime detection - agent up notification", the relevant MITRE technique ID is:\n\n"T1556.002"\n\nThis technique ID corresponds to "Credential Access: Credentials from Web Browsers" in the MITRE ATT&CK framework, which includes keylogging as a method for capturing credentials from web browsers.', type='text')], model='claude-3-sonnet-20240229', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=Usage(input_tokens=143, output_tokens=95))


# **Prompting without techniques guide and without example (WTGWE):**
At this stage, the LLM receives a prompt that does not include the list of MITRE ATT&CK techniques, so the results reflect only the knowledge the model acquired during training. Following our request, the LLM classifies each rule into techniques based on the content of the rule.


**Prompt1**:

      prompt = f"""You are an information security expert. Your task is to label IDS rules for MITRE ATT&CK techniques based on your cybersecurity knowledge. For the task, you are going to get a single Snort IDS rule and you will need to return the most relevant techniques from MITRE ATT&CK that are related to the rule.
      Try to search based on keywords and based on the knowledge you have. For each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.
      Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITRE technique ID.
      Please don't write anything but the JSON. Rule: {snort_rule}"""


**prompt2**:

      prompt2 = f"""I'm going to give you a Snort rule. Read the Snort rule carefully, because I'm going to given you a task about it. Here is the Snort rule: <snort_rule>{snort_rule}</snort_rule>

      First, find the techniques from MITRE ATT&CK that are most relevant to the Snort rule.

      Then, answer the task, for each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.

      Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITRE technique ID.

      Thus, the format of your overall response should look like what's shown between the <examples></examples> tags. Make sure to follow the formatting and spacing exactly.


      <examples>
        [
          "sid": "2274",
          "Technique ID": "T1110",
          "Technique name": "Brute Force",
          "Quotes": [
            "\"PROTOCOL-POP login brute force attempt\"",
            "track by_dst,count 30,seconds 30"
          ],
          "Explanation": "The rule is looking for excessive \"USER\" commands within a short period of time, which are common indicators of brute-force attacks targeting the POP3 service."
        ]
        </examples>

        Do not include anything besides write the JSON.
        """


**prompt3**:

        prompt3 = f"""You work in a company that deals with information security, your role in the company is to label techniques from MITRE ATT&CK to the rules of IDS systems. The labeling between a rule and a technique indicates that the attacker operated with a technique that you found to be suitable for the rule that alerted the IDS system. Now we will test your knowledge labeling IDS rules for MITRE ATT&CK techniques. For your task, you're going to have a single Snort IDS rule and you'll need to label the most relevant techniques from MITRE ATT&CK associated with the rule. From the rule you receive, your labeling should be based on your knowledge and the information found within the 'msg' in the rule received. For each technique you call the rule, include the following information as JSON format in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.  Note: The value of the 'Quotes' field should contain quotation marks from the data sets relevant to the mapped technique. The value of the 'Explanation' should be your explanation of why you decided to give the technique and how it relates to the rule. The 'Technique ID' should be the official MITRE technique ID.
        Please don't write anything but the JSON. Rule: {snort_rule}"""


**prompt4**

        prompt4 = f'''You are going to receive a Snort rule and your task is to find as many MITRE ATT&CK techniques as possible that are associated with the rule. Note: You should categorize the techniques to 1 or 2. Technique of type 1 is a technique that you can associate with the rule directly based on the rule. Technique of type 2 is a technique that can be associated with the rule indirectly, based on your knowledge and understanding. The categorization value should be the value 1 or 2, based on the explanation given above. The quotes field value should contain quotes from the rules data that are relevant to the technique mapped and they are the main reason you believe the mapping to this technique is correct. The explanation’s value should be your explanation for why you decided to give the technique and how it is associated with the rule. The technique id should be the official MITRE technique id. For each technique include the following information as JSON: sid, Technique id, Technique name, Categorization, Quotes, Explanation. After each rule I will provide you with, answer according to the provided format. Please do not write anything else but the JSON. Rule: {snort_rule}''')


**prompt5 fix:**

        prompt5 = f"""You are an information security expert. Your task is to label IDS rules for MITER ATT&CK techniques based on your cybersecurity knowledge. For the task, you are going to get a single Snort IDS rule and you will need to return no more than 2 most relevant techniques from MITER ATT&CK that are related to the rule.
        Try to search based on keywords and based on the knowledge you have. For each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.
        Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITER technique ID.
        Please don't write anything but the JSON. Rule: {snort_rule}"""

In [None]:
def WTGWE(snort_rule):
  """Query the model for one Snort rule with no technique guide and no worked example.

  The prompt wraps the rule in <snort_rule> tags, asks for at most two
  techniques, and shows an empty output template inside <examples> tags.
  The object returned by `client.messages.create` is passed back unchanged.
  """
  # The earlier "information security expert" wording (listed as Prompt1 in the
  # notes preceding this function) was replaced by the tagged prompt below.
  request_text = f"""I'm going to give you a Snort rule. Read the Snort rule carefully, because I'm going to given you a task about it. Here is the Snort rule: <snort_rule>{snort_rule}</snort_rule>

  First, your task is to return no more than 2 most relevant techniques from MITER ATT&CK that are related to the Snort rule.

  Then, answer the task, for each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.

  Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITRE technique ID.


  <examples>
    [
      "sid": "",
      "Technique ID": "",
      "Technique name": "",
      "Quotes": [""],
      "Explanation": ""
    ]
    </examples>

    Do not include anything besides write the JSON.
    """

  conversation = [{"role": "user", "content": request_text}]

  return client.messages.create(
      model="claude-3-sonnet-20240229",
      max_tokens=1024,
      temperature=0,
      messages=conversation,
  )

# **Prompting without techniques guide and with 1 example (WTG1E):**
At this stage, the LLM receives a prompt that does not include the list of MITRE ATT&CK techniques, so the results reflect only the knowledge the model acquired during training. Following our request, the LLM classifies each rule into techniques based on the content of the rule.

In addition, the prompt contains one worked example (one-shot).


In [None]:
def WTG1E(snort_rule):
  """Query the model for one Snort rule, preceded by a single worked example.

  The example is supplied as a prior exchange: a user turn containing a fixed
  rule (sid 38563), an assistant turn containing the answer we expect for it,
  and then a final user turn containing the same instructions for
  `snort_rule`. The object returned by `client.messages.create` is passed
  back unchanged.
  """
  # An earlier variant packed the example and the real question into a single
  # "Q: ... A: ..." prompt using the "information security expert" wording; it
  # was replaced by the three-turn conversation built below.

  # Turn 1 (user): the fixed example rule. The text has no placeholders.
  example_question = """Q: I'm going to give you a Snort rule. Read the Snort rule carefully, because I'm going to given you a task about it. Here is the Snort rule: <snort_rule>"alert tcp $EXTERNAL_NET $HTTP_PORTS -> $HOME_NET any ( msg:""MALWARE-CNC Win.Trojan.GateKeylogger fake 404 response""; flow:to_client,established; http_stat_code; content:""200""; http_stat_msg; content:""OK""; pkt_data; content:"">404 Not Found<"",fast_pattern,nocase; content:"" requested URL / was not found ""; metadata:impact_flag red,ruleset community; service:http; T1056; classtype:trojan-activity; sid:38563; rev:4; )"</snort_rule>

  First, your task is to return no more than 2 most relevant techniques from MITER ATT&CK that are related to the Snort rule.

  Then, answer the task, for each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.

  Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITRE technique ID.


  <examples>
    [
      "sid": "",
      "Technique ID": "",
      "Technique name": "",
      "Quotes": [""],
      "Explanation": ""
    ]
    </examples>

    Do not include anything besides write the JSON.
    """

  # Turn 2 (assistant): the answer we expect for the example rule.
  example_answer = """
    [
        "sid": "38563",
        "Technique ID": "T1056",
        "Technique name": "Input Capture",
        "Quotes": "\"Input Capture techniques involve intercepting and capturing user input data, such as keystrokes, to obtain sensitive information. The rule indicates the presence of a Trojan (GateKeylogger) that mimics a '404 Not Found' error to disguise its communication with a command and control server, which is a common method used by keyloggers to stealthily capture input data.\"",
        "Explanation": "This event is generated when activity relating to malware is detected."
    ]

  """

  # Turn 3 (user): the same instructions, now for the rule being labeled.
  actual_question = f"""
  I'm going to give you a Snort rule. Read the Snort rule carefully, because I'm going to given you a task about it. Here is the Snort rule: <snort_rule>{snort_rule}</snort_rule>

  First, your task is to return no more than 2 most relevant techniques from MITER ATT&CK that are related to the Snort rule.

  Then, answer the task, for each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.

  Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITRE technique ID.


  <examples>
    [
      "sid": "",
      "Technique ID": "",
      "Technique name": "",
      "Quotes": [""],
      "Explanation": ""
    ]
    </examples>

    Do not include anything besides write the JSON.
    """

  conversation = [
      {"role": "user", "content": example_question},
      {"role": "assistant", "content": example_answer},
      {"role": "user", "content": actual_question},
  ]

  return client.messages.create(
      model="claude-3-sonnet-20240229",
      max_tokens=1024,
      temperature=0,
      messages=conversation,
  )

# **Prompting without techniques guide and with 2 example (WTG2E):**
At this stage, the LLM receives a prompt that does not include the list of MITRE ATT&CK techniques, so the results reflect only the knowledge the model acquired during training. Following our request, the LLM classifies each rule into techniques based on the content of the rule.

In addition, the prompt contains two worked examples (two-shot).


In [None]:
def WTG2E(snort_rule):
  """Label a Snort rule with MITRE ATT&CK techniques using a two-shot prompt.

  No techniques guide is sent. Two worked examples (a user prompt followed by
  the assistant answer we expect) precede the real request in the conversation.

  Args:
    snort_rule: Text of the Snort rule to label; interpolated into the final
      user prompt.

  Returns:
    The object returned by ``client.messages.create`` (``client`` is the
    module-level Anthropic client created elsewhere in the notebook).
  """

  # First worked example: the GateKeylogger rule (sid 38563).
  prompt_with_example_1 = """Q: I'm going to give you a Snort rule. Read the Snort rule carefully, because I'm going to given you a task about it. Here is the Snort rule: <snort_rule>"alert tcp $EXTERNAL_NET $HTTP_PORTS -> $HOME_NET any ( msg:""MALWARE-CNC Win.Trojan.GateKeylogger fake 404 response""; flow:to_client,established; http_stat_code; content:""200""; http_stat_msg; content:""OK""; pkt_data; content:"">404 Not Found<"",fast_pattern,nocase; content:"" requested URL / was not found ""; metadata:impact_flag red,ruleset community; service:http; T1056; classtype:trojan-activity; sid:38563; rev:4; )"</snort_rule>

  First, your task is to return no more than 2 most relevant techniques from MITER ATT&CK that are related to the Snort rule.

  Then, answer the task, for each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.

  Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITRE technique ID.


  <examples>
    [
      "sid": "",
      "Technique ID": "",
      "Technique name": "",
      "Quotes": [""],
      "Explanation": ""
    ]
    </examples>

    Do not include anything besides write the JSON.
    """

  answer_we_expect_1 = """
    [
        "sid": "38563",
        "Technique ID": "T1056",
        "Technique name": "Input Capture",
        "Quotes": "\"Input Capture techniques involve intercepting and capturing user input data, such as keystrokes, to obtain sensitive information. The rule indicates the presence of a Trojan (GateKeylogger) that mimics a '404 Not Found' error to disguise its communication with a command and control server, which is a common method used by keyloggers to stealthily capture input data.\"",
        "Explanation": "This event is generated when activity relating to malware is detected."
    ]
  """

  # Second worked example: the Log4j rule (sid 58738).
  # The backslashes of the pcre are doubled on purpose: in a normal string
  # literal "\x24", "\x7b" and "\x3a" would be decoded to "$", "{" and ":",
  # so the model would see a different pattern than the real rule contains.
  prompt_with_example_2 = """Q: I'm going to give you a Snort rule. Read the Snort rule carefully, because I'm going to given you a task about it. Here is the Snort rule: <snort_rule>"alert tcp $EXTERNAL_NET any -> $HOME_NET $HTTP_PORTS ( msg:""SERVER-OTHER Apache Log4j logging remote code execution attempt""; flow:to_server,established; http_header; content:""upper"",fast_pattern,nocase; pcre:""/(%(25)?24|\\x24)(%(25)?7b|\\x7b)upper(%(25)?3a|\\x3a)/i""; metadata:policy balanced-ips drop,policy connectivity-ips drop,policy max-detect-ips drop,policy security-ips drop,ruleset community; service:http; classtype:attempted-user; gid:1; sid:58738; rev:5; )"</snort_rule>

  First, your task is to return no more than 2 most relevant techniques from MITER ATT&CK that are related to the Snort rule.

  Then, answer the task, for each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.

  Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITRE technique ID.


  <examples>
    [
      "sid": "",
      "Technique ID": "",
      "Technique name": "",
      "Quotes": [""],
      "Explanation": ""
    ]
    </examples>

    Do not include anything besides write the JSON.
    """

  # The sid must be the one of the example rule above (58738); the previous
  # value "23934" did not appear anywhere in that rule.
  answer_we_expect_2 = """
    [
        "sid": "58738",
        "Technique ID": "T1190",
        "Technique name": "Exploit Public-Facing Application",
        "Quotes": "Adversaries may attempt to exploit a weakness in an Internet-facing host or system to initially access a network.",
        "Explanation": "This rule looks for attempts to exploit a remote code execution vulnerability in Log4j's "Lookup" functionality."
    ]
  """

  # NOTE(review): unlike the two example prompts, the real request carries no
  # <examples> block and no "JSON only" instruction - confirm this is intended.
  prompt = f"""
  I'm going to give you a Snort rule. Read the Snort rule carefully, because I'm going to given you a task about it. Here is the Snort rule: <snort_rule>{snort_rule}</snort_rule>

  First, your task is to return no more than 2 most relevant techniques from MITER ATT&CK that are related to the Snort rule.

  Then, answer the task, for each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.

  Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITRE technique ID.
  """

  # Conversation: example 1 -> answer 1 -> example 2 -> answer 2 -> real rule.
  message = client.messages.create(
      max_tokens=1024,
      messages=[
          {"role": "user", "content": prompt_with_example_1},
          {"role": "assistant", "content": answer_we_expect_1},
          {"role": "user", "content": prompt_with_example_2},
          {"role": "assistant", "content": answer_we_expect_2},
          {"role": "user", "content": prompt},
      ],
      model="claude-3-sonnet-20240229",
      temperature=0,
  )

  return message

# Pre-collecting data for TG

In [None]:
def recursive_enter(path: str, file_list: list = None) -> list:
    """Load every ``*.json`` file located directly inside ``path``.

    Despite its name, the function does not descend into sub-directories.

    Args:
        path: Directory to scan (absolute or relative).
        file_list: Optional list the parsed JSON documents are appended to
            (it is mutated in place). A new list is created when omitted.

    Returns:
        ``file_list`` with one parsed JSON document appended per file that
        could be read. Problems are reported with ``print`` and never raised.
    """
    if file_list is None:
        file_list = []

    try:
        # List the directory by path instead of os.chdir()-ing into it:
        # changing the working directory made relative paths resolve twice
        # (path/path/item) and leaked a cwd change to the caller.
        items = os.listdir(path)
    except OSError as e:
        print(f"An error occurred: {e}")
        return file_list

    for item in items:
        if not item.endswith(".json"):
            continue

        full_path = os.path.join(path, item)
        try:
            with open(full_path, encoding="utf-8") as f:
                file_list.append(json.load(f))
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            # A single bad file is reported and skipped so that the
            # remaining files are still loaded.
            print(f"An error occurred: {e}")

    return file_list

# Folder scanned for technique JSON files (presumably one file per technique - verify against the repo).
tacticFolder = "/content/Labeling_IDS_to_MITRE/Semester_A/Extract data from MITRE ATTACK/techniques_split"
file_list = []
# recursive_enter appends to and returns the list it is given, so
# MITRE_Technique and file_list refer to the same list object.
MITRE_Technique = recursive_enter(tacticFolder, file_list)
# Number of JSON documents that were loaded.
print(len(MITRE_Technique))
# Make sure the following cells run with /content/ as the working directory.
os.chdir("/content/")

All_MITRE_Technique_json = None
All_MITRE_Technique_json_path = "/content/Labeling_IDS_to_MITRE/Semester_A/Extract data from MITRE ATTACK/combined_techniques_split.json"
# Load the combined techniques file into All_MITRE_Technique_json.
with open(All_MITRE_Technique_json_path, 'r') as file:
    All_MITRE_Technique_json = json.load(file)

# **Prompting with techniques guide and without example (TGWE):**
In the next step, we provide the LLMs with the list of all the techniques from MITRE ATT&CK, to guarantee that the models consider every existing technique, even the infrequently used ones. Each technique includes the technique number, the name of the technique and its description. The techniques are provided to the models in batches (due to the memory limit of the models), and after each batch we ask the model to classify the appropriate techniques from the list it received (if any exist). Finally, we merge the model's answers for each individual rule.


In [None]:
def TGWE(snort_rule, techniques, limit):
  """Zero-shot labelling of a Snort rule with a techniques guide in the prompt.

  A single user message is sent. It is the concatenation of a role
  introduction, the stringified ``techniques`` and the task description for
  ``snort_rule``.

  Args:
    snort_rule: Text of the Snort rule to label.
    techniques: Technique information; inserted into the prompt via ``str()``.
    limit: When truthy the model is asked for no more than 2 techniques,
      otherwise for the most relevant techniques without a number.

  Returns:
    The object returned by ``client.messages.create``.
  """
  str_limit = (
      "First, your task is to return no more than 2 most relevant techniques from MITER ATT&CK that are related to the Snort rule."
      if limit
      else "First, find the techniques from MITRE ATT&CK that are most relevant to the Snort rule."
  )

  role_intro = """You are an information security expert. Now I will provide you information about techniques from MITRE ATT&CK, you will use the information for a task you will receive later. Do not reply to the information you receive."""

  guide_section = f"The information:\n {str(techniques)}"

  task_section = f"""I'm going to give you a Snort rule. Read the Snort rule carefully, because I'm going to given you a task about it. Here is the Snort rule: <snort_rule>{snort_rule}</snort_rule>

  {str_limit}

  Then, answer the task, for each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.

  Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITRE technique ID.


  <examples>
    [
      "sid": "",
      "Technique ID": "",
      "Technique name": "",
      "Quotes": [""],
      "Explanation": ""
    ]
    </examples>

    Do not include anything besides write the JSON.
    """

  # The three parts are joined without any separator between them.
  request = {"role": "user", "content": role_intro + guide_section + task_section}

  return client.messages.create(
      max_tokens=1024,
      messages=[request],
      model="claude-3-sonnet-20240229",
      temperature=0,
  )



# **Prompting with techniques guide and with 1 example (TG1E):**
In the next step, we provide the LLMs with the list of all the techniques from MITRE ATT&CK, to guarantee that the models consider every existing technique, even the infrequently used ones. Each technique includes the technique number, the name of the technique and its description. The techniques are provided to the models in batches (due to the memory limit of the models), and after each batch we ask the model to classify the appropriate techniques from the list it received (if any exist). Finally, we merge the model's answers for each individual rule.

In addition, the prompt contains one example (one-shot).

In [None]:
def TG1E(snort_rule, techniques, limit):
  """One-shot labelling of a Snort rule with a techniques guide in the prompt.

  The conversation consists of one worked example (user prompt and the
  assistant answer we expect) followed by the real request. Both user
  messages start with the role introduction and the stringified
  ``techniques``.

  Args:
    snort_rule: Text of the Snort rule to label.
    techniques: Technique information; inserted into the prompts via ``str()``.
    limit: When truthy the model is asked for no more than 2 techniques,
      otherwise for the most relevant techniques without a number.

  Returns:
    The object returned by ``client.messages.create``.
  """
  str_limit = (
      "First, your task is to return no more than 2 most relevant techniques from MITER ATT&CK that are related to the Snort rule."
      if limit
      else "First, find the techniques from MITRE ATT&CK that are most relevant to the Snort rule."
  )

  role_intro = """You are an information security expert. Now I will provide you information about techniques from MITRE ATT&CK, you will use the information for a task you will receive later. Do not reply to the information you receive."""

  # Prefix shared by both user messages (joined without separators).
  guide_prefix = role_intro + f"The information:\n {str(techniques)}"

  example_question = guide_prefix + f"""Q: I'm going to give you a Snort rule. Read the Snort rule carefully, because I'm going to given you a task about it. Here is the Snort rule: <snort_rule>"alert tcp $EXTERNAL_NET $HTTP_PORTS -> $HOME_NET any ( msg:""MALWARE-CNC Win.Trojan.GateKeylogger fake 404 response""; flow:to_client,established; http_stat_code; content:""200""; http_stat_msg; content:""OK""; pkt_data; content:"">404 Not Found<"",fast_pattern,nocase; content:"" requested URL / was not found ""; metadata:impact_flag red,ruleset community; service:http; T1056; classtype:trojan-activity; sid:38563; rev:4; )"</snort_rule>

  {str_limit}

  Then, answer the task, for each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.

  Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITRE technique ID.


  <examples>
    [
      "sid": "",
      "Technique ID": "",
      "Technique name": "",
      "Quotes": [""],
      "Explanation": ""
    ]
    </examples>

    Do not include anything besides write the JSON.
    """

  example_answer = """
    [
        "sid": "38563",
        "Technique ID": "T1056",
        "Technique name": "Input Capture",
        "Quotes": "\"Input Capture techniques involve intercepting and capturing user input data, such as keystrokes, to obtain sensitive information. The rule indicates the presence of a Trojan (GateKeylogger) that mimics a '404 Not Found' error to disguise its communication with a command and control server, which is a common method used by keyloggers to stealthily capture input data.\"",
        "Explanation": "This event is generated when activity relating to malware is detected."
    ]

  """

  real_question = guide_prefix + f"""
  I'm going to give you a Snort rule. Read the Snort rule carefully, because I'm going to given you a task about it. Here is the Snort rule: <snort_rule>{snort_rule}</snort_rule>

  {str_limit}

  Then, answer the task, for each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.

  Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITRE technique ID.


  <examples>
    [
      "sid": "",
      "Technique ID": "",
      "Technique name": "",
      "Quotes": [""],
      "Explanation": ""
    ]
    </examples>

    Do not include anything besides write the JSON.
    """

  # Conversation: worked example -> expected answer -> real rule.
  conversation = [
      {"role": "user", "content": example_question},
      {"role": "assistant", "content": example_answer},
      {"role": "user", "content": real_question},
  ]

  return client.messages.create(
      max_tokens=1024,
      messages=conversation,
      model="claude-3-sonnet-20240229",
      temperature=0,
  )


# **Prompting with techniques guide and with 2 examples (TG2E):**
In the next step, we provide the LLMs with the list of all the techniques from MITRE ATT&CK, to guarantee that the models consider every existing technique, even the infrequently used ones. Each technique includes the technique number, the name of the technique and its description. The techniques are provided to the models in batches (due to the memory limit of the models), and after each batch we ask the model to classify the appropriate techniques from the list it received (if any exist). Finally, we merge the model's answers for each individual rule.

In addition, the prompt contains two examples (two-shot).

In [None]:
def TG2E(snort_rule, techniques, limit):
  """Two-shot labelling of a Snort rule with a techniques guide in the prompt.

  The conversation consists of two worked examples (user prompt and the
  assistant answer we expect) followed by the real request. Every user
  message starts with the role introduction and the stringified
  ``techniques``.

  Args:
    snort_rule: Text of the Snort rule to label.
    techniques: Technique information; inserted into the prompts via ``str()``.
    limit: When truthy the model is asked for no more than 2 techniques,
      otherwise for the most relevant techniques without a number.

  Returns:
    The object returned by ``client.messages.create`` (``client`` is the
    module-level Anthropic client created elsewhere in the notebook).
  """
  if limit:
    str_limit = "First, your task is to return no more than 2 most relevant techniques from MITER ATT&CK that are related to the Snort rule."
  else:
    str_limit = "First, find the techniques from MITRE ATT&CK that are most relevant to the Snort rule."

  prePrompt = """You are an information security expert. Now I will provide you information about techniques from MITRE ATT&CK, you will use the information for a task you will receive later. Do not reply to the information you receive."""

  dataPrompt = f"The information:\n {str(techniques)}"

  # First worked example: the GateKeylogger rule (sid 38563).
  response_data = f"""Q: I'm going to give you a Snort rule. Read the Snort rule carefully, because I'm going to given you a task about it. Here is the Snort rule: <snort_rule>"alert tcp $EXTERNAL_NET $HTTP_PORTS -> $HOME_NET any ( msg:""MALWARE-CNC Win.Trojan.GateKeylogger fake 404 response""; flow:to_client,established; http_stat_code; content:""200""; http_stat_msg; content:""OK""; pkt_data; content:"">404 Not Found<"",fast_pattern,nocase; content:"" requested URL / was not found ""; metadata:impact_flag red,ruleset community; service:http; T1056; classtype:trojan-activity; sid:38563; rev:4; )"</snort_rule>

  {str_limit}

  Then, answer the task, for each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.

  Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITRE technique ID.


  <examples>
    [
      "sid": "",
      "Technique ID": "",
      "Technique name": "",
      "Quotes": [""],
      "Explanation": ""
    ]
    </examples>

    Do not include anything besides write the JSON.
    """

  tg_data_list = prePrompt + dataPrompt + response_data

  answer_we_expect_1 = """
    [
        "sid": "38563",
        "Technique ID": "T1056",
        "Technique name": "Input Capture",
        "Quotes": "\"Input Capture techniques involve intercepting and capturing user input data, such as keystrokes, to obtain sensitive information. The rule indicates the presence of a Trojan (GateKeylogger) that mimics a '404 Not Found' error to disguise its communication with a command and control server, which is a common method used by keyloggers to stealthily capture input data.\"",
        "Explanation": "This event is generated when activity relating to malware is detected."
    ]
  """

  # Second worked example: the Log4j rule (sid 58738).
  # The backslashes of the pcre are doubled on purpose: in a normal string
  # literal "\x24", "\x7b" and "\x3a" would be decoded to "$", "{" and ":",
  # so the model would see a different pattern than the real rule contains.
  prompt_with_example_2 = f"""Q: I'm going to give you a Snort rule. Read the Snort rule carefully, because I'm going to given you a task about it. Here is the Snort rule: <snort_rule>"alert tcp $EXTERNAL_NET any -> $HOME_NET $HTTP_PORTS ( msg:""SERVER-OTHER Apache Log4j logging remote code execution attempt""; flow:to_server,established; http_header; content:""upper"",fast_pattern,nocase; pcre:""/(%(25)?24|\\x24)(%(25)?7b|\\x7b)upper(%(25)?3a|\\x3a)/i""; metadata:policy balanced-ips drop,policy connectivity-ips drop,policy max-detect-ips drop,policy security-ips drop,ruleset community; service:http; classtype:attempted-user; gid:1; sid:58738; rev:5; )"</snort_rule>

  {str_limit}

  Then, answer the task, for each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.

  Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITRE technique ID.


  <examples>
    [
      "sid": "",
      "Technique ID": "",
      "Technique name": "",
      "Quotes": [""],
      "Explanation": ""
    ]
    </examples>

    Do not include anything besides write the JSON.
    """

  tg_data_list2 = prePrompt + dataPrompt + prompt_with_example_2

  # The sid must be the one of the example rule above (58738); the previous
  # value "23934" did not appear anywhere in that rule.
  answer_we_expect_2 = """
    [
        "sid": "58738",
        "Technique ID": "T1190",
        "Technique name": "Exploit Public-Facing Application",
        "Quotes": "Adversaries may attempt to exploit a weakness in an Internet-facing host or system to initially access a network.",
        "Explanation": "This rule looks for attempts to exploit a remote code execution vulnerability in Log4j's "Lookup" functionality."
    ]
  """

  # NOTE(review): unlike the two example prompts, the real request carries no
  # <examples> block and no "JSON only" instruction - confirm this is intended.
  prompt = f"""
  I'm going to give you a Snort rule. Read the Snort rule carefully, because I'm going to given you a task about it. Here is the Snort rule: <snort_rule>{snort_rule}</snort_rule>

  {str_limit}

  Then, answer the task, for each technique include the following information as JSON in this order: 'Sid', 'Technique ID', 'Technique Name', 'Quotes', 'Explanation'.

  Note: The value of the citation field should contain quotation marks from the data sets relevant to the mapped technique are the main reason you chose this technique to be correct. The value of the explanation should be your explanation of why you decided to give the technique and how it relates to the rule. The technique ID should be the official MITRE technique ID.
  """

  tg_data_list_prompt = prePrompt + dataPrompt + prompt

  # Conversation: example 1 -> answer 1 -> example 2 -> answer 2 -> real rule.
  message = client.messages.create(
      max_tokens=1024,
      messages=[
          {"role": "user", "content": tg_data_list},
          {"role": "assistant", "content": answer_we_expect_1},
          {"role": "user", "content": tg_data_list2},
          {"role": "assistant", "content": answer_we_expect_2},
          {"role": "user", "content": tg_data_list_prompt},
      ],
      model="claude-3-sonnet-20240229",
      temperature=0,
  )

  return message

# Write to csv

Write Zero Shot

In [None]:
def write_csv_ZS(filename, rule_dict):
  """Write the technique IDs found in the zero-shot responses to a CSV file.

  One row is written per value of ``rule_dict``. The row holds the list of
  technique IDs found in the cleaned response (stored in its string form by
  the ``csv`` module) and the entry of the global ``true_labels`` at the same
  position.

  Args:
    filename: Path of the CSV file; existing content is overwritten.
    rule_dict: Mapping whose values are the raw model responses; each value is
      passed to ``clean_response``. Its iteration order must line up with
      ``true_labels``.

  Raises:
    IndexError: If ``rule_dict`` has more entries than ``true_labels``.
  """
  field_names = ["Technique_id", "True_labels"]

  # Patterns are compiled once instead of on every loop iteration.
  # Single-quoted tokens that should become double-quoted.
  single_quoted = re.compile(r"((^|\s)'((?:[^'\\]|\\.)*)'(?=[\s.,:;!?)]))|(:\s*'((?:[^'\\]|\\.)+)')")
  # Double-quoted fragments that are dropped after the quote conversion.
  quoted_fragment = re.compile(r'"\S+"[\s\.]|\s"[\w\s]*"\s')
  # Quoted MITRE technique IDs such as "T1056" or "T1059.001".
  technique_id = re.compile(r'[\'\"](T\d+(?:\.\d+)?)')

  # Open the CSV file in write mode (truncating any existing content)
  with open(filename, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()

    # The Sid of the response was previously extracted here but never used,
    # so that dead code has been removed.
    for counter, value in enumerate(rule_dict.values()):
      text = clean_response(value)
      print(text)

      if "'Sid" in text:
        # Response uses single quotes: normalise them before extracting IDs.
        text = single_quoted.sub(lambda m: m.group().replace("'", '"'), text)
        text = quoted_fragment.sub("", text)

      writer.writerow({
          "Technique_id": technique_id.findall(text),
          "True_labels": true_labels[counter],
      })


Write without techniques guide

In [None]:
def write_csv_WTG(filename, rule_dict):
  """Write the "without techniques guide" responses to a CSV file.

  One row is written per value of ``rule_dict``. The row holds the Sid found
  in the response, the cleaned response text, the technique IDs found in it
  (stored in their string form by the ``csv`` module) and the entry of the
  global ``true_labels`` at the same position.

  Args:
    filename: Path of the CSV file; existing content is overwritten.
    rule_dict: Mapping whose values are the raw model responses; each value is
      passed to ``clean_response``. Its iteration order must line up with
      ``true_labels``.

  Raises:
    IndexError: If ``rule_dict`` has more entries than ``true_labels``.
  """
  field_names = ["Sid", "Response", "Technique_id", "True_labels"]

  # Patterns are compiled once instead of on every loop iteration.
  # Single-quoted tokens that should become double-quoted.
  single_quoted = re.compile(r"((^|\s)'((?:[^'\\]|\\.)*)'(?=[\s.,:;!?)]))|(:\s*'((?:[^'\\]|\\.)+)')")
  # Double-quoted fragments that are dropped after the quote conversion.
  quoted_fragment = re.compile(r'"\S+"[\s\.]|\s"[\w\s]*"\s')
  # Quoted MITRE technique IDs such as "T1056" or "T1059.001".
  technique_id = re.compile(r'[\'\"](T\d+(?:\.\d+)?)')
  # "sid"/"Sid" field; the former class [s|S] also accepted a literal "|".
  sid_field = re.compile(r'[\'\"][sS]id[\'\"]: [\'\"](\d+)[\'\"]')

  # Open the CSV file in write mode (truncating any existing content)
  with open(filename, "w", newline="") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    writer.writeheader()

    for counter, value in enumerate(rule_dict.values()):
      text = clean_response(value)

      if "'Sid" in text:
        # Response uses single quotes: normalise them before extracting data.
        text = single_quoted.sub(lambda m: m.group().replace("'", '"'), text)
        text = quoted_fragment.sub("", text)

      # Reset the Sid for every row. Before, a response without a Sid either
      # raised NameError (first row) or silently reused the previous row's Sid.
      match = sid_field.search(text)
      sid_number = match.group(1) if match else ""

      writer.writerow({
          "Sid": sid_number,
          "Response": text,
          "Technique_id": technique_id.findall(text),
          "True_labels": true_labels[counter],
      })


Write with techniques guide

In [None]:
import csv
import re

# Column order of the "with techniques guide" CSV file.
headersCSV_TG = [
    "Sid",
    "Response_11_Iteration",
    "Without_Prompt_Limit_Without_Competition_Without_Limit_Return",
    "Without_Prompt_Limit_Without_Competition_With_Limit_Return",
    "Response_Competition_1",
    "Response_Competition_2",
    "Response_Competition_3",
    "Without_Prompt_Limit_With_Competition_Without_Limit_Return",
    "Without_Prompt_Limit_With_Competition_With_Limit_Return",
    "True_labels",
]


def init_file(fileName):
  """Create (or truncate) ``fileName`` and write the ``headersCSV_TG`` header row."""
  with open(fileName, 'w', newline='') as target:
    csv.DictWriter(target, fieldnames=headersCSV_TG).writeheader()

def appendToCSV(rows_data, counter, fileName, technique_ids) -> None:
    '''
    Append one CSV row per entry of rows_data to fileName.

    rows_data     -> {sid: [response, response, ...]}, e.g.
                     {213: [<IPython.core.display.Markdown object>, ...]}.
                     Every response goes through clean_response and the cleaned
                     pieces are concatenated into "Response_11_Iteration".
    counter       -> index into the global true_labels; the same entry is used
                     for every row written by this call.
    fileName      -> CSV file, opened in append mode (header is not written here).
    technique_ids -> exactly 7 items, in this order:
                     all techniques, top-2 techniques, all competition
                     techniques, top-2 competition techniques, and the raw
                     responses of competition runs 1, 2 and 3.
    '''
    (all_technique,
     top_2_all_technique,
     all_competition,
     top_2_all_competition,
     tg_dict_Batch1,
     tg_dict_Batch2,
     tg_dict_Batch3) = technique_ids

    with open(fileName, 'a', newline='') as out_file:
        row_writer = csv.DictWriter(out_file, fieldnames=headersCSV_TG)

        for sid, responses in rows_data.items():
            # Glue the cleaned responses together in their original order.
            joined_responses = "".join(clean_response(item) for item in responses)

            row_writer.writerow({
                "Sid": sid,
                "Response_11_Iteration": joined_responses,
                "Without_Prompt_Limit_Without_Competition_Without_Limit_Return": all_technique,
                "Without_Prompt_Limit_Without_Competition_With_Limit_Return": top_2_all_technique,
                "Response_Competition_1": tg_dict_Batch1,
                "Response_Competition_2": tg_dict_Batch2,
                "Response_Competition_3": tg_dict_Batch3,
                "Without_Prompt_Limit_With_Competition_Without_Limit_Return": all_competition,
                "Without_Prompt_Limit_With_Competition_With_Limit_Return": top_2_all_competition,
                "True_labels": true_labels[counter],
            })


# WTG - Generic

In [None]:
def WTG(functionName, rules_list):
  """Query the model once per rule, retrying until a technique ID is returned.

  Args:
    functionName: callable invoked as functionName(rule); its result must
      expose a `.content` attribute, which is stringified.
    rules_list: rules to process. The rule at position i is stored under
      data['Sid'][i].

  Returns:
    dict mapping Sid -> to_markdown(response text) for every rule whose
    response contained at least one quoted technique ID ('T1234' or
    "T1234.001"). A rule that fails on all attempts is reported on stdout
    and is absent from the result.
  """
  max_retries = 3  # Maximum number of attempts per rule
  # Compiled once: a quote followed by T<digits>, optionally .<digits>
  technique_pattern = re.compile(r'[\'\"](T\d+(?:\.\d+)?)')
  rule_dict = {}

  for index, rule in enumerate(rules_list):
      print(f"------------------{index}-----------------------")
      print(rule)
      for attempt in range(1, max_retries + 1):
          try:
              res = functionName(rule)
              text = str(res.content)
              if technique_pattern.search(text):
                  rule_dict[data['Sid'][index]] = to_markdown(text)
                  break  # Success: move on to the next rule
              print("Desired pattern not found in the text.")
          except Exception as e:
              # Best effort: any failure of the call counts as one failed attempt
              print(f"An error occurred: {e}")

          # Reached only when this attempt failed, whichever way it failed.
          if attempt < max_retries:
              print(f"Retrying... ({attempt}/{max_retries})")
              time.sleep(1)  # Wait for a short duration before retrying
          else:
              print("Max retries reached. Unable to process this rule.")

  return rule_dict


# TG - Generic

In [None]:
def get_the_most_relevate_technique(tg_dict):
    """Collect every quoted technique ID found in the responses of tg_dict.

    Args:
        tg_dict: mapping whose values are iterables of responses. Each
            response is passed through clean_response before matching
            (assumes clean_response returns a str -- TODO confirm).

    Returns:
        List of technique ID strings such as "T1190" or "T1059.001", in
        order of appearance. Duplicates are kept, so callers can count them.
    """
    # A quote followed by T<digits>, optionally .<digits>; compiled once.
    technique_pattern = re.compile(r'[\'\"](T\d+(?:\.\d+)?)')

    technique_ids = []
    for responses in tg_dict.values():
        for response in responses:
            text = clean_response(response)
            try:
                technique_ids.extend(technique_pattern.findall(text))
            except TypeError as e:
                # findall rejects non-string input; skip that response
                print(f"Error extracting technique IDs: {e}")
    return technique_ids


def stratification():
  """Count how often each technique ID occurs in data['technique ids'].

  Every distinct cell of that column is parsed with ast.literal_eval into a
  list of technique IDs, and each ID is credited with the number of rows
  holding that cell.

  Returns:
    dict of technique ID -> total count, ordered from most to least frequent.
  """
  import ast

  occurrences_per_cell = data['technique ids'].value_counts().to_dict()

  totals = {}
  for cell_text, row_count in occurrences_per_cell.items():
      # The cell holds the textual form of a list, e.g. "['T1190', 'T1059']"
      for technique in ast.literal_eval(cell_text):
          totals[technique] = totals.get(technique, 0) + row_count

  ranked_pairs = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
  return dict(ranked_pairs)


def _select_competition_winners(vote_counts, technique_frequency):
  """Choose the technique IDs reported as the 'top 2' of the competition runs.

  Args:
    vote_counts: dict of technique ID -> number of times the competition runs
      returned it.
    technique_frequency: dict of technique ID -> count in the dataset (the
      result of stratification()).

  Returns:
    * [] when there are no votes;
    * one leader: the leader plus the runner-up. When several IDs tie for
      second place, the most frequent technique of technique_frequency is
      added instead;
    * two leaders tied: both of them;
    * more than two leaders tied: the two voted IDs with the highest dataset
      frequency.
  """
  if not vote_counts:
    return []

  top_votes = max(vote_counts.values())
  winners = [key for key, votes in vote_counts.items() if votes == top_votes]

  if len(winners) == 1:
    ranked_votes = sorted(vote_counts.values(), reverse=True)
    if len(ranked_votes) >= 2:
      runner_up_votes = ranked_votes[1]
      runners_up = [key for key, votes in vote_counts.items() if votes == runner_up_votes]
      if len(runners_up) >= 2:
        # NOTE(review): the tie-breaker is the most frequent technique of the
        # whole dataset, not of the tied runners-up -- confirm this is intended.
        most_frequent = max(technique_frequency, key=technique_frequency.get)
        # append, not extend: extend() on a string would add 'T', '1', '1', ...
        # as separate one-character entries.
        winners.append(most_frequent)
      else:
        winners.extend(runners_up)

  elif len(winners) > 2:
    # NOTE(review): every voted ID is considered here, not only the tied
    # leaders -- confirm this is intended.
    known = {key: value for key, value in technique_frequency.items() if key in vote_counts}
    winners = sorted(known, key=known.get, reverse=True)[:2]

  return winners


def tg_split_data(functionName, rules_list_index, index, fileName, limit):
  """Run the techniques-guide experiment per rule and append results to a CSV.

  For each rule:
    1. functionName(rule, batch, limit) is called once per batch of the global
       MITRE_Technique, and the responses are kept.
    2. Technique IDs found in those responses that are keys of
       All_MITRE_Technique_json[0] form a reduced guide (new_dict).
    3. The two IDs of new_dict with the highest dataset frequency
       (stratification()) are the "without competition" top 2.
    4. functionName(rule, new_dict, limit) is called 3 times ("competition").
       The returned IDs are tallied and the top 2 are chosen by
       _select_competition_winners.
    5. One row is appended to fileName through appendToCSV.

  Args:
    functionName: callable taking (rule, techniques, limit) and returning an
      object with a `.content` attribute.
    rules_list_index: iterable of rules to process.
    index: position in data['Sid'] (and in true_labels, via appendToCSV) of the
      first rule; it is incremented after every rule.
    fileName: CSV file that receives the rows (append mode).
    limit: passed through unchanged to functionName.
  """
  for rule in rules_list_index:
    sid = data['Sid'][index]
    print(f"index {index} \t Sid: {sid}")

    # ---- Step 1: one request per technique batch ----
    response_tg_dict_11_iteration = {}
    for count, batch in enumerate(MITRE_Technique): # 11 files
      res = functionName(rule, batch, limit)
      try:
        rendered = to_markdown(str(res.content))
      except Exception:
        # Keep the slot filled so the row still has one entry per batch
        rendered = to_markdown("{}")
      response_tg_dict_11_iteration.setdefault(sid, []).append(rendered)
      print(f"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~{count}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
      # Print what was stored, so a fallback does not fail a second time here
      print(rendered)
      #time.sleep(15)

    # ---- Step 2: reduced guide built from the IDs returned in step 1 ----
    known_techniques = All_MITRE_Technique_json[0]
    technique_ids_from_11_batchs = get_the_most_relevate_technique(response_tg_dict_11_iteration)
    new_dict = {
        technique_id: known_techniques[technique_id]
        for technique_id in technique_ids_from_11_batchs
        if technique_id in known_techniques
    }
    print(new_dict)

    # ---- Step 3: top 2 of the reduced guide by dataset frequency ----
    # Computed once per rule and reused by the competition tie-breaks below.
    res_stratification = stratification()
    filtered_keys = [key for key in new_dict if key in res_stratification]
    top_2_new_batch = sorted(filtered_keys, key=res_stratification.get, reverse=True)[:2]

    # ---- Step 4: competition, the reduced guide is submitted 3 times ----
    dictionary_of_rules = {}  # technique ID -> number of times it was returned
    tg_dict_Batch1, tg_dict_Batch2, tg_dict_Batch3 = {}, {}, {}
    competition_responses = (tg_dict_Batch1, tg_dict_Batch2, tg_dict_Batch3)

    for epoch in range(3):
      #time.sleep(1)
      res = functionName(rule, new_dict, limit)
      try:
        rendered = to_markdown(str(res.content))
      except Exception:
        rendered = to_markdown("{}")

      for technique in get_the_most_relevate_technique({sid: [rendered]}):
        dictionary_of_rules[technique] = dictionary_of_rules.get(technique, 0) + 1
      print(dictionary_of_rules)
      print(f"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~BATCH {epoch + 1}~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

      competition_responses[epoch][sid] = str(res.content)

    max_techniuqeId_keys = _select_competition_winners(dictionary_of_rules, res_stratification)

    # ---- Step 5: write to CSV ----
    # response_tg_dict_11_iteration => all content from the step-1 requests
    #
    #   Without competition
    #     new_dict.keys()            => all techniques returned in step 1
    #     top_2_new_batch            => the 2 most frequent of those
    #
    #   With competition
    #     dictionary_of_rules.keys() => all techniques returned by the 3 runs
    #     max_techniuqeId_keys       => the top 2 of those
    technique_ids = (
        list(new_dict.keys()),
        top_2_new_batch,
        list(dictionary_of_rules.keys()),
        max_techniuqeId_keys,
        tg_dict_Batch1,
        tg_dict_Batch2,
        tg_dict_Batch3,
    )
    appendToCSV(response_tg_dict_11_iteration, index, fileName, technique_ids)
    index += 1

# Run Experiments

without data

In [None]:
# Experiments without the techniques guide: each variant runs WTG over
# rules_list and writes the result to its own CSV. Only the two-shot variant
# is active below; the others are commented out.

# #  Without Example Without Techniques Guide
# rule_dict_ZS = WTG(ZS, rules_list)
# write_csv_ZS("zero_shot.csv", rule_dict_ZS)


#  # Without Example Without Techniques Guide
# rule_dict_WTG = WTG(WTGWE, rules_list)
# write_csv_WTG("prompting_without_techniques_guide_zero_shot_with_limit.csv", rule_dict_WTG)


#  # With 1 Example Without Techniques Guide
# rule_dict_WTG1E = WTG(WTG1E, rules_list)
# write_csv_WTG("prompting_without_techniques_guide_one_shot_with_limit.csv", rule_dict_WTG1E)


 # With 2 Examples Without Techniques Guide
rule_dict_WTG2E = WTG(WTG2E, rules_list)
write_csv_WTG("prompting_without_techniques_guide_two_shot_with_limit.csv", rule_dict_WTG2E)

with data

In [None]:
# Output CSV of the techniques-guide experiment; keep exactly one line active.
# fileName = 'prompting_with_techniques_guide_zero_shot_False.csv' #0
fileName = 'prompting_with_techniques_guide_one_shot_False.csv' #1
# fileName = 'prompting_with_techniques_guide_two_shot_False.csv' #2

In [None]:
# Run this only the first time, to create the file!
# init_file opens the file in 'w' mode, so running it again overwrites the
# file and discards rows that were already appended.
init_file(fileName)

In [None]:
# Experiments with the techniques guide. The start index passed to
# tg_split_data must equal the offset of the rules_list slice, because the
# function uses it to look up data['Sid'] for each rule.

#  # Without Example With Techniques Guide
# rule_dict_TGWE_01 = rules_list[185:] # index 0-99
# tg_split_data(TGWE, rule_dict_TGWE_01, 185, fileName, False)

# rule_dict_TGWE_02 = rules_list[100:200] # index 100-199
# tg_split_data(TGWE, rule_dict_TGWE_02, 100, fileName)

# rule_dict_TGWE_03 = rules_list[200:] # index 200-299
# tg_split_data(TGWE, rule_dict_TGWE_03, 200, fileName)



# With 1 Example With Techniques Guide
rule_dict_TG1E_01 = rules_list[7:] # rules from index 7 to the end of the list
tg_split_data(TG1E, rule_dict_TG1E_01, 7, fileName, False)

# rule_dict_TG1E_02 = rules_list[100:200] # index 100-199
# tg_split_data(TG1E, rule_dict_TG1E_02, 100, fileName)

# rule_dict_TG1E_03 = rules_list[200:] # index 200-299
# tg_split_data(TG1E, rule_dict_TG1E_03, 200, fileName)



#  # With 2 Examples With Techniques Guide
# rule_dict_TG2E_01 = rules_list # index 0-99
# tg_split_data(TG2E, rule_dict_TG2E_01, 0, fileName, False)

# rule_dict_TG2E_02 = rules_list[100:200] # index 100-199
# tg_split_data(TG2E, rule_dict_TG2E_02, 100, fileName)

# rule_dict_TG2E_03 = rules_list[200:] # index 200-299
# tg_split_data(TG2E, rule_dict_TG2E_03, 200, fileName)

index 3 	 Sid: 1129
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~0~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<IPython.core.display.Markdown object>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~1~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<IPython.core.display.Markdown object>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~2~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<IPython.core.display.Markdown object>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~3~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<IPython.core.display.Markdown object>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~4~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<IPython.core.display.Markdown object>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~5~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<IPython.core.display.Markdown object>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~6~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<IPython.core.display.Markdown object>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~7~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<IPython.core.display.Markdown object>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~8~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<IPython.co

RateLimitError: Error code: 429 - {'type': 'error', 'error': {'type': 'rate_limit_error', 'message': 'Number of request tokens has exceeded your daily rate limit (https://docs.anthropic.com/en/api/rate-limits); see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase.'}}

# Evaluation


*   Precision
*   Recall
*   F-1



In [None]:
import ast
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import math

def evaluation(true_labels, predicted_labels, is_printable=True):
  """Compute sample-averaged precision and recall, and their harmonic mean.

  Args:
    true_labels: sequence of strings, each the literal form of a list of
      technique IDs, e.g. "['T1190', 'T1059']".
    predicted_labels: sequence of the same length and format. A sub-technique
      suffix is dropped before comparing ('T1059.001' counts as 'T1059').
    is_printable: when True (default) the scores are printed as a table and
      None is returned; when False nothing is printed and
      [(precision, recall, f1)] is returned.

  Returns:
    None, or a one-element list holding (average_precision, average_recall,
    average_f1). The F1 value is the harmonic mean of the two averages, not
    the mean of per-sample F1 scores.

  Raises:
    ValueError: if the two sequences differ in length.
  """
  if len(true_labels) != len(predicted_labels):
    raise ValueError("true_labels and predicted_labels must have the same length")

  recall = []
  precision = []

  for true_text, predicted_text in zip(true_labels, predicted_labels):
    true_set = set(ast.literal_eval(true_text))
    # Extract only the 'TXXXX' part from each predicted ID
    predicted_set = {item.split('.')[0] for item in ast.literal_eval(predicted_text)}
    hits = len(true_set & predicted_set)

    # An empty side scores 0 instead of dividing by zero
    recall.append(hits / len(true_set) if true_set else 0)
    precision.append(hits / len(predicted_set) if predicted_set else 0)

  # Avg. (0.0 when there are no samples)
  average_recall = sum(recall) / len(recall) if recall else 0.0
  average_precision = sum(precision) / len(precision) if precision else 0.0
  denominator = average_recall + average_precision
  # Harmonic mean is undefined when both averages are 0; report 0.0
  average_f1 = (2 * average_recall * average_precision) / denominator if denominator else 0.0

  if is_printable:
    print("Metric    |   Score")
    print("-------------------")
    print(f"Precision |   {average_precision:.2f}")
    print(f"Recall    |   {average_recall:.2f}")
    print(f"F1 Score  |   {average_f1:.2f}")
    return None

  return [(average_precision, average_recall, average_f1)]


#### ZS

In [None]:
# Zero-shot baseline: score the 'Technique_id' predictions stored in
# zero_shot.csv against the 'True_labels' column and print the metric table.
loadData = pd.read_csv("zero_shot.csv")
true_labels_ZS = loadData['True_labels']
predicted_labels = loadData['Technique_id']

evaluation(true_labels_ZS, predicted_labels, True)


Metric    |   Score
-------------------
Precision |   0.18
Recall    |   0.23
F1 Score  |   0.20


#### WTGWE

In [None]:
# Zero-shot prompt without the techniques guide: score the 'Technique_id'
# predictions against 'True_labels' and print the metric table.
loadData = pd.read_csv("prompting_without_techniques_guide_zero_shot.csv")
true_labels_WTGWE = loadData['True_labels']
predicted_labels = loadData['Technique_id']

evaluation(true_labels_WTGWE, predicted_labels, True)


Metric    |   Score
-------------------
Precision |   0.16
Recall    |   0.25
F1 Score  |   0.20


#### WTG1E

In [None]:
# One-shot prompt without the techniques guide: score the 'Technique_id'
# predictions against 'True_labels' and print the metric table.
loadData = pd.read_csv("prompting_without_techniques_guide_one_shot.csv")
true_labels_WTG1E = loadData['True_labels']
predicted_labels = loadData['Technique_id']

# is_printable is passed explicitly, like the other evaluation cells do
evaluation(true_labels_WTG1E, predicted_labels, True)


Metric    |   Score
-------------------
Precision |   0.05
Recall    |   0.10
F1 Score  |   0.07


#### WTG2E

In [None]:
# Two-shot prompt without the techniques guide: score the 'Technique_id'
# predictions against 'True_labels' and print the metric table.
loadData = pd.read_csv("prompting_without_techniques_guide_two_shot.csv")
true_labels_WTG2E = loadData['True_labels']
predicted_labels = loadData['Technique_id']

# is_printable is passed explicitly, like the other evaluation cells do
evaluation(true_labels_WTG2E, predicted_labels, True)


Metric    |   Score
-------------------
Precision |   0.10
Recall    |   0.05
F1 Score  |   0.07


#### TGWE

In [None]:
# Zero-shot run with the techniques guide: print one metric table per
# prediction column, separated by blank lines.
loadData = pd.read_csv('prompting_with_techniques_guide_zero_shot_False.csv')
true_labels_TGWE = loadData['True_labels']

prediction_columns = (
    'Without_Prompt_Limit_Without_Competition_Without_Limit_Return',
    'Without_Prompt_Limit_Without_Competition_With_Limit_Return',
    'Without_Prompt_Limit_With_Competition_Without_Limit_Return',
    'Without_Prompt_Limit_With_Competition_With_Limit_Return',
)

for position, column_name in enumerate(prediction_columns):
    if position > 0:
        print()  # blank line between consecutive tables, none after the last
    predicted_labels = loadData[column_name]
    evaluation(true_labels_TGWE, predicted_labels, True)

Metric    |   Score
-------------------
Precision |   0.07
Recall    |   0.57
F1 Score  |   0.13

Metric    |   Score
-------------------
Precision |   0.12
Recall    |   0.20
F1 Score  |   0.15

Metric    |   Score
-------------------
Precision |   0.21
Recall    |   0.39
F1 Score  |   0.28

Metric    |   Score
-------------------
Precision |   0.23
Recall    |   0.38
F1 Score  |   0.29


#### TG1E

In [None]:
# One-shot run with the techniques guide: print one metric table per
# prediction column, separated by blank lines.
loadData = pd.read_csv('prompting_with_techniques_guide_one_shot_False.csv')
true_labels_TG1E = loadData['True_labels']

prediction_columns = (
    'Without_Prompt_Limit_Without_Competition_Without_Limit_Return',
    'Without_Prompt_Limit_Without_Competition_With_Limit_Return',
    'Without_Prompt_Limit_With_Competition_Without_Limit_Return',
    'Without_Prompt_Limit_With_Competition_With_Limit_Return',
)

for position, column_name in enumerate(prediction_columns):
    if position > 0:
        print()  # blank line between consecutive tables, none after the last
    predicted_labels = loadData[column_name]
    evaluation(true_labels_TG1E, predicted_labels, True)

Metric    |   Score
-------------------
Precision |   0.07
Recall    |   0.57
F1 Score  |   0.12

Metric    |   Score
-------------------
Precision |   0.21
Recall    |   0.43
F1 Score  |   0.29

Metric    |   Score
-------------------
Precision |   0.14
Recall    |   0.29
F1 Score  |   0.19

Metric    |   Score
-------------------
Precision |   0.14
Recall    |   0.29
F1 Score  |   0.19


#### TG2E

In [None]:
# Two-shot run with the techniques guide: print one metric table per
# prediction column, separated by blank lines.
loadData = pd.read_csv('prompting_with_techniques_guide_two_shot_False.csv')
true_labels_TG2E = loadData['True_labels']

prediction_columns = (
    'Without_Prompt_Limit_Without_Competition_Without_Limit_Return',
    'Without_Prompt_Limit_Without_Competition_With_Limit_Return',
    'Without_Prompt_Limit_With_Competition_Without_Limit_Return',
    'Without_Prompt_Limit_With_Competition_With_Limit_Return',
)

for position, column_name in enumerate(prediction_columns):
    if position > 0:
        print()  # blank line between consecutive tables, none after the last
    predicted_labels = loadData[column_name]
    evaluation(true_labels_TG2E, predicted_labels, True)

Metric    |   Score
-------------------
Precision |   0.40
Recall    |   0.70
F1 Score  |   0.51


# Visualization Data

Graph plot (zero & all without [before and after limit 2 techniques])
