In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import datasets
import re
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Source spreadsheets on attack.mitre.org; both URLs point at the
# enterprise-attack-v15.1 release so the two sheets stay on the same version.
tactics_url = "https://attack.mitre.org/docs/enterprise-attack-v15.1/enterprise-attack-v15.1-tactics.xlsx"
techniques_url = "https://attack.mitre.org/docs/enterprise-attack-v15.1/enterprise-attack-v15.1-techniques.xlsx"

In [3]:
# Empty accumulator that later cells fill with one row per generated
# question/answer pair.
df = pd.DataFrame(
    columns=["Question", "Answer"],
)

In [4]:
# Read the tactics spreadsheet straight from its URL into a DataFrame.
tactics_df = pd.read_excel(io=tactics_url)

# Tactics

In [5]:
# For every tactic: fetch its web page, pull the text of the
# "description-body" <div>, and append one question/answer row to `df`.
for index, row in tactics_df.iterrows():
    url = row.get("url", "")
    # A timeout keeps one stalled connection from hanging the whole cell, and
    # raise_for_status() stops us from parsing an HTTP error page as content.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    div = soup.find("div", class_="description-body")
    # find() returns None when the element is absent; fall back to no extra
    # text instead of raising AttributeError on div.get_text().
    page_text = div.get_text() if div is not None else ""

    # An empty spreadsheet cell is read as NaN (a float), which cannot be
    # concatenated with a string; treat any non-string value as empty text.
    sheet_description = row.get("description", "")
    if not isinstance(sheet_description, str):
        sheet_description = ""

    tactic_name = row.get("name")
    question = f"What is the description of tactic, {tactic_name} according to Mitre Attack framework?"
    df.loc[len(df)] = [question, sheet_description + page_text]

# Techniques

In [6]:
# Read the techniques spreadsheet straight from its URL into a DataFrame.
techniques_df = pd.read_excel(io=techniques_url)

In [7]:
def remove_digit_brackets(text):
    """Return ``text`` with every bracketed run of digits (e.g. ``[12]``) removed.

    Only markers made of an opening bracket, one or more digits and a closing
    bracket are deleted; surrounding whitespace and all other text are kept.
    """
    return re.compile(r'\[\d+\]').sub("", text)

In [8]:
def get_technique_df(row, description):
    """Build the five question/answer rows for one technique.

    Args:
        row: Mapping-like record (for example a ``pandas.Series``) that is read
            through ``.get`` for the keys ``name``, ``description``,
            ``platforms``, ``detection``, ``ID`` and ``tactics``.
        description: Description text supplied by the caller. When it is a
            non-blank string it is used (with surrounding whitespace stripped)
            as the answer to the description question; otherwise the row's own
            ``description`` value is used.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Question`` and ``Answer`` and
        five rows: description, platforms, detection, technique id and tactics.
    """
    technique_name = row.get("name")

    # This argument used to be accepted but never read, so the text the caller
    # prepared was silently dropped. Prefer it, and keep the row's description
    # as the fallback so the answer is unchanged when nothing usable is passed.
    if isinstance(description, str) and description.strip():
        description_answer = description.strip()
    else:
        description_answer = row.get('description')

    qna_dict = {}
    qna_dict["Question"] = [f"What is the description of technique, {technique_name} according to Mitre Attack framework?",
                            f"Which platforms does technique, {technique_name} primarily target according to Mitre Attack framework?",
                            f"How to detect or prevent technique, {technique_name} from being exploited?",
                            f"What is the technique id for technique, {technique_name} according to mitre attack framework?",
                            f"What are the tactics does this technique, {technique_name} belongs to?"]

    qna_dict["Answer"] = [description_answer,
                          f"The technique {technique_name} targets or exploits these platforms {row.get('platforms')}",
                          row.get('detection'),
                          f"The technique id of the technique {technique_name} is {row.get('ID')}",
                          f"The technique {technique_name} belongs to these tactics {row.get('tactics')}"]
    return pd.DataFrame(qna_dict)

In [9]:
# For every technique: fetch its web page, clean the text of the
# "description-body" <div>, and build that technique's question/answer rows.
# The per-technique frames are collected in a list and concatenated once after
# the loop; concatenating inside the loop re-copies the growing frame on every
# iteration.
technique_frames = [df]
for index, row in techniques_df.iterrows():
    url = row.get("url", "")
    # A timeout keeps one stalled connection from hanging the whole cell, and
    # raise_for_status() stops us from parsing an HTTP error page as content.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    div = soup.find("div", class_="description-body")
    # find() returns None when the element is absent; pass an empty
    # description instead of raising AttributeError on div.get_text().
    description = remove_digit_brackets(div.get_text()) if div is not None else ""
    technique_frames.append(get_technique_df(row, description))

df = pd.concat(technique_frames).reset_index(drop=True)
    

In [12]:
# Wrap the accumulated question/answer frame in a `datasets.Dataset`.
dataset = datasets.Dataset.from_pandas(df=df)

In [14]:
# Upload the dataset to the hub under the repository id below.
# NOTE(review): the commit URL in the recorded cell output names a different
# repository than this id — the saved output looks stale; re-run to refresh.
dataset.push_to_hub(
    repo_id="Tejeswara/cybersec_mitre_attack_tactics_techniques_instruction_data",
)

Uploading the dataset shards:   0%|                                                              | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|███████████████████████████████████████████████| 4/4 [00:00<00:00, 570.71ba/s][A
Uploading the dataset shards: 100%|██████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.26s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Tejeswara/mitre_attack_tactics_tachniques_instruction_cybersec/commit/7adf70b500cfaa444773b812e221ce5443745f09', commit_message='Upload dataset', commit_description='', oid='7adf70b500cfaa444773b812e221ce5443745f09', pr_url=None, pr_revision=None, pr_num=None)