In [None]:
import json

# Load the pre-generated synthetic sentences from disk.
# NOTE(review): later cells call `artificial_data.get(sentence, [])` and read the
# 'augmented_sentence' / 'labels' keys of each returned item, so this is expected to be a dict
# mapping an original sentence to a list of such records - confirm against the file.
with open("datasets/syn_sentence_list.json", "r") as f:
    artificial_data = json.load(f)

# Number of top-level entries in the loaded JSON
len(artificial_data)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Hold-out fraction and seed for the train/validation split. `random_state` is also reused by
# later cells for shuffling and sampling.
test_size = 0.2
random_state = 0

# Load the TRAM training file and split it into train / validation partitions
tram_df = pd.read_json("datasets/tram_train.json")
df_train, df_val = train_test_split(tram_df, test_size=test_size, random_state=random_state)
df_train.shape, df_val.shape

In [None]:
import loader

# Sentence-embedding model; its `encode` and `similarity` methods are used further down to
# compare each original sentence with its synthetic variants.
model = loader.load_model_for_embedding("sentence-transformers/all-mpnet-base-v2")

In [None]:
from collections import Counter

# Tally how often every label occurs across all rows of tram_df
label_counts = Counter()
for row_labels in tram_df['labels']:
    label_counts.update(row_labels)

# Tabulate the tallies, most frequent label first
label_distribution = pd.DataFrame.from_dict(label_counts, orient='index', columns=['count'])
label_distribution = label_distribution.sort_values(by='count', ascending=False)
print(label_distribution)

## Synthetic Data (Rebalanced)

In [None]:
import numpy as np
from tqdm import tqdm

# Similarity window: a synthetic sentence is kept only when its similarity to the original
# sentence lies in the closed interval [alpha, beta].
alpha = 0.3
beta = 0.9

# Seeded generator so the random pick made for label-less sentences is reproducible
rng = np.random.default_rng(random_state)

artificial_data_selected = []

for _, row in tqdm(df_train.iterrows(), total=df_train.shape[0]):
    sentence = row['sentence']
    labels = row["labels"]
    artificial_sent_w_labels = artificial_data.get(sentence, [])

    # No synthetic candidates for this sentence: skip before encoding an empty batch
    if not artificial_sent_w_labels:
        continue

    sent_encoding = model.encode(sentence)
    augmented_sentences = [item['augmented_sentence'] for item in artificial_sent_w_labels]
    artificial_sent_encoding = model.encode(augmented_sentences)
    similarity = model.similarity(sent_encoding, artificial_sent_encoding)

    # Column positions of the candidates whose similarity lies between alpha and beta.
    # NOTE(review): assumes `similarity` is a 2-D (1, n_candidates) score matrix - confirm
    # against the model returned by `loader`.
    indices = np.where((similarity >= alpha) & (similarity <= beta))[1]

    # If no candidate matches the criteria, skip the sentence
    if len(indices) < 1:
        continue

    if len(labels) == 0:
        # Unlabelled sentence: keep a single randomly chosen candidate
        rand_idx = rng.choice(indices)
        artificial_data_selected.append(artificial_sent_w_labels[rand_idx])
    else:
        # Labelled sentence: keep every candidate inside the window
        artificial_data_selected.extend(artificial_sent_w_labels[i] for i in indices)

In [None]:
from collections import Counter

# Gather every label attached to the selected synthetic sentences
selected_labels = []
for item in artificial_data_selected:
    selected_labels.extend(item['labels'])

# Tally the labels and tabulate them, most frequent first
selected_label_counts = Counter(selected_labels)
selected_label_distribution = pd.DataFrame.from_dict(selected_label_counts, orient='index', columns=['count'])
selected_label_distribution = selected_label_distribution.sort_values(by='count', ascending=False)
print(selected_label_distribution)

In [None]:
import random

# Seeded generator so the per-label draw below is reproducible across runs
sampler = random.Random(random_state)

artificial = {
    "sentence": [],
    "labels": []
}

# For each label, draw up to as many synthetic sentences as the label has occurrences in
# `label_distribution` (which is computed over all of tram_df).
for label in label_distribution.index:
    count = int(label_distribution.loc[label, 'count'])
    # Selected synthetic items that carry this label
    selected_sentences = [item for item in artificial_data_selected if label in item['labels']]
    # Cap the sample size: asking for more items than are available raises ValueError
    n_draw = min(count, len(selected_sentences))
    random_sentences = sampler.sample(selected_sentences, n_draw)
    # An item with several labels can be drawn once per label; duplicates are removed later
    # when the DataFrame is built.
    artificial['sentence'].extend([data['augmented_sentence'] for data in random_sentences])
    artificial['labels'].extend([data['labels'] for data in random_sentences])

# Label-less synthetic sentences are not covered by the per-label loop, so add all of them
for data in artificial_data_selected:
    if len(data['labels']) == 0:
        artificial['sentence'].append(data['augmented_sentence'])
        artificial['labels'].append(data['labels'])


In [None]:
# Tabulate the sampled synthetic sentences, keeping only the first row for each sentence
artificial_df = pd.DataFrame(artificial).drop_duplicates(subset=['sentence'])
artificial_df

In [None]:
# Stack the training split and the synthetic rows; ignore_index already yields a fresh 0..n-1 index
augmented_df = pd.concat([df_train, artificial_df], ignore_index=True)
augmented_df

In [None]:
# The synthetic rows have no 'doc_title' value after the concat, so tag them with a placeholder.
# Assign the result back instead of calling fillna(inplace=True) on the selected column: the
# chained in-place form emits a FutureWarning in pandas 2.x and does not update the frame under
# Copy-on-Write.
augmented_df['doc_title'] = augmented_df['doc_title'].fillna('artificial_data')

In [None]:
# Shuffle all rows with the shared seed and renumber the index from 0
augmented_df_shuffled = augmented_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Write the shuffled, augmented training set to JSON
augmented_df_shuffled.to_json("datasets/tram_train_augmented_artificial.json")

## TRAM OOD (Out-of-Distribution) Data

In [None]:
from mitreattack.stix20 import MitreAttackData
from loader import load_model_for_embedding
from const import MODEL_SENTENCE_SIM
from tqdm import tqdm
import pickle

In [None]:
# Distinct technique IDs that occur as labels anywhere in tram_df. Rows with an empty label
# list explode to NaN, so drop those before collecting the unique values.
available_ttps = tram_df.explode('labels').labels.dropna().unique().tolist()
available_ttps

In [None]:
import markdown
from bs4 import BeautifulSoup

def clean_md(md_content):
    """Render Markdown to HTML and return only the text content of the result.

    Args:
        md_content: Markdown source text.

    Returns:
        The text extracted from the rendered HTML, with the markup removed.
    """
    rendered = markdown.markdown(md_content)
    return BeautifulSoup(rendered, 'html.parser').get_text()


def _procedure_sentences(entries):
    """Return the cleaned description text of every "uses" relationship in ``entries``.

    ``entries`` is the list returned by ``get_groups_using_technique`` or
    ``get_software_using_technique``; each item exposes its relationship objects under the
    "relationships" key. Each description is passed through ``clean_md`` and cut at the first
    "(Citation:" marker. Relationships without a description are skipped because they cannot
    yield a sentence.
    """
    sentences = []
    for entry in entries:
        for rel in entry["relationships"]:
            if rel.relationship_type != "uses":
                continue
            description = getattr(rel, "description", None)
            if not description:
                continue
            sentences.append(clean_md(description).split("(Citation:")[0])
    return sentences


mitre_attack_data = MitreAttackData("datasets/enterprise-attack.json")
techniques = mitre_attack_data.get_techniques(remove_revoked_deprecated=True)
print(f"Retrieved {len(techniques)} ATT&CK techniques ...")
# Not used in this cell; kept in case other cells rely on these names
ttps = []
models = {}

# Parallel lists: ood_data['labels'][i] is the one-element label list for ood_data['sentence'][i]
ood_data = {
    "sentence": [],
    "labels": []
}

# Set for constant-time membership checks inside the loop
known_ttps = set(available_ttps)

for t in techniques:
    ttp_id = mitre_attack_data.get_attack_id(t["id"])
    # Only collect sentences for techniques that appear as labels in the TRAM data
    if ttp_id not in known_ttps:
        continue

    groups = mitre_attack_data.get_groups_using_technique(t["id"])
    software = mitre_attack_data.get_software_using_technique(t["id"])

    # Group sentences first, then software sentences, each labelled with this technique
    for procedures in (_procedure_sentences(groups), _procedure_sentences(software)):
        ood_data['sentence'].extend(procedures)
        ood_data['labels'].extend([ttp_id] for _ in procedures)

    # Progress line: technique ID, number of groups, number of software entries using it
    print(ttp_id, len(groups), len(software))
        

In [None]:
# Compare the size of the training split with the number of collected OOD sentences
df_train.shape, pd.DataFrame(ood_data).shape

In [None]:
# Tabulate the collected OOD sentences (one row per sentence/label pair)
ood_data_df = pd.DataFrame(ood_data) 
# Duplicated sentences are deliberately not dropped here: a later cell merges them into a single
# row per sentence that carries the union of their labels.
# Show the rows whose sentence already appeared earlier in the frame
ood_data_df[ood_data_df.duplicated(subset=['sentence'])]

In [None]:
import random

selected_ood_data = {"sentence": [], "labels": []}

for label, count in label_counts.items():
    # Rows of ood_data_df whose label list includes the current label
    has_label = ood_data_df['labels'].apply(lambda row_labels: label in row_labels)
    candidates = ood_data_df[has_label]

    # Draw no more than `count` rows, or every candidate when fewer exist
    n_pick = min(count, len(candidates))
    picked = candidates.sample(n_pick, random_state=random_state)

    # Record the drawn sentences together with their label lists
    for column in ("sentence", "labels"):
        selected_ood_data[column].extend(picked[column].tolist())

selected_ood_data_df = pd.DataFrame(selected_ood_data)
selected_ood_data_df

In [None]:
# Collapse duplicated sentences into one row whose labels are the union of all labels seen for
# that sentence. The union is sorted so the label order is identical on every run; the
# iteration order of a set of strings can differ between interpreter sessions.
merged_ood_data_df = (
    ood_data_df.groupby('sentence')['labels']
    .apply(lambda label_lists: sorted({label for sublist in label_lists for label in sublist}))
    .reset_index()
)

# Make the column names explicit so they match the original DataFrame
merged_ood_data_df.columns = ['sentence', 'labels']

merged_ood_data_df

In [None]:
# Persist the merged OOD sentences (one row per sentence) as CSV, without the index column
merged_ood_data_df.to_csv("datasets/ood_data.csv", index=False)

In [None]:
# Append the merged OOD sentences to the training split; ignore_index already renumbers the rows
augmented_ood_df = pd.concat([df_train, merged_ood_data_df], ignore_index=True)

# Shuffle with the shared seed.
# NOTE(review): this rebinds `augmented_df_shuffled`, which an earlier cell used for the
# synthetic-data variant.
augmented_df_shuffled = augmented_ood_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# The OOD rows have no 'doc_title' value after the concat, so tag them with a placeholder.
# Assign the result back instead of calling fillna(inplace=True) on the selected column: the
# chained in-place form emits a FutureWarning in pandas 2.x and does not update the frame under
# Copy-on-Write.
augmented_df_shuffled['doc_title'] = augmented_df_shuffled['doc_title'].fillna('ood_data')
augmented_df_shuffled

In [None]:
# Write the shuffled training set augmented with OOD sentences to JSON
augmented_df_shuffled.to_json("datasets/tram_train_augmented_ood.json")