In [1]:
import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
# Naive Bayes 모델
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Download the NLTK stop-word corpus and load the English stop words into a
# set; the key-token helpers in this notebook filter tokens against it.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Release cached GPU memory
def clear_memory():
    """Release PyTorch's cached CUDA memory; do nothing when CUDA is unavailable.

    The notebook falls back to the CPU when no GPU is present, so this helper
    must be safe to call there as well. ``torch.cuda.ipc_collect()`` needs an
    initialized CUDA runtime and may raise on a CPU-only installation, hence
    the availability guard.

    Returns:
        None
    """
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _read_csv_with_fallback(path, **kwargs):
    """Read a CSV as UTF-8, retrying as ISO-8859-1 only if decoding fails.

    The fallback is applied per file: a decoding problem in one file no longer
    forces the other (valid UTF-8) files to be re-read with the wrong codec.

    Args:
        path: CSV file path passed to ``pd.read_csv``.
        **kwargs: extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    try:
        return pd.read_csv(path, encoding='utf-8', **kwargs)
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='ISO-8859-1', **kwargs)


# Load the data
cve_data = _read_csv_with_fallback('allitems_cleaned.csv', low_memory=False)
attack_data = _read_csv_with_fallback('enterprise-attack-v15.1.csv')
d3fend_data = _read_csv_with_fallback('d3fend.csv')
    

In [48]:

# Per-text embedding function
def embed_text(texts, tokenizer, model, device, max_length=512, padding=True, truncation=True):
    """Embed texts one at a time using attention-masked mean pooling.

    Each text is tokenized and run through ``model`` on its own. The token
    vectors of ``last_hidden_state`` are averaged over the sequence axis,
    counting only positions whose attention mask is 1, so padding added by
    e.g. ``padding='max_length'`` does not dilute the embedding.

    Args:
        texts: iterable of strings; a single string is treated as one text
            rather than being iterated character by character.
        tokenizer: callable tokenizer returning an object with ``.to(device)``
            that can be unpacked into ``model(**inputs)``.
        model: encoder whose output exposes ``last_hidden_state``.
        device: device the inputs are moved to.
        max_length, padding, truncation: forwarded to the tokenizer.

    Returns:
        list of NumPy arrays, one per text. The batch axis is kept, so for a
        single input string each array is expected to have shape
        ``(1, hidden_size)``.
    """
    if isinstance(texts, str):
        texts = [texts]

    all_embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors='pt', truncation=truncation, padding=padding, max_length=max_length).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Fall back to an all-ones mask if the tokenizer supplies none.
        if 'attention_mask' in inputs:
            mask = inputs['attention_mask']
        else:
            mask = torch.ones(hidden.shape[:2], device=hidden.device)
        mask = mask.unsqueeze(-1).to(hidden.dtype)
        # clamp avoids a division by zero for a fully masked row.
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        all_embeddings.append(pooled.cpu().numpy())
    return all_embeddings

# Batched embedding function
def embed_batch(texts, tokenizer, model, device, batch_size=16):
    """Embed texts in batches using attention-masked mean pooling.

    Texts in a batch are padded to a common length. The mean over the
    sequence axis therefore has to ignore padded positions; otherwise a short
    text's embedding would change depending on which longer texts share its
    batch. Only positions with attention mask 1 contribute to the average.

    Args:
        texts: sequence of strings (any iterable is materialized to a list).
        tokenizer: callable tokenizer returning an object with ``.to(device)``
            that can be unpacked into ``model(**inputs)``.
        model: encoder whose output exposes ``last_hidden_state``.
        device: device the inputs are moved to.
        batch_size: number of texts per forward pass.

    Returns:
        list with one 1-D NumPy vector per input text, in input order.
    """
    texts = list(texts)
    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Fall back to an all-ones mask if the tokenizer supplies none.
        if 'attention_mask' in inputs:
            mask = inputs['attention_mask']
        else:
            mask = torch.ones(hidden.shape[:2], device=hidden.device)
        mask = mask.unsqueeze(-1).to(hidden.dtype)
        # clamp avoids a division by zero for a fully masked row.
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        # extend() appends one row (one text) at a time.
        all_embeddings.extend(pooled.cpu().numpy())
    return all_embeddings


In [4]:
# Release cached GPU memory, then display the last rows of the CVE table.
clear_memory()
cve_data.tail()


Unnamed: 0,Name,Description
248181,CVE-2024-4309,SQL injection vulnerability in HubBank affecti...
248182,CVE-2024-4310,Cross-site Scripting (XSS) vulnerability in Hu...
248183,CVE-2024-4327,A vulnerability was found in Apryse WebViewer ...
248184,CVE-2024-4336,"Adive Framework 2.0.8, does not sufficiently e..."
248185,CVE-2024-4337,"Adive Framework 2.0.8, does not sufficiently e..."


아래 셀들은 현재 사용하지 않음. 실제로 사용하는 셀부터 다시 확인할 것.

In [26]:
# Load the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Fill missing descriptions BEFORE casting to str: astype(str) first would
# turn NaN into the literal text 'nan', leaving fillna('') nothing to fill.
cve_data['Description'] = cve_data['Description'].fillna('').astype(str)

# Embed the CVE descriptions
cve_data['embedding'] = embed_batch(cve_data['Description'].tolist(), tokenizer, model, device)


In [27]:
# Embed the ATT&CK technique descriptions.
# fillna('') must run before astype(str); the reverse order converts NaN to
# the literal text 'nan' and the fill becomes a no-op.
attack_data['description'] = attack_data['description'].fillna('').astype(str)
attack_data['bert_embedding'] = embed_batch(attack_data['description'].tolist(), tokenizer, model, device)

# Embed the D3FEND technique definitions (same missing-value handling)
d3fend_data['Definition'] = d3fend_data['Definition'].fillna('').astype(str)
d3fend_data['bert_embedding'] = embed_batch(d3fend_data['Definition'].tolist(), tokenizer, model, device)

In [28]:
# Similarity computation and matching function
def find_best_match(cve_emb, tech_data, embedding_column):
    """Return the row of ``tech_data`` most cosine-similar to ``cve_emb``.

    Args:
        cve_emb: embedding vector of one CVE description.
        tech_data: DataFrame of candidate techniques.
        embedding_column: name of the column in ``tech_data`` holding one
            embedding vector per row.

    Returns:
        The ``tech_data`` row (selected with ``iloc``) whose embedding has the
        highest cosine similarity to ``cve_emb``.
    """
    candidate_embeddings = tech_data[embedding_column].tolist()
    scores = cosine_similarity([cve_emb], candidate_embeddings)
    return tech_data.iloc[scores.argmax()]

# Match each CVE embedding to its closest ATT&CK and D3FEND technique
cve_data['bert_attack_match'] = [
    find_best_match(cve_emb, attack_data, 'bert_embedding')['ID']
    for cve_emb in cve_data['embedding']
]
cve_data['bert_d3fend_match'] = [
    find_best_match(cve_emb, d3fend_data, 'bert_embedding')['ID']
    for cve_emb in cve_data['embedding']
]

# End of the BERT section
clear_memory()


In [None]:


# Naive Bayes model

# A single TF-IDF vectorizer is fitted on the descriptions of all three
# datasets so that every transform below shares one vocabulary.
combined_descriptions = pd.concat([attack_data['description'], d3fend_data['Definition'], cve_data['Description']])
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(combined_descriptions)

# Vectorize the ATT&CK descriptions and train a classifier labelled by technique ID
attack_tfidf = tfidf.transform(attack_data['description'])
nb_attack_model = MultinomialNB()
nb_attack_model.fit(attack_tfidf, attack_data['ID'])

# Vectorize the D3FEND definitions and train a classifier labelled by technique ID
d3fend_tfidf = tfidf.transform(d3fend_data['Definition'])
nb_d3fend_model = MultinomialNB()
nb_d3fend_model.fit(d3fend_tfidf, d3fend_data['ID'])

# Vectorize the CVE descriptions
cve_tfidf = tfidf.transform(cve_data['Description'])

# Predict an ATT&CK and a D3FEND technique ID for every CVE
cve_data['nb_attack_match'] = nb_attack_model.predict(cve_tfidf)
cve_data['nb_d3fend_match'] = nb_d3fend_model.predict(cve_tfidf)

# End of the Naive Bayes section
clear_memory()


In [30]:

# Load the SecBERT tokenizer and model
secbert_tokenizer = BertTokenizer.from_pretrained('SecBERT-main')
secbert_model = BertModel.from_pretrained('SecBERT-main').to(device)

# Embed the CVE descriptions
cve_data['secbert_embedding'] = embed_batch(
    cve_data['Description'].tolist(), secbert_tokenizer, secbert_model, device
)

# Embed the ATT&CK technique descriptions
attack_data['secbert_embedding'] = embed_batch(
    attack_data['description'].tolist(), secbert_tokenizer, secbert_model, device
)

# Embed the D3FEND technique definitions
d3fend_data['secbert_embedding'] = embed_batch(
    d3fend_data['Definition'].tolist(), secbert_tokenizer, secbert_model, device
)

# Match each CVE embedding to its closest ATT&CK and D3FEND technique
cve_data['secbert_attack_match'] = [
    find_best_match(cve_emb, attack_data, 'secbert_embedding')['ID']
    for cve_emb in cve_data['secbert_embedding']
]
cve_data['secbert_d3fend_match'] = [
    find_best_match(cve_emb, d3fend_data, 'secbert_embedding')['ID']
    for cve_emb in cve_data['secbert_embedding']
]

# End of the SecBERT section
clear_memory()

In [None]:
# Display the last rows of the CVE table to check the newly added match columns.
cve_data.tail()

In [13]:
# Drop the bulky embedding columns before exporting the matches to CSV.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
cve_data.drop(columns=['embedding', 'secbert_embedding'], inplace=True, errors='ignore')
cve_data.to_csv('cve_with_matches__.csv', index=False)


In [None]:
# Display the last rows of the CVE table after the embedding columns were dropped.
cve_data.tail()

In [31]:
# Inspect the evidence behind a match
def find_key_tokens(cve_text, tech_text, tokenizer, model, device):
    """Pair every CVE token with its most similar technique token.

    NOTE(review): this definition is shadowed by later definitions of
    ``find_key_tokens`` in this notebook. It also unpacks ``embed_text(...)``
    into ``(embeddings, inputs)`` and calls ``.squeeze``/``.cpu`` on the
    embeddings, whereas the ``embed_text`` defined in this notebook returns a
    plain list of NumPy arrays. Confirm which ``embed_text`` this was written
    for before reusing it.

    Args:
        cve_text: CVE description text.
        tech_text: description text of the matched technique.
        tokenizer: tokenizer providing ``convert_ids_to_tokens``.
        model: encoder model forwarded to ``embed_text``.
        device: device forwarded to ``embed_text``.

    Returns:
        ``(cve_key_tokens, tech_key_tokens)``: two lists of
        ``(token, similarity)`` tuples with one entry per row of the
        similarity matrix.
    """
    cve_embeds, cve_inputs = embed_text(cve_text, tokenizer, model, device)
    tech_embeds, tech_inputs = embed_text(tech_text, tokenizer, model, device)

    # Recover the token strings of the first (only) sequence in each input.
    cve_tokens = tokenizer.convert_ids_to_tokens(cve_inputs['input_ids'][0])
    tech_tokens = tokenizer.convert_ids_to_tokens(tech_inputs['input_ids'][0])

    cve_embeds = cve_embeds.squeeze(0)  # Remove batch dimension
    tech_embeds = tech_embeds.squeeze(0)  # Remove batch dimension

    # Rows index CVE-side vectors, columns index technique-side vectors.
    similarities = cosine_similarity(cve_embeds.cpu().numpy(), tech_embeds.cpu().numpy())

    cve_key_tokens = []
    tech_key_tokens = []

    for i in range(len(similarities)):
        cve_token = cve_tokens[i]
        # Technique token with the highest similarity to CVE token i.
        tech_token = tech_tokens[similarities[i].argmax()]
        cve_key_tokens.append((cve_token, similarities[i].max()))
        tech_key_tokens.append((tech_token, similarities[i].max()))

    return cve_key_tokens, tech_key_tokens

In [72]:
#clear_memory()
## 0524: not used.
# Key-token finder (with stop-word removal)
def find_key_tokens(cve_text, tech_text, tokenizer, model, device, max_length=512, padding=True, truncation=True):
    """Pair CVE tokens with their most similar technique token, skipping stop words.

    NOTE(review): the embeddings come from ``embed_text``, which mean-pools
    over the sequence axis, so each text appears to contribute a single
    vector and the similarity matrix has one row. Only the first CVE token is
    then examined. This looks unintended; a later definition of
    ``find_key_tokens`` in this notebook works on per-token hidden states.

    Args:
        cve_text: CVE description text.
        tech_text: description text of the matched technique.
        tokenizer: tokenizer used for both embedding and token recovery.
        model: encoder model forwarded to ``embed_text``.
        device: device forwarded to ``embed_text``.
        max_length, padding, truncation: tokenizer options.

    Returns:
        ``(cve_key_tokens, tech_key_tokens)``: parallel lists of
        ``(token, similarity)`` tuples.
    """
    tokenizer_options = dict(return_tensors='pt', truncation=truncation, padding=padding, max_length=max_length)

    cve_embeds = embed_text([cve_text], tokenizer, model, device, max_length, padding, truncation)[0]
    tech_embeds = embed_text([tech_text], tokenizer, model, device, max_length, padding, truncation)[0]

    cve_inputs = tokenizer(cve_text, **tokenizer_options)
    tech_inputs = tokenizer(tech_text, **tokenizer_options)
    cve_tokens = tokenizer.convert_ids_to_tokens(cve_inputs['input_ids'][0])
    tech_tokens = tokenizer.convert_ids_to_tokens(tech_inputs['input_ids'][0])

    similarities = cosine_similarity(cve_embeds, tech_embeds)

    special_tokens = ('[CLS]', '[SEP]')
    cve_key_tokens, tech_key_tokens = [], []

    # Never index past the rows actually present in the similarity matrix.
    row_count = min(len(cve_tokens), similarities.shape[0])
    for i in range(row_count):
        cve_token = cve_tokens[i]
        tech_token_idx = similarities[i].argmax()
        tech_token = tech_tokens[tech_token_idx]
        if cve_token in stop_words or tech_token in stop_words:
            continue
        if cve_token in special_tokens or tech_token in special_tokens:
            continue
        cve_key_tokens.append((cve_token, similarities[i].max()))
        tech_key_tokens.append((tech_token, similarities[i, tech_token_idx]))

    return cve_key_tokens, tech_key_tokens

여기서부터 다시 시작 (아래 셀들이 실제로 사용하는 파이프라인)

In [5]:
# Preprocessing: replace NaN with an empty string, then cast every description to str.
# The order matters: astype(str) first would turn NaN into the literal text
# 'nan', and the following fillna('') would then have nothing to fill.
cve_data['Description'] = cve_data['Description'].fillna('').astype(str)
attack_data['description'] = attack_data['description'].fillna('').astype(str)
d3fend_data['Definition'] = d3fend_data['Definition'].fillna('').astype(str)

# Load the BERT model and tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Load the SecBERT model and tokenizer
secbert_tokenizer = BertTokenizer.from_pretrained('SecBERT-main')
secbert_model = BertModel.from_pretrained('SecBERT-main').to(device)



In [7]:
# Naive Bayes (TF-IDF) key-token finder
def find_key_tokens_nb(cve_text, tech_text, tfidf):
    """List the CVE's TF-IDF terms alongside the technique's top TF-IDF term.

    For every vocabulary term with a positive TF-IDF weight in ``cve_text``
    (and not in ``stop_words``), one ``(term, weight)`` tuple is emitted on
    the CVE side and, in parallel, the single highest-weighted term of
    ``tech_text`` is emitted on the technique side. If that top technique
    term is a stop word, both lists are empty.

    The technique's top term does not depend on the CVE term, so it is
    computed once, and only the non-zero CVE entries are visited instead of
    looping over the whole vocabulary in Python.

    Args:
        cve_text: CVE description text.
        tech_text: description text of the matched technique.
        tfidf: fitted vectorizer providing ``transform`` and
            ``get_feature_names_out``.

    Returns:
        ``(cve_key_tokens, tech_key_tokens)``: parallel lists of
        ``(token, tfidf_weight)`` tuples, ordered by vocabulary index.
    """
    feature_names = tfidf.get_feature_names_out()
    cve_array = tfidf.transform([cve_text]).toarray().flatten()
    tech_array = tfidf.transform([tech_text]).toarray().flatten()

    # Loop-invariant: the technique side always reports its top-weighted term.
    top_tech_idx = tech_array.argmax()
    top_tech_token = feature_names[top_tech_idx]
    top_tech_score = tech_array[top_tech_idx]
    if top_tech_token in stop_words:
        return [], []

    cve_key_tokens = []
    tech_key_tokens = []
    # nonzero() yields indices in ascending order, matching a full scan.
    for i in (cve_array > 0).nonzero()[0]:
        cve_token = feature_names[i]
        if cve_token in stop_words:
            continue
        cve_key_tokens.append((cve_token, cve_array[i]))
        tech_key_tokens.append((top_tech_token, top_tech_score))

    return cve_key_tokens, tech_key_tokens


In [8]:
# Naive Bayes model: fit one TF-IDF vectorizer on the combined text of all
# three datasets so they share a single vocabulary.
all_descriptions = pd.concat([cve_data['Description'], attack_data['description'], d3fend_data['Definition']])
tfidf = TfidfVectorizer(stop_words='english')
tfidf.fit(all_descriptions)

# Vectorize the ATT&CK and D3FEND technique descriptions
attack_tfidf = tfidf.transform(attack_data['description'])
d3fend_tfidf = tfidf.transform(d3fend_data['Definition'])

# Train one classifier per framework, labelled by technique ID
nb_attack_model = MultinomialNB()
nb_attack_model.fit(attack_tfidf, attack_data['ID'])
nb_d3fend_model = MultinomialNB()
nb_d3fend_model.fit(d3fend_tfidf, d3fend_data['ID'])

# Vectorize the CVE descriptions
cve_tfidf = tfidf.transform(cve_data['Description'])

# Run the Naive Bayes matching
cve_data['nb_attack_match'] = nb_attack_model.predict(cve_tfidf)
cve_data['nb_d3fend_match'] = nb_d3fend_model.predict(cve_tfidf)

In [9]:
# Display the last rows of the CVE table with the Naive Bayes match columns.
cve_data.tail()

Unnamed: 0,Name,Description,nb_attack_match,nb_d3fend_match
248181,CVE-2024-4309,SQL injection vulnerability in HubBank affecti...,T1505.001,D3-DQSA
248182,CVE-2024-4310,Cross-site Scripting (XSS) vulnerability in Hu...,T1137.003,D3-WSAA
248183,CVE-2024-4327,A vulnerability was found in Apryse WebViewer ...,T1615,D3-EHPV
248184,CVE-2024-4336,"Adive Framework 2.0.8, does not sufficiently e...",T1563.002,D3-WSAA
248185,CVE-2024-4337,"Adive Framework 2.0.8, does not sufficiently e...",T1563.002,D3-WSAA


In [10]:
# Similarity computation and matching function (returns the similarity score too)
def find_best_match_with_score(cve_emb, tech_data, embedding_column):
    """Return the most cosine-similar row of ``tech_data`` and its score.

    Embeddings are accepted either as 1-D vectors or with a leading batch
    axis of size one (the form ``embed_text`` produces). Both are flattened
    to a single row before comparison; wrapping a ``(1, hidden)`` array in a
    list would otherwise hand ``cosine_similarity`` a 3-D input.

    Args:
        cve_emb: embedding of one CVE description.
        tech_data: DataFrame of candidate techniques.
        embedding_column: name of the column in ``tech_data`` holding one
            embedding per row.

    Returns:
        ``(row, score)``: the best-matching ``tech_data`` row (selected with
        ``iloc``) and its cosine similarity to ``cve_emb``.

    Raises:
        ValueError: if ``tech_data`` has no rows to match against.
    """
    if len(tech_data) == 0:
        raise ValueError("tech_data is empty; nothing to match against")

    query = np.asarray(cve_emb).reshape(1, -1)
    candidates = np.vstack([np.asarray(emb).reshape(1, -1) for emb in tech_data[embedding_column]])

    similarities = cosine_similarity(query, candidates)
    best_match_idx = int(similarities.argmax())
    return tech_data.iloc[best_match_idx], similarities[0, best_match_idx]

# Embed every CVE description (full dataset) with both encoders
cve_data['bert_embedding'] = embed_text(
    cve_data['Description'].tolist(), bert_tokenizer, bert_model, device
)
cve_data['secbert_embedding'] = embed_text(
    cve_data['Description'].tolist(), secbert_tokenizer, secbert_model, device
)

In [11]:

# Embed the ATT&CK and D3FEND technique descriptions with both encoders
attack_data['bert_embedding'] = embed_text(
    attack_data['description'].tolist(), bert_tokenizer, bert_model, device
)
d3fend_data['bert_embedding'] = embed_text(
    d3fend_data['Definition'].tolist(), bert_tokenizer, bert_model, device
)
attack_data['secbert_embedding'] = embed_text(
    attack_data['description'].tolist(), secbert_tokenizer, secbert_model, device
)
d3fend_data['secbert_embedding'] = embed_text(
    d3fend_data['Definition'].tolist(), secbert_tokenizer, secbert_model, device
)


In [87]:
print(len(cve_data))
print(type(cve_data))
# Results are written straight to a file so that partial runs can be used as
# interim experiment results. The file itself is opened by the cell that
# writes to it, so no handle is opened (and left dangling) here.
# One seeded shuffle replaces the former double shuffle: shuffling twice adds
# no randomness, and the fixed seed makes the row order reproducible.
suff = cve_data.sample(frac=1, random_state=42).reset_index(drop=True)
print(suff.head())

248186
<class 'pandas.core.frame.DataFrame'>
             Name                                        Description  \
0   CVE-2014-5787  The Ninja Chicken (aka mominis.Generic_Android...   
1   CVE-2024-0502  A vulnerability was found in SourceCodester Ho...   
2  CVE-2018-13182  The mintToken function of a smart contract imp...   
3  CVE-2021-22393  There is a denial of service vulnerability in ...   
4  CVE-2024-29033  OAuthenticator provides plugins for JupyterHub...   

  nb_attack_match nb_d3fend_match  \
0       T1608.003           D3-CP   
1       T1587.004          D3-LFP   
2       T1134.003          D3-CRO   
3       T1102.002          D3-EHB   
4       T1136.002          D3-DAM   

                                      bert_embedding  \
0  [-0.06675164, 0.15789232, 0.1672521, -0.050908...   
1  [-0.3285495, -0.022576556, 0.08307483, 0.02890...   
2  [-0.30898392, -0.2672998, 0.12467569, 0.008109...   
3  [-0.36092278, -0.038602345, 0.248543, 0.005265...   
4  [-0.12961851, 0.

In [89]:
def find_key_tokens(cve_text, tech_text, tokenizer, model, device, max_length=512, padding=True, truncation=True):
    """Pair each CVE token with the most similar token of a technique text.

    Both texts are tokenized and encoded separately. The per-token vectors of
    ``last_hidden_state`` are compared with cosine similarity, and for every
    CVE token the technique token with the highest similarity is selected.
    Pairs are dropped when either token is a stop word or a special token
    (``[CLS]``, ``[SEP]``, ``[PAD]``).

    Args:
        cve_text: CVE description text.
        tech_text: description text of the matched technique.
        tokenizer: tokenizer; must also provide ``convert_ids_to_tokens``.
        model: encoder whose output exposes ``last_hidden_state``.
        device: device the inputs are moved to.
        max_length, padding, truncation: tokenizer options.

    Returns:
        ``(cve_key_tokens, tech_key_tokens)``: parallel lists of
        ``(token, similarity)`` tuples; entry ``k`` of both lists describes
        the same token pair and carries the same similarity value.
    """
    # Tokenize both texts and move the inputs to the target device.
    cve_inputs = tokenizer(cve_text, return_tensors='pt', truncation=truncation, padding=padding, max_length=max_length).to(device)
    tech_inputs = tokenizer(tech_text, return_tensors='pt', truncation=truncation, padding=padding, max_length=max_length).to(device)

    with torch.no_grad():
        cve_outputs = model(**cve_inputs)
        tech_outputs = model(**tech_inputs)

    # Select the single batch element by index instead of squeeze():
    # squeeze() would also drop the sequence axis of a one-token input and
    # hand cosine_similarity a 1-D array.
    cve_embeds = cve_outputs.last_hidden_state[0].cpu().numpy()
    tech_embeds = tech_outputs.last_hidden_state[0].cpu().numpy()

    cve_tokens = tokenizer.convert_ids_to_tokens(cve_inputs['input_ids'][0])
    tech_tokens = tokenizer.convert_ids_to_tokens(tech_inputs['input_ids'][0])

    # Rows index CVE tokens, columns index technique tokens.
    similarities = cosine_similarity(cve_embeds, tech_embeds)

    # '[PAD]' only appears with paddings such as 'max_length'; it carries no
    # meaning and is filtered like the other special tokens.
    special_tokens = {'[CLS]', '[SEP]', '[PAD]'}

    cve_key_tokens = []
    tech_key_tokens = []

    # Slicing keeps the token list and the similarity rows in step.
    for i, cve_token in enumerate(cve_tokens[:similarities.shape[0]]):
        tech_token_idx = int(similarities[i].argmax())
        tech_token = tech_tokens[tech_token_idx]
        if cve_token in stop_words or tech_token in stop_words:
            continue
        if cve_token in special_tokens or tech_token in special_tokens:
            continue
        score = similarities[i, tech_token_idx]
        cve_key_tokens.append((cve_token, score))
        tech_key_tokens.append((tech_token, score))

    return cve_key_tokens, tech_key_tokens


In [96]:
def convert_to_float(obj):
    """Recursively replace NumPy values inside ``obj`` with built-in Python ones.

    ``json.dump`` cannot serialize NumPy scalars such as ``np.float32`` or
    ``np.int64``, nor NumPy arrays. Any NumPy scalar is converted with
    ``.item()`` and any array with ``.tolist()``; lists, tuples and dict
    values are walked recursively. Everything else is returned unchanged.

    Args:
        obj: arbitrary nesting of dicts, lists, tuples and scalar values.

    Returns:
        A structure of the same shape containing only built-in Python types
        where NumPy scalars or arrays were found. Dict keys are left as they
        are.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, list):
        return [convert_to_float(i) for i in obj]
    if isinstance(obj, tuple):
        return tuple(convert_to_float(i) for i in obj)
    if isinstance(obj, dict):
        return {k: convert_to_float(v) for k, v in obj.items()}
    return obj

In [98]:
import time
import json
import numpy as np
# Build one record per CVE containing the matches of all three methods and
# the token-level evidence behind each match.
RESULTS_PATH = "results.json"


def _match_with_evidence(cve_text, cve_emb, embedding_column, tokenizer, model):
    """Match one CVE embedding to ATT&CK and D3FEND and collect key tokens.

    Returns:
        ``(attack_row, d3fend_row, cve_tokens, attack_tokens, d3fend_tokens)``
        where the token lists come from ``find_key_tokens``.
    """
    attack_row, _ = find_best_match_with_score(cve_emb, attack_data, embedding_column)
    d3fend_row, _ = find_best_match_with_score(cve_emb, d3fend_data, embedding_column)

    # The matched row already carries its text; no second lookup by ID needed.
    cve_tokens, attack_tokens = find_key_tokens(cve_text, attack_row['description'], tokenizer, model, device)
    # find_key_tokens returns (cve_side, technique_side): keep the technique side.
    _, d3fend_tokens = find_key_tokens(cve_text, d3fend_row['Definition'], tokenizer, model, device)
    return attack_row, d3fend_row, cve_tokens, attack_tokens, d3fend_tokens


# ID -> text lookups built once instead of scanning the tables for every CVE.
# drop_duplicates keeps the first row per ID, like the former `.iloc[0]`.
attack_text_by_id = attack_data.drop_duplicates('ID').set_index('ID')['description']
d3fend_text_by_id = d3fend_data.drop_duplicates('ID').set_index('ID')['Definition']

matching_data = []
try:
    for idx, row in suff.iterrows():
        start = time.time()
        cve_text = row['Description']
        print(f"{idx}/{len(suff)} 시작.")

        # BERT matching
        (bert_attack_match, bert_d3fend_match,
         cve_key_tokens_bert, attack_key_tokens_bert, d3fend_key_tokens_bert) = _match_with_evidence(
            cve_text, row['bert_embedding'], 'bert_embedding', bert_tokenizer, bert_model)

        # SecBERT matching
        (secbert_attack_match, secbert_d3fend_match,
         cve_key_tokens_secbert, attack_key_tokens_secbert, d3fend_key_tokens_secbert) = _match_with_evidence(
            cve_text, row['secbert_embedding'], 'secbert_embedding', secbert_tokenizer, secbert_model)

        # Naive Bayes matching
        attack_match_nb = attack_text_by_id[row['nb_attack_match']]
        d3fend_match_nb = d3fend_text_by_id[row['nb_d3fend_match']]

        cve_key_tokens_nb, attack_key_tokens_nb = find_key_tokens_nb(cve_text, attack_match_nb, tfidf)
        # Keep the technique side of the tuple, as for the embedding models.
        _, d3fend_key_tokens_nb = find_key_tokens_nb(cve_text, d3fend_match_nb, tfidf)

        data_json = {
            'cve_id': row['Name'],
            'cve_description': cve_text,
            'bert_attack_match': bert_attack_match['ID'],
            'bert_d3fend_match': bert_d3fend_match['ID'],
            'secbert_attack_match': secbert_attack_match['ID'],
            'secbert_d3fend_match': secbert_d3fend_match['ID'],
            'nb_attack_match': row['nb_attack_match'],
            'nb_d3fend_match': row['nb_d3fend_match'],
            'cve_key_tokens_bert': cve_key_tokens_bert,
            'attack_key_tokens_bert': attack_key_tokens_bert,
            'd3fend_key_tokens_bert': d3fend_key_tokens_bert,
            'cve_key_tokens_secbert': cve_key_tokens_secbert,
            'attack_key_tokens_secbert': attack_key_tokens_secbert,
            'd3fend_key_tokens_secbert': d3fend_key_tokens_secbert,
            'cve_key_tokens_nb': cve_key_tokens_nb,
            'attack_key_tokens_nb': attack_key_tokens_nb,
            'd3fend_key_tokens_nb': d3fend_key_tokens_nb
        }
        # Convert NumPy values so the record is JSON-serializable.
        matching_data.append(convert_to_float(data_json))

        end = time.time()
        print(f"{idx}/{len(suff)} 종료. {end-start:.2f} 초 수행")
finally:
    # Write all collected records as ONE JSON array. Dumping each record
    # separately produced back-to-back objects, which is not valid JSON.
    # The finally block also saves partial results if the run is interrupted,
    # and the with-statement guarantees the file is closed.
    with open(RESULTS_PATH, "w") as f:
        json.dump(matching_data, f, indent=4)

0/248186 시작.
0/248186 종료. 0.60 초 수행
1/248186 시작.
1/248186 종료. 0.44 초 수행
2/248186 시작.
2/248186 종료. 0.42 초 수행
3/248186 시작.
3/248186 종료. 0.46 초 수행
4/248186 시작.
4/248186 종료. 0.44 초 수행
5/248186 시작.
5/248186 종료. 0.42 초 수행
6/248186 시작.
6/248186 종료. 0.47 초 수행
7/248186 시작.
7/248186 종료. 0.51 초 수행
8/248186 시작.
8/248186 종료. 0.50 초 수행
9/248186 시작.
9/248186 종료. 0.47 초 수행
10/248186 시작.
10/248186 종료. 0.50 초 수행
11/248186 시작.
11/248186 종료. 0.53 초 수행
12/248186 시작.
12/248186 종료. 0.51 초 수행
13/248186 시작.
13/248186 종료. 0.50 초 수행
14/248186 시작.
14/248186 종료. 0.50 초 수행
15/248186 시작.
15/248186 종료. 0.55 초 수행
16/248186 시작.
16/248186 종료. 0.51 초 수행
17/248186 시작.
17/248186 종료. 0.50 초 수행
18/248186 시작.
18/248186 종료. 0.51 초 수행
19/248186 시작.
19/248186 종료. 0.55 초 수행
20/248186 시작.
20/248186 종료. 0.51 초 수행
21/248186 시작.
21/248186 종료. 0.55 초 수행
22/248186 시작.
22/248186 종료. 0.46 초 수행
23/248186 시작.
23/248186 종료. 0.48 초 수행
24/248186 시작.
24/248186 종료. 0.50 초 수행
25/248186 시작.
25/248186 종료. 0.44 초 수행
26/248186 시작.
26/248186 종료. 0.51

KeyboardInterrupt: 

In [99]:
# Display the collected match records.
# NOTE(review): this prints the entire list; consider slicing (e.g. the first
# few records) to keep the notebook output small.
matching_data

[{'cve_id': 'CVE-2014-5787',
  'cve_description': 'The Ninja Chicken (aka mominis.Generic_Android.Ninja_Chicken) application 1.7.6 for Android does not verify X.509 certificates from SSL servers, which allows man-in-the-middle attackers to spoof servers and obtain sensitive information via a crafted certificate.',
  'bert_attack_match': 'T1218.015',
  'bert_d3fend_match': 'D3-CP',
  'secbert_attack_match': 'T1553.004',
  'secbert_d3fend_match': 'D3-CP',
  'nb_attack_match': 'T1608.003',
  'nb_d3fend_match': 'D3-CP',
  'cve_key_tokens_bert': [('ninja', 0.3976874351501465),
   ('chicken', 0.4085695743560791),
   ('(', 0.8097253441810608),
   ('aka', 0.4418894648551941),
   ('mom', 0.4180621802806854),
   ('##ini', 0.48612552881240845),
   ('##s', 0.6071467399597168),
   ('.', 0.7746217846870422),
   ('generic', 0.5340967178344727),
   ('_', 0.6168649196624756),
   ('android', 0.4804013967514038),
   ('.', 0.779060423374176),
   ('ninja', 0.4459727108478546),
   ('_', 0.6460260152816772),

In [100]:
import xml.etree.ElementTree as ET
# XML export function
def create_xml(data, output_path="output_after.xml"):
    """Write the match records to an XML file.

    The document has a ``Root`` element with one ``CVE_Entry`` per record.
    Every key of a record becomes a child element. A value that is a sequence
    of ``(token, score)`` pairs is expanded into ``Token`` children carrying
    a ``score`` attribute; any other value is stored as the element text.
    Pairs may be tuples or two-element lists, so records reloaded from JSON
    (where tuples become lists) are exported the same way.

    Args:
        data: iterable of dict records; keys must be valid XML element names.
        output_path: destination file; defaults to the original
            ``output_after.xml``.

    Returns:
        None. The XML is written to ``output_path`` as UTF-8 with an XML
        declaration.
    """
    def _is_token_list(value):
        # An empty list counts as an (empty) token list.
        return isinstance(value, (list, tuple)) and all(
            isinstance(pair, (list, tuple)) and len(pair) == 2 for pair in value
        )

    root = ET.Element("Root")

    for item in data:
        cve_entry = ET.SubElement(root, "CVE_Entry")

        for key, value in item.items():
            element = ET.SubElement(cve_entry, key)
            if _is_token_list(value):
                for token, score in value:
                    token_element = ET.SubElement(element, "Token")
                    token_element.set("score", str(score))
                    # ElementTree can only serialize string text.
                    token_element.text = str(token)
            else:
                element.text = str(value)

    tree = ET.ElementTree(root)
    tree.write(output_path, encoding="utf-8", xml_declaration=True)

# Export the collected match records to XML.
create_xml(matching_data)

---------------------------
토큰화된 단어의 상관관계성 분석
단어 토큰화된것 출력해보기
---------------------------