# Imports

In [17]:
import pandas as pd
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
import os
import re
import time

# Read in data

In [18]:
# Load the raw CVE dump into a two-column frame.
# header=None tells pandas the first line is data, so the column names
# are supplied explicitly through `names`.
cve = pd.read_csv(
    "raw/cve.csv",
    header=None,
    names=["cve_id", "description"],
    sep=",",
    quotechar='"',
    skipinitialspace=True,
    encoding="utf-8",
)
# Last expression of the cell: summary statistics of both columns.
cve.describe()

Unnamed: 0,cve_id,description
count,238918,238918
unique,238918,217014
top,CVE-1999-0001,Rejected reason: DO NOT USE THIS CANDIDATE NUM...
freq,1,1309


In [19]:
# Marker text that identifies descriptions of rejected CVE candidates.
invalid_string = "Rejected reason:"

# Filter out rows containing the invalid string.
# regex=False: the marker is a literal substring, not a regular expression.
# na=False: a missing description yields False instead of NaN, so the `~`
# negation below cannot fail on a non-boolean mask.
is_rejected = cve['description'].str.contains(invalid_string, regex=False, na=False)
cve = cve[~is_rejected]
cve.describe()

Unnamed: 0,cve_id,description
count,225017,225017
unique,225017,215611
top,CVE-1999-0001,Adobe Experience Manager versions 6.5.18 and e...
freq,1,98


In [22]:
# Display the first rows of `cve` (the cell's last expression is rendered as its output).
cve.head()

Unnamed: 0,cve_id,description
0,CVE-1999-0001,ip_input.c in BSD-derived TCP/IP implementatio...
1,CVE-1999-0002,Buffer overflow in NFS mountd gives root acces...
2,CVE-1999-0003,Execute commands as root via buffer overflow i...
3,CVE-1999-0004,MIME buffer overflow in email clients e.g. So...
4,CVE-1999-0005,Arbitrary command execution via IMAP buffer ov...


In [29]:
# Convert the frame to a list with one dict per row (orient='records');
# the regex filter cells below iterate over this list.
cve_list = cve.to_dict(orient='records')
# Spot-check the second record.
print(cve_list[1])

{'cve_id': 'CVE-1999-0002', 'description': 'Buffer overflow in NFS mountd gives root access to remote attackers  mostly in Linux systems.'}


In [49]:
# Regex fragments for AI-related terminology. They are OR-ed together into a
# single alternation string (`temp`), which a later cell wraps in a
# word-bounded pattern.
AI_word = [
    '(A|a)rtificial (I|i)ntelligence',
    'LLMs?',
    '(L|l)arge (L|l)anguage (M|m)odels?',
    '(c|C)hat(gpt|GPT)',
    '(gpt|GPT)4',
    'NLP',
    '(n|N)atural (L|l)anguage',
    '(d|D)eep (L|l)earning',
    '(m|M)achine (L|l)earning',
]
temp = '|'.join(AI_word)

# Regex fragments for attack / threat vocabulary, combined the same way into
# `tech_str`.
important_word = [
    'poison|poisoning|poisoned',
    'evade|evasion',
    'prompt injection',
    'backdoor|backdoored',
    'jailbreak',
    'hijacking',
    'extraction attacks?',
    'privacy leakage',
    'data leak|data leaking|data leakage',
    'denial of service|DDOS',
    'deepfake',
    'adversarial examples?|adversarial attacks?|adversarial samples?',
    'ransomwares?',
]
tech_str = '|'.join(important_word)

In [50]:
# Keep only the CVE records whose description mentions an AI-related term.
# `temp` is the alternation of AI term fragments; \b...\b restricts matches
# to whole words.
pattern_whole = r'\b(?:{})\b'.format(temp)
# NOTE(review): re.IGNORECASE also applies to the acronym fragments
# ('LLMs?', 'NLP'), so lowercase text such as ".llms." inside a file name
# matches as well - confirm this is intended.
ai_regex = re.compile(pattern_whole, flags=re.IGNORECASE)

AI_filter = []
skipped_records = 0

# The loop variable must not be named `cve`: that would rebind the `cve`
# DataFrame to a plain dict and break the later cells that call
# cve["description"].apply(...) and cve.to_csv(...).
for record in cve_list:
    description = record.get('description')
    if not isinstance(description, str):
        # Missing / non-text description: nothing to search in.
        skipped_records += 1
        continue
    # Only the presence of a match matters, so search() is enough.
    if ai_regex.search(description):
        AI_filter.append(record)

if skipped_records:
    print(f"skipped {skipped_records} records without a text description")

print('number of papers filtered: ', len(AI_filter))

number of papers filtered:  437


In [51]:
# From the AI-related records, keep those whose description also mentions an
# attack / threat term. `tech_str` is the alternation of those fragments;
# \b...\b restricts matches to whole words.
pattern_ai = r'\b(?:{})\b'.format(tech_str)
attack_regex = re.compile(pattern_ai, flags=re.IGNORECASE)

result = []
skipped_records = 0

# The loop variable must not be named `cve`: that would rebind the `cve`
# DataFrame to a plain dict and break the later cells that call
# cve["description"].apply(...) and cve.to_csv(...).
for record in AI_filter:
    description = record.get('description')
    if not isinstance(description, str):
        # Missing / non-text description: nothing to search in.
        skipped_records += 1
        continue
    # Only the presence of a match matters, so search() is enough.
    if attack_regex.search(description):
        result.append(record)

if skipped_records:
    print(f"skipped {skipped_records} records without a text description")

print('number of papers filtered: ', len(result))

number of papers filtered:  122


In [52]:
# Preview the first ten records that matched both the AI and the attack-term patterns.
print(result[:10])

[{'cve_id': 'CVE-2021-29519', 'description': 'TensorFlow is an end-to-end open source platform for machine learning. The API of `tf.raw_ops.SparseCross` allows combinations which would result in a `CHECK`-failure and denial of service. This is because the implementation(https://github.com/tensorflow/tensorflow/blob/3d782b7d47b1bf2ed32bd4a246d6d6cadc4c903d/tensorflow/core/kernels/sparse_cross_op.cc#L114-L116) is tricked to consider a tensor of type `tstring` which in fact contains integral elements. Fixing the type confusion by preventing mixing `DT_STRING` and `DT_INT64` types solves this issue. The fix will be included in TensorFlow 2.5.0. We will also cherrypick this commit on TensorFlow 2.4.2  TensorFlow 2.3.3  TensorFlow 2.2.3 and TensorFlow 2.1.4  as these are also affected and still in supported range.'}, {'cve_id': 'CVE-2021-29522', 'description': 'TensorFlow is an end-to-end open source platform for machine learning. The `tf.raw_ops.Conv3DBackprop*` operations fail to validate 

In [48]:
# Preview the first ten records that matched the AI-term pattern.
print(AI_filter[:10])

[{'cve_id': 'CVE-2015-9501', 'description': 'The Artificial Intelligence theme before 1.2.4 for WordPress has XSS because Genericons HTML files are unnecessarily placed under the web root.'}, {'cve_id': 'CVE-2019-15896', 'description': 'An issue was discovered in the LifterLMS plugin through 3.34.5 for WordPress. The upload_import function in the class.llms.admin.import.php script is prone to an unauthenticated options import vulnerability that could lead to privilege escalation (administrator account creation)  website redirection  and stored XSS.'}, {'cve_id': 'CVE-2021-30348', 'description': 'Improper validation of LLM utility timers availability can lead to denial of service in Snapdragon Auto  Snapdragon Compute  Snapdragon Connectivity  Snapdragon Consumer Electronics Connectivity  Snapdragon Consumer IOT  Snapdragon Industrial IOT  Snapdragon Mobile  Snapdragon Voice & Music'}, {'cve_id': 'CVE-2021-39311', 'description': 'The link-list-manager WordPress plugin is vulnerable to R

# Create Model

In [23]:
# create a sentence transformer model
# "all-MiniLM-L6-v2" is the model name handed to SentenceTransformer; the
# resulting model is passed to KeyBERT, whose extract_keywords() is used in
# the keyword-extraction cell.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model=sentence_model)

# Extract top phrases

In [24]:
def extract_top_keyphrases(text):
    """Return KeyBERT's top 3 key phrases for one description.

    Phrases are 1-3 words long, English stop words are excluded, and MMR
    with diversity=0.9 is used to pick the phrases. The result is whatever
    kw_model.extract_keywords returns for a single document; later cells
    read element [0] of each entry as the phrase text.
    """
    return kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),
        stop_words="english",
        use_mmr=True,
        diversity=0.9,
        top_n=3,
    )


# perf_counter is the monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
# assign() builds a new frame with the extra column instead of writing into
# the boolean-filtered `cve` frame, which avoids SettingWithCopyWarning.
cve = cve.assign(keywords=cve["description"].apply(extract_top_keyphrases))
end = time.perf_counter()
print(f'time spent: {end - start:.1f} seconds')

KeyboardInterrupt: 

In [None]:
# write out cve_kw.csv
# Make sure the output directory exists; to_csv does not create it.
os.makedirs("parsed", exist_ok=True)
cve.to_csv("parsed/cve_kw.csv", index=False)

In [None]:
# store keywords in a list
# Iterate over the column's values instead of cve["keywords"][i] with
# i in range(len(cve)): that form is a label lookup, and the index labels
# have gaps once the "Rejected reason:" rows are dropped, so it raises
# KeyError on the first missing label.
cve_kw = [
    entry[0]  # element 0 of each extracted entry is the phrase text
    for keywords in cve["keywords"]
    for entry in keywords
]

# remove duplicates (dict.fromkeys keeps first-seen order)
cve_kw = list(dict.fromkeys(cve_kw))

In [None]:
# Write one keyword per line.
# "\n" is used rather than os.linesep: text mode already translates "\n" to
# the platform line ending, so os.linesep would produce "\r\r\n" on Windows.
# The encoding is explicit so non-ASCII keywords do not depend on the locale.
with open("parsed/cve_kw.txt", "w", encoding="utf-8") as fd:
    fd.writelines(f"{kw}\n" for kw in cve_kw)

In [None]:
# nltk is imported here because the notebook's import cell does not include it.
import nltk

# remove non nouns
# Each key phrase is reduced to the words whose POS tag starts with "NN";
# phrases with no such word are dropped.
cve_nouns = []
# word -> True/False noun decision. A word can recur across many phrases, and
# each word is tagged on its own (no sentence context), so the decision for a
# given word never changes and can be reused instead of re-running the tagger.
is_noun_cache = {}

for kp in cve_kw:
    noun_words = []
    for kw in kp.split():
        if kw not in is_noun_cache:
            is_noun_cache[kw] = nltk.pos_tag([kw])[0][1].startswith("NN")
        if is_noun_cache[kw]:
            noun_words.append(kw)
    if noun_words:
        cve_nouns.append(" ".join(noun_words))

# remove duplicates (dict.fromkeys keeps first-seen order)
cve_nouns = list(dict.fromkeys(cve_nouns))

In [None]:
# extract single nouns
# Phrases that consist of exactly one word after the noun filter.
cve_single_nouns = [kp for kp in cve_nouns if len(kp.split()) == 1]

# write to file
# "\n" is used rather than os.linesep: text mode already translates "\n" to
# the platform line ending, so os.linesep would produce "\r\r\n" on Windows.
# The encoding is explicit so non-ASCII words do not depend on the locale.
with open("parsed/cve_single_words_removed.txt", "w", encoding="utf-8") as fd:
    fd.writelines(f"{kw}\n" for kw in cve_single_nouns)

# remove single words
cve_nouns = [x for x in cve_nouns if len(x.split()) > 1]


In [None]:
# write to file
# One noun phrase per line. "\n" is used rather than os.linesep: text mode
# already translates "\n" to the platform line ending, so os.linesep would
# produce "\r\r\n" on Windows. The encoding is explicit so non-ASCII phrases
# do not depend on the locale.
with open("parsed/cve_kw_nouns_only.txt", "w", encoding="utf-8") as fd:
    fd.writelines(f"{kw}\n" for kw in cve_nouns)