# NER BERT Experiment 1
The first code cell installs all the packages required to run this notebook.  
It is recommended to run it inside a virtual environment (venv) so that system packages are not affected.  

This notebook extracts phrases that can be considered vulnerability phrases from the NVD dataset.
It then processes and prepares those phrases so they can be used for BERT fine-tuning.

In [None]:
# Packaging tools first, then pip itself.
!pip install wheel 
!pip install setuptools
!pip install --upgrade pip
# Libraries: wget is used below to fetch the NVD feeds and spacy to tag the
# corpus; torch/tensorflow are presumably needed by the fine-tuning script
# (not visible in this notebook - confirm).
!pip install torch torchvision
!pip install wget
!pip install tensorflow
!pip install spacy
# The large English spaCy model that is loaded later with spacy.load().
!python3 -m spacy download en_core_web_lg
# Editable install of transformers from source, then drop the local
# custom_run.sh next to the legacy token-classification example.
!git clone https://github.com/huggingface/transformers.git
!cd transformers; pip install -e .;
!cp custom_run.sh transformers/examples/legacy/token-classification/

In [None]:
import string

In [None]:
# Show which characters string.punctuation contains.
print(string.punctuation)
# s.translate(str.maketrans('', '', string.punctuation))

In [None]:
import gzip
import json
import os
import shutil
import ssl

import wget

# SECURITY NOTE(review): this switches off TLS certificate verification for
# every HTTPS request made through the default context of this process.  It is
# kept so the download keeps working where it worked before, but the feeds are
# then fetched without authenticating the server - prefer fixing the local CA
# store and removing this line.
ssl._create_default_https_context = ssl._create_unverified_context

NVD_DATA = ['https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2021.json.gz',
            'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-2020.json.gz']

GZ_SUFFIX = '.gz'


def _local_names(url):
    """Return ``(archive_name, json_name)`` for a feed URL.

    The archive name is the last path component of *url*; the JSON name is the
    same with a trailing ``.gz`` removed.  The suffix is cut off explicitly:
    ``str.strip('.gz')`` would instead remove any of the characters
    ``.``, ``g`` and ``z`` from *both* ends of the name.
    """
    archive_name = url.split('/')[-1]
    if archive_name.endswith(GZ_SUFFIX):
        return archive_name, archive_name[:-len(GZ_SUFFIX)]
    return archive_name, archive_name


# Download and decompress each feed (if we haven't already), then import the
# CVE descriptions from the resulting JSON file.
cve_dict_list = []
for url in NVD_DATA:
    f_name_comp, f_name_decomp = _local_names(url)

    if not os.path.exists(f_name_comp):
        wget.download(url, f_name_comp)

    if not os.path.exists(f_name_decomp):
        with gzip.open(f_name_comp, 'rb') as f_in, \
                open(f_name_decomp, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    # Explicit encoding so the result does not depend on the platform default.
    with open(f_name_decomp, encoding='utf-8') as f:
        cve_dict_list.append(json.load(f))

In [None]:
import spacy
# Load English tokenizer, tagger, parser, NER and word vectors
# (the en_core_web_lg model is downloaded by the install cell).
nlp = spacy.load("en_core_web_lg")

In [None]:
LIMIT = 5000                  # maximum number of distinct sentences to keep
SENT_FLAG = 'SENTSTARTFLAG'   # marker written in front of every sentence

# Deletes every ASCII punctuation character except '-' and '.'.
# Built once; the raw string keeps the backslash without an invalid escape.
_PUNCT_TABLE = str.maketrans('', '', r'''!"#$%&'()*+,/:;<=>?@[\]^_`{|}~''')


def _iter_sentences(cve_dicts):
    """Yield every lower-cased '. '-separated sentence of every description."""
    for cve_dict in cve_dicts:
        for cve in cve_dict['CVE_Items']:
            for desc in cve['cve']['description']['description_data']:
                for sentence in desc['value'].split('. '):
                    yield sentence.lower()


sentence_set = set()
corpus_lines = []

for raw_sentence in _iter_sentences(cve_dict_list):
    s = raw_sentence.translate(_PUNCT_TABLE)

    # Skip sentences that are empty once cleaned, and skip duplicates.  The
    # membership test uses the cleaned form because that is what the set
    # stores; testing the raw sentence would never match a stored entry that
    # had punctuation removed.
    if not s.strip() or s in sentence_set:
        continue

    sentence_set.add(s)
    corpus_lines.append(f'{SENT_FLAG} {s}\n')

    if len(sentence_set) >= LIMIT:
        break

counter = len(sentence_set)
# Joined once instead of growing a string inside the loop.
corpus = ''.join(corpus_lines)

print(len(sentence_set))

In [None]:
# Part-of-speech tag sequences (presumably taken from sample vulnerability
# phrases - confirm), one inner list per phrase.
a = [
    ['DET', 'ADJ', 'NOUN', 'NOUN', 'NOUN'],
    ['DET', 'NOUN', 'PROPN', 'PROPN', 'NOUN'],
    ['DET', 'ADJ', 'NOUN', 'NOUN', 'NOUN'],
    ['DET', 'PROPN', 'NOUN', 'NOUN'],
    ['DET', 'PROPN', 'NOUN', 'NOUN'],
    ['DET', 'NOUN', 'NOUN', 'NOUN'],
    ['DET', 'ADJ', 'NOUN', 'NOUN', 'NOUN'],
    ['DET', 'NOUN', 'NOUN', 'VERB', 'NOUN'],
    ['DET', 'NOUN', 'PUNCT', 'NOUN', 'NOUN', 'NOUN', 'PUNCT', 'NOUN',
     'PUNCT', 'NOUN'],
    ['DET', 'ADJ', 'NOUN', 'NOUN', 'NOUN'],
    ['DET', 'ADJ', 'ADJ', 'NOUN', 'NOUN'],
    ['DET', 'NOUN', 'PUNCT', 'VERB', 'NOUN', 'VERB', 'NOUN'],
    ['DET', 'VERB', 'ADJ', 'ADJ', 'ADJ', 'NOUN', 'NOUN'],
    ['DET', 'ADJ', 'PROPN', 'NOUN', 'NOUN'],
    ['DET', 'NOUN', 'PUNCT', 'ADP', 'PUNCT', 'ADJ', 'NOUN'],
]

# Distinct tags that occur anywhere in the sequences above.
s = set().union(*a)
print(s)

In [None]:
# nlp.max_length = 10000000
# Run the loaded spaCy pipeline over the whole corpus in a single call.
# NOTE(review): presumably the commented-out line above is what to enable when
# the corpus grows past spaCy's text-length limit (e.g. a larger LIMIT) -
# confirm before raising it.
doc = nlp(corpus)

In [None]:
def is_in_black_list(w_list):
    """Tell whether a phrase is one of the generic phrases to be discarded.

    Args:
        w_list: iterable of word strings in reading order (a list or a
            one-shot iterator such as ``reversed(...)``).

    Returns:
        True when the lower-cased words match a black-listed phrase exactly
        (same words, same order, same length), otherwise False.
    """
    unwanted_phrases = (
        ['exploitable', 'vulnerability'],
        ['exploit', 'vulnerability'],
        ['vulnerability'],
    )

    # Materialise once: the argument may be an iterator that can only be
    # consumed a single time.
    normalized = [word.lower() for word in w_list]
    return any(normalized == phrase for phrase in unwanted_phrases)


def get_vul_phrase(w_qu, p_qu):
    """Extract the phrase that ends at the last word of a token window.

    The window is walked backwards from its last entry.  Words are collected
    for as long as their POS tag is acceptable; collection stops at the first
    unacceptable tag, or at a determiner once more than two words have been
    collected.  One leading punctuation mark and then one leading determiner
    are dropped from the collected phrase, and the remainder is checked
    against the black list.

    Stripping is applied on every exit path and the black-list check runs on
    the stripped phrase, so e.g. "an exploitable vulnerability" is rejected
    just like a bare "exploitable vulnerability".

    Args:
        w_qu: sequence of word strings, oldest first; the last entry is the
            word the phrase must end with.
        p_qu: sequence of POS tags parallel to ``w_qu``.

    Returns:
        ``(words, tags)`` as two lists in reading order, or ``(False, False)``
        when the phrase is rejected.  Both lists are empty when nothing could
        be collected (e.g. empty input); callers should test truthiness.
    """
    acceptable_pos = {'ADP', 'DET', 'NOUN', 'PUNCT', 'ADJ', 'PROPN', 'VERB'}

    # Both lists are built in reverse reading order (last word first).
    w_result = []
    p_result = []

    for w, p in zip(reversed(w_qu), reversed(p_qu)):
        if p not in acceptable_pos:
            break

        w_result.append(w)
        p_result.append(p)

        # "<det> <noun>" and "<adp> <noun>" carry no information beyond the
        # trigger word itself.
        if p_result in (['NOUN', 'DET'], ['NOUN', 'ADP']):
            return False, False

        # A determiner marks the start of the noun phrase.
        if p == 'DET' and len(p_result) > 2:
            break

    # Drop one leading punctuation mark, then one leading determiner.  The
    # lists are reversed, so "leading" means the last element.  Guarded so an
    # empty result cannot raise IndexError.
    for edge_tag in ('PUNCT', 'DET'):
        if p_result and p_result[-1] == edge_tag:
            w_result.pop()
            p_result.pop()

    w_result.reverse()
    p_result.reverse()

    if is_in_black_list(w_result):
        return False, False

    return w_result, p_result

def update_ner_tag(w_list, v_list):
    """Tag the last ``len(w_list)`` entries of ``v_list`` as one VUL phrase.

    The first word of the phrase gets ``'B-VUL'`` and every following word
    ``'I-VUL'``.  All words are verified before any tag is written, so a
    failed call leaves ``v_list`` untouched.

    Args:
        w_list: list of the phrase's words in reading order.
        v_list: list of dicts with at least the keys ``'tok'`` and ``'tag'``;
            its tail must spell out ``w_list``.  Modified in place.

    Raises:
        ValueError: if ``w_list`` is longer than ``v_list`` or the tail of
            ``v_list`` does not match ``w_list`` word for word.
    """
    phrase_len = len(w_list)
    if phrase_len > len(v_list):
        # Without this guard the backwards index would go negative and wrap
        # round to the end of v_list, silently tagging the wrong entries.
        raise ValueError(
            f'phrase of {phrase_len} words does not fit into '
            f'{len(v_list)} tokens')

    # Slice from an explicit start so that an empty phrase yields an empty
    # tail (v_list[-0:] would be the whole list).
    tail = v_list[len(v_list) - phrase_len:]

    for expected, entry in zip(w_list, tail):
        if entry['tok'] != expected:
            raise ValueError(
                f"token mismatch: expected {expected!r}, "
                f"found {entry['tok']!r}")

    for offset, entry in enumerate(tail):
        entry['tag'] = 'B-VUL' if offset == 0 else 'I-VUL'

In [None]:
import queue
from collections import deque

# Sliding window over the most recent tokens of the current sentence and their
# POS tags.  A bounded deque discards the oldest entry automatically.
WINDOW_SIZE = 20
wrd_queue = deque(maxlen=WINDOW_SIZE)
pos_queue = deque(maxlen=WINDOW_SIZE)

# One dict per token of the corpus: text, POS tag and NER tag ('O' by default).
vocab_pos_ner_list = []

# Word that marks the end of a candidate phrase.
FLAG_WORD = 'vulnerability'

extracted_phrases = set()
sent_length = []
for sent in doc.sents:
    sent_length.append(len(sent))

    for tok in sent:
        tk = tok.text
        tp = tok.pos_

        vocab_pos_ner_list.append({'tok': tk, 'pos': tp, 'tag': 'O'})

        if tk == SENT_FLAG:
            # A new source sentence starts here.  Forget the history so that
            # a phrase can neither reach back into the previous sentence nor
            # contain the artificial marker (which would otherwise be tagged
            # as part of the phrase).  The marker itself stays in
            # vocab_pos_ner_list with tag 'O'.
            wrd_queue.clear()
            pos_queue.clear()
            continue

        wrd_queue.append(tk)
        pos_queue.append(tp)

        if tk.lower() == FLAG_WORD:
            w, p = get_vul_phrase(list(wrd_queue), list(pos_queue))
            if w:
                # The window is always a suffix of vocab_pos_ner_list, so the
                # phrase lines up with the tail of that list.
                update_ner_tag(w, vocab_pos_ner_list)
                # No marker removal needed: the window never contains it.
                # (str.strip(SENT_FLAG) would strip a set of characters, not
                # the marker as a whole.)
                extracted_phrases.add(' '.join(w).strip())

In [None]:
# Number of distinct phrases found, followed by the phrases themselves
# (the bare expression is what a notebook renders as the cell output).
print(len(extracted_phrases))
extracted_phrases

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Histogram of the number of tokens per sentence, plus the mean length.
sent_length.sort()
length_series = pd.Series(sent_length)
length_series.hist(bins=50, figsize=(8, 5))
plt.xticks(rotation=90)
print(length_series.mean())

In [None]:
# One row per token, with the columns 'tok', 'pos' and 'tag'.
df = pd.DataFrame(vocab_pos_ner_list)
print(df.shape)

In [None]:
# Full token table (text, POS tag, NER tag), kept for inspection.
df.to_csv('vul_tags.csv', index=False)

# 50 % train / 25 % dev / remainder test, split by row position.
n_rows = df.shape[0]
train_perc = int(n_rows * 50 / 100)
dev_perc = train_perc + int(n_rows * 25 / 100)

# Positional, half-open slices: every row lands in exactly one split.
# Label-based df.loc[a:b] includes both end points, which would write the
# boundary rows into two files.
splits = [
    ('train.txt.tmp', df.iloc[:train_perc]),
    ('dev.txt.tmp', df.iloc[train_perc:dev_perc]),
    ('test.txt.tmp', df.iloc[dev_perc:]),
]
for path, part in splits:
    # One "<token> <tag>" pair per line, without header or index.
    part.to_csv(path, sep=' ', columns=['tok', 'tag'], index=False,
                header=False)

# Every tag that occurs in the data, in order of first appearance.
labels = df['tag'].unique()
with open('labels.txt', 'w') as f:
    f.writelines(f'{label}\n' for label in labels)

In [None]:
# Copy the generated splits and the label list into the legacy
# token-classification example directory of the transformers checkout.
!cp train.txt.tmp transformers/examples/legacy/token-classification/
!cp dev.txt.tmp transformers/examples/legacy/token-classification/
!cp test.txt.tmp transformers/examples/legacy/token-classification/
!cp labels.txt transformers/examples/legacy/token-classification/

In [None]:
# NOTE(review): this reads requirements.txt from the current working
# directory, not from the example directory - confirm which file is intended.
!pip install -r requirements.txt

In [None]:
# Run custom_run.sh (copied there by the install cell) from inside the example
# directory; presumably it launches the BERT fine-tuning - its contents are
# not part of this notebook.
!cd transformers/examples/legacy/token-classification; ./custom_run.sh