**Title**: CVSS prediction\
**Description**: Load all pre-trained models to predict CVSS score\
**Developer**: Teck Lim\
**Create date**: 04/06/2021

# Import packages

In [None]:
import os
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import math
import textwrap
from google.colab import drive

# Shared wrapper used when printing search results: every line is indented by
# two spaces and text is wrapped at 120 columns.
_result_indent = ' ' * 2
wrapper = textwrap.TextWrapper(
    width=120,
    initial_indent=_result_indent,
    subsequent_indent=_result_indent,
)

# Install the Hugging Face `transformers` package into the runtime (no version is pinned).
!pip install transformers

In [None]:
# Mount Google Drive at /content/gdrive. The data and model paths used later are
# relative ('./gdrive/...'), so this presumably relies on the working directory
# being /content — TODO confirm.
drive.mount('/content/gdrive')

# Semantic similarity search

## KNN

In [None]:
# Install FAISS, which provides the nearest-neighbour index used below.
# NOTE(review): two separate packages (`faiss` and `faiss-gpu`) are installed and
# neither is pinned — confirm that both are actually required.
!pip install faiss
!pip install faiss-gpu

In [None]:
import torch

# Select the torch device once; later cells move the models and their inputs to it.
if not torch.cuda.is_available():
    # CPU fallback when CUDA is not usable.
    print('No GPU available, using the CPU instead.')
    device = torch.device('cpu')
else:
    # CUDA is usable: target the GPU and report what was found.
    device = torch.device('cuda')
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Load the raw CVE records from the shared drive.
file_path = './gdrive/Shareddrives/ucsd_drive/Data/cve.json'
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, so the file is read the same way on every machine.
with open(file_path, 'r', encoding='utf-8') as fp:
    data = json.load(fp)
len(data)

In [None]:
# Flatten the raw CVE records into two parallel columns: the CVE id and the
# space-joined description text.
cve_id = []
text = []
for idx, record in enumerate(data):
    try:
        # Resolve BOTH fields before appending either one, so a record with a
        # missing key can never leave the two lists with different lengths
        # (which would make the DataFrame construction below fail).
        record_id = record['cve']['CVE_data_meta']['ID']
        description = ' '.join(
            entry['value']
            for entry in record['cve']['description']['description_data']
        )
    except KeyError:
        # Report the malformed record and move on instead of silently dropping
        # every record that comes after it.
        print(idx)
        continue
    cve_id.append(record_id)
    text.append(description)

df = pd.DataFrame({'cve_id': cve_id, 'text': text})

In [None]:
# Pull the description column out as an array; the embedding loops iterate over it.
sentences = df['text'].values
len(sentences)

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

def _load_classifier(model_dir):
    """Load a saved BERT classifier and its tokenizer from ``model_dir``.

    The model is loaded with ``output_hidden_states=True`` so hidden layers can
    be read as embeddings, and is moved to the notebook-level ``device``.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    model = BertForSequenceClassification.from_pretrained(model_dir, output_hidden_states=True)
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    model.to(device)
    return model, tokenizer


# Model saved under the "UI" directory.
ui_output_dir = './gdrive/Shareddrives/ucsd_drive/Model/UI'
ui_model, ui_tokenizer = _load_classifier(ui_output_dir)

# Model saved under the "PR" directory.
pr_output_dir = './gdrive/Shareddrives/ucsd_drive/Model/PR'
pr_model, pr_tokenizer = _load_classifier(pr_output_dir)

# Last expression of the cell: display the PR model, as the original `.to(device)` call did.
pr_model

In [None]:
import torch
def text_to_embedding(tokenizer, model, max_len, in_text, layer_i=12):
    """Run one text through a classifier and return its logits and an embedding.

    Args:
        tokenizer: Tokenizer paired with ``model``. It is called directly with
            the text and must return ``input_ids`` and ``attention_mask`` as
            PyTorch tensors.
        model: Model that returns ``logits`` and ``hidden_states`` (i.e. it was
            loaded with ``output_hidden_states=True``) and exposes ``.device``.
        max_len: Length every input is padded / truncated to.
        in_text: The single sentence to embed.
        layer_i: Index into ``hidden_states`` to take the embedding from.
            Defaults to 12, the value this notebook has always used
            (presumably the last layer of a 12-layer BERT — TODO confirm).

    Returns:
        tuple: ``(logits, vec)`` as NumPy arrays. ``logits`` is the model's
        logits for the one-item batch; ``vec`` is the hidden state of the first
        token of the first batch item at layer ``layer_i``.
    """
    # Calling the tokenizer directly replaces the deprecated `encode_plus`;
    # the arguments are unchanged.
    encoded_dict = tokenizer(
        in_text,                       # Sentence to encode.
        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
        max_length=max_len,            # Pad & truncate all sentences.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,    # Construct attn. masks.
        return_tensors='pt',           # Return pytorch tensors.
    )

    # Place the inputs on the device the model itself lives on, rather than on a
    # notebook-level global that may not match the model.
    target_device = model.device
    input_ids = encoded_dict['input_ids'].to(target_device)
    attn_mask = encoded_dict['attention_mask'].to(target_device)

    model.eval()

    with torch.no_grad():
        result = model(input_ids=input_ids,
                       token_type_ids=None,
                       attention_mask=attn_mask)

    batch_i = 0  # only one sentence is encoded per call
    token_i = 0  # first token position

    logits = result.logits.detach().cpu().numpy()
    vec = result.hidden_states[layer_i][batch_i][token_i].detach().cpu().numpy()

    return logits, vec

In [None]:
# Embed the first 1000 CVE descriptions with the UI model.
# Vectors are collected in a list and stacked once at the end: stacking inside
# the loop re-copies the whole matrix on every iteration, and left a 1-D array
# (instead of a 1-row matrix) when only a single sentence was embedded.
ui_rows = []
for idx, input_text in enumerate(sentences[:1000]):
    _logits, vec = text_to_embedding(ui_tokenizer, ui_model, 512, input_text)
    ui_rows.append(vec)
    if (idx + 1) % 1000 == 0:
        print('Processing index: {}'.format(idx + 1))

# One row per embedded sentence.
vecs_stacked = np.vstack(ui_rows)
print(vecs_stacked.shape)

In [None]:
# Keep the UI-model embeddings under their own name, because `vecs_stacked` is
# reassigned when the PR model is run.
ui_vecs = vecs_stacked
ui_vecs.shape

In [None]:
# Embed the first 1000 CVE descriptions with the PR model.
# Vectors are collected in a list and stacked once at the end: stacking inside
# the loop re-copies the whole matrix on every iteration, and left a 1-D array
# (instead of a 1-row matrix) when only a single sentence was embedded.
pr_rows = []
for idx, input_text in enumerate(sentences[:1000]):
    _logits, vec = text_to_embedding(pr_tokenizer, pr_model, 512, input_text)
    pr_rows.append(vec)
    if (idx + 1) % 1000 == 0:
        print('Processing index: {}'.format(idx + 1))

# One row per embedded sentence.
vecs_stacked = np.vstack(pr_rows)
print(vecs_stacked.shape)

In [None]:
# Keep the PR-model embeddings under their own name rather than the scratch
# name `vecs_stacked`.
pr_vecs = vecs_stacked
pr_vecs.shape

In [None]:
# Persist the embeddings together with their CVE ids.
# `pr_vecs` is referenced explicitly instead of the scratch name `vecs_stacked`:
# in a top-to-bottom run they are the same array, but `vecs_stacked` holds
# whichever embedding loop happened to run last, so the saved file would
# otherwise depend on cell execution order.
df_vectorized = pd.DataFrame(data=pr_vecs)
# insert() aligns the Series on the row index, so only the rows that were
# actually embedded receive a cve_id.
df_vectorized.insert(loc=0, column='cve_id', value=df['cve_id'])

save_path = './gdrive/Shareddrives/ucsd_drive/Data/cve_vectorized.csv'
df_vectorized.to_csv(save_path)

In [None]:
# Choose the embedding matrix the FAISS index is built from (here: the UI-model vectors).
vecs = ui_vecs

In [None]:
import faiss
import time

# Build a flat L2 index over the selected embedding matrix.
# FAISS expects a C-contiguous float32 matrix, so normalise the input up front.
index_vecs = np.ascontiguousarray(vecs, dtype=np.float32)
cpu_index = faiss.IndexFlatL2(index_vecs.shape[1])

# Some FAISS builds may not expose the GPU helpers at all; treat that as "no GPU".
available_gpus = faiss.get_num_gpus() if hasattr(faiss, 'get_num_gpus') else 0
n_gpu = min(1, available_gpus)

print('Number of GPU: {} using {}'.format(available_gpus, n_gpu))

if n_gpu > 0:
    co = faiss.GpuMultipleClonerOptions()
    co.shard = True
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=co, ngpu=n_gpu)
else:
    # No GPU visible to FAISS: search on the CPU index instead of failing.
    # The name `gpu_index` is kept because later cells look it up.
    print('No GPU visible to FAISS, using the CPU index instead.')
    gpu_index = cpu_index

t0 = time.time()

gpu_index.add(index_vecs)

elapsed = time.time() - t0
print('Building index took {} seconds'.format(elapsed))

In [None]:
def find_top_3(input_text_vec, index=None):
    """Print the three indexed CVEs closest to ``input_text_vec``.

    Args:
        input_text_vec: 1-D embedding of the query text. Its length must match
            the dimensionality of the index being searched.
        index: Index to search. Defaults to the notebook-level ``gpu_index``;
            pass another index to search vectors from a different model.

    Returns:
        None. For each hit, the row position, CVE id, L2 distance and
        description are printed; ids and text are looked up in ``df``.
    """
    if index is None:
        index = gpu_index

    # Shape the query as a single-row float32 matrix. The width is taken from
    # the vector itself instead of being hard-coded to 768.
    query = np.ascontiguousarray(input_text_vec, dtype=np.float32).reshape(1, -1)
    D, I = index.search(query, k=3)

    print('Top 3 results')

    for distance, result_i in zip(D[0], I[0]):
        if result_i < 0:
            # FAISS fills missing neighbours with -1 when the index holds fewer
            # than k vectors; df.iloc[-1] would silently print the last row.
            continue
        print(result_i)
        row = df.iloc[result_i]

        print(wrapper.fill(row.cve_id))
        print(wrapper.fill('L2 distance: {}'.format(distance)))
        print(wrapper.fill(row.text))
        print('')

In [None]:
# Display the first description as a quick look at the loaded text.
sentences[0]

In [None]:
# Example queries for the similarity search. Adjacent string literals inside the
# parentheses are concatenated into one string.
input_text_1 = (
    'Stack-based buffer overflow in the jpc_tsfb_getbands2 function in jpc_tsfb.c in JasPer before 1.900.30 allows '
    'remote attackers to have unspecified impact via a crafted image.'
)
input_text_2 = (
    'Ubiquiti Networks EdgeSwitch version 1.7.3 and prior suffer from an improperly neutralized element in an OS command '
    'due to lack of protection on the admin CLI, leading to code execution and privilege escalation greater than administrators themselves '
    'are allowed. An attacker with access to an admin account could escape the restricted CLI and execute arbitrary shell instructions.'
)
input_text_3 = (
    'A "javascript:" url loaded by a malicious page can obfuscate its location by blanking the URL displayed in the addressbar, '
    'allowing for an attacker to spoof an existing page without the malicious page\'s address being displayed correctly. This vulnerability affects Firefox < 52.'
)
input_text_4 = "stack over flow that caused by user inserting long text in chrome browser address url bar"

# Query used by the search cells that follow.
input_text = input_text_1

In [None]:
# Embed the query with the PR model and print its three nearest neighbours.
# NOTE(review): find_top_3 searches `gpu_index`, which was filled from
# `vecs = ui_vecs` (UI-model embeddings), while this query vector comes from
# pr_model — confirm that comparing across the two models is intended.
logits, vec = text_to_embedding(pr_tokenizer, pr_model, 512, input_text)
find_top_3(vec)


In [None]:
# Embed the same query with the UI model and print its three nearest neighbours.
logits, vec = text_to_embedding(ui_tokenizer, ui_model, 512, input_text)
find_top_3(vec)