# Import packages

In [2]:
import os
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
from google.colab import drive

!pip install transformers



# Load the pre-trained models

In [4]:
# Mount Google Drive and build the model directories from the absolute mount
# point, so the paths keep resolving even if the working directory is no
# longer /content (e.g. after a %cd).
drive_mount_point = '/content/gdrive'
drive.mount(drive_mount_point)
ui_output_dir = os.path.join(drive_mount_point, 'Shareddrives/twlim_ucsd_drive/Model/UI')
pr_output_dir = os.path.join(drive_mount_point, 'Shareddrives/twlim_ucsd_drive/Model/PR')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [5]:
import torch

# Choose the torch device once: CUDA when PyTorch reports it, CPU otherwise.
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')

# Report which hardware was selected.
if not use_gpu:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [6]:
from transformers import BertForSequenceClassification, BertTokenizer

def _load_classifier(model_dir):
  """Load the fine-tuned classifier and its tokenizer saved in `model_dir`.

  The model is loaded with `output_hidden_states=True` and moved to the
  notebook-level `device`. Returns `(model, tokenizer)`.
  """
  model = BertForSequenceClassification.from_pretrained(model_dir, output_hidden_states=True)
  tokenizer = BertTokenizer.from_pretrained(model_dir)
  model.to(device)
  return model, tokenizer


# One classifier per predicted metric, loaded from the UI and PR directories.
ui_model, ui_tokenizer = _load_classifier(ui_output_dir)
pr_model, pr_tokenizer = _load_classifier(pr_output_dir)

print('All models loaded')

All models loaded


In [7]:
import torch
from keras.preprocessing.sequence import pad_sequences

def text_to_embedding(tokenizer, model, max_len, in_text, layer_i=12, token_i=0):
  """Run one text through a classifier and return its logits and an embedding.

  Args:
    tokenizer: tokenizer exposing `encode_plus` (here a BertTokenizer).
    model: sequence classifier whose output exposes `.logits` and
      `.hidden_states` (i.e. loaded with `output_hidden_states=True`).
    max_len: the text is padded / truncated to exactly this many tokens.
    in_text: the text to encode.
    layer_i: index into `hidden_states` to read the embedding from. The
      default 12 is the value this notebook has always used (presumably the
      final encoder layer of a 12-layer BERT -- confirm for other models).
    token_i: token position whose hidden state is returned. The default 0 is
      the first token, which is '[CLS]' because special tokens are added.

  Returns:
    (logits, vec): `logits` is a numpy array with a leading batch axis of
    size 1; `vec` is the 1-D numpy hidden-state vector of the chosen token.
  """
  encoded_dict = tokenizer.encode_plus(
      in_text,                       # Sentence to encode.
      add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
      max_length=max_len,            # Pad & truncate all sentences.
      padding='max_length',
      truncation=True,
      return_attention_mask=True,    # Construct attn. masks.
      return_tensors='pt',           # Return pytorch tensors.
  )

  # Send the inputs to wherever the model's weights live instead of relying on
  # a notebook-global `device`; this keeps the function self-contained and
  # avoids device-mismatch errors if the model was moved.
  model_device = next(model.parameters()).device
  input_ids = encoded_dict['input_ids'].to(model_device)
  attn_mask = encoded_dict['attention_mask'].to(model_device)

  # Inference only: evaluation mode and no gradient tracking.
  model.eval()
  with torch.no_grad():
    result = model(input_ids=input_ids,
                   token_type_ids=None,
                   attention_mask=attn_mask)

  logits = result.logits.detach().cpu().numpy()

  # A single text is encoded per call, so the batch holds exactly one row.
  batch_i = 0
  vec = result.hidden_states[layer_i][batch_i][token_i]
  vec = vec.detach().cpu().numpy()

  return logits, vec

# CVSS and metrics prediction

In [8]:
# Sample vulnerability descriptions used to exercise the classifiers.
input_text_1 = (
    'Sudo before 1.6.6 contains an off-by-one error that can result in a heap-based buffer overflow that may allow '
    'local users to gain root privileges via special characters in the -p (prompt) argument, which are not properly expanded.'
)
input_text_2 = (
    'Ubiquiti Networks EdgeSwitch version 1.7.3 and prior suffer from an improperly neutralized element in an OS command '
    'due to lack of protection on the admin CLI, leading to code execution and privilege escalation greater than administrators themselves '
    'are allowed. An attacker with access to an admin account could escape the restricted CLI and execute arbitrary shell instructions.'
)
input_text_3 = (
    'A "javascript:" url loaded by a malicious page can obfuscate its location by blanking the URL displayed in the addressbar, '
    "allowing for an attacker to spoof an existing page without the malicious page's address being displayed correctly. This vulnerability affects Firefox < 52."
)
input_text_4 = "stack over flow that caused by user inserting long text in chrome browser address url bar"

# The description fed to the models below; the cell displays its length.
input_text = input_text_2
len(input_text)

380

In [9]:
import textwrap

# Wrapper used to print long descriptions as an indented, 120-column block.
wrapper = textwrap.TextWrapper(width=120, initial_indent='  ', subsequent_indent='  ')
print('Embedding: \n\n', wrapper.fill(input_text))

print('\nPredictions:\n')

# UI classifier: class 0 is reported as None, any other class as Required.
logits, vec = text_to_embedding(ui_tokenizer, ui_model, 512, input_text)
ui_class = np.argmax(logits, axis=1).item()
print('  UI: None' if ui_class == 0 else '  UI: Required')

# PR classifier: class 0 -> None, class 1 -> Low, anything else -> High.
logits, vec = text_to_embedding(pr_tokenizer, pr_model, 512, input_text)
pr_class = np.argmax(logits, axis=1).item()
pr_messages = {0: '  PR: None', 1: '  PR: Low'}
print(pr_messages.get(pr_class, '  PR: High'))


Embedding: 

   Ubiquiti Networks EdgeSwitch version 1.7.3 and prior suffer from an improperly neutralized element in an OS command
  due to lack of protection on the admin CLI, leading to code execution and privilege escalation greater than
  administrators themselves are allowed. An attacker with access to an admin account could escape the restricted CLI and
  execute arbitrary shell instructions.

Predictions:

  UI: None
  PR: High


# Semantic similarity search

## KNN

In [10]:
# Install FAISS, which the KNN similarity search below imports as `faiss`.
# NOTE(review): two distributions are installed here; presumably only the GPU
# build is needed and the versions should be pinned -- confirm before changing.
!pip install faiss
!pip install faiss-gpu

Collecting faiss
[?25l  Downloading https://files.pythonhosted.org/packages/ef/2e/dc5697e9ff6f313dcaf3afe5ca39d7d8334114cbabaed069d0026bbc3c61/faiss-1.5.3-cp37-cp37m-manylinux1_x86_64.whl (4.7MB)
[K     |████████████████████████████████| 4.7MB 7.4MB/s 
Installing collected packages: faiss
Successfully installed faiss-1.5.3
Collecting faiss-gpu
[?25l  Downloading https://files.pythonhosted.org/packages/5d/36/383911b8edf8c29cb7e9e8aee4e6b69b0f36c52237e3a06ce64a9551ef22/faiss_gpu-1.7.0-cp37-cp37m-manylinux2014_x86_64.whl (89.4MB)
[K     |████████████████████████████████| 89.4MB 56kB/s 
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.0


In [11]:
# Load the CVE records from the mounted Drive. The path is absolute (Drive is
# mounted at /content/gdrive) so it does not depend on the working directory,
# and the encoding is explicit because open()'s default is platform-dependent.
file_path = '/content/gdrive/Shareddrives/twlim_ucsd_drive/Data/cve.json'
with open(file_path, 'r', encoding='utf-8') as fp:
    data = json.load(fp)
len(data)

150600

In [12]:
# Build one row per CVE record that carries a CVSS v3 base metric:
#   cve_id - the CVE identifier
#   text   - all description values joined with single spaces
#   label  - 0 when userInteraction is 'NONE', otherwise 1
cve_id = list()
text = list()
label = list()
for idx, record in enumerate(data):
    try:
        impact = record.get('impact')
        if impact and impact.get('baseMetricV3'):
            # Extract every field BEFORE appending anything: if a key is
            # missing, none of the three lists is touched, so they always stay
            # the same length and the DataFrame below can be constructed.
            record_id = record['cve']['CVE_data_meta']['ID']
            description = ' '.join(
                part['value'] for part in record['cve']['description']['description_data']
            )
            ui_label = 0 if impact['baseMetricV3']['cvssV3']['userInteraction'] == 'NONE' else 1

            cve_id.append(record_id)
            text.append(description)
            label.append(ui_label)
    except KeyError:
        # Unexpected schema: report the index of the offending record and stop
        # scanning (records before it are kept).
        print(idx)
        break

df = pd.DataFrame({'cve_id': cve_id, 'label': label, 'text': text})

In [13]:
# Work with the first 1000 descriptions only; the cell displays the count.
sentences = df['text'].values[:1000]
len(sentences)

1000

In [14]:
# Embed every description with the UI model and stack the vectors into one
# (n_sentences, dim) matrix. The comprehension keeps its loop variable local,
# so the notebook-level `input_text`, `logits` and `vec` are not overwritten,
# and float32 is requested explicitly because that is the dtype faiss indexes
# accept.
vecs = np.array(
    [text_to_embedding(ui_tokenizer, ui_model, 512, sentence)[1] for sentence in sentences],
    dtype=np.float32,
)

In [15]:
import faiss
import time

# Flat L2 index whose dimensionality is the width of the embedding matrix.
cpu_index = faiss.IndexFlatL2(vecs.shape[1])

# Number of GPUs the index is cloned onto.
n_gpu = 1

print('Number of GPU: {} using {}'.format(faiss.get_num_gpus(), n_gpu))

# Cloner options for the CPU -> GPU copy.
# NOTE(review): `shard = True` presumably splits the vectors across GPUs
# instead of replicating them, which should make no difference with
# n_gpu = 1 -- confirm against the faiss docs.
co = faiss.GpuMultipleClonerOptions()
co.shard = True

gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=co, ngpu=n_gpu)

# Time only the insertion of the vectors into the GPU index.
t0 = time.time()

gpu_index.add(vecs)

elapsed = time.time() - t0
print('Building index took {} seconds'.format(elapsed))

Number of GPU: 1 using 1
Building index took 0.0015549659729003906 seconds


In [16]:
def find_top_3(input_text_vec, k=3):
  """Print the `k` indexed CVEs closest to an embedding (3 by default).

  Searches the notebook-level `gpu_index` and prints, for every hit, its row
  position, CVE id, L2 distance and description (wrapped with `wrapper`).
  Row positions are looked up in `df` with `iloc`, so the index must have
  been built from the leading rows of `df`, in order.

  Args:
    input_text_vec: 1-D embedding vector; its length must match the index
      dimensionality.
    k: number of neighbours to request.
  """
  # faiss takes a 2-D float32 batch. The width is inferred from the vector
  # rather than hard-coded to 768, so other embedding sizes work too.
  query = np.ascontiguousarray(input_text_vec, dtype=np.float32).reshape(1, -1)
  D, I = gpu_index.search(query, k)

  print('Top {} results'.format(k))

  for distance, result_i in zip(D[0], I[0]):
    # faiss pads the result with id -1 when fewer than k neighbours exist;
    # without this guard `iloc[-1]` would silently print the last row of df.
    if result_i < 0:
      continue

    print(result_i)
    row = df.iloc[result_i]

    print(wrapper.fill(row['cve_id']))
    print(wrapper.fill('L2 distance: {}'.format(distance)))
    print(wrapper.fill(row['text']))
    print('')

In [17]:
# Display the first indexed description (it is reused as `input_text_1` below).
sentences[0]

'Stack-based buffer overflow in the jpc_tsfb_getbands2 function in jpc_tsfb.c in JasPer before 1.900.30 allows remote attackers to have unspecified impact via a crafted image.'

In [20]:
# Query candidates for the similarity search. The first one is the text of the
# first indexed description; the last one is a free-form, user-style phrase.
input_text_1 = (
    'Stack-based buffer overflow in the jpc_tsfb_getbands2 function in jpc_tsfb.c in JasPer before 1.900.30 allows '
    'remote attackers to have unspecified impact via a crafted image.'
)
input_text_2 = (
    'Ubiquiti Networks EdgeSwitch version 1.7.3 and prior suffer from an improperly neutralized element in an OS command '
    'due to lack of protection on the admin CLI, leading to code execution and privilege escalation greater than administrators themselves '
    'are allowed. An attacker with access to an admin account could escape the restricted CLI and execute arbitrary shell instructions.'
)
input_text_3 = (
    'A "javascript:" url loaded by a malicious page can obfuscate its location by blanking the URL displayed in the addressbar, '
    "allowing for an attacker to spoof an existing page without the malicious page's address being displayed correctly. This vulnerability affects Firefox < 52."
)
input_text_4 = "stack over flow that caused by user inserting long text in chrome browser address url bar"

# The query actually used by the search cell.
input_text = input_text_4

In [21]:
# Embed the chosen query with the UI model, then print its nearest indexed CVEs.
logits, vec = text_to_embedding(
    tokenizer=ui_tokenizer,
    model=ui_model,
    max_len=512,
    in_text=input_text,
)
find_top_3(vec)


Top 3 results
123
  CVE-2020-26519
  L2 distance: 22.360809326171875
  Artifex MuPDF before 1.18.0 has a heap based buffer over-write when parsing JBIG2 files allowing attackers to cause a
  denial of service.

487
  CVE-2020-35522
  L2 distance: 24.911407470703125
  In LibTIFF, there is a memory malloc failure in tif_pixarlog.c. A crafted TIFF document can lead to an abort,
  resulting in a remote denial of service attack.

823
  CVE-2017-9937
  L2 distance: 25.4649658203125
  In LibTIFF 4.0.8, there is a memory malloc failure in tif_jbig.c. A crafted TIFF document can lead to an abort
  resulting in a remote denial of service attack.

