# DSE 260A Capstone Project - Winter 2021

Author: Saba Janamian  
Created: 1/24/2021  

----

In [31]:
# Standard library
import json
import os
import pprint

# Third-party
import spacy

# Load the English spaCy pipeline `en_core_web_sm`; it is called as `nlp(text)`
nlp = spacy.load("en_core_web_sm")

# Pretty-printer with a 4-space indent
pp = pprint.PrettyPrinter(indent=4)

In [15]:
# Read the CVE JSON file into a plain Python dict (`cve_dict`).
# JSON text is UTF-8 by specification, so the encoding is pinned here instead
# of being left to the platform default, which is not UTF-8 on every system.
with open('data/nvdcve-1.1-2020.json', encoding='utf-8') as f:
    cve_dict = json.load(f)

In [70]:
# Build one text blob from the descriptions of the first LIMIT CVE entries,
# with every description terminated by a newline.
LIMIT = 10  # number of CVE entries to sample; expected to be a non-negative count

# Slicing takes at most LIMIT entries. Unlike a `counter == LIMIT` check made
# after the increment, it yields nothing (rather than everything) when LIMIT is 0.
sampled_items = cve_dict['CVE_Items'][:LIMIT]

# A single join avoids re-copying the growing string for every description.
corpus = ''.join(
    desc['value'] + '\n'
    for cve in sampled_items
    for desc in cve['cve']['description']['description_data']
)

# Number of entries consumed; the name is retained in case another cell reads it.
counter = len(sampled_items)

In [71]:
# Run the spaCy pipeline (`nlp`) over the whole corpus string in a single call;
# the resulting `doc` is what the cells below iterate over.
doc = nlp(corpus)

In [72]:
# Print a fixed-width table with one row per token in `doc`:
# token text, part-of-speech tag and dependency label.
dash = 80 * '-'
print(dash)
print(f"{'TEXT':<50}{'POS':<10}{'DEP':<10}")
print(dash)
for tok in doc:
    # Left-align every field so the columns line up with the header
    print("{:<50}{:<10}{:<10}".format(tok.text, tok.pos_, tok.dep_))

--------------------------------------------------------------------------------
TEXT                                              POS       DEP       
--------------------------------------------------------------------------------
In                                                ADP       prep      
getProcessRecordLocked                            PROPN     nsubjpass 
of                                                ADP       prep      
ActivityManagerService.java                       NOUN      pobj      
isolated                                          VERB      amod      
apps                                              NOUN      pobj      
are                                               AUX       auxpass   
not                                               PART      neg       
handled                                           VERB      ROOT      
correctly                                         ADV       advmod    
.                                                 PUNCT  

In [73]:
# Print a fixed-width table of the entities in `doc.ents`:
# entity text in the first column, its label in the second.
dash = 80 * '-'
print(dash)
print(f"{'TEXT':<50}{'ENTITY':<10}")
print(dash)
for entity in doc.ents:
    print("{:<50}{:<10}".format(entity.text, entity.label_))

--------------------------------------------------------------------------------
TEXT                                              ENTITY    
--------------------------------------------------------------------------------
Android-8.1                                       GPE       
Android                                           PERSON    
Android-8.1                                       GPE       
Android                                           PERSON    
onCreate of InstallStart.java                     WORK_OF_ART
Android Versions:                                 ORG       
rw_i93_send_cmd_write_single_block of rw_i93.cc   PERSON    
Android                                           PERSON    
Android-8.1                                       GPE       
Android                                           PERSON    
Sensor.cpp                                        ORG       
Android-8.1                                       GPE       
Android                                     

In [74]:
# Analyze syntax
print("Noun phrases:")
pp.pprint(set([chunk.text for chunk in doc.noun_chunks]))

Noun phrases:
{   '/A',
    'A-137014293References',
    'A-139738828',
    'A-140055304',
    'A-140195904',
    'A-142938932',
    'ActivityManagerService.java',
    'Android Versions',
    'Android kernel Android ID',
    'Android kernelAndroid',
    'Android-10 Android ID',
    'Android-8.0',
    'Android-8.0 Android ID',
    'Android-8.0 Android-8.1 Android-9 Android-10Android',
    'AndroidVersions',
    'ID',
    'LowEnergyClient::MtuChangedCallback',
    'N',
    'Product',
    'Sensor.cpp',
    'System execution privileges',
    'User interaction',
    'WallpaperManagerService.java',
    'a permissions bypass',
    'a possible arbitrary write',
    'a possible information disclosure',
    'a possible package validation bypass',
    'a possible sysui crash',
    'a race condition',
    'a use',
    'ashmem.c',
    'bounds',
    'btm_acl.cc',
    'btm_read_remote_ext_features_complete',
    'calc_vm_may_flags',
    'check',
    'exploitation',
    'exploitation Product',
    'fl

In [75]:
# Collect the distinct lemmas of every token tagged as a verb and pretty-print them
print("Verbs:")
verb_lemmas = {tok.lemma_ for tok in doc if tok.pos_ == "VERB"}
pp.pprint(verb_lemmas)

Verbs:
{   'check',
    'corrupt',
    'could',
    'exceed',
    'handle',
    'isolate',
    'lead',
    'miss',
    'need',
    'read',
    'share',
    'write'}
