In [143]:
!pip install lxml jsonpath_ng requests pivottablejs pandas
from jsonpath_ng import jsonpath, parse
import csv
import requests
import zipfile
import io
import json
import os

def get_data(uri):
    """Return the payload of a single-member zip archive, downloading it if needed.

    The archive is cached in the current working directory under the last
    path segment of ``uri``; a file that already exists under that name is
    reused without touching the network.

    Args:
        uri: URL of the zip archive.

    Returns:
        The parsed JSON document when the member's name ends in ``.json``,
        otherwise the member's raw ``bytes``.

    Raises:
        requests.HTTPError: the download answered with an error status.
        AssertionError: the archive does not hold exactly one member.
    """
    fn = uri.split('/')[-1]

    if os.path.exists(fn):
        print('opening {}'.format(fn))
    else:
        print('downloading {}'.format(fn))
        res = requests.get(uri)
        # Fail before writing, so an HTTP error body is never cached as if it
        # were the archive.
        res.raise_for_status()
        with open(fn, 'wb') as outf:
            print('wrote {} bytes'.format(outf.write(res.content)))

    # The context manager closes the archive even when the assertion or the
    # JSON parsing raises.
    with zipfile.ZipFile(fn) as z:
        members = z.infolist()
        assert len(members) == 1, 'expected exactly one file in {}'.format(fn)
        data = z.read(members[0])
        if members[0].orig_filename.endswith('.json'):
            data = json.loads(data)
    return data

# Yearly NVD JSON 1.0 feeds. Widen the range by editing the two year
# constants instead of pasting another URL.
NVD_FEED_TEMPLATE = 'https://static.nvd.nist.gov/feeds/json/cve/1.0/nvdcve-1.0-{}.json.zip'
NVD_FIRST_YEAR = 2008
NVD_LAST_YEAR = 2018

# Newest year first, matching the order the feeds are read in.
data_uris = [
    NVD_FEED_TEMPLATE.format(year)
    for year in range(NVD_LAST_YEAR, NVD_FIRST_YEAR - 1, -1)
]

# Fields read from the cvssV2 object of each CVE item.
sevlist = [
    'accessVector',
    'accessComplexity',
    'authentication',
    'confidentialityImpact',
    'integrityImpact',
    'availabilityImpact',
    'baseScore'
]

# Every severity field sits under the same parent path.
SEVERITY_PATH = 'impact.baseMetricV2.cvssV2.{}'

# Built as a real list rather than a one-shot map(): the pairs are still
# there after `columns` has consumed them, so later cells can reuse them.
sev_items = [[field, parse(SEVERITY_PATH.format(field))] for field in sevlist]

# Output column name -> compiled JSONPath expression, evaluated per CVE item.
columns = dict(
    sev_items,
    cve_id=parse('cve.CVE_data_meta.ID'),
    vendor=parse('cve.affects.vendor.vendor_data.[*].vendor_name'),
    product=parse('cve.affects.vendor.vendor_data.[*].product.[*].product_data.[*].product_name'),
    cwe=parse('cve.problemtype.problemtype_data.[*].description.[*].value'),
)
    
def gpv(data, path):
    """Collect the distinct values a compiled path expression matches in ``data``.

    Args:
        data: document to search.
        path: object whose ``find(data)`` returns matches exposing ``.value``.

    Returns:
        The value itself (type preserved) when exactly one distinct value is
        matched. Otherwise the distinct values rendered with ``str`` and
        joined with ``,`` in first-match order; ``''`` when nothing matched.
    """
    # dict.fromkeys drops duplicates like set() did, but keeps first-seen
    # order, so the joined string is the same on every run.
    values = list(dict.fromkeys(match.value for match in path.find(data)))
    if len(values) == 1:
        return values[0]
    # str() keeps the join from raising when the matches are not strings.
    return ','.join(map(str, values))
    
def cve_extract_fields(items):
    """Lazily flatten CVE items into one dict per item.

    Each yielded dict has the keys of ``columns``; every value is what
    ``gpv`` returns for that column's path expression on the item.
    """
    for record in items:
        row = {}
        for column_name, expression in columns.items():
            row[column_name] = gpv(record, expression)
        yield row



In [17]:
from lxml import etree, objectify


def get_xml():
    """Load the CWE XML catalog (view 2000) with namespace prefixes stripped.

    Returns:
        The element produced by ``etree.fromstring`` for the downloaded
        document. Every element tag has its leading ``{namespace}`` part
        removed, so XPath queries can use bare element names.
    """
    cwe = get_data('https://cwe.mitre.org/data/xml/views/2000.xml.zip')
    cwe = etree.fromstring(cwe)
    root = cwe.getroottree().getroot()
    # iter() is the supported spelling of the deprecated getiterator().
    for elem in root.iter():
        # Skip nodes whose tag is not a string (in lxml: comments and
        # processing instructions); they have no namespace to strip.
        if not hasattr(elem.tag, 'find'):
            continue
        i = elem.tag.find('}')
        if i >= 0:
            elem.tag = elem.tag[i+1:]
    objectify.deannotate(root, cleanup_namespaces=True)
    return cwe

cwe = get_xml()

opening 2000.xml.zip


In [18]:
weaknesses = cwe.xpath('/Weakness_Catalog/Weaknesses/*')
categories = cwe.xpath('/Weakness_Catalog/Categories/*')
# Placeholders left for later: views, external references.

# Category id -> record with the same keys as the weakness rows. Only keys
# that match an XML attribute name (case-insensitively) get filled in.
cwe_cats = {}
for category in categories:
    record = {
        "name": None,
        "id": None,
        "parents": [],
        "platforms": None,
        "modes": None,
        "consequence": None,
    }
    for attr_name, attr_value in category.items():
        key = attr_name.lower()
        if key not in record:
            continue
        # The ID attribute is stored as an int; everything else stays a string.
        record[key] = int(attr_value) if attr_name == 'ID' else attr_value
    cwe_cats[record['id']] = record
        

In [8]:
# Merge the "CVE_Items" list of every yearly feed into one list.
CVEs = []
for uri in data_uris:
    CVEs.extend(get_data(uri)['CVE_Items'])

# Flatten each item and keep only the rows whose 'cwe' column is non-empty.
cve_out = [row for row in cve_extract_fields(CVEs) if row['cwe']]

with open('CVEs.json', 'w') as outf:
    json.dump(cve_out, outf, indent=1)

cwe = get_data('https://cwe.mitre.org/data/csv/2000.csv.zip')
# Hand csv the text as one stream: splitting on '\n' first would strip the
# line breaks that belong inside quoted fields.
CWEs = [
    dict(line)
    for line in csv.DictReader(io.StringIO(cwe.decode('utf-8'), newline=''))
]

print('{} CWEs read'.format(len(CWEs)))



def parse_with_fm(cwe, field, filters, maps):
    """Split a ``::``-delimited cell, then filter, transform and de-duplicate its entries.

    Args:
        cwe: mapping holding one CWE row.
        field: key of the cell to parse.
        filters: predicates; an entry is kept only if it satisfies all of
            them. Empty entries are always discarded first.
        maps: callables applied to each surviving entry, in order.

    Returns:
        The distinct non-empty results joined with ``', '`` in first-seen
        order; ``''`` when nothing survives.

    Raises:
        KeyError: ``field`` is not a key of ``cwe``.
    """
    # A cell can be None (csv.DictReader's default fill for short rows);
    # treat that like an empty cell instead of failing on .split().
    raw = cwe[field] or ''
    entries = [
        entry for entry in raw.split('::')
        if entry and all(keep(entry) for keep in filters)
    ]
    for transform in maps:
        entries = [transform(entry) for entry in entries]
    # dict.fromkeys de-duplicates like set() did but keeps first-seen order,
    # so the joined string is stable between runs.
    return ', '.join(dict.fromkeys(result for result in entries if result))

def get_f12(f):
    """Condense one ``:``-separated platform entry into a short label.

    When the third token ends in ``PREVALENCE`` the label is the first word
    of the first token plus the second token, e.g.
    ``'LANGUAGE CLASS:Language-Independent:LANGUAGE PREVALENCE:Undetermined'``
    becomes ``'LANGUAGE-Language-Independent'``. Otherwise it is the second
    and third tokens joined with ``-``.

    Args:
        f: a single entry string.

    Returns:
        The label; entries with fewer than three tokens yield whatever
        tokens exist after the first (possibly ``''``).
    """
    parts = f.split(':')
    # Check the length first, so a short entry falls through to the join
    # below instead of raising IndexError.
    if len(parts) > 2 and parts[2].endswith('PREVALENCE'):
        return parts[0].split(' ')[0] + '-' + parts[1]

    return '-'.join(parts[1:3])

def get_modes(f):
    """Return the token that follows ``PHASE`` in a ``:``-separated entry.

    Args:
        f: a single entry string.

    Returns:
        The token right after the first ``PHASE`` token, or ``''`` when there
        is no ``PHASE`` token or nothing follows it.
    """
    m = f.split(':')
    try:
        return m[m.index('PHASE')+1]
    except (ValueError, IndexError):
        # ValueError: no PHASE token. IndexError: PHASE is the last token.
        return ''

def get_consequences(f):
    """Return the second ``:``-separated token of an entry.

    Raises:
        IndexError: the entry contains no ``:``.
    """
    tokens = f.split(':')
    second = tokens[1]
    return second
    
def _cwe_parents(cwe):
    """Fourth token of every ``NATURE:ChildOf`` entry under 'Related Weaknesses'."""
    return parse_with_fm(
        cwe,
        'Related Weaknesses',
        filters=[lambda entry: entry.startswith('NATURE:ChildOf')],
        maps=[lambda entry: entry.split(':')[3]],
    )


def _cwe_platforms(cwe):
    """Labels built by ``get_f12`` from the 'Applicable Platforms' cell."""
    return parse_with_fm(cwe, 'Applicable Platforms', filters=[], maps=[get_f12])


def _cwe_modes(cwe):
    """Values picked by ``get_modes`` from the 'Modes Of Introduction' cell."""
    return parse_with_fm(cwe, 'Modes Of Introduction', filters=[], maps=[get_modes])


def _cwe_consequence(cwe):
    """Values picked by ``get_consequences`` from the 'Common Consequences' cell."""
    return parse_with_fm(cwe, 'Common Consequences', filters=[], maps=[get_consequences])


# Output key -> extractor taking one CWE row. Key order sets the order of
# the keys in each extracted record.
cwe_fields = {
    'name': lambda cwe: cwe['Name'],
    'id': lambda cwe: cwe['ID'],
    'parents': _cwe_parents,
    'platforms': _cwe_platforms,
    'modes': _cwe_modes,
    'consequence': _cwe_consequence,
}
    
def cwe_extract_details(items):
    """Lazily turn CWE rows into records with the keys of ``cwe_fields``.

    Each value is the result of calling that key's extractor on the row.
    """
    for row in items:
        record = {}
        for key, extract in cwe_fields.items():
            record[key] = extract(row)
        yield record
    
# Materialise the generator so the records can be dumped and indexed later.
cwe_out = list(cwe_extract_details(CWEs))

opening nvdcve-1.0-2018.json.zip
opening nvdcve-1.0-2017.json.zip
opening nvdcve-1.0-2016.json.zip
opening nvdcve-1.0-2015.json.zip
opening nvdcve-1.0-2014.json.zip
opening nvdcve-1.0-2013.json.zip
opening nvdcve-1.0-2012.json.zip
opening nvdcve-1.0-2011.json.zip
opening nvdcve-1.0-2010.json.zip
opening nvdcve-1.0-2009.json.zip
opening nvdcve-1.0-2008.json.zip
opening 2000.csv.zip
730 CWEs read


In [10]:
# Write the extracted CWE records to CWEs.json in the working directory.
with open('CWEs.json', 'w') as cwe_file:
    json.dump(cwe_out, cwe_file, indent=1)

In [63]:
# Integer id -> record, covering weaknesses and categories alike. Each
# record gets fresh bookkeeping fields for the tallies computed later.
cwes = {}
for entry in cwe_out + list(cwe_cats.values()):
    entry_id = int(entry['id'])
    cwes[entry_id] = entry
    entry.update(children=[], cves=0, score=0.0)

# Hang every record under each parent named in its comma-separated
# 'parents' string; records with no parents are left alone.
for entry in cwes.values():
    if entry['parents']:
        for parent_id in entry['parents'].split(','):
            cwes[int(parent_id)]['children'].append(entry)

In [64]:
# Spot-check one entry of the lookup; as the cell's last expression it is displayed.
cwes[399]

{'children': [],
 'consequence': None,
 'cves': 0,
 'id': 399,
 'modes': None,
 'name': 'Resource Management Errors',
 'parents': [],
 'platforms': None,
 'score': 0.0}

In [113]:
import collections
# Two independent per-year tallies, keyed by the year taken from the CVE id.
years_with, years_without = collections.Counter(), collections.Counter()

def get_yr(cve):
    """Return characters 4 through 7 of a CVE id, i.e. ``YYYY`` in ``CVE-YYYY-NNNN``."""
    year_start, year_stop = 4, 8
    return cve[year_start:year_stop]

# Start from zero on every run: the loop below adds to these fields, so
# re-executing the cell would otherwise stack a second round on top.
for entry in cwes.values():
    entry['cves'] = 0
    entry['score'] = 0.0

for cve in cve_out:
    year = int(get_yr(cve['cve_id']))
    for label in cve['cwe'].split(','):
        try:
            cwe_id = int(label.split('-')[1])
        except (IndexError, ValueError):
            # The label carries no numeric id, so it cannot be matched to an entry.
            years_without[year] += 1
            continue
        cwes[cwe_id]['cves'] += 1
        # The label named a CWE, so it counts as "with" whether or not the
        # CVE also has a usable score.
        years_with[year] += 1
        try:
            cwes[cwe_id]['score'] += float(cve['baseScore'])
        except ValueError:
            # NOTE(review): a CVE without a numeric baseScore still counts in
            # 'cves', so it pulls that entry's 'average' down.
            continue

# Entries that no CVE referenced have no average; drop them from the lookup.
zeros = [cwe_id for cwe_id, entry in cwes.items() if entry['cves'] == 0]
for cwe_id in zeros:
    del cwes[cwe_id]

for entry in cwes.values():
    entry['average'] = entry['score'] / entry['cves']

In [100]:
# One line per year present in years_with: its share of the two tallies combined.
print('Year\tProportion of CVEs with a CWE listed')
for year, with_count in years_with.items():
    share = float(with_count) / (years_without[year] + with_count)
    print('{}\t{:.0%}'.format(year, share))

Year	Proportion of CVEs with a CWE listed
2018	98%
2017	99%
2016	87%
2015	83%
2014	86%
2013	80%
2012	79%
2011	83%
2010	79%
2009	85%
2008	89%


In [164]:
top = 20

# Rank by accumulated score (truncated to an int for the comparison), highest first.
ranked = sorted(cwes.values(), key=lambda entry: int(entry['score']), reverse=True)
for rank, entry in enumerate(ranked[:top]):
    print('{:02.0f} {score:>10,.0f}  Avg: {average:.0f} {name}'.format(rank, **entry))
    

00  1,151,636  Avg: 8 Improper Restriction of Operations within the Bounds of a Memory Buffer
01    588,708  Avg: 6 Permissions, Privileges, and Access Controls
02    515,422  Avg: 4 Improper Neutralization of Input During Web Page Generation ('Cross-site Scripting')
03    507,678  Avg: 7 Improper Neutralization of Special Elements used in an SQL Command ('SQL Injection')
04    487,669  Avg: 6 Improper Input Validation
05    312,977  Avg: 7 Resource Management Errors
06    309,620  Avg: 4 Information Exposure
07    202,780  Avg: 8 Improper Control of Generation of Code ('Code Injection')
08    196,929  Avg: 5 Cryptographic Issues
09    195,196  Avg: 6 Improper Access Control
10    190,622  Avg: 6 Improper Limitation of a Pathname to a Restricted Directory ('Path Traversal')
11    158,302  Avg: 7 Cross-Site Request Forgery (CSRF)
12    139,428  Avg: 7 Numeric Errors
13    118,898  Avg: 7 Improper Authentication
14     68,828  Avg: 6 Credentials Management
15     55,619  Avg: 8 Use After