In [None]:
# Collect external weblinks from NVD (National Vulnerability Database)

import os
import json
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from urllib.request import Request, urlopen

# Directory that receives the scraped per-CVE JSON chunks.
save_dir = '/data/zhaohan/adv-reasoning/data/cyberkg-raw/nvd_cve_info/'
os.makedirs(save_dir, exist_ok=True)

nvd_host = 'https://nvd.nist.gov'
root = 'https://nvd.nist.gov/vuln/full-listing'
# Seconds to wait on any single request; without a timeout one stalled
# connection blocks the whole crawl indefinitely.
request_timeout = 60

monthes = ['January', 'February', 'March', 'April',
           'May', 'June', 'July', 'August', 'September',
           'October', 'November', 'December']


def nvd_absolute_url(href):
    """Turn a site-relative NVD href into an absolute URL.

    Leading and trailing slashes of ``href`` are dropped, so the result never
    ends with '/'. Plain string concatenation is used on purpose:
    ``os.path.join`` inserts the OS path separator and would produce
    backslashes on Windows.
    """
    return nvd_host + '/' + href.strip('/')


def looks_like_cve_id(text):
    """Return True when ``text`` contains 'CVE-' and its 2nd and 3rd
    dash-separated fields are numeric (e.g. 'CVE-2021-1234').

    Text with fewer than three dash-separated fields is rejected instead of
    raising IndexError.
    """
    parts = text.split('-')
    return ('CVE-' in text
            and len(parts) >= 3
            and parts[1].isnumeric()
            and parts[2].isnumeric())


def fetch_nvd_page(url):
    """Download ``url`` with a browser-like User-Agent and parse it as HTML.

    The response is closed as soon as the body has been read.
    Raises whatever ``urlopen`` raises (e.g. ``urllib.error.URLError``).
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urlopen(req, timeout=request_timeout) as resp:
        return BeautifulSoup(resp.read(), 'html.parser')


# The full-listing page links to one page per (year, month).
response = requests.get(root, timeout=request_timeout)
# Fail loudly on an HTTP error instead of parsing an error page into zero links.
response.raise_for_status()
html = BeautifulSoup(response.content, 'html.parser')

cve_urls = []
for ele in tqdm(html.find_all('a'), desc='getting cve urls in NVD'):
    if ele.text not in monthes:
        continue
    # link to year-month cve-id list, e.g. '/vuln/full-listing/2021/1'
    y_m_href = ele.get('href')
    if y_m_href is None:
        # An anchor without href cannot be followed.
        continue

    y_m_html = fetch_nvd_page(nvd_absolute_url(y_m_href))
    for ele_2 in y_m_html.find_all('a'):
        cve_href = ele_2.get('href')
        if cve_href is not None and looks_like_cve_id(ele_2.text):
            cve_urls.append(nvd_absolute_url(cve_href))
    
# Layout of every saved JSON chunk:
# {
#     <cve-id>: {
#         'nvd': <nvd webpage url> (str),
#         'external': {
#             0: {
#                 'url': <external webpage url> (str, '' when the row has no link),
#                 'types': [types of current resource]  list(str),
#             },
#             1: {
#                 'url': <external webpage url> (str),
#                 'types': [types of current resource]  list(str),
#             },
#             ...
#         }
#     },
#     ...
# }
# (json.dump turns the integer row ids into string keys.)

# A chunk is flushed to disk once it holds more than this many CVEs.
chunk_size = 2000


def extract_external_links(cve_html):
    """Collect the external-reference rows of one parsed NVD CVE page.

    Rows are the <tr> elements whose ``data-testid`` starts with
    'vuln-hyperlinks-row'; the trailing number of that attribute is the row id.

    Returns a dict ``{row_id: {'url': str, 'types': list[str]}}``. A row
    without a target="_blank" anchor, or whose anchor has no href, gets
    ``'url': ''`` instead of raising AttributeError.
    """
    links = {}
    for tr in cve_html.find_all('tr'):
        test_id = tr.get('data-testid')
        if test_id is None or not test_id.startswith('vuln-hyperlinks-row'):
            continue
        row_id = int(test_id.split('-')[-1])

        anchor = tr.find('a', attrs={'target': '_blank'})  # only one
        href = anchor.get('href') if anchor is not None else None
        links[row_id] = {
            'url': href if href is not None else '',
            'types': [span.text for span in tr.find_all('span', attrs={'class': 'badge'})],
        }
    return links


def save_nvd_chunk(info, index):
    """Write ``info`` to '<save_dir>/<index>.json' and report the path."""
    path = os.path.join(save_dir, '%d.json' % index)
    with open(path, 'w') as f:
        json.dump(info, f)
    print('saved %s' % path)


nvd_cve_info = defaultdict(dict)
counter = 0
failed_urls = []  # pages that could not be downloaded; inspect / retry afterwards
for cve_url in tqdm(cve_urls, desc='parsing cve-nvd webpages'):
    assert cve_url.startswith('https://nvd.nist.gov/vuln/detail/CVE-')
    cve_id = cve_url.split('/')[-1]

    req = Request(cve_url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        # The context manager closes the connection once the body is read.
        with urlopen(req) as resp:
            webpage = resp.read()
    except OSError as err:
        # urllib's URLError/HTTPError and socket errors all derive from OSError.
        # Skip the page rather than abort the crawl, and do not create an
        # entry, so a failed download is not mistaken for a CVE with no links.
        failed_urls.append(cve_url)
        print('failed to fetch %s: %s' % (cve_url, err))
        continue

    cve_html = BeautifulSoup(webpage, 'html.parser')
    nvd_cve_info[cve_id]['nvd'] = cve_url
    nvd_cve_info[cve_id]['external'] = extract_external_links(cve_html)

    if len(nvd_cve_info) > chunk_size:
        save_nvd_chunk(nvd_cve_info, counter)
        counter += 1
        nvd_cve_info = defaultdict(dict)

# Flush whatever is left after the last full chunk.
if len(nvd_cve_info) > 0:
    save_nvd_chunk(nvd_cve_info, counter)

if failed_urls:
    print('%d cve pages could not be fetched' % len(failed_urls))
print('Done')
            

In [3]:
# analysis
import os
import json
import validators
from collections import defaultdict
import pandas as pd

def url_domain(url):
    """Return the host part of an absolute URL, i.e. the text between '//' and
    the next '/' ('https://github.com/a/b' -> 'github.com').

    Returns '' when ``url`` has no such part.
    """
    parts = url.split('/')
    return parts[2] if len(parts) > 2 else ''


def group_urls_by_domain(urls):
    """Group ``urls`` by their exact domain (see ``url_domain``).

    Returns ``{domain: [url, ...]}`` with domains and URLs in first-seen order.
    Matching is exact on the host part, so 'securitytracker.com' and
    'www.securitytracker.com' stay separate, and a domain that merely appears
    in another URL's path or query string is not counted.
    """
    grouped = {}
    for url in urls:
        grouped.setdefault(url_domain(url), []).append(url)
    return grouped


cve_exurl_dict = defaultdict(set)   # cve-id -> external urls
exurl_cve_dict = defaultdict(set)   # external url -> cve-ids referencing it
url_types_dict = defaultdict(set)   # external url -> reported resource types
url_is_valid = {}                   # validation cache: most urls occur under many CVEs
for subdir, dirs, files in os.walk('./nvd_cve_info'):
    dirs.sort()  # fixed traversal order keeps the results reproducible
    for file in sorted(files):
        if not file.endswith('.json'):
            # Ignore stray files (checkpoints, OS metadata) that json.load would choke on.
            continue
        path = os.path.join(subdir, file)
        with open(path) as json_file:
            data = json.load(json_file)
        for cve_id, info in data.items():
            for idx, url_info in info.get('external', {}).items():
                url = url_info['url']
                if url not in url_is_valid:
                    # validators.url returns a falsy failure object for bad urls
                    url_is_valid[url] = bool(validators.url(url))
                if not url_is_valid[url]:
                    continue
                cve_exurl_dict[cve_id].add(url)
                exurl_cve_dict[url].add(cve_id)
                url_types_dict[url] |= set(url_info['types'])

count = sum(len(urls) for urls in cve_exurl_dict.values())
print('CVE num: %d, cumulative url num %d, unique url num %d\n'
      % (len(cve_exurl_dict), count, len(url_types_dict)))

all_types = set()
for url, types in url_types_dict.items():
    all_types |= set(types)
print('All reported external link types: %s\n' % str(list(all_types)))

# count sources of external urls
# (every key of url_types_dict already passed validation above)
urls_by_domain = group_urls_by_domain(url_types_dict.keys())
domain_freq = {domain: len(urls) for domain, urls in urls_by_domain.items()}

topk_domain = []
for i, k in enumerate(sorted(domain_freq.keys(), key=domain_freq.get, reverse=True)[:50]):
    topk_domain.append(k)
    print(i+1, k, domain_freq[k])

# source specific url analysis
save_path = '/data/zhaohan/adv-reasoning/data/cyberkg-raw/cve_ex_links/'

if topk_domain:
    os.makedirs(save_path, exist_ok=True)
for i, domain in enumerate(topk_domain):
    # Only urls whose host is exactly `domain`, so the number of rows equals
    # the domain_freq count embedded in the file name.
    domain_urls = urls_by_domain[domain]
    saved_info = {
        # sorted() gives a stable order; joining a raw set varies between runs
        'cve-ids': [','.join(sorted(exurl_cve_dict[url])) for url in domain_urls],
        'ex_url': list(domain_urls),
        'types': [','.join(sorted(url_types_dict[url])) for url in domain_urls],
    }

    df = pd.DataFrame.from_dict(saved_info)
    df.to_csv(os.path.join(save_path, 'top%d.%s.%d.csv' % (i+1, domain, domain_freq[domain])), sep='|')
    

CVE num: 150367, cumulative url num 611731, unique url num 361650

All reported external link types: ['Patch', 'Mailing List', 'Product', 'Third Party Advisory', 'VDB Entry', 'Tool Signature', 'Mitigation', 'Broken Link', 'US Government Resource', 'Exploit', 'Not Applicable', 'Press/Media Coverage', 'Issue Tracking', 'Release Notes', 'Technical Description', 'Vendor Advisory', 'Permissions Required']

1 www.securityfocus.com 60510
2 exchange.xforce.ibmcloud.com 33381
3 github.com 19952
4 www.vupen.com 11667
5 www.exploit-db.com 11218
6 oval.cisecurity.org 10213
7 www.securitytracker.com 9769
8 www.openwall.com 6561
9 bugzilla.redhat.com 6296
10 securityreason.com 5233
11 marc.info 5164
12 securitytracker.com 4727
13 lists.fedoraproject.org 4519
14 packetstormsecurity.com 4496
15 www.debian.org 4309
16 lists.opensuse.org 4001
17 tools.cisco.com 3988
18 portal.msrc.microsoft.com 3885
19 seclists.org 3501
20 lists.apache.org 3228
21 www.redhat.com 3138
22 bugzilla.mozilla.org 3109
23 www.