##### This notebook contains the crawling code that collects CWE information from https://cwe.mitre.org

In [1]:
# NOTE: extend this file to crawl more CWE info if needed; so far we only need mitigations

## Collect weblink for all CWEs

We collect the URLs of the CWEs that are related to CVEs from https://www.cvedetails.com/cwe-definitions.php

In [12]:
import os
import json
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict

# Build a {cwe-id: cwe.mitre.org definition url} mapping from the CWE listing
# on cvedetails.com and store it as cwe_urls.json.
url_save_dir = './'  # directory that receives cwe_urls.json ({cwe-id: webpage url})
request_timeout = 30  # seconds; keeps a stalled connection from hanging the cell

root = 'https://www.cvedetails.com/cwe-definitions.php'
root_resp = requests.get(root, timeout=request_timeout)
# Fail loudly on an HTTP error: parsing an error page would silently yield zero CWEs.
root_resp.raise_for_status()
soup = BeautifulSoup(root_resp.content, 'html.parser')

# Collect the paginated listing pages linked from the root page.
page_urls = []
for ele in tqdm(soup.find_all('a'), desc="crawling page num"):
    href = ele.get('href')
    if not href:
        continue
    if 'cwe-definitions' in href and 'cwelist.html' in href:
        page_url = "https://www.cvedetails.com" + href
        if page_url not in page_urls:  # never fetch the same listing page twice
            page_urls.append(page_url)

cwe_urls = {}  # 'cwe-id': url
for url in tqdm(page_urls, desc="collecting CVE urls"):
    page_resp = requests.get(url, timeout=request_timeout)
    page_resp.raise_for_status()
    html = BeautifulSoup(page_resp.content, 'html.parser')
    for ele in html.find_all('a'):
        if "CWE definition" in (ele.get('title') or ''):
            # The id is interpolated into URLs and element ids later on, so
            # surrounding whitespace from the anchor text must not leak through.
            cwe_id = ele.text.strip()
            if cwe_id:
                cwe_urls[cwe_id] = 'https://cwe.mitre.org/data/definitions/%s.html' % cwe_id

with open(os.path.join(url_save_dir, 'cwe_urls.json'), 'w') as f:
    json.dump(cwe_urls, f)
print('total CWE num: %d' % len(cwe_urls))


crawling page num: 100%|██████████| 160/160 [00:00<00:00, 621954.25it/s]
collecting CVE urls: 100%|██████████| 14/14 [00:04<00:00,  3.04it/s]

total CWE num: 668





## Crawl CWE details

In [13]:
import os
import json
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict

# Crawl every CWE definition page listed in cwe_urls.json and store the
# per-phase mitigation text as cwe_detail.json.

# Output directory. Set the CWE_SAVE_DIR environment variable to override it;
# the default is the path this notebook was originally run with.
save_dir = os.environ.get('CWE_SAVE_DIR', '/data/zhaohan/adv-reasoning/data/cyberkg-raw/cwe')
url_load_dir = './'
request_timeout = 30  # seconds; keeps a stalled connection from hanging the crawl

# Create the output directory before crawling so that a bad path fails
# immediately instead of after several minutes of requests.
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, 'cwe_detail.json')

with open(os.path.join(url_load_dir, 'cwe_urls.json'), 'r') as f:
    cwe_urls = json.load(f)


def parse_phase_names(heading):
    """Split a 'Phase: A' / 'Phases: A; B' heading into its phase names.

    Args:
        heading: text of a subheading paragraph in a mitigation block.

    Returns:
        The list of stripped phase names, or None when the heading does not
        start with 'Phase:' or 'Phases:' (after stripping whitespace).
    """
    heading = heading.strip()
    for prefix in ('Phases:', 'Phase:'):
        if heading.startswith(prefix):
            return [name.strip() for name in heading[len(prefix):].split(';')]
    return None


# cwe_detail = {'cwe-id':  {
#                 'mitigation': {
#                         phase name (str): str
#                     }
#                 }
#            }
cwe_detail = defaultdict(dict)
failed_cwe_ids = []      # pages that could not be downloaded
unlabeled_cwe_ids = []   # pages with a subheaded block that names no phase

# One session reuses the connection across all requests to the same host.
with requests.Session() as session:
    for cwe_id, cwe_url in tqdm(cwe_urls.items(), disable=False):
        try:
            resp = session.get(cwe_url, timeout=request_timeout)
            resp.raise_for_status()
        except requests.RequestException:
            # One unreachable page must not discard everything crawled so far.
            failed_cwe_ids.append(cwe_id)
            continue
        cwe_html = BeautifulSoup(resp.content, 'html.parser')

        #--------------- crawl mitigation ---------------#
        miti_div = cwe_html.find('div', attrs={'name': 'oc_%s_Potential_Mitigations' % cwe_id,
                                               'id': 'oc_%s_Potential_Mitigations' % cwe_id,
                                               'class': 'expandblock'})
        if miti_div is None:  # this CWE page has no mitigation section
            continue

        phase_text_dict = defaultdict(list)
        for phase_td in miti_div.find_all('td', attrs={'valign': 'top'}):  # each block of phase
            headings = phase_td.find_all('p', attrs={'class': 'subheading'})
            if not headings:  # desc before concrete mitigation in each phase
                continue

            # Use the first subheading that names a phase. Validating with an
            # explicit check (rather than `assert`) keeps one unexpected
            # heading from aborting the whole crawl.
            phase_names = None
            for heading_p in headings:
                phase_names = parse_phase_names(heading_p.text)
                if phase_names is not None:
                    break
            if phase_names is None:
                unlabeled_cwe_ids.append(cwe_id)
                continue

            # NOTE(review): find_all is recursive; if 'indent' divs can be nested
            # on these pages, inner text would be collected twice - TODO confirm.
            text = '\n'.join(
                div.text for div in phase_td.find_all('div', attrs={'class': 'indent'})
            ).strip()

            # A block listing several phases contributes its text to each of them.
            for name in phase_names:
                phase_text_dict[name].append(text)

        cwe_detail[cwe_id]['mitigation'] = {
            name: ' \n '.join(text_list) for name, text_list in phase_text_dict.items()
        }

with open(save_path, 'w') as f:
    json.dump(cwe_detail, f)
print('saved at %s' % save_path)

# Report anything that was skipped so gaps in the output are visible.
if failed_cwe_ids:
    print('failed to download %d CWE pages: %s' % (len(failed_cwe_ids), ', '.join(failed_cwe_ids)))
if unlabeled_cwe_ids:
    print('mitigation blocks without a phase heading in CWEs: %s'
          % ', '.join(sorted(set(unlabeled_cwe_ids))))


100%|██████████| 668/668 [02:32<00:00,  4.37it/s]

saved at /data/zhaohan/adv-reasoning/data/cyberkg-raw/cwe/cwe_detail.json



