##### This file contains crawling code to get CVE info (including basic CWE info related to each CVE) from https://www.cvedetails.com

## Collect urls for each CVE
- all years

In [1]:
""" 
Steps to crawl data from 'https://www.cvedetails.com/vulnerabilities-by-types.php'
and construct our cyber KG.

Step1 : Crawl the per-year vulnerability-list links from the page above
        - 'vulner-link'  (str), 

Step2 : In the 'vulner-link', crawl A LIST of
        - 'page-urls'    (str)
        
Step3:  Based on step2 'page-urls', crawl A DICT of
        - 'cve-id':'cve-links'    (str: str),
        
Step4 : Based on step3, for each 'cve-link', we crawl following information:
        - 'cve-desc'     (str)   # Descriptions
        - 'pub-time'     (str)   # Publish time
        - 'score'        (str)   # CVSS score
        - 'vulner-types' (str)   # Vulnerability type
        - 'cwe-link'     (str)
        - 'product-info' (List[List[str]]) 
                          # List of ('product-type', 'vendor', 'product-name', 'version')
        
Step5: Based on step4, for the 'cwe-link', find
        - 'cwe-id'       (str),
        - 'cwe-desc'     (str),  # CWE descriptions
        
Step6: Based on step3,4,5, we construct a '|'-separated csv where each line contains the following info
        - 0th col: row index
        - 1st col: 'cve-id'       (str)
        - 2nd col: 'cve-url'      (str)
        - 3rd col: 'cve-desc'     (str)
        - 4th col: 'pub-time'     (str)
        - 5th col: 'score'        (str)
        - 6th col: 'vulner-types' (str,str,str)
        - 7th col: 'cwe-id'       (str)
        - 8th col: 'cwe-url'      (str)
        - 9th col: 'cwe-def'      (str)  # replace textual description with CWE definition
        - 10th col: 'cwe-rel'     (str,str,str;str,str,str;str,str,str)  # relevant CWE info
                                  combination of 'relationships','cwe-id','cwe-name'
        - 11th col: 'pd-info', List of ('product-type', 'vendor', 'product-name', 'version'), (List[List[str]])
                    but we save into string format: 'product-type,vendor,product-name,version;...'
                    where ',' splits elements, ';' splits lines, no ',' or ';' at the beginning
                    or end.
        
"""
import os
import json
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from collections import OrderedDict, defaultdict

# Entry page: the per-year vulnerability lists are linked from here.
# (An earlier version of this cell started from
#  'https://www.cvedetails.com/top-50-vendors.php' instead.)
root = 'https://www.cvedetails.com/vulnerabilities-by-types.php'
url_save_dir = './cve_url/'  # save {cve: webpage url}
os.makedirs(url_save_dir, exist_ok=True)

response = requests.get(root)
soup = BeautifulSoup(response.content, 'html.parser')

# Step 1: one listing url per year, taken from the anchors on the entry page.
pages_to_cve = [
    "https://www.cvedetails.com" + anchor.get('href')
    for anchor in tqdm(soup.find_all('a'), desc=f"crawling {root}")
    if anchor.has_attr('href')
    and anchor.get('href').startswith('/vulnerability-list/year-')
    and anchor.get('href').endswith('/vulnerabilities.html')
]

# Step 2: every yearly listing is paginated; gather the url of each page.
page_urls = []
page_num = set()  # page labels seen so far (collected, not used further below)
for year_page in tqdm(pages_to_cve, desc="crawling 'page' urls"):
    listing = BeautifulSoup(requests.get(year_page).content, 'html.parser')
    for anchor in listing.find_all('a'):
        if not anchor.has_attr('title'):
            continue
        if "Go to page" not in anchor.get('title'):
            continue
        page_urls.append("https://www.cvedetails.com" + anchor.get('href'))
        page_num.add(anchor.text)

# print(len(page_urls))

# Step 3: visit each listing page and record the detail url of every CVE on it.
all_urls = OrderedDict()  # 'cve-id': url
for listing_url in tqdm(page_urls, desc="collecting CVEs with urls"):
    listing = BeautifulSoup(requests.get(listing_url).content, 'html.parser')
    for anchor in listing.find_all('a'):
        if not anchor.has_attr('title'):
            continue
        if "security vulnerability details" not in anchor.get('title'):
            continue
        cve_id = anchor.text
        if cve_id in all_urls:  # exactly one url for each cve-id: first one wins
            continue
        all_urls[cve_id] = "https://www.cvedetails.com" + anchor.get('href')

# Dump the complete {cve-id: url} mapping.
all_urls_path = os.path.join(url_save_dir, 'all_urls.json')  # TODO: save-path move into config
with open(all_urls_path, 'w') as f:
    json.dump(all_urls, f)
    print('total CVE num: %d' % len(all_urls))

# Split the mapping by the year embedded in the id ('CVE-<year>-<number>').
url_by_year = defaultdict(dict)
for cve_id, cve_url in tqdm(all_urls.items()):
    url_by_year[int(cve_id.split('-')[1])][cve_id] = cve_url

# One json file per year, written in ascending year order.
for y in sorted(url_by_year):
    year_path = os.path.join(url_save_dir, str(y)+'.json')  # TODO: save-path move into config
    with open(year_path, 'w') as f:
        json.dump(url_by_year[y], f)
        print('%d CVE num: %d' % (y, len(url_by_year[y])))
print('Done')

# NOTE: run this cell in jupyter notebook

crawling https://www.cvedetails.com/vulnerabilities-by-types.php: 100%|██████████| 399/399 [00:00<00:00, 256400.69it/s]
crawling 'page' urls: 100%|██████████| 24/24 [00:12<00:00,  1.89it/s]
collecting CVEs with urls: 100%|██████████| 3413/3413 [34:13<00:00,  1.66it/s] 
 48%|████▊     | 81744/169990 [00:00<00:00, 817401.80it/s]

total CVE num: 169990


100%|██████████| 169990/169990 [00:00<00:00, 798636.29it/s]


1999 CVE num: 863
2000 CVE num: 1235
2001 CVE num: 1538
2002 CVE num: 2356
2003 CVE num: 1500
2004 CVE num: 2645
2005 CVE num: 4626
2006 CVE num: 6993
2007 CVE num: 6459
2008 CVE num: 7001
2009 CVE num: 4905
2010 CVE num: 5051
2011 CVE num: 4605
2012 CVE num: 5426
2013 CVE num: 6146
2014 CVE num: 8299
2015 CVE num: 7935
2016 CVE num: 9228
2017 CVE num: 14451
2018 CVE num: 15681
2019 CVE num: 15429
2020 CVE num: 17946
2021 CVE num: 18120
2022 CVE num: 1552
Done


## Crawl details from each CVE webpage
- specify range of crawling years
- run this part backup

In [None]:
import os
import json
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from collections import OrderedDict

years = range(2010, 2022)  # adjustable; range() excludes the stop value (2010..2021 here)
url_load_dir = './cve_url/' # load {cve: webpage url}
save_dir = '/data/zhaohan/adv-reasoning/data/cyberkg-raw/cve'  # machine-specific output root
chunk_size = 5000  # maximum number of CVE rows written to one csv file

# Column names of the output csv, in output order.
CVE_COLUMNS = ('cve-id', 'cve-url', 'cve-desc', 'pub-time', 'score', 'vulner-types',
               'cwe-id', 'cwe-url', 'cwe-def', 'cwe-rel', 'pd-info')


def _empty_cve_info():
    """Return a fresh {column name: []} buffer for one csv chunk."""
    return {col: [] for col in CVE_COLUMNS}


def _escape(text):
    """Strip `text` and replace the ',' / ';' separators used inside 'pd-info'."""
    return text.strip().replace(',', '<comma>').replace(';', '<semicolon>')


def _related_cwe_rows(rel_table, row_class):
    """Collect 'nature,cwe-id,cwe-name' strings from the rows of one class.

    rel_table: the parsed 'Relationships' element of a CWE definition page.
    row_class: value of the <tr> class attribute to select.
    Returns a set of strings; commas inside the name are written as '<comma>'.
    Header rows and rows with fewer than four cells are skipped.
    """
    found = set()
    for detail_row in rel_table.find_all('tr', attrs={'class': row_class}):
        row_text = detail_row.text
        if 'Nature' in row_text and 'Type' in row_text \
                and 'ID' in row_text and 'Name' in row_text:
            continue  # table header
        texts = [col.text for col in detail_row.find_all('td')]
        if len(texts) < 4:
            continue  # unexpected layout: skip instead of raising IndexError
        found.add(','.join([texts[0], texts[2], texts[3].replace(',', '<comma>')]))
    return found


def _crawl_cwe(cwe_url):
    """Fetch a CWE page and return (cwe_def, cwe_rel).

    cwe_def is the text of the page's <h1>; cwe_rel is a ';'-joined list of
    'nature,cwe-id,cwe-name' entries read from the linked CWE definition page.
    Either value is the string 'None' when it cannot be found.
    """
    cwe_def, cwe_rel = 'None', 'None'

    cwe_html = BeautifulSoup(requests.get(cwe_url).content, 'html.parser')
    cwe_table = cwe_html.find('table', attrs={'class':'details'})
    if cwe_table is None:
        return cwe_def, cwe_rel

    # find cwe name
    heading = cwe_html.find('h1')
    if heading is not None:
        cwe_def = heading.text

    for cwe_row in cwe_table.find_all('tr'):
        if 'CWE Definition' not in cwe_row.text:
            continue
        link = cwe_row.find('a')
        if link is None or link.get('href') is None:
            break
        cwe_detail_url = link.get('href')
        cwe_detail_html = BeautifulSoup(requests.get(cwe_detail_url).content, 'html.parser')

        # find cwe relations
        rel_table = cwe_detail_html.find('div', attrs={'id': 'Relationships'})
        if rel_table is None:
            print('%s has no CWE relation table' % cwe_detail_url)
            break

        rel_cwe_details = (_related_cwe_rows(rel_table, 'primary Weakness')
                           | _related_cwe_rows(rel_table, 'primary Category'))
        if rel_cwe_details:
            # sorted: set iteration order differs between runs, the csv should not
            cwe_rel = ';'.join(sorted(rel_cwe_details))
        break

    return cwe_def, cwe_rel


for year in years:
    # Load this year's {cve-id: url} mapping and crawl it in reverse file order.
    with open(os.path.join(url_load_dir, str(year)+'.json'), 'r') as f:
        cve_urls = OrderedDict(reversed(list(json.load(f).items())))

    cve_info = _empty_cve_info()
    counter = 1  # index of the next csv chunk for this year
    for it, (cve_id, url) in enumerate(
            tqdm(cve_urls.items(), desc="crawling CVE-%d detailed info" % year)):
        # Every column receives exactly one value per CVE so that rows stay aligned.
        cve_info['cve-id'].append(cve_id)
        cve_info['cve-url'].append(url)

        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # description ('|' is the csv separator, so it is replaced in the text)
        ele = soup.find('div', attrs={'class':'cvedetailssummary'})
        if ele is None:
            cve_info['cve-desc'].append('None')
        else:
            cve_desc = ele.get_text().split('\n\n')[0].strip()
            cve_info['cve-desc'].append(cve_desc.replace('|', '<OR-OPERATOR>'))

        # publish time
        ele = soup.find('span', attrs={'class':'datenote'})
        try:
            pub_time = ele.text.split('\t')[1].split(':')[1].strip()
        except (AttributeError, IndexError):
            # element missing, or its text does not have the expected layout
            pub_time = 'None'
        cve_info['pub-time'].append(pub_time)

        # CVSS score
        ele = soup.find('div', attrs={'class':'cvssbox'})
        if ele is None:
            cve_info['score'].append('0.0')
            print('\n%s has no score' % (url))
        else:
            cve_info['score'].append(ele.text)

        # vulnerability type(s)
        table = soup.find('table', attrs={'id':'cvssscorestable'})
        vulner_types = 'None'
        if table is None:
            print('\n%s has no attribute table' % url)
        else:
            for row in table.find_all('tr'):
                if 'Vulnerability Type' in row.text:
                    # str.replace returns a new string, so build the escaped list directly
                    vul_types = [span.text.strip().replace(',', '<comma>')
                                 for span in row.find_all('span')]
                    if vul_types:
                        vulner_types = ','.join(vul_types)
                    break
        cve_info['vulner-types'].append(vulner_types)

        # cwe info (stays 'None' when the table, the row or the link is missing)
        cwe_id, cwe_url, cwe_def, cwe_rel = 'None', 'None', 'None', 'None'
        if table is not None:
            for row in table.find_all('tr'):
                if 'CWE ID' in row.text:
                    link = row.find('a')
                    if link is None:
                        print('\n%s not link to any CWE' % url)
                    else:
                        cwe_url = 'https:' + link.get('href')
                        cwe_id = link.text
                        cwe_def, cwe_rel = _crawl_cwe(cwe_url)
                    break
        cve_info['cwe-id'].append(cwe_id)
        cve_info['cwe-url'].append(cwe_url)
        cve_info['cwe-def'].append(cwe_def)
        cve_info['cwe-rel'].append(cwe_rel)

        # product info: 'product-type,vendor,product-name,version' entries joined by ';'
        table = soup.find('table', attrs={'id':'vulnprodstable'})
        try:
            pd_rows = []
            for row in table.find_all('tr'):
                if '#' in row.text and 'Vendor' in row.text and 'Update' in row.text:
                    continue  # table header

                cols = row.find_all('td')
                pd_type = _escape(cols[1].text)
                pd_vendor = _escape(cols[2].text)
                pd_name = _escape(cols[3].text)
                pd_ver = _escape(cols[4].text)

                if pd_ver == '-':
                    pd_ver = ''
                pd_rows.append(','.join([pd_type, pd_vendor, pd_name, pd_ver]))
            pd_info = ';'.join(pd_rows)
        except (AttributeError, IndexError):
            # table missing (None has no find_all) or a row with too few cells
            print('\n%s pd-info is None' % url)
            pd_info = 'None'
        cve_info['pd-info'].append(pd_info)

        # truncate & save into csv file
        if len(cve_info['cve-url']) == chunk_size or it == len(cve_urls)-1:
            # check alignment on the data that is about to be written
            for k, v in cve_info.items():
                assert len(v)==len(cve_info['cve-id']), '%s has len %d' % (k, len(v))

            df = pd.DataFrame.from_dict(cve_info)

            save_path = os.path.join(save_dir, str(year))
            os.makedirs(save_path, exist_ok=True)
            df.to_csv(os.path.join(save_path, f'cve_{counter}.csv'), sep='|')

            cve_info = _empty_cve_info()
            counter += 1
print("Done")

# NOTE: run this cell in terminal

crawling CVE-2010 detailed info:   0%|          | 19/5050 [00:21<1:23:58,  1.00s/it]


https://www.cvedetails.com/cve/CVE-2010-0206/ pd-info is None


crawling CVE-2010 detailed info:   0%|          | 20/5050 [00:22<1:15:25,  1.11it/s]


https://www.cvedetails.com/cve/CVE-2010-0207/ pd-info is None


crawling CVE-2010 detailed info:   0%|          | 23/5050 [00:25<1:22:17,  1.02it/s]


https://www.cvedetails.com/cve/CVE-2010-0747/ pd-info is None


crawling CVE-2010 detailed info:   1%|          | 63/5050 [01:08<1:59:35,  1.44s/it]

### testing

In [1]:
import os
import json
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from collections import OrderedDict

# Manual check of the product-table parsing on a single CVE page.
cve_info = {'cve-id':      [],
            'cve-url':     [], 
            'cve-desc':    [],
            'pub-time':    [],
            'score':       [],
            'vulner-types':[],
            'cwe-id':      [],
            'cwe-url':     [],
            'cwe-def':     [],
            'cwe-rel':     [],
            'pd-info':     [],
           }

# Keep the url in a variable: the error message below needs it, and this cell
# must not depend on a `url` left over from another cell.
url = "https://www.cvedetails.com/cve/CVE-2018-11450/"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
    
table = soup.find('table', attrs={'id':'vulnprodstable'})

try:
    pd_info = []
    for row in table.find_all('tr'):
        if '#' in row.text and 'Vendor' in row.text and 'Update' in row.text:
            continue  # table header

        cols = row.find_all('td')
        pd_type = cols[1].text.strip()
        pd_vendor = cols[2].text.strip()
        pd_name = cols[3].text.strip()
        pd_ver = cols[4].text.strip()

        if pd_ver == '-':
            pd_ver = ''
        pd_info.append(','.join([pd_type, pd_vendor, pd_name, pd_ver]))
    pd_info = ';'.join(pd_info)

    cve_info['pd-info'].append(pd_info)
except (AttributeError, IndexError):
    # table missing (None has no find_all) or a row with too few cells
    print('\n%s pd-info is None' % url)
    cve_info['pd-info'].append('None')

cve_info

{'cve-id': [],
 'cve-url': [],
 'cve-desc': [],
 'pub-time': [],
 'score': [],
 'vulner-types': [],
 'cwe-id': [],
 'cwe-url': [],
 'cwe-def': [],
 'cwe-rel': [],
 'pd-info': ['Application,Siemens,Teamcenter Product Lifecycle Management,9.1.2.5']}

In [1]:
# Check that a set of strings can be joined with ';'.
# Iteration order of a set of strings is not guaranteed to be the same between
# runs, so the elements are sorted to make the displayed result reproducible.
a = {'123', '456'}
joined = ';'.join(sorted(a))
joined

'123;456'