In [1]:
#IMPORTING LIBRARIES
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import json
import ast
import os  

In [10]:
#CREATING A FOLDER TO STORE THE DATA
# makedirs(exist_ok=True) does nothing when the directory is already present
# and avoids the check-then-create race of exists() followed by mkdir().
os.makedirs('db', exist_ok=True)

In [4]:
#KEYWORDS TO ENHANCE THE SEARCH RESULTS
# Multi-word terms are written with '+' in place of the space.
keywords = [
    'scada',
    'plc',
    'hmi',
    'rtu',
    'dcs',
    'ics',
    'Industrial+Automation',
    'Critical+Infrastructure',
]

In [5]:
def cve_retriever(url, timeout=30):
    """Scrape one search-results page into a list of CVE summaries.

    Parameters
    ----------
    url : str
        Address of the results page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of dict
        One dict per table row with the keys 'ID', 'Description',
        'Published' and 'CVSS'. Empty when the page contains no <tbody>.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    # A page without a results table yields no rows rather than an IndexError.
    tbody = soup.find('tbody')
    if tbody is None:
        return []

    cves = []
    for row in tbody.find_all('tr'):
        cells = row.find_all('td')
        summary_cell = cells[0]

        # Rows without a score link get the placeholder 'NA NA' so the value
        # keeps the "<score> <severity>" shape of a scored row.
        score_link = cells[1].find('a')
        cvss_text = score_link.text if score_link is not None else 'NA NA'

        cves.append({
            'ID': row.find_all('th')[0].text,
            'Description': summary_cell.find_all('p')[0].text.replace('\n', ''),
            'Published': summary_cell.find_all('span')[0].text,
            'CVSS': cvss_text,
        })
    return cves

In [6]:
#RETRIEVING THE DATA
# One overview search per keyword; `res` ends up holding one list of CVE
# dicts per keyword, in the same order as `keywords`.
res = []
url_head = "https://nvd.nist.gov/vuln/search/results?form_type=Basic&results_type=overview&query="
url_tail = "&search_type=all&isCpeNameSearch=false"
for keyword in keywords:
    url = url_head + keyword + url_tail
    res.append(cve_retriever(url))

In [7]:
#REFORMING DATA
# Collapse the per-keyword result lists into one flat list of CVE dicts.
flatten_cves = [item for sublist in res for item in sublist]

# Keep only the first occurrence of every CVE ID and split the scraped
# "<score> <severity>" text into two fields.
# A new dict is built for each record instead of editing the scraped one in
# place, so this cell can be re-run without tripping over an already-split
# 'CVSS' value.
unique_vulnerabilities = []
seen_cve_ids = set()
for vulnerability in flatten_cves:
    cve_id = vulnerability['ID']
    if cve_id in seen_cve_ids:
        continue
    seen_cve_ids.add(cve_id)

    # Missing parts fall back to 'NA' rather than raising IndexError.
    parts = vulnerability['CVSS'].split()
    score = parts[0] if parts else 'NA'
    severity = parts[1] if len(parts) > 1 else 'NA'

    unique_vulnerabilities.append({**vulnerability, 'CVSS': score, 'Severity': severity})

In [18]:
#SAVING DATA FOR FUTURE USE
# Write the de-duplicated records to db/short_cves.csv without the index column.
res_df = pd.DataFrame(data=unique_vulnerabilities)
res_df.to_csv(path_or_buf='db/short_cves.csv', index=False)

In [13]:
#EXTENDING THE DATA
def _parse_reference_links(soup):
    """Read the rows of the 'vulnHyperlinksPanel' table as {'link', 'badges'} dicts."""
    panel = soup.find('div', id='vulnHyperlinksPanel')
    body = panel.find('tbody') if panel is not None else None
    if body is None:
        # Page has no reference table: nothing to report.
        return []

    links = []
    for row in body.find_all('tr'):
        anchor = row.find('a')
        if anchor is None:
            # A row without an anchor carries no link to record.
            continue
        links.append({
            'link': anchor.text,
            'badges': [badge.text for badge in row.find_all('span', class_='badge')],
        })
    return links


def _parse_cwes(soup):
    """Read the rows of the 'vulnTechnicalDetailsDiv' table as {'ID', 'href', 'name'} dicts."""
    section = soup.find('div', id='vulnTechnicalDetailsDiv')
    body = section.find('tbody') if section is not None else None
    if body is None:
        return []

    cwes = []
    for row in body.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 2:
            # Both the ID cell and the name cell are needed.
            continue
        anchor = cells[0].find('a')
        cwes.append({
            'ID': cells[0].text.replace('\n', ''),
            # 'NA' marks entries whose ID cell has no hyperlink.
            'href': anchor.get('href', 'NA') if anchor is not None else 'NA',
            'name': cells[1].text,
        })
    return cwes


def _parse_cpes(soup):
    """Decode the JSON held by the 'cveTreeJsonDataHidden' input into {'vendor', 'product', 'version'} dicts."""
    holder = soup.find('input', id='cveTreeJsonDataHidden')
    if holder is None or not holder.get('value'):
        return []

    configurations = json.loads(holder['value'])
    if not configurations:
        return []
    containers = configurations[0].get('containers') or []
    if not containers:
        return []

    # NOTE(review): only the first container of the first configuration is
    # read; any further entries are ignored -- confirm this is intended.
    cpes = []
    for cpe in containers[0].get('cpes') or []:
        fields = cpe['cpe23Uri'].split(':')
        cpes.append({
            'vendor': fields[3],
            'product': fields[4],
            # Prefer the range description; fall back to the version field
            # of the CPE string when it is empty or absent.
            'version': cpe.get('rangeDescription') or fields[5],
        })
    return cpes


def vuln_research(id, timeout=30):
    """Scrape the detail page of one CVE.

    Parameters
    ----------
    id : str
        CVE identifier appended to the detail-page URL. The name shadows the
        built-in ``id`` inside this function; it is kept for backward
        compatibility with existing callers.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        {'id': <id>, 'links': [...], 'cwes': [...], 'cpes': [...]}. A section
        that is missing from the page yields an empty list.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    url = "https://nvd.nist.gov/vuln/detail/"+id
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    return {
        'id': id,
        'links': _parse_reference_links(soup),
        'cwes': _parse_cwes(soup),
        'cpes': _parse_cpes(soup),
    }

In [14]:
# Scrape the detail page of every de-duplicated CVE, keeping the list order.
extended_vulnerabilities = []

for i, vuln in enumerate(unique_vulnerabilities):
    print(i)  # progress indicator: position of the CVE being fetched
    # Named cve_id rather than id so the built-in id() is not overwritten
    # for the rest of the notebook.
    cve_id = vuln['ID']
    extended_vulnerabilities.append(vuln_research(cve_id))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157


In [17]:
#SAVING DATA FOR FUTURE USE
# Write the scraped detail records to db/extended_cves.csv without the index column.
extended_vulnerabilities_df = pd.DataFrame(data=extended_vulnerabilities)
extended_vulnerabilities_df.to_csv(path_or_buf='db/extended_cves.csv', index=False)

In [19]:
#MERGING THE DATA
# Match each record with its scraped details by CVE ID rather than by list
# position, so lists that are out of sync raise a KeyError instead of
# silently attaching the wrong details.
details_by_id = {details['id']: details for details in extended_vulnerabilities}
for vuln in unique_vulnerabilities:
    details = details_by_id[vuln['ID']]
    vuln['links'] = details['links']
    vuln['cwes'] = details['cwes']
    vuln['cpes'] = details['cpes']

In [22]:
#SAVING DATA FOR FUTURE USE
# Write the merged records to db/final_cves.csv without the index column.
final_vulnerabilities_df = pd.DataFrame(data=unique_vulnerabilities)
final_vulnerabilities_df.to_csv(path_or_buf='db/final_cves.csv', index=False)

In [97]:
#REFORMING DATA FOR STATISTICS
# Reload the merged table from disk and parse the 'Published' column into datetimes.
cves = pd.read_csv('db/final_cves.csv').assign(
    Published=lambda frame: pd.to_datetime(frame['Published'])
)

  cves['Published'] = pd.to_datetime(cves['Published'])


In [50]:
#LINKS
# The CSV stores each list of link dicts as its Python repr; literal_eval
# turns the text back into a list of dicts.
# The column is deliberately not dropped: dropping it in place made a second
# run of this cell fail with KeyError, and the merge step assigns
# cves['links'] again anyway.
links = [ast.literal_eval(link) for link in cves['links']]

In [98]:
def serialize_links(cve):
    """Render the reference links of one CVE as display text.

    Parameters
    ----------
    cve : list of dict
        Link records, each with a 'link' string and a 'badges' list of strings.

    Returns
    -------
    str
        One "<link> \\nBadges : <badges>" entry per link, each followed by a
        newline. Badges are separated by ", " so adjacent labels do not run
        together. Empty string for an empty list.
    """
    cve_links = [
        link['link'] + " \nBadges : " + ", ".join(link['badges'])
        for link in cve
    ]
    return "".join(entry + '\n' for entry in cve_links)

# One serialized text block per CVE, in the same order as `links`.
cves_links = [serialize_links(cve) for cve in links]

In [99]:
#CWES
# Each cell holds the Python repr of a list of dicts; parse it back into objects.
structured_cwes = [ast.literal_eval(raw_cwes) for raw_cwes in cves['cwes']]

cwes = structured_cwes

In [100]:
#CPES
# Each cell holds the Python repr of a list of dicts; parse it back into objects.
cpes = list(map(ast.literal_eval, cves['cpes']))

In [101]:
def vendors(cve):
    """Return the distinct vendors named in one CVE's CPE records.

    Parameters
    ----------
    cve : list of dict
        CPE records, each with a 'vendor' key.

    Returns
    -------
    list of str
        Each vendor once, in order of first appearance. The fixed order keeps
        the output identical from one run to the next.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(cpe['vendor'] for cpe in cve))

In [102]:
def products(cve):
    """Return the distinct "<product>, version : <version>" labels of one CVE.

    Parameters
    ----------
    cve : list of dict
        CPE records, each with 'product' and 'version' string keys.

    Returns
    -------
    list of str
        Each label once, in order of first appearance. The fixed order keeps
        the output identical from one run to the next.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(
        cpe['product'] + ', version : ' + cpe['version'] for cpe in cve
    ))

In [103]:
#VENDORS
# One vendor list per CVE, aligned with `cpes`.
cves_vendors = [vendors(cve) for cve in cpes]

#PRODUCTS
# One product-label list per CVE, aligned with `cpes`.
cves_products = [products(cve) for cve in cpes]

In [104]:
#MERGING THE DATA
# Attach the derived columns and remove the raw 'cpes' column.
# errors='ignore' keeps the cell re-runnable: once 'cpes' is gone, a second
# run no longer raises KeyError.
cves = (
    cves
    .assign(links=cves_links, vendors=cves_vendors, products=cves_products)
    .drop(columns='cpes', errors='ignore')
)
cves.to_csv('db/cves.csv', index=False)

In [105]:
#SAVING FINAL DATA
# Write the final table to db/cves.csv without the index column.
cves.to_csv(path_or_buf='db/cves.csv', index=False)

In [106]:
# Show the first rows of the final table as a quick visual check.
cves.head()

Unnamed: 0,ID,Description,Published,CVSS,Severity,links,cwes,vendors,products
0,CVE-2023-0956,External input could be used on TEL-STER TelWi...,2023-08-03 15:15:10-04:00,7.5,HIGH,https://cert.pl/posts/2023/07/CVE-2023-0956/ \...,"[{'ID': 'CWE-22', 'href': 'http://cwe.mitre.or...",[tel-ster],"[telwin_scada_webinterface, version : version..."
1,CVE-2023-3329,SpiderControl SCADA Webserver versions 2.08 an...,2023-08-02 19:15:10-04:00,6.5,MEDIUM,https://www.cisa.gov/news-events/ics-advisorie...,"[{'ID': 'CWE-22', 'href': 'http://cwe.mitre.or...",[spidercontrol],"[scadawebserver, version : versions up to (in..."
2,CVE-2023-2866,If an attacker can trick an authenticated user...,2023-06-07 17:15:13-04:00,7.8,HIGH,https://www.cisa.gov/news-events/ics-advisorie...,"[{'ID': 'CWE-345', 'href': 'http://cwe.mitre.o...",[advantech],"[webaccess, version : 8.4.5]"
3,CVE-2023-2187,On Triangle MicroWorks' SCADA Data Gateway ver...,2023-06-07 03:15:08-04:00,5.3,MEDIUM,https://www.trellix.com/en-us/about/newsroom/s...,"[{'ID': 'NVD-CWE-noinfo', 'href': 'NA', 'name'...",[trianglemicroworks],"[scada_data_gateway, version : versions up to..."
4,CVE-2023-2186,On Triangle MicroWorks' SCADA Data Gateway ver...,2023-06-07 03:15:08-04:00,9.8,CRITICAL,https://www.trellix.com/en-us/about/newsroom/s...,"[{'ID': 'CWE-134', 'href': 'http://cwe.mitre.o...",[trianglemicroworks],"[scada_data_gateway, version : versions up to..."
