# Load the CVE entries as of November 18, 2020

In [33]:
import json # to serialize CVE entries

In [2]:
with open('data/allitems_cve.txt', 'rb') as f:
    allitems = f.read()

In [3]:
allitems_decoded = allitems.decode('ISO-8859-1')

In [8]:
allitems_split = allitems_decoded.split('======================================================')

In [11]:
# Length of all CVE entries (including candidates)
# -1 because first entry explains the CVE version
# Note that candidates have beed deprecated: https://cve.mitre.org/about/faqs.html#cve_list_retire_term_cve
# Note that to obtain CVSS scores, it will have to be obtained from NVD

len(allitems_split) - 1

191976

# Find the CVE entries that possibly reference github

In [14]:
github_references = []
for i in allitems_split:
    if "https://github.com" in i:
        github_references.append(i)

In [15]:
# Number of CVE entries that reference github
len(github_references)

14362

In [29]:
# CVE with links to github that are not necessarily public references

github_references_nonRef = []

for x in github_references:
    a = x.strip().split('\n')
    for i in a:
        if "https://github.com" in i and "Reference:" not in i:
            github_references_nonRef.append(x)
            print(x, '======================================================')
            break


Name: CVE-2017-1000254
Status: Candidate
Phase: Assigned(20171006)
Reference: BID:101115
Reference: URL:http://www.securityfocus.com/bid/101115
Reference: CONFIRM:https://curl.haxx.se/673d0cd8.patch
Reference: CONFIRM:https://curl.haxx.se/docs/adv_20171004.html
Reference: CONFIRM:https://support.apple.com/HT208331
Reference: DEBIAN:DSA-3992
Reference: URL:http://www.debian.org/security/2017/dsa-3992
Reference: GENTOO:GLSA-201712-04
Reference: URL:https://security.gentoo.org/glsa/201712-04
Reference: REDHAT:RHSA-2018:2486
Reference: URL:https://access.redhat.com/errata/RHSA-2018:2486
Reference: REDHAT:RHSA-2018:3558
Reference: URL:https://access.redhat.com/errata/RHSA-2018:3558
Reference: SECTRACK:1039509
Reference: URL:http://www.securitytracker.com/id/1039509

libcurl may read outside of a heap allocated buffer when doing FTP.
When libcurl connects to an FTP server and successfully logs in
(anonymous or not), it asks the server for the current directory with
the `PWD` command. The se

In [32]:
# CVE with links to github that are not public references
len(github_references_nonRef)

23

In [101]:
class CVE:
    def __init__(self, name, status, phase, refs, commits, comment, originalEntry):
        self.name = name
        self.status = status
        self.phase = phase
        self.refs = refs
        self.commits = commits
        self.comment = comment
        self.originalEntry = originalEntry
        
    def toDict(self):
        return {
            'name': self.name,
            'status': self.status,
            'phase': self.phase,
            'refs': self.refs,
            'commits': self.commits,
            'comment': self.comment
        }

# Check if sha1
# https://stackoverflow.com/a/32234251
def is_sha1(maybe_sha):
    if len(maybe_sha) != 40:
        return False
    try:
        sha_int = int(maybe_sha, 16)
    except ValueError:
        return False
    return True

In [103]:
# Extract all the github references with commits 
# (full sha1s from URLs formatted like: https://github.com/<orgUser>/<repo>/commit/<sha1>
#      or PR URL that contains /commits/<sha1>)
github_references_commits = []

for idx, x in enumerate(github_references):
    a = x.strip().split('\n')
    
    refs = []
    commits = []
        
    for i in a[1:]:
        if "https://github.com" in i and "Reference:" in i and 'commit' in i:            
            # repo commit
            if is_sha1(i.split('/commit/')[-1].split('#')[0].split('.')[0].split('?')[0].split(',')[0].split('/')[0].split('%')[0]):
                refs.append(i.split('Reference:')[-1].strip())
                commits.append(i.split('/commit/')[-1].split('#')[0].split('.')[0].split('?')[0].split(',')[0].split('/')[0].split('%')[0])
            # PR commit
            elif is_sha1(i.split('/commits/')[-1].split('#')[0].split('.')[0].split('?')[0].split(',')[0].split('/')[0].split('%')[0]):
                refs.append(i.split('Reference:')[-1].strip())
                commits.append(i.split('/commits/')[-1].split('#')[0].split('.')[0].split('?')[0].split(',')[0].split('/')[0].split('%')[0])
            
    if commits:
        b = x.strip().split('\n\n') # index 1 is the comment, index 2 is the votes

        name = a[0].strip().split('Name: ')[-1] # first line is always name

        status = None
        if 'Status: ' in a[1]:
            status = a[1].strip().split('Status: ')[-1]

        phase = None
        if 'Phase: ' in a[2]:
            phase = a[2].strip().split('Phase: ')[-1]

        comment = b[1].strip()
        
        github_references_commits.append(CVE(name, status, phase, refs, commits, comment, x))
    

In [113]:
# CVE list entries with github references with full sha1 commits 
len(github_references_commits)

5206

In [122]:
# How many CVE list entries with github references with full sha1 commits refer to more than 1 commit?
count = 0
for x in github_references_commits:
    if len(x.commits) > 1:
        count+=1
print(count)

829


In [123]:
len(github_references_commits) - count

4377

In [None]:
# TODO: Check if there are single files in the commits

In [117]:
# Dump the CVE list entries with github references with full sha1 commits 
with open('data/github_references_commits.json', 'w') as f:
    f.write(json.dumps([x.toDict() for x in github_references_commits]))