# [DiverseVul](https://surrealyz.github.io/files/pubs/raid23-diversevul.pdf)


We start by identifying 29 security issue websites, and then narrow it down to 2 websites with most git system commits 1. From these websites, we crawl the issue title, body, and relevant git commit URLs. Since developer’s discussions may reference both vulnerability-fixing commits and vulnerability-introducing commits, we use two heuristics to exclude vulnerability-introducing commits. First, we exclude all commit URLs mentioned in comments containing keywords “introduced" and “first included"; and second, we manually go over all commits that changed at least 10 functions and exclude ones that introduced vulnerability. We keep the remaining commits in our dataset. Next, we parse the git commit URLs to extract the projects and commit IDs. We clone the projects and extract the commits from these projects. We identify the C/C++ related code files in the commits. Then, we extract all functions that were changed in these commits, and also functions that did not change in the files. Same as ReVeal [ 4], we label the before-commit version of a changed
function to be vulnerable, and the after-commit version to be non-
vulnerable. We label all unchanged functions in the related code files
to be non-vulnerable. Like prior work, we deduplicate functions
by their MD5 hashes, and we do not normalize the code before
deduplication. We keep track of the set of unique MD5s when
processing the functions. We process all the vulnerable functions
before the nonvulnerable ones. If the MD5 of a function already
exists in this set, we do not include the function again in the data.
In total, we have collected 7,514 commits from 797 projects, which
result in 18,945 vulnerable functions and 330,492 non-vulnerable
functions, covering 150 CWEs

In [33]:
import json
import pandas as pd
import numpy as np

import difflib

In [2]:
DIVERSEVUL_PATH = "/data/dok/viola/data/diversevul/diversevul_20230702.json"

#data = [json.loads(line)
#        for line in open(DIVERSEVUL_PATH, 'r', encoding='utf-8')]

#for line in data[:5]:
#    print(data)    

with open(DIVERSEVUL_PATH, "r") as f:
    lines = f.readlines()
    #data = json.load(f)
    print(f"read {len(lines)} lines")
    dfs = []
    for idx, l in enumerate(lines):
        dict = json.loads(l)
        if not dict['cwe']:
            dict['cwe'] = [np.NaN]
        #if len(dict['cwe']) > 1:
            #print(idx, dict)
        df = pd.DataFrame.from_dict(dict)
        if not df.empty:
            dfs.append(df)
        else:
            print(idx, dict)

    result = pd.concat(dfs, ignore_index=True)

print(result.size)
result.head()
    

#data = json.load(open(DIVERSEVUL_PATH, "r"))
#df_dv = pd.DataFrame.from_dict(data, orient="index") 
#pd.read_json(DIVERSEVUL_PATH, orient="records", lines=True, chunksize=5)
#df_dv.head()

read 330492 lines
3272848


Unnamed: 0,func,target,cwe,project,commit_id,hash,size,message
0,int _gnutls_ciphertext2compressed(gnutls_sessi...,1,,gnutls,7ad6162573ba79a4392c63b453ad0220ca6c5ace,73008646937836648589283922871188272089,157,added an extra check while checking the padding.
1,static char *make_filename_safe(const char *fi...,1,CWE-264,php-src,055ecbc62878e86287d742c7246c21606cee8183,211824207069112513181516095447837228041,22,Improve check for :memory: pseudo-filename in ...
2,"unpack_Z_stream(int fd_in, int fd_out)\n{\n\tI...",1,,busybox,251fc70e9722f931eec23a34030d05ba5f747b0e,21401706257394042943815500829552774160,232,uncompress: fix buffer underrun by corrupted i...
3,"static void cirrus_do_copy(CirrusVGAState *s, ...",1,CWE-787,qemu,b2eb849d4b1fdb6f35d5c46958c7f703cf64cfef,135590882627853658533498335902319684573,66,"CVE-2007-1320 - Cirrus LGD-54XX ""bitblt"" heap ..."
4,"glue(cirrus_bitblt_rop_fwd_, ROP_NAME)(CirrusV...",1,CWE-787,qemu,b2eb849d4b1fdb6f35d5c46958c7f703cf64cfef,27696392987383562433164405181263025184,18,"CVE-2007-1320 - Cirrus LGD-54XX ""bitblt"" heap ..."


In [3]:
result.iloc[550:555]
# Entries get duplicated if they are asigned to more than 1 CWE

Unnamed: 0,func,target,cwe,project,commit_id,hash,size,message
550,AvahiDnsPacket *avahi_recv_dns_packet_ipv4(\n ...,1,CWE-399,avahi,46109dfec75534fe270c0ab902576f685d5ab3a6,85362802703699340052618486296940557334,156,socket: Still read corrupt packets from the so...
551,"strip_leading_slashes (char *name, int strip_l...",1,CWE-22,patch,685a78b6052f4df6eac6d625a545cfb54a6ac0e1,264885369498121145908754829228247076588,23,Do not let a malicious patch create files abov...
552,int mainloop(CLIENT *client) {\n\tstruct nbd_r...,1,CWE-119,nbd,4ed24fe0d64c7cc9963c57b52cad1555ad7c6b60,72590078856250562196902269873002966436,92,r134: CVE-2005-3534
553,int mainloop(CLIENT *client) {\n\tstruct nbd_r...,1,CWE-119,nbd,3ef52043861ab16352d49af89e048ba6339d6df8,168268959423393978625601902172027564201,93,"Fix buffer size checking\n\nYes, this means we..."
554,int mainloop(CLIENT *client) {\n\tstruct nbd_r...,1,CWE-787,nbd,3ef52043861ab16352d49af89e048ba6339d6df8,168268959423393978625601902172027564201,93,"Fix buffer size checking\n\nYes, this means we..."


In [4]:
targets = result.groupby(['target'])['target'].count()

print(targets)


target
0    385464
1     23642
Name: target, dtype: int64


In [5]:
#test = result.iloc[:10000].copy()
full_test = result.copy()
def add_signature(row):
    func = row['func']
    sign = func.split('{')[0]
    return sign

full_test['signature'] = full_test.apply(add_signature, axis=1)

full_test.head(5)

Unnamed: 0,func,target,cwe,project,commit_id,hash,size,message,signature
0,int _gnutls_ciphertext2compressed(gnutls_sessi...,1,,gnutls,7ad6162573ba79a4392c63b453ad0220ca6c5ace,73008646937836648589283922871188272089,157,added an extra check while checking the padding.,int _gnutls_ciphertext2compressed(gnutls_sessi...
1,static char *make_filename_safe(const char *fi...,1,CWE-264,php-src,055ecbc62878e86287d742c7246c21606cee8183,211824207069112513181516095447837228041,22,Improve check for :memory: pseudo-filename in ...,static char *make_filename_safe(const char *fi...
2,"unpack_Z_stream(int fd_in, int fd_out)\n{\n\tI...",1,,busybox,251fc70e9722f931eec23a34030d05ba5f747b0e,21401706257394042943815500829552774160,232,uncompress: fix buffer underrun by corrupted i...,"unpack_Z_stream(int fd_in, int fd_out)\n"
3,"static void cirrus_do_copy(CirrusVGAState *s, ...",1,CWE-787,qemu,b2eb849d4b1fdb6f35d5c46958c7f703cf64cfef,135590882627853658533498335902319684573,66,"CVE-2007-1320 - Cirrus LGD-54XX ""bitblt"" heap ...","static void cirrus_do_copy(CirrusVGAState *s, ..."
4,"glue(cirrus_bitblt_rop_fwd_, ROP_NAME)(CirrusV...",1,CWE-787,qemu,b2eb849d4b1fdb6f35d5c46958c7f703cf64cfef,27696392987383562433164405181263025184,18,"CVE-2007-1320 - Cirrus LGD-54XX ""bitblt"" heap ...","glue(cirrus_bitblt_rop_fwd_, ROP_NAME)(CirrusV..."


In [13]:

# find all functions with matching signature and project:
duplicate_signatures = full_test[full_test.duplicated(subset=['project', 'signature'], keep=False)]
print(len(duplicate_signatures))

# drop identic functions (introduced to map 1 function to several CWEs):
duplicate_signatures = duplicate_signatures.drop_duplicates(subset=['hash'], keep=False)

vulnerable = duplicate_signatures[duplicate_signatures['target']==1]
#vulnerable.rename(columns={'func': 'vuln'})

benign = duplicate_signatures[duplicate_signatures['target']==0]
#benign.rename(columns={'func': 'benign'})

print(f"{len(duplicate_signatures)} duplicate functions, {len(vulnerable)} vulnerable, {len(benign)} benign")





220648
70993 duplicate functions, 13314 vulnerable, 57679 benign


In [32]:
# Top-20 functions with most versions in dataset
#print(duplicate_signatures.groupby(['project', 'signature']).size().sort_values(ascending=False).head(20))

# ... additionally grouped by target
#print(duplicate_signatures.groupby(['project', 'signature', 'target']).size().sort_values(ascending=False).head(20))

#sanity check: more than 2 versions if grouped by commit_id? :( TODO check signatures!!!!!
#print(duplicate_signatures.groupby(['project', 'signature', 'commit_id']).size().sort_values(ascending=False).head(20))
versions_per_function = duplicate_signatures.groupby(['project', 'signature', 'commit_id']).size().reset_index(name='versions')
versions_per_function.groupby('versions').size()



versions
1     42850
2     12628
3       279
4       100
5        45
6        31
7        15
8        16
9         7
10        7
11        7
12        4
13        1
14        2
17        1
18        1
19        2
20        6
24        1
25        2
26        1
27        2
31        1
35        2
36        1
42        1
45        1
46        1
90        1
dtype: int64

In [26]:


pairs = vulnerable.merge(benign, on=['project', 'signature', 'commit_id'], suffixes = ('_vuln', '_benign'))
print(len(pairs))

# sanity check
print((pairs['message_vuln'] == pairs['message_benign']).all())

pairs = pairs.rename(columns={'message_vuln':'message' })
pairs = pairs.drop(['target_vuln', 'target_benign', 'message_benign'], axis=1)
pairs.head()

12515
True


Unnamed: 0,func_vuln,cwe_vuln,project,commit_id,hash_vuln,size_vuln,message,signature,func_benign,cwe_benign,hash_benign,size_benign
0,int _gnutls_ciphertext2compressed(gnutls_sessi...,,gnutls,7ad6162573ba79a4392c63b453ad0220ca6c5ace,73008646937836648589283922871188272089,157,added an extra check while checking the padding.,int _gnutls_ciphertext2compressed(gnutls_sessi...,int _gnutls_ciphertext2compressed(gnutls_sessi...,,158496975013272959571533274715541422662,156
1,static char *make_filename_safe(const char *fi...,CWE-264,php-src,055ecbc62878e86287d742c7246c21606cee8183,211824207069112513181516095447837228041,22,Improve check for :memory: pseudo-filename in ...,static char *make_filename_safe(const char *fi...,static char *make_filename_safe(const char *fi...,CWE-264,107340962273293518000751223695455368366,22
2,"unpack_Z_stream(int fd_in, int fd_out)\n{\n\tI...",,busybox,251fc70e9722f931eec23a34030d05ba5f747b0e,21401706257394042943815500829552774160,232,uncompress: fix buffer underrun by corrupted i...,"unpack_Z_stream(int fd_in, int fd_out)\n","unpack_Z_stream(int fd_in, int fd_out)\n{\n\tI...",,15654791308084388206201802153946726327,235
3,"static void cirrus_do_copy(CirrusVGAState *s, ...",CWE-787,qemu,b2eb849d4b1fdb6f35d5c46958c7f703cf64cfef,135590882627853658533498335902319684573,66,"CVE-2007-1320 - Cirrus LGD-54XX ""bitblt"" heap ...","static void cirrus_do_copy(CirrusVGAState *s, ...","static void cirrus_do_copy(CirrusVGAState *s, ...",CWE-787,136027807783301598046603041783042921475,68
4,"glue(cirrus_bitblt_rop_fwd_, ROP_NAME)(CirrusV...",CWE-787,qemu,b2eb849d4b1fdb6f35d5c46958c7f703cf64cfef,27696392987383562433164405181263025184,18,"CVE-2007-1320 - Cirrus LGD-54XX ""bitblt"" heap ...","glue(cirrus_bitblt_rop_fwd_, ROP_NAME)(CirrusV...","glue(cirrus_bitblt_rop_fwd_, ROP_NAME)(CirrusV...",CWE-787,102419161251855648406111818716560619478,24


In [35]:
def to_diff_file(this, other):
    """
    produces a diff file comparing two files/methods.
    """
    diff = difflib.ndiff(this.splitlines(1), other.splitlines(1))
    result = '\n'.join(diff)
    return result


def separate_diff(diff):
    """
    :returns Dictionary of added and removed statements along with line number.
    """
    lines = diff.split("\n")
    added = []  # List[Tuple[int, str]]
    deleted = []  # List[Tuple[int, str]]
    count_deletions = 0
    count_additions = 0

    for line in lines:
        line = line.rstrip()
        count_deletions += 1
        count_additions += 1

        if line.startswith("-"):
            deleted.append((count_deletions, line[1:]))
            count_additions -= 1

        if line.startswith("+"):
            added.append((count_additions, line[1:]))
            count_deletions -= 1

        if line == r"\ No newline at the end of file":
            count_deletions -= 1
            count_additions -= 1

    return added, deleted

pairs['diff'] = pairs.apply(lambda row: to_diff_file(row['func_vuln'], row['func_benign']), axis=1)
pairs.head()

Unnamed: 0,func_vuln,cwe_vuln,project,commit_id,hash_vuln,size_vuln,message,signature,func_benign,cwe_benign,hash_benign,size_benign,diff
0,int _gnutls_ciphertext2compressed(gnutls_sessi...,,gnutls,7ad6162573ba79a4392c63b453ad0220ca6c5ace,73008646937836648589283922871188272089,157,added an extra check while checking the padding.,int _gnutls_ciphertext2compressed(gnutls_sessi...,int _gnutls_ciphertext2compressed(gnutls_sessi...,,158496975013272959571533274715541422662,156,int _gnutls_ciphertext2compressed(gnutls_ses...
1,static char *make_filename_safe(const char *fi...,CWE-264,php-src,055ecbc62878e86287d742c7246c21606cee8183,211824207069112513181516095447837228041,22,Improve check for :memory: pseudo-filename in ...,static char *make_filename_safe(const char *fi...,static char *make_filename_safe(const char *fi...,CWE-264,107340962273293518000751223695455368366,22,static char *make_filename_safe(const char *...
2,"unpack_Z_stream(int fd_in, int fd_out)\n{\n\tI...",,busybox,251fc70e9722f931eec23a34030d05ba5f747b0e,21401706257394042943815500829552774160,232,uncompress: fix buffer underrun by corrupted i...,"unpack_Z_stream(int fd_in, int fd_out)\n","unpack_Z_stream(int fd_in, int fd_out)\n{\n\tI...",,15654791308084388206201802153946726327,235,"unpack_Z_stream(int fd_in, int fd_out)\n\n ..."
3,"static void cirrus_do_copy(CirrusVGAState *s, ...",CWE-787,qemu,b2eb849d4b1fdb6f35d5c46958c7f703cf64cfef,135590882627853658533498335902319684573,66,"CVE-2007-1320 - Cirrus LGD-54XX ""bitblt"" heap ...","static void cirrus_do_copy(CirrusVGAState *s, ...","static void cirrus_do_copy(CirrusVGAState *s, ...",CWE-787,136027807783301598046603041783042921475,68,static void cirrus_do_copy(CirrusVGAState *s...
4,"glue(cirrus_bitblt_rop_fwd_, ROP_NAME)(CirrusV...",CWE-787,qemu,b2eb849d4b1fdb6f35d5c46958c7f703cf64cfef,27696392987383562433164405181263025184,18,"CVE-2007-1320 - Cirrus LGD-54XX ""bitblt"" heap ...","glue(cirrus_bitblt_rop_fwd_, ROP_NAME)(CirrusV...","glue(cirrus_bitblt_rop_fwd_, ROP_NAME)(CirrusV...",CWE-787,102419161251855648406111818716560619478,24,"glue(cirrus_bitblt_rop_fwd_, ROP_NAME)(Cirru..."


In [40]:
print(pairs.loc[1, 'diff'])

print('**************')

print(pairs.loc[1, 'func_vuln'])
print('**************')
print(pairs.loc[1, 'func_benign'])
print('**************')

for l in difflib.unified_diff(pairs.loc[1, 'func_vuln'], pairs.loc[1, 'func_benign']):
    print(l)

  static char *make_filename_safe(const char *filename TSRMLS_DC)

  {

- 	if (*filename && strncmp(filename, ":memory:", sizeof(":memory:")-1)) {

? 	                 ^^^^                                            --

+ 	if (*filename && memcmp(filename, ":memory:", sizeof(":memory:"))) {

? 	                 ^^^

  		char *fullpath = expand_filepath(filename, NULL TSRMLS_CC);

  

  		if (!fullpath) {

  			return NULL;

  		}

  

  		if (PG(safe_mode) && (!php_checkuid(fullpath, NULL, CHECKUID_CHECK_FILE_AND_DIR))) {

  			efree(fullpath);

  			return NULL;

  		}

  

  		if (php_check_open_basedir(fullpath TSRMLS_CC)) {

  			efree(fullpath);

  			return NULL;

  		}

  		return fullpath;

  	}

  	return estrdup(filename);

  }
**************
static char *make_filename_safe(const char *filename TSRMLS_DC)
{
	if (*filename && strncmp(filename, ":memory:", sizeof(":memory:")-1)) {
		char *fullpath = expand_filepath(filename, NULL TSRMLS_CC);

		if (!fullpath) {
			return NULL;
