In [40]:
import pandas as pd
import sqlite3 as lite
from sqlite3 import Error
from pathlib import Path
import ast
import tqdm
import difflib

In [41]:
def create_connection(db_file):
    """
    Open a sqlite3 connection to ``db_file``.

    The connection is opened with a 10 second lock timeout. If sqlite3
    reports an error, the error is printed and ``None`` is returned
    instead of a connection object.
    """
    try:
        # Plain sqlite3 connection (an SQLAlchemy engine was considered as
        # an alternative but is not used).
        return lite.connect(db_file, timeout=10)
    except Error as exc:
        print(exc)
        return None


# Data directory: a 'Data' folder that sits next to the current working directory.
DATA_PATH = Path.cwd().parents[0] / 'Data'

# Shared connection used by all queries below; it is None if opening the database failed.
conn = create_connection(DATA_PATH / "CVEfixes.db")

In [42]:
# Load each database table completely into its own DataFrame.
df_commit = pd.read_sql('SELECT * FROM commits', con=conn)
df_file = pd.read_sql('SELECT * FROM file_change', con=conn)
df_method = pd.read_sql('SELECT * FROM method_change', con=conn)
df_cve = pd.read_sql('SELECT * FROM cve', con=conn)
df_fixes = pd.read_sql('SELECT * FROM fixes', con=conn)
df_cwe_class = pd.read_sql('SELECT * FROM cwe_classification', con=conn)
df_cwe = pd.read_sql('SELECT * FROM cwe', con=conn)
df_repo = pd.read_sql('SELECT * FROM repository', con = conn)

Changed files per commit:

In [43]:
# Number of file_change rows per commit hash.
file_change_per_commit = df_file.groupby(['hash']).size().reset_index(name='file_changes')
print(file_change_per_commit.head())

# Distribution: how many commits changed 1, 2, 3, ... files.
file_change_per_commit.groupby('file_changes').size()

                                       hash  file_changes
0  000030feb7a30f193197f1aab8a7b04a26b42735             1
1  0002d106a6cd35cb0a6fe03246531a4e3f32c9d0             1
2  0010d28de1b15d51db3976080e26357fa7144436             1
3  00194f5fe462123f70b0bae7987317b52898b868             1
4  001a3278b5572e52c0ecac0bd1157bf2599502b7             1


file_changes
1       3699
2       1409
3        813
4        406
5        248
        ... 
402        1
611        1
890        1
1776       1
2142       1
Length: 79, dtype: int64

Find local fixes:

In [44]:
# only consider commits with single file change

single_change = file_change_per_commit[file_change_per_commit['file_changes']==1] #TODO consider changes of 2 or more files?
# Keep the file_change rows whose commit hash is one of the single-file commits.
filtered = df_file[df_file['hash'].isin(single_change['hash'])]
#print(filtered.head())
# Ten most frequent programming languages among those file changes.
filtered.groupby('programming_language').size().sort_values(ascending=False).head(10)


programming_language
C              1790
PHP             611
C++             450
JavaScript      157
Objective-C     116
Python          111
Java             86
Ruby             53
Markdown         51
TypeScript       41
dtype: int64

In [45]:
# Work on a copy so that `filtered` itself is not modified.
df_local_diff = filtered.copy() #df_file[df_file.programming_language=='C'].copy()
# `diff_parsed` stores a dict literal with the keys 'added' and 'deleted'.
# Parse every literal exactly once and reuse the result for both columns
# (evaluating the same, potentially large, literal twice per row is wasted work).
parsed_local_diffs = df_local_diff['diff_parsed'].apply(ast.literal_eval)
df_local_diff['diff_added'] = parsed_local_diffs.apply(lambda parsed: parsed['added'])
df_local_diff['diff_deleted'] = parsed_local_diffs.apply(lambda parsed: parsed['deleted'])
df_local_diff = df_local_diff.reset_index(drop=True)

In [46]:
def _is_contiguous(lines):
    """Return True if the line numbers in ``lines`` never jump ahead.

    ``lines`` is a non-empty list of ``(line_number, text)`` tuples. Starting
    from the first line number, each following entry may be at most one
    line further than the previous expected position.
    """
    expected = lines[0][0]
    for line in lines[1:]:
        expected += 1
        if line[0] > expected:
            return False
    return True


def is_local_fix(row):
    """Decide whether a file change consists of one local block of lines.

    :param row: mapping / Series with the keys ``diff_added`` and
        ``diff_deleted``, each a list of ``(line_number, text)`` tuples.
    :return: ``True`` if the added lines are contiguous, the deleted lines
        are contiguous and, when both exist, the first deleted line number
        lies inside the span of the added lines. ``False`` otherwise,
        including the case where nothing was added or deleted at all.
    """
    added = row['diff_added']
    deleted = row['diff_deleted']

    # A change without any added or deleted line is not a fix; without this
    # guard the deletion-only path would index into an empty list.
    if not added and not deleted:
        return False

    if added and not _is_contiguous(added):
        return False
    if deleted and not _is_contiguous(deleted):
        return False

    if added and deleted:
        first_add = added[0][0]
        last_add = first_add + len(added) - 1
        # add & delete at same position?
        # NOTE(review): "check me" in the original. Added line numbers refer
        # to the new file and deleted ones to the old file, so a deletion
        # that starts after an unchanged line can still fall inside this
        # range. The condition is kept unchanged to preserve the dataset.
        return bool(first_add <= deleted[0][0] <= last_add)

    # Only additions or only deletions, and they are contiguous.
    return True


# Keep only the rows that is_local_fix accepts, then renumber the index.
df_local_diff = df_local_diff[df_local_diff.apply(is_local_fix, axis=1)]
df_local_diff = df_local_diff.reset_index(drop=True)
# 1654 commits w local single file change

# Remaining file changes per programming language.
df_local_diff.groupby('programming_language').size().sort_values(ascending=False)

programming_language
C                   730
PHP                 310
C++                 204
JavaScript           85
Objective-C          56
Markdown             38
Ruby                 38
Python               36
Java                 32
TypeScript           21
HTML                 18
Perl                 18
Go                   14
Shell                12
C#                    7
Scala                 7
Rust                  6
SQL                   5
CoffeeScript          4
unknown               3
Swift                 3
TeX                   1
Batchfile             1
Lua                   1
Jupyter Notebook      1
Haskell               1
Matlab                1
dtype: int64

In [47]:
# Inner joins: rows without a matching fix / CWE classification / CWE entry are dropped,
# and a file change that matches several entries appears once per match.
df_local_diff_with_cve = df_local_diff.merge(df_fixes, how="inner", on=['hash']) # add CVEs
df_local_diff_with_cve = df_local_diff_with_cve.merge(df_cwe_class, how="inner", on=['cve_id']) # add CWEs
df_local_diff_with_cve = df_local_diff_with_cve.merge(df_cwe, how="inner", on=['cwe_id']) # add CWE details

# CWE: Common Weakness Enumeration -> vulnerability
# CVE: Common Vulnerabilities and Exposures -> specific instance within a product or system

# Number of joined rows per CWE name, most frequent first.
print(df_local_diff_with_cve.groupby('cwe_name').size().sort_values(ascending=False))
df_local_diff_with_cve.head()

cwe_name
Improper Neutralization of Input During Web Page Generation ('Cross-site Scripting')                       238
Out-of-bounds Read                                                                                         118
Exposure of Sensitive Information to an Unauthorized Actor                                                  98
Out-of-bounds Write                                                                                         97
NULL Pointer Dereference                                                                                    94
                                                                                                          ... 
Improper Neutralization                                                                                      1
Improper Control of Filename for Include/Require Statement in PHP Program ('PHP Remote File Inclusion')      1
Improper Control of Dynamically-Managed Code Resources                                                 

Unnamed: 0,file_change_id,hash,filename,old_path,new_path,change_type,diff,diff_parsed,num_lines_added,num_lines_deleted,...,diff_deleted,cve_id,repo_url,cwe_id,index,cwe_name,description,extended_description,url,is_category
0,174571154001437,04906bd5de2f220bf100b605dad37b4a1d9a91a6,saver.cpp,kscreensaver/saver.cpp,kscreensaver/saver.cpp,ModificationType.MODIFY,"@@ -151,6 +151,7 @@ void KPasswordDlg::keyPres...","{'added': [(154, ' timer.stop();')]...",1,0,...,[],CVE-1999-0731,https://github.com/KDE/kde1-kdebase,NVD-CWE-Other,1339,Other,NVD is only using a subset of CWE for mapping ...,Insufficient Information,https://nvd.nist.gov/vuln/categories,0
1,47598748964815,c5be6209311d4a8f10fda37d0d3f876c1b33b77b,svr_principal.c,src/lib/kadm5/srv/svr_principal.c,src/lib/kadm5/srv/svr_principal.c,ModificationType.MODIFY,"@@ -186,7 +186,7 @@ check_1_6_dummy(kadm5_prin...","{'added': [(189, ' if (password == NULL || ...",1,1,...,"[(189, if (!(mask & KADM5_ATTRIBUTES) ||)]",CVE-2012-1013,https://github.com/krb5/krb5,NVD-CWE-Other,1339,Other,NVD is only using a subset of CWE for mapping ...,Insufficient Information,https://nvd.nist.gov/vuln/categories,0
2,242257866677552,08c642c09c38a9c6454ab43a9b53b2a89b9eef99,ldap_principal2.c,src/plugins/kdb/ldap/libkdb_ldap/ldap_principa...,src/plugins/kdb/ldap/libkdb_ldap/ldap_principa...,ModificationType.MODIFY,"@@ -296,6 +296,7 @@ process_db_args(krb5_conte...","{'added': [(299, ' arg = (arg != NU...",1,0,...,[],CVE-2016-3119,https://github.com/krb5/krb5,NVD-CWE-Other,1339,Other,NVD is only using a subset of CWE for mapping ...,Insufficient Information,https://nvd.nist.gov/vuln/categories,0
3,43090082980843,c290f8358acaeffd8e0c551ddcc24d1206143376,tty_io.c,drivers/tty/tty_io.c,drivers/tty/tty_io.c,ModificationType.MODIFY,"@@ -1873,6 +1873,7 @@ static int tty_open(stru...","{'added': [(1876, '\t\t\ttty_driver_kref_put(d...",1,0,...,[],CVE-2011-5321,https://github.com/torvalds/linux,NVD-CWE-Other,1339,Other,NVD is only using a subset of CWE for mapping ...,Insufficient Information,https://nvd.nist.gov/vuln/categories,0
4,188310393560112,b35cc8225845112a616e3a2266d2fde5ab13d3ab,compress_offload.c,sound/core/compress_offload.c,sound/core/compress_offload.c,ModificationType.MODIFY,"@@ -407,6 +407,10 @@ static int snd_compr_allo...","{'added': [(410, '\tif (params->buffer.fragmen...",4,0,...,[],CVE-2012-6703,https://github.com/torvalds/linux,NVD-CWE-Other,1339,Other,NVD is only using a subset of CWE for mapping ...,Insufficient Information,https://nvd.nist.gov/vuln/categories,0


In [48]:
# filter out large changes & unknown programming languages
# "Small" means fewer than 4 added and fewer than 4 deleted lines; the line
# counts are converted with pd.to_numeric before they are compared.
df_small_local_diff = df_local_diff[pd.to_numeric(df_local_diff['num_lines_added'])<4]
df_small_local_diff = df_small_local_diff[pd.to_numeric(df_small_local_diff['num_lines_deleted'])<4]
df_small_local_diff = df_small_local_diff[df_small_local_diff['programming_language']!='unknown']
df_small_local_diff = df_small_local_diff.reset_index(drop=True)

# Remaining small local changes per programming language.
df_small_local_diff.groupby('programming_language').size().sort_values(ascending=False)

programming_language
C               526
PHP             247
C++             108
JavaScript       65
Objective-C      44
Ruby             29
Java             24
Markdown         21
Python           19
HTML             14
Perl             13
TypeScript       13
Shell             9
Go                8
Scala             6
C#                5
CoffeeScript      4
Rust              4
SQL               4
Haskell           1
Swift             1
TeX               1
Batchfile         1
dtype: int64

In [49]:
# TODO Drop this?
# Without filtering for single file changes:
df_diff = df_file.copy() # all files
# `diff_parsed` stores a dict literal with the keys 'added' and 'deleted'.
# Parse every literal exactly once and reuse the result for both columns.
parsed_all_diffs = df_diff['diff_parsed'].apply(ast.literal_eval)
df_diff['diff_added'] = parsed_all_diffs.apply(lambda parsed: parsed['added'])
df_diff['diff_deleted'] = parsed_all_diffs.apply(lambda parsed: parsed['deleted'])
print('all changed files:', len(df_diff))

# Keep only the files whose change is one local block, then renumber the
# index (later cells address rows by position 0..n-1).
df_diff = df_diff[df_diff.apply(is_local_fix, axis=1)]
print('locally changed files:', len(df_diff))
df_diff = df_diff.reset_index(drop=True)
# 17322 files w local change (count printed by this cell)
df_diff.groupby('programming_language').size().sort_values(ascending=False)

all changed files: 29309
locally changed files: 17322


programming_language
unknown             3792
PHP                 3468
C                   2205
Markdown            1239
JavaScript          1071
C++                  626
Ruby                 573
HTML                 558
Java                 498
Shell                461
Python               458
TypeScript           337
SQL                  297
Go                   249
Batchfile            246
CoffeeScript         162
CSS                  152
Lua                  141
C#                   130
Perl                 114
Objective-C          107
Scala                 96
TeX                   73
Haskell               69
Rust                  45
Jupyter Notebook      36
Matlab                34
PowerShell            33
Swift                 30
R                     12
Erlang                 6
None                   4
dtype: int64

In [50]:
# TODO Drop?
# Size filter on the all-files frame: fewer than 3 added and fewer than 3
# deleted lines (note: the single-file variant above uses < 4).
df_filtered = df_diff[pd.to_numeric(df_diff['num_lines_added'])<3]
df_filtered = df_filtered[pd.to_numeric(df_filtered['num_lines_deleted'])<3]
df_filtered = df_filtered.reset_index(drop=True)

# Remaining file changes per programming language.
df_filtered.groupby('programming_language').size().sort_values(ascending=False)

programming_language
C                   1522
PHP                 1427
JavaScript           563
Markdown             404
HTML                 384
Ruby                 357
Shell                313
Java                 295
C++                  277
Python               209
Batchfile            185
unknown              163
SQL                  150
TypeScript           113
Go                   106
CoffeeScript          82
C#                    80
Objective-C           75
TeX                   64
Perl                  64
Scala                 49
CSS                   47
Lua                   26
Jupyter Notebook      25
Rust                  23
Matlab                16
Haskell               15
PowerShell            13
R                      6
Swift                  5
Erlang                 2
None                   2
dtype: int64

In [51]:
def inspect_change(row):
    """Print the added and the deleted line lists of one file-change row."""
    print('diff_added', ":", row['diff_added'])
    print('diff_deleted', ":", row['diff_deleted'])

In [52]:
# Util functions to parse diffs
# @see https://github.com/ishepard/pydriller/blob/master/pydriller/domain/commit.py
# The typing names must come from `typing`; `pyparsing.Dict` is a parser
# element class, not a type alias.
from typing import Dict, List, Tuple


def parse_diff(unified_diff) -> Dict[str, List[Tuple[int, str]]]:
    """
    Returns a dictionary with the added and deleted lines.
    The dictionary has 2 keys: "added" and "deleted", each containing the
    corresponding added or deleted lines. For both keys, the value is a
    list of Tuple (int, str), corresponding to (number of line in the file,
    actual line).

    :param unified_diff: the hunks of a unified diff, either as an iterable
        of lines (for example the generator returned by
        ``difflib.unified_diff`` after its two header lines were consumed)
        or as one string that is split on newlines. Trailing whitespace is
        stripped from every line before it is stored.
    :return: Dictionary
    """
    # A plain string would otherwise be iterated character by character.
    if isinstance(unified_diff, str):
        unified_diff = unified_diff.split("\n")

    modified_lines = {
        "added": [],
        "deleted": [],
    }  # type: Dict[str, List[Tuple[int, str]]]

    # Running line numbers in the old file (deletions) and new file (additions).
    count_deletions = 0
    count_additions = 0

    for line in unified_diff:
        line = line.rstrip()
        # Every diff line advances both counters; the branches below undo
        # the advance for the side the line does not belong to.
        count_deletions += 1
        count_additions += 1

        if line.startswith("@@"):
            # Hunk header: restart both counters at the hunk's start lines.
            count_deletions, count_additions = get_line_numbers(line)

        if line.startswith("-"):
            modified_lines["deleted"].append((count_deletions, line[1:]))
            count_additions -= 1

        if line.startswith("+"):
            modified_lines["added"].append((count_additions, line[1:]))
            count_deletions -= 1

        if line == r"\ No newline at end of file":
            # Marker line, not part of either file.
            count_deletions -= 1
            count_additions -= 1

    return modified_lines


def get_line_numbers(line: str) -> Tuple[int, int]:
    """
    Extract the start line numbers from a hunk header such as
    ``@@ -186,7 +186,7 @@``.

    :param line: hunk header; its second and third space-separated tokens
        are the old-file and new-file ranges.
    :return: (old start - 1, new start - 1), i.e. the counter values right
        before the first line of the hunk.
    """
    token = line.split(" ")
    numbers_old_file = token[1]
    numbers_new_file = token[2]
    delete_line_number = (
            int(numbers_old_file.split(",")[0].replace("-", "")) - 1
    )
    additions_line_number = int(numbers_new_file.split(",")[0]) - 1
    return delete_line_number, additions_line_number



Cut files into prompt, vul, fix, remainder:

In [53]:
def update_diff(row, lines_before, lines_after):
    """Recompute ``diff_added`` / ``diff_deleted`` of ``row`` with difflib.

    A unified diff with one line of context is generated from the two line
    lists, its two leading header lines are skipped, and the remaining
    hunks are parsed with ``parse_diff``. The result is written into
    ``row`` in place. If the diff has fewer than two lines (nothing to
    skip), a message with the row's ``file_change_id`` is printed and the
    columns end up as empty lists.
    """
    hunks = difflib.unified_diff(lines_before, lines_after, n=1)
    try:
        # skip first 2 lines
        for _ in range(2):
            next(hunks)
    except StopIteration:
        print('Hä', row.loc['file_change_id'])
    fresh = parse_diff(hunks)
    #print(fresh)
    row['diff_added'], row['diff_deleted'] = fresh['added'], fresh['deleted']


def _first_line_matches(diff_lines, file_lines):
    """Check that the first diff entry is found at its recorded line number.

    An empty diff list counts as matching. A line number outside of the
    file counts as a mismatch. Trailing whitespace is ignored.
    """
    if not diff_lines:
        return True
    line_no, text = diff_lines[0][0], diff_lines[0][1]
    if not 1 <= line_no <= len(file_lines):
        return False
    return text.rstrip() == file_lines[line_no - 1].rstrip()


def create_sample(row):
    """Cut one file change into prompt, vulnerable target, patch and remainder.

    :param row: Series with ``code_before``, ``code_after``, ``diff_added``,
        ``diff_deleted`` (lists of ``(line_number, text)``, 1-based) and
        ``file_change_id``.
    :return: the same row with ``prompt`` (code before the first change),
        ``target_vul`` (deleted lines), ``target_patch`` (added lines) and
        ``remainder`` (code after the change) filled in.

    If the stored diff does not match the code, it is recomputed with
    ``update_diff`` (which also updates ``diff_added`` / ``diff_deleted`` of
    the row). Inconsistencies are reported with print statements, not raised.
    """
    lines_before = row['code_before'].splitlines(True)
    lines_after = row['code_after'].splitlines(True)

    # Validate (and if necessary refresh) the stored diff BEFORE any index is
    # derived from it, so that all indices below come from the same diff.
    if not _first_line_matches(row['diff_added'], lines_after):
        #print(f"Non matching linenumbers in diff (add).")
        update_diff(row, lines_before, lines_after)
    if not _first_line_matches(row['diff_deleted'], lines_before):
        print(f"Non matching linenumbers in diff (del).")
        update_diff(row, lines_before, lines_after)

    added = row['diff_added']
    deleted = row['diff_deleted']

    # Convert the 1-based line numbers into 0-based list indices.
    if added:
        first_add = added[0][0]-1
        last_add = added[-1][0]-1
    if deleted:
        first_del = deleted[0][0]-1
        last_del = deleted[-1][0]-1

    if added and deleted:
        first_change = min(first_add, first_del)
        same_span = first_add == first_del and last_add == last_del
        vul = "".join(lines_before[first_del:last_del+1])
        patch = "".join(lines_after[first_add:last_add+1])
        if same_span:
            remainder = "".join(lines_after[last_add+1:])
        else:
            #inspect_change(row)
            remainder = "".join(lines_before[last_del+1:])
            # TODO handle non overlapping adds & dels
            if first_del > last_add or first_add > last_del:
                print('ALARM')
    elif deleted:
        first_change = first_del
        vul = "".join(lines_before[first_del:last_del+1])
        patch = ""
        remainder = "".join(lines_before[last_del+1:])
    elif added:
        first_change = first_add
        vul = ""
        patch = "".join(lines_after[first_add:last_add+1])
        remainder = "".join(lines_after[last_add+1:])
    else:
        # No change at all (e.g. the recomputed diff is empty): the whole file
        # is the prompt. The "vul == patch" check below reports this row.
        first_change = len(lines_before)
        vul = ""
        patch = ""
        remainder = ""

    prompt = "".join(lines_before[:first_change]) # the line before the first change is the prompt.

    row.loc['prompt'] = prompt
    row.loc['target_vul'] = vul
    row.loc['target_patch'] = patch
    row.loc['remainder'] = remainder

    #check this!!!
    # Sanity checks: the parts must differ and must re-assemble to the files.
    if patch == vul:
        print(f"Error in file_change_id {row.loc['file_change_id']}, vul == patch.")
    if not prompt + patch + remainder == row.loc['code_after']:
         print(f"Error in code_after for file_change_id {row.loc['file_change_id']}")
    if not prompt + vul + remainder == row.loc['code_before']:
        print(f"Error in code_before for file_change_id {row.loc['file_change_id']}")

    return row

# apply on files with local changes
#df_small_local_diff[df_small_local_diff['programming_language']== 'Python'].apply(create_sample, axis=1)
#df_small_local_diff[df_small_local_diff['programming_language']== 'C'].apply(create_sample, axis=1)
#df_small_local_diff[df_small_local_diff['programming_language']== 'C++'].apply(create_sample, axis=1)
# Add the four output columns first, then let create_sample fill them row by row.
df_small_local_diff = df_small_local_diff.reindex(columns=df_small_local_diff.columns.to_list()+ ['prompt', 'target_vul', 'target_patch', 'remainder'])
df_small_local_diff = df_small_local_diff.apply(create_sample, axis=1)
print(df_small_local_diff.columns)

# problematic: file_change_id '279629478150458'

Index(['file_change_id', 'hash', 'filename', 'old_path', 'new_path',
       'change_type', 'diff', 'diff_parsed', 'num_lines_added',
       'num_lines_deleted', 'code_after', 'code_before', 'nloc', 'complexity',
       'token_count', 'programming_language', 'diff_added', 'diff_deleted',
       'prompt', 'target_vul', 'target_patch', 'remainder'],
      dtype='object')


In [54]:
# Save to file
# Only the id, the language and the four sample parts are written (no index column).
output_cols = ['file_change_id', 'programming_language', 'prompt', 'target_vul', 'target_patch', 'remainder']
df_small_local_diff[output_cols].to_csv(DATA_PATH / 'small_local_changes.csv', index=False)



In [57]:
# add CWEs
# Inner joins: samples without a matching fix / CWE classification / CWE entry are dropped,
# and a sample that matches several entries appears once per match.
df_small_local_diff_with_cve = df_small_local_diff.merge(df_fixes, how="inner", on=['hash']) # add CVEs
df_small_local_diff_with_cve = df_small_local_diff_with_cve.merge(df_cwe_class, how="inner", on=['cve_id']) # add CWEs
df_small_local_diff_with_cve = df_small_local_diff_with_cve.merge(df_cwe, how="inner", on=['cwe_id']) # add CWE details

# CWE: Common Weakness Enumeration -> vulnerability
# CVE: Common Vulnerabilities and Exposures -> specific instance within a product or system

# Number of joined samples per CWE name, most frequent first.
print(df_small_local_diff_with_cve.groupby('cwe_name').size().sort_values(ascending=False))

# Same export as small_local_changes.csv, extended by the CVE / CWE columns.
output_cols = ['file_change_id', 'programming_language', 'prompt', 'target_vul', 'target_patch', 'remainder', 'cve_id', 'cwe_id', 'cwe_name']
df_small_local_diff_with_cve[output_cols].to_csv(DATA_PATH / 'small_local_changes_CWE.csv', index=False)

cwe_name
Improper Neutralization of Input During Web Page Generation ('Cross-site Scripting')    203
Exposure of Sensitive Information to an Unauthorized Actor                               87
Out-of-bounds Read                                                                       73
NULL Pointer Dereference                                                                 65
Out-of-bounds Write                                                                      62
                                                                                       ... 
Improper Neutralization of Alternate XSS Syntax                                           1
Improper Locking                                                                          1
Improper Control of Dynamically-Managed Code Resources                                    1
Incorrect Comparison                                                                      1
Weak Password Requirements                                             

In [23]:
# TODO check the rest

# Show which columns the fixes and file_change tables provide.
print('Fixes:', df_fixes.columns)
print('File changes:', df_file.columns)

# For the first five file changes, print how many lines were added and deleted
# according to the stored diff_parsed literal.
for row in df_file['diff_parsed'][:5]:
    print('added:',
            len(ast.literal_eval(row)['added']),
            'deleted:', 
            len(ast.literal_eval(row)['deleted']))



Fixes: Index(['cve_id', 'hash', 'repo_url'], dtype='object')
File changes: Index(['file_change_id', 'hash', 'filename', 'old_path', 'new_path',
       'change_type', 'diff', 'diff_parsed', 'num_lines_added',
       'num_lines_deleted', 'code_after', 'code_before', 'nloc', 'complexity',
       'token_count', 'programming_language'],
      dtype='object')
added: 52 deleted: 0
added: 28 deleted: 10
added: 5 deleted: 3
added: 8 deleted: 4
added: 1 deleted: 1


In [None]:

# All file changes whose language is C (no single-file or locality filter).
df_c_diff = df_file[df_file.programming_language=='C'].copy()
# `diff_parsed` stores a dict literal with the keys 'added' and 'deleted'.
# Parse every literal exactly once and reuse the result for both columns.
parsed_c_diffs = df_c_diff['diff_parsed'].apply(ast.literal_eval)
df_c_diff['diff_added'] = parsed_c_diffs.apply(lambda parsed: parsed['added'])
df_c_diff['diff_deleted'] = parsed_c_diffs.apply(lambda parsed: parsed['deleted'])
df_c_diff = df_c_diff.reset_index(drop=True)
df_c_diff.head(5)

Unnamed: 0,file_change_id,hash,filename,old_path,new_path,change_type,diff,diff_parsed,num_lines_added,num_lines_deleted,code_after,code_before,nloc,complexity,token_count,programming_language,diff_added,diff_deleted
0,7461293048740,2864e767053317538feafa815046fff89e5a16be,dl-load.c,elf/dl-load.c,elf/dl-load.c,ModificationType.MODIFY,"@@ -149,21 +149,31 @@ local_strdup (const char...","{'added': [(152, ' const char *const start = ...",28,10,/* Map in a shared object's segments from the ...,/* Map in a shared object's segments from the ...,952.0,260.0,6592.0,C,"[(152, const char *const start = name;), (15...","[(158, /* $ORIGIN is not expanded for SU..."
1,253444729592566,2864e767053317538feafa815046fff89e5a16be,syslog.c,misc/syslog.c,misc/syslog.c,ModificationType.MODIFY,"@@ -177,10 +177,14 @@ vsyslog(pri, fmt, ap)\n ...","{'added': [(180, '\t\t/* Append a newline if n...",8,4,"/*\n * Copyright (c) 1983, 1988, 1993\n *\tThe...","/*\n * Copyright (c) 1983, 1988, 1993\n *\tThe...",229.0,42.0,1240.0,C,"[(180, \t\t/* Append a newline if necessary. ...","[(180, \t\t++v;), (181, \t\tv->iov_base = (cha..."
2,88393274694273,2864e767053317538feafa815046fff89e5a16be,regex.c,posix/regex.c,posix/regex.c,ModificationType.MODIFY,"@@ -164,46 +164,6 @@ char *realloc ();\n # de...","{'added': [(239, '#ifndef emacs'), (240, '/* H...",37,40,/* Extended regular expression matching and se...,/* Extended regular expression matching and se...,2789.0,805.0,14578.0,C,"[(239, #ifndef emacs), (240, /* How many chara...","[(167, /* How many characters in the character..."
3,55018883367054,2864e767053317538feafa815046fff89e5a16be,tst-getlogin.c,,posix/tst-getlogin.c,ModificationType.ADD,"@@ -0,0 +1,58 @@\n+/* Copyright (C) 1999 Free ...","{'added': [(1, '/* Copyright (C) 1999 Free Sof...",58,0,/* Copyright (C) 1999 Free Software Foundation...,,35.0,4.0,129.0,C,"[(1, /* Copyright (C) 1999 Free Software Found...",[]
4,156577410779052,2864e767053317538feafa815046fff89e5a16be,getlogin_r.c,sysdeps/unix/getlogin_r.c,sysdeps/unix/getlogin_r.c,ModificationType.MODIFY,"@@ -40,20 +40,20 @@ getlogin_r (name, name_len...","{'added': [(43, ' /* Get name of tty connecte...",16,16,/* Reentrant function to return the current lo...,/* Reentrant function to return the current lo...,46.0,5.0,195.0,C,"[(43, /* Get name of tty connected to fd 0. ...","[(43, {), (44, int d = __open (""/dev/tty..."


: 

In [24]:
# Number of lines, starting at the first changed line, that form the target.
LINES_IN_TARGET = 1
# Create the four output columns on df_diff, initially empty (None).
df_diff['prompt'] = None
df_diff['target_vul'] = None
df_diff['target_patch'] = None
df_diff['remainder'] = None


def inspect_change(idx):
    """Print the raw and the parsed diff columns of row ``idx`` of ``df_diff``.

    NOTE(review): this re-uses the name of the row-based ``inspect_change``
    defined in an earlier cell and replaces it once this cell has run.
    """
    for column in ('diff_parsed', 'diff_added', 'diff_deleted'):
        print(column, ":", df_diff[column][idx])


def first_change(idx):
    """Return the smallest first line number of the change in row ``idx``.

    Looks at the first entry of ``diff_added`` and of ``diff_deleted`` in
    ``df_diff``; if only one of the lists is non-empty, its first line
    number is returned.
    """
    added = df_diff['diff_added'][idx]
    deleted = df_diff['diff_deleted'][idx]
    if not added:
        return deleted[0][0]
    if not deleted:
        return added[0][0]
    return min(added[0][0], deleted[0][0])


# add prompt for ltr generation + vulnerable & fixed target
for i in range(len(df_diff['code_before'])):
    changed = first_change(i)  # 1-based line number of the first change
    lines_before = df_diff['code_before'][i].splitlines(True)
    lines_after = df_diff['code_after'][i].splitlines(True)

    # 0-based slice [target_start:target_end] covers LINES_IN_TARGET lines;
    # slicing clamps at the end of the list, so no explicit bound is needed.
    target_start = changed - 1
    target_end = target_start + LINES_IN_TARGET

    df_diff.loc[i, 'prompt'] = "".join(lines_before[:target_start]) # the line before the first change is the prompt.
    df_diff.loc[i, 'target_vul'] = "".join(lines_before[target_start:target_end])
    df_diff.loc[i, 'target_patch'] = "".join(lines_after[target_start:target_end])
    # The remainder starts right after the target, whatever LINES_IN_TARGET is
    # (previously it always started one line after the first change).
    df_diff.loc[i, 'remainder'] = "".join(lines_after[target_end:])

df_diff.head(5)

Unnamed: 0,file_change_id,hash,filename,old_path,new_path,change_type,diff,diff_parsed,num_lines_added,num_lines_deleted,...,nloc,complexity,token_count,programming_language,diff_added,diff_deleted,prompt,target_vul,target_patch,remainder
0,100562011220946,2864e767053317538feafa815046fff89e5a16be,ChangeLog,ChangeLog,ChangeLog,ModificationType.MODIFY,"@@ -1,3 +1,55 @@\n+1999-11-09 Ulrich Drepper ...","{'added': [(1, '1999-11-09 Ulrich Drepper <d...",52,0,...,,,,Batchfile,"[(1, 1999-11-09 Ulrich Drepper <drepper@cygn...",[],,1999-11-08 Ulrich Drepper <drepper@cygnus.co...,1999-11-09 Ulrich Drepper <drepper@cygnus.co...,\n\t* elf/dl-load.c (_dl_dst_count): Allow $OR...
1,169254254633061,2864e767053317538feafa815046fff89e5a16be,search.texi,manual/search.texi,manual/search.texi,ModificationType.MODIFY,"@@ -475,9 +475,11 @@ elements.\n To remove a s...","{'added': [(478, 'and a pointer to the parent ...",5,3,...,,,,TeX,"[(478, and a pointer to the parent of the dele...","[(478, and the data if this tree node is retur...","@node Searching and Sorting, Pattern Matching,...",and the data if this tree node is returned by ...,and a pointer to the parent of the deleted nod...,function. If there is no matching entry in th...
2,253444729592566,2864e767053317538feafa815046fff89e5a16be,syslog.c,misc/syslog.c,misc/syslog.c,ModificationType.MODIFY,"@@ -177,10 +177,14 @@ vsyslog(pri, fmt, ap)\n ...","{'added': [(180, '\t\t/* Append a newline if n...",8,4,...,229.0,42.0,1240.0,C,"[(180, \t\t/* Append a newline if necessary. ...","[(180, \t\t++v;), (181, \t\tv->iov_base = (cha...","/*\n * Copyright (c) 1983, 1988, 1993\n *\tThe...",\t\t++v;\n,\t\t/* Append a newline if necessary. */\n,\t\tif (buf[bufsize - 1] != '\n')\n\t\t {\n\t...
3,197890948684264,2864e767053317538feafa815046fff89e5a16be,Makefile,posix/Makefile,posix/Makefile,ModificationType.MODIFY,"@@ -57,7 +57,7 @@ include ../Makeconfig\n \n a...","{'added': [(60, '\t\t tst-preadwrite test-vf...",1,1,...,,,,Shell,"[(60, \t\t tst-preadwrite test-vfork regexbu...","[(60, \t\t tst-preadwrite test-vfork regexbu...","# Copyright (C) 1991,92,93,94,95,96,97,98,99 F...",\t\t tst-preadwrite test-vfork regexbug1\n,\t\t tst-preadwrite test-vfork regexbug1 tst...,"ifeq (yes,$(build-shared))\ntest-srcs\t:= glob..."
4,55018883367054,2864e767053317538feafa815046fff89e5a16be,tst-getlogin.c,,posix/tst-getlogin.c,ModificationType.ADD,"@@ -0,0 +1,58 @@\n+/* Copyright (C) 1999 Free ...","{'added': [(1, '/* Copyright (C) 1999 Free Sof...",58,0,...,35.0,4.0,129.0,C,"[(1, /* Copyright (C) 1999 Free Software Found...",[],,,/* Copyright (C) 1999 Free Software Foundation...,This file is part of the GNU C Library.\n\n...


In [25]:
# Write only the four sample parts of df_diff to CSV (no index column).
output_cols = ['prompt', 'target_vul', 'target_patch', 'remainder']
df_diff[output_cols].to_csv(DATA_PATH / 'ltr_samples.csv', index=False)

In [26]:
def inspect_entry(i:int):
    """Print the four sample parts of row ``i`` of ``df_diff``.

    The prompt is printed first, followed by the vulnerable target, the
    patch and the remainder, each introduced by a labelled separator line.
    """
    rule = "~"*20 + "\n"
    sections = [
        df_diff['prompt'][i],
        "VUL:" + rule,
        df_diff['target_vul'][i],
        "PATCH:" + rule,
        df_diff['target_patch'][i],
        "REMAINDER:" + rule,
        df_diff['remainder'][i],
    ]
    print("".join(sections))

inspect_entry(0)

# Every sample must re-assemble to the original files: prompt + patch + remainder
# gives code_after, prompt + vulnerable target + remainder gives code_before.
# NOTE(review): the recorded output of this cell ends with
# "AssertionError: Error in line 0", i.e. the second check fails for row 0.
for i in range(len(df_diff['prompt'])):
    assert df_diff['prompt'][i] + df_diff['target_patch'][i] + df_diff['remainder'][i] == df_diff['code_after'][i]
    assert df_diff['prompt'][i] + df_diff['target_vul'][i] + df_diff['remainder'][i] == df_diff['code_before'][i], f"Error in line {i}"

VUL:~~~~~~~~~~~~~~~~~~~~
1999-11-08  Ulrich Drepper  <drepper@cygnus.com>
PATCH:~~~~~~~~~~~~~~~~~~~~
1999-11-09  Ulrich Drepper  <drepper@cygnus.com>
REMAINDER:~~~~~~~~~~~~~~~~~~~~

	* elf/dl-load.c (_dl_dst_count): Allow $ORIGIN to point to
	directory with the reference since this is as secure as using the
	object with the dependency.
	(_dl_dst_substitute): Likewise.

	* elf/dl-load.c (_dl_dst_count): Change strings in first two
	strncmp calls to allow reuse.
	(_dl_dst_substitute): Likewise.

1999-11-01  Arnold D. Robbins  <arnold@skeeve.com>

	* posix/regex.c (init_syntax_once): move below definition of
	ISALNUM etc., then use ISALNUM to init the table, so that
	the word ops will work if i18n'ed.
	(SYNTAX): And subscript with 0xFF for 8bit character sets.

1999-11-09  Andreas Jaeger  <aj@suse.de>

	* sysdeps/unix/getlogin_r.c (getlogin_r): Sync with getlogin
	implementation for ttyname_r call; fix inverted condition; return
	ut_user.  Closes PR libc/1438.

1999-11-09  Ulrich Drepper 

AssertionError: Error in line 0

In [None]:
print(len(df_method))
# One row per (file_change_id, signature) pair; `count` is the number of
# method_change rows that share the pair.
method_diff = df_method[['file_change_id', 'signature']].value_counts().reset_index(name='count')
method_diff['code_before'] = None
method_diff['code_after'] = None
# `tqdm` is imported as a module in this notebook, so the progress bar class is
# `tqdm.tqdm`; calling the module itself raises "TypeError: 'module' object is
# not callable". The bar replaces the manual progress print.
for idx in tqdm.tqdm(df_method.index):
    # Rows of method_diff that describe the same method in the same file change.
    same_method = (
        (method_diff['file_change_id'] == df_method.loc[idx, 'file_change_id'])
        & (method_diff['signature'] == df_method.loc[idx, 'signature'])
    )
    # `before_change` is compared as the string 'True'; every other value is
    # treated as the state after the change.
    if df_method.loc[idx, 'before_change'] == 'True':
        target_col = 'code_before'
    else:
        target_col = 'code_after'
    method_diff.loc[same_method, target_col] = df_method.loc[idx, 'code']


method_diff.head(5)

: 

In [None]:
# Write the per-method before/after code to CSV, including the index column.
method_diff.to_csv(DATA_PATH / 'methods.csv', index=True)

: 

In [None]:
from typing import List, Dict, Tuple

# See https://github.com/ishepard/pydriller/blob/master/pydriller/domain/commit.py

def get_line_numbers(line: str) -> Tuple[int, int]:
    """
    Read the start line numbers from a hunk header like ``@@ -186,7 +186,7 @@``.

    The second and third space-separated tokens are the old-file and
    new-file ranges. The value returned for each side is its start line
    minus one.
    :return: (old start - 1, new start - 1)
    """
    fields = line.split(" ")
    old_range, new_range = fields[1], fields[2]
    old_start = int(old_range.split(",")[0].replace("-", ""))
    new_start = int(new_range.split(",")[0])
    return old_start - 1, new_start - 1

def to_diff_parsed(diff:str) -> Dict[str, List[Tuple[int, str]]]:
    """
    Split a unified diff string into its added and deleted lines.

    The result has the two keys "added" and "deleted". Each value is a list
    of (line number in the file, line text without the leading +/- sign)
    tuples; trailing whitespace is stripped from every line.
    :return: Dictionary
    """
    added = []
    deleted = []

    # Running line numbers in the old file and in the new file.
    old_no = 0
    new_no = 0

    for raw in diff.split("\n"):
        text = raw.rstrip()
        old_no += 1
        new_no += 1

        # The four line kinds are mutually exclusive, so a chain is enough.
        if text.startswith("@@"):
            old_no, new_no = get_line_numbers(text)
        elif text.startswith("-"):
            deleted.append((old_no, text[1:]))
            new_no -= 1
        elif text.startswith("+"):
            added.append((new_no, text[1:]))
            old_no -= 1
        elif text == r"\ No newline at end of file":
            old_no -= 1
            new_no -= 1

    return {"added": added, "deleted": deleted}

: 

In [None]:
# File changes with at most one added and at most one deleted line, joined
# (implicit inner joins) with their commit, fix, CVE and CWE classification.
# NOTE(review): the pandas cells convert num_lines_added / num_lines_deleted with
# pd.to_numeric, which suggests they may be stored as text - confirm that the
# SQL comparison below is evaluated numerically.
query = """
SELECT cv.cve_id, f.filename, f.num_lines_added, f.num_lines_deleted, f.code_before, f.code_after, cc.cwe_id 
FROM file_change f, commits c, fixes fx, cve cv, cwe_classification cc
WHERE f.hash = c.hash 
AND c.hash = fx.hash 
AND fx.cve_id = cv.cve_id 
AND cv.cve_id = cc.cve_id 
AND f.num_lines_added<=1 
AND f.num_lines_deleted<=1;
"""
single_line_fixes = pd.read_sql_query(query, conn)
print(len(single_line_fixes))
single_line_fixes.head(5)

6876


Unnamed: 0,cve_id,filename,num_lines_added,num_lines_deleted,code_before,code_after,cwe_id
0,CVE-1999-0199,Makefile,1,1,"# Copyright (C) 1991,92,93,94,95,96,97,98,99 F...","# Copyright (C) 1991,92,93,94,95,96,97,98,99 F...",CWE-252
1,CVE-1999-0731,saver.cpp,1,0,#include <pwd.h>\n#include <unistd.h>\n#includ...,#include <pwd.h>\n#include <unistd.h>\n#includ...,NVD-CWE-Other
2,CVE-2012-1013,svr_principal.c,1,1,/* -*- mode: c; c-basic-offset: 4; indent-tabs...,/* -*- mode: c; c-basic-offset: 4; indent-tabs...,NVD-CWE-Other
3,CVE-2014-4343,spnego_mech.c,0,1,"/*\n * Copyright (C) 2006,2008 by the Massachu...","/*\n * Copyright (C) 2006,2008 by the Massachu...",CWE-415
4,CVE-2014-4344,spnego_mech.c,1,1,"/*\n * Copyright (C) 2006,2008 by the Massachu...","/*\n * Copyright (C) 2006,2008 by the Massachu...",CWE-476


: 