# Generating a Minimal ILP Dataset

Finds a group of small buffer overflow examples from the Juliet dataset, then generates their Prolog representations from the code property graph generated by Joern. These are saved to `../data/ilp_dataset.csv.gz` and `../data/ilp_prolog_data.csv.gz` respectively.


## Find a nice subset

Find a small number of short buffer overflow examples from the Juliet dataset, and save them to `../data/ilp_dataset.csv.gz`.

In [1]:
import pandas as pd

In [2]:
# Load the Juliet buffer overflow examples; every later cell works from this frame.
buffer_overflow_juliet = pd.read_csv("../data/buffer_overflow_data.csv.gz")

In [3]:
# Size of each source file, measured as the length of its `code` string.
buffer_overflow_juliet['code_length'] = buffer_overflow_juliet['code'].map(len)
buffer_overflow_juliet

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,testcase_ID,filename,code,flaw,flaw_loc,bug,code_length
0,984,984,62516,000/062/516/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,112,False,5108
1,985,985,62517,000/062/517/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,117,False,9668
2,986,986,62518,000/062/518/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,117,False,9710
3,987,987,62519,000/062/519/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,123,False,10162
4,988,988,62520,000/062/520/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,123,False,10084
5,989,989,62521,000/062/521/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,122,False,10123
6,990,990,62522,000/062/522/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,122,False,10054
7,991,991,62523,000/062/523/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,130,False,10190
8,992,992,62524,000/062/524/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,117,False,9897
9,993,993,62525,000/062/525/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,117,False,9799


In [4]:
# Total code length per test case, smallest first.
# A column-wise `sum` replaces the earlier `apply` + lambda, whose `axis='columns'`
# argument had no effect because the lambda simply swallowed it through `**kwargs`.
(
    buffer_overflow_juliet
    .groupby('testcase_ID')['code_length']
    .sum()
    .sort_values()
)

testcase_ID
-62852      1570
-62900      1576
-62869      1616
-62804      1618
-232012     1619
-62853      1621
-62917      1622
-232086     1625
-62901      1627
-62854      1630
-62867      1632
-62902      1636
-62915      1638
-62868      1643
-62861      1649
-62916      1649
-62909      1655
-62865      1657
-62913      1663
-62821      1664
-232029     1665
-231979     1667
-62805      1669
-62860      1670
-232103     1671
 62852      1674
-232013     1674
-62908      1676
-62862      1676
-62806      1678
           ...  
 232342    11388
 67724     11389
 232346    11425
 232341    11486
 232347    11501
 232196    11511
 67718     11512
 232343    11514
 232345    11523
 67716     11542
 70687     11546
 67717     11581
 67759     11584
 67715     11620
 232339    11643
 67719     11648
 232337    11673
 232364    11690
 62727     11694
 232338    11712
 232336    11751
 232340    11779
 232184    11843
 232352    12397
 67552     12449
 67744     12616
 62548     13047
 6

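Pick the 20 smallest test cases. Each one is selected under both the positive and the negative form of its test case ID, giving 40 examples in total: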

In [5]:
# IDs of the 20 test cases with the smallest total code length (taken from the
# sorted listing produced by the previous cell).
smallest_testcase_ids = [
    62852, 62900, 62869, 62804, 232012,
    62853, 62917, 232086, 62901, 62854,
    62867, 62902, 62915, 62868, 62861,
    62916, 62909, 62865, 62913, 62821,
]

# Both the positive and the negative form of every ID are wanted, so matching on the
# absolute value selects exactly the 40 IDs that were previously compared one by one.
ilp_data = buffer_overflow_juliet[
    buffer_overflow_juliet['testcase_ID'].abs().isin(smallest_testcase_ids)
]
ilp_data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,testcase_ID,filename,code,flaw,flaw_loc,bug,code_length
516,1500,1500,62804,000/062/804/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,1722
533,1517,1517,62821,000/062/821/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,35,False,1808
581,1565,1565,62852,000/062/852/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,False,1674
582,1566,1566,62853,000/062/853/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,2396
583,1567,1567,62854,000/062/854/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,2414
590,1574,1574,62861,000/062/861/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,2453
594,1578,1578,62865,000/062/865/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,2468
596,1580,1580,62867,000/062/867/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,34,False,1781
597,1581,1581,62868,000/062/868/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,34,False,1790
598,1582,1582,62869,000/062/869/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,32,False,1760


In [6]:
# Discard the two "Unnamed" columns before the examples are saved.
ilp_data = ilp_data.drop(columns=["Unnamed: 0", "Unnamed: 0.1"])

In [7]:
# Show the selected examples after the column clean-up.
ilp_data

Unnamed: 0,testcase_ID,filename,code,flaw,flaw_loc,bug,code_length
516,62804,000/062/804/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,1722
533,62821,000/062/821/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,35,False,1808
581,62852,000/062/852/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,30,False,1674
582,62853,000/062/853/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,2396
583,62854,000/062/854/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,2414
590,62861,000/062/861/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,2453
594,62865,000/062/865/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,33,False,2468
596,62867,000/062/867/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,34,False,1781
597,62868,000/062/868/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,34,False,1790
598,62869,000/062/869/CWE121_Stack_Based_Buffer_Overflow...,/* TEMPLATE GENERATED TESTCASE FILE\nFilename:...,CWE-121,32,False,1760


In [8]:
# Persist the selected examples for later steps.
# NOTE(review): no `index=False` is passed, so the row index is presumably written as an
# extra unnamed column — confirm whether readers of this file expect that.
ilp_data.to_csv("../data/ilp_dataset.csv.gz")

## Generating Prolog Representations

Now that we have a set of examples, we want to generate a Prolog representation of each one. To do this we use Joern to derive a code property graph from each source file. Then, using our `../code/joern_cfg_to_prolog.scala` script, we convert a subset of this graph into a set of Prolog facts.

In [13]:
import os
import subprocess
import tempfile

In [14]:
# Accumulators that `generate_prolog` appends to (one entry per call); together they
# become the columns of the `prolog` data frame.
testcase_IDs, flaws, bugs, code_lengths, trees = [], [], [], [], []

In [15]:
def generate_prolog(testcase):
    """Generate the Prolog facts for one test case and record them in the module-level lists.

    Every file of the test case is written into a fresh temporary directory,
    ``joern-parse`` builds a code property graph from that directory, and
    ``joern-query`` runs ``joern_cfg_to_prolog.scala`` against the graph. The
    text printed by that query is stored as the test case's Prolog tree.

    Args:
        testcase: Rows of a single test case (one row per source file) providing
            the ``filename``, ``code``, ``testcase_ID``, ``flaw``, ``bug`` and
            ``code_length`` columns.

    Returns:
        None. One entry is appended to each of ``testcase_IDs``, ``flaws``,
        ``bugs``, ``code_lengths`` and ``trees``; the metadata is taken from the
        last row of ``testcase``.

    Raises:
        subprocess.CalledProcessError: If either Joern command exits with a
            non-zero status.
    """
    rows = list(testcase.itertuples())
    if not rows:
        # No files means nothing to parse and no metadata row to record.
        return

    # The context manager removes the directory even when a Joern command fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for row in rows:
            short_filename = row.filename.split("/")[-1]
            with open(os.path.join(tmp_dir, short_filename), 'w') as f:
                f.write(row.code)

        cpg_path = os.path.join(tmp_dir, "cpg.bin.zip")
        subprocess.check_call(["/joern/joern-parse", "--out", cpg_path, tmp_dir])

        # An argument list plus `cwd` replaces the former "cd /joern && ..." shell
        # string, so paths are never re-interpreted by a shell.
        tree = subprocess.check_output(
            [
                "/joern/joern-query",
                "--cpg", cpg_path,
                "-f", "/project/code/joern_cfg_to_prolog.scala",
            ],
            cwd="/joern",
            universal_newlines=True,
        )

    last_row = rows[-1]
    testcase_IDs.append(last_row.testcase_ID)
    flaws.append(last_row.flaw)
    bugs.append(last_row.bug)
    code_lengths.append(last_row.code_length)
    trees.append(tree)

In [16]:
# Start from empty accumulators so that re-running this cell does not append a second
# copy of every test case to the lists that `generate_prolog` fills.
for accumulator in (testcase_IDs, flaws, bugs, code_lengths, trees):
    accumulator.clear()

# `apply` is used only for its side effect: `generate_prolog` records one entry per call.
ilp_data.groupby('testcase_ID').apply(generate_prolog)

In [17]:
prolog = pd.DataFrame({
    'testcase_ID': testcase_IDs,
    'flaw': flaws,
    'bug': bugs,
    'code_length': code_lengths,
    'tree': trees,
})
# Running `apply` can process the first group twice, which leaves a duplicate record for
# that test case. Dropping repeated IDs removes the extra record when it exists and keeps
# every row when it does not, whereas a fixed positional slice would discard a genuine
# row in that case. `.copy()` gives an independent frame for the later column
# assignments.
prolog = prolog.drop_duplicates(subset='testcase_ID', keep='last').copy()
prolog

Unnamed: 0,testcase_ID,flaw,bug,code_length,tree
1,-232086,CWE-122,True,1625,% START: Generated Prolog\n% NODE PROPERTIES \...
2,-232012,CWE-122,True,1619,% START: Generated Prolog\n% NODE PROPERTIES \...
3,-62917,CWE-121,True,1622,% START: Generated Prolog\n% NODE PROPERTIES \...
4,-62916,CWE-121,True,1649,% START: Generated Prolog\n% NODE PROPERTIES \...
5,-62915,CWE-121,True,1638,% START: Generated Prolog\n% NODE PROPERTIES \...
6,-62913,CWE-121,True,1663,% START: Generated Prolog\n% NODE PROPERTIES \...
7,-62909,CWE-121,True,1655,% START: Generated Prolog\n% NODE PROPERTIES \...
8,-62902,CWE-121,True,1636,% START: Generated Prolog\n% NODE PROPERTIES \...
9,-62901,CWE-121,True,1627,% START: Generated Prolog\n% NODE PROPERTIES \...
10,-62900,CWE-121,True,1576,% START: Generated Prolog\n% NODE PROPERTIES \...


In [26]:
# Print (rather than display) the first tree so that its newlines are rendered.
print(prolog['tree'].iloc[0])

% START: Generated Prolog
% NODE PROPERTIES 
assignment(bad_232086_id_0_f_l_c_).
sizeOf(bad_232086_id_26_f_l_c_).
alloc(bad_232086_id_30_f_l_c_).
writeToPointer(bad_232086_id_50_f_l_c_).
compMemberAccess(bad_232086_id_91_f_l_c_).
compMemberAccess(bad_232086_id_132_f_l_c_).
compMemberAccess(bad_232086_id_133_f_memmove_01_c_l_31_c_21_).
sizeOf(bad_232086_id_137_f_l_c_).
sizeOf(bad_232086_id_138_f_memmove_01_c_l_30_c_33_).
writeToPointer(bad_232086_id_144_f_l_c_).
writeToPointer(bad_232086_id_145_f_memmove_01_c_l_30_c_8_).
assignment(bad_232086_id_147_f_memmove_01_c_l_28_c_12_).
alloc(bad_232086_id_162_f_l_c_).
alloc(bad_232086_id_163_f_memmove_01_c_l_25_c_18_).
assignment(bad_232086_id_168_f_memmove_01_c_l_25_c_4_).
assignment(bad_232086_id_171_f_l_c_).
assignment(bad_232086_id_172_f_memmove_01_c_l_23_c_4_).
% METHOD 
pointer(bad_232086_id_6_f_l_48_c_19_).
voidPointer(bad_232086_id_117_f_memmove_01_c_l_49_c_0_).
pointer(bad_232086_id_119_f_l_48_c_19_).
pointer(bad_232086_id_127_f_memmove

In [18]:
import re

In [19]:
def fix_single_rules(testcase):
    """Prefix the node ID of every one-argument fact in a test case's Prolog tree.

    A fact of the form ``name(node).`` becomes ``name(<bug>_<id>_node).``, where
    ``<bug>`` is ``bad`` when ``testcase.bug`` is truthy and ``good`` otherwise,
    and ``<id>`` is the absolute value of ``testcase.testcase_ID``.

    Args:
        testcase: Row-like object exposing ``bug``, ``testcase_ID`` and ``tree``.

    Returns:
        The rewritten Prolog source as a string.
    """
    # Raw strings: "\(", "\w" and "\." are regex escapes, not Python string escapes.
    find_node_ids = re.compile(r'\((\w+)\)\.')
    replacement_node_ids = r'({bug}_{testcase_id}_\1).'.format(
        bug='bad' if testcase.bug else 'good',
        testcase_id=abs(testcase.testcase_ID),
    )
    return find_node_ids.sub(replacement_node_ids, testcase['tree'])

def fix_tree_rules(testcase):
    """Prefix both node IDs of every two-argument fact in a test case's Prolog tree.

    A fact of the form ``name(a, b).`` becomes
    ``name(<bug>_<id>_a, <bug>_<id>_b).``, where ``<bug>`` is ``bad`` when
    ``testcase.bug`` is truthy and ``good`` otherwise, and ``<id>`` is the
    absolute value of ``testcase.testcase_ID``.

    Args:
        testcase: Row-like object exposing ``bug``, ``testcase_ID`` and ``tree``.

    Returns:
        The rewritten Prolog source as a string.
    """
    # Raw strings: "\(", "\w" and "\." are regex escapes, not Python string escapes.
    find_node_ids = re.compile(r'\((\w+), (\w+)\)\.')
    replacement_node_ids = r'({bug}_{testcase_id}_\1, {bug}_{testcase_id}_\2).'.format(
        bug='bad' if testcase.bug else 'good',
        testcase_id=abs(testcase.testcase_ID),
    )
    return find_node_ids.sub(replacement_node_ids, testcase['tree'])

def fix_code_rules(testcase):
    """Prefix the node ID of every ``name(node, "text").`` fact in a Prolog tree.

    The node ID becomes ``<bug>_<id>_node``, where ``<bug>`` is ``bad`` when
    ``testcase.bug`` is truthy and ``good`` otherwise, and ``<id>`` is the
    absolute value of ``testcase.testcase_ID``. The quoted text is carried over
    unchanged.

    Args:
        testcase: Row-like object exposing ``bug``, ``testcase_ID`` and ``tree``.

    Returns:
        The rewritten Prolog source as a string.
    """
    # Raw strings: "\(", "\w" and "\." are regex escapes, not Python string escapes.
    # `.` does not match a newline, so the greedy `.*` stays within one line.
    find_node_ids = re.compile(r'\((\w+), "(.*)"\)\.')
    replacement_node_ids = r'({bug}_{testcase_id}_\1, "\2").'.format(
        bug='bad' if testcase.bug else 'good',
        testcase_id=abs(testcase.testcase_ID),
    )
    return find_node_ids.sub(replacement_node_ids, testcase['tree'])

In [20]:
# Rewrite the node IDs of one-argument, two-argument and quoted-text facts in turn.
for fix_rules in (fix_single_rules, fix_tree_rules, fix_code_rules):
    prolog['tree'] = prolog.apply(fix_rules, axis='columns')

In [21]:
# Show the frame after the node IDs have been rewritten.
prolog

Unnamed: 0,testcase_ID,flaw,bug,code_length,tree
1,-232086,CWE-122,True,1625,% START: Generated Prolog\n% NODE PROPERTIES \...
2,-232012,CWE-122,True,1619,% START: Generated Prolog\n% NODE PROPERTIES \...
3,-62917,CWE-121,True,1622,% START: Generated Prolog\n% NODE PROPERTIES \...
4,-62916,CWE-121,True,1649,% START: Generated Prolog\n% NODE PROPERTIES \...
5,-62915,CWE-121,True,1638,% START: Generated Prolog\n% NODE PROPERTIES \...
6,-62913,CWE-121,True,1663,% START: Generated Prolog\n% NODE PROPERTIES \...
7,-62909,CWE-121,True,1655,% START: Generated Prolog\n% NODE PROPERTIES \...
8,-62902,CWE-121,True,1636,% START: Generated Prolog\n% NODE PROPERTIES \...
9,-62901,CWE-121,True,1627,% START: Generated Prolog\n% NODE PROPERTIES \...
10,-62900,CWE-121,True,1576,% START: Generated Prolog\n% NODE PROPERTIES \...


In [22]:
def extract_source_map(prolog_src):
    """Return only the source-code section of a generated Prolog program.

    The section opens at a line equal to ``% CODE`` and runs until a line equal
    to ``% AST``. The opening marker is part of the result; the closing marker
    is not.

    Args:
        prolog_src: Prolog program text with newline-separated lines.

    Returns:
        The lines of the section joined with newlines, or an empty string when
        the program contains no ``% CODE`` line.
    """
    # A marker line switches collection on or off; any other line leaves it as is.
    section_switch = {"% CODE": True, "% AST": False}
    collecting = False
    kept = []
    for text in prolog_src.split("\n"):
        collecting = section_switch.get(text, collecting)
        if collecting:
            kept.append(text)
    return "\n".join(kept)
            
    
def remove_source_map(prolog_src):
    """Return a generated Prolog program without its source-code section.

    Every line from one equal to ``% CODE`` up to, but not including, the next
    line equal to ``% AST`` is left out; all other lines are kept in order.

    Args:
        prolog_src: Prolog program text with newline-separated lines.

    Returns:
        The remaining lines joined with newlines.
    """
    skipping = False
    remaining = []
    for text in prolog_src.split("\n"):
        if text == "% CODE":
            skipping = True
        elif text == "% AST":
            skipping = False
        if skipping:
            continue
        remaining.append(text)
    return "\n".join(remaining)

In [23]:
# Order matters: the source-code section has to be copied out of `tree` first, because
# the second statement removes that section from `tree`.
prolog['source_map'] = prolog['tree'].apply(extract_source_map)
prolog['tree'] = prolog['tree'].apply(remove_source_map)

In [24]:
# Save the Prolog representations together with their source maps.
# NOTE(review): no `index=False` is passed, so the row index is presumably written as an
# extra unnamed column — confirm whether readers of this file expect that.
prolog.to_csv("../data/ilp_prolog_data.csv.gz")