Notebook that generates adjacency matrices for the source files (testcases) in the Juliet dataset, to be used as input to our neural network model.

In [199]:
import ast
import pickle
import numpy as np
import pandas as pd
from scipy import sparse
import networkx as nx
from preprocess_code import *

In [200]:
# Load the raw dataset. Later cells rely on the columns testcase_ID,
# filename and code.
data = pd.read_csv("../data/buffer_overflow_data.csv.gz")

In [211]:
# Restrict the run to a small slice of rows.
# NOTE(review): this rebinds `data`, so re-running the cell slices the
# already-sliced frame. It is also a row slice, so a testcase whose files
# straddle the boundary may be cut in two — confirm this is acceptable.
data = data.iloc[500:599]
# data = data.iloc[0:100]

In [4]:
def generate_edge_list1(testcase, **kwargs):
    """
    Build the edge list of the clang AST for one testcase.

    Parameters
    ----------
    testcase : pandas.DataFrame
        The rows (files/datapoints) of the dataset, as loaded with pandas,
        that belong to one particular testcase. Each row must provide the
        ``filename`` and ``code`` fields.
    **kwargs
        Accepted and ignored, so the function can be handed to an
        ``apply`` call that forwards extra keyword arguments.

    Returns
    -------
    str
        A JSON document of the form ``{"edges": <edge list>}``, where the
        edge list is whatever ``generate_edgelist`` produces for the AST.
    """
    # Imported locally: the notebook's import cell does not import json, so
    # without this the function only works if another import leaks the name.
    import json

    # Every file of the testcase is handed to clang as (filename, code).
    parse_list = [
        (datapoint.filename, datapoint.code)
        for datapoint in testcase.itertuples()
    ]

    # find_primary_source_file decides which file is parsed as the main one.
    primary = find_primary_source_file(testcase)

    # Parse the source code with clang, and get out an ast:
    index = clang.cindex.Index.create()
    translation_unit = index.parse(
        path=primary.filename,
        unsaved_files=parse_list,
    )
    ast_root = translation_unit.cursor

    # Memoise/concretise the ast so that we can consistently
    # modify it, then number each node in the tree uniquely.
    concretise_ast(ast_root)
    number_ast_nodes(ast_root)

    # Next, construct an edge list for the graph2vec input:
    edgelist = generate_edgelist(ast_root)

    edgelist_representation = {
        "edges": edgelist,
    }

    # Explicitly drop our references to the clang objects before returning.
    del translation_unit
    del ast_root
    del index

    return json.dumps(edgelist_representation)

In [None]:
# dask_data = dd.from_pandas(data, npartitions=20)

# Generate the graph (JSON edge list) for every testcase in the dataset.
# `data` is a plain pandas frame here, so the dask-only `axis=` / `meta=`
# keyword arguments are not passed: pandas would merely forward them to
# generate_edge_list1, which ignores them.
graphs = data.groupby(['testcase_ID']).apply(generate_edge_list1)

NameError: name 'generate_edge_list1' is not defined

In [6]:
def gen_adj_matrix1(testcase):
    """
    Convert the JSON edge list of one testcase into a dense adjacency matrix.

    Parameters
    ----------
    testcase : str
        JSON document of the form ``{"edges": [[u, v], ...]}``, as returned
        by ``generate_edge_list1`` for one testcase.

    Returns
    -------
    Dense adjacency matrix of the undirected graph spanned by the edges
    (the result of ``.todense()`` on networkx's sparse adjacency matrix).

    Raises
    ------
    json.JSONDecodeError
        If ``testcase`` is not valid JSON.
    KeyError
        If the document has no ``"edges"`` entry.
    """
    # Imported locally: the notebook's import cell does not import json.
    import json

    # Parse the document properly instead of slicing the string around
    # 'edges": ' and '}', which breaks as soon as the JSON layout changes.
    edges = json.loads(testcase)["edges"]

    # Build the undirected graph and read off its adjacency matrix.
    graph = nx.Graph()
    graph.add_edges_from(edges)

    # NOTE(review): rows/columns follow the graph's node order, which is
    # presumably first appearance in the edge list rather than sorted node
    # identifier — confirm this lines up with the feature matrix.
    return nx.adjacency_matrix(graph).todense()

In [7]:
# Create the (initially empty) dataframe that will hold each testcase ID
# together with its adjacency matrix; the columns are added by later cells.
adjacency_df = pd.DataFrame()

In [8]:
# One row per distinct testcase_ID found in `data`.
# NOTE(review): the index of this column presumably still carries the row
# labels of `data`, not 0..n-1 — keep that in mind when adding columns.
adjacency_df['testcase_ID'] = data.testcase_ID.drop_duplicates()

In [9]:
# kernel dies when there are more than 200 datapoints

# Convert every JSON edge list in `graphs` into a dense adjacency matrix.
# adj_matrices = graphs.apply(gen_adj_matrix1, meta = ('generate_adj_matrices', 'O'))
adj_matrices = graphs.apply(gen_adj_matrix1)

In [10]:
# Wrap the result in a single-column frame (a later cell reads that column as 0).
# NOTE(review): this rebinds `adj_matrices`, so the cell cannot be re-run
# on its own once it has executed.
# adj_matrices = pd.DataFrame(adj_matrices)
adj_matrices = adj_matrices.to_frame()

In [11]:
## TODO: in a DASK framework reset_index is not a recognized function like pandas, fix this bug

# Move testcase_ID out of the index into a regular column.
# adj_matrices = adj_matrices.compute()
adj_matrices = adj_matrices.reset_index(level='testcase_ID')

In [12]:
# Attach each matrix to its testcase by matching on testcase_ID. A plain
# column assignment would align on the index instead, and the two frames do
# not share one: adjacency_df keeps row labels taken from `data`, while
# adj_matrices was renumbered by reset_index. That silently yields NaN or
# pairs a testcase with another testcase's matrix.
adjacency_df['adj_matrix'] = adjacency_df['testcase_ID'].map(
    adj_matrices.set_index('testcase_ID')[0]
)

In [13]:
# Keep only the rows that have no missing value, i.e. testcases for which
# an adjacency matrix was attached.
adj_df = adjacency_df.dropna()

In [14]:
# Persist the testcase ID / adjacency matrix table.
# NOTE(review): the adj_matrix cells hold matrix objects, so the CSV will
# presumably contain their printed text form — confirm the consumer can
# parse that back, or consider a binary format.
adj_df.to_csv("../data/adj_df.csv.gz")

## Feature Matrix

In [9]:
def concretise_ast(node):
    """
    Freeze the children of every node in a clang ast.

    Every time you run .get_children() on a clang ast node, it gives you
    new objects, so modifications made to those objects are lost the next
    time the tree is walked. To avoid this problem, concretise_ast walks
    the tree once, saving the result of .get_children() as a concrete list
    in the .children attribute of each node.

    You can then use .children to consistently walk over the tree, and it
    will give you the same objects each time.

    Parameters
    ----------
    node
        Root of the (sub)tree to concretise; must provide .get_children().

    Returns
    -------
    None
        The tree is modified in place.
    """
    node.children = list(node.get_children())

    # Recurse purely for the side effect; there is no return value to keep.
    for child in node.children:
        concretise_ast(child)

def number_ast_nodes(node, counter=1):
    """
    Given a concretised clang ast, assign each node a unique numerical
    identifier, in pre-order. The number is accessible via the .identifier
    attribute of each node.

    Parameters
    ----------
    node
        Root of the (sub)tree to number.
    counter : int, optional
        Identifier to give to ``node``; descendants get the following
        numbers. Defaults to 1.

    Returns
    -------
    int
        The next unused identifier, i.e. one more than the largest
        identifier assigned inside this subtree.
    """
    node.identifier = counter
    counter += 1

    # Reuse the children stored by concretise_ast. Calling .get_children()
    # again would swap them for fresh objects and discard anything attached
    # to (or pruned from) the concretised tree. Only a node that was never
    # concretised is expanded here.
    if getattr(node, "children", None) is None:
        node.children = list(node.get_children())

    for child in node.children:
        counter = number_ast_nodes(child, counter)

    return counter


def generate_ast_roots(testcase, **kwargs):
    """
    Parse one testcase with clang and return the root of its prepared ast.

    ``testcase`` holds the files/datapoints (as loaded with pandas) that
    belong to one particular testcase. The returned root cursor has been
    concretised and numbered, so it is ready for building the feature
    matrix. Extra keyword arguments are accepted and ignored.
    """
    # Collect (filename, code) for every file of the testcase.
    sources = []
    for row in testcase.itertuples():
        sources.append((row.filename, row.code))

    main_file = find_primary_source_file(testcase)

    # Let clang parse the main file, with all sources supplied from memory.
    clang_index = clang.cindex.Index.create()
    unit = clang_index.parse(path=main_file.filename, unsaved_files=sources)
    root = unit.cursor

    # Freeze the children lists, then give every node its identifier.
    concretise_ast(root)
    number_ast_nodes(root)

    return root

In [212]:
# Parse every testcase once and keep the prepared AST root per testcase_ID.
ast_roots = data.groupby(['testcase_ID']).apply(generate_ast_roots)

In [32]:
# example_node = ast_roots.iloc[0].children[19]
# dir(example_node)

['__class__',
 '__ctypes_from_outparam__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_b_base_',
 '_b_needsfree_',
 '_displayname',
 '_fields_',
 '_kind_id',
 '_objects',
 '_tu',
 'access_specifier',
 'availability',
 'brief_comment',
 'canonical',
 'children',
 'data',
 'displayname',
 'enum_type',
 'enum_value',
 'exception_specification_kind',
 'extent',
 'from_cursor_result',
 'from_location',
 'from_result',
 'get_arguments',
 'get_bitfield_width',
 'get_children',
 'get_definition',
 'get_field_offsetof',
 'get_included_file',
 'get_num_template_arguments',
 'get_template_argument_kind',
 'get_template_argument_type',
 'get_template_argument_unsigned_val

Getting the columns for the feature matrix:

In [221]:
def generate_colnames(ast_root):
    """
    Given a concretised & numbered clang ast, return the set of node kinds
    to be used as columns in the feature matrix.

    Parameters
    ----------
    ast_root
        Root node; every node must provide .kind and .children.

    Returns
    -------
    set of str
        ``str(node.kind)`` for every node in the tree.
    """
    features = set()

    # Explicit stack instead of recursion: visiting order is irrelevant for
    # a set, and deep trees cannot hit the recursion limit.
    pending = [ast_root]
    while pending:
        node = pending.pop()
        features.add(str(node.kind))
        pending.extend(node.children)

    return features


def generate_spelling(ast_root):
    """
    Given a concretised & numbered clang ast, return the set of node
    spellings, to be used later in constructing the columns of the feature
    matrix.

    Parameters
    ----------
    ast_root
        Root node; every node must provide .spelling and .children.

    Returns
    -------
    set
        ``node.spelling`` for every node in the tree.
    """
    spelling = set()

    # Explicit stack instead of recursion: visiting order is irrelevant for
    # a set, and deep trees cannot hit the recursion limit.
    pending = [ast_root]
    while pending:
        node = pending.pop()
        spelling.add(node.spelling)
        pending.extend(node.children)

    return spelling

Creating unique set of node kinds and node spellings

In [222]:
# Per testcase: the set of node kinds and the set of node spellings in its AST.
colnames = ast_roots.apply(generate_colnames)
spelling = ast_roots.apply(generate_spelling)

Obtaining the final set of boolean column names

In [97]:
# Union of the node kinds of all testcases, plus the three hand-made
# property columns. The per-testcase kind sets live in `colnames`; the name
# `feature_sets` used here before is not defined anywhere in the notebook.
# NOTE(review): generate_features_pdf labels nodes 'writeToPointer',
# 'sizeOf' and 'alloca', which differ from the three names below — confirm
# which spelling is intended.
final_colnames = {'WriteToPointer', 'SizeOf', 'Alloc'}
for kinds in colnames:
    final_colnames.update(kinds)

Set of all node spellings

In [223]:
# Merge the per-testcase spelling sets into a single set.
final_spelling = set().union(*spelling)

In [99]:
# Turn the set of column names into a pandas Series.
# NOTE(review): this rebinds `final_colnames` from a set to a Series.
final_colnames = pd.Series(list(final_colnames))

Manually pick out important node spellings 

In [239]:
# Node spellings that generate_features_pdf labels as 'alloca'.
alloc_list = ['__builtin_alloca', 
              '__alloc', 
              'malloc', 
              'valloc', 
              '__alloc_on_copy', 
              '__alloc_on_move', 
              'calloc', 
              'realloc', 
              'alloca',
              'ALLOCA'
             ]

# Node spellings that generate_features_pdf labels as 'sizeOf'.
sizeOf_list = ['std::aligned_storage<sizeof(_Tp), __alignof(_Tp)>'
              ]

# Node spellings that generate_features_pdf labels as 'writeToPointer'.
writeToPointer_list = ['__builtin_memmove', 
                       '__builtin_memcpy', 
                       'wmempcpy', 
                       'wmemmove'
                      ]

In [237]:
# [feature for feature in final_spelling if 'Alloc' in feature]

In [238]:
# pd.get_dummies(final_colnames)

Creating the feature matrix:

In [242]:
def generate_features_pdf(ast_root):
    """
    Given a concretised & numbered clang ast, return a table with one row
    per node, describing whether the node has the properties of
    WriteToPointer, SizeOf or Alloc.

    Parameters
    ----------
    ast_root
        Root node; every node must provide .identifier, .kind, .spelling
        and .children.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``identifier``, rows in pre-order, with the columns:

        kind
            The node's ``.kind`` object (not converted to a string).
        spelling
            ``'writeToPointer'``, ``'sizeOf'`` or ``'alloca'`` when the
            node's spelling appears in ``writeToPointer_list``,
            ``sizeOf_list`` or ``alloc_list`` respectively (checked in
            that order); otherwise the empty string.
    """
    identifiers = []
    kinds = []
    labels = []

    def walk_tree_and_set_properties(node):
        identifiers.append(node.identifier)
        kinds.append(node.kind)

        # Classify the node by its spelling; the first matching list wins.
        name = str(node.spelling)
        if name in writeToPointer_list:
            labels.append('writeToPointer')
        elif name in sizeOf_list:
            labels.append('sizeOf')
        elif name in alloc_list:
            labels.append('alloca')
        else:
            labels.append('')

        for child in node.children:
            walk_tree_and_set_properties(child)

    walk_tree_and_set_properties(ast_root)

    final_df = pd.DataFrame(
        data={'identifier': identifiers, 'kind': kinds, 'spelling': labels}
    )
    return final_df.set_index('identifier')

In [243]:
# eg = generate_features_pdf(ast_roots.iloc[4])
# Preview the feature table for a single testcase (position 4 in ast_roots).
generate_features_pdf(ast_roots.iloc[4])

# '__alloc' in eg.values()

Unnamed: 0_level_0,kind,spelling
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1
1,CursorKind.TRANSLATION_UNIT,
2,CursorKind.FUNCTION_DECL,
3,CursorKind.COMPOUND_STMT,
4,CursorKind.DECL_STMT,
5,CursorKind.VAR_DECL,
6,CursorKind.BINARY_OPERATOR,
7,CursorKind.DECL_REF_EXPR,
8,CursorKind.CSTYLE_CAST_EXPR,
9,CursorKind.CALL_EXPR,alloca
10,CursorKind.UNEXPOSED_EXPR,alloca


TODO: figure out how to build this dataframe for every testcase ID and store the results in one place, then apply this to the rest of the data so it can be converted into matrix format.