# Experimenting with Python Clang Bindings

The goal of this notebook is to load the python clang bindings and the juliet dataset. Then try to get an AST out of some of Juliet's code snippets, can we make sense of any of it?

In [1]:
import clang.cindex

In [2]:
import os
import pandas as pd

In [3]:
# This cell might not be needed for you.
clang.cindex.Config.set_library_file('/lib/x86_64-linux-gnu/libclang-8.so.1')

Load in the juliet data set, and pick the first data point as an example

In [4]:
juliet = pd.read_csv("../data/juliet.csv.zip")

In [4]:
vdisc = pd.read_csv("../data/vdisc_test.csv.gz")

In [5]:
vdisc

Unnamed: 0.1,Unnamed: 0,testcase_ID,flaw_loc,filename,code,CWE-119,CWE-120,CWE-469,CWE-476,CWE-OTHERS,...,CWE-427,CWE-481,CWE-535,CWE-467,CWE-835,CWE-506,CWE-785,CWE-259,CWE-253,CWE-620
0,0,vdisc_testcase_0,,,default_event_handler( \n GuiWidget *widget...,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,vdisc_testcase_1,,,"krb5_krbhst_init_flags(krb5_context context,\n...",False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2,vdisc_testcase_2,,,swap_info_get(swp_entry_t entry)\n{\n\tstruct ...,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,vdisc_testcase_3,,,"parseattribs4(char *&c, const Vec4 &ival = Vec...",False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,vdisc_testcase_4,,,generateExecCode(CompileState* comp)\n{\n g...,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,5,vdisc_testcase_5,,,"use_vfp_abi (enum arm_pcs pcs_variant, bool is...",False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,6,vdisc_testcase_6,,,"hasUsesToReplace(GlobalAlias &GA, const LLVMUs...",False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,7,vdisc_testcase_7,,,load_tile_set( const char *set ) {\n string s...,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,8,vdisc_testcase_8,,,"BufFileSeek(BufFile *file, int fileno, long of...",False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,9,vdisc_testcase_9,,,"sendPlayPacket(rtmp::RTMP& r, FakeNC& nc)\n{\n...",False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
example = vdisc.iloc[0]

In [5]:
example = juliet.iloc[0]

In [7]:
print(example.code)

default_event_handler( 
    GuiWidget *widget, GuiEvent *event )
{
    char *ptr;
    int px, py, i;
    switch ( event->type ) {
        case GUI_DESTROY:
            if ( widget->spec.edit.buffer )
                free( widget->spec.edit.buffer );
            if ( widget->spec.edit.display_buffer )
                free( widget->spec.edit.display_buffer );
            break;
        case GUI_DRAW:
            /* display surface */
            stk_surface_blit( 
                widget->surface, 0, 0, -1, -1, 
                stk_display, 
                widget->screen_region.x,
                widget->screen_region.y );
            /* add text */
            gui_theme->edit_font->align = STK_FONT_ALIGN_LEFT;
            ptr = widget->spec.edit.display_buffer;
            px = widget->screen_region.x + widget->border;
            py = widget->screen_region.y + widget->border + 
                 widget->spec.edit.y_offset;
            for ( i = 0; i < widget->spec.edit.height; 
        

Instantiate the clang parser and give it our example. We use `unsaved_files` to tell it to parse a file that doesn't actually exist on disk.

In [8]:
index = clang.cindex.Index.create()
translation_unit = index.parse(path=example.filename, unsaved_files=[(example.filename, example.code)])




TypeError: expected str, bytes or os.PathLike object, not numpy.float64

`root` is the root note of the AST. Try to explore and figure out what this all means! It's pretty dense ha

In [8]:
root = translation_unit.cursor

I found this nice tutorial that helps to explain how the Python clang bindings canbe used to explore AST's: https://github.com/FraMuCoder/PyClASVi/blob/master/doc/python_clang_usage.md

In [9]:
def print_ast(cursor, deep=0):
    print(' '.join((deep*'    ', str(cursor.kind), str(cursor.spelling))))
    for child in cursor.get_children():
        print_ast(child, deep+1)

print_ast(root)

 CursorKind.TRANSLATION_UNIT 000/061/940/CWE114_Process_Control__w32_char_connect_socket_01.c
     CursorKind.TYPEDEF_DECL _Float32
     CursorKind.TYPEDEF_DECL _Float64
     CursorKind.TYPEDEF_DECL _Float32x
     CursorKind.TYPEDEF_DECL _Float64x
     CursorKind.TYPEDEF_DECL wint_t
     CursorKind.STRUCT_DECL 
         CursorKind.FIELD_DECL __count
         CursorKind.UNION_DECL 
             CursorKind.FIELD_DECL __wch
             CursorKind.FIELD_DECL __wchb
                 CursorKind.INTEGER_LITERAL 
         CursorKind.FIELD_DECL __value
             CursorKind.UNION_DECL 
                 CursorKind.FIELD_DECL __wch
                 CursorKind.FIELD_DECL __wchb
                     CursorKind.INTEGER_LITERAL 
     CursorKind.TYPEDEF_DECL __mbstate_t
         CursorKind.STRUCT_DECL 
             CursorKind.FIELD_DECL __count
             CursorKind.UNION_DECL 
                 CursorKind.FIELD_DECL __wch
                 CursorKind.FIELD_DECL __wchb
                     CursorKi

                                             CursorKind.BINARY_OPERATOR 
                                                 CursorKind.UNEXPOSED_EXPR 
                                                     CursorKind.UNEXPOSED_EXPR 
                                                         CursorKind.PAREN_EXPR 
                                                             CursorKind.DECL_REF_EXPR __bsx
                                                 CursorKind.INTEGER_LITERAL 
                                         CursorKind.INTEGER_LITERAL 
     CursorKind.FUNCTION_DECL __bswap_32
         CursorKind.TYPE_REF __uint32_t
         CursorKind.PARM_DECL __bsx
             CursorKind.TYPE_REF __uint32_t
         CursorKind.COMPOUND_STMT 
             CursorKind.RETURN_STMT 
                 CursorKind.PAREN_EXPR 
                     CursorKind.BINARY_OPERATOR 
                         CursorKind.BINARY_OPERATOR 
                             CursorKind.BINARY_OPERATOR 
                     

         CursorKind.ENUM_CONSTANT_DECL MSG_DONTWAIT
             CursorKind.INTEGER_LITERAL 
         CursorKind.ENUM_CONSTANT_DECL MSG_EOR
             CursorKind.INTEGER_LITERAL 
         CursorKind.ENUM_CONSTANT_DECL MSG_WAITALL
             CursorKind.INTEGER_LITERAL 
         CursorKind.ENUM_CONSTANT_DECL MSG_FIN
             CursorKind.INTEGER_LITERAL 
         CursorKind.ENUM_CONSTANT_DECL MSG_SYN
             CursorKind.INTEGER_LITERAL 
         CursorKind.ENUM_CONSTANT_DECL MSG_CONFIRM
             CursorKind.INTEGER_LITERAL 
         CursorKind.ENUM_CONSTANT_DECL MSG_RST
             CursorKind.INTEGER_LITERAL 
         CursorKind.ENUM_CONSTANT_DECL MSG_ERRQUEUE
             CursorKind.INTEGER_LITERAL 
         CursorKind.ENUM_CONSTANT_DECL MSG_NOSIGNAL
             CursorKind.INTEGER_LITERAL 
         CursorKind.ENUM_CONSTANT_DECL MSG_MORE
             CursorKind.INTEGER_LITERAL 
         CursorKind.ENUM_CONSTANT_DECL MSG_WAITFORONE
             CursorKind.INTEGER_LITERAL 
 

         CursorKind.TYPE_REF __useconds_t
     CursorKind.TYPEDEF_DECL intptr_t
         CursorKind.TYPE_REF __intptr_t
     CursorKind.FUNCTION_DECL access
         CursorKind.UNEXPOSED_ATTR 
         CursorKind.UNEXPOSED_ATTR 
         CursorKind.PARM_DECL __name
         CursorKind.PARM_DECL __type
     CursorKind.FUNCTION_DECL faccessat
         CursorKind.UNEXPOSED_ATTR 
         CursorKind.UNEXPOSED_ATTR 
         CursorKind.PARM_DECL __fd
         CursorKind.PARM_DECL __file
         CursorKind.PARM_DECL __type
         CursorKind.PARM_DECL __flag
     CursorKind.FUNCTION_DECL lseek
         CursorKind.UNEXPOSED_ATTR 
         CursorKind.TYPE_REF __off_t
         CursorKind.PARM_DECL __fd
         CursorKind.PARM_DECL __offset
             CursorKind.TYPE_REF __off_t
         CursorKind.PARM_DECL __whence
     CursorKind.FUNCTION_DECL close
         CursorKind.PARM_DECL __fd
     CursorKind.FUNCTION_DECL read
         CursorKind.TYPE_REF ssize_t
         CursorKind.PARM_DECL __f

         CursorKind.UNEXPOSED_ATTR 
         CursorKind.TYPE_REF __pid_t
     CursorKind.FUNCTION_DECL getpgrp
         CursorKind.UNEXPOSED_ATTR 
         CursorKind.TYPE_REF __pid_t
     CursorKind.FUNCTION_DECL __getpgid
         CursorKind.UNEXPOSED_ATTR 
         CursorKind.TYPE_REF __pid_t
         CursorKind.PARM_DECL __pid
             CursorKind.TYPE_REF __pid_t
     CursorKind.FUNCTION_DECL getpgid
         CursorKind.UNEXPOSED_ATTR 
         CursorKind.TYPE_REF __pid_t
         CursorKind.PARM_DECL __pid
             CursorKind.TYPE_REF __pid_t
     CursorKind.FUNCTION_DECL setpgid
         CursorKind.UNEXPOSED_ATTR 
         CursorKind.PARM_DECL __pid
             CursorKind.TYPE_REF __pid_t
         CursorKind.PARM_DECL __pgid
             CursorKind.TYPE_REF __pid_t
     CursorKind.FUNCTION_DECL setpgrp
         CursorKind.UNEXPOSED_ATTR 
     CursorKind.FUNCTION_DECL setsid
         CursorKind.UNEXPOSED_ATTR 
         CursorKind.TYPE_REF __pid_t
     CursorKind.FUNCTION_

In [10]:
import snap

In [11]:
identifier = 1

def number_each_node(node):
    global identifier
    
    node.identifier = identifier
    identifier += 1
    
    node.children = list(node.get_children())
    for child in node.children:
        number_each_node(child)
        
number_each_node(root)

In [12]:
tree = snap.TNGraph.New()

def tree2edgelist(node):
    tree.AddNode(node.identifier)
    
    for child in node.children:
        tree2edgelist(child)
        tree.AddEdge(node.identifier, child.identifier)
        
tree2edgelist(root)

In [13]:
snap.SaveEdgeList(tree, '../data/mygraph.txt')

In [14]:
with open('../data/mygraph.txt') as f:
    print(f.read())

# Directed graph: ../data/mygraph.txt 
# Nodes: 2538 Edges: 2537
# FromNodeId	ToNodeId
1	2
1	3
1	4
1	5
1	6
1	7
1	18
1	30
1	32
1	33
1	35
1	36
1	38
1	48
1	50
1	52
1	53
1	56
1	59
1	62
1	65
1	71
1	78
1	82
1	87
1	93
1	100
1	104
1	106
1	112
1	114
1	117
1	120
1	123
1	126
1	129
1	132
1	135
1	137
1	140
1	143
1	146
1	152
1	154
1	156
1	158
1	162
1	166
1	171
1	173
1	175
1	177
1	179
1	181
1	183
1	185
1	187
1	191
1	195
1	199
1	204
1	209
1	214
1	219
1	221
1	223
1	228
1	233
1	237
1	239
1	244
1	249
1	252
1	258
1	262
1	264
1	268
1	273
1	276
1	281
1	286
1	289
1	294
1	300
1	304
1	310
1	314
1	318
1	320
1	325
1	330
1	333
1	334
1	338
1	344
1	346
1	347
1	348
1	349
1	350
1	351
1	352
1	353
1	354
1	355
1	356
1	357
1	358
1	360
1	362
1	364
1	366
1	368
1	370
1	372
1	374
1	375
1	376
1	377
1	378
1	379
1	380
1	381
1	382
1	383
1	384
1	385
1	386
1	387
1	388
1	391
1	395
1	396
1	397
1	398
1	399
1	400
1	401
1	402
1	403
1	404
1	405
1	406
1	407
1	408
1	409
1	410
1	411
1	412
1	413
1	414
1	415
1	416
1	417
1	419
1	420
1	421
1	4

In [15]:
def print_node_identifiers(node):
    print(str(node.kind) + ' ' + str(node.identifier))
    for child in node.children:
        print_ast(child) 

In [16]:
print_node_identifiers(root)

CursorKind.TRANSLATION_UNIT 1
 CursorKind.TYPEDEF_DECL _Float32
 CursorKind.TYPEDEF_DECL _Float64
 CursorKind.TYPEDEF_DECL _Float32x
 CursorKind.TYPEDEF_DECL _Float64x
 CursorKind.TYPEDEF_DECL wint_t
 CursorKind.STRUCT_DECL 
     CursorKind.FIELD_DECL __count
     CursorKind.UNION_DECL 
         CursorKind.FIELD_DECL __wch
         CursorKind.FIELD_DECL __wchb
             CursorKind.INTEGER_LITERAL 
     CursorKind.FIELD_DECL __value
         CursorKind.UNION_DECL 
             CursorKind.FIELD_DECL __wch
             CursorKind.FIELD_DECL __wchb
                 CursorKind.INTEGER_LITERAL 
 CursorKind.TYPEDEF_DECL __mbstate_t
     CursorKind.STRUCT_DECL 
         CursorKind.FIELD_DECL __count
         CursorKind.UNION_DECL 
             CursorKind.FIELD_DECL __wch
             CursorKind.FIELD_DECL __wchb
                 CursorKind.INTEGER_LITERAL 
         CursorKind.FIELD_DECL __value
             CursorKind.UNION_DECL 
                 CursorKind.FIELD_DECL __wch
                

         CursorKind.INTEGER_LITERAL 
     CursorKind.FIELD_DECL __align
 CursorKind.TYPEDEF_DECL pthread_barrierattr_t
     CursorKind.UNION_DECL 
         CursorKind.FIELD_DECL __size
             CursorKind.INTEGER_LITERAL 
         CursorKind.FIELD_DECL __align
 CursorKind.STRUCT_DECL iovec
     CursorKind.FIELD_DECL iov_base
     CursorKind.FIELD_DECL iov_len
 CursorKind.TYPEDEF_DECL socklen_t
     CursorKind.TYPE_REF __socklen_t
 CursorKind.ENUM_DECL __socket_type
     CursorKind.ENUM_CONSTANT_DECL SOCK_STREAM
         CursorKind.INTEGER_LITERAL 
     CursorKind.ENUM_CONSTANT_DECL SOCK_DGRAM
         CursorKind.INTEGER_LITERAL 
     CursorKind.ENUM_CONSTANT_DECL SOCK_RAW
         CursorKind.INTEGER_LITERAL 
     CursorKind.ENUM_CONSTANT_DECL SOCK_RDM
         CursorKind.INTEGER_LITERAL 
     CursorKind.ENUM_CONSTANT_DECL SOCK_SEQPACKET
         CursorKind.INTEGER_LITERAL 
     CursorKind.ENUM_CONSTANT_DECL SOCK_DCCP
         CursorKind.INTEGER_LITERAL 
     CursorKind.ENUM_CONSTAN

     CursorKind.UNEXPOSED_ATTR 
     CursorKind.UNEXPOSED_ATTR 
     CursorKind.PARM_DECL __path
     CursorKind.PARM_DECL __name
 CursorKind.FUNCTION_DECL fpathconf
     CursorKind.UNEXPOSED_ATTR 
     CursorKind.PARM_DECL __fd
     CursorKind.PARM_DECL __name
 CursorKind.FUNCTION_DECL sysconf
     CursorKind.UNEXPOSED_ATTR 
     CursorKind.PARM_DECL __name
 CursorKind.FUNCTION_DECL confstr
     CursorKind.UNEXPOSED_ATTR 
 CursorKind.FUNCTION_DECL getpid
     CursorKind.UNEXPOSED_ATTR 
     CursorKind.TYPE_REF __pid_t
 CursorKind.FUNCTION_DECL getppid
     CursorKind.UNEXPOSED_ATTR 
     CursorKind.TYPE_REF __pid_t
 CursorKind.FUNCTION_DECL getpgrp
     CursorKind.UNEXPOSED_ATTR 
     CursorKind.TYPE_REF __pid_t
 CursorKind.FUNCTION_DECL __getpgid
     CursorKind.UNEXPOSED_ATTR 
     CursorKind.TYPE_REF __pid_t
     CursorKind.PARM_DECL __pid
         CursorKind.TYPE_REF __pid_t
 CursorKind.FUNCTION_DECL getpgid
     CursorKind.UNEXPOSED_ATTR 
     CursorKind.TYPE_REF __pid_t
     Cur

Below works in bash:

In [1]:
!python /node2vec/src/main.py --input ../data/mygraph.txt --output ../data/myvec.emd

Traceback (most recent call last):
  File "/node2vec/src/main.py", line 13, in <module>
    import numpy as np
ImportError: No module named numpy
