In [1]:
import xml.etree.ElementTree as ET
import copy
import string
import gc, sys
from hurry.filesize import size

from types import ModuleType, FunctionType
from gc import get_referents

# Types whose referents are never followed while measuring: classes (every
# custom object points at its class), modules, and functions (which seem to
# reach far too much, modules included).
BLACKLIST = type, ModuleType, FunctionType


def getsize(obj):
    """Return the summed ``sys.getsizeof`` of ``obj`` and everything reachable from it.

    Reachability is followed through ``gc.get_referents``. Each object is
    counted once (tracked by ``id``), and instances of the BLACKLIST types are
    neither counted nor traversed.

    Raises:
        TypeError: if ``obj`` itself is an instance of a BLACKLIST type.
    """
    if isinstance(obj, BLACKLIST):
        raise TypeError('getsize() does not take argument of type: '+ str(type(obj)))

    counted_ids = set()
    total = 0
    frontier = [obj]
    while frontier:
        newly_counted = []
        for candidate in frontier:
            if isinstance(candidate, BLACKLIST):
                continue
            key = id(candidate)
            if key in counted_ids:
                continue
            counted_ids.add(key)
            total += sys.getsizeof(candidate)
            newly_counted.append(candidate)
        # Next level: everything the objects counted in this round refer to.
        frontier = get_referents(*newly_counted)
    return total

def isLeaf(node):
    """Return True when iterating ``node`` yields no child at all."""
    for _child in node:
        return False
    return True

def hasToken(node):
    """Return True when ``node.text`` holds something other than whitespace."""
    text = node.text
    return text is not None and text.strip() != ''

# Attribute name used to flag nodes that a traversal has already handled.
_VISITED_ATTR = 'visited'


def nodeIsVisited(node):
    """Return True when ``node`` carries the visited flag."""
    return node.get(_VISITED_ATTR) is not None


def visitNode(node):
    """Flag ``node`` as visited by setting its visited attribute to True."""
    node.set(_VISITED_ATTR, True)

# Index of every distinct tag name seen so far, in order of first appearance.
global_tag_id = {}
# Highest sequence number handed out so far for each tag name. Callers clear()
# this dict to restart the numbering.
tag_id = {}

def getTag(node):
    """Return the label ``'<tag>:<n>'`` for ``node``.

    ``<tag>`` is ``node.tag`` with the srcML namespace prefix removed. ``<n>``
    is a per-tag sequence number starting at 0. It is assigned the first time a
    node is labelled and stored on the node as the ``'tgid'`` attribute, so
    labelling the same node again returns the same label even if other nodes
    with the same tag were numbered in between.

    Side effects: may set ``node``'s ``'tgid'`` attribute and updates the
    module-level ``tag_id`` and ``global_tag_id`` dicts.
    """
    tag = node.tag.replace('{http://www.srcML.org/srcML/src}', '')

    # Number the node once; afterwards the node's own id is authoritative.
    own_id = node.get('tgid')
    if own_id is None:
        own_id = tag_id.get(tag, -1) + 1
        tag_id[tag] = own_id
        node.set('tgid', own_id)

    # Register the tag name itself the first time it is seen.
    if tag not in global_tag_id:
        global_tag_id[tag] = len(global_tag_id)

    return f'{tag}:{own_id}'

def isPunctuation(token):
    """Return True when ``token`` occurs as a substring of ``string.punctuation``.

    This is a substring test, not a per-character test: a single punctuation
    character matches, as does a run that appears contiguously in
    ``string.punctuation`` (for example ``'()'``), and so does the empty string.
    """
    return string.punctuation.find(token) != -1

def isComment(node):
    """Return True when ``node``'s tag, without the srcML namespace, is 'comment'."""
    return node.tag.replace('{http://www.srcML.org/srcML/src}', '') == 'comment'

def dfsPathKeep(node, keepPunctuation = False, keepComment=False):
    """Walk the subtree under ``node`` depth-first and collect its text tokens.

    Children are visited in document order. For every node whose ``text`` is
    not blank, an entry ``{'path': [...], 'token': <stripped text>}`` is
    recorded, where ``path`` is the list of ``getTag`` labels from ``node``
    down to the node that owns the text.

    Args:
        node: root of the subtree to walk.
        keepPunctuation: when False, tokens for which ``isPunctuation`` is
            true are left out.
        keepComment: when False, tokens owned by a comment node are left out.

    Returns:
        The list of recorded entries, in visiting order.

    Side effects: every node reached is flagged through ``visitNode`` and
    labelled through ``getTag``. Nodes that are already flagged are skipped,
    so walking the same subtree twice yields an empty list the second time.
    """
    path_stack = []
    stack = [node]
    visit_list = []
    # One counter per ancestor that still has unfinished children.
    visited_child_count_list = []

    while stack:
        currNode = stack.pop()
        if nodeIsVisited(currNode):
            continue

        path_stack.append(getTag(currNode))
        if hasToken(currNode):
            token = currNode.text.strip()
            drop_comment = isComment(currNode) and not keepComment
            drop_punctuation = isPunctuation(token) and not keepPunctuation
            # Decide once whether to keep the token. Appending first and then
            # removing per filter raised ValueError when both filters matched.
            if not (drop_comment or drop_punctuation):
                visit_list.append({
                    'path': list(path_stack),
                    'token': token
                })

        visitNode(currNode)

        # Push children reversed so they are popped in document order.
        visited_child_count = 0
        for child in reversed(list(currNode)):
            if not nodeIsVisited(child):
                stack.append(child)
                visited_child_count += 1

        if visited_child_count == 0:
            # Finished a leaf: drop its label and tell the parent.
            if visited_child_count_list:
                visited_child_count_list[-1] -= 1
                path_stack.pop()
            # Unwind every ancestor whose children are now all finished.
            while visited_child_count_list and visited_child_count_list[-1] == 0:
                visited_child_count_list.pop()
                if visited_child_count_list:
                    visited_child_count_list[-1] -= 1
                    path_stack.pop()
        else:
            visited_child_count_list.append(visited_child_count)

    del visited_child_count_list
    gc.collect()

    return visit_list

def parseXML(xml_path, *args, **kwargs):
    """Parse the XML file at ``xml_path`` and print its token/path list.

    Extra arguments are forwarded unchanged to ``dfsPathKeep``.
    """
    document_root = ET.parse(xml_path).getroot()
    token_paths = dfsPathKeep(document_root, *args, **kwargs)
    print(token_paths)
    
    
# Demo run: print the token/path entries extracted from one sample srcML file.
# The commented-out variants below exercise other files and the keep* flags.
parseXML('srcxml/juliet.java.0.test.xml')
# parseXML('srcxml/helloworld.java.xml')
# parseXML('srcxml/helloworld.java.xml', keepComment=True)
# parseXML('srcxml/helloworld.java.xml', keepPunctuation=True)
# print("-----")
# parseXML('srcxml/juliet.java.0.test.xml', keepPunctuation=True)

def printMethod(xml_path, *args, **kwargs):
    """Print every srcML ``function`` element found below the root of ``xml_path``.

    Extra positional and keyword arguments are accepted but not used.
    """
    document_root = ET.parse(xml_path).getroot()
    function_elements = document_root.findall('.//{http://www.srcML.org/srcML/src}function')
    for function_element in function_elements:
        print(function_element)
        
# printMethod('srcxml/helloworld.java.xml')

[{'path': ['unit:0', 'package:0'], 'token': 'package'}, {'path': ['unit:0', 'package:0', 'name:0', 'name:1'], 'token': 'testcases'}, {'path': ['unit:0', 'package:0', 'name:0', 'name:2'], 'token': 'CWE23_Relative_Path_Traversal'}, {'path': ['unit:0', 'import:0'], 'token': 'import'}, {'path': ['unit:0', 'import:0', 'name:3', 'name:4'], 'token': 'testcasesupport'}, {'path': ['unit:0', 'package:1'], 'token': 'package'}, {'path': ['unit:0', 'package:1', 'name:6', 'name:7'], 'token': 'testcases'}, {'path': ['unit:0', 'package:1', 'name:6', 'name:8'], 'token': 'CWE23_Relative_Path_Traversal'}]


In [2]:
def token_generalize(t):
    """Return ``t`` lower-cased with all ASCII punctuation and whitespace removed."""
    unwanted = string.punctuation + string.whitespace
    stripped = t.translate(str.maketrans('', '', unwanted))
    return stripped.lower()

# token_id = 0
# token_id_map = {}
# token_frequecy_map = {}

# dep_id = 0
# dep_id_map = {}
# dep_frequecy_map = {}

def extract_dep_path(srcT, srcP, desT, desP, rs_obj, max_dep_len = None):
    """Build the tag path that leads from the source token to the destination token.

    ``srcP`` and ``desP`` are lists of ``'<tag>:<id>'`` labels. The labels
    present in both lists are treated as shared ancestors, and the last of them
    is the turning point. The result climbs from the source (``tag↑`` per
    step), passes the turning point (``-tag-``) and descends to the
    destination (``tag↓`` per step), with the numeric ids stripped.

    When ``max_dep_len`` is None or the path has at most ``max_dep_len`` steps,
    the path is registered in ``rs_obj['dep_id_map']`` (new id taken from
    ``rs_obj['dep_id']``) and counted in ``rs_obj['dep_frequecy_map']``.

    Returns:
        ``(number_of_steps, srcT, desT, path_string)``.

    Raises:
        IndexError: if the two label lists have no label in common.
    """
    shared = [label for label in srcP if label in desP]
    turning_point = shared[-1]

    # Source side is walked bottom-up, destination side top-down.
    up_steps = [label.split(":")[0] + '↑' for label in reversed(srcP) if label not in shared]
    down_steps = [label.split(":")[0] + '↓' for label in desP if label not in shared]

    steps = up_steps + ['-' + turning_point.split(":")[0] + '-'] + down_steps
    dep_len = len(steps)
    dep_path = ''.join(steps)

    within_limit = max_dep_len is None or dep_len <= max_dep_len
    if within_limit:
        known_ids = rs_obj['dep_id_map']
        if dep_path not in known_ids:
            known_ids[dep_path] = rs_obj['dep_id']
            rs_obj['dep_id'] += 1
        counts = rs_obj['dep_frequecy_map']
        counts[dep_path] = counts.get(dep_path, 0) + 1

    return (dep_len, srcT, desT, dep_path)
    

def handle_node(node, xml_path, rs_obj, *args, window_size = None, max_dep_len = None, **kwargs):
    """Turn the subtree under ``node`` into one "sentence" of dependency triples.

    The tokens come from ``dfsPathKeep(node, *args, **kwargs)`` and are
    normalised with ``token_generalize``. Every non-empty normalised token is
    registered in ``rs_obj['token_id_map']`` / ``rs_obj['token_frequecy_map']``.
    Each token is then paired with its neighbours, and for every pair whose
    path (from ``extract_dep_path``) is within ``max_dep_len`` a triple
    ``(source_token_id, destination_token_id, dep_path_id)`` is recorded.

    Args:
        node: root element of the subtree to process.
        xml_path: label appended to ``rs_obj['xml_function_map']`` for this
            sentence.
        rs_obj: result accumulator dict; updated in place.
        window_size: None pairs a token with every other token. Otherwise a
            token is paired with at most ``window_size`` tokens before it and
            ``window_size`` tokens after it. Tokens that normalise to the
            empty string produce no pair but still occupy a window position.
        max_dep_len: None for no limit, otherwise the maximum number of path
            steps for a pair to be recorded.

    Side effects: appends one list of triples to ``rs_obj['sentence_triples']``
    and ``xml_path`` to ``rs_obj['xml_function_map']``.
    """
    visit_list = dfsPathKeep(node, *args, **kwargs)

    # Normalise each token once instead of once per pair.
    tokens = [token_generalize(v['token']) for v in visit_list]

    token_id_map = rs_obj['token_id_map']
    token_frequecy_map = rs_obj['token_frequecy_map']
    for token in tokens:
        if token == '':
            continue
        if token not in token_id_map:
            token_id_map[token] = rs_obj['token_id']
            rs_obj['token_id'] += 1
        token_frequecy_map[token] = token_frequecy_map.get(token, 0) + 1

    dep_triples = []
    token_count = len(visit_list)

    for i in range(token_count):
        t1_token = tokens[i]
        if t1_token == '':
            continue
        t1_path = visit_list[i]['path']

        if window_size is None:
            first, stop = 0, token_count
        else:
            # Same reach on both sides: window_size tokens before and after i.
            first = max(i - window_size, 0)
            stop = min(i + 1 + window_size, token_count)

        # Preceding neighbours first, then following ones.
        for k in range(first, stop):
            if k == i:
                continue
            t2_token = tokens[k]
            if t2_token == '':
                continue

            dep_len, srcT, desT, t1_2_t2_path = extract_dep_path(
                t1_token, t1_path, t2_token, visit_list[k]['path'], rs_obj, max_dep_len=max_dep_len
            )
            if max_dep_len is None or dep_len <= max_dep_len:
                dep_triples.append((token_id_map[srcT], token_id_map[desT], rs_obj['dep_id_map'][t1_2_t2_path]))

    del visit_list
    gc.collect()

    rs_obj['sentence_triples'].append(dep_triples)

    rs_obj['xml_function_map'].append(xml_path)

def parseMethodXML(xml_path, rs_obj, *args, window_size = None, max_dep_len = None, **kwargs):
    """Process every srcML ``function`` element below the root of ``xml_path``.

    The per-tag numbering in ``tag_id`` is reset first. Each function element
    is then handed to ``handle_node`` together with ``rs_obj``, the window and
    path-length limits, and any extra arguments.

    Returns:
        ``rs_obj``, the same dict that was passed in.
    """
    tag_id.clear()
    document_root = ET.parse(xml_path).getroot()

    limits = dict(window_size = window_size, max_dep_len = max_dep_len)
    for function_element in document_root.findall('.//{http://www.srcML.org/srcML/src}function'):
        handle_node(function_element, xml_path, rs_obj, *args, **limits, **kwargs)

    return rs_obj

def parseMethodXMLForDraper(record, rs_obj, *args, window_size = None, max_dep_len = None, **kwargs):
    """Process one ``(part, id, xml)`` record whose XML is given as a string.

    The per-tag numbering in ``tag_id`` is reset, the XML string is parsed, and
    its root is handed to ``handle_node`` labelled ``'<part>#<id>'``. The
    parsed tree is cleared afterwards.

    Returns:
        ``rs_obj``, the same dict that was passed in.
    """
    tag_id.clear()
    part, record_id, xml_text = record
    source_label = f'{part}#{record_id}'

    root = ET.fromstring(xml_text)
    handle_node(
        root, source_label, rs_obj, *args,
        window_size = window_size, max_dep_len = max_dep_len, **kwargs
    )

    # Release the parsed tree before moving on to the next record.
    root.clear()
    del root
    gc.collect()

    return rs_obj
            

def printData(rs_obj):
    """Print the triples of the first sentence in ``rs_obj['sentence_triples']``.

    Output is two blank lines, a separator, the heading 'Sentence Triple', and
    then one line per triple as ``source destination path``.

    Raises:
        IndexError: if ``rs_obj['sentence_triples']`` is empty.
    """
    for header_line in ('', '', '----------', 'Sentence Triple'):
        if header_line:
            print(header_line)
        else:
            print()

    first_sentence = rs_obj['sentence_triples'][0]
    for srct, dest, path in first_sentence:
        print(srct, dest, path)
        
# parseMethodXML('srcxml/helloworld.java.xml')
# Demo run on a single file: start from an empty accumulator (id counters at 0,
# empty maps and lists), process every function in the file, then print the
# triples of the first sentence.
rs_obj = parseMethodXML('srcxml/juliet/juliet.java.76.xml', rs_obj = dict(
    token_id = 0,
    token_id_map = {},
    token_frequecy_map = {},
    dep_id = 0,
    dep_id_map = {},
    dep_frequecy_map = {},
    sentence_triples = [],
    xml_function_map = []
))

printData(rs_obj)




----------
Sentence Triple
0 1 0
0 2 1
0 3 2
0 4 3
0 5 2
0 6 3
0 7 2
0 8 3
0 9 4
0 10 5
0 11 6
0 4 7
0 12 8
0 13 9
0 13 10
0 14 11
0 13 12
0 15 13
0 13 14
0 4 15
0 16 16
0 17 17
0 18 18
0 8 19
0 19 19
0 20 20
0 21 21
0 22 22
0 8 23
0 23 23
0 4 24
0 22 25
1 0 26
1 2 27
1 3 28
1 4 29
1 5 28
1 6 29
1 7 28
1 8 29
1 9 30
1 10 31
1 11 32
1 4 33
1 12 34
1 13 35
1 13 36
1 14 37
1 13 38
1 15 39
1 13 40
1 4 41
1 16 42
1 17 43
1 18 44
1 8 45
1 19 45
1 20 46
1 21 47
1 22 48
1 8 49
1 23 49
1 4 50
1 22 51
2 0 52
2 1 53
2 3 54
2 4 55
2 5 54
2 6 55
2 7 54
2 8 55
2 9 56
2 10 57
2 11 58
2 4 59
2 12 60
2 13 61
2 13 62
2 14 63
2 13 64
2 15 65
2 13 66
2 4 67
2 16 68
2 17 69
2 18 70
2 8 71
2 19 71
2 20 72
2 21 73
2 22 74
2 8 75
2 23 75
2 4 76
2 22 77
3 0 78
3 1 79
3 2 80
3 4 81
3 5 82
3 6 83
3 7 82
3 8 83
3 9 84
3 10 85
3 11 86
3 4 87
3 12 88
3 13 89
3 13 90
3 14 91
3 13 92
3 15 93
3 13 94
3 4 95
3 16 96
3 17 97
3 18 98
3 8 99
3 19 99
3 20 100
3 21 101
3 22 102
3 8 103
3 23 103
3 4 104
3 22 105
4 0 106
4 

In [21]:
import os
from tqdm import tqdm

def getWordGCNDataForDataSet(data_set_name, *args, **kwargs):
    """Process every ``.xml`` file under ``srcxml/<data_set_name>`` into one accumulator.

    Files are handled in ascending order of the integer found in the
    second-to-last dot-separated field of the file name (for example ``76`` in
    ``juliet.java.76.xml``). Each file is passed to ``parseMethodXML`` together
    with a shared, initially empty accumulator and any extra arguments. The
    progress bar shows the accumulator's size every tenth file.

    Returns:
        The accumulator dict filled by ``parseMethodXML``.

    Raises:
        ValueError: if an ``.xml`` file name has no integer in that field.
    """
    data_set_xml_root_path = os.path.join('srcxml', data_set_name)

    rs_obj = dict(
        token_id = 0,
        token_id_map = {},
        token_frequecy_map = {},
        dep_id = 0,
        dep_id_map = {},
        dep_frequecy_map = {},
        sentence_triples = [],
        xml_function_map = []
    )

    ls = [f for f in os.listdir(data_set_xml_root_path) if f.endswith('.xml')]
    ls = sorted(ls, key=lambda a:int(a.split('.')[-2]))
    pbar = tqdm(ls)
    for i, file_name in enumerate(pbar):
        xml_path = os.path.join(data_set_xml_root_path, file_name)
        # rs_obj goes in positionally so that extra positionals in *args land
        # on the parameters after it instead of colliding with rs_obj.
        parseMethodXML(xml_path, rs_obj, *args, **kwargs)
        if i % 10 == 0:
            pbar.set_postfix({'Sise of rs_obj': size(getsize(rs_obj))})

    return rs_obj

import pandas as pd

def _feedDraperFrame(frame, desc, rs_obj, args, kwargs):
    """Run ``parseMethodXMLForDraper`` over every row of ``frame`` with a progress bar titled ``desc``."""
    pbar = tqdm(range(frame.shape[0]), desc=desc)
    for row in pbar:
        record = [frame['part'].iloc[row], frame['id'].iloc[row], frame['xml'].iloc[row]]
        # rs_obj goes in positionally so that extra positionals in args land
        # on the parameters after it instead of colliding with rs_obj.
        parseMethodXMLForDraper(record, rs_obj, *args, **kwargs)
        if row % 100 == 0:
            pbar.set_postfix({'Sise of rs_obj': size(getsize(rs_obj))})


def getWordGCNDataForDataSetForDrapper(*args, **kwargs):
    """Process the Draper CSV files under ``srcxml/draper`` into one accumulator.

    Reads ``draper_xml_test.csv``, ``draper_xml_validate.csv`` and
    ``draper_xml_train_1.csv`` to ``draper_xml_train_8.csv``, then feeds their
    rows (columns ``part``, ``id``, ``xml``) to ``parseMethodXMLForDraper`` in
    that order. Extra arguments are forwarded to every call. Each progress bar
    shows the accumulator's size every hundredth row.

    Returns:
        The accumulator dict filled by ``parseMethodXMLForDraper``.
    """
    draper_xml_test_csv = pd.read_csv(os.path.join('srcxml', 'draper', 'draper_xml_test.csv'))
    draper_xml_validate_csv = pd.read_csv(os.path.join('srcxml', 'draper', 'draper_xml_validate.csv'))
    draper_xml_train_csvs = [pd.read_csv(os.path.join('srcxml', 'draper', f'draper_xml_train_{i}.csv')) for i in [1,2,3,4,5,6,7,8]]

    rs_obj = dict(
        token_id = 0,
        token_id_map = {},
        token_frequecy_map = {},
        dep_id = 0,
        dep_id_map = {},
        dep_frequecy_map = {},
        sentence_triples = [],
        xml_function_map = []
    )

    _feedDraperFrame(draper_xml_test_csv, "Test CSV", rs_obj, args, kwargs)
    _feedDraperFrame(draper_xml_validate_csv, "Vali CSV", rs_obj, args, kwargs)
    for part_no, draper_xml_train_csv in enumerate(draper_xml_train_csvs, start=1):
        _feedDraperFrame(draper_xml_train_csv, f"Train{part_no} CSV", rs_obj, args, kwargs)

    return rs_obj

In [4]:
def outputData(data_set_name, rs_obj):
    """Write the accumulator ``rs_obj`` to text files in ``srcxml/<data_set_name>_out``.

    The directory is created if needed. Every line ends with ``\\r\\n``. Files:

    * ``voc2id.txt``: ``token<TAB>id``
    * ``id2feq.txt``: ``token_id<TAB>frequency``
    * ``de2id.txt``: ``dep_path<TAB>id``
    * ``de2feq.txt``: ``frequency<TAB><TAB>dep_path``
    * ``data.txt``: one line per sentence, holding the number of distinct
      tokens, the number of triples, the distinct token ids, and then each
      triple as ``src|dst|dep_id``, where ``src`` and ``dst`` are positions in
      that line's token-id list
    * ``data_xml_map.txt``: the source label of each sentence, in the same
      order as ``data.txt``
    """
    out_dir = os.path.join('srcxml', f'{data_set_name}_out')
    os.makedirs(out_dir, exist_ok=True)

    def open_out(file_name):
        # UTF-8 because the dependency-path strings built by extract_dep_path
        # contain the non-ASCII arrows. newline='' writes the explicit '\r\n'
        # as-is instead of letting text mode translate the '\n' again.
        return open(os.path.join(out_dir, file_name), 'w', encoding='utf-8', newline='')

    with open_out('voc2id.txt') as f:
        for _tk, _id in rs_obj['token_id_map'].items():
            f.write(f'{_tk}\t{_id}\r\n')

    with open_out('id2feq.txt') as f:
        for _tk, _feq in rs_obj['token_frequecy_map'].items():
            _id = rs_obj['token_id_map'][_tk]
            f.write(f'{_id}\t{_feq}\r\n')

    with open_out('de2id.txt') as f:
        for _dep, _id in rs_obj['dep_id_map'].items():
            f.write(f'{_dep}\t{_id}\r\n')

    with open_out('de2feq.txt') as f:
        for _dep, _feq in rs_obj['dep_frequecy_map'].items():
            f.write(f'{_feq}\t\t{_dep}\r\n')

    with open_out('data.txt') as f:
        for sentence_triples in rs_obj['sentence_triples']:
            tk_set = set()
            for srcT_id, desT_id, _dep_path_id in sentence_triples:
                tk_set.add(srcT_id)
                tk_set.add(desT_id)

            tk_id_list = list(tk_set)
            # Position lookup built once per sentence instead of a linear
            # list.index() search for every triple.
            position = {tk_id: pos for pos, tk_id in enumerate(tk_id_list)}
            dep_path_list = [
                f"{position[srcT_id]}|{position[desT_id]}|{dep_path_id}"
                for srcT_id, desT_id, dep_path_id in sentence_triples
            ]

            f.write(f'{len(tk_id_list)} ')
            f.write(f'{len(dep_path_list)} ')
            f.write(' '.join([str(_id) for _id in tk_id_list]))
            f.write(' ')
            f.write(' '.join(dep_path_list))
            f.write('\r\n')

    with open_out('data_xml_map.txt') as f:
        for xml_path in rs_obj['xml_function_map']:
            f.write(f'{xml_path}\r\n')

In [8]:
# Build the dataset from the XML files under srcxml/juliet with no window and
# no path-length limit, then write the result files to srcxml/juliet_out.
rs_obj = getWordGCNDataForDataSet('juliet', window_size = None, max_dep_len = None)
outputData('juliet', rs_obj)

100%|██████████| 115/115 [00:21<00:00,  5.29it/s, Sise of rs_obj=39M]


In [None]:
# Same pipeline for the XML files under srcxml/owasp, using the default limits
# (none); results go to srcxml/owasp_out.
rs_obj = getWordGCNDataForDataSet('owasp')
outputData('owasp', rs_obj)

In [None]:
# Draper CSV data: pair each token only with neighbours inside a window of 5
# and keep only paths of at most 10 steps; results go to srcxml/draper_out.
rs_obj = getWordGCNDataForDataSetForDrapper(window_size =5, max_dep_len = 10)
outputData('draper', rs_obj)