In [1]:
import xml.etree.cElementTree as et
import networkx as nx
import matplotlib.pyplot as plt
import json
import subprocess
import time
from os import path
import glob
import pandas as pd
import re

The following cells extract graphs from the adb files and save them as JSON files.

In [2]:
def get_rtl_hash_table(root, resources=None):
    """
    Build a lookup table from RTL component name to its resource usage.

    param:
        root: the root element of the parsed adb (XML) file
        resources: optional collection of resource names to keep; when it is
            omitted the module-level ``res_considered`` list is used
    return:
        rtl_table: dict mapping each RTL name (with a trailing " (...)" suffix
            removed) to a dict {resource name: resource count as text}.
            RTL entries that report none of the requested resources are
            left out of the table.
    """
    if resources is None:
        resources = res_considered

    # Raw string: '\(' inside a plain literal is an invalid escape sequence.
    suffix_pattern = re.compile(r' \(.*\)')
    rtl_table = {}

    for rtl_item in root.findall('*/res/*/item'):
        rtl_name = rtl_item.find('first').text
        # NOTE(review): keys are stored with the " (...)" suffix stripped while
        # this check uses the raw name, so a repeated suffixed name overwrites
        # the earlier entry instead of being skipped -- confirm the intent.
        if rtl_name in rtl_table:
            continue

        res_table = {}
        for res in rtl_item.find('second').iter('item'):
            try:
                res_name = res.findall('first')[0].text
                res_num = res.findall('second')[0].text
            except IndexError:
                # An item without a first/second pair ends the scan of this RTL.
                break
            if res_name in resources:
                res_table[res_name] = res_num

        if res_table:
            rtl_table[suffix_pattern.sub('', rtl_name)] = res_table
    return rtl_table

In [3]:
### parse adb files into graphs (in json)
# Resource names kept per RTL component: get_rtl_hash_table filters on this
# list, and parse_xml_into_graph_single initializes the same three node
# attributes to '0' before filling in the reported values.
res_considered = ['FF', 'LUT', 'DSP']

def _first_text(element, xpath):
    """Return the text of the first match of ``xpath`` under ``element`` (IndexError if none)."""
    return element.findall(xpath)[0].text


def _set_operation_attrs(G, node_name, node):
    """Copy the attributes of one cdfg 'nodes' item onto graph node ``node_name``, with resources set to '0'."""
    attrs = G.nodes[node_name]
    attrs['node_name'] = node_name
    attrs['category'] = 'nodes'
    attrs['bitwidth'] = _first_text(node, '*/bitwidth')
    for tag in ('opcode', 'm_Display', 'm_isOnCriticalPath', 'm_isStartOfPath',
                'm_delay', 'm_topoIndex', 'm_isLCDNode', 'm_clusterGroupNumber'):
        attrs[tag] = _first_text(node, tag)
    attrs['type'] = _first_text(node, '*/*/type')
    # Every operation node carries the three resource features; default is '0'.
    for resource in ('LUT', 'FF', 'DSP'):
        attrs[resource] = '0'


def parse_xml_into_graph_single(xml_file):
    """
    Parse one adb (XML) file into a directed graph.

    The graph is built from the cdfg 'edges' section; node attributes are then
    filled in from the 'nodes', 'blocks' and 'ports' sections. Nodes listed in
    'consts' are removed after their id and bitwidth are recorded on their
    successors as 'const' / 'const-bitwidth'.

    param:
        xml_file: path (or file object) of the adb file to parse
    return:
        G: a networkx DiGraph whose nodes and edges carry string attributes
    """
    prefix = ''
    G = nx.DiGraph()
    root = et.parse(xml_file).getroot()
    cdfg = root.findall('*/cdfg')[0]

    # rtl hash table: RTL name -> {resource name: count}
    rtl_res_table = get_rtl_hash_table(root)

    ### find edges and build the graph
    for edges in cdfg.iter('edges'):
        for edge in edges.iter('item'):
            source = edge.find('source_obj').text
            sink = edge.find('sink_obj').text
            edge_attrs = {
                'edge_name': prefix + edge.find('id').text,
                'is_back_edge': edge.find('is_back_edge').text,
                'edge_type': edge.find('edge_type').text,
            }
            G.add_edge(prefix + source, prefix + sink, **edge_attrs)

    ### add node attributes
    for nodes in cdfg.iter('nodes'):
        for node in nodes.findall('item'):
            node_name = prefix + _first_text(node, '*/*/id')

            if node_name not in G:
                # Operations that appear in no edge are dropped, except 'ret',
                # which is added as an isolated node without an RTL lookup.
                if _first_text(node, 'opcode') == 'ret':
                    G.add_node(node_name)
                    _set_operation_attrs(G, node_name, node)
                continue

            _set_operation_attrs(G, node_name, node)
            # Overwrite the '0' defaults when the node's rtlName has resource info.
            t_rtlname = _first_text(node, '*/*/rtlName')
            if t_rtlname is not None and t_rtlname in rtl_res_table:
                G.nodes[node_name].update(rtl_res_table[t_rtlname])

    ## blocks are for control signals
    for nodes in cdfg.iter('blocks'):
        for node in nodes.findall('item'):
            node_name = prefix + _first_text(node, '*/id')
            if node_name not in G:
                continue
            attrs = G.nodes[node_name]
            attrs['node_name'] = node_name
            attrs['category'] = 'blocks'
            attrs['type'] = _first_text(node, '*/type')

    ## ports are function arguments
    for nodes in cdfg.iter('ports'):
        for node in nodes.findall('item'):
            node_name = prefix + _first_text(node, '*/*/id')
            if node_name not in G:
                continue
            attrs = G.nodes[node_name]
            attrs['node_name'] = node_name
            attrs['category'] = 'ports'
            attrs['type'] = _first_text(node, '*/*/type')
            attrs['bitwidth'] = _first_text(node, '*/bitwidth')
            attrs['direction'] = _first_text(node, 'direction')
            attrs['if_type'] = _first_text(node, 'if_type')
            attrs['array_size'] = _first_text(node, 'array_size')

    ## no need to keep consts as nodes in the graph
    ## remove to reduce the graph size
    for nodes in cdfg.iter('consts'):
        for node in nodes.findall('item'):
            node_name = prefix + _first_text(node, '*/*/id')
            if node_name not in G:
                continue
            # Record the constant on each successor before dropping the node.
            for v in G.neighbors(node_name):
                G.nodes[v]['const'] = node_name
                G.nodes[v]['const-bitwidth'] = _first_text(node, '*/bitwidth')
            G.remove_node(node_name)

    return G

In [4]:
### save one graph into json
def json_save(G, fname):
    """
    Save one graph as JSON in ``fname + '.json'``.

    The file holds a dict with 'nodes' (list of [node, attribute dict]) and
    'edges' (list of [source, sink, attribute dict]).
    """
    # Build the payload before opening the file so a failure here does not
    # leave a truncated file behind.
    G_dict = dict(nodes=[[n, G.nodes[n]] for n in G.nodes()],
                  edges=[(e[0], e[1], G.edges[e]) for e in G.edges()])
    # The context manager closes the file even if json.dump raises.
    with open(fname + '.json', 'w') as f:
        json.dump(G_dict, f)

In [5]:
### save the graphs into json
def json_save_graphs(Gs, fname):
    """
    Save several graphs as one JSON file ``fname + '.json'``.

    The nodes and edges of every graph in ``Gs`` are concatenated, in order,
    into a single dict with 'nodes' (list of [node, attribute dict]) and
    'edges' (list of [source, sink, attribute dict]).
    """
    G_dict = {'nodes': [], 'edges': []}
    for G in Gs:
        G_dict['nodes'].extend([n, G.nodes[n]] for n in G.nodes())
        G_dict['edges'].extend((e[0], e[1], G.edges[e]) for e in G.edges())
    # The context manager closes the file even if json.dump raises.
    with open(fname + '.json', 'w') as f:
        json.dump(G_dict, f)

In [6]:
### read the actual resource
def get_real_perf(fname):
    """
    Read the measured results stored in ``fname + '.json'``.

    return:
        the tuple (DSP, LUT, CP, FF, SLICE) taken from the keys of the same
        names in the JSON object; a missing key raises KeyError.
    """
    # The context manager closes the file even if json.load raises.
    with open(fname + '.json', 'r') as f:
        d = json.load(f)
    return d['DSP'], d['LUT'], d['CP'], d['FF'], d['SLICE']

In [7]:
# Benchmark suite to process: keep exactly one result_dir/graph_dir pair active.
# result_dir: sub-directory of real_case/ that receives the graph JSON files.
# graph_dir:  sub-directory of real_case/ (with trailing '/') holding the adb files.
# NOTE(review): a later cell reassigns result_dir WITH a trailing '/', so this
# cell must be re-run before re-running the extraction cells.
#result_dir='MachSuite'
#graph_dir='MachSuite_adb/'
#result_dir='PolyBench'
#graph_dir='PolyBench_adb/'
result_dir='CHStone'
graph_dir='CHStone_adb/'

In [8]:
### get subgraphs in one application
# graph_mapping: application name -> list of the "<function>.adb" name parts
# of its adb files (file names are expected to look like "<app>-<function>.adb").
graph_mapping = dict()
for adb_file in glob.glob('real_case/' + graph_dir + '*.adb'):
    # basename() does not depend on the directory depth or path separator.
    file_name = path.basename(adb_file)
    # Split on the first '-' only, so function names may themselves contain '-'.
    # A file name without any '-' still raises ValueError here.
    fname, func_name = file_name.split('-', 1)
    graph_mapping.setdefault(fname, []).append(func_name)

In [9]:
def check_max_node_id(node_string):
    """Return the largest node id in ``node_string`` after converting each entry with int()."""
    return max([int(node_id) for node_id in node_string])

In [10]:
### The final stage of saving graphs into json files
for fname in graph_mapping:
    graph_num = len(graph_mapping[fname])
    # sorted(): glob order is filesystem dependent; sorting keeps the id
    # offsets of the merged graphs reproducible between runs.
    adb_files = sorted(glob.glob('real_case/' + graph_dir + fname + '-*'))
    max_id = 0
    # if multiple functions
    if graph_num > 1:
        G = []
        for adb_file in adb_files:
            g = parse_xml_into_graph_single(adb_file)
            # Relabel all nodes in ONE call. Relabelling one node at a time
            # merges two nodes whenever a shifted id equals the id of a node
            # that has not been shifted yet, and copies the graph per node.
            mapping = {n: str(int(n) + max_id) for n in g.nodes}
            g = nx.relabel_nodes(g, mapping)
            # NOTE(review): the 'node_name' and 'const' attributes still hold
            # the ids from before the shift -- confirm nothing downstream
            # relies on them matching the node keys.
            G.append(g)
            max_id = check_max_node_id(g.nodes)+1
            print(fname, max_id)
        json_save_graphs(G, 'real_case/' + result_dir + '/' + fname)

    else:
        for adb_file in adb_files:
            g = parse_xml_into_graph_single(adb_file)
            json_save(g, 'real_case/' + result_dir + '/' + fname)
            print(fname)


float64_add 277
float64_add 605
float64_add 634
adpcm_main 1816
adpcm_main 4302
adpcm_main 4522
mips
aes_main 535
aes_main 710
aes_main 769
aes_main 1019
aes_main 1391
aes_main 1926
aes_main 2298
blowfish_main 276
blowfish_main 958
local_sin 277
local_sin 605
local_sin 1033
local_sin 1126
local_sin 1466
float64_mul
Gsm_LPC_Analysis 47
Gsm_LPC_Analysis 355
Gsm_LPC_Analysis 579
sha_stream 2504
sha_stream 2904
float64_div


The following cells convert the saved graphs into the dataset format.

In [43]:
### original features (no rtl information)
# Vocabulary of every categorical feature; a feature value is encoded as its
# index in the matching list.
allowable_features = {
    # ---- node features ----
    'node_category': ['nodes', 'blocks', 'ports', 'misc'],
    'bitwidth': [*range(256), 'misc'],
    'opcode_category': [
        'terminator', 'binary_unary', 'bitwise', 'conversion',
        'memory', 'aggregate', 'other', 'misc',
    ],
    'possible_opcode_list': [
        # terminator
        'br', 'ret', 'switch',
        # binary / unary arithmetic
        'add', 'dadd', 'fadd', 'sub', 'dsub', 'fsub', 'mul', 'dmul', 'fmul',
        'udiv', 'ddiv', 'fdiv', 'sdiv', 'urem', 'srem', 'frem', 'dexp', 'dsqrt',
        # bitwise
        'shl', 'lshr', 'ashr', 'and', 'xor', 'or',
        # conversion
        'uitofp', 'sitofp', 'uitodp', 'sitodp', 'bitconcatenate', 'bitcast',
        'zext', 'sext', 'fpext', 'trunc', 'fptrunc',
        # aggregate
        'extractvalue', 'insertvalue',
        # memory
        'alloca', 'load', 'store', 'read', 'write', 'getelementptr',
        # other
        'phi', 'call', 'icmp', 'dcmp', 'fcmp', 'select', 'bitselect',
        'partselect', 'mux', 'dacc',
        'misc',
    ],
    'possible_is_start_of_path': [0, 1, 'misc'],
    'possible_is_LCDnode': [0, 1, 'misc'],
    'possible_cluster_group_num': [-1, *range(256), 'misc'],

    # ---- edge features ----
    'possible_edge_type_list': [1, 2, 3, 'misc'],
    'possible_is_back_edge': [0, 1],
}

def safe_index(l, e):
    """
    Return index of element e in list l. If e is not present, return the last index
    """
    try:
        return l.index(e)
    except ValueError:
        # Only "not found" maps to the fallback slot; any other error is a
        # real problem and must not be hidden.
        return len(l) - 1

def opcode_type(opcode):
    """
    Return the category name of ``opcode``.

    Opcodes that belong to none of the known groups map to 'misc' instead of
    failing on an unassigned local variable.
    """
    categories = (
        ('terminator', {'br', 'ret', 'switch'}),
        ('binary_unary', {'add', 'dadd', 'fadd', 'sub', 'dsub', 'fsub', 'mul', 'dmul', 'fmul', 'udiv', 'ddiv', 'fdiv', 'sdiv', 'urem', 'srem', 'frem', 'dexp', 'dsqrt'}),
        ('bitwise', {'shl', 'lshr', 'ashr', 'and', 'xor', 'or'}),
        ('conversion', {'uitofp', 'sitofp', 'uitodp', 'sitodp', 'bitconcatenate', 'bitcast', 'zext', 'sext', 'fpext', 'trunc', 'fptrunc'}),
        ('memory', {'alloca', 'load', 'store', 'read', 'write', 'getelementptr'}),
        ('aggregate', {'extractvalue', 'insertvalue'}),
        ('other', {'phi', 'call', 'icmp', 'dcmp', 'fcmp', 'select', 'bitselect', 'partselect', 'mux', 'dacc'}),
    )
    for category, members in categories:
        if opcode in members:
            return category
    return 'misc'


def node_to_feature_vector(node):
    """
    Converts node object to feature list of indices

    An empty node, a node without a 'category' entry, or a node with an
    unrecognized category gets the 'misc' (last) index for every feature.
    Ports encode category and bitwidth; blocks encode the category only.
    :return: list
    """
    feature_keys = (
        'node_category', 'bitwidth', 'opcode_category', 'possible_opcode_list',
        'possible_is_start_of_path', 'possible_is_LCDnode', 'possible_cluster_group_num',
    )
    # Start from the all-'misc' vector and overwrite the known positions.
    node_feature = [len(allowable_features[key]) - 1 for key in feature_keys]

    category = node.get('category')
    if category == 'nodes':
        node_feature = [
                safe_index(allowable_features['node_category'], category),
                safe_index(allowable_features['bitwidth'], int(node['bitwidth'])),
                safe_index(allowable_features['opcode_category'], opcode_type(node['opcode'])),
                safe_index(allowable_features['possible_opcode_list'], node['opcode']),
                safe_index(allowable_features['possible_is_start_of_path'], int(node['m_isStartOfPath'])),
                safe_index(allowable_features['possible_is_LCDnode'], int(node['m_isLCDNode'])),
                safe_index(allowable_features['possible_cluster_group_num'], int(node['m_clusterGroupNumber'])),
                ]
    elif category == 'ports':
        node_feature[0] = safe_index(allowable_features['node_category'], category)
        node_feature[1] = safe_index(allowable_features['bitwidth'], int(node['bitwidth']))
    elif category == 'blocks':
        node_feature[0] = safe_index(allowable_features['node_category'], category)
    return node_feature

def get_node_feature_dims():
    """Return the vocabulary size of each node feature, in feature-vector order."""
    ordered_keys = (
        'node_category',
        'bitwidth',
        'opcode_category',
        'possible_opcode_list',
        'possible_is_start_of_path',
        'possible_is_LCDnode',
        'possible_cluster_group_num',
    )
    return [len(allowable_features[key]) for key in ordered_keys]


def edge_to_feature_vector(edge):
    """
    Converts edge to feature list of indices: [edge type, back-edge flag].
    An unlisted edge type takes the last ('misc') index; a back-edge flag
    other than 0/1 raises ValueError.
    :return: list
    """
    edge_type_idx = safe_index(allowable_features['possible_edge_type_list'], int(edge['edge_type']))
    back_edge_idx = allowable_features['possible_is_back_edge'].index(int(edge['is_back_edge']))
    return [edge_type_idx, back_edge_idx]

def get_edge_feature_dims():
    """Return the vocabulary size of each edge feature, in feature-vector order."""
    ordered_keys = ('possible_edge_type_list', 'possible_is_back_edge')
    return [len(allowable_features[key]) for key in ordered_keys]

In [57]:
### features for binary rtl resource

# Vocabulary of every categorical feature; a feature value is encoded as its
# index in the matching list. In this variant LUT/DSP/FF are binary flags.
allowable_features = {
    # ---- node features ----
    'node_category': ['nodes', 'blocks', 'ports', 'misc'],
    'bitwidth': [*range(256), 'misc'],
    'opcode_category': [
        'terminator', 'binary_unary', 'bitwise', 'conversion',
        'memory', 'aggregate', 'other', 'misc',
    ],
    'possible_opcode_list': [
        # terminator
        'br', 'ret', 'switch',
        # binary / unary arithmetic
        'add', 'dadd', 'fadd', 'sub', 'dsub', 'fsub', 'mul', 'dmul', 'fmul',
        'udiv', 'ddiv', 'fdiv', 'sdiv', 'urem', 'srem', 'frem', 'dexp', 'dsqrt',
        # bitwise
        'shl', 'lshr', 'ashr', 'and', 'xor', 'or',
        # conversion
        'uitofp', 'sitofp', 'uitodp', 'sitodp', 'bitconcatenate', 'bitcast',
        'zext', 'sext', 'fpext', 'trunc', 'fptrunc',
        # aggregate
        'extractvalue', 'insertvalue',
        # memory
        'alloca', 'load', 'store', 'read', 'write', 'getelementptr',
        # other
        'phi', 'call', 'icmp', 'dcmp', 'fcmp', 'select', 'bitselect',
        'partselect', 'mux', 'dacc',
        'misc',
    ],
    'possible_is_start_of_path': [0, 1, 'misc'],
    'possible_is_LCDnode': [0, 1, 'misc'],
    'possible_cluster_group_num': [-1, *range(256), 'misc'],
    'LUT': [0, 1, 'misc'],
    'DSP': [0, 1, 'misc'],
    'FF': [0, 1, 'misc'],

    # ---- edge features ----
    'possible_edge_type_list': [1, 2, 3, 'misc'],
    'possible_is_back_edge': [0, 1],
}

def safe_index(l, e):
    """
    Return index of element e in list l. If e is not present, return the last index
    """
    try:
        return l.index(e)
    except ValueError:
        # Only "not found" maps to the fallback slot; any other error is a
        # real problem and must not be hidden.
        return len(l) - 1

def opcode_type(opcode):
    """
    Return the category name of ``opcode``.

    Opcodes that belong to none of the known groups map to 'misc' instead of
    failing on an unassigned local variable.
    """
    categories = (
        ('terminator', {'br', 'ret', 'switch'}),
        ('binary_unary', {'add', 'dadd', 'fadd', 'sub', 'dsub', 'fsub', 'mul', 'dmul', 'fmul', 'udiv', 'ddiv', 'fdiv', 'sdiv', 'urem', 'srem', 'frem', 'dexp', 'dsqrt'}),
        ('bitwise', {'shl', 'lshr', 'ashr', 'and', 'xor', 'or'}),
        ('conversion', {'uitofp', 'sitofp', 'uitodp', 'sitodp', 'bitconcatenate', 'bitcast', 'zext', 'sext', 'fpext', 'trunc', 'fptrunc'}),
        ('memory', {'alloca', 'load', 'store', 'read', 'write', 'getelementptr'}),
        ('aggregate', {'extractvalue', 'insertvalue'}),
        ('other', {'phi', 'call', 'icmp', 'dcmp', 'fcmp', 'select', 'bitselect', 'partselect', 'mux', 'dacc'}),
    )
    for category, members in categories:
        if opcode in members:
            return category
    return 'misc'

def res_type(res_num):
    """Return 1 when ``res_num`` is greater than zero, otherwise 0."""
    return 1 if res_num > 0 else 0


def node_to_feature_vector(node):
    """
    Converts node object to feature list of indices

    An empty node, a node without a 'category' entry, or a node with an
    unrecognized category gets the 'misc' (last) index for every feature.
    Ports encode category and bitwidth; blocks encode the category only.
    For operation nodes LUT/DSP/FF are encoded as binary "uses any" flags.
    :return: list
    """
    feature_keys = (
        'node_category', 'bitwidth', 'opcode_category', 'possible_opcode_list',
        'possible_is_start_of_path', 'possible_is_LCDnode', 'possible_cluster_group_num',
        'LUT', 'DSP', 'FF',
    )
    # Start from the all-'misc' vector and overwrite the known positions.
    node_feature = [len(allowable_features[key]) - 1 for key in feature_keys]

    category = node.get('category')
    if category == 'nodes':
        node_feature = [
                safe_index(allowable_features['node_category'], category),
                safe_index(allowable_features['bitwidth'], int(node['bitwidth'])),
                safe_index(allowable_features['opcode_category'], opcode_type(node['opcode'])),
                safe_index(allowable_features['possible_opcode_list'], node['opcode']),
                safe_index(allowable_features['possible_is_start_of_path'], int(node['m_isStartOfPath'])),
                safe_index(allowable_features['possible_is_LCDnode'], int(node['m_isLCDNode'])),
                safe_index(allowable_features['possible_cluster_group_num'], int(node['m_clusterGroupNumber'])),
                safe_index(allowable_features['LUT'], res_type(int(node['LUT']))),
                safe_index(allowable_features['DSP'], res_type(int(node['DSP']))),
                safe_index(allowable_features['FF'], res_type(int(node['FF'])))
                ]
    elif category == 'ports':
        node_feature[0] = safe_index(allowable_features['node_category'], category)
        node_feature[1] = safe_index(allowable_features['bitwidth'], int(node['bitwidth']))
    elif category == 'blocks':
        node_feature[0] = safe_index(allowable_features['node_category'], category)
    return node_feature

def get_node_feature_dims():
    """Return the vocabulary size of each node feature, in feature-vector order."""
    ordered_keys = (
        'node_category',
        'bitwidth',
        'opcode_category',
        'possible_opcode_list',
        'possible_is_start_of_path',
        'possible_is_LCDnode',
        'possible_cluster_group_num',
        'LUT',
        'DSP',
        'FF',
    )
    return [len(allowable_features[key]) for key in ordered_keys]


def edge_to_feature_vector(edge):
    """
    Converts edge to feature list of indices: [edge type, back-edge flag].
    An unlisted edge type takes the last ('misc') index; a back-edge flag
    other than 0/1 raises ValueError.
    :return: list
    """
    edge_type_idx = safe_index(allowable_features['possible_edge_type_list'], int(edge['edge_type']))
    back_edge_idx = allowable_features['possible_is_back_edge'].index(int(edge['is_back_edge']))
    return [edge_type_idx, back_edge_idx]

def get_edge_feature_dims():
    """Return the vocabulary size of each edge feature, in feature-vector order."""
    ordered_keys = ('possible_edge_type_list', 'possible_is_back_edge')
    return [len(allowable_features[key]) for key in ordered_keys]


In [11]:
### features for numerical rtl resource

# Vocabulary of every categorical feature; a feature value is encoded as its
# index in the matching list. In this variant LUT/DSP/FF are numeric counts.
allowable_features = {
    # ---- node features ----
    'node_category': ['nodes', 'blocks', 'ports', 'misc'],
    'bitwidth': [*range(256), 'misc'],
    'opcode_category': [
        'terminator', 'binary_unary', 'bitwise', 'conversion',
        'memory', 'aggregate', 'other', 'misc',
    ],
    'possible_opcode_list': [
        # terminator
        'br', 'ret', 'switch',
        # binary / unary arithmetic
        'add', 'dadd', 'fadd', 'sub', 'dsub', 'fsub', 'mul', 'dmul', 'fmul',
        'udiv', 'ddiv', 'fdiv', 'sdiv', 'urem', 'srem', 'frem', 'dexp', 'dsqrt',
        # bitwise
        'shl', 'lshr', 'ashr', 'and', 'xor', 'or',
        # conversion
        'uitofp', 'sitofp', 'uitodp', 'sitodp', 'bitconcatenate', 'bitcast',
        'zext', 'sext', 'fpext', 'trunc', 'fptrunc',
        # aggregate
        'extractvalue', 'insertvalue',
        # memory
        'alloca', 'load', 'store', 'read', 'write', 'getelementptr',
        # other
        'phi', 'call', 'icmp', 'dcmp', 'fcmp', 'select', 'bitselect',
        'partselect', 'mux', 'dacc',
        'misc',
    ],
    'possible_is_start_of_path': [0, 1, 'misc'],
    'possible_is_LCDnode': [0, 1, 'misc'],
    'possible_cluster_group_num': [-1, *range(256), 'misc'],
    'LUT': [*range(1000), 'misc'],
    'DSP': [*range(11), 'misc'],
    'FF': [*range(1000), 'misc'],

    # ---- edge features ----
    'possible_edge_type_list': [1, 2, 3, 'misc'],
    'possible_is_back_edge': [0, 1],
}

def safe_index(l, e):
    """
    Return index of element e in list l. If e is not present, return the last index
    """
    try:
        return l.index(e)
    except ValueError:
        # Only "not found" maps to the fallback slot; any other error is a
        # real problem and must not be hidden.
        return len(l) - 1

def opcode_type(opcode):
    """
    Return the category name of ``opcode``.

    Opcodes that belong to none of the known groups map to 'misc' instead of
    failing on an unassigned local variable.
    """
    categories = (
        ('terminator', {'br', 'ret', 'switch'}),
        ('binary_unary', {'add', 'dadd', 'fadd', 'sub', 'dsub', 'fsub', 'mul', 'dmul', 'fmul', 'udiv', 'ddiv', 'fdiv', 'sdiv', 'urem', 'srem', 'frem', 'dexp', 'dsqrt'}),
        ('bitwise', {'shl', 'lshr', 'ashr', 'and', 'xor', 'or'}),
        ('conversion', {'uitofp', 'sitofp', 'uitodp', 'sitodp', 'bitconcatenate', 'bitcast', 'zext', 'sext', 'fpext', 'trunc', 'fptrunc'}),
        ('memory', {'alloca', 'load', 'store', 'read', 'write', 'getelementptr'}),
        ('aggregate', {'extractvalue', 'insertvalue'}),
        ('other', {'phi', 'call', 'icmp', 'dcmp', 'fcmp', 'select', 'bitselect', 'partselect', 'mux', 'dacc'}),
    )
    for category, members in categories:
        if opcode in members:
            return category
    return 'misc'



def node_to_feature_vector(node):
    """
    Converts node object to feature list of indices

    An empty node, a node without a 'category' entry, or a node with an
    unrecognized category gets the 'misc' (last) index for every feature.
    Ports encode category and bitwidth; blocks encode the category only.
    For operation nodes LUT/DSP/FF are encoded by their integer count.
    :return: list
    """
    feature_keys = (
        'node_category', 'bitwidth', 'opcode_category', 'possible_opcode_list',
        'possible_is_start_of_path', 'possible_is_LCDnode', 'possible_cluster_group_num',
        'LUT', 'DSP', 'FF',
    )
    # Start from the all-'misc' vector and overwrite the known positions.
    node_feature = [len(allowable_features[key]) - 1 for key in feature_keys]

    category = node.get('category')
    if category == 'nodes':
        node_feature = [
                safe_index(allowable_features['node_category'], category),
                safe_index(allowable_features['bitwidth'], int(node['bitwidth'])),
                safe_index(allowable_features['opcode_category'], opcode_type(node['opcode'])),
                safe_index(allowable_features['possible_opcode_list'], node['opcode']),
                safe_index(allowable_features['possible_is_start_of_path'], int(node['m_isStartOfPath'])),
                safe_index(allowable_features['possible_is_LCDnode'], int(node['m_isLCDNode'])),
                safe_index(allowable_features['possible_cluster_group_num'], int(node['m_clusterGroupNumber'])),
                safe_index(allowable_features['LUT'], int(node['LUT'])),
                safe_index(allowable_features['DSP'], int(node['DSP'])),
                safe_index(allowable_features['FF'], int(node['FF']))
                ]
    elif category == 'ports':
        node_feature[0] = safe_index(allowable_features['node_category'], category)
        node_feature[1] = safe_index(allowable_features['bitwidth'], int(node['bitwidth']))
    elif category == 'blocks':
        node_feature[0] = safe_index(allowable_features['node_category'], category)
    return node_feature

def get_node_feature_dims():
    """Return the vocabulary size of each node feature, in feature-vector order."""
    ordered_keys = (
        'node_category',
        'bitwidth',
        'opcode_category',
        'possible_opcode_list',
        'possible_is_start_of_path',
        'possible_is_LCDnode',
        'possible_cluster_group_num',
        'LUT',
        'DSP',
        'FF',
    )
    return [len(allowable_features[key]) for key in ordered_keys]


def edge_to_feature_vector(edge):
    """
    Converts edge to feature list of indices: [edge type, back-edge flag].
    An unlisted edge type takes the last ('misc') index; a back-edge flag
    other than 0/1 raises ValueError.
    :return: list
    """
    edge_type_idx = safe_index(allowable_features['possible_edge_type_list'], int(edge['edge_type']))
    back_edge_idx = allowable_features['possible_is_back_edge'].index(int(edge['is_back_edge']))
    return [edge_type_idx, back_edge_idx]

def get_edge_feature_dims():
    """Return the vocabulary size of each edge feature, in feature-vector order."""
    ordered_keys = ('possible_edge_type_list', 'possible_is_back_edge')
    return [len(allowable_features[key]) for key in ordered_keys]


In [12]:
# Benchmark suite to convert: keep exactly one result_dir/prefix/save_dir triple active.
# result_dir: sub-directory of real_case/ (WITH trailing '/') holding the graph JSON files.
# prefix:     file-name prefix of the measured-result JSON files in that directory.
# save_dir:   output directory. NOTE(review): the "save graphs into csv files"
#             cell reassigns save_dir, so this value looks unused -- confirm.
#result_dir='PolyBench/'
#prefix='polybench_'
#save_dir='real_case/PB_ds/'
result_dir='CHStone/'
prefix='chstone_'
save_dir='real_case/CH_ds/'
#result_dir='MachSuite/'
#prefix='MachSuite_'
#save_dir='real_case/MS_ds/'

In [13]:
### graphs in json transformed into csv format
# Accumulators for the whole benchmark. Per-graph entries (mapping, sizes,
# labels) get one element per graph; node/edge entries are concatenated over
# all graphs, with num_node_list / num_edge_list recording where each graph
# starts and ends.
graph_mapping_list = []
num_node_list = []
num_edge_list = []

DSP = []
LUT = []
CP = []
FF = []
SLICE = []

node_feat = []
edge_list = []
edge_feat = []

# NOTE: glob returns matches in arbitrary (filesystem) order; the order seen
# here is the graph order of every list above and of mapping.csv.
for perf_file in glob.glob('real_case/' + result_dir + prefix + '*.json'):
    # Take the last path component instead of unpacking a fixed number of
    # '/'-separated parts, so the directory depth does not matter.
    file_name = perf_file.rsplit('/', 1)[-1]
    # The glob guarantees the name starts with `prefix`; cut only that leading
    # occurrence (str.replace would also remove later occurrences).
    graph_name = file_name[len(prefix):]
    print(graph_name)

    # Context manager closes the file even if json.load raises.
    with open('real_case/' + result_dir + graph_name, 'r') as f:
        d = json.load(f)
    nodes = d['nodes']
    edges = d['edges']

    num_node_list.append(len(nodes))
    num_edge_list.append(len(edges))
    graph_mapping_list.append(result_dir + graph_name)

    # get_real_perf receives the prefixed path without its trailing '.json'
    # (only the extension is removed, never an inner '.json').
    dsp, lut, cp, ff, Slice = get_real_perf(perf_file[:-len('.json')])
    # LUT and FF are stored divided by 1000 and SLICE divided by 200;
    # DSP and CP are stored as returned.
    DSP.append(dsp)
    LUT.append(lut/1000)
    CP.append(cp)
    FF.append(ff/1000)
    SLICE.append(Slice/200)

    # Map each node name to its position inside this graph; indices restart
    # at 0 for every graph. A repeated name keeps its first position.
    node_index_map = dict()
    for index, n in enumerate(nodes):
        node_index_map.setdefault(n[0], index)
        node_feat.append(node_to_feature_vector(n[1]))

    # Edges are stored as [source_index, sink_index] using the per-graph
    # indices above; an endpoint missing from `nodes` raises KeyError.
    for e in edges:
        source = node_index_map[e[0]]
        sink = node_index_map[e[1]]
        edge_list.append([source, sink])
        edge_feat.append(edge_to_feature_vector(e[2]))
 

Gsm_LPC_Analysis.json
local_sin.json
float64_div.json
mips.json
aes_main.json
sha_stream.json
float64_mul.json
adpcm_main.json
blowfish_main.json
float64_add.json


In [14]:
### save graphs into csv files

ds_dir = 'CHStone_ds'                # 'MachSuite_ds', 'CHStone_ds', PolyBench_ds
# Directory that receives the csv files of the currently selected benchmark;
# each benchmark is written to its own directory at this stage.
save_dir = 'real_case/' + ds_dir + '/'

# One row per graph: source file plus its labels.
mapping = pd.DataFrame({'orignal code':graph_mapping_list , 'DSP' : DSP , 'LUT' : LUT, 'CP' : CP, 'FF' : FF, 'SLICE' : SLICE})

# Graph sizes.
NODE_num = pd.DataFrame(num_node_list)  # number of nodes in each graph
EDGE_num = pd.DataFrame(num_edge_list)  # number of edges in each graph

# Node / edge tables concatenated over all graphs.
NODE = pd.DataFrame(node_feat)          # node features
EDGE_list = pd.DataFrame(edge_list)     # edge (source, end)
EDGE_feat = pd.DataFrame(edge_feat)     # edge features

# Per-graph labels, one single-column table per metric.
graph_label_dsp = pd.DataFrame(DSP)
graph_label_lut = pd.DataFrame(LUT)
graph_label_cp = pd.DataFrame(CP)
graph_label_ff = pd.DataFrame(FF)
graph_label_slice = pd.DataFrame(SLICE)  # built here but not written to disk below

# mapping.csv is the only file that keeps its header row.
mapping.to_csv(save_dir + 'mapping.csv', index = False)

# Every other table is written without index and without header.
headerless_tables = [
    ('num-node-list.csv', NODE_num),
    ('node-feat.csv', NODE),
    ('num-edge-list.csv', EDGE_num),
    ('edge.csv', EDGE_list),
    ('edge-feat.csv', EDGE_feat),
    ('graph-label-dsp.csv', graph_label_dsp),
    ('graph-label-lut.csv', graph_label_lut),
    ('graph-label-cp.csv', graph_label_cp),
    ('graph-label-ff.csv', graph_label_ff),
]
for csv_name, frame in headerless_tables:
    frame.to_csv(save_dir + csv_name, index = False, header = False)

The following cells merge the real-case benchmarks with the synthetic CDFG dataset.

In [15]:
### merge all three real case with synthetic cdfg

def _csv_rows(csv_path, with_header=False):
    """Load one csv file and return its rows as a list of lists.

    with_header=True lets pandas take the column names from the first line;
    otherwise every line of the file is treated as data.
    """
    if with_header:
        table = pd.read_csv(csv_path)
    else:
        table = pd.read_csv(csv_path, header = None)
    return table.values.tolist()


# read synthetic cdfg
syn_dir = 'real_case/cdfg/'
mapping_0 = _csv_rows(syn_dir + 'mapping.csv', with_header=True)
edge_feat_0 = _csv_rows(syn_dir + 'edge-feat.csv')
edge_0 = _csv_rows(syn_dir + 'edge.csv')
node_0 = _csv_rows(syn_dir + 'node-feat.csv')
node_num_0 = _csv_rows(syn_dir + 'num-node-list.csv')
edge_num_0 = _csv_rows(syn_dir + 'num-edge-list.csv')

dsp_0 = _csv_rows(syn_dir + 'graph-label-dsp.csv')
lut_0 = _csv_rows(syn_dir + 'graph-label-lut.csv')
ff_0 = _csv_rows(syn_dir + 'graph-label-ff.csv')
cp_0 = _csv_rows(syn_dir + 'graph-label-cp.csv')

# read real-case benchmarks
# Only the directories listed here are merged; the commented entries are the
# other benchmark outputs that can be enabled.
case_dir_all=['real_case/CHStone_ds/'
# , 'real_case/PolyBench_ds/'
# , 'real_case/MachSuite_ds/'
]

mapping_1, edge_feat_1, edge_1, node_1 = [], [], [], []
dsp_1, lut_1, ff_1, cp_1 = [], [], [], []
node_num_1, edge_num_1 = [], []

# Append every enabled benchmark, in list order, to the real-case tables.
for case_dir in case_dir_all:
    mapping_1.extend(_csv_rows(case_dir + 'mapping.csv', with_header=True))
    edge_feat_1.extend(_csv_rows(case_dir + 'edge-feat.csv'))
    edge_1.extend(_csv_rows(case_dir + 'edge.csv'))
    node_1.extend(_csv_rows(case_dir + 'node-feat.csv'))

    dsp_1.extend(_csv_rows(case_dir + 'graph-label-dsp.csv'))
    lut_1.extend(_csv_rows(case_dir + 'graph-label-lut.csv'))
    ff_1.extend(_csv_rows(case_dir + 'graph-label-ff.csv'))
    cp_1.extend(_csv_rows(case_dir + 'graph-label-cp.csv'))

    node_num_1.extend(_csv_rows(case_dir + 'num-node-list.csv'))
    edge_num_1.extend(_csv_rows(case_dir + 'num-edge-list.csv'))

# merge together
# Synthetic rows always come first, real-case rows are appended after them.
# Every merged value is a list of rows (lists), because it comes from
# DataFrame.values.tolist().
graph_mapping_list = mapping_0 + mapping_1
num_node_list = node_num_0 + node_num_1
num_edge_list = edge_num_0 + edge_num_1

node_feat = node_0 + node_1
edge_list = edge_0 + edge_1
edge_feat = edge_feat_0 + edge_feat_1

DSP = dsp_0 + dsp_1
LUT = lut_0 + lut_1
FF = ff_0 + ff_1
CP = cp_0 + cp_1

In [17]:
### save merged dataset

def _first_field(rows):
    """Return one scalar per entry: the first field of a row, or the entry itself.

    The merge cell produces lists of rows (each row is a list) because the
    data comes from DataFrame.values.tolist(). Putting those rows straight
    into a DataFrame column would write stringified lists such as "[3]" into
    mapping.csv. Entries that are already scalars are passed through
    unchanged, so the cell also works on flat lists.
    """
    return [row[0] if isinstance(row, (list, tuple)) else row for row in rows]


save_dir = 'real_case/all_real_case/'

# Human-readable overview: one row per graph with scalar cells.
# For rows read back from a mapping.csv the first field is taken as the source
# name; that is the column order written by the per-benchmark save cell.
# NOTE(review): assumes the synthetic mapping.csv uses the same first column - confirm.
mapping = pd.DataFrame({
    'orignal code' : _first_field(graph_mapping_list),
    'DSP' : _first_field(DSP),
    'LUT' : _first_field(LUT),
    'CP' : _first_field(CP),
    'FF' : _first_field(FF),
})
NODE_num = pd.DataFrame(num_node_list)
EDGE_num = pd.DataFrame(num_edge_list)

# Label tables: a list of one-element rows and a flat list both yield a
# single-column frame, so the merged lists are used as they are.
graph_label_dsp = pd.DataFrame(DSP)
graph_label_lut = pd.DataFrame(LUT)
graph_label_cp = pd.DataFrame(CP)
graph_label_ff = pd.DataFrame(FF)

NODE = pd.DataFrame(node_feat)
EDGE_list = pd.DataFrame(edge_list)
EDGE_feat = pd.DataFrame(edge_feat)


# mapping.csv keeps its header; all other files are written headerless.
mapping.to_csv(save_dir + 'mapping.csv', index = False)
NODE_num.to_csv(save_dir + 'num-node-list.csv', index = False, header = False)
EDGE_num.to_csv(save_dir + 'num-edge-list.csv', index = False, header = False)

graph_label_dsp.to_csv(save_dir + 'graph-label-dsp.csv', index = False, header = False)
graph_label_lut.to_csv(save_dir + 'graph-label-lut.csv', index = False, header = False)
graph_label_cp.to_csv(save_dir + 'graph-label-cp.csv', index = False, header = False)
graph_label_ff.to_csv(save_dir + 'graph-label-ff.csv', index = False, header = False)

NODE.to_csv(save_dir + 'node-feat.csv', index = False, header = False)
EDGE_list.to_csv(save_dir + 'edge.csv', index = False, header = False)
EDGE_feat.to_csv(save_dir + 'edge-feat.csv', index = False, header = False)

The following cell generates the training/validation/test index splits.

In [18]:
### for training set
from sklearn import model_selection

# Graph indices 0 .. basis-1 are split into train/valid; the next
# num_real_case indices form the test set.
# NOTE(review): presumably basis is the number of synthetic graphs and
# num_real_case the number of merged real-case graphs; both are hard-coded, so
# verify that they match the dataset saved in save_dir (the merge cell
# currently enables only one benchmark directory).
basis = 18570
num_real_case = 56
# Fixed seed so that re-running the notebook reproduces the same split.
split_seed = 0
save_dir = 'real_case/all_real_case/'

real_case_ids = list(range(basis, basis + num_real_case))

train, valid = model_selection.train_test_split(
    list(range(basis)),
    train_size = 0.997,
    test_size = None,
    random_state = split_seed,
)

test_list = pd.DataFrame(real_case_ids)
train_list = pd.DataFrame(sorted(train))
# The real-case indices are appended to the validation list as well, so they
# appear in both valid.csv and test.csv.
valid_list = pd.DataFrame(sorted(valid) + real_case_ids)

test_list.to_csv(save_dir + 'test.csv', index = False, header = False)
train_list.to_csv(save_dir + 'train.csv', index = False, header = False)
valid_list.to_csv(save_dir + 'valid.csv', index = False, header = False)