### Export CPG

In [1]:
import os
import re
import time
import html
import pydot
import torch
import json
import shutil
import subprocess
import numpy as np
import pandas as pd

In [2]:
# Raw training set; later cells read its 'func' and 'target' columns.
# NOTE(review): this is an absolute, machine-specific Windows path, whereas
# the directories below are relative to the current working directory.
train_data = pd.read_csv("D:\\iSE_vulCode\\data\\raw\\train.csv")
# Directory the per-sample .c files are written to (see to_C_files/code2cpg).
PATH_CODE = "data/code/"
# Directory holding the CPG binaries produced by joern-parse.
PATH_CPG = "data/cpg/"
# Directory joern-export writes into. Unlike its siblings it has no trailing
# slash, so it must be combined with os.path.join rather than string "+".
PATH_DOT = "data/dot"
# NOTE(review): not referenced by any of the cells visible here.
PATH_JSON = "data/json/"

In [3]:
def clean_code(code):
    """Normalise the text of a C function before it is written to disk.

    The steps run in this order:

    1. Every double-quoted string literal that sits on one line is replaced
       by the bare token ``STRING``. Escaped characters inside the literal
       (``\\"``, ``\\\\`` ...) are treated as part of it, so a literal such as
       ``"a\\"b"`` is replaced as a whole instead of being cut at the
       escaped quote.
    2. ``/* ... */`` comments are removed, including multi-line ones.
    3. ``// ...`` comments are removed up to the end of the line.
    4. Runs of blank lines collapse to a single newline and the result is
       stripped.
    5. Whitespace in front of an opening parenthesis is removed.
    6. Any double quote still left (e.g. from an unterminated literal) is
       dropped.

    Args:
        code: source text of one function.

    Returns:
        The cleaned source text.
    """
    # Literals go first so that "//" or "/*" inside a string is not mistaken
    # for a comment by the following substitutions.
    code = re.sub(r'"(?:\\.|[^"\\\n])*"', 'STRING', code)
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
    code = re.sub(r'//.*', '', code)
    code = re.sub(r'\n\s*\n', '\n', code).strip()
    code = re.sub(r'\s+\(', '(', code)
    code = code.replace('"', '')
    return code

In [4]:
def to_C_files(code, index, out_path):
    """Write *code* to ``<out_path>/<index>.c``.

    The directory is created when missing. The file path is built with
    ``os.path.join`` so the file lands inside *out_path* whether or not the
    caller passed a trailing separator.

    Args:
        code: text to write.
        index: value used as the file stem.
        out_path: target directory.
    """
    os.makedirs(out_path, exist_ok=True)
    # Explicit UTF-8 so non-ASCII source text does not depend on the
    # platform's default encoding.
    with open(os.path.join(out_path, f"{index}.c"), 'w', encoding="utf-8") as f:
        f.write(code)

def to_DOT_files(code, index, out_path):
    """Write *code* to ``<out_path>/<index>.dot``.

    The directory is created when missing. The file path is built with
    ``os.path.join`` so the file lands inside *out_path* whether or not the
    caller passed a trailing separator.

    Args:
        code: text to write.
        index: value used as the file stem.
        out_path: target directory.
    """
    os.makedirs(out_path, exist_ok=True)
    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding.
    with open(os.path.join(out_path, f"{index}.dot"), 'w', encoding="utf-8") as f:
        f.write(code)

In [5]:
def joern_parse(input_path, output_path, file_name):
    """Invoke ``joern-parse.bat`` on *input_path* and return its stdout.

    The ``--output`` argument is the plain concatenation
    ``output_path + file_name + ".bin"``, so *output_path* is expected to end
    with a path separator.

    Raises:
        subprocess.CalledProcessError: the command exited with a non-zero
            status (``check=True``).
    """
    cpg_file = output_path + file_name + ".bin"
    command = ["joern-parse.bat", input_path, "--output", cpg_file]
    completed = subprocess.run(
        command,
        check=True,
        text=True,
        stdout=subprocess.PIPE,
    )
    return completed.stdout

def joern_export(index, output_dir, output_format="dot"):
    """Export the CPG of sample *index* with ``joern-export.bat``.

    The input is ``f"{PATH_CPG}{index}_cpg.bin"``. After the command has
    succeeded, ``export.dot`` inside *output_dir* is renamed to
    ``<index>.dot``; that rename uses the ``.dot`` names regardless of
    *output_format*.

    Returns:
        The captured stdout of the export command.

    Raises:
        subprocess.CalledProcessError: the command exited with a non-zero
            status (``check=True``).
    """
    cpg_file = f"{PATH_CPG}{index}_cpg.bin"
    completed = subprocess.run(
        ["joern-export.bat", "--repr=all", cpg_file,
         "-o", output_dir,
         "--format", output_format],
        check=True,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    exported = os.path.join(output_dir, "export.dot")
    renamed = os.path.join(output_dir, f"{index}.dot")
    shutil.move(exported, renamed)
    return completed.stdout

In [6]:
# Cleaned source text and target label of every row, in row order.
# The columns are iterated positionally rather than with
# ``train_data.loc[i, ...]`` over ``range(len(train_data))``: that form does
# one label lookup per cell and only works when the index labels happen to be
# 0..n-1. Later cells index both lists by position.
train_codes = [clean_code(func) for func in train_data['func']]
train_labels = train_data['target'].tolist()

In [7]:
def code2cpg(index_code):
    """Write sample *index_code* as a .c file and parse PATH_CODE with Joern.

    The CPG is stored under PATH_CPG with the file stem
    ``<index_code>_cpg``. Parsing is skipped when PATH_CODE is not a
    directory.
    """
    to_C_files(train_codes[index_code], index_code, PATH_CODE)
    if not os.path.isdir(PATH_CODE):
        return
    joern_parse(PATH_CODE, PATH_CPG, f"{index_code}_cpg")

In [8]:
# Run the parse + export steps once, for the first sample only.
i = 0
code2cpg(i)
joern_export(i, PATH_DOT)

'[INFO ] initialising from existing storage (d:\\iSE_vulCode\\data\\cpg\\0_cpg.bin)\nexported 233 nodes, 1405 edges into d:\\iSE_vulCode\\data\\dot\n[INFO ] closing graph: writing to storage at `d:\\iSE_vulCode\\data\\cpg\\0_cpg.bin`\n'

### Embedding nodes


In [14]:
import re
import codecs
from typing import List
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from transformers import RobertaTokenizer, RobertaModel
import json
import pydot

In [53]:
def get_graph(input_path):
    """Parse the DOT file at *input_path* and return the first graph in it."""
    return pydot.graph_from_dot_file(input_path)[0]

def get_type(node):
    """Return the node's TYPE_FULL_NAME without its first and last character.

    The slice drops the surrounding quote characters of the raw attribute
    value. The attribute must be present (check with ``has_type`` first).
    """
    raw_value = node.get_attributes().get("TYPE_FULL_NAME")
    return raw_value[1:-1]

def has_type(node):
    """Tell whether the node carries a TYPE_FULL_NAME attribute."""
    return "TYPE_FULL_NAME" in node.get_attributes()

def has_code(node):
    """Tell whether the node carries a CODE attribute."""
    return "CODE" in node.get_attributes()

def get_code(node):
    """Return the node's code text, prefixed with its type when informative.

    The CODE attribute is returned without its first and last character
    (the surrounding quotes). When the node also has a type that is not
    ``"ANY"`` and does not already occur in the code, the result is
    ``"<type> <code>"``. Nodes without a CODE attribute yield ``None``.
    """
    if not has_code(node):
        return None
    code = node.get_attributes()['CODE'][1:-1]
    if has_type(node):
        type_name = get_type(node)
        if type_name != "ANY" and type_name not in code:
            code = f"{type_name} {code}"
    return code
    
def has_line_number(node):
    """Tell whether the node carries a LINE_NUMBER attribute."""
    return 'LINE_NUMBER' in node.get_attributes()

def get_line_number(node):
    """Return the raw LINE_NUMBER attribute value, or None when absent."""
    return node.get_attributes()['LINE_NUMBER'] if has_line_number(node) else None

def has_column_number(node):
    """Tell whether the node carries a COLUMN_NUMBER attribute."""
    return 'COLUMN_NUMBER' in node.get_attributes()

def get_column_number(node):
    """Return the raw COLUMN_NUMBER attribute value, or None when absent."""
    return node.get_attributes()['COLUMN_NUMBER'] if has_column_number(node) else None

def get_method_full_name(node):
    """Return METHOD_FULL_NAME without its first and last character.

    The slice drops the surrounding quote characters of the raw attribute
    value. Returns ``None`` when the node has no such attribute.
    """
    attrs = node.get_attributes()
    return attrs['METHOD_FULL_NAME'][1:-1] if 'METHOD_FULL_NAME' in attrs else None

def get(node):
    """Return the node's attribute mapping unchanged."""
    attributes = node.get_attributes()
    return attributes

def get_operator(node):
    """Return the short operator name of an operator call node.

    For a METHOD_FULL_NAME containing ``<operator>`` or ``<operators>`` the
    part after the last dot is returned, without surrounding quote
    characters (``'"<operator>.addition"'`` -> ``'addition'``).

    Returns:
        The operator name, or ``None`` when the node has no
        METHOD_FULL_NAME attribute or the name is not an operator.
    """
    # .get instead of indexing: a missing attribute previously raised
    # KeyError before the None check below could ever apply.
    value = node.get_attributes().get('METHOD_FULL_NAME')
    if value is None:
        return None
    if "<operator>" in value or "<operators>" in value:
        return value.strip('"').split(".")[-1]
    return None

def get_label(node):
    """Return the node's label, refined to the operator name for operator calls.

    Non-CALL nodes and CALL nodes whose method name does not start with
    ``<operator`` keep their ``label`` attribute; for the remaining CALL
    nodes the result of ``get_operator`` is returned instead.
    """
    label = node.get_attributes()['label']
    if label != "CALL":
        return label
    if get_method_full_name(node).startswith("<operator"):
        return get_operator(node)
    return label

#### Node

In [16]:
def filter_nodes(nodes):
    """Keep only the nodes that can be positioned and carry real code.

    A node is kept when it has LINE_NUMBER, CODE and COLUMN_NUMBER
    attributes, its label is neither COMMENT nor UNKNOWN, and its CODE is
    not one of the ``"<empty>"`` / ``"<global>"`` placeholders. The input
    order is preserved.
    """
    required_keys = ('LINE_NUMBER', 'CODE', 'COLUMN_NUMBER')
    skipped_labels = ('COMMENT', 'UNKNOWN')
    placeholder_code = ('"<empty>"', '"<global>"')

    kept = []
    for node in nodes:
        attrs = node.get_attributes()
        if any(key not in attrs for key in required_keys):
            continue
        if attrs['label'] in skipped_labels or attrs['CODE'] in placeholder_code:
            continue
        kept.append(node)
    return kept

def _position_key(node):
    """Return ``(line, column)`` of *node* as integers, for sorting."""
    attributes = node.get_attributes()
    # Values read from a DOT file are text (possibly quoted); comparing them
    # as strings would put "10" before "2".
    return (
        int(str(attributes['LINE_NUMBER']).strip('"')),
        int(str(attributes['COLUMN_NUMBER']).strip('"')),
    )


def order_nodes(nodes, max_nodes=500):
    """Sort nodes by source position and number them.

    Nodes are ordered by line number, then column number, both compared
    numerically. Every node receives an ``order`` attribute holding its
    index in that ordering; numbering happens before truncation.

    Args:
        nodes: nodes that all carry LINE_NUMBER and COLUMN_NUMBER
            attributes (as guaranteed by ``filter_nodes``).
        max_nodes: maximum number of nodes to return.

    Returns:
        The first *max_nodes* nodes of the sorted list.
    """
    sorted_nodes = sorted(nodes, key=_position_key)
    for i, node in enumerate(sorted_nodes):
        setattr(node, 'order', i)
    return sorted_nodes[:max_nodes]

In [37]:
# Node label strings; get_label() returns one of these for non-operator nodes.
node_labels = ["BLOCK", "CALL", "COMMENT", "CONTROL_STRUCTURE", "FILE", "IDENTIFIER", "FIELD_IDENTIFIER", "LITERAL",
               "LOCAL", "MEMBER", "METADATA", "METHOD", "METHODINST", "METHOD_PARAMETER_IN", "METHOD_PARAMETER_OUT",
               "METHOD_RETURN", "NAMESPACE", "NAMESPACE_BLOCK", "RETURN", "TYPE", "TYPEDECL", "UNKNOWN", "JUMP_TARGET"]

# Short operator names as returned by get_operator() (the part after the
# last "." of an operator call's METHOD_FULL_NAME).
operators = ['addition', 'addressOf', 'and', 'arithmeticShiftRight', 'assignment',
             'assignmentAnd', 'assignmentArithmeticShiftRight', 'assignmentDivision',
             'assignmentMinus', 'assignmentMultiplication', 'assignmentOr', 'assignmentPlus',
             'assignmentShiftLeft', 'assignmentXor', 'cast', 'conditionalExpression',
             'division', 'equals', 'fieldAccess', 'greaterEqualsThan', 'greaterThan',
             'indirectFieldAccess', 'indirectIndexAccess', 'indirection', 'lessEqualsThan',
             'lessThan', 'logicalAnd', 'logicalNot', 'logicalOr', 'minus', 'modulo', 'multiplication',
             'not', 'notEquals', 'or', 'postDecrement', 'plus', 'postIncrement', 'preDecrement',
             'preIncrement', 'shiftLeft', 'sizeOf', 'subtraction', 'pointerCall', 'alloc', 'expressionList',
             'conditional', 'arrayInitializer', 'xor', 'op_ellipses', 'assignmentModulo', 'bracketedPrimary']

# Operators share one id space with the plain labels: they are appended, then
# the list is replaced by a {name: index} dict (same variable name).
node_labels += operators
node_labels = {label: i for i, label in enumerate(node_labels)}
# Displays the last index (74), i.e. the vocabulary has 75 entries; the
# embedding table in NodesEmbedding relies on that size.
node_labels['bracketedPrimary']

74

#### Edge

In [17]:
def filter_edges(nodes, edges):
    """Return the edges whose source and destination are both in *nodes*.

    Endpoints are matched against the nodes' ``get_name()`` values; the
    input order of the edges is preserved.
    """
    known_ids = {node.get_name() for node in nodes}
    return [
        edge for edge in edges
        if edge.get_source() in known_ids and edge.get_destination() in known_ids
    ]

def get_nodes_edges(input_path):
    """Load a DOT file and return its usable ``(nodes, edges)``.

    Nodes are filtered with ``filter_nodes`` and sorted/truncated with
    ``order_nodes``; edges are then restricted to those connecting two of
    the remaining nodes.
    """
    graph = get_graph(input_path)
    nodes = order_nodes(filter_nodes(graph.get_nodes()))
    return nodes, filter_edges(nodes, graph.get_edges())

In [45]:
import torch
import torch.nn as nn
import numpy as np
from transformers import BertTokenizer, BertModel

class NodesEmbedding:
    """Turn CPG nodes into a float feature matrix, one row per node.

    A row is the concatenation of a 16-dimensional embedding of the node's
    label (see ``get_label`` / ``node_labels``) and a BERT-derived vector of
    the node's code text (see ``get_code``).
    """

    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.bert = BertModel.from_pretrained('bert-base-uncased').to(self.device)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        # Sized from the label vocabulary (75 entries at present) so the
        # table cannot drift out of sync when labels are added.
        # NOTE(review): this table is randomly initialised and is neither
        # trained nor saved in the visible code, so the label part of the
        # features differs between runs unless the RNG seed is fixed.
        self.node_type_embedding = nn.Embedding(len(node_labels), 16).to(self.device)

    def __call__(self, nodes):
        """Return the feature matrix for *nodes* as a float tensor on ``self.device``.

        The result is also stored in ``self.target``. An empty node list
        yields a tensor with zero rows.
        """
        embedded_nodes = self.embed_nodes(nodes)
        if len(embedded_nodes) == 0:
            # np.array([]) is one-dimensional, so build the empty matrix
            # explicitly. Width assumes the code vector has the BERT hidden
            # size — TODO confirm against encode_input's output shape.
            feature_dim = self.node_type_embedding.embedding_dim + self.bert.config.hidden_size
            self.target = torch.zeros(0, feature_dim).float().to(self.device)
            return self.target
        self.target = torch.tensor(embedded_nodes).float().to(self.device)
        return self.target

    def embed_nodes(self, nodes):
        """Return a NumPy array with one feature row per node.

        Runs entirely under ``torch.no_grad()``. ``encode_input`` is defined
        outside this class and must return ``(input_ids, attention_mask)``.
        """
        embeddings = []
        with torch.no_grad():
            for node in nodes:
                node_code = get_code(node)
                input_ids, attention_mask = encode_input(node_code, self.tokenizer)
                # First output of the model, position 0 of every sequence.
                cls_feats = self.bert(input_ids.to(self.device), attention_mask.to(self.device))[0][:, 0]
                # Average over the first dimension of the encoded input.
                source_embedding = torch.mean(cls_feats, dim=0)
                node_type = get_label(node)
                type_index = torch.tensor(node_labels[node_type]).to(self.device)
                type_embedding = self.node_type_embedding(type_index)
                embedding = torch.cat((type_embedding, source_embedding), dim=0).cpu().numpy()
                embeddings.append(embedding)
        return np.array(embeddings)


In [21]:
class GraphsEmbedding:
    """Build the adjacency matrix of one edge type over ordered nodes.

    ``edge_type`` is compared against the edges' labels. The special value
    ``'PDG'`` selects edges labelled ``'CDG'`` or ``'DDG'``.
    NOTE(review): confirm these strings match the labels the DOT export
    actually emits.
    """

    def __init__(self, edge_type):
        self.edge_type = edge_type

    def __call__(self, nodes, edges):
        """Return the adjacency matrix as a ``torch.long`` tensor."""
        return torch.tensor(self.get_adj_matrix(nodes, edges), dtype=torch.long)

    def get_adj_matrix(self, nodes, edges):
        """Return an n x n list-of-lists with 1 at [source.order][destination.order].

        ``n`` is the number of distinct node names. Edges with a non-matching
        label, or with an endpoint that is not among *nodes*, are ignored.
        """
        position = {node.get_name(): node.order for node in nodes}
        size = len(position)
        matrix = [[0] * size for _ in range(size)]

        accepted_labels = {'CDG', 'DDG'} if self.edge_type == 'PDG' else {self.edge_type}
        for edge in edges:
            if edge.get_label() not in accepted_labels:
                continue
            row = position.get(edge.get_source())
            col = position.get(edge.get_destination())
            if row is None or col is None:
                continue
            matrix[row][col] = 1

        return matrix

In [22]:
def nodes_to_input(nodes, edges, target, nodes_embedding):
    """Bundle one sample into a ``Data`` object.

    The object carries the node features (``x``), one adjacency matrix each
    for the AST, CFG and PDG edge types, and the label *target* as a
    one-element long tensor (``y``).
    """
    adjacency = {
        kind: GraphsEmbedding(kind)(nodes, edges)
        for kind in ('AST', 'CFG', 'PDG')
    }
    return Data(
        x=nodes_embedding(nodes),
        ast_adj_matrix=adjacency['AST'],
        cfg_adj_matrix=adjacency['CFG'],
        pdg_adj_matrix=adjacency['PDG'],
        y=torch.tensor([target]).long(),
    )

In [None]:
import os
import pickle
# Embed the training samples in batches and pickle each batch to
# data/emb/batch_<n>.pkl.
output_dir = "data/emb"
BATCH_SIZE = 50
os.makedirs(output_dir, exist_ok=True)

nodes_embedding = NodesEmbedding()
total_samples = len(train_codes)
for batch_idx in range(0, 400):
    start_code_idx = batch_idx * BATCH_SIZE
    # Stop at the end of the data set instead of running into an IndexError
    # when it holds fewer than 400 * BATCH_SIZE samples.
    if start_code_idx >= total_samples:
        break
    end_code_idx = min(start_code_idx + BATCH_SIZE, total_samples)
    batch_embeddings = []

    for index in range(start_code_idx, end_code_idx):
        print(index)
        try:
            code2cpg(index)
            joern_export(index, PATH_DOT)
            file_path = os.path.join(PATH_DOT, f"{index}.dot")
            nodes, edges = get_nodes_edges(file_path)
            label = train_labels[index]
            embed = nodes_to_input(nodes, edges, label, nodes_embedding)
            batch_embeddings.append(embed)

            # NOTE(review): presumably a short pause before the scratch
            # directories are deleted — confirm it is still needed.
            time.sleep(0.1)
            del embed, nodes, edges
        finally:
            # Always clear the per-sample scratch directories, also when a
            # step above failed, so the next sample or a re-run starts clean.
            for scratch_dir in (PATH_CODE, PATH_DOT):
                if os.path.isdir(scratch_dir):
                    shutil.rmtree(scratch_dir)
            torch.cuda.empty_cache()

    # NOTE(review): the feature tensors live on nodes_embedding.device, so a
    # batch pickled on a CUDA machine may need CUDA to be loaded again.
    batch_file = os.path.join(output_dir, f"batch_{batch_idx}.pkl")
    with open(batch_file, "wb") as f:
        pickle.dump(batch_embeddings, f)
    print(f"Saved batch {batch_idx} to {batch_file}")