In [54]:
import os
import re
import pydot
import json
import shutil
import subprocess
import numpy as np
import pandas as pd

In [None]:
# Location of the raw training CSV. The default is the original author's local
# path; set the TRAIN_CSV_PATH environment variable to run the notebook on
# another machine without editing this cell.
TRAIN_CSV_PATH = os.environ.get("TRAIN_CSV_PATH", r"D:\iSE_vulCode\data\raw\train.csv")

# Later cells read the 'code' and 'Label' columns of this frame.
train_data = pd.read_csv(TRAIN_CSV_PATH)

# Working directories (relative to the notebook's working directory).
PATH_CODE = "data/code/"   # generated .c source files
PATH_CPG = "data/cpg/"     # Joern CPG binaries
PATH_DOT = "data/dot/"     # Joern DOT exports
PATH_JSON = "data/json/"   # JSON output

In [43]:
# One left-to-right scan that matches either a comment or a string/char
# literal. Literals are matched only so that comment markers inside them
# (e.g. "http://host") are skipped instead of being treated as comments.
_COMMENT_OR_LITERAL = re.compile(
    r'//[^\n]*'                 # line comment (up to, not including, newline)
    r'|/\*.*?\*/'               # block comment, possibly spanning lines
    r'|"(?:\\.|[^"\\\n])*"'     # double-quoted string literal
    r"|'(?:\\.|[^'\\\n])*'",    # single-quoted character literal
    re.DOTALL,
)


def _drop_comment(match):
    """Return '' for a comment match and the unchanged text for a literal."""
    token = match.group(0)
    return '' if token.startswith('/') else token


def clean_code(code):
    """Strip comments and normalise whitespace in a C source snippet.

    Steps, in order:
      1. remove ``/* ... */`` and ``// ...`` comments, leaving string and
         character literals untouched (previously ``//`` or ``/*`` inside a
         literal caused the rest of the literal to be deleted);
      2. collapse runs of blank lines into a single newline and strip the
         leading/trailing whitespace of the whole snippet;
      3. remove any whitespace that directly precedes ``(``.

    Args:
        code: source text as ``str``.

    Returns:
        The cleaned source text.
    """
    code = _COMMENT_OR_LITERAL.sub(_drop_comment, code)
    code = re.sub(r'\n\s*\n', '\n', code).strip()
    # NOTE(review): this also rewrites whitespace before '(' inside string
    # literals and in '#define NAME (expr)'; kept as in the original.
    code = re.sub(r'\s+\(', '(', code)
    return code

In [31]:
def to_C_files(code, index, out_path):
    """Write ``code`` to ``<out_path>/<index>.c``, creating ``out_path`` if needed.

    Args:
        code: text to write.
        index: value used as the file stem (formatted with ``str``).
        out_path: destination directory, with or without a trailing separator.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(out_path, exist_ok=True)

    # os.path.join instead of string concatenation: "dir" + "1.c" used to
    # produce "dir1.c" when the caller omitted the trailing slash.
    target = os.path.join(out_path, f"{index}.c")
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding (which can reject non-ASCII characters in the source text).
    with open(target, 'w', encoding='utf-8') as f:
        f.write(code)

def to_DOT_files(code, index, out_path):
    """Write ``code`` to ``<out_path>/<index>.dot``, creating ``out_path`` if needed.

    Args:
        code: DOT text to write.
        index: value used as the file stem (formatted with ``str``).
        out_path: destination directory, with or without a trailing separator.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(out_path, exist_ok=True)

    # os.path.join instead of string concatenation: "dir" + "1.dot" used to
    # produce "dir1.dot" when the caller omitted the trailing slash.
    target = os.path.join(out_path, f"{index}.dot")
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding.
    with open(target, 'w', encoding='utf-8') as f:
        f.write(code)

In [None]:
def joern_parse(input_path, output_path, file_name):
    """Run ``joern-parse.bat`` on ``input_path`` and store the CPG as ``<file_name>.bin``.

    Args:
        input_path: path handed to joern-parse as its input argument.
        output_path: directory in which the ``.bin`` file is written; it is
            created if it does not exist. May be given with or without a
            trailing separator.
        file_name: stem of the output file (``.bin`` is appended).

    Raises:
        subprocess.CalledProcessError: if joern-parse exits with a non-zero
            status (``check=True``).
    """
    # Make sure the destination exists rather than relying on the external
    # tool to create it. An empty output_path means "current directory".
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # Join instead of concatenating so a missing trailing slash cannot turn
    # "data/cpg" + "0_cpg.bin" into "data/cpg0_cpg.bin".
    cpg_file = os.path.join(output_path, file_name + ".bin")

    # stdout is captured (and discarded) to keep the tool's output out of the
    # notebook; stderr is left attached to the parent process.
    subprocess.run(["joern-parse.bat", input_path, "--output", cpg_file],
                   stdout=subprocess.PIPE, text=True, check=True)

def joern_export(cpg_file_path, output_path, output_format="dot"):
    """Invoke ``joern-export.bat`` on a CPG file.

    Args:
        cpg_file_path: path of the CPG file passed as the tool's input.
        output_path: value passed to the tool's ``-o`` option.
        output_format: value passed to the tool's ``--format`` option.

    Raises:
        subprocess.CalledProcessError: if the tool exits with a non-zero
            status (``check=True``); stdout and stderr are both captured.
    """
    command = ["joern-export.bat", cpg_file_path]
    command += ["-o", output_path]
    command += ["--format", output_format]

    subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        check=True,
    )

In [None]:
# Build two parallel lists from the training frame: the cleaned source text of
# every row and its label. Rows are addressed by label 0..len-1 via .loc.
row_ids = range(len(train_data))
train_codes = [clean_code(train_data.loc[row, 'code']) for row in row_ids]
train_labels = [train_data.loc[row, 'Label'] for row in row_ids]

In [None]:
def _split_edge_label(raw_label):
    """Split a raw DOT edge label ``"KIND: code"`` into ``(kind, code)``; empty parts become "None"."""
    if not raw_label:
        return "None", "None"
    # Split on the FIRST ': ' only, so code that itself contains ': '
    # (e.g. 'a ? b : c') is kept whole instead of being truncated.
    kind, _, code = raw_label.strip('"').partition(': ')
    return kind, (code if code.strip() else "None")


def _parse_node_label(raw_label):
    """Turn a raw DOT node label into ``(node_type, node_code)``."""
    text = raw_label.strip('<>').strip('"') if raw_label else "None"
    # Drop the trailing <SUB>...</SUB> annotation, then split into fields.
    parts = re.sub(r"<SUB>.*", "", text).split(',')

    if len(parts) == 1:
        return parts[0].strip(), 'None'
    if len(parts) == 2:
        # "(TYPE,code)": strip the opening paren and the closing one.
        return parts[0].strip('('), parts[1][:-1]

    # Three or more fields: everything from the third field on is code. The
    # fields are re-joined so commas inside the code (e.g. call arguments) no
    # longer truncate it; [:-1] removes the label's closing paren.
    tail = ','.join(parts[2:])[:-1]
    # NOTE(review): the comparison happens before the leading '(' is stripped,
    # so a label starting with '(BLOCK' does not take this branch - confirm
    # whether that is intended.
    if parts[0] == "BLOCK":
        return parts[0].strip('('), parts[1] + tail
    return parts[0].strip('(') + ' ' + parts[1], tail


def process_dot(graph, function_name, file_name):
    """Convert one DOT graph into a dict of AST / CFG / PDG node lists.

    Args:
        graph: object exposing ``get_nodes()`` and ``get_edges()``; nodes must
            provide ``get_name()``/``get_label()`` and edges
            ``get_source()``/``get_destination()``/``get_label()``.
        function_name: stored under the ``"function"`` key.
        file_name: stored under the ``"file"`` key.

    Returns:
        ``{"function", "file", "AST", "CFG", "PDG"}`` where each of the three
        lists holds node dicts ``{"id", "edges", "properties"}``. A node's
        ``"edges"`` are its outgoing edges; the node is added to every list
        for which it has at least one outgoing edge of that kind, so the same
        node dict can appear in several lists. Nodes without outgoing edges
        are not listed.
    """
    function = {
        "function": function_name,
        "file": file_name,
        "AST": [],
        "CFG": [],
        "PDG": []
    }

    # Index edges by source node id for fast lookup.
    outgoing = {}
    for edge in graph.get_edges():
        src_id = edge.get_source().strip('"')
        dst_id = edge.get_destination().strip('"')
        label, code = _split_edge_label(edge.get_label())
        outgoing.setdefault(src_id, []).append({
            "in": src_id,
            "out": dst_id,
            "label": label,
            "code": code
        })

    for node in graph.get_nodes():
        node_id = node.get_name().strip('"')
        node_type, node_code = _parse_node_label(node.get_label())

        related_edges = outgoing.get(node_id, [])
        node_data = {
            "id": node_id,
            # Copy each edge so the node owns its own dicts.
            "edges": [dict(edge) for edge in related_edges],
            "properties": [{"type": node_type, "code": node_code}]
        }

        # Classify the node by the kinds of its outgoing edges.
        labels = [edge["label"] for edge in related_edges]
        if any(label.startswith("AST") for label in labels):
            function["AST"].append(node_data)
        if any(label.startswith("CFG") for label in labels):
            function["CFG"].append(node_data)
        # A program dependence graph is made of data-dependence (DDG) and
        # control-dependence (CDG) edges; previously only DDG was counted.
        if any(label.startswith(("DDG", "CDG")) for label in labels):
            function["PDG"].append(node_data)

    return function

In [None]:
def code2cpg(index_code):
    """Write training snippet ``index_code`` as a .c file and run joern-parse.

    The snippet ``train_codes[index_code]`` is saved under ``PATH_CODE``; if
    that directory exists afterwards, ``joern_parse`` is invoked with
    ``PATH_CODE`` as input and ``"<index_code>_cpg"`` as the output file stem
    inside ``PATH_CPG``.
    """
    snippet = train_codes[index_code]
    to_C_files(snippet, index_code, PATH_CODE)

    # isdir() is False for a missing path, so it covers the exists() check too.
    if not os.path.isdir(PATH_CODE):
        return

    # NOTE(review): the whole PATH_CODE directory is passed as input, not just
    # the file written above - confirm this is intended.
    joern_parse(PATH_CODE, PATH_CPG, f"{index_code}_cpg")

def cpg2dot(cpg_file_path):
    """Export the CPG file at ``cpg_file_path`` into ``PATH_DOT`` using ``joern_export``'s default format."""
    joern_export(cpg_file_path, output_path=PATH_DOT)

def dot2json(input_folder, index, output_folder):
    """Convert every ``.dot`` file in ``input_folder`` into one JSON file.

    Each DOT file is parsed with pydot. If the parsed graph has subgraphs,
    every subgraph becomes one function entry (named by the subgraph label);
    otherwise the whole graph becomes a single entry (named by the graph
    name). Entries are produced by ``process_dot`` and collected under the
    ``"functions"`` key.

    Args:
        input_folder: directory scanned (non-recursively) for ``*.dot`` files.
        index: used as the stem of the output file.
        output_folder: sub-folder name; the result is written to
            ``data/<output_folder>/<index>.json`` relative to the current
            working directory. The folder is created if missing.

    Returns:
        None. Progress and errors are reported with ``print``; a failure on
        one DOT file is reported and the remaining files are still processed.
    """
    output_json_path = f'data/{output_folder}/{index}.json'

    json_output = {
        "functions": []
    }
    # os.listdir order is arbitrary; sort so the JSON content is reproducible.
    dot_files = sorted(f for f in os.listdir(input_folder) if f.lower().endswith('.dot'))
    for dot_file in dot_files:
        file_path = os.path.join(input_folder, dot_file)
        try:
            # Read the .dot file.
            graphs = pydot.graph_from_dot_file(file_path)
            if not graphs:
                print(f"Không thể đọc đồ thị từ file .dot: {file_path}. Bỏ qua tệp này.")
                continue

            graph = graphs[0]
            subgraphs = graph.get_subgraphs()

            # With subgraphs, each subgraph is treated as one function.
            if subgraphs:
                for subgraph in subgraphs:
                    function_label = subgraph.get_label().strip('"') if subgraph.get_label() else "unknown_function"
                    function = process_dot(subgraph, function_label, dot_file)
                    json_output["functions"].append(function)
            else:
                # Without subgraphs, the whole graph is treated as one function.
                function_name = graph.get_name().strip('"') or "unknown_function"
                function = process_dot(graph, function_name, dot_file)
                json_output["functions"].append(function)

            print(f"Đã xử lý thành công tệp: {dot_file}")

        except Exception as e:
            # Deliberate best-effort: report and continue with the next file.
            print(f"Đã xảy ra lỗi khi xử lý file {file_path}: {e}")

    # Save the result to the JSON file.
    if json_output["functions"]:
        try:
            # Create the destination folder first; without this the write
            # failed whenever data/<output_folder>/ did not exist yet.
            os.makedirs(os.path.dirname(output_json_path), exist_ok=True)
            with open(output_json_path, 'w', encoding='utf-8') as f:
                json.dump(json_output, f, ensure_ascii=False, indent=2)
            print(f"Chuyển đổi thành công! Kết quả đã được lưu vào '{output_json_path}'.")
        except Exception as e:
            print(f"Đã xảy ra lỗi khi ghi tệp JSON: {e}")
    else:
        print("Không tìm thấy functions trong bất kỳ tệp .dot nào.")
