## 构建neo4j图

In [10]:
import getpass
import json
import os

import networkx as nx
import pandas as pd
from networkx.readwrite import json_graph
from py2neo import Graph, Node, Relationship

#### 导入数据

In [4]:
# Read the provenance graph from its GraphML file; `graph` is reused by
# every later cell (preview, JSON/CSV export, Neo4j import).
graph_path = "../data/CICAPT_IIOT/Phase2_Provenance.graphml"
graph = nx.read_graphml(graph_path)
# Report the graph size as a quick sanity check on the load.
print(f"The number of graph nodes are: {graph.number_of_nodes()} and the number of edges are: {graph.number_of_edges()}")

The number of graph nodes are: 53286 and the number of edges are: 143449


In [3]:
# Show the first 5 nodes together with their attribute dicts.
# zip() against range(5) stops after five items without a manual break.
print("前5个节点及其属性：")
for i, (node, attr) in zip(range(5), graph.nodes(data=True)):
    print(f"节点ID: {node}, 属性: {attr}")

# Show the first 5 edges (source, target, attribute dict) the same way.
print("\n前5条边及其属性：")
for i, (u, v, attr) in zip(range(5), graph.edges(data=True)):
    print(f"起点: {u}, 终点: {v}, 属性: {attr}")

前5个节点及其属性：
节点ID: e36715b62c8cdc32e47483b3600712f1, 属性: {'type': 'Process', 'uid': 0.0, 'egid': 0.0, 'exe': '/usr/bin/dash', 'gid': 0.0, 'euid': 0.0, 'name': 'ethtool', 'path': nan, 'subtype': nan, 'permissions': nan, 'remote port': nan, 'remote address': nan}
节点ID: ca5b322fa4d4cb63aa5dd9fbd88e37e4, 属性: {'type': 'Process', 'uid': 0.0, 'egid': 0.0, 'exe': '/usr/bin/dash', 'gid': 0.0, 'euid': 0.0, 'name': 'ethtool', 'path': nan, 'subtype': nan, 'permissions': nan, 'remote port': nan, 'remote address': nan}
节点ID: 216f4d4893942a66d649c4e02b5722c4, 属性: {'type': 'Artifact', 'uid': nan, 'egid': nan, 'exe': nan, 'gid': nan, 'euid': nan, 'name': nan, 'path': '/etc/network/if-up.d/ethtool', 'subtype': 'file', 'permissions': 755.0, 'remote port': nan, 'remote address': nan}
节点ID: bdc2ca1d94be5a3958f6621d8b52fdb0, 属性: {'type': 'Artifact', 'uid': nan, 'egid': nan, 'exe': nan, 'gid': nan, 'euid': nan, 'name': nan, 'path': '/bin/sh', 'subtype': 'file', 'permissions': 755.0, 'remote port': nan, 'remote

#### 导出为json格式

In [None]:
# Destination of the JSON export (a file path, despite the "_dir" suffix).
json_save_dir = "../data/CICAPT_IIOT/Phase2_Provenance.json"

# Convert the graph to networkx's node-link dict layout.
json_data = json_graph.node_link_data(graph)

# NOTE(review): json.dump defaults to allow_nan=True, so any NaN attribute
# value is written as the non-standard `NaN` token; strict JSON parsers
# will reject such a file.
with open(json_save_dir, 'w') as json_file:
    json.dump(json_data, json_file, indent=2)

#### 导出为csv

In [9]:
# Export the graph as two flat CSV tables: one row per node, one row per edge.
nodes_csv_save_dir = "../data/CICAPT_IIOT/Phase2_Provenance_nodes.csv"
edges_csv_save_dir = "../data/CICAPT_IIOT/Phase2_Provenance_edges.csv"

# Node table: the node identifier followed by every attribute on the node.
nodes = [
    {'id': node_id, **node_attrs}
    for node_id, node_attrs in graph.nodes(data=True)
]
nodes_df = pd.DataFrame(nodes)
nodes_df.to_csv(nodes_csv_save_dir, index=False)

# Edge table: both endpoints followed by every attribute on the edge.
edges = [
    {'from': src, 'to': dst, **edge_attrs}
    for src, dst, edge_attrs in graph.edges(data=True)
]
edges_df = pd.DataFrame(edges)
edges_df.to_csv(edges_csv_save_dir, index=False)


#### 连接neo4j

In [7]:
# Connection settings are read from the environment so that no credential is
# stored in the notebook. If NEO4J_PASSWORD is not set, prompt for it instead.
neo4j_uri = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
neo4j_user = os.environ.get('NEO4J_USER', 'neo4j')
neo4j_password = os.environ.get('NEO4J_PASSWORD') or getpass.getpass('Neo4j password: ')
G = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# WARNING: destructive. Removes every node and relationship in the target
# database so the import below starts from an empty graph.
cypher = 'MATCH (n) DETACH DELETE n'
G.run(cypher)

#### 构建图

In [8]:
# Create one Neo4j node per provenance-graph node.
# nodes_cache maps graph node id -> py2neo Node; the edge-creation cell needs
# these objects to attach relationships.
nodes_cache = {}


def _is_missing(value):
    """Return True for None or float NaN, i.e. placeholders with no information."""
    return value is None or (isinstance(value, float) and value != value)


for node_id, attrs in graph.nodes(data=True):
    # Primary label: the node's 'type'; fall back to 'unknown' when it is
    # absent or not a usable string (e.g. NaN).
    node_type = attrs.get('type')
    main_label = node_type if isinstance(node_type, str) and node_type else 'unknown'
    labels = [main_label]

    # Secondary label: the artifact subtype. The comparison is
    # case-insensitive because the data spells the type 'Artifact', and the
    # subtype must be a real string (it is NaN on non-artifact nodes).
    subtype = attrs.get('subtype')
    if main_label.lower() == 'artifact' and isinstance(subtype, str) and subtype:
        labels.append(subtype)

    # Properties: everything except the fields already encoded as labels,
    # skipping NaN/None placeholders so only meaningful values are stored.
    node_attrs = {
        k: v
        for k, v in attrs.items()
        if k not in ('type', 'subtype') and not _is_missing(v)
    }

    node = Node(*labels, id=node_id, **node_attrs)
    G.create(node)
    nodes_cache[node_id] = node

In [11]:
# Create one Neo4j relationship per provenance-graph edge.
for u, v, attrs in graph.edges(data=True):
    # Split the edge attributes: 'type' becomes the relationship type
    # (default 'unknown'), everything else becomes relationship properties.
    rel_attrs = dict(attrs)
    rel_type = rel_attrs.pop('type', 'unknown')

    # Look up the Node objects created earlier for both endpoints.
    start_node = nodes_cache.get(u)
    end_node = nodes_cache.get(v)

    if start_node is not None and end_node is not None:
        rel = Relationship(start_node, rel_type, end_node, **rel_attrs)
        G.create(rel)
    else:
        # An endpoint was never created as a node; report it and move on.
        print(f"节点未找到: {u} 或 {v}")

##### 构建process结点

In [12]:
# Fields: id, type, uid, egid, exe, gid, euid, name


##### 构建artifact：file、directory、link结点

In [None]:
# Fields: id, type, path, subtype, permissions
# Use path as the node name; the label is the subtype (of artifact)


##### 构建network socket(artifact)结点

In [None]:
# Fields: id, type, subtype, remote port, remote address
# The node name has to be built as address:port

##### 构建unknown(artifact)结点

In [None]:
# No attributes; only the id, type and subtype fields