In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import sys
import math
import time
import random
import networkx as nx
import pickle
from itertools import combinations
from collections import Counter


import torch
from torch_geometric.data import Data
from torch_geometric.utils import train_test_split_edges, degree


# Seed every random number generator used in this notebook (NumPy, PyTorch and
# the standard library) with the same value so that runs are repeatable.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
random.seed(seed)

## 数据读入

In [3]:
# Dataset directory. os.path.join("dataset", "") keeps the trailing separator,
# so the value can still be used as a plain string prefix.
dataset = os.path.join("dataset", "")
graph_id = 3

# Transaction table of subgraph `graph_id`. The path is assembled with
# os.path.join instead of hard-coded "\\" so it resolves on every OS
# (on Windows the resulting string is unchanged).
pklfile_tx_sort = os.path.join(dataset, f"EthereumG{graph_id}", f"LPsubG{graph_id}_df_sort.pickle")

# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that were produced by this project.
df_tx = pd.read_pickle(pklfile_tx_sort)

# Node ids and timestamps are used as integers by the cells below.
df_tx = df_tx.astype({'From': int, 'To': int, 'TimeStamp': int})

In [4]:
# Show the loaded transaction table (columns: From, To, Value, TimeStamp).
df_tx

Unnamed: 0,From,To,Value,TimeStamp
0,9435,1773,1670.000000,1438269973
1,9435,6547,101.230000,1438269973
2,9435,1885,1200.000000,1438269973
3,9435,1886,800.000000,1438269973
4,9435,1887,370.000000,1438269973
...,...,...,...,...
1359143,1778,3063,1.000658,1547762069
1359144,1405,5844,0.200106,1547763634
1359145,77942,2985,0.101029,1547764823
1359146,1405,142,0.201796,1547782857


## 数据集统计

In [5]:
def graph_statistics(df):
    """Print summary statistics of the undirected graph built from ``df``.

    Each row of ``df`` is one transaction with the columns ``From``, ``To``,
    ``Value`` and ``TimeStamp``. Rows that connect the same unordered node pair
    are merged into one edge: its ``weight`` is the sum of ``Value`` and its
    ``timestamp`` attribute is the timestamp of the row that created the edge.

    Prints the number of nodes, the number of edges, the mean degree, the
    average clustering coefficient and the difference between the largest and
    the smallest timestamp. Returns None.

    An empty ``df`` prints zeros instead of failing.
    """
    G = nx.Graph()

    # Iterate over plain column values; this avoids building one Series per
    # row as DataFrame.iterrows does.
    rows = zip(
        df['From'].astype(int).tolist(),
        df['To'].astype(int).tolist(),
        df['Value'].tolist(),
        df['TimeStamp'].astype(int).tolist(),
    )
    for x, y, a, t in rows:
        if G.has_edge(x, y):
            G[x][y]['weight'] += a
        else:
            G.add_edge(x, y, timestamp=t, weight=a)

    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()

    if num_nodes > 0:
        avg_degree = np.mean([deg for _, deg in G.degree()])
        avg_clustering_coefficient = nx.average_clustering(G)
    else:
        # Both averages are undefined on an empty graph.
        avg_degree = 0.0
        avg_clustering_coefficient = 0.0

    # Min and max are taken independently. The previous if/elif update skipped
    # the max check for every row that lowered the min, which gave a wrong
    # span for a single-row frame or when the largest timestamp came first.
    if len(df) > 0:
        time_span = int(df['TimeStamp'].max()) - int(df['TimeStamp'].min())
    else:
        time_span = 0

    print(f'Number of nodes: {num_nodes}')
    print(f'Number of edges: {num_edges}')
    print(f'Average degree: {avg_degree:.3f}')
    print(f'Average clustering coefficient: {avg_clustering_coefficient:.3f}')
    print(f'Transaction time span: {time_span} seconds')

In [6]:
# Only for EthereumG1: drop the first row before computing the statistics.
# df_tx.drop(0, inplace = True)

# Print node/edge counts, mean degree, clustering coefficient and time span.
graph_statistics(df_tx)

Number of nodes: 103916
Number of edges: 118440
Average degree: 2.280
Average clustering coefficient: 0.016
Transaction time span: 109537157 seconds


## 数据预处理

In [119]:
# Split df_tx by row position: the leading `emb_ratio` share of the rows is
# used to build node embeddings, the remaining rows supply train/test edges.
# (df_tx is presumably sorted by time - the source file is named "_df_sort".)
emb_ratio = 0.7
total_rows = len(df_tx)
emb_end = int(total_rows * emb_ratio)

# Leading part -> embedding data, trailing part -> train/test candidates.
df_emb, df_train_test = df_tx.iloc[:emb_end], df_tx.iloc[emb_end:]

# Every node that occurs as sender or receiver in the embedding part.
emb_nodes = set(df_emb['From']) | set(df_emb['To'])

# Keep a train/test transaction only if both endpoints occur in the embedding
# part, so that an embedding exists for each of them.
df_train_test_filtered = df_train_test.loc[
    df_train_test['From'].isin(emb_nodes) & df_train_test['To'].isin(emb_nodes)
]

# Report the size of both parts.
print(f"Embedding data shape: {df_emb.shape}")
print(f"Train and Test data shape (filtered): {df_train_test_filtered.shape}")

Embedding data shape: (951403, 4)
Train and Test data shape (filtered): (279581, 4)


In [120]:
def sample_neg_edges(edges_pos, nodes, neg_samples_per_pos=1):
    """Randomly draw node pairs that are not positive edges.

    Args:
        edges_pos: sequence of ``(u, v)`` positive edges.
        nodes: sequence of candidate nodes; both endpoints are drawn from it
            with ``random.choice``.
        neg_samples_per_pos: number of negative pairs per positive edge.

    Returns:
        A list with ``len(edges_pos) * neg_samples_per_pos`` pairs ``(i, j)``
        where ``i != j`` and neither ``(i, j)`` nor ``(j, i)`` is a positive
        edge. The same pair may be drawn more than once.

    Raises:
        ValueError: if negatives are requested but no valid pair exists
            (fewer than two distinct nodes, or every node pair is positive).
    """
    edges_neg = []
    target = len(edges_pos) * neg_samples_per_pos
    if target <= 0:
        return edges_neg

    # Set lookup is O(1); testing membership on the list made every draw O(n).
    positive_lookup = set(map(tuple, edges_pos))

    # Fail fast instead of looping forever when nothing can be sampled.
    distinct_nodes = set(nodes)
    possible_pairs = len(distinct_nodes) * (len(distinct_nodes) - 1) // 2
    blocked_pairs = {
        frozenset(edge[:2])
        for edge in positive_lookup
        if edge[0] != edge[1] and edge[0] in distinct_nodes and edge[1] in distinct_nodes
    }
    if len(blocked_pairs) >= possible_pairs:
        raise ValueError("no negative edge can be sampled from the given nodes")

    # The draws are made in the same order as before, so results for a given
    # random seed are unchanged.
    while len(edges_neg) < target:
        i = random.choice(nodes)
        j = random.choice(nodes)

        if i != j and (i, j) not in positive_lookup and (j, i) not in positive_lookup:
            edges_neg.append((i, j))

    return edges_neg

In [121]:
# 获取训练测试交易集的所有节点
train_test_nodes = list(set(df_train_test_filtered['From']).union(set(df_train_test_filtered['To'])))

# 获取训练测试交易集的所有边，保证训练测试边集的数据唯一性，并打乱时间顺序
train_test_edges_pos = list(set(zip(df_train_test_filtered['From'], df_train_test_filtered['To'])))
random.shuffle(train_test_edges_pos)

# 随机采样负样本
train_test_edges_neg = sample_neg_edges(train_test_edges_pos, train_test_nodes)

print("Num of positive Train Test edges:", len(train_test_edges_pos))
print("Num of negative Train Test edges:", len(train_test_edges_neg))

Num of positive Train Test edges: 4677
Num of negative Train Test edges: 4677


In [122]:
# 划分为训练集（50%）和测试集（50%）
train_pos_size = int(len(train_test_edges_pos) * 0.5)
train_neg_size = int(len(train_test_edges_neg) * 0.5)


# 训练集
train_edges_pos = train_test_edges_pos[:train_pos_size]
train_edges_neg = train_test_edges_neg[:train_neg_size]

# 测试集
test_edges_pos = train_test_edges_pos[train_pos_size:]
test_edges_neg = train_test_edges_neg[train_neg_size:]

print("Num of positive Train edges:", len(train_edges_pos))
print("Num of negative Train edges:", len(train_edges_neg))
print("Num of positive Test edges:", len(test_edges_pos))
print("Num of negative Test edges:", len(test_edges_neg))

Num of positive Train edges: 2338
Num of negative Train edges: 2338
Num of positive Test edges: 2339
Num of negative Test edges: 2339


In [123]:
# Bundle the four edge lists in one dictionary; negative lists are stored
# under the "*_false" keys.
train_test_edges = {
    'train_edges_pos': train_edges_pos,
    'train_edges_false': train_edges_neg,
    'test_edges_pos': test_edges_pos,
    'test_edges_false': test_edges_neg,
}

Read or write the processed data:

In [7]:
graph_id = 3
emb_ratio = 0.7

# Folder with the processed files. os.path.join("dataset", "") keeps the
# trailing separator, so `data_path + name` concatenation keeps working, and
# the paths resolve on every OS (unchanged on Windows).
data_path = os.path.join("dataset", "")
pklfile_emb = os.path.join(data_path, f"LPsubG{graph_id}_df_emb_{emb_ratio}.pickle")
pklfile_train_test_edges = os.path.join(data_path, f"LPsubG{graph_id}_train_test_edges_{emb_ratio}.pickle")

# Write
# df_emb.to_pickle(pklfile_emb)
# with open(pklfile_train_test_edges, 'wb') as f:
#     pickle.dump(train_test_edges, f)

# Read
# NOTE(review): unpickling can execute arbitrary code - only load files that
# were written by this notebook.
df_emb = pd.read_pickle(pklfile_emb)

# The edge split is only replaced when the file exists; otherwise the values
# already present in the kernel are left untouched.
if os.path.exists(pklfile_train_test_edges):
    with open(pklfile_train_test_edges, "rb") as f:
        train_test_edges = pickle.load(f)
    train_edges_pos = train_test_edges['train_edges_pos']
    train_edges_neg = train_test_edges['train_edges_false']
    test_edges_pos = train_test_edges['test_edges_pos']
    test_edges_neg = train_test_edges['test_edges_false']

## 特征提取

In [8]:
def extract_account_features(df_tx, is_df=True):
    """Aggregate per-account statistics from a transaction table.

    Args:
        df_tx: DataFrame with the columns ``From``, ``To``, ``Value`` and
            ``TimeStamp`` (one row per transaction).
        is_df: when True (default) return a DataFrame, otherwise return the
            raw ``{account: feature dict}`` mapping.

    Returns:
        With ``is_df=False``: a dict keyed by account id in order of first
        appearance (sender before receiver within a row). Each value holds
        ``in_txs``, ``in_amount``, ``out_txs``, ``out_amount``,
        ``first_active``, ``last_active``, ``total_txs``, ``balance``,
        ``avg_in_amount``, ``avg_out_amount``, ``lifespan``, ``freq_in``,
        ``freq_out`` and ``freq``.
        With ``is_df=True``: the same data as a DataFrame indexed by account
        id, without the ``first_active`` / ``last_active`` columns. An empty
        ``df_tx`` yields an empty DataFrame.
    """
    account_features = {}

    # Plain column values are much cheaper to iterate than DataFrame.iterrows.
    rows = zip(
        df_tx["From"].tolist(),
        df_tx["To"].tolist(),
        df_tx["Value"].tolist(),
        df_tx["TimeStamp"].tolist(),
    )
    for sender, receiver, amount, timestamp in rows:
        sender, receiver, timestamp = int(sender), int(receiver), int(timestamp)

        # The sender books an outgoing and the receiver an incoming
        # transaction; everything else is the same for both sides.
        for account, direction in ((sender, "out"), (receiver, "in")):
            record = account_features.get(account)
            if record is None:
                record = account_features[account] = {
                    "in_txs": 0, "in_amount": 0,
                    "out_txs": 0, "out_amount": 0,
                    "first_active": timestamp, "last_active": timestamp,
                }
            record[direction + "_txs"] += 1
            record[direction + "_amount"] += amount
            if timestamp < record["first_active"]:
                record["first_active"] = timestamp
            if timestamp > record["last_active"]:
                record["last_active"] = timestamp

    # Derived statistics; the insertion order below fixes the column order.
    for features in account_features.values():
        in_txs, out_txs = features["in_txs"], features["out_txs"]
        features["total_txs"] = in_txs + out_txs
        features["balance"] = features["in_amount"] - features["out_amount"]
        features["avg_in_amount"] = features["in_amount"] / in_txs if in_txs > 0 else features["in_amount"]
        features["avg_out_amount"] = features["out_amount"] / out_txs if out_txs > 0 else features["out_amount"]
        lifespan = features["last_active"] - features["first_active"]
        features["lifespan"] = lifespan
        # Transactions per day (86400 s); with a zero lifespan the raw counts
        # are used instead.
        features["freq_in"] = in_txs / (lifespan / 86400) if lifespan > 0 else in_txs
        features["freq_out"] = out_txs / (lifespan / 86400) if lifespan > 0 else out_txs
        features["freq"] = features["freq_in"] + features["freq_out"]

    if not is_df:
        return account_features

    df_account_features = pd.DataFrame(list(account_features.values()), index=list(account_features.keys()))
    # errors='ignore': an empty input has no columns to drop.
    df_account_features = df_account_features.drop(['first_active', 'last_active'], axis=1, errors='ignore')

    return df_account_features

In [9]:
def extract_edge_features(df_tx):
    """Aggregate the transactions of every directed ``(From, To)`` pair.

    Args:
        df_tx: DataFrame with the columns ``From``, ``To``, ``Value`` and
            ``TimeStamp``.

    Returns:
        DataFrame with one row per ``(From, To)`` pair and the columns
        ``From``, ``To``, ``Sum_amount`` (sum of ``Value``), ``Txs`` (number of
        transactions), ``TimeSpan`` (max minus min timestamp in seconds, with
        0 replaced by 1) and ``Avg_amount`` (``Sum_amount / Txs``).
    """
    per_edge = df_tx.groupby(['From', 'To']).agg(
        Sum_amount=('Value', 'sum'),
        Txs=('TimeStamp', 'count'),
        Time_min=('TimeStamp', 'min'),
        Time_max=('TimeStamp', 'max'),
    ).reset_index()

    # A pair with a single timestamp has span 0; it is stored as 1 second.
    time_span = (per_edge['Time_max'] - per_edge['Time_min']).replace(0, 1)

    # assign() builds a new frame, so no column is written onto a slice
    # (the previous in-place column assignment could raise
    # SettingWithCopyWarning). The former rename of a 'Value' column was a
    # no-op and has been removed.
    edge_features = per_edge[['From', 'To', 'Sum_amount', 'Txs']].assign(
        TimeSpan=time_span,
        Avg_amount=per_edge['Sum_amount'] / per_edge['Txs'],
    )

    return edge_features

## 图构建

### 静态图

In [91]:
class txGraph(Data):
    """Static transaction graph with node features and link-prediction splits.

    Attributes set by the constructor:
        edge_index: unique ``(From, To)`` pairs of ``tx`` as a ``2 x E`` tensor.
        train_edge_pos_index / train_edge_neg_index /
        test_edge_pos_index / test_edge_neg_index: the four edge lists of
            ``train_test_edges`` as index tensors.
        time: the ``TimeStamp`` column of ``tx`` (one entry per transaction
            row, not per unique edge).
        x: node feature matrix taken from ``feature.values``.
        num_nodes: number of rows of ``x``.
    """

    def __init__(self, tx, feature, train_test_edges, seed=seed):
        """Build the graph tensors.

        Args:
            tx: transaction DataFrame with ``From``, ``To`` and ``TimeStamp``.
            feature: DataFrame of node features; its index holds the node ids
                and its row order defines the node numbering.
            train_test_edges: dict with the keys ``train_edges_pos``,
                ``train_edges_false``, ``test_edges_pos``, ``test_edges_false``.
            seed: not used by the constructor body.
        """
        super().__init__()

        # Node id -> row position in the feature table.
        node_ids = feature.index.tolist()
        nodes_dict = dict(zip(node_ids, range(len(node_ids))))

        # Duplicate transactions between the same ordered pair collapse here.
        unique_pairs = set(zip(tx["From"].tolist(), tx["To"].tolist()))
        graph_edges = self.edge_idx_map(nodes_dict, unique_pairs)
        pos_train = self.edge_idx_map(nodes_dict, train_test_edges['train_edges_pos'])
        neg_train = self.edge_idx_map(nodes_dict, train_test_edges['train_edges_false'])
        pos_test = self.edge_idx_map(nodes_dict, train_test_edges['test_edges_pos'])
        neg_test = self.edge_idx_map(nodes_dict, train_test_edges['test_edges_false'])

        # Lists of (source, target) pairs are transposed into 2 x E tensors.
        self.edge_index = torch.tensor(graph_edges, dtype=torch.long).t()
        self.train_edge_pos_index = torch.tensor(pos_train, dtype=torch.long).t()
        self.train_edge_neg_index = torch.tensor(neg_train, dtype=torch.long).t()
        self.test_edge_pos_index = torch.tensor(pos_test, dtype=torch.long).t()
        self.test_edge_neg_index = torch.tensor(neg_test, dtype=torch.long).t()

        self.time = torch.tensor(tx["TimeStamp"].tolist(), dtype=torch.long)

        self.x = torch.tensor(feature.values)  # node features
        self.num_nodes = self.x.shape[0]  # number of nodes

    def edge_idx_map(self, nodes_dict, edges):
        """Translate ``(u, v)`` node-id pairs into ``(index, index)`` pairs via ``nodes_dict``."""
        return [(nodes_dict[edge[0]], nodes_dict[edge[1]]) for edge in edges]


In [92]:
# Build the static graph from the embedding transactions.
# NOTE(review): `df_emb_account_features_normalized` is not defined in any
# cell visible here - presumably a normalised result of
# extract_account_features(df_emb); confirm it exists on a fresh kernel.
data = txGraph(df_emb, df_emb_account_features_normalized, train_test_edges)

# Persist the graph object for the training code.
graph_path = '..\\code_tracking_eth\\graphs\\'
with open(graph_path + f'LPsubG{graph_id}_static_graph_with_node_features_{emb_ratio}.pkl', 'wb') as f:
    pickle.dump(data, f)
    
# Read a previously stored static graph instead of rebuilding it:
# with open(data_path + 'LPsubG3_static_graph_with_node_features.pkl', 'rb') as f:
#     data = pickle.load(f)

In [93]:
# Size summary of the static graph object.
print("Num of nodes:", data.num_nodes)
print("Dimension of node features:", data.x.size(1))
print("Num of edges:", data.edge_index.size(1))
print("Num of train edges for LP:", data.train_edge_pos_index.size(1))
print("Num of test edges for LP:", data.test_edge_pos_index.size(1))

Num of nodes: 4767
Dimension of node features: 12
Num of edges: 8210
Num of train edges for LP: 587
Num of test edges for LP: 587


### 动态图

In [10]:
# 设置时间步数量
temporal_steps = 8
# df_emb.drop(0, inplace = True) # 针对EthereumG1

overlap_ratio = 0.1

In [11]:
def split_df_by_temporal_steps_with_overlap(df, temporal_steps=temporal_steps, overlap_ratio=overlap_ratio):
    """Cut ``df`` into ``temporal_steps`` equally long, overlapping time windows.

    The window length ``W`` and stride ``S = W * (1 - overlap_ratio)`` are
    chosen so that the windows cover the whole range of ``TimeStamp``:
    ``(temporal_steps - 1) * S + W == max - min``.

    Args:
        df: DataFrame with a ``TimeStamp`` column.
        temporal_steps: number of windows.
        overlap_ratio: share of a window shared with the following window.

    Returns:
        List of ``temporal_steps`` DataFrames; window ``i`` holds the rows
        whose timestamp lies in ``[start_i, end_i]`` (both ends inclusive).
        The size of every window is printed.
    """
    # Total time range covered by the data, in seconds.
    start_time = df['TimeStamp'].min()
    end_time = df['TimeStamp'].max()
    time_span = end_time - start_time

    # Window length W and stride S.
    window_size = time_span / (temporal_steps * (1 - overlap_ratio) + overlap_ratio)
    step_size = window_size * (1 - overlap_ratio)

    timestamps = df['TimeStamp']
    snapshots = []

    for i in range(temporal_steps):
        # Starts are computed from i directly rather than accumulated, which
        # keeps the rounding error from growing with every step.
        window_start = start_time + i * step_size
        if i == temporal_steps - 1:
            # Pin the last window to the maximum timestamp: a float result
            # slightly below it would drop the final transactions.
            window_end = end_time
        else:
            window_end = window_start + window_size

        snapshot = df[(timestamps >= window_start) & (timestamps <= window_end)]
        snapshots.append(snapshot)

    # Report the size of every window.
    for i, sub_df in enumerate(snapshots):
        print(f"TimeStep {i} size: {sub_df.shape[0]} rows")

    return snapshots

In [12]:
def split_df_by_temporal_steps(df, temporal_steps=temporal_steps):
    """Cut ``df`` into ``temporal_steps`` equally long, non-overlapping time bins.

    Args:
        df: DataFrame with a ``TimeStamp`` column. It is not modified.
        temporal_steps: number of bins between the smallest and the largest
            timestamp.

    Returns:
        List of ``temporal_steps`` DataFrames. Each one carries an additional
        ``TimeStep`` column with its bin number. The size of every bin is
        printed.
    """
    min_timestamp = df['TimeStamp'].min()
    max_timestamp = df['TimeStamp'].max()

    if min_timestamp == max_timestamp:
        # pd.cut rejects duplicate bin edges; with a single distinct
        # timestamp every row belongs to the first bin.
        time_step = pd.Series(0, index=df.index)
    else:
        # Equally spaced bin edges; include_lowest keeps the minimum itself.
        time_bins = np.linspace(min_timestamp, max_timestamp, temporal_steps + 1)
        time_step = pd.cut(df['TimeStamp'], bins=time_bins, labels=False, include_lowest=True)

    # assign() works on a copy, so the caller's frame does not silently gain
    # a 'TimeStep' column.
    labelled = df.assign(TimeStep=time_step)

    df_list = [labelled[labelled['TimeStep'] == i] for i in range(temporal_steps)]

    # Report the size of every bin.
    for i, sub_df in enumerate(df_list):
        print(f"TimeStep {i} size: {sub_df.shape[0]} rows")

    return df_list

In [13]:
# Alternative without overlap between the time windows:
# df_emb_list = split_df_by_temporal_steps(df_emb.copy())
# Split the embedding transactions into overlapping time windows; a copy of
# df_emb is passed in.
df_emb_list = split_df_by_temporal_steps_with_overlap(df_emb.copy())

TimeStep 0 size: 75100 rows
TimeStep 1 size: 78082 rows
TimeStep 2 size: 75503 rows
TimeStep 3 size: 118812 rows
TimeStep 4 size: 170545 rows
TimeStep 5 size: 177593 rows
TimeStep 6 size: 213917 rows
TimeStep 7 size: 133541 rows


In [14]:
def extract_node_features_for_temporal_steps(df_list, extract_account_features, all_nodes=None):
    """Build one standardised node-feature table per time step.

    Args:
        df_list: list of transaction DataFrames (one per time step) with the
            columns ``From`` and ``To``.
        extract_account_features: callable invoked as
            ``extract_account_features(sub_df, False)``; it must return a
            ``{node: {feature name: value}}`` dict.
        all_nodes: iterable of all node ids. Its iteration order defines the
            row order of every returned table. When None, the nodes are
            collected from ``df_list``.

    Returns:
        List with one float DataFrame per time step, indexed by ``all_nodes``.
        The ``first_active`` / ``last_active`` features are removed. Rows of
        nodes that are active in the step are z-scored per column over the
        active nodes (population std, ``ddof=0``); a column with zero spread
        becomes 0 instead of NaN. Rows of inactive nodes are all 0. A step
        without any transaction yields an all-zero table.

    Raises:
        KeyError: if a step contains a node that is missing from ``all_nodes``.
    """
    # Collect the node set from the data when the caller did not pass one.
    if all_nodes is None:
        all_nodes = set()
        for sub_df in df_list:
            all_nodes.update(set(sub_df['From']).union(set(sub_df['To'])))

    node_order = list(dict.fromkeys(all_nodes))
    known_nodes = set(node_order)

    # First pass: raw statistics per step. Column names are taken from the
    # first non-empty step so empty steps can be filled with zeros.
    raw_tables = []
    feature_columns = None
    for sub_df in df_list:
        node_features = extract_account_features(sub_df, False)
        if not node_features:
            raw_tables.append(None)
            continue

        unknown = [node for node in node_features if node not in known_nodes]
        if unknown:
            raise KeyError(f"nodes missing from all_nodes: {unknown[:5]}")

        raw = pd.DataFrame(list(node_features.values()), index=list(node_features.keys()))
        raw = raw.drop(['first_active', 'last_active'], axis=1, errors='ignore')
        # Work in float from the start: the z-scores are floats and must not
        # be written into integer columns.
        raw = raw.astype(float)
        if feature_columns is None:
            feature_columns = list(raw.columns)
        raw_tables.append(raw)

    if feature_columns is None:
        feature_columns = []

    # Second pass: standardise the active nodes, zero-fill everything else.
    df_features_list = []
    for raw in raw_tables:
        if raw is None:
            df_features = pd.DataFrame(0.0, index=node_order, columns=feature_columns)
        else:
            # A constant column has std 0; dividing by 1 keeps it at 0.
            spread = raw.std(ddof=0).replace(0, 1.0)
            normalized = (raw - raw.mean()) / spread
            df_features = normalized.reindex(index=node_order, columns=feature_columns, fill_value=0.0)
        df_features_list.append(df_features)

    return df_features_list

In [15]:
# All nodes of the embedding data. The iteration order of this set is reused
# as the row order of every per-step feature table.
all_nodes = set(df_emb['From']).union(set(df_emb['To'])) 
# One standardised node-feature table per time window.
df_emb_account_features_list = extract_node_features_for_temporal_steps(df_emb_list, extract_account_features, all_nodes)

In [16]:
# Show the node-feature table of the first time window.
df_emb_account_features_list[0]

Unnamed: 0,in_txs,in_amount,out_txs,out_amount,total_txs,balance,avg_in_amount,avg_out_amount,lifespan,freq_in,freq_out,freq
1,48.993363,32.259604,2.206775,57.478909,27.267396,-6.179811e+00,-0.045635,0.347977,2.790806,0.983073,-0.153021,0.111685
4,-0.034835,-0.040161,-0.039829,-0.061549,-0.051371,1.257447e-07,-0.099136,-0.079331,-0.273542,-0.211888,-0.177015,-0.214598
5,-0.070031,0.008434,-0.047111,0.012926,-0.075722,1.024562e-07,0.255882,0.270840,-0.654231,0.664688,-0.053546,0.120802
6,0.006227,-0.017467,-0.017981,-0.026769,-0.011800,1.965016e-07,-0.078319,-0.063163,-0.050789,-0.209972,-0.176035,-0.213220
7,-0.081763,-0.041259,-0.036187,0.089233,-0.072678,-1.172000e-01,-0.101108,0.094629,-0.606888,-0.222291,-0.166600,-0.207800
...,...,...,...,...,...,...,...,...,...,...,...,...
103911,-0.081763,-0.041259,-0.047111,-0.063213,-0.081809,-1.453902e-05,-0.101108,-0.081842,-0.655351,-0.222291,-0.175963,-0.216294
103912,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
103913,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
103914,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [19]:
class tempTxGraph(Data):
    """Sequence of graph snapshots with node features and link-prediction splits.

    Attributes set by the constructor:
        temporal_steps: number of snapshots (``len(tx_list)``).
        edge_indexs: list with one ``2 x E_t`` index tensor per snapshot,
            built from the unique ``(From, To)`` pairs of that snapshot.
        train_edge_pos_index / train_edge_neg_index /
        test_edge_pos_index / test_edge_neg_index: the four edge lists of
            ``train_test_edges`` as index tensors.
        x: list with one node-feature tensor per entry of ``feature_list``.
    """

    def __init__(self, tx_list, feature_list, all_nodes, train_test_edges, seed=seed):
        """Build the snapshot tensors.

        Args:
            tx_list: list of transaction DataFrames with ``From`` and ``To``.
            feature_list: list of node-feature DataFrames.
            all_nodes: iterable of node ids; its iteration order defines the
                node numbering.
            train_test_edges: dict with the keys ``train_edges_pos``,
                ``train_edges_false``, ``test_edges_pos``, ``test_edges_false``.
            seed: not used by the constructor body.
        """
        super().__init__()

        self.temporal_steps = len(tx_list)

        # Node id -> position in the iteration order of all_nodes.
        nodes_dict = {node: position for position, node in enumerate(all_nodes)}

        # Edge index of every snapshot (duplicate pairs collapse in the set).
        self.edge_indexs = [
            torch.tensor(
                self.edge_idx_map(nodes_dict, set(zip(tx["From"].tolist(), tx["To"].tolist()))),
                dtype=torch.long,
            ).t()
            for tx in tx_list
        ]

        # Link-prediction edge lists, shared by all snapshots.
        pos_train = self.edge_idx_map(nodes_dict, train_test_edges['train_edges_pos'])
        neg_train = self.edge_idx_map(nodes_dict, train_test_edges['train_edges_false'])
        pos_test = self.edge_idx_map(nodes_dict, train_test_edges['test_edges_pos'])
        neg_test = self.edge_idx_map(nodes_dict, train_test_edges['test_edges_false'])
        self.train_edge_pos_index = torch.tensor(pos_train, dtype=torch.long).t()
        self.train_edge_neg_index = torch.tensor(neg_train, dtype=torch.long).t()
        self.test_edge_pos_index = torch.tensor(pos_test, dtype=torch.long).t()
        self.test_edge_neg_index = torch.tensor(neg_test, dtype=torch.long).t()

        # Node features of every snapshot.
        self.x = [torch.tensor(feature.values) for feature in feature_list]

    def edge_idx_map(self, nodes_dict, edges):
        """Translate ``(u, v)`` node-id pairs into ``(index, index)`` pairs via ``nodes_dict``."""
        return [(nodes_dict[edge[0]], nodes_dict[edge[1]]) for edge in edges]

In [23]:
# Build the temporal graph from the snapshots and their node features.
data = tempTxGraph(df_emb_list, df_emb_account_features_list, all_nodes, train_test_edges)

graph_path = 'graphs\\'

# Write: persist the graph built above.
# with open(graph_path + f'LPsubG{graph_id}_temp_graph_{data.temporal_steps}_overlap_{overlap_ratio}_with_node_features_{emb_ratio}.pkl', 'wb') as f:
#     pickle.dump(data, f)
    
    
# Read: load the stored graph.
# NOTE(review): this replaces the `data` object built at the top of the cell,
# so that construction has no effect beyond supplying data.temporal_steps for
# the file name - confirm this is intended.
with open(graph_path + f'LPsubG{graph_id}_temp_graph_{data.temporal_steps}_overlap_{overlap_ratio}_with_node_features_{emb_ratio}.pkl', 'rb') as f:
    data = pickle.load(f)

In [24]:
# Size summary of the temporal graph, one list entry per snapshot.
print("Num of temporal steps:", data.temporal_steps)
print("Num of nodes:", [node_matrix.size(0) for node_matrix in data.x])
print("Dimension of node features:", [node_matrix.size(1) for node_matrix in data.x])
print("Num of edges:", [snapshot_edges.size(1) for snapshot_edges in data.edge_indexs])
print("Num of train edges for LP:", data.train_edge_pos_index.size(1))
print("Num of test edges for LP:", data.test_edge_pos_index.size(1))

Num of temporal steps: 8
Num of nodes: [37654, 37654, 37654, 37654, 37654, 37654, 37654, 37654]
Dimension of node features: [12, 12, 12, 12, 12, 12, 12, 12]
Num of edges: [8023, 3166, 2650, 5685, 10445, 12667, 19076, 5470]
Num of train edges for LP: 2338
Num of test edges for LP: 2339


动态边特征:

In [25]:
def _edge_feature_lookup(df_features):
    """Map every ``(From, To)`` pair of ``df_features`` to its feature row (columns 3 onwards)."""
    values = df_features.iloc[:, 2:].to_numpy(dtype=float)
    keys = zip(df_features['From'].tolist(), df_features['To'].tolist())
    lookup = {}
    for key, row in zip(keys, values):
        # Keep the first row of a pair, like the former `.iloc[0]` selection.
        lookup.setdefault(key, row)
    return lookup, values.shape[1]


def _gather_edge_features(lookup, num_features, edges):
    """Return a ``len(edges) x num_features`` array; edges missing from ``lookup`` stay all zero."""
    matrix = np.zeros((len(edges), num_features))
    for position, edge in enumerate(edges):
        from_node, to_node = edge
        row = lookup.get((from_node, to_node))
        if row is not None:
            matrix[position] = row
    return matrix


def extract_edge_features_for_temporal_steps(df_list, extract_edge_features, train_test_edges):
    """Collect the per-step features of all train and test edges.

    Args:
        df_list: list of transaction DataFrames, one per time step.
        extract_edge_features: callable returning, for one DataFrame, a table
            whose first two columns are ``From`` and ``To`` and whose other
            columns are the edge features.
        train_test_edges: dict with the keys ``train_edges_pos``,
            ``train_edges_false``, ``test_edges_pos``, ``test_edges_false``.

    Returns:
        ``(train_features, train_labels, test_features, test_labels)``.
        The feature arrays have the shape ``(steps, edges, features)`` with
        positive edges first and negative edges after them; an edge that has
        no transaction in a step (in the given direction) gets a zero vector.
        The label arrays have the shape ``(edges, 1)`` with 1 for positive and
        0 for negative edges. The four shapes are printed.
    """
    train_edges_pos = train_test_edges['train_edges_pos']
    train_edges_neg = train_test_edges['train_edges_false']
    test_edges_pos = train_test_edges['test_edges_pos']
    test_edges_neg = train_test_edges['test_edges_false']

    train_edges = list(train_edges_pos) + list(train_edges_neg)
    test_edges = list(test_edges_pos) + list(test_edges_neg)

    train_features = []
    test_features = []

    for df in df_list:
        # One dictionary per step replaces a full-table boolean scan per edge.
        lookup, num_features = _edge_feature_lookup(extract_edge_features(df))
        train_features.append(_gather_edge_features(lookup, num_features, train_edges))
        test_features.append(_gather_edge_features(lookup, num_features, test_edges))

    train_features = np.array(train_features)
    train_labels = np.concatenate([np.ones((len(train_edges_pos), 1)), np.zeros((len(train_edges_neg), 1))], axis=0)
    test_features = np.array(test_features)
    test_labels = np.concatenate([np.ones((len(test_edges_pos), 1)), np.zeros((len(test_edges_neg), 1))], axis=0)

    print("Shape of Train Features:", train_features.shape)
    print("Shape of Train Labels:", train_labels.shape)
    print("Shape of Test Features:", test_features.shape)
    print("Shape of Test Labels:", test_labels.shape)

    return train_features, train_labels, test_features, test_labels

In [26]:
# Per-step edge features and labels of the train and test edges, taken from
# the embedding snapshots.
train_edges_features, train_edges_labels, test_edges_features, test_edges_labels = extract_edge_features_for_temporal_steps(df_emb_list, extract_edge_features, train_test_edges)

Shape of Train Features: (8, 4676, 4)
Shape of Train Labels: (4676, 1)
Shape of Test Features: (8, 4678, 4)
Shape of Test Labels: (4678, 1)


In [27]:
def normalize(features):
    """Standardise edge features in place, separately for every time step.

    Args:
        features: float array of shape ``(time steps, edges, feature dims)``.
            It is modified in place; with an integer array the standardised
            values would be truncated on assignment.

    Within one time step only the rows that contain at least one non-zero
    value are used and changed. Each feature column of these rows is shifted
    to mean 0 and scaled to standard deviation 1 (population std, ``ddof=0``);
    a column without spread becomes 0. All-zero rows (edges without
    transactions in that step) stay zero.

    Returns None.
    """
    num_timesteps, num_edges, num_features = features.shape

    for t in range(num_timesteps):
        features_t = features[t]  # view: writes go straight into `features`

        # Rows that are entirely zero mark absent edges and are left out.
        nonzero_mask = np.any(features_t != 0, axis=1)
        if not nonzero_mask.any():
            continue
        nonzero_features = features_t[nonzero_mask]

        # Statistics per feature column (axis=0). A single mean/std over all
        # entries would mix features of very different scales.
        mean = nonzero_features.mean(axis=0)
        std = nonzero_features.std(axis=0, ddof=0)
        std = np.where(std == 0, 1.0, std)  # constant column -> 0, not NaN

        features_t[nonzero_mask] = (nonzero_features - mean) / std

In [28]:
# Standardise both feature arrays in place. Each call uses only the statistics
# of the array it receives.
normalize(train_edges_features)
normalize(test_edges_features)

In [30]:
# Write

# np.save(data_path + f'LPsubG{graph_id}_temp_{temporal_steps}_overlap_{overlap_ratio}_train_edges_features_{emb_ratio}.npy', train_edges_features)
# np.save(data_path + f'LPsubG{graph_id}_temp_{temporal_steps}_overlap_{overlap_ratio}_train_edges_labels_{emb_ratio}.npy', train_edges_labels)
# np.save(data_path + f'LPsubG{graph_id}_temp_{temporal_steps}_overlap_{overlap_ratio}_test_edges_features_{emb_ratio}.npy', test_edges_features)
# np.save(data_path + f'LPsubG{graph_id}_temp_{temporal_steps}_overlap_{overlap_ratio}_test_edges_labels_{emb_ratio}.npy', test_edges_labels)

# Read
# NOTE(review): these file names have no `_overlap_{overlap_ratio}` part,
# unlike the names in the write section above, so different files are loaded.
# The loaded arrays also replace the ones computed earlier in the kernel -
# confirm which files are intended.
train_edges_features = np.load(data_path + f'LPsubG{graph_id}_temp_{temporal_steps}_train_edges_features_{emb_ratio}.npy')
train_edges_labels = np.load(data_path + f'LPsubG{graph_id}_temp_{temporal_steps}_train_edges_labels_{emb_ratio}.npy')
test_edges_features = np.load(data_path + f'LPsubG{graph_id}_temp_{temporal_steps}_test_edges_features_{emb_ratio}.npy')
test_edges_labels = np.load(data_path + f'LPsubG{graph_id}_temp_{temporal_steps}_test_edges_labels_{emb_ratio}.npy')