In [5]:
import torch
import torch_geometric
# Load the three pre-saved dataset objects from the working directory.
# torch.load unpickles the file contents, so only load files from a trusted source.
# Later cells read .edge_index, .num_nodes and the pos/neg edge label indices
# from these objects (presumably link-prediction splits of one graph — confirm).
train_data, val_data, test_data = (
    torch.load('train_data.pt'),
    torch.load('val_data.pt'),
    torch.load('test_data.pt'),
)

In [6]:
from torch_geometric.utils import to_dense_adj, degree

# Convert edge_index into a dense adjacency matrix and compute per-node degrees.
# Both are derived from test_data.edge_index; [0] selects the first matrix
# of the batched result returned by to_dense_adj.
adj_matrix = to_dense_adj(
    test_data.edge_index,
    max_num_nodes=test_data.num_nodes,
)[0]
node_degree = degree(
    test_data.edge_index[0],
    num_nodes=test_data.num_nodes,
    dtype=torch.float,
)

# Similarity-index computation for every pair of nodes.
def compute_similarity_scores(adj_matrix, node_degree):
    """Compute five neighbourhood-based similarity scores for every node pair.

    Args:
        adj_matrix: Square dense adjacency matrix (N x N), floating dtype.
        node_degree: Length-N tensor of node degrees. It must have the same
            floating dtype as ``adj_matrix`` because the two are multiplied
            together before a matmul.

    Returns:
        Tuple ``(cn, aa, ra, pa, ja)`` of N x N matrices:
        common neighbours, Adamic-Adar, resource allocation,
        preferential attachment and Jaccard coefficient.
    """
    # (A @ A)[i, j] counts the neighbours shared by i and j. The same product
    # is the Jaccard numerator, so it is computed once and reused.
    cn_matrix = torch.matmul(adj_matrix, adj_matrix)

    zeros = torch.zeros_like(node_degree)
    ones = torch.ones_like(node_degree)

    # Adamic-Adar weights each shared neighbour k by 1 / log(deg_k).
    # log(1) == 0, so nodes with degree <= 1 get weight 0 explicitly instead of
    # relying on an epsilon plus an isinf() patch, which only produced 0 when
    # float32 rounding swallowed the epsilon (float64 yielded ~1e9 weights).
    log_defined = node_degree > 1
    log_deg = torch.log(torch.where(log_defined, node_degree, 2 * ones))
    inv_log_deg = torch.where(log_defined, 1 / log_deg, zeros)
    # unsqueeze(0) scales column k of A by the weight of node k.
    aa_matrix = torch.matmul(adj_matrix * inv_log_deg.unsqueeze(0), adj_matrix)

    # Resource allocation weights each shared neighbour k by 1 / deg_k;
    # degree-0 nodes get weight 0 rather than a huge 1 / epsilon value.
    has_neighbours = node_degree > 0
    inv_deg = torch.where(
        has_neighbours,
        1 / torch.where(has_neighbours, node_degree, ones),
        zeros,
    )
    ra_matrix = torch.matmul(adj_matrix * inv_deg.unsqueeze(0), adj_matrix)

    # Preferential attachment: deg_i * deg_j (torch.outer replaces the
    # deprecated torch.ger alias).
    pa_matrix = torch.outer(node_degree, node_degree)

    # Jaccard: |N(i) & N(j)| / |N(i) | N(j)|, with the union obtained by
    # inclusion-exclusion on the adjacency row sums. The epsilon keeps pairs of
    # isolated nodes (0 / 0) at 0 instead of NaN.
    neighbour_counts = adj_matrix.sum(dim=1)
    union = neighbour_counts.unsqueeze(1) + neighbour_counts.unsqueeze(0) - cn_matrix
    ja_matrix = cn_matrix / (union + 1e-9)

    return cn_matrix, aa_matrix, ra_matrix, pa_matrix, ja_matrix

# Score every node pair once, using the adjacency matrix and degrees built above.
(
    cn_matrix,
    aa_matrix,
    ra_matrix,
    pa_matrix,
    ja_matrix,
) = compute_similarity_scores(adj_matrix, node_degree)

In [7]:
def extract_features_labels(data, matrices):
    """Look up per-edge similarity features and build matching binary labels.

    Args:
        data: Object exposing ``pos_edge_label_index`` and
            ``neg_edge_label_index``; row 0 and row 1 of each hold the two
            endpoint indices of every edge.
        matrices: Iterable of 2-D similarity matrices indexed by (node, node).

    Returns:
        ``(features, labels)``. ``features`` has one row per edge (positive
        edges first, then negative edges) and one column per matrix.
        ``labels`` is 1 for positive edges and 0 for negative edges.
    """
    positives = data.pos_edge_label_index
    negatives = data.neg_edge_label_index
    pos_src, pos_dst = positives[0], positives[1]
    neg_src, neg_dst = negatives[0], negatives[1]

    # One column per similarity index: positive-edge scores followed by negative-edge scores.
    columns = [
        torch.cat([scores[pos_src, pos_dst], scores[neg_src, neg_dst]], dim=0)
        for scores in matrices
    ]

    # Stack so that the feature dimension is the second axis.
    features = torch.stack(columns, dim=1)

    # Labels follow the same positive-then-negative ordering as the rows.
    positive_labels = torch.ones(positives.size(1))
    negative_labels = torch.zeros(negatives.size(1))
    labels = torch.cat([positive_labels, negative_labels], dim=0)

    return features, labels

In [8]:
# NOTE(review): all three splits are scored with the same similarity matrices,
# which an earlier cell builds from test_data.edge_index — confirm that graph
# does not contain the validation/test positive edges being predicted.
similarity_matrices = (cn_matrix, aa_matrix, ra_matrix, pa_matrix, ja_matrix)

train_features, train_labels = extract_features_labels(train_data, similarity_matrices)
val_features, val_labels = extract_features_labels(val_data, similarity_matrices)
test_features, test_labels = extract_features_labels(test_data, similarity_matrices)

# scikit-learn works on NumPy arrays, so convert each tensor.
X_train = train_features.numpy()
y_train = train_labels.numpy()
X_val = val_features.numpy()
y_val = val_labels.numpy()
X_test = test_features.numpy()
y_test = test_labels.numpy()

In [10]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

def _build_model(model_name):
    """Return the unfitted estimator registered under ``model_name``."""
    if model_name == 'logistic_regression':
        return make_pipeline(
            StandardScaler(),
            LogisticRegression(solver='liblinear', class_weight='balanced', random_state=42)
        )
    if model_name == 'linear_svc':
        return make_pipeline(
            StandardScaler(),
            SVC(kernel='linear', probability=True, class_weight='balanced', random_state=42)
        )
    if model_name == 'random_forest':
        return RandomForestClassifier(
            n_estimators=500, class_weight='balanced', random_state=42
        )
    if model_name == 'mlp':
        return make_pipeline(
            StandardScaler(),
            MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
        )
    raise ValueError("Model not supported")


def _make_positive_scorer(model, X_reference):
    """Return a callable mapping a feature matrix to positive-class scores.

    Models exposing ``predict_proba`` are scored with it directly. Otherwise
    ``decision_function`` output is min-max scaled using the range observed on
    ``X_reference`` only, so every dataset scored afterwards shares one scale
    and a threshold tuned on one of them stays meaningful on the others.
    """
    if hasattr(model, 'predict_proba'):
        return lambda X: model.predict_proba(X)[:, 1]

    reference_scores = model.decision_function(X_reference)
    low = reference_scores.min()
    span = reference_scores.max() - low
    if span == 0:
        # Constant scores: avoid dividing by zero; every sample then maps to 0.
        span = 1.0
    return lambda X: (model.decision_function(X) - low) / span


def train_evaluate_model(model_name, X_train, y_train, X_val, y_val, X_test, y_test):
    """Fit a classifier, tune its decision threshold on validation data, and report test metrics.

    Args:
        model_name: One of 'logistic_regression', 'linear_svc',
            'random_forest' or 'mlp'.
        X_train, y_train: Training features and binary labels.
        X_val, y_val: Validation features and labels, used only to pick the
            threshold that maximises F1.
        X_test, y_test: Test features and labels, used for the reported metrics.

    Returns:
        Dict with keys 'model', 'threshold', 'accuracy', 'precision',
        'recall', 'f1' and 'roc_auc'. The same values are printed.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    model = _build_model(model_name)

    # Train the model.
    model.fit(X_train, y_train)

    # One scorer for both splits keeps validation and test scores on the same scale.
    positive_score = _make_positive_scorer(model, X_val)
    y_val_proba = positive_score(X_val)

    # Search for the threshold with the best validation F1. The first threshold
    # reaching the maximum wins, and 0.5 is kept if no threshold scores above 0.
    best_threshold = 0.5
    best_f1 = 0
    for th in np.linspace(0, 1, 100):
        y_val_pred = (y_val_proba >= th).astype(int)
        # zero_division=0 returns the same 0 as the default but without a
        # warning for thresholds that predict no positives.
        current_f1 = f1_score(y_val, y_val_pred, zero_division=0)
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_threshold = th

    # Apply the tuned threshold to the test set.
    y_test_proba = positive_score(X_test)
    y_test_pred = (y_test_proba >= best_threshold).astype(int)

    # Compute the metrics.
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, zero_division=0)
    recall = recall_score(y_test, y_test_pred, zero_division=0)
    f1 = f1_score(y_test, y_test_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_test_proba)

    print(f"Model: {model_name}")
    print(f"Optimal Threshold: {best_threshold:.2f}")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, ROC-AUC: {roc_auc:.4f}\n")

    return {
        'model': model_name,
        'threshold': best_threshold,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
    }

# Evaluate every supported model; the validation set is passed in so each
# model's decision threshold can be tuned on it.
model_names = [
    'logistic_regression',
    'linear_svc',
    'random_forest',
    'mlp',
]
for model_name in model_names:
    train_evaluate_model(
        model_name,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        X_test=X_test,
        y_test=y_test,
    )

Model: logistic_regression
Optimal Threshold: 0.29
Accuracy: 0.6704, Precision: 0.8230, Recall: 0.4342, F1: 0.5685, ROC-AUC: 0.6871

Model: linear_svc
Optimal Threshold: 0.29
Accuracy: 0.6712, Precision: 0.8040, Recall: 0.4527, F1: 0.5792, ROC-AUC: 0.6871

Model: random_forest
Optimal Threshold: 0.20
Accuracy: 0.6658, Precision: 0.7643, Recall: 0.4795, F1: 0.5893, ROC-AUC: 0.6840

Model: mlp
Optimal Threshold: 0.17
Accuracy: 0.6667, Precision: 0.7643, Recall: 0.4820, F1: 0.5911, ROC-AUC: 0.6867

