In [65]:
import torch
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GINConv,global_mean_pool
from torch.nn import Linear
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

In [66]:
# 加载数据
node_csv_path = r"D:\ttt\a\node_total.csv"
links_csv_path = r"D:\ttt\a\link_total.csv"
labels_csv_path = r"D:\ttt\a\label_total.csv"

In [67]:
node_df = pd.read_csv(node_csv_path)
links_df = pd.read_csv(links_csv_path)
labels_df = pd.read_csv(labels_csv_path)

In [68]:
graph_data_list = []

In [69]:
# 遍历每个会话
for session_id in tqdm(node_df['sessionId'].unique()):
    # 节点特征
    session_nodes = node_df[node_df['sessionId'] == session_id]
    node_features = session_nodes[['degree', 'cluster','efficiency']].values
    # 对节点特征进行标准化
    scaler = StandardScaler()
    node_features_normalized = scaler.fit_transform(node_features)
    # 将标准化后的特征转换为 PyTorch 张量
    node_features_tensor = torch.tensor(node_features_normalized, dtype=torch.float)
    
    #边索引
    session_edges = links_df[links_df['sessionId'] == session_id]
    edge_index = torch.tensor([session_edges['source'].values - 1, 
                               session_edges['target'].values - 1], 
                              dtype=torch.long)
    
    #标签
    label = labels_df[labels_df['sessionId'] == session_id]['label'].iloc[0]
    label_tensor = torch.tensor([label], dtype=torch.float)

    data = Data(x=node_features_tensor, edge_index=edge_index, y=label_tensor)
    graph_data_list.append(data)

100%|██████████| 680/680 [00:00<00:00, 693.46it/s]


In [70]:
for i in range(680):
    print(graph_data_list[i])

Data(x=[24, 3], edge_index=[2, 25], y=[1])
Data(x=[24, 3], edge_index=[2, 48], y=[1])
Data(x=[24, 3], edge_index=[2, 25], y=[1])
Data(x=[24, 3], edge_index=[2, 44], y=[1])
Data(x=[24, 3], edge_index=[2, 20], y=[1])
Data(x=[24, 3], edge_index=[2, 41], y=[1])
Data(x=[24, 3], edge_index=[2, 60], y=[1])
Data(x=[24, 3], edge_index=[2, 20], y=[1])
Data(x=[24, 3], edge_index=[2, 63], y=[1])
Data(x=[24, 3], edge_index=[2, 40], y=[1])
Data(x=[24, 3], edge_index=[2, 51], y=[1])
Data(x=[24, 3], edge_index=[2, 58], y=[1])
Data(x=[24, 3], edge_index=[2, 52], y=[1])
Data(x=[24, 3], edge_index=[2, 32], y=[1])
Data(x=[24, 3], edge_index=[2, 12], y=[1])
Data(x=[24, 3], edge_index=[2, 28], y=[1])
Data(x=[24, 3], edge_index=[2, 40], y=[1])
Data(x=[24, 3], edge_index=[2, 92], y=[1])
Data(x=[24, 3], edge_index=[2, 19], y=[1])
Data(x=[24, 3], edge_index=[2, 27], y=[1])
Data(x=[24, 3], edge_index=[2, 8], y=[1])
Data(x=[24, 3], edge_index=[2, 20], y=[1])
Data(x=[24, 3], edge_index=[2, 34], y=[1])
Data(x=[24, 

In [71]:
class GraphBinaryClassificationNet(torch.nn.Module):
    def __init__(self):
        super(GraphBinaryClassificationNet, self).__init__()
        self.conv1 = GINConv(Linear(3, 128))
        self.conv2 = GINConv(Linear(128, 128))
        self.conv3 = GINConv(Linear(128, 256))
        self.lin1 = Linear(256, 128)
        self.lin2 = Linear(128, 64)
        self.lin3 = Linear(64, 1)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = F.relu(self.conv3(x, edge_index))
        x = global_mean_pool(x, batch)
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        x = torch.sigmoid(self.lin3(x)).squeeze(1)
        return x


In [72]:
def train(train_loader, model, optimizer, crit):
    model.train()
    loss_all = 0
    for data in train_loader:
        optimizer.zero_grad()
        output = model(data)
        label = data.y.float()  # Ensure labels are float
        loss = crit(output, label)
        loss.backward()
        loss_all += data.num_graphs * loss.item()
        optimizer.step()
    return loss_all / len(train_loader)

In [73]:
def evaluate(loader, model):
    model.eval()
    predictions, labels = [], []

    with torch.no_grad():
        for data in loader:
            pred = model(data)
            label = data.y
            predictions.extend(pred.numpy())
            labels.extend(label.numpy())

    predictions = np.array(predictions)
    labels = np.array(labels)
    return roc_auc_score(labels, predictions)

In [74]:
# Cross-validation
k = 5  # Number of folds for cross-validation
kf = KFold(n_splits=k, shuffle=True, random_state=42)
auc_scores = []

In [75]:
for train_index, test_index in kf.split(graph_data_list):
    train_data = [graph_data_list[i] for i in train_index]
    test_data = [graph_data_list[i] for i in test_index]

    train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

    # Initialize model, optimizer, and loss function
    model = GraphBinaryClassificationNet()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
    crit = torch.nn.BCELoss()

    # Train the model
    for epoch in range(100):
        train_loss = train(train_loader, model, optimizer, crit)

    # Evaluate the model
    auc_score = evaluate(test_loader, model)
    auc_scores.append(auc_score)
    print(f"Fold AUC Score: {auc_score}")



Fold AUC Score: 0.8340943683409436




Fold AUC Score: 0.8492321003677266




Fold AUC Score: 0.845446679645252




Fold AUC Score: 0.8524122807017543




Fold AUC Score: 0.8806067172264356


In [12]:
avg_auc_score = np.mean(auc_scores)
print("Average AUC Score:", avg_auc_score)

Average AUC Score: 0.8620538834196679
