使用PyG的内置数据集完成3个任务（节点分类、边分类、图分类）的代码实现

## 1.节点分类任务代码实现
Cora数据集是PyG内置的节点分类数据集，对应学术论文的主题分类问题（即把每一篇学术论文看成一个节点，论文之间的引用关系看成边）。Cora数据集有2708个节点，每个节点有1433维特征，原始引用边数为5429（PyG加载后的 `data.edge_index` 以双向形式存储，共10556条有向边）。标签是文献的主题，共计 7 个类别，所以这是一个7分类问题。

In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv

In [2]:
# Load the data: fetch Cora through PyG's Planetoid wrapper (stored under
# ./data/Cora) and take the graph at index 0.
dataset = Planetoid(name='Cora', root='./data/Cora')
data = dataset[0]
# Network architecture
class Net(torch.nn.Module):
    """Two-layer GCN producing log-probabilities over the dataset's classes.

    Layer sizes are read from the module-level ``dataset``:
    ``dataset.num_features`` -> 16 -> ``dataset.num_classes``.
    """

    def __init__(self):
        super().__init__()
        hidden_dim = 16  # number of hidden units between the two layers
        # Input width equals the node-feature dimension.
        self.conv1 = GCNConv(dataset.num_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, dataset.num_classes)

    def forward(self, x, edge_index):
        """Return log-softmax scores (taken along dim 1) for the inputs."""
        hidden = F.relu(self.conv1(x, edge_index))
        logits = self.conv2(hidden, edge_index)
        return F.log_softmax(logits, dim=1)

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


In [3]:
# Use the GPU when CUDA is available, otherwise the CPU, and place both the
# model and the graph data on that device.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = Net()
model = model.to(device)
data = data.to(device)
# Adam optimizer; weight_decay adds L2 regularisation on the parameters.
optimizer = torch.optim.Adam(model.parameters(), weight_decay=5e-4, lr=0.01)
# Model training: 200 full-batch steps. The model returns log-softmax scores,
# which is why the negative log-likelihood loss is used below.
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)    # forward pass over all the data: node features plus edge_index
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])   # the loss is computed only on the entries selected by train_mask
    loss.backward()
    optimizer.step()
# Evaluation: accuracy on the entries selected by data.test_mask.
model.eval()
# Inference never backpropagates, so disable autograd to avoid building a
# computation graph for the forward pass.
with torch.no_grad():
    test_predict = model(data.x, data.edge_index)[data.test_mask]
max_index = torch.argmax(test_predict, dim=1)
test_true = data.y[data.test_mask]
# Count the matches with one vectorised comparison instead of comparing
# tensor elements one by one in a Python loop.
correct = int((max_index == test_true).sum())
print('测试集准确率为：{}%'.format(correct*100/len(test_true)))

测试集准确率为：80.8%


## 2.边分类任务代码实现
同样是利用Cora数据集，只是这个时候我们关注的不再是节点的类别，而是节点之间的边，因此，在这里我们需要手动创建边标签的正例与负例：图中已有的边作为正例（标签为1），通过负采样得到的不存在的边作为负例（标签为0）。这是一个二分类问题。

In [5]:
from torch_geometric.utils import negative_sampling

# Edge classification model
class EdgeClassifier(torch.nn.Module):
    """Classify node pairs as edge / non-edge from GCN node embeddings.

    A single GCNConv layer embeds the nodes; a linear layer then maps the
    concatenated embeddings of a pair's two endpoints to 2 logits.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = GCNConv(in_channels, out_channels)
        # The classifier sees two endpoint embeddings side by side, hence the
        # input width of 2 * out_channels.
        self.classifier = torch.nn.Linear(2 * out_channels, 2)

    def forward(self, x, edge_index):
        """Return logits for the given edges followed by sampled non-edges.

        Rows are ordered as: every column of ``edge_index`` first, then the
        pairs returned by ``negative_sampling`` (as many are requested as
        there are given edges). Negatives are re-sampled on every call.
        """
        node_emb = F.relu(self.conv(x, edge_index))
        num_pos = edge_index.size(1)
        neg_edge_index = negative_sampling(edge_index, num_neg_samples=num_pos)
        candidates = torch.cat([edge_index, neg_edge_index], dim=1)
        src_emb = node_emb[candidates[0]]
        dst_emb = node_emb[candidates[1]]
        pair_features = torch.cat([src_emb, dst_emb], dim=1)
        return self.classifier(pair_features)

In [6]:
# Load the dataset.
# Use the same root as the node-classification cell so the copy of Cora that
# was already downloaded and processed there is reused; the previous root
# './data/Cora/raw' made PyG fetch and store a second copy.
dataset = Planetoid(root='./data/Cora', name='Cora')
data = dataset[0]

# Create train_mask and test_mask: a random 80% / 20% split over the columns
# of data.edge_index.
# NOTE(review): presumably edge_index stores each undirected edge in both
# directions, so (u, v) can be a training edge while (v, u) is a test edge —
# confirm whether that leakage matters for this demo.
edges = data.edge_index.t().cpu().numpy()
num_edges = len(edges)
train_size = int(0.8 * num_edges)
train_indices = torch.randperm(num_edges)[:train_size]
train_mask = torch.zeros(num_edges, dtype=torch.bool)
train_mask[train_indices] = True
# Every edge that is not a training edge is a test edge.
test_mask = ~train_mask

# Define the model and optimizer (the train / test helpers follow).
# 64 is the width of the node embeddings produced by the GCN layer.
model = EdgeClassifier(in_channels=dataset.num_features, out_channels=64)
optimizer = torch.optim.Adam(model.parameters(), weight_decay=5e-4, lr=0.01)

def train():
    """Run one full-batch optimisation step of the edge classifier.

    Uses the module-level ``model``, ``optimizer``, ``data`` and
    ``train_mask``.

    Returns:
        float: cross-entropy loss over the training subset for this step.
    """
    model.train()
    optimizer.zero_grad()
    logits = model(data.x, data.edge_index)

    # The model returns one row per existing edge followed by the sampled
    # negatives. negative_sampling only promises an approximate number of
    # samples, so the negative count is derived from the logits rather than
    # assumed to equal the positive count.
    num_pos = data.edge_index.size(1)
    num_neg = logits.size(0) - num_pos
    pos_labels = torch.ones(num_pos, dtype=torch.long, device=logits.device)
    neg_labels = torch.zeros(num_neg, dtype=torch.long, device=logits.device)
    labels = torch.cat([pos_labels, neg_labels], dim=0)

    # The split of the positives is reused for the negatives, trimmed to the
    # number of negatives actually produced.
    new_train_mask = torch.cat([train_mask, train_mask[:num_neg]], dim=0)

    loss = F.cross_entropy(logits[new_train_mask], labels[new_train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test():
    """Return the edge classifier's accuracy on the test subset.

    Uses the module-level ``model``, ``data`` and ``test_mask``. Runs under
    ``torch.no_grad()`` since no gradients are needed.

    Returns:
        float: fraction of selected rows whose predicted class equals the
        label (1 for existing edges, 0 for sampled negatives).
    """
    model.eval()
    with torch.no_grad():
        logits = model(data.x, data.edge_index)

        # Rows are the existing edges followed by the sampled negatives.
        # negative_sampling only promises an approximate number of samples,
        # so derive the negative count from the logits.
        num_pos = data.edge_index.size(1)
        num_neg = logits.size(0) - num_pos
        pos_labels = torch.ones(num_pos, dtype=torch.long, device=logits.device)
        neg_labels = torch.zeros(num_neg, dtype=torch.long, device=logits.device)
        labels = torch.cat([pos_labels, neg_labels], dim=0)

        # Reuse the positive split for the negatives, trimmed to the number
        # of negatives actually produced.
        new_test_mask = torch.cat([test_mask, test_mask[:num_neg]], dim=0)

        predictions = logits[new_test_mask].argmax(dim=1)
        correct = predictions.eq(labels[new_test_mask]).sum().item()
        return correct / len(predictions)

In [7]:
# Train the edge classifier for 1000 epochs, reporting every 100th epoch.
for epoch in range(1, 1001):
    loss = train()
    # The accuracy is only used in the report below, so evaluate only on the
    # epochs that are actually printed instead of on every epoch.
    if epoch % 100 == 0:
        acc = test()
        print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}, Acc: {acc:.4f}")

Epoch: 001, Loss: 0.6923, Acc: 0.5000
Epoch: 002, Loss: 0.6819, Acc: 0.5836
Epoch: 003, Loss: 0.6761, Acc: 0.5433
Epoch: 004, Loss: 0.6622, Acc: 0.5289
Epoch: 005, Loss: 0.6536, Acc: 0.6113
Epoch: 006, Loss: 0.6444, Acc: 0.5987
Epoch: 007, Loss: 0.6341, Acc: 0.6027
Epoch: 008, Loss: 0.6270, Acc: 0.6487
Epoch: 009, Loss: 0.6202, Acc: 0.6314
Epoch: 010, Loss: 0.6111, Acc: 0.6581
Epoch: 011, Loss: 0.6005, Acc: 0.6723
Epoch: 012, Loss: 0.5959, Acc: 0.6650
Epoch: 013, Loss: 0.5891, Acc: 0.6686
Epoch: 014, Loss: 0.5871, Acc: 0.6667
Epoch: 015, Loss: 0.5875, Acc: 0.6771
Epoch: 016, Loss: 0.5766, Acc: 0.6854
Epoch: 017, Loss: 0.5743, Acc: 0.6761
Epoch: 018, Loss: 0.5734, Acc: 0.6920
Epoch: 019, Loss: 0.5660, Acc: 0.6929
Epoch: 020, Loss: 0.5629, Acc: 0.6887
Epoch: 021, Loss: 0.5649, Acc: 0.6903
Epoch: 022, Loss: 0.5640, Acc: 0.6899
Epoch: 023, Loss: 0.5624, Acc: 0.6965
Epoch: 024, Loss: 0.5584, Acc: 0.6996
Epoch: 025, Loss: 0.5593, Acc: 0.6934
Epoch: 026, Loss: 0.5535, Acc: 0.6929
Epoch: 027, 

## 3.图分类任务代码实现
采用ENZYMES数据集。ENZYMES是一个常用的图分类基准数据集。它由600个图组成，这些图表示不同酶的蛋白质结构，这些酶分为6个类别（每个类别有100个）。因此，每个图代表一个酶，我们的任务是预测该酶属于哪一个类别。这是一个6分类任务。

In [9]:
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.datasets import TUDataset
from torch_geometric.data import DataLoader

# Load the dataset and shuffle the order of its graphs.
dataset = TUDataset(name='ENZYMES', root='./data/ENZYMES')
dataset = dataset.shuffle()

# The first 540 shuffled graphs are used for training, the rest for testing.
train_dataset, test_dataset = dataset[:540], dataset[540:]

# Mini-batches of 64 graphs; only the training loader reshuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

Downloading https://www.chrsmrrs.com/graphkerneldatasets/ENZYMES.zip
Extracting data/ENZYMES/ENZYMES/ENZYMES.zip
Processing...
Done!


In [10]:
# Graph convolutional network for graph-level classification
class GCN(torch.nn.Module):
    """Three GCNConv layers, mean pooling per graph, then a linear classifier.

    Input and output widths are read from the module-level ``dataset``
    (``num_node_features`` and ``num_classes``).
    """

    def __init__(self, hidden_channels):
        super().__init__()
        in_dim = dataset.num_node_features
        self.conv1 = GCNConv(in_dim, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = torch.nn.Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        """Return unnormalised class scores, one row per pooled graph."""
        h = F.relu(self.conv1(x, edge_index))
        h = F.relu(self.conv2(h, edge_index))
        # No activation after the last convolution.
        h = self.conv3(h, edge_index)
        # Global mean pooling turns node embeddings into graph embeddings.
        graph_emb = global_mean_pool(h, batch)
        # Dropout is only active while the module is in training mode.
        graph_emb = F.dropout(graph_emb, p=0.5, training=self.training)
        return self.lin(graph_emb)

In [11]:
# Model, loss function and optimizer for the graph-classification task.
model = GCN(hidden_channels=64)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

def train():
    """Run one pass over ``train_loader``, updating ``model`` per batch.

    Uses the module-level ``model``, ``optimizer``, ``criterion`` and
    ``train_loader``. Returns nothing.
    """
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        scores = model(batch.x, batch.edge_index, batch.batch)
        batch_loss = criterion(scores, batch.y)
        batch_loss.backward()
        optimizer.step()

def test(loader):
    """Return the classification accuracy of ``model`` over ``loader``.

    Args:
        loader: iterable of batches exposing ``x``, ``edge_index``, ``batch``
            and ``y``; it must also provide ``loader.dataset`` with a length.

    Returns:
        float: number of correct predictions divided by
        ``len(loader.dataset)``.
    """
    model.eval()
    correct = 0
    # Evaluation never backpropagates; disabling autograd avoids building a
    # computation graph for every batch.
    with torch.no_grad():
        for data in loader:
            out = model(data.x, data.edge_index, data.batch)
            pred = out.argmax(dim=1)
            correct += int((pred == data.y).sum())
    return correct / len(loader.dataset)

Epoch: 001, Train Acc: 0.2111, Test Acc: 0.1000
Epoch: 002, Train Acc: 0.2315, Test Acc: 0.2667
Epoch: 003, Train Acc: 0.2241, Test Acc: 0.2667
Epoch: 004, Train Acc: 0.2130, Test Acc: 0.1000
Epoch: 005, Train Acc: 0.2370, Test Acc: 0.2000
Epoch: 006, Train Acc: 0.2537, Test Acc: 0.2500
Epoch: 007, Train Acc: 0.2537, Test Acc: 0.2833
Epoch: 008, Train Acc: 0.2648, Test Acc: 0.1667
Epoch: 009, Train Acc: 0.2352, Test Acc: 0.1333
Epoch: 010, Train Acc: 0.2481, Test Acc: 0.1667
Epoch: 011, Train Acc: 0.2574, Test Acc: 0.1833
Epoch: 012, Train Acc: 0.2685, Test Acc: 0.2667
Epoch: 013, Train Acc: 0.2759, Test Acc: 0.3167
Epoch: 014, Train Acc: 0.2741, Test Acc: 0.2667
Epoch: 015, Train Acc: 0.2611, Test Acc: 0.3000
Epoch: 016, Train Acc: 0.2556, Test Acc: 0.2500
Epoch: 017, Train Acc: 0.2889, Test Acc: 0.3000
Epoch: 018, Train Acc: 0.2907, Test Acc: 0.2333
Epoch: 019, Train Acc: 0.2574, Test Acc: 0.1833
Epoch: 020, Train Acc: 0.2796, Test Acc: 0.2833
Epoch: 021, Train Acc: 0.2685, Test Acc:

In [None]:
# Train the graph classifier for 1000 epochs, reporting every 100th epoch.
for epoch in range(1, 1001):
    train()
    # Accuracies are only used in the report below, so both loaders are
    # evaluated only on the epochs that are actually printed.
    if epoch % 100 == 0:
        train_acc = test(train_loader)
        test_acc = test(test_loader)
        print(f'Epoch: {epoch:03d}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')