# 准备
## import

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

from torch_geometric.datasets import Planetoid
from torch_geometric import transforms
from torch_geometric.utils import negative_sampling

import matplotlib.pyplot as plt

from tqdm.auto import tqdm

from sklearn.metrics import roc_auc_score

## 全局变量

In [2]:
%load_ext tensorboard

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 功能函数

In [4]:
def dropout_edge(edge_index, p=0.5, training=True):
    if not training:
        return edge_index
    mask = torch.rand(edge_index.size(1), device=edge_index.device) > p
    return edge_index[:, mask]

def eval_acc(model, data, mask):
    model.eval()
    with torch.no_grad():
        out = model(data).argmax(dim=1)
        correct = (out[mask] == data.y[mask])
    return correct.sum().item() / mask.sum().item()

def node_classifier(model, optimizer, criterion, data, num_epochs = 5, 
                    scheduler=None, comment=None):
    def accuracy(pred, mask):
        correct = (pred[mask] == data.y[mask])
        acc = correct.sum().item() / mask.sum().item()
        return acc
    def train_step():
        model.train()
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
        acc = accuracy(out.argmax(dim=1), data.train_mask)
        return loss, acc
    def val_step():
        model.eval()
        out = model(data)
        loss = criterion(out[data.val_mask], data.y[data.val_mask])
        pred = out.argmax(dim=1)
        acc = accuracy(pred, data.val_mask)
        return loss, acc
    
    writer = SummaryWriter(comment=comment)
    for epoch in tqdm(range(num_epochs)):
        train_loss, train_acc = train_step()
        val_loss, val_acc = val_step()
        scheduler.step(train_loss)
        lr = scheduler.get_last_lr()[0]
        writer.add_scalars(main_tag='Loss',
                          tag_scalar_dict={"train_loss": train_loss,
                                           "validation_loss": val_loss}, 
                          global_step=epoch)
        writer.add_scalars(main_tag='Accuracy',
                          tag_scalar_dict={"validation_acc": val_acc,
                                           "train_acc": train_acc}, 
                          global_step=epoch)
        writer.add_scalars(main_tag="lr", tag_scalar_dict={'lr':lr}, global_step=epoch)
    writer.close()

def eval_auc(model, data):
    model.eval()
    with torch.no_grad():
        z = model.encode(data.x, data.edge_index)
        out = model.decode(z, data.edge_label_index).view(-1).sigmoid()
        model.train()
    return roc_auc_score(data.edge_label.cpu().numpy(), out.cpu().numpy())
 
def link_predictor(model, optimizer, criterion, train_data, val_data, num_epochs=5, 
                   scheduler=None, comment=None):

    def train_step():
        model.train()
        neg_edge_index = negative_sampling(
            edge_index=train_data.edge_index, 
            num_nodes=train_data.num_nodes,
            num_neg_samples=train_data.edge_label_index.size(1), 
            method='sparse'
        ).to(device)
        edge_label_index = torch.cat(
            [train_data.edge_label_index, neg_edge_index],
            dim=-1,
        )
        edge_label = torch.cat([
            train_data.edge_label,
            train_data.edge_label.new_zeros(neg_edge_index.size(1))
        ], dim=0)
        optimizer.zero_grad()
        z = model.encode(train_data.x, train_data.edge_index)
        out = model.decode(z, edge_label_index).view(-1)
        loss = criterion(out, edge_label)
        loss.backward()
        optimizer.step()

        auc_score = roc_auc_score(edge_label.cpu().numpy(), out.sigmoid().detach().cpu().numpy())
        return loss, auc_score
    
    def val_step():
        model.eval()
        z = model.encode(val_data.x, val_data.edge_index)
        out = model.decode(z, val_data.edge_label_index).view(-1)
        loss = criterion(out, val_data.edge_label)
        auc_score = eval_auc(model, val_data)
        return loss, auc_score
    
    
    writer = SummaryWriter(comment=comment)
    for epoch in tqdm(range(num_epochs)):
        train_loss, train_auc = train_step()
        val_loss, val_auc = val_step()
        scheduler.step(train_loss)
        lr = scheduler.get_last_lr()[0]
        writer.add_scalars(main_tag='Loss',
                          tag_scalar_dict={"train_loss": train_loss,
                                           "validation_loss": val_loss}, 
                          global_step=epoch)
        writer.add_scalars(main_tag='AUC',
                          tag_scalar_dict={"validation_auc": val_auc,
                                           "train_auc": train_auc},
                          global_step=epoch)
        writer.add_scalars(main_tag="lr", tag_scalar_dict={'lr':lr}, global_step=epoch)
    writer.close()

## 模型

In [5]:
class myPairNorm(nn.Module):
    def __init__(self, scale=1., epsilon=1e-5):
        super(myPairNorm, self).__init__()
        self.scale = scale
        self.eps = epsilon
        
    def forward(self, x):
        x = x - x.mean(dim=0, keepdim=True)
        x = self.scale * x / (x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps)
        return x

In [6]:
class myGCNConv(nn.Module):
    def __init__(self, in_channels, out_channels, self_loop=True):
        super(myGCNConv, self).__init__()
        self.loop = self_loop
        self.A_hat = None
        self.lin = nn.Linear(in_channels, out_channels)
    
    def get_A_hat(self, edge_index, num_nodes):
        A = torch.zeros((num_nodes, num_nodes), dtype=torch.float)
        for i in range(edge_index.size(1)):
            src, dst = edge_index[:, i]
            A[src, dst] = 1
            A[dst, src] = 1
        if self.loop:
            I = torch.diag(torch.ones(num_nodes))
            A = A + I
        D = torch.diag(torch.sum(A, dim=1))
        A_hat = torch.inverse(torch.sqrt(D)) @ A @ torch.inverse(torch.sqrt(D))
        return A_hat.to(device)

    def forward(self, x, edge_index):
        if self.A_hat == None:
            self.A_hat = self.get_A_hat(edge_index, x.size(0))
        x = self.lin(x)
        x = torch.matmul(self.A_hat, x)
        return x

In [7]:
class GCN(nn.Module):

    def __init__(self, features, hidden_dimension, hiddel_layers, classes, self_loop=True, drop_edge=False, pair_norm=False, activation=nn.ReLU()):
        super(GCN, self).__init__()
        # 输入的节点特征维度 * 中间隐藏层的维度
        self.net = nn.ModuleList()
        for i in range(hiddel_layers):
            if i == 0:
                self.net.append(myGCNConv(features, hidden_dimension, self_loop=self_loop))
            else:
                self.net.append(myGCNConv(hidden_dimension, hidden_dimension, self_loop=self_loop))
        # 中间隐藏层的维度 * 节点类别
        self.last_conv = myGCNConv(hidden_dimension, classes, self_loop=self_loop)
        self.norm_layer = myPairNorm()
        self.activation = activation
        
        self.drop_edge = drop_edge
        self.pair_norm = pair_norm

    def encode(self, x, edge_index):
        if self.drop_edge and self.training:
            edge_index = dropout_edge(edge_index, p=0.5)
        for conv in self.net:
            x = self.activation(conv(x, edge_index))
            if self.pair_norm:
                x = self.norm_layer(x)
        x = self.last_conv(x, edge_index)
        return x
    
    def decode(self, z, edge_label_index):
        # z所有节点的表示向量
        src = z[edge_label_index[0]]
        dst = z[edge_label_index[1]]
        # print(dst.size())   # (7284, 64)
        r = (src * dst).sum(dim=-1)
        # print(r.size())   (7284)
        return r

    def forward(self, data):
        # 节点特征 和 邻接关系
        x, edge_index = data.x, data.edge_index
        # 编码
        x = self.encode(x, edge_index)
        # 使用 softmax 得到概率分布
        return F.log_softmax(x, dim=1)

# Cora

## 节点分类
### 数据加载
split = 'full' 可以让所有不在val和test的结点加入train

In [18]:
dataset = Planetoid(root='data/Cora', name='Cora', transform=transforms.NormalizeFeatures(), split='full')
data = dataset[0].to(device)
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}\n')

print(data)
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Number of validation nodes: {data.val_mask.sum()}')
print(f'Number of test nodes: {data.test_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Contains isolated nodes: {data.contains_isolated_nodes()}')
print(f'Contains self-loops: {data.contains_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Dataset: Cora():
Number of graphs: 1
Number of features: 1433
Number of classes: 7

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
Number of nodes: 2708
Number of edges: 10556
Average node degree: 3.90
Number of training nodes: 1208
Number of validation nodes: 500
Number of test nodes: 1000
Training node label rate: 0.45
Contains isolated nodes: False
Contains self-loops: False
Is undirected: True




### 训练

In [19]:
model = GCN(dataset.num_features, 64, 1, dataset.num_classes, self_loop=True, drop_edge=False, pair_norm=False).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, verbose=True)
node_classifier(model, optimizer, criterion, data, num_epochs=300, scheduler=scheduler, comment='_Cora+loop')



  0%|          | 0/300 [00:00<?, ?it/s]

### 测试

In [22]:
eval_acc(model, data, data.test_mask)

0.877

## 链路预测
### 数据处理

In [8]:
transform = transforms.Compose([
    transforms.NormalizeFeatures(),
    transforms.ToDevice(device),
    transforms.RandomLinkSplit(num_val=0.1, num_test=0.1, is_undirected=True,
                      add_negative_train_samples=False),
])

dataset = Planetoid(root='data/Cora', name='Cora', transform=transform, split='full')

train_data, val_data, test_data = dataset[0]

### 训练

In [9]:
model = GCN(dataset.num_features, 128, 1, 64, self_loop=True, drop_edge=True, pair_norm=True, activation=nn.Sigmoid()).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, verbose=True)
link_predictor(model, optimizer, criterion, train_data, val_data, num_epochs=300, scheduler=scheduler, comment='_Cora+de+pn+sigmoid')



  0%|          | 0/300 [00:00<?, ?it/s]

### 测试

In [10]:
eval_auc(model, test_data)

0.9430560006337114

# Citeseer
## 节点分类
### 数据加载

In [8]:
dataset = Planetoid(root='data', name="CiteSeer", split='full')
data = dataset[0].to(device)
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}\n')

print(data)
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Number of validation nodes: {data.val_mask.sum()}')
print(f'Number of test nodes: {data.test_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Contains isolated nodes: {data.has_isolated_nodes()}')
print(f'Contains self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Dataset: CiteSeer():
Number of graphs: 1
Number of features: 3703
Number of classes: 6

Data(x=[3327, 3703], edge_index=[2, 9104], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327])
Number of nodes: 3327
Number of edges: 9104
Average node degree: 2.74
Number of training nodes: 1827
Number of validation nodes: 500
Number of test nodes: 1000
Training node label rate: 0.55
Contains isolated nodes: True
Contains self-loops: False
Is undirected: True


### 训练

In [9]:
model = GCN(dataset.num_features, 64, 1, dataset.num_classes, self_loop=True, drop_edge=True, pair_norm=False).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, verbose=True)
node_classifier(model, optimizer, criterion, data, num_epochs=400, scheduler=scheduler, comment='_CiteSeer+de')



  0%|          | 0/400 [00:00<?, ?it/s]

### 测试

In [10]:
eval_acc(model, data, data.test_mask)

0.729

## 链路预测
### 数据处理

In [8]:
transform = transforms.Compose([
    transforms.NormalizeFeatures(),
    transforms.ToDevice(device),
    transforms.RandomLinkSplit(num_val=0.1, num_test=0.1, is_undirected=True,
                      add_negative_train_samples=False),
])

dataset = Planetoid(root='data', name='Citeseer', transform=transform, split='full')

train_data, val_data, test_data = dataset[0]

### 训练

In [9]:
model = GCN(dataset.num_features, 64, 1, 64, drop_edge=True, pair_norm=True).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, verbose=True)
link_predictor(model, optimizer, criterion, train_data, val_data, num_epochs=200, scheduler=scheduler, comment='_Citeseer+de+pn')



  0%|          | 0/200 [00:00<?, ?it/s]

### 测试

In [10]:
eval_auc(model, test_data)

0.922043231493781

# demo

In [45]:
transform = transforms.Compose([
    transforms.NormalizeFeatures(),
    # transforms.ToDevice(device),
    # transforms.RandomLinkSplit(num_val=0.1, num_test=0.1, is_undirected=True,
                      # add_negative_train_samples=False),
])

dataset = Planetoid(root='data', name='Citeseer', transform=transform, split='full')

train_data = dataset[0]
data = train_data

A = torch.zeros((data.num_nodes, data.num_nodes), dtype=torch.float)
for i in range(data.edge_index.size(1)):
    src, dst = data.edge_index[:, i]
    A[src, dst] = 1
    A[dst, src] = 1
D = torch.diag(torch.sum(A, dim=1))
for i in range(D.shape[0]):
    if D[i, i] == 0:
        print(i)
        break


192


In [46]:
D[192][192]

tensor(0.)

In [41]:
dataset = Planetoid(root='data/Cora', name='Cora', transform=transforms.NormalizeFeatures(), split='full')
data = dataset[0].to(device)

In [91]:
edge_index = torch.tensor([[0, 1, 1, 2, 2, 3],
                           [1, 0, 2, 1, 3, 2]])
edge_index = dropout_edge(edge_index, p=0.25, training=True)
edge_index

tensor([[0, 1, 1, 2, 2, 3],
        [1, 0, 2, 1, 3, 2]])