# GNN course assignment 2

### This assignment guides you through implementing hand-crafted features for node- and link-level tasks on graphs. Have fun!

In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import degree
from torch_geometric import transforms as T
import networkx as nx


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Cora dataset through the Planetoid loader, using ./data as the
# dataset root directory.
dataset = Planetoid(root='./data', name='Cora')
data = dataset[0]  # Single graph dataset (Cora): take its only graph
data  # notebook display of the graph summary

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

## Try different hand-crafted features below and see how they affect the performance of the model.

In [11]:
from torch_geometric.utils import (
    degree, 
    to_networkx, 
    scatter
)

### Node-Level Features ###
def extract_node_features(data: Data) -> Data:
    """
    Extracts node-level features and appends them to the node feature matrix.

    The new ``data.x`` is the column-wise concatenation of, in this order:
    the original ``data.x``, each node's in-degree, each node's out-degree,
    and the mean of the original features over each node's neighbors
    (the targets of the node's outgoing edges).

    Parameters:
    - data: A PyTorch Geometric data object

    Returns:
    - data: The same data object; ``data.x`` is replaced in place by the
      augmented feature matrix
    """
    src, dst = data.edge_index
    num_nodes = data.num_nodes

    # Pass num_nodes explicitly so the degree vectors always have one row per
    # node, even for nodes that never appear in edge_index.
    out_degrees = degree(src, num_nodes=num_nodes).view(-1, 1).float()
    in_degrees = degree(dst, num_nodes=num_nodes).view(-1, 1).float()

    # Mean of the neighbors' original features: gather the feature row of each
    # edge's target node, then average the rows grouped by the edge's source.
    neighbor_x = data.x[dst].float()
    avg_neighbor_x = scatter(neighbor_x, src, dim=0, reduce='mean', dim_size=num_nodes)

    # Alternatives: replace data.x entirely with hand-crafted features, or
    # concatenate further features to the list below.
    data.x = torch.cat([
        data.x,
        in_degrees,
        out_degrees,
        avg_neighbor_x,
    ], dim=1)

    return data


### Link-Level Features ###
def extract_link_features(data: Data, edge_index: torch.Tensor) -> torch.Tensor:
    """
    Extracts link-level features for the target edges of a PyTorch Geometric data object.

    The returned matrix is the column-wise concatenation of, in this order:
    the inner product of the two endpoint feature vectors (1 column), their
    Hadamard (elementwise) product, their elementwise absolute difference,
    and the product of the two endpoint degrees (1 column).

    Parameters:
    - data: A PyTorch Geometric data object
    - edge_index: Target edge indices; a torch.Tensor of shape (2, num_edges)

    Returns:
    - edge_attr: edge attributes for the target edges; a torch.Tensor of shape (num_edges, num_features)
    """
    src_features = data.x[edge_index[0]]
    tgt_features = data.x[edge_index[1]]

    # 1. Hadamard product (feature interaction), one column per node feature
    hadamard_product = src_features * tgt_features

    # 2. Inner product of source and target features: the row-sum of the
    #    Hadamard product, so the elementwise product is computed only once
    inner_product = hadamard_product.sum(dim=1, keepdim=True).float()

    # 3. L1 distance (dissimilarity), one column per node feature
    l1_diff = torch.abs(src_features - tgt_features)

    # 4. Preferential attachment (topological feature), one column.
    #    Degrees are counted over data.edge_index (ALL edges stored on the
    #    graph), whereas edge_index holds only the TARGET edges being scored.
    #    Counting only edge sources assumes each undirected edge is stored in
    #    both directions.
    node_degrees = degree(data.edge_index[0], num_nodes=data.num_nodes).float()
    src_degree = node_degrees[edge_index[0]].view(-1, 1)
    tgt_degree = node_degrees[edge_index[1]].view(-1, 1)
    pref_attachment = src_degree * tgt_degree

    # Concatenate all features
    edge_attr = torch.cat([
        inner_product,
        hadamard_product,
        l1_diff,
        pref_attachment,
    ], dim=1)
    # Sanity check: exactly one feature row per target edge
    assert len(edge_attr) == edge_index.size(1)
    return edge_attr

# Node-level task: Node classification

In [4]:
### A naive Linear Probing Model for node classification ###
class LinearProbingModelNode(nn.Module):
    """Linear probe for node classification.

    A single ``nn.Linear`` layer maps each row of ``data.x`` to
    ``output_dim`` class scores; ``forward`` returns their log-softmax over
    the class dimension.
    """

    task = 'node'

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, data):
        """Return per-node log-probabilities computed from ``data.x``."""
        return F.log_softmax(self.linear(data.x), dim=1)

# Extract node-level features.
# NOTE: extract_node_features replaces data.x in place with the augmented
# matrix, so re-running this cell without reloading `data` first would append
# the extra columns a second time.
data = extract_node_features(data)

model = LinearProbingModelNode(
    input_dim=data.x.size(1),
    output_dim=dataset.num_classes,
)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def _masked_accuracy(model, data, mask):
    """Return the fraction of nodes selected by ``mask`` that ``model`` labels correctly.

    Switches ``model`` to eval mode and leaves it there; the training loop
    re-enables train mode at the start of each epoch.
    """
    model.eval()
    with torch.no_grad():  # evaluation does not need an autograd graph
        pred = model(data).argmax(dim=1)
    correct = (pred[mask] == data.y[mask]).sum()
    return int(correct) / int(mask.sum())


# Training the model (full-batch: every step uses all training nodes)
for epoch in range(2000):
    model.train()  # the periodic validation below leaves the model in eval mode
    optimizer.zero_grad()
    out = model(data)
    # The model returns log-probabilities, hence the negative log-likelihood loss
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    if epoch % 200 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')
        # Validation
        acc = _masked_accuracy(model, data, data.val_mask)
        print(f'Accuracy: {acc:.4f}')

# Test the model
acc = _masked_accuracy(model, data, data.test_mask)
print(f'Test Accuracy: {acc:.4f}')


Epoch 0, Loss: 1.9581257104873657
Accuracy: 0.3160
Epoch 200, Loss: 0.001546265440993011
Accuracy: 0.6960
Epoch 400, Loss: 0.0006845730822533369
Accuracy: 0.7000
Epoch 600, Loss: 0.00039258605102077127
Accuracy: 0.6980
Epoch 800, Loss: 0.0002568513446021825
Accuracy: 0.6980
Epoch 1000, Loss: 0.00018178648315370083
Accuracy: 0.7000
Epoch 1200, Loss: 0.00013552703603636473
Accuracy: 0.6980
Epoch 1400, Loss: 0.000104812606878113
Accuracy: 0.6980
Epoch 1600, Loss: 8.328865806106478e-05
Accuracy: 0.6980
Epoch 1800, Loss: 6.756823131581768e-05
Accuracy: 0.6960
Test Accuracy: 0.6880


# Link-level task: Link prediction

In [5]:
# Reload the Cora dataset so the link-level task starts from a fresh `data`
# object with the original node features (the node-level section replaced
# data.x with an augmented matrix).
# Use the same root as the first load so the files already on disk are reused
# instead of being downloaded again into a second directory.
dataset = Planetoid(root='./data', name='Cora')
data = dataset[0]  # Single graph dataset (Cora)
data

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [6]:
# Split the graph's edges into train / validation / test sets for link
# prediction, holding out 10% of the edges for validation and 10% for testing.
# NOTE(review): no random seed is set in this cell, so the split (and the
# accuracies reported below) presumably vary between runs — confirm.
transform = T.RandomLinkSplit(num_val=0.1, num_test=0.1, is_undirected=data.is_undirected())
train_data, val_data, test_data = transform(data)
train_data, val_data, test_data  # notebook display of the three splits

(Data(x=[2708, 1433], edge_index=[2, 8448], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_label=[8448], edge_label_index=[2, 8448]),
 Data(x=[2708, 1433], edge_index=[2, 8448], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_label=[1054], edge_label_index=[2, 1054]),
 Data(x=[2708, 1433], edge_index=[2, 9502], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_label=[1054], edge_label_index=[2, 1054]))

In [7]:
### A naive Linear Probing Model for link prediction ###
class LinearProbingModelLink(nn.Module):
    """Linear probe for link prediction.

    A single ``nn.Linear`` layer maps each edge-attribute row to
    ``output_dim`` class scores (two by default); ``forward`` returns their
    log-softmax over the class dimension.
    """

    task = 'link'

    def __init__(self, input_dim, output_dim=2):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, edge_attr):
        """Return per-edge log-probabilities computed from ``edge_attr``."""
        scores = self.linear(edge_attr)
        return F.log_softmax(scores, dim=1)

In [12]:
# Hand-crafted features for the labelled target edges (edge_label_index) of each split
train_data_edge_attr = extract_link_features(train_data, train_data.edge_label_index)
val_data_edge_attr = extract_link_features(val_data, val_data.edge_label_index)
test_data_edge_attr = extract_link_features(test_data, test_data.edge_label_index)

# Initialize the model
model = LinearProbingModelLink(
    input_dim=train_data_edge_attr.size(1),
    output_dim=2,
)

# Optimizer and loss function.
# LinearProbingModelLink.forward already applies log_softmax, so its output is
# log-probabilities; NLLLoss is the matching loss (CrossEntropyLoss would apply
# log_softmax a second time). This also mirrors the F.nll_loss used for the
# node-classification task.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.NLLLoss()


# Training function
def train():
    """Run one full-batch optimisation step on the training edges and return the loss value."""
    model.train()
    optimizer.zero_grad()
    log_probs = model(train_data_edge_attr)
    labels = train_data.edge_label.long()  # assuming the label is edge_label
    loss = criterion(log_probs, labels)
    loss.backward()
    optimizer.step()
    return loss.item()


# Validation/testing function
def test(test_data, test_data_edge_attr):
    """Return the model's accuracy on the given split.

    Parameters:
    - test_data: split whose ``edge_label`` holds the ground-truth labels
    - test_data_edge_attr: feature matrix for that split's labelled edges
    """
    model.eval()
    with torch.no_grad():
        log_probs = model(test_data_edge_attr)
        labels = test_data.edge_label.long()
        pred = log_probs.argmax(dim=1)
        correct = pred.eq(labels).sum().item()
        acc = correct / labels.size(0)
    return acc


# Training and Evaluation Loop
for epoch in range(0, 500):
    loss = train()
    if epoch % 50 == 0:
        val_acc = test(val_data, val_data_edge_attr)
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val Acc: {val_acc:.4f}')

# Test accuracy
test_acc = test(test_data, test_data_edge_attr)
print(f'Test Accuracy: {test_acc:.4f}')

Epoch: 000, Loss: 1.2076, Val Acc: 0.4839
Epoch: 050, Loss: 0.3210, Val Acc: 0.7818
Epoch: 100, Loss: 0.2847, Val Acc: 0.7827
Epoch: 150, Loss: 0.2681, Val Acc: 0.7761
Epoch: 200, Loss: 0.2579, Val Acc: 0.7647
Epoch: 250, Loss: 0.2510, Val Acc: 0.7657
Epoch: 300, Loss: 0.2460, Val Acc: 0.7647
Epoch: 350, Loss: 0.2422, Val Acc: 0.7590
Epoch: 400, Loss: 0.2392, Val Acc: 0.7600
Epoch: 450, Loss: 0.2368, Val Acc: 0.7600
Test Accuracy: 0.7770


---
# Discussions

对于图上的任务，邻接顶点的信息是非常重要的。前面用过扩散小波提取特征，但很失败，严重过拟合。我认为这不是扩散小波变换的问题，而是分类器本身难以处理这种信息，需要建立一个更强的分类器，并加入一系列防止过拟合的措施。  
边分类时，源点和终点的信息是至关重要的。通过提取源点和终点信息的相关性，就差不多能做到边的分类。我后面加入的一些手动的特征实际上也是做这个任务，可能有些重复了。因此性能提升很小。