### Hands on tutorial of Graph Representation Learning

In this lecture, we will go through the following topics
1. Graph structured data in Python
2. GNN package - Pytorch-Geometric introduction
3. Representation in Graphs - node2vec, GCN
4. GNN with downstream tasks
5. GNN for text classification


In [None]:
import os
import torch
import numpy as np
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
!pip install -q torch-cluster -f https://data.pyg.org/whl/torch-${TORCH}.html

from torch_geometric.nn import Node2Vec
import torch_cluster
import os.path as osp
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
from tqdm.notebook import tqdm
random_walk = torch.ops.torch_cluster.random_walk

# Graph Representation Learning
Graph Representation Learning aims to learn an **embedding vector** for each node such that the embeddings preserve node proximity in the graph. <br>
To demonstrate, we make use of the `KarateClub` dataset, as we introduced before.

![](https://i.imgur.com/oQv59aR.png)

In [None]:
from torch_geometric.datasets import KarateClub

# Load the KarateClub dataset and print its dataset-level summary.
dataset = KarateClub()
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')


data = dataset[0]  # Get the first graph object.

print(data)
print('==============================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
# Edge count divided by node count.
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
# Printed as the raw result of `.sum()`; the next line converts it to int first.
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

In [None]:
# Demo Node2Vec model built on the edges of the current `data` graph; it is
# only used to inspect the sampled walks in the following cells.
model = Node2Vec(data.edge_index, embedding_dim=16, 
                 walk_length=10,                        # length of each random walk
                 context_size=10, walks_per_node=80,
                 num_negative_samples=1,
                 p=4,q=1,                               # node2vec walk-bias hyperparameters (see the PyG Node2Vec docs)
                 sparse=True)

In [None]:
# Batch loader provided by the model; the next cell iterates it and unpacks
# each item into a (pos_nodes, neg_nodes) pair.
loader = model.loader(batch_size=128, shuffle=True, num_workers=4)

In [None]:
# Walk over every batch once and report the shapes of the positive and
# negative samples. `pos_nodes` / `neg_nodes` keep the last batch afterwards.
for batch_idx, (pos_nodes, neg_nodes) in enumerate(loader):
    print(batch_idx, pos_nodes.shape, neg_nodes.shape)

In [None]:
# Positive samples of the last batch produced by the loop above.
print(pos_nodes)

In [None]:
# Negative samples of the last batch produced by the loop above.
print(neg_nodes)

## Visualization

In [None]:
import networkx as nx 
# Transpose the edge index so that each row is one edge, then turn every row
# into a tuple that networkx accepts as an edge.
edge_tuples = list(map(tuple, data.edge_index.numpy().T))
G = nx.from_edgelist(edge_tuples)

# Compute a spring layout centred on (0.5, 0.5) and store it on the nodes
# under the 'pos' attribute so later cells can reuse the same coordinates.
pos = nx.spring_layout(G, center=[0.5, 0.5])
nx.set_node_attributes(G, values=pos, name='pos')

In [None]:
# Pick one sampled walk: the first row of the positive samples of the first
# batch. `next(iter(loader))` yields the (pos, neg) pair directly, so there is
# no need to wrap the loader in `enumerate` and discard the index.
pos_batch = next(iter(loader))[0]
nodelist = pos_batch[0].tolist()

# Path graph with one node per walk step; step i is placed at the layout
# position of the graph node visited at that step.
walk = nx.path_graph(len(nodelist))
nx.set_node_attributes(walk, {idx: pos[node_id] for idx, node_id in enumerate(nodelist)}, 'pos')
walk_pos = nx.get_node_attributes(walk, 'pos')

# Shared drawing style for the walk so both panels stay in sync.
walk_style = dict(node_size=40, node_color='r', width=2, edge_color='r')

fig = plt.figure(figsize=(20, 10))

# Left panel: the full graph with the walk drawn on top of it.
ax = fig.add_subplot(1, 2, 1)
nx.draw_networkx(
    G,
    ax=ax,
    pos=nx.get_node_attributes(G, 'pos'),
    node_size=550,
    node_color='b',
    font_color="white",
    font_weight='bold',
)
nx.draw(walk, ax=ax, pos=walk_pos, **walk_style)

# Right panel: the walk on its own.
ax = fig.add_subplot(1, 2, 2)
nx.draw(walk, ax=ax, pos=walk_pos, **walk_style)


## Training
Let's create a `Node2Vec` model from the `PyG` library, which provides the dataloader for creating training instances as well as calculating the objective function. <br>
The objective function is defined as follows

\begin{equation}
L(\Theta) = \log \left ( \sigma (z_u^{\top} z_v)  \right) - \sum_{i=1}^k \log \left ( \sigma (z_u^{\top} z_{n_i})  \right), n_i \sim P_V
\end{equation}

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Two-dimensional embeddings so they can be plotted directly without any
# dimensionality reduction.
node2vec_model = Node2Vec(
    data.edge_index,
    embedding_dim=2,
    walk_length=10,
    context_size=4,
    walks_per_node=80,
    num_negative_samples=5,
    p=4,
    q=1,
    sparse=True,
)
node2vec_model = node2vec_model.to(device)

loader = node2vec_model.loader(batch_size=128, shuffle=True, num_workers=2)
# The model is created with sparse=True and optimised with SparseAdam.
optimizer = torch.optim.SparseAdam(list(node2vec_model.parameters()), lr=0.01)

In [None]:
def train():
    """Train ``node2vec_model`` for one epoch and return the mean batch loss.

    Relies on the module-level ``node2vec_model``, ``loader``, ``optimizer``
    and ``device``.
    """
    node2vec_model.train()
    batch_losses = []
    for pos_rw, neg_rw in tqdm(loader):
        optimizer.zero_grad()
        pos_rw, neg_rw = pos_rw.to(device), neg_rw.to(device)
        loss = node2vec_model.loss(pos_rw, neg_rw)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    # Average over the number of batches the loader reports.
    return sum(batch_losses) / len(loader)

In [None]:
# Train for 200 epochs. The epoch counter is padded to three digits so the
# log lines stay aligned once the counter reaches 100 (matches the MLP loop).
for epoch in range(1, 201):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

In [None]:
# Binary label per node: 1 when the node's 'club' attribute is anything other
# than 'Mr. Hi', 0 otherwise.
G = nx.karate_club_graph()
not_mr_hi = [G.nodes[node]['club'] != 'Mr. Hi' for node in G.nodes]
labels = np.array(not_mr_hi, dtype=np.int64)

# One colour per label value, looked up for every node in order.
mapping = {0: "purple", 1: "green"}
node_colors = [mapping[label] for label in labels]

In [None]:
fig = plt.figure(figsize=(20, 10))

# Left panel: the original graph, nodes coloured by label.
ax = fig.add_subplot(1, 2, 1)
nx.draw_networkx(
    G,
    ax=ax,
    pos=nx.spring_layout(G, center=[0.5, 0.5]),
    node_size=550,
    node_color=node_colors,
    font_color="white",
    font_weight='bold',
)

# Right panel: the learned 2-D embedding, one point per node.
ax = fig.add_subplot(1, 2, 2)
with torch.no_grad():
    embedding = node2vec_model(torch.arange(data.num_nodes, device=device))
    embedding = embedding.cpu().numpy()
# Map node id -> embedding row so networkx can use the embedding as a layout.
pos = dict(enumerate(embedding))

nx.draw_networkx_nodes(
    G,
    ax=ax,
    pos=pos,
    node_size=550,
    node_color=node_colors,
)
# Pass `ax` explicitly, like the other drawing calls, instead of relying on
# matplotlib's implicit "current axes".
nx.draw_networkx_labels(
    G,
    ax=ax,
    pos=pos,
    font_color="white",
    font_weight='bold',
)
plt.show()

# Practice: Representation learning on large graphs
<!-- This tutorial will teach you how to apply **Graph Neural Networks (GNNs) to the task of node classification**.
Here, we are given the ground-truth labels of only a small subset of nodes, and want to infer the labels for all the remaining nodes (*transductive learning*). -->

To demonstrate, we make use of the `Cora` dataset, which is a **citation network** where nodes represent documents.
Each node is described by a 1433-dimensional bag-of-words feature vector.
Two documents are connected if there exists a citation link between them.
The task is to infer the category of each document (7 in total).<br>

This dataset was first introduced by [Yang et al. (2016)](https://arxiv.org/abs/1603.08861) as one of the datasets of the `Planetoid` benchmark suite.
We can again make use of [PyTorch Geometric](https://github.com/rusty1s/pytorch_geometric) for easy access to this dataset via [`torch_geometric.datasets.Planetoid`](https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html#torch_geometric.datasets.Planetoid)

Another interesting application of citation graphs can be found at [connected papers](https://www.connectedpapers.com/).

In [None]:
from torch_geometric.datasets import Planetoid

# Keep the dataset name in its own variable instead of temporarily storing a
# string in `dataset`, which is rebound to the dataset object right below.
dataset_name = 'Cora'
path = osp.join('.', 'data', dataset_name)
# Features are passed through NormalizeFeatures() whenever a graph is accessed.
dataset = Planetoid(root=path, name=dataset_name, transform=NormalizeFeatures())

# Print the dataset-level summary of the dataset loaded above.
print()
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

data = dataset[0]  # Get the first graph object.

print()
print(data)
print('===========================================================================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
# Edge count divided by node count.
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
# Printed as the raw result of `.sum()`; the next line converts it to int first.
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

Overall, this dataset is quite similar to the previously used [`KarateClub`](https://pytorch-geometric.readthedocs.io/en/latest/modules/datasets.html#torch_geometric.datasets.KarateClub) network.
We can see that the `Cora` network holds 2,708 nodes and 10,556 edges, resulting in an average node degree of 3.9.
For training this dataset, we are given the ground-truth categories of 140 nodes (20 for each class).
This results in a training node label rate of only 5%.

In contrast to `KarateClub`, this graph holds the additional attributes `val_mask` and `test_mask`, which denote which nodes should be used for validation and testing. We can further see that this network is undirected, and that there exist no isolated nodes (each document has at least one citation).
<!-- Furthermore, we make use of **[data transformations](https://pytorch-geometric.readthedocs.io/en/latest/notes/introduction.html#data-transforms) via `transform=NormalizeFeatures()`**.
Transforms can be used to modify your input data before inputting them into a neural network, *e.g.*, for normalization or data augmentation.
Here, we [row-normalize](https://pytorch-geometric.readthedocs.io/en/latest/modules/transforms.html#torch_geometric.transforms.NormalizeFeatures) the bag-of-words input feature vectors. -->


## Practice: learning node embeddings for each node (document)
1. Please use node2vec to learn embeddings on `Cora` dataset
2. Define your node2vec model configurations
3. Training
4. Visualize the nodes in 2D space. The color of each node indicates its label. What do you observe in the figure?

In [None]:
# TODO: your code here!
# define your node2vec model and train it to obtain node embeddings!
# The three names below are placeholders to be replaced by your own objects.
# The plotting cell further down calls `node2vec_model.eval()` and
# `node2vec_model(...)`, so it fails while `node2vec_model` is still None.
node2vec_model = None
loader = None
optimizer = None

In [None]:
# define your training loop here

In [None]:
# after your model is learned, run the following scripts and see what you get!
@torch.no_grad()
def plot_points(colors):
    """Scatter-plot a 2-D t-SNE projection of all node embeddings.

    Reads the module-level ``node2vec_model``, ``data``, ``dataset`` and
    ``device``.

    Args:
        colors: Indexable collection of colours; ``colors[i]`` is used for
            the nodes whose label equals ``i``.
    """
    node2vec_model.eval()
    all_nodes = torch.arange(data.num_nodes, device=device)
    embeddings = node2vec_model(all_nodes).cpu().numpy()
    projected = TSNE(n_components=2).fit_transform(embeddings)
    y = data.y.cpu().numpy()

    plt.figure(figsize=(8, 8))
    for class_idx in range(dataset.num_classes):
        # Boolean mask selecting the nodes that carry this label.
        in_class = y == class_idx
        plt.scatter(
            projected[in_class, 0],
            projected[in_class, 1],
            s=20,
            color=colors[class_idx],
        )
    plt.axis('off')
    plt.show()

# Seven colours, indexed by class label inside plot_points.
colors = [
    '#ffc0cb',
    '#bada55',
    '#008080',
    '#420420',
    '#7fe5f0',
    '#065535',
    '#ffd700',
]
plot_points(colors)

# Application of GRL: Semi-supervised document classification
As mentioned previously, `Cora` contains about 2.7K scientific papers (documents). However, **only 5% of the documents have ground-truth labels** in the training set, while the remaining documents are unlabeled.

Since each paper(node) is described by a 1433-dimensional bag-of-words feature vector, let's first build a document classifier and see how it works!

In [None]:
import torch
from torch.nn import Linear
import torch.nn.functional as F

class MLP(torch.nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        hidden_channels: Width of the hidden layer.
        in_channels: Size of the input feature vector. When omitted, it is
            read from the module-level ``dataset`` (``dataset.num_features``).
        out_channels: Number of output scores. When omitted, it is read from
            the module-level ``dataset`` (``dataset.num_classes``).
    """

    def __init__(self, hidden_channels, in_channels=None, out_channels=None):
        super().__init__()
        if in_channels is None:
            in_channels = dataset.num_features
        if out_channels is None:
            out_channels = dataset.num_classes
        # Seed right before creating the layers so the initial weights are
        # the same on every construction.
        torch.manual_seed(12345)
        self.lin1 = Linear(in_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, out_channels)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for features ``x``."""
        hidden = self.lin1(x).relu()
        return self.lin2(hidden)

In [None]:
# Build the MLP classifier together with its loss and optimizer.
# Note: this rebinds the name `model` used by an earlier cell.
model = MLP(hidden_channels=64)
criterion = torch.nn.CrossEntropyLoss()  # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)  # Define optimizer.

def train():
    """Run one full-batch optimisation step and return the loss tensor.

    Uses the module-level ``model``, ``optimizer``, ``criterion`` and
    ``data``. The loss is computed on the training nodes only.
    """
    model.train()
    optimizer.zero_grad()
    logits = model(data.x)
    train_mask = data.train_mask
    loss = criterion(logits[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()
    return loss

# Train for 200 epochs, logging the loss returned by train() after each one.
for epoch in range(1, 201):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

In [None]:
@torch.no_grad()
def test():
    """Return the fraction of test nodes that ``model`` classifies correctly.

    Uses the module-level ``model`` and ``data``.
    """
    model.eval()
    test_mask = data.test_mask
    # Predicted class = index of the largest score in each row.
    pred = model(data.x).argmax(dim=1)
    num_correct = int((pred[test_mask] == data.y[test_mask]).sum())
    return num_correct / int(test_mask.sum())

# Evaluate on the test nodes and report the accuracy with four decimals.
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

## Improving document classification with node embedding
It seems that the MLP model with the bag-of-words features does not perform well in the semi-supervised scenario.<br>
Can we make use of the graph structure between documents to generate additional features in an unsupervised fashion, so that the unlabeled documents also provide some signal?

In [None]:
# Here we use a simple Logistic regression classifier for document classification
from sklearn.linear_model import LogisticRegression

In [None]:
with torch.no_grad():
    node2vec_model.eval()
    # the node embedding is actually the document embedding learned from graph structure
    # Called without a batch, the model returns the embedding for every node.
    # `.detach()` is needed because that result may be the trainable parameter
    # itself; on a CPU-only run `.cpu()` does not copy it, and `.numpy()`
    # refuses tensors that still require grad.
    node_embedding = node2vec_model().detach().cpu().numpy()
    print(node_embedding.shape)

In [None]:
# Convert the masks and labels to NumPy once, so that the NumPy embedding
# matrix and scikit-learn never receive torch tensors (consistent with the
# explicit `.numpy()` conversions used in the other cells).
y_np = data.y.cpu().numpy()
train_mask_np = data.train_mask.cpu().numpy()
test_mask_np = data.test_mask.cpu().numpy()

# Training
clf = LogisticRegression()  # create logistic regression model
clf.fit(node_embedding[train_mask_np], y_np[train_mask_np])  # fit on training set

# Testing
test_acc = clf.score(node_embedding[test_mask_np], y_np[test_mask_np])
print(f'Test Accuracy: {test_acc:.4f}')

Wow! We improve the testing accuracy from ~60%(BOW+MLP) to 70% with network embedding(without using the BOW features) !


Can we further improve the performance by considering both **network structure and BOW features simultaneously?**

In [None]:
# Training features: bag-of-words columns followed by the embedding columns.
train_bow = data.x[data.train_mask].numpy()
train_node_embedding = node_embedding[data.train_mask, :]
train_features = np.concatenate([train_bow, train_node_embedding], axis=1)

# Testing features, built the same way from the test nodes.
test_bow = data.x[data.test_mask].numpy()
test_node_embedding = node_embedding[data.test_mask, :]
test_features = np.concatenate([test_bow, test_node_embedding], axis=1)

print("Training feature:", train_features.shape)
print("Testing feature:", test_features.shape)

In [None]:
# Labels are converted to NumPy explicitly so scikit-learn never receives
# torch tensors (consistent with the `.numpy()` conversions used for the
# features).
y_train_np = data.y[data.train_mask].cpu().numpy()
y_test_np = data.y[data.test_mask].cpu().numpy()

# Training
clf = LogisticRegression()  # create logistic regression model
clf.fit(train_features, y_train_np)  # fit on training set

# Testing
test_acc = clf.score(test_features, y_test_np)
print(f'Test Accuracy: {test_acc:.4f}')

Not looking good...

Don't worry! We will introduce how the state-of-the-art solution: **Graph Neural Network(GNN)** solves this issue in the following topic.

# Application\#2: Citation recommendation (link prediction)

In the citation prediction problem, we are given the citation graph where documents connect with each other if one cites another. <br>
The goal is to recommend potential citations for specific papers. In other words, we are trying to find missing links between documents. <br>
This problem is also a famous task in graph mining called "link prediction" as people would like to know the potential links between nodes.<br>

In [None]:
import os.path as osp

import torch
from sklearn.metrics import roc_auc_score

import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Edge split: 5% of the links for validation, 10% for testing, with negative
# training samples added by the transform.
link_split = T.RandomLinkSplit(
    num_val=0.05,
    num_test=0.1,
    is_undirected=True,
    add_negative_train_samples=True,
)
transform = T.Compose([T.NormalizeFeatures(), link_split])
dataset = Planetoid(path, name='Cora', transform=transform)

# With `RandomLinkSplit` applied, indexing the dataset no longer gives a
# single data object: it unpacks into (train_data, val_data, test_data),
# one object per split.
train_data, val_data, test_data = dataset[0]

In [None]:
# For each split, print the data object, the edges that carry a label
# (`edge_label_index`) and the labels themselves (`edge_label`).
print("--------Training data------")
print(train_data)
print("Training edges:")
print(train_data.edge_label_index)
print("Labels")
print(train_data.edge_label)

print()

print("--------Validation data------")
print(val_data)
print("Validation edges:")
print(val_data.edge_label_index)
print("Labels")
# Only the first 10 validation labels are printed here.
print(val_data.edge_label[:10])

print()
print("--------Testing data------")
print(test_data)
print("Testing edges:")
print(test_data.edge_label_index)
print("Labels")
print(test_data.edge_label)

## Obtain node embedding with training edges

In [None]:
# Node2Vec model that produces the node embeddings for link prediction.
# Only the training split's edges are given to it, because the remaining
# edges are the ones we want to predict.
node2vec_model = Node2Vec(
    train_data.edge_index,
    embedding_dim=128,
    walk_length=20,
    context_size=10,
    walks_per_node=10,
    num_negative_samples=1,
    p=1,
    q=1,
    sparse=True,
)
node2vec_model = node2vec_model.to(device)

loader = node2vec_model.loader(batch_size=128, shuffle=True, num_workers=4)
# The model is created with sparse=True and optimised with SparseAdam.
optimizer = torch.optim.SparseAdam(list(node2vec_model.parameters()), lr=0.01)

In [None]:
def train():
    """Train ``node2vec_model`` for one epoch and return the mean batch loss.

    Relies on the module-level ``node2vec_model``, ``loader``, ``optimizer``
    and ``device``.
    """
    node2vec_model.train()
    batch_losses = []
    for pos_rw, neg_rw in tqdm(loader):
        optimizer.zero_grad()
        pos_rw, neg_rw = pos_rw.to(device), neg_rw.to(device)
        loss = node2vec_model.loss(pos_rw, neg_rw)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    # Average over the number of batches the loader reports.
    return sum(batch_losses) / len(loader)

In [None]:
# Train for 200 epochs. The epoch counter is padded to three digits so the
# log lines stay aligned once the counter reaches 100 (matches the MLP loop).
for epoch in range(1, 201):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

## Use the node embedding as feature to train link prediction model
An interesting issue here is that node2vec gives us **"node-level"** features, while we need **"edge-level"** information to determine whether an edge exists or not. <br>
Here are some candidates:
1. Concatenation of node A and node B's embeddings 
2. Elementwise-subtract between node A and node B's embeddings 
3. Elementwise-product between node A and node B's embeddings 

Empirically, people use the third approach which leads to the best performance!

In [None]:
with torch.no_grad():
    node2vec_model.eval()
    # Called without a batch, the model returns the embedding for every node.
    # `.detach()` is needed because that result may be the trainable parameter
    # itself; on a CPU-only run `.cpu()` does not copy it, and `.numpy()`
    # refuses tensors that still require grad.
    node_embedding = node2vec_model().detach().cpu().numpy()
    print(node_embedding.shape)

In [None]:
# Look up the embeddings of both endpoints of every labelled training edge;
# the transposed index has one (node A, node B) row per edge.
train_embedding_pair = node_embedding[train_data.edge_label_index.T]
print(train_embedding_pair.shape)

# The "edge" feature is the element-wise product of the two endpoint
# embeddings: one row per labelled edge, one column per embedding dimension.
train_features = train_embedding_pair[:, 0] * train_embedding_pair[:, 1]
print(train_features.shape)

In [None]:
# use LR as classifier for link prediction
# Each row of `train_features` is one candidate edge; the target is that
# edge's entry in `train_data.edge_label`.
link_clf = LogisticRegression()
link_clf.fit(train_features,train_data.edge_label.numpy())

In [None]:
# Look up the embeddings of both endpoints of every labelled test edge;
# the transposed index has one (node A, node B) row per edge.
test_embedding_pair = node_embedding[test_data.edge_label_index.T]
print(test_embedding_pair.shape)

# The "edge" feature is the element-wise product of the two endpoint
# embeddings, exactly as for the training edges.
test_features = test_embedding_pair[:, 0] * test_embedding_pair[:, 1]
print(test_features.shape)

In [None]:
# Score the link classifier on the test edges: plain accuracy first, then
# ROC AUC computed from the predicted probability of the second class column.
y_test = test_data.edge_label.numpy()

acc = link_clf.score(test_features, y_test)
print(f"Accuracy score:{acc:.4f}")

positive_scores = link_clf.predict_proba(test_features)[:, 1]
roc = roc_auc_score(y_test, positive_scores)
print(f"ROC score:{roc:.4f}")