In [1]:
import os
import numpy as np
import pandas as pd
import torch

from torch_geometric.data import Data

## Load dataset

The Wiki-talk dataset we will use is not available in the ```torch_geometric``` package. Thus we need to load it manually.

In [2]:
# params
data_dir = 'data/wiki-talk/'
train_size = 0.5
lang = 'br'    # specify which language of Wikipedia 

### Load edge info

In [3]:
edgelist = pd.read_csv(os.path.join(data_dir, f"{lang}-wiki-talk"), sep='\t', header=None, names=["source", "target", "timestamp"])
edgelist

Unnamed: 0,source,target,timestamp
0,3,3,2004-06-23T18:51:49Z
1,112,3,2005-08-31T08:43:02Z
2,28743,3,2015-03-17T19:23:56Z
3,33919,3,2015-04-17T19:48:20Z
4,28743,3,2015-04-19T02:47:40Z
...,...,...,...
13749,36043,36043,2015-10-18T12:40:05Z
13750,36056,36056,2015-10-19T04:55:25Z
13751,500,36149,2015-10-30T13:12:31Z
13752,200,16709,2015-11-11T08:49:42Z


### Load node info

In [4]:
df_nodes = pd.read_csv(os.path.join(data_dir, f"{lang}-user-group"), sep='\t', header=None)
df_nodes

Unnamed: 0,0,1
0,388,2
1,35159,1
2,528,1
3,2695,1
4,28743,1
...,...,...
84,1890,1
85,724,1
86,25719,1
87,3010,1


In [5]:
max_node_num = np.max(edgelist[["source", "target"]].values) + 1
node_ids = np.sort(np.unique(edgelist[["source", "target"]].values))

In [6]:
# number of nodes
N = len(node_ids)

In [7]:
# dataset (roles)
roles = df_nodes[[0, 1]].values

y = [0] * max_node_num
for r in roles:
    y[r[0]] = r[1]

y = np.array([y[i] for i in node_ids])

In [8]:
df_nodes = pd.DataFrame({'id': node_ids, 'label': y})
df_nodes

Unnamed: 0,id,label
0,1,0
1,3,0
2,5,0
3,7,0
4,8,2
...,...,...
1176,35942,0
1177,36043,0
1178,36056,0
1179,36149,0


## Convert to continuous node IDs
In the original Wiki-talk datasets, the node IDs are discontinuous as shown above, because they represent the original Wikipedia user IDs. However, graph neural networks typically expect node indices to range from 0 to `N-1` where `N` is the number of nodes.
Thus, the conversion is necessary. We now show a way to convert the discontinous node IDs to continuous ones.

In [9]:
# create a mapping between the original IDs and continuous IDs
node_id_mapping = { node_id : i for i, node_id in enumerate(node_ids) }

In [10]:
# replace the original IDs with the continuous IDs
edgelist[["source", "target"]] = edgelist[["source", "target"]].replace(node_id_mapping)
df_nodes['id'] = df_nodes['id'].replace(node_id_mapping)

In [11]:
df_nodes['id'].replace(node_id_mapping)

0          0
1          0
2          2
3          1
4          4
        ... 
1176    1176
1177    1177
1178    1178
1179     526
1180    1180
Name: id, Length: 1181, dtype: int64

## Compute node features

In [12]:
import networkx as nx

# we can use directed graph here because we do not rely on the node classification algorithms from nx
G = nx.DiGraph()
G.add_nodes_from(range(N))

In [13]:
G.add_edges_from(edgelist[["source", "target"]].values)

In [14]:
# Compute Degree
deg = [val for (node, val) in G.degree()]

# Compute In-Degree
in_deg = [val for (node, val) in G.in_degree()]

# Compute Out-Degree
out_deg = [val for (node, val) in G.out_degree()]

# Compute Clustering Coefficient
clust_coeff = list(nx.clustering(G.to_undirected()).values())

# Compute PageRank
pr = list(nx.pagerank(G, alpha=0.85).values())

In [15]:
x = np.array([deg, in_deg, out_deg, clust_coeff, pr]).T

### Split training and test data

In [16]:
# Generate random permutation of node indices
perm = torch.randperm(N)

# Select train and test nodes
train_idx = perm[: int(train_size * N)]
test_idx = perm[int(train_size * N) :]

# Initialize train_mask and test_mask with False
train_mask = torch.zeros(N, dtype=torch.bool)
test_mask = torch.zeros(N, dtype=torch.bool)

# Set the selected indices to True
train_mask[train_idx] = True
test_mask[test_idx] = True

### Load data to PyG

In [17]:
edge_index = torch.tensor(edgelist[["target", "source"]].values.T, dtype=torch.long)

# node features and labels
x = torch.tensor(x, dtype=torch.float)
y = torch.tensor(y, dtype=torch.float)

In [18]:
data = Data(x=x, y=y, edge_index=edge_index, train_mask=train_mask, test_mask=test_mask)
data

Data(x=[1181, 5], edge_index=[2, 13754], y=[1181], train_mask=[1181], test_mask=[1181])

In [19]:
data.validate(raise_on_error=True)

True

## Define and train a simply Graph Convolutional Network model

In [20]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(5, 16)
        self.conv2 = GCNConv(16, 3)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [21]:
# find the best device to run on
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [22]:
# move the model and data to the device
model = GCN().to(device)
data = data.to(device)

In [23]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

## Evaluation

To evaluate the Graph Convolutional Network model we just trained, we use it to predict the labels of the nodes in the test set, and compare the results with the ground truth. We print the accuracy as the metrics.

In [24]:
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.9492


In [25]:
torch.sum(pred[data.test_mask])

tensor(0, device='mps:0')

In [27]:
torch.sum(data.y[data.test_mask])

tensor(36., device='mps:0')