In [None]:
%load_ext autoreload
%matplotlib inline
import sys

import pyTigerGraph
pyTigerGraph.__path__

# Recommender System with EvolveGCN on PyG
Dataset: LastFM (based on [JODIE: Predicting Dynamic Embedding Trajectory in Temporal Interaction Networks](http://snap.stanford.edu/jodie/))
GNN Model: LightGCN [\[2002.02126\] LightGCN: Simplifying and Powering Graph Convolution Network for Recommendation](https://arxiv.org/abs/2002.02126)

### Connect to TigerGraph

In [None]:
import os

from pyTigerGraph import TigerGraphConnection

# Connection settings can be overridden with the TG_HOST / TG_USERNAME /
# TG_PASSWORD environment variables so that real credentials never need to be
# written into (and committed with) the notebook. When a variable is not set,
# the value this notebook originally hard-coded is used.
conn = TigerGraphConnection(
    host=os.environ.get("TG_HOST", "http://127.0.0.1"),  # Change the address to your database server's
    graphname="LastFM_hetrec",  # Specify LastFM (HetRec) dataset
    username=os.environ.get("TG_USERNAME", "tigergraph"),
    password=os.environ.get("TG_PASSWORD", "tigergraph"),
)

In [None]:
%%time
# Basic metadata about the graph such as schema.
# print(conn.gsql("ls"))

In [None]:
# Report the vertex and edge counts returned by the database when '*' is
# passed as the type argument (presumably "all types" -- confirm against the
# pyTigerGraph documentation).
for label, fetch_count in (
    ("Vertex Count:", conn.getVertexCount),
    ("Edge Count:", conn.getEdgeCount),
):
    print(label, fetch_count('*'))

(Customization Point for pyTigerGraph)
To handle bipartite graphs composed of users and items, we explicitly specify the number of users and the number of items, which is required to compute the similarity score between user embeddings and item embeddings in GNN models. To create the embedding matrices for users and items, we need to know how many users and items there are. To that end, pyTigerGraph should support functions or attributes that return these numbers for bipartite graphs; we propose two functions: `tgraph.number_of_source_vertices()` and `tgraph.number_of_target_vertices()`. These proposed methods should be usable for general graphs as long as we target recommendation tasks.

In [None]:
# Per-type vertex counts, one database query per vertex type.
num_users = conn.getVertexCount("UserH")
num_items = conn.getVertexCount("Artist")
num_tags = conn.getVertexCount("Tag")

# Derived sizes: users + items, and users + items + tags.
num_user_items = num_users + num_items
num_nodes = num_user_items + num_tags

# Display both derived sizes as the cell output.
(num_user_items, num_nodes)

In recommendation tasks, the input dataset must be split by edges rather than by nodes. pyTigerGraph supports node masking to split a dataset, but edge masking and custom functions that extract subgraphs containing a specified set of edges (training / validation / testing edges) should also be provided.

In [None]:
%%time
# Train : Val : Test = 70 : 15 : 15
# Each key is forwarded to edgeSplitter as a keyword argument with the given
# fraction as its value.
split_fractions = {
    "train_mask": 0.70,
    "val_mask": 0.15,
    "test_mask": 0.15,
}
splitter = conn.gds.edgeSplitter(**split_fractions)
splitter.run()

# Graph loader with edge features
Currently, the `GraphLoader` of pyTigerGraph only considers node-based labels, but recommender systems need edge-based labels (training/validation/testing) so that the dataset can be split into training/validation/testing sets by edge masking and custom functions. To implement this feature, we need to write GSQL that returns edge-based labels.

In [None]:
# Loader settings for the whole graph: a single, unshuffled batch in PyG
# format, carrying the vertex feature "x", the edge feature "time", and the
# edge type plus all three split masks as extra edge attributes.
whole_graph_loader_kwargs = dict(
    v_in_feats=["x"],
    v_out_labels=[],
    v_extra_feats=[],
    e_in_feats=["time"],
    e_out_labels=[],
    e_extra_feats=["edge_type", "train_mask", "val_mask", "test_mask"],
    num_batches=1,
    shuffle=False,
    output_format="PyG",
    add_self_loop=False,
    loader_id=None,
    buffer_size=4,
)
graph_loader = conn.gds.graphLoader(**whole_graph_loader_kwargs)

In [None]:
# Fetch the loader's data object (the loader was configured for PyG output),
# print its per-edge type attribute, then show the graph as the cell output.
whole_graph = graph_loader.data
whole_graph_edge_type = whole_graph.edge_type
print(whole_graph_edge_type)
whole_graph

## Extract sub-dataset by edges
The following code is not needed for node classification, since PyG supports node-based partitioning into training/validation/testing sets. As previously described, edge partitioning is needed for link prediction and recommendation tasks. To address this limitation, the `GraphLoader` should provide custom functions that extract training/validation/testing subgraphs based on edges. The `train_graph`, `val_graph` and `test_graph` objects below are these subgraphs in PyG format.

In [None]:
def _load_split_graph(mask_attr):
    """Build a graph loader filtered by one split mask and fetch its graph.

    The three split loaders differ only in the mask attribute, so the shared
    configuration lives here instead of being copy-pasted three times.

    Args:
        mask_attr: Name of the edge attribute used both as the loader's
            `filter_by` value and as an extra edge feature
            ("train_mask", "val_mask" or "test_mask").

    Returns:
        A `(loader, graph)` pair, where `graph` is `loader.data`.
    """
    loader = conn.gds.graphLoader(
        v_in_feats=["x"],
        v_out_labels=[],
        v_extra_feats=[],
        e_in_feats=["time"],
        e_out_labels=[],
        e_extra_feats=["edge_type", mask_attr],
        num_batches=1,
        shuffle=False,
        filter_by=mask_attr,
        output_format="PyG",
        add_self_loop=False,
        loader_id=None,
        buffer_size=4,
    )
    return loader, loader.data


# Same creation order as before: train, then validation, then test.
train_loader, train_graph = _load_split_graph("train_mask")
val_loader, val_graph = _load_split_graph("val_mask")
test_loader, test_graph = _load_split_graph("test_mask")

print("train_graph:", train_graph)
print("val_graph:", val_graph)
print("test_graph:", test_graph)

## Create and set bipartite edges to train/val/test subgraphs

For each subgraph dataset (training, validation and testing), construct undirected bipartite edges for message passing in the LightGCN model from the directed edges as the ground-truth labels of the link prediction.

In [None]:
from torch_geometric.utils import to_undirected


def _add_message_passing_edges(graph):
    """Keep a graph's directed edges as labels and make its edges undirected.

    The graph is modified in place:
      * `graph.edge_label_index` receives the original (directed) `edge_index`;
      * `graph.edge_index` / `graph.edge_type` are replaced by the result of
        `to_undirected`.

    `reduce="max"` is passed explicitly because `edge_type` is a categorical
    id: with the default `reduce="add"`, the ids of duplicate or reciprocal
    edges would be summed when they are merged, producing an id that does not
    correspond to any real relation. When no edges are merged, the result is
    the same as with the default.

    NOTE: because the graph is mutated, this is not idempotent -- re-create
    the graph from its loader before calling this a second time.

    Args:
        graph: Object exposing `edge_index` and `edge_type` attributes.

    Returns:
        Tuple `(directed_edge_index, directed_edge_type,
        undirected_edge_index, undirected_edge_type)`.
    """
    directed_edge_index = graph.edge_index
    directed_edge_type = graph.edge_type
    undirected_edge_index, undirected_edge_type = to_undirected(
        directed_edge_index, edge_attr=directed_edge_type, reduce="max"
    )
    graph.edge_label_index = directed_edge_index
    graph.edge_index = undirected_edge_index
    graph.edge_type = undirected_edge_type
    return (
        directed_edge_index,
        directed_edge_type,
        undirected_edge_index,
        undirected_edge_type,
    )


# The per-split variables are kept at notebook level under their original names.
(train_edge_index_d, train_edge_type,
 train_edge_index_u, train_edge_type_u) = _add_message_passing_edges(train_graph)

(val_edge_index_d, val_edge_type,
 val_edge_index_u, val_edge_type_u) = _add_message_passing_edges(val_graph)

(test_edge_index_d, test_edge_type,
 test_edge_index_u, test_edge_type_u) = _add_message_passing_edges(test_graph)

In [None]:
# Show the three split graphs together as the cell output.
(train_graph, val_graph, test_graph)

In [None]:
# Number of edge types, computed as (largest edge-type id in the whole graph) + 1.
# NOTE(review): this assumes edge-type ids start at 0 -- confirm.
# (A bare `train_graph.edge_type` expression used to precede this line; since
# it was not the last expression of the cell it displayed nothing, so it was
# removed.)
num_relations = whole_graph.edge_type.max().item() + 1
num_relations

# Construct RGCN model and optimizer

We build the GNN model (the `RGCN` class imported from `recsys.model.rgcn`) and use the Adam optimizer with a learning rate of 0.001 (set through `config.lr`).

In [None]:
%autoreload
import numpy as np
import torch
from torch.optim import Optimizer
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data

sys.path.append("..")
from recsys.model.rgcn import RGCN
from recsys.config import config
from recsys.utils.sample_negative import sample_negative_edges
from recsys.data.lastfm import LastFMDataset, LastFMHetRecDataset

print("EvolveGCN training configuration:", config)

## Construct model and optimizer

In [None]:
%autoreload
# Run on the GPU when PyTorch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

input_dim = config.embedding_dim

# Constructor arguments for the model, gathered in one place.
model_kwargs = dict(
    embedding_dim=input_dim,
    num_nodes=num_nodes,
    num_users=num_users,
    num_items=num_items,
    num_layers=config.num_layers,
    num_relations=num_relations,
)
gnn = RGCN(**model_kwargs).to(device)

# using Adam optimizer
opt = torch.optim.Adam(gnn.parameters(), lr=config.lr)

## Training and Testing

In [None]:
%autoreload
def train(
    model,  # GNN model
    data_mp: Data,  # Message passing edges for multi-scale embedding propagation
    loader: DataLoader,  # DataLoader in batches of supervision/evaluation edges
    opt: Optimizer,  # Optimizer
    num_users: int,  # Number of user nodes
    num_nodes: int,  # Number of total nodes (currently unused, see docstring)
    device: torch.device,  # Device (CPU or GPU)
):
    """Run one training epoch and return the average loss per positive edge.

    For every batch of positive (existing) edges, negative edges are drawn
    with `sample_negative_edges`, the loss is obtained from
    `model.calc_loss(data_mp, batch, negs)`, and one optimizer step is taken.

    Args:
        model: Model exposing `train()` and `calc_loss(data_mp, batch, negs)`.
        data_mp: Message-passing graph; moved to `device` once before the loop.
        loader: Yields batches of positive edges. Each batch must carry
            `batch` and `ptr` attributes (they are deleted) and `edge_index`.
        opt: Optimizer stepping the model parameters.
        num_users: Forwarded to `sample_negative_edges`.
        num_nodes: Not used by this function.
            NOTE(review): negative sampling is given the notebook-level
            variable `num_user_items` instead of this argument, so the
            function depends on that global being defined. It is left as is
            because the existing call site passes a different value for
            `num_nodes`; confirm which one is intended before switching.
        device: Device on which the loss is computed.

    Returns:
        Sum of `loss * num_edges_in_batch` over all batches divided by the
        total number of edges.

    Raises:
        ZeroDivisionError: If `loader` yields no edges.
    """
    model.train()

    # The message-passing graph does not change between batches, so it only
    # needs to be transferred to the device once.
    data_mp = data_mp.to(device)

    total_loss = 0
    total_examples = 0
    for batch in loader:  # positive (existing) edges
        # delete unwanted attributes
        del batch.batch
        del batch.ptr

        opt.zero_grad()
        # Generate negative (non-existing) edges; as before, this happens
        # before the batch is moved to the device.
        negs = sample_negative_edges(batch, num_users, num_user_items, device)
        batch, negs = batch.to(device), negs.to(device)
        loss = model.calc_loss(data_mp, batch, negs)  # Train and compute loss
        loss.backward()
        opt.step()

        # Weight each batch loss by the number of edges in the batch.
        num_examples = batch.edge_index.shape[1]
        total_loss += loss.item() * num_examples
        total_examples += num_examples

    return total_loss / total_examples


def test(
    model,  # GNN model
    data_mp: Data,  # Message passing edges for multi-scale embedding propagation
    loader: DataLoader,  # DataLoader in batches of evaluation edges
    k: int,  # Top-k
    device: torch.device,  # Device (CPU or GPU)
):
    """Evaluate the model and return the mean of the collected recall values.

    Each batch from `loader` is passed to `model.evaluation(data_mp, batch, k)`;
    the results of all batches are merged into one mapping and the mean of
    its values is returned. A key reported by more than one batch trips an
    assertion.
    """
    model.eval()
    recall_by_key = {}

    with torch.no_grad():
        data_mp = data_mp.to(device)

        # Batches of positive (existing) edges
        for batch in loader:
            # delete unwanted attributes
            del batch.batch
            del batch.ptr

            batch = batch.to(device)
            batch_recalls = model.evaluation(data_mp, batch, k)

            # No key may already have been reported by an earlier batch.
            assert all(key not in recall_by_key for key in batch_recalls)
            recall_by_key.update(batch_recalls)

    return np.mean(list(recall_by_key.values()))

## Prepare existing edges for ground-truth data by representing them as edge list
For each subgraph dataset (training, validation and testing), construct a `DataLoader` to load ground-truth positive (existing) edges for training.

In [None]:
%autoreload
# The supervision edges are the *directed* edges kept in `edge_label_index`.
# `<split>_graph.edge_type` was overwritten with the edge types of the
# undirected message-passing edges, so it no longer lines up entry-for-entry
# with `edge_label_index`. The directed edge types captured before that
# overwrite (`train_edge_type`, `val_edge_type`, `test_edge_type`) are used
# instead, so every label edge is paired with its own type.
# NOTE(review): assumes LastFMHetRecDataset expects `edge_type[i]` to describe
# `edge_index[:, i]` -- confirm against the dataset implementation.
train_existing_edges = LastFMHetRecDataset(
    "tmp", edge_index=train_graph.edge_label_index, edge_type=train_edge_type
)
val_existing_edges = LastFMHetRecDataset(
    "tmp", edge_index=val_graph.edge_label_index, edge_type=val_edge_type
)
test_existing_edges = LastFMHetRecDataset(
    "tmp", edge_index=test_graph.edge_label_index, edge_type=test_edge_type
)

# Only the training edges are shuffled.
train_label_loader = DataLoader(train_existing_edges, batch_size=config.batch_size, shuffle=True)
val_label_loader = DataLoader(val_existing_edges, batch_size=config.batch_size, shuffle=False)
test_label_loader = DataLoader(test_existing_edges, batch_size=config.batch_size, shuffle=False)

### Train the model

In [None]:
%%time
%autoreload

from time import time

input_dim = config.embedding_dim

# Per-epoch history kept for the reporting and plotting cells.
all_train_losses = []  # (epoch, training loss) pairs
all_val_recalls = []  # (epoch, validation recall@k) pairs
all_train_recalls_lgcn = []  # the validation recall@k of each epoch, values only

started_at = time()
for epoch in range(config.epochs):
    # One pass over the training edges.
    train_loss = train(gnn, train_graph, train_label_loader, opt, num_users, num_nodes, device)
    all_train_losses.append((epoch, train_loss))

    # Recall@k on the validation edges after this epoch.
    val_recall = test(gnn, val_graph, val_label_loader, config.k, device)
    all_val_recalls.append((epoch, val_recall))
    all_train_recalls_lgcn.append(val_recall)

    # Seconds elapsed since the loop started.
    elapsed = time() - started_at
    print(f"Epoch {epoch}: train loss={train_loss:.6f}, val_recall={val_recall:.6f}, time={elapsed:.2f}[s]")

### Test the model

In [None]:
# Pick the (epoch, recall) pair with the highest validation recall@k.
best_val_recall = max(all_val_recalls, key=lambda pair: pair[1])
best_epoch, best_recall = best_val_recall
print(f"Best validation recall@k: {best_recall} at epoch {best_epoch}")

# Recall@k of the model in its final state, measured on the test edges.
test_recall = test(gnn, test_graph, test_label_loader, config.k, device)
print(f"Test set recall@k: {test_recall}")

### Visualize testing results

In [None]:
import matplotlib.pyplot as plt

# `all_train_recalls_lgcn` holds the recall@k measured on the validation
# edges after each epoch (that is what the training loop appends to it), so
# the figure is titled as validation recall rather than "Train recalls".
all_epochs = list(range(len(all_train_recalls_lgcn)))

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
fig.patch.set_facecolor("white")

ax.plot(all_epochs, all_train_recalls_lgcn, label="RGCN")
ax.set_ylim(bottom=0.0)
ax.set_xlabel("Epoch")
ax.set_ylabel("Recall@k")
ax.set_title(f"Validation recalls (top-k: {config.k}, dim: {input_dim}, lr: {config.lr})")
ax.legend()
plt.show()