In [1]:
import os

os.environ["DGLBACKEND"] = "pytorch"
from functools import partial

import dgl
import dgl.function as fn
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
from dgl.nn import GINConv
import numpy as np
from pygod.utils import load_data as pygod_load_data

In [2]:
class RGCNLayer(nn.Module):
    def __init__(
        self,
        in_feat,
        out_feat,
        num_rels,
        num_bases=-1,
        bias=None,
        activation=None,
        is_input_layer=False,
    ):
        super(RGCNLayer, self).__init__()
        self.in_feat = in_feat
        self.out_feat = out_feat
        self.num_rels = num_rels
        self.num_bases = num_bases
        self.bias = bias
        self.activation = activation
        self.is_input_layer = is_input_layer

        # sanity check
        if self.num_bases <= 0 or self.num_bases > self.num_rels:
            self.num_bases = self.num_rels
        # weight bases in equation (3)
        self.weight = nn.Parameter(
            torch.Tensor(self.num_bases, self.in_feat, self.out_feat)
        )
        if self.num_bases < self.num_rels:
            # linear combination coefficients in equation (3)
            self.w_comp = nn.Parameter(
                torch.Tensor(self.num_rels, self.num_bases)
            )
        # add bias
        if self.bias:
            self.bias = nn.Parameter(torch.Tensor(out_feat))
        # init trainable parameters
        nn.init.xavier_uniform_(
            self.weight, gain=nn.init.calculate_gain("relu")
        )
        if self.num_bases < self.num_rels:
            nn.init.xavier_uniform_(
                self.w_comp, gain=nn.init.calculate_gain("relu")
            )
        if self.bias:
            nn.init.xavier_uniform_(
                self.bias, gain=nn.init.calculate_gain("relu")
            )

    def forward(self, g):
        if self.num_bases < self.num_rels:
            # generate all weights from bases (equation (3))
            weight = self.weight.view(
                self.in_feat, self.num_bases, self.out_feat
            )
            weight = torch.matmul(self.w_comp, weight).view(
                self.num_rels, self.in_feat, self.out_feat
            )
        else:
            weight = self.weight
        if self.is_input_layer:

            def message_func(edges):
                # for input layer, matrix multiply can be converted to be
                # an embedding lookup using source node id
                embed = weight.view(-1, self.out_feat)
                index = edges.data[dgl.ETYPE] * self.in_feat + edges.src["id"]
                return {"msg": embed[index] * edges.data["norm"]}

        else:

            def message_func(edges):
                w = weight[edges.data[dgl.ETYPE]]
                msg = torch.bmm(edges.src["h"].unsqueeze(1), w).squeeze()
                msg = msg * edges.data["norm"]
                return {"msg": msg}

        def apply_func(nodes):
            h = nodes.data["h"]
            if self.bias:
                h = h + self.bias
            if self.activation:
                h = self.activation(h)
            return {"h": h}

        g.update_all(message_func, fn.sum(msg="msg", out="h"), apply_func)

In [3]:
class Model(nn.Module):
    def __init__(
        self,
        num_nodes,
        h_dim,
        out_dim,
        num_rels,
        num_bases=-1,
        num_hidden_layers=1,
    ):
        super(Model, self).__init__()
        self.num_nodes = num_nodes
        self.h_dim = h_dim
        self.out_dim = out_dim
        self.num_rels = num_rels
        self.num_bases = num_bases
        self.num_hidden_layers = num_hidden_layers

        # create rgcn layers
        self.build_model()

        # create initial features
        self.features = self.create_features()

    def build_model(self):
        self.layers = nn.ModuleList()
        # input to hidden
        i2h = self.build_input_layer()
        self.layers.append(i2h)
        # hidden to hidden
        for _ in range(self.num_hidden_layers):
            h2h = self.build_hidden_layer()
            self.layers.append(h2h)
        # hidden to output
        h2o = self.build_output_layer()
        self.layers.append(h2o)

    # initialize feature for each node
    def create_features(self):
        features = torch.arange(self.num_nodes)
        return features

    def build_input_layer(self):
        return RGCNLayer(
            self.num_nodes,
            self.h_dim,
            self.num_rels,
            self.num_bases,
            activation=F.relu,
            is_input_layer=True,
        )

    def build_hidden_layer(self):
        return RGCNLayer(
            self.h_dim,
            self.h_dim,
            self.num_rels,
            self.num_bases,
            activation=F.relu,
        )

    def build_output_layer(self):
        return RGCNLayer(
            self.h_dim,
            self.out_dim,
            self.num_rels,
            self.num_bases,
            activation=partial(F.softmax, dim=1),
        )

    def forward(self, g):
        if self.features is not None:
            g.ndata["id"] = self.features
        for layer in self.layers:
            layer(g)
        return g.ndata.pop("h")

# Amazon

In [4]:
from dgl.data import FraudAmazonDataset
amazon = FraudAmazonDataset()
g1 = amazon[0]
num_classes = amazon.num_classes
feat = g1.ndata['feature']
label = g1.ndata['label']

Done loading data from cached files.


In [5]:
train_mask = g1.nodes['user'].data['train_mask']
test_mask = g1.nodes['user'].data['test_mask']
train_idx = torch.nonzero(train_mask, as_tuple=False).squeeze()
test_idx = torch.nonzero(test_mask, as_tuple=False).squeeze()
labels = g1.nodes['user'].data.pop("label")
num_rels = len(g1.canonical_etypes)
num_classes = amazon.num_classes

# normalization factor
for cetype in g1.canonical_etypes:
    g1.edges[cetype].data["norm"] = dgl.norm_by_dst(g1, cetype).unsqueeze(1)

In [6]:
# configurations
n_hidden = 16  # number of hidden units
n_bases = -1  # use number of relations as number of bases
n_hidden_layers = 0  # use 1 input layer, 1 output layer, no hidden layer
n_epochs = 100  # epochs to train
lr = 0.01  # learning rate
l2norm = 0  # L2 norm coefficient

# create graph
g = dgl.to_homogeneous(g1, edata=["norm"])
node_ids = torch.arange(g.num_nodes())
#target_idx = node_ids[g1.ndata[dgl.NTYPE] == category_id]

# create model
model = Model(
    g.num_nodes(),
    n_hidden,
    num_classes,
    num_rels,
    num_bases=n_bases,
    num_hidden_layers=n_hidden_layers,
)

In [7]:
# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2norm)

print("start training...")
model.train()
for epoch in range(n_epochs):
    optimizer.zero_grad()
    logits = model.forward(g)
    #logits = logits[target_idx]
    loss = F.cross_entropy(logits[train_idx], labels[train_idx])
    loss.backward()

    optimizer.step()

    train_acc = torch.sum(logits[train_idx].argmax(dim=1) == labels[train_idx])
    train_acc = train_acc.item() / len(train_idx)
    val_loss = F.cross_entropy(logits[test_idx], labels[test_idx])
    val_acc = torch.sum(logits[test_idx].argmax(dim=1) == labels[test_idx])
    val_acc = val_acc.item() / len(test_idx)
    print(
        "Epoch {:05d} | ".format(epoch)
        + "Train Accuracy: {:.4f} | Train Loss: {:.4f} | ".format(
            train_acc, loss.item()
        )
        + "Validation Accuracy: {:.4f} | Validation loss: {:.4f}".format(
            val_acc, val_loss.item()
        )
    )

start training...
Epoch 00000 | Train Accuracy: 0.7037 | Train Loss: 0.6930 | Validation Accuracy: 0.6877 | Validation loss: 0.6930
Epoch 00001 | Train Accuracy: 0.9056 | Train Loss: 0.6533 | Validation Accuracy: 0.8936 | Validation loss: 0.6552
Epoch 00002 | Train Accuracy: 0.9056 | Train Loss: 0.6140 | Validation Accuracy: 0.8936 | Validation loss: 0.6181
Epoch 00003 | Train Accuracy: 0.9057 | Train Loss: 0.5749 | Validation Accuracy: 0.8936 | Validation loss: 0.5812
Epoch 00004 | Train Accuracy: 0.9057 | Train Loss: 0.5382 | Validation Accuracy: 0.8936 | Validation loss: 0.5466
Epoch 00005 | Train Accuracy: 0.9054 | Train Loss: 0.5059 | Validation Accuracy: 0.8936 | Validation loss: 0.5160
Epoch 00006 | Train Accuracy: 0.9054 | Train Loss: 0.4791 | Validation Accuracy: 0.8936 | Validation loss: 0.4905
Epoch 00007 | Train Accuracy: 0.9054 | Train Loss: 0.4580 | Validation Accuracy: 0.8936 | Validation loss: 0.4703
Epoch 00008 | Train Accuracy: 0.9054 | Train Loss: 0.4422 | Validation

In [8]:
import torch
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

# After training, when evaluating on test data:
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    logits = model.forward(g)
    probabilities = F.softmax(logits[test_idx], dim=1)  # Convert logits to probabilities
    # Assuming your positive class is 1 (adjust accordingly if it's different)
    positive_probabilities = probabilities[:, 1]  # Get probabilities for the positive class
    # Calculate ROC-AUC
    roc_auc = roc_auc_score(labels[test_idx].cpu(), positive_probabilities.cpu())
    print(f"ROC-AUC Score: {roc_auc}")

ROC-AUC Score: 0.8294357675531168


# Yelp

In [9]:
from dgl.data import FraudYelpDataset
yelp = FraudYelpDataset()
g2 = yelp[0]
num_classes = yelp.num_classes
feat = g2.ndata['feature']
label = g2.ndata['label']
train_mask = g2.ndata['train_mask']
test_mask = g2.ndata['test_mask']

Done loading data from cached files.


In [10]:
train_mask = g2.nodes['review'].data['train_mask']
test_mask = g2.nodes['review'].data['test_mask']
train_idx = torch.nonzero(train_mask, as_tuple=False).squeeze()
test_idx = torch.nonzero(test_mask, as_tuple=False).squeeze()
labels = g2.nodes['review'].data.pop("label")
num_rels = len(g2.canonical_etypes)
num_classes = amazon.num_classes

# normalization factor
for cetype in g2.canonical_etypes:
    g2.edges[cetype].data["norm"] = dgl.norm_by_dst(g2, cetype).unsqueeze(1)

In [11]:
# configurations
n_hidden = 16  # number of hidden units
n_bases = -1  # use number of relations as number of bases
n_hidden_layers = 0  # use 1 input layer, 1 output layer, no hidden layer
n_epochs = 100  # epochs to train
lr = 0.01  # learning rate
l2norm = 0  # L2 norm coefficient

# create graph
g = dgl.to_homogeneous(g2, edata=["norm"])
node_ids = torch.arange(g.num_nodes())
#target_idx = node_ids[g2.ndata[dgl.NTYPE] == category_id]

# create model
model = Model(
    g.num_nodes(),
    n_hidden,
    num_classes,
    num_rels,
    num_bases=n_bases,
    num_hidden_layers=n_hidden_layers,
)

In [12]:
# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2norm)

print("start training...")
model.train()
for epoch in range(n_epochs):
    optimizer.zero_grad()
    logits = model.forward(g)
    #logits = logits[target_idx]
    loss = F.cross_entropy(logits[train_idx], labels[train_idx])
    loss.backward()

    optimizer.step()

    train_acc = torch.sum(logits[train_idx].argmax(dim=1) == labels[train_idx])
    train_acc = train_acc.item() / len(train_idx)
    val_loss = F.cross_entropy(logits[test_idx], labels[test_idx])
    val_acc = torch.sum(logits[test_idx].argmax(dim=1) == labels[test_idx])
    val_acc = val_acc.item() / len(test_idx)
    print(
        "Epoch {:05d} | ".format(epoch)
        + "Train Accuracy: {:.4f} | Train Loss: {:.4f} | ".format(
            train_acc, loss.item()
        )
        + "Validation Accuracy: {:.4f} | Validation loss: {:.4f}".format(
            val_acc, val_loss.item()
        )
    )

start training...
Epoch 00000 | Train Accuracy: 0.7030 | Train Loss: 0.6929 | Validation Accuracy: 0.7103 | Validation loss: 0.6929
Epoch 00001 | Train Accuracy: 0.8586 | Train Loss: 0.6646 | Validation Accuracy: 0.8610 | Validation loss: 0.6671
Epoch 00002 | Train Accuracy: 0.8586 | Train Loss: 0.6335 | Validation Accuracy: 0.8609 | Validation loss: 0.6394
Epoch 00003 | Train Accuracy: 0.8586 | Train Loss: 0.6016 | Validation Accuracy: 0.8607 | Validation loss: 0.6103
Epoch 00004 | Train Accuracy: 0.8586 | Train Loss: 0.5708 | Validation Accuracy: 0.8609 | Validation loss: 0.5813
Epoch 00005 | Train Accuracy: 0.8584 | Train Loss: 0.5430 | Validation Accuracy: 0.8610 | Validation loss: 0.5540
Epoch 00006 | Train Accuracy: 0.8583 | Train Loss: 0.5192 | Validation Accuracy: 0.8610 | Validation loss: 0.5297
Epoch 00007 | Train Accuracy: 0.8583 | Train Loss: 0.5001 | Validation Accuracy: 0.8610 | Validation loss: 0.5092
Epoch 00008 | Train Accuracy: 0.8583 | Train Loss: 0.4855 | Validation

In [13]:
import torch
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

# After training, when evaluating on test data:
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    logits = model.forward(g)
    probabilities = F.softmax(logits[test_idx], dim=1)  # Convert logits to probabilities
    # Assuming your positive class is 1 (adjust accordingly if it's different)
    positive_probabilities = probabilities[:, 1]  # Get probabilities for the positive class
    # Calculate ROC-AUC
    roc_auc = roc_auc_score(labels[test_idx].cpu(), positive_probabilities.cpu())
    print(f"ROC-AUC Score: {roc_auc}")

ROC-AUC Score: 0.7329092264805646


# Reddit

In [5]:
reddit = pygod_load_data('reddit')
g3 = dgl.graph((reddit.edge_index[0], reddit.edge_index[1]))
g3.ndata['feature'] = reddit.x
g3.ndata['label'] = reddit.y.type(torch.LongTensor)

In [6]:
num_nodes = g3.number_of_nodes()
indices = np.random.permutation(num_nodes)

# Assuming 70% training, 15% validation, 15% testing split
train_size = int(num_nodes * 0.8)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)

train_mask[indices[:train_size]] = True
test_mask[indices[train_size:]] = True

# Assign the masks to your graph
g3.ndata['train_mask'] = train_mask
g3.ndata['test_mask'] = test_mask

In [7]:
class ApplyNodeFunc(nn.Module):
    """
    This module applies a linear transformation followed by a non-linearity.
    """
    def __init__(self, mlp):
        super(ApplyNodeFunc, self).__init__()
        self.mlp = mlp

    def forward(self, h):
        h = self.mlp(h)
        h = F.relu(h)
        return h

class GINLayer(nn.Module):
    def __init__(self, in_feats, out_feats):
        super(GINLayer, self).__init__()
        self.linear = nn.Linear(in_feats, out_feats)
        self.apply_func = ApplyNodeFunc(self.linear)

    def forward(self, g, h):
        ginconv = GINConv(self.apply_func, 'sum')  # 'sum' is the aggregator type
        return ginconv(g, h)

class GINModel(nn.Module):
    def __init__(self, in_feats, hidden_feats, out_feats, num_layers):
        super(GINModel, self).__init__()
        self.layers = nn.ModuleList()
        # Input layer
        self.layers.append(GINLayer(in_feats, hidden_feats))
        # Hidden layers
        for i in range(num_layers - 2):
            self.layers.append(GINLayer(hidden_feats, hidden_feats))
        # Output layer
        self.layers.append(GINLayer(hidden_feats, out_feats))

    def forward(self, g):
        h = g.ndata['feature']
        for layer in self.layers:
            h = layer(g, h)
        return h

In [8]:
def train(model, g, labels, epochs, lr):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        logits = model(g)
        loss = F.cross_entropy(logits[g.ndata['train_mask']], labels[g.ndata['train_mask']])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f'Epoch {epoch}, Loss: {loss.item()}')

In [9]:
# Assuming g3 is your graph and it has 'feature' and 'label' as node data
in_feats = g3.ndata['feature'].shape[1]
hidden_feats = 64  # Example hidden feature size
out_feats = len(torch.unique(g3.ndata['label']))  # Number of classes
num_layers = 3  # Number of GIN layers

model = GINModel(in_feats, hidden_feats, out_feats, num_layers)
train(model, g3, g3.ndata['label'], epochs=100, lr=0.01)

Epoch 0, Loss: 18.26190185546875
Epoch 1, Loss: 18.448213577270508
Epoch 2, Loss: 16.28013038635254
Epoch 3, Loss: 7.517642498016357
Epoch 4, Loss: 0.6931473612785339
Epoch 5, Loss: 0.6931473612785339
Epoch 6, Loss: 0.6931473612785339
Epoch 7, Loss: 0.6931473612785339
Epoch 8, Loss: 0.6931473612785339
Epoch 9, Loss: 0.6931473612785339
Epoch 10, Loss: 0.6931473612785339
Epoch 11, Loss: 0.6931473612785339
Epoch 12, Loss: 0.6931473612785339
Epoch 13, Loss: 0.6931473612785339
Epoch 14, Loss: 0.6931473612785339
Epoch 15, Loss: 0.6931473612785339
Epoch 16, Loss: 0.6931473612785339
Epoch 17, Loss: 0.6931473612785339
Epoch 18, Loss: 0.6931473612785339
Epoch 19, Loss: 0.6931473612785339
Epoch 20, Loss: 0.6931473612785339
Epoch 21, Loss: 0.6931473612785339
Epoch 22, Loss: 0.6931473612785339
Epoch 23, Loss: 0.6931473612785339
Epoch 24, Loss: 0.6931473612785339
Epoch 25, Loss: 0.6931473612785339
Epoch 26, Loss: 0.6931473612785339
Epoch 27, Loss: 0.6931473612785339
Epoch 28, Loss: 0.693147361278533

In [10]:
model.eval()
with torch.no_grad():
    logits = model(g3)
    test_logits = logits[g3.ndata['test_mask']]
    test_labels = g3.ndata['label'][g3.ndata['test_mask']]

In [11]:
from sklearn.metrics import roc_auc_score

# Assuming binary classification and logits are output from your model
probs = torch.softmax(test_logits, dim=1)[:, 1].numpy()  # Probability for the positive class
auc_score = roc_auc_score(test_labels.numpy(), probs)
print(f'Test ROC-AUC: {auc_score}')

Test ROC-AUC: 0.5
