In [2]:
import dgl
import dgl.nn as dglnn

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
# from dgl import AddSelfLoop
# from dgl.data import CiteseerGraphDataset, CoraGraphDataset, PubmedGraphDataset

# for metrics
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import numpy as np

In [3]:
class GCN(nn.Module):
    """Multi-layer graph convolutional network built from ``dglnn.GraphConv``.

    The stack is ``in_size -> hid_size`` (ReLU), followed by
    ``num_layers - 2`` layers of ``hid_size -> hid_size`` (ReLU), followed by
    a ``hid_size -> out_size`` output layer without activation. A single
    shared dropout module is applied to the input of every layer except the
    first one.

    Args:
        in_size: Input feature size given to the first ``GraphConv``.
        hid_size: Feature size of the hidden layers.
        out_size: Feature size produced by the final layer.
        num_layers: Total number of ``GraphConv`` layers; must be at least 2.
        dropout: Drop probability handed to ``nn.Dropout``. Defaults to 0.5,
            the value that used to be hard-coded.

    Raises:
        AssertionError: If ``num_layers`` is smaller than 2.
    """

    def __init__(self, in_size, hid_size, out_size, num_layers=2, dropout=0.5):
        super().__init__()
        assert num_layers >= 2, "Number of layers should be at least 2."

        self.layers = nn.ModuleList()
        # first layer
        self.layers.append(dglnn.GraphConv(in_size, hid_size, activation=F.relu))
        # hidden layers (none when num_layers == 2)
        for _ in range(num_layers - 2):
            self.layers.append(dglnn.GraphConv(hid_size, hid_size, activation=F.relu))
        # output layer: no activation, so the forward pass returns raw scores
        self.layers.append(dglnn.GraphConv(hid_size, out_size))
        self.dropout = nn.Dropout(dropout)

    def forward(self, g, features):
        """Run ``features`` through every layer on graph ``g``.

        Dropout is skipped on the raw input and applied to the input of each
        subsequent layer. Returns the output of the last layer.
        """
        h = self.layers[0](g, features)
        for layer in self.layers[1:]:
            h = layer(g, self.dropout(h))
        return h

In [4]:
def feature_norm(features):
    """Min-max scale each feature column of ``features`` into ``[-1, 1]``.

    The minimum and maximum are taken along dimension 0, so every column is
    scaled independently: its minimum maps to -1 and its maximum to 1.

    A column whose values are all equal has a zero range; dividing by it
    would turn the whole column into NaN (0 / 0). Such columns use a range of
    1 instead, so they come out as a constant -1.

    Args:
        features: Tensor whose first dimension indexes samples.

    Returns:
        A tensor of the same shape as ``features`` with the scaled values.
    """
    min_values = features.min(dim=0).values
    max_values = features.max(dim=0).values
    value_range = max_values - min_values
    # Guard against constant columns: replace a zero range by one.
    value_range = torch.where(value_range == 0, torch.ones_like(value_range), value_range)
    return 2 * (features - min_values).div(value_range) - 1


def train(g, features, labels, masks, model, epochs=2000, patience=1500, save_path='best_model.pth'):
    """Train ``model`` full-batch with early stopping on validation accuracy.

    Every epoch performs one forward/backward pass over the whole graph,
    steps Adam (lr=1e-2, weight_decay=5e-4) and a ``StepLR`` schedule that
    halves the learning rate every 50 epochs, then scores the validation
    split with ``evaluate``. Whenever the validation accuracy improves, the
    model's ``state_dict`` is written to ``save_path``.

    Args:
        g: Graph forwarded to ``model``.
        features: Node feature tensor forwarded to ``model``.
        labels: Target tensor; it is indexed with the masks.
        masks: Sequence whose items 0 and 1 select the training and the
            validation nodes. Further items are ignored here.
        model: Module called as ``model(g, features)``.
        epochs: Maximum number of epochs.
        patience: Number of consecutive non-improving epochs after which
            training stops early.
        save_path: File that receives the best ``state_dict``.

    Note:
        The model keeps its last-epoch weights when this function returns;
        the best weights are only available from ``save_path``.
    """
    # Place the masks on the data's device once instead of converting them
    # implicitly on every indexing operation inside the loop.
    train_mask = torch.as_tensor(masks[0], device=features.device)
    val_mask = torch.as_tensor(masks[1], device=features.device)

    loss_fcn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-2, weight_decay=5e-4)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

    # Start below every reachable accuracy so the first epoch always writes a
    # checkpoint, even when the validation accuracy is exactly 0.
    best_val_acc = float('-inf')
    patience_counter = 0

    for epoch in tqdm(range(epochs), desc="Epochs"):
        model.train()
        logits = model(g, features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Evaluate on validation set (switches the model to eval mode; the
        # next iteration switches it back with model.train()).
        val_acc, val_f1 = evaluate(g, features, labels, val_mask, model)
        if epoch % 100 == 0:
            print(f"Epoch {epoch:05d} | Loss {loss.item():.4f} | Val Accuracy {val_acc:.4f} | Val F1 {val_f1:.4f}")

        # Early stopping and model saving
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), save_path)
        else:
            patience_counter += 1

        if patience_counter == patience:
            print("Early stopping triggered.")
            break


def evaluate(g, features, labels, mask, model):
    """Score ``model`` on the nodes selected by ``mask``.

    The model is switched to evaluation mode and run once on the full graph
    without gradient tracking. Predictions are the argmax over dimension 1 of
    the masked logits.

    Returns:
        A pair ``(accuracy, f1)``: the mean of exact matches as a Python
        float, and the weighted F1 score computed by ``f1_score``.
    """
    model.eval()
    with torch.no_grad():
        masked_logits = model(g, features)[mask]
        targets = labels[mask]
        predicted = masked_logits.argmax(dim=1)
        accuracy = predicted.eq(targets).float().mean().item()
        weighted_f1 = f1_score(targets.cpu(), predicted.cpu(), average='weighted')
    return accuracy, weighted_f1

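## Load datasets

Run exactly one of the dataset cells below: each of them rebinds the same variables (`adj`, `features`, `idx_train`, `idx_val`, `idx_test`, `labels`, `sens`, `feat_names`, `sens_names`), so the cell executed last determines which dataset the rest of the notebook uses.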

In [1]:
import sys

# Extend the import search path two directories up (relative to the current
# working directory) so that the project's `datasets` module can be imported;
# per the original note this is the EUG directory.
sys.path.append('../..')

from datasets import Bail

bail = Bail()

# Bind every dataset field in a single tuple assignment so that either all
# names are rebound or, if any accessor fails, none of them is.
(
    adj,
    features,
    idx_train,
    idx_val,
    idx_test,
    labels,
    sens,
    feat_names,
    sens_names,
) = (
    bail.adj(),
    bail.features(),
    bail.idx_train(),
    bail.idx_val(),
    bail.idx_test(),
    bail.labels(),
    bail.sens(),
    bail.feat_names(),
    bail.sens_names(),
)

exist


In [16]:
from datasets import Nba

nba = Nba()

# Bind every dataset field in a single tuple assignment so that either all
# names are rebound or, if any accessor fails, none of them is.
(
    adj,
    features,
    idx_train,
    idx_val,
    idx_test,
    labels,
    sens,
    feat_names,
    sens_names,
) = (
    nba.adj(),
    nba.features(),
    nba.idx_train(),
    nba.idx_val(),
    nba.idx_test(),
    nba.labels(),
    nba.sens(),
    nba.feat_names(),
    nba.sens_names(),
)

In [19]:
from datasets import Pokec_n

pokec_n = Pokec_n()

# Bind every dataset field in a single tuple assignment so that either all
# names are rebound or, if any accessor fails, none of them is.
(
    adj,
    features,
    idx_train,
    idx_val,
    idx_test,
    labels,
    sens,
    feat_names,
    sens_names,
) = (
    pokec_n.adj(),
    pokec_n.features(),
    pokec_n.idx_train(),
    pokec_n.idx_val(),
    pokec_n.idx_test(),
    pokec_n.labels(),
    pokec_n.sens(),
    pokec_n.feat_names(),
    pokec_n.sens_names(),
)

In [22]:
from datasets import Pokec_z

pokec_z = Pokec_z()

# Bind every dataset field in a single tuple assignment so that either all
# names are rebound or, if any accessor fails, none of them is.
(
    adj,
    features,
    idx_train,
    idx_val,
    idx_test,
    labels,
    sens,
    feat_names,
    sens_names,
) = (
    pokec_z.adj(),
    pokec_z.features(),
    pokec_z.idx_train(),
    pokec_z.idx_val(),
    pokec_z.idx_test(),
    pokec_z.labels(),
    pokec_z.sens(),
    pokec_z.feat_names(),
    pokec_z.sens_names(),
)

## process dataset

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Get the source and destination node IDs of every stored entry of the
# adjacency matrix (assumes `adj` is a torch sparse COO tensor, as required
# by `.coalesce().indices()`).
src, dst = adj.coalesce().indices()

# Create the graph with a single node type and a single edge type.
# The node count is passed explicitly: otherwise DGL infers it from the
# largest node ID that appears in an edge, so isolated nodes with the highest
# IDs would be dropped and the graph would have fewer nodes than `features`
# has rows.
g = dgl.heterograph(
    {('node', 'edge', 'node'): (src.cpu().numpy(), dst.cpu().numpy())},
    num_nodes_dict={'node': adj.shape[0]},
)
g = g.int().to(device)

# convert idx_train, idx_val, idx_test to boolean masks (one entry per node)
train_mask = torch.zeros(adj.shape[0], dtype=torch.bool)
val_mask = torch.zeros(adj.shape[0], dtype=torch.bool)
test_mask = torch.zeros(adj.shape[0], dtype=torch.bool)
train_mask[idx_train] = True
val_mask[idx_val] = True
test_mask[idx_test] = True
masks = train_mask, val_mask, test_mask

# normalize features
features = feature_norm(features)

features = features.to(device)
labels = labels.to(device)

## train model

In [24]:
# Build the GCN classifier: input width is the number of feature columns,
# hidden width is 16.
in_size = features.size(1)
# Number of distinct label values, not counting -1
# (presumably -1 marks unlabeled nodes; TODO confirm against the dataset).
out_size = int((labels.unique() != -1).sum())
model = GCN(in_size, 16, out_size).to(device)

# Fit the model; the best checkpoint is written to `save_path`.
# NOTE(review): the file name says "3layer" but GCN is built with its default
# num_layers=2 here; confirm which is intended.
print("Training...")
train(g, features, labels, masks, model, save_path='./gcn3layer_Pokecz.pth')

Training...


Epochs:   0%|          | 6/2000 [00:00<00:36, 54.84it/s]

Epoch 00000 | Loss 0.7337 | Val Accuracy 0.5520 | Val F1 0.3927


Epochs:   6%|▌         | 114/2000 [00:01<00:23, 80.81it/s]

Epoch 00100 | Loss 0.5333 | Val Accuracy 0.7060 | Val F1 0.7068


Epochs:  11%|█         | 217/2000 [00:02<00:22, 80.48it/s]

Epoch 00200 | Loss 0.5037 | Val Accuracy 0.7111 | Val F1 0.7112


Epochs:  16%|█▌        | 316/2000 [00:03<00:20, 83.44it/s]

Epoch 00300 | Loss 0.4965 | Val Accuracy 0.7138 | Val F1 0.7143


Epochs:  21%|██        | 414/2000 [00:05<00:19, 80.92it/s]

Epoch 00400 | Loss 0.4902 | Val Accuracy 0.7099 | Val F1 0.7103


Epochs:  26%|██▌       | 513/2000 [00:06<00:17, 85.07it/s]

Epoch 00500 | Loss 0.4904 | Val Accuracy 0.7072 | Val F1 0.7075


Epochs:  31%|███       | 612/2000 [00:07<00:16, 85.12it/s]

Epoch 00600 | Loss 0.5018 | Val Accuracy 0.7099 | Val F1 0.7103


Epochs:  36%|███▌      | 711/2000 [00:08<00:14, 85.93it/s]

Epoch 00700 | Loss 0.4960 | Val Accuracy 0.7099 | Val F1 0.7103


Epochs:  40%|████      | 810/2000 [00:09<00:13, 85.34it/s]

Epoch 00800 | Loss 0.5011 | Val Accuracy 0.7096 | Val F1 0.7099


Epochs:  45%|████▌     | 909/2000 [00:11<00:13, 83.52it/s]

Epoch 00900 | Loss 0.4976 | Val Accuracy 0.7096 | Val F1 0.7099


Epochs:  51%|█████     | 1017/2000 [00:12<00:11, 84.88it/s]

Epoch 01000 | Loss 0.4922 | Val Accuracy 0.7096 | Val F1 0.7099


Epochs:  56%|█████▌    | 1116/2000 [00:13<00:10, 84.62it/s]

Epoch 01100 | Loss 0.5002 | Val Accuracy 0.7096 | Val F1 0.7099


Epochs:  61%|██████    | 1214/2000 [00:14<00:10, 75.32it/s]

Epoch 01200 | Loss 0.5039 | Val Accuracy 0.7096 | Val F1 0.7099


Epochs:  66%|██████▌   | 1312/2000 [00:15<00:08, 77.27it/s]

Epoch 01300 | Loss 0.4920 | Val Accuracy 0.7096 | Val F1 0.7099


Epochs:  70%|███████   | 1409/2000 [00:17<00:07, 81.63it/s]

Epoch 01400 | Loss 0.4823 | Val Accuracy 0.7096 | Val F1 0.7099


Epochs:  76%|███████▌  | 1514/2000 [00:18<00:06, 77.94it/s]

Epoch 01500 | Loss 0.4915 | Val Accuracy 0.7096 | Val F1 0.7099


Epochs:  81%|████████  | 1612/2000 [00:19<00:04, 84.63it/s]

Epoch 01600 | Loss 0.5090 | Val Accuracy 0.7096 | Val F1 0.7099


Epochs:  84%|████████▎ | 1672/2000 [00:20<00:03, 82.06it/s]

Early stopping triggered.





## load model

In [23]:
# Re-create an untrained GCN with the same architecture as the training cell
# (hidden width 16) so that a saved state_dict can be loaded into it.
in_size = features.size(1)
# Number of distinct label values, not counting -1.
out_size = int((labels.unique() != -1).sum())
model = GCN(in_size, 16, out_size).to(device)

In [24]:
# load the model
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU can still be loaded on a CPU-only machine.
# NOTE(review): this path differs from the save_path used by the training
# cell ('./gcn3layer_Pokecz.pth'); confirm that this is the intended file.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
model.load_state_dict(torch.load('evaluation/cases/gcn/gcn_pokecz.pth', map_location=device))

<All keys matched successfully>

In [25]:
# Switch the model to evaluation mode (this disables the GCN's dropout) and
# run one forward pass over the whole graph without tracking gradients.
model.eval()
with torch.no_grad():
    # Raw outputs of the final GraphConv layer for the graph `g`.
    logits = model(g, features)