In [None]:
import random
import wandb
from tqdm import tqdm
import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import plotly.express as px


import torch

# Print the installed PyTorch version and the value of torch.version.cuda
# so the environment used for this run is recorded in the notebook output.
print(torch.__version__)
print(torch.version.cuda)

import torch_geometric
import torch_geometric.nn as pyg_nn
from torch_geometric.utils import to_networkx
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import Entities
from torch_geometric.nn import GATConv
from torch_geometric.utils import k_hop_subgraph


import collections
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib as mpl
import plotly.express as px

import seaborn as sns
from functools import partial
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


import collections
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV


import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

# Fix the global random seed (11) through PyTorch Lightning for repeatable runs.
pl.seed_everything(11)

## 1.2 AM dataset

In [None]:
# Load the AM entity-classification dataset (stored under data/am) and take
# its first -- index 0 -- graph object.
datasetAM = Entities(root='data/am', name='AM')
dataAM = datasetAM[0]

EDGE TYPES

In [None]:
# Distinct relation ids present in the graph, and how many there are.
relation_ids = dataAM.edge_type.unique()
print(relation_ids)
print(len(relation_ids))

# Train / test labels as stored on the first node store of the graph.
node_store = dataAM.node_stores[0]
print(node_store.train_y)
print(node_store.test_y)

tensor([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
         42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
         56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
         70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
         84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
         98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
        112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
        126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
        140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
        154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
        168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 1

NODE TYPES

In [None]:
# Summary statistics of the AM graph, emitted one per line.
print(
    f'Number of nodes: {dataAM.num_nodes}',
    f'Number of edges: {dataAM.num_edges}',
    f'Number of classes: {datasetAM.num_classes}',
    f'Number of node features: {dataAM.num_node_features}',
    f'Number of edge features: {dataAM.num_edge_features}',
    f'Number of edge types: {len(dataAM.edge_type.unique())}',
    # Structural properties of the graph.
    f'Contains isolated nodes: {dataAM.has_isolated_nodes()}',
    f'Contains self-loops: {dataAM.has_self_loops()}',
    f'Is undirected: {dataAM.is_undirected()}',
    # Edges per node (edge count divided by node count).
    f'Average node degree: {(dataAM.num_edges) / dataAM.num_nodes:.2f}',
    sep='\n',
)

Number of nodes: 1666764
Number of edges: 11976642
Number of classes: 11
Number of node features: 0
Number of edge features: 0
Number of edge types: 266
Contains isolated nodes: False
Contains self-loops: True
Is undirected: True
Average node degree: 7.19


### Random Prediction

In [None]:
# Set of distinct relation ids (as Python ints) occurring in the graph.
# Computed with a single vectorised `unique()` call instead of a Python-level
# loop over every edge, which converted one tensor element at a time.
edge_types = set(dataAM.edge_type.unique().tolist())

In [None]:
def predict(node, num_classes=None):
    """Return a uniformly random class label for ``node`` (random baseline).

    The predictions of this baseline are compared against the node class
    labels in ``dataAM.test_y``, so the label must be drawn from the class ids
    ``0 .. num_classes - 1``. (Sampling from the relation ids in
    ``edge_types`` draws from the wrong label space and under-reports the
    chance-level accuracy.)

    Args:
        node: The node to predict for. Ignored -- the guess is independent of
            the node -- but kept so the function can be mapped over node ids.
        num_classes: Number of target classes. Defaults to
            ``datasetAM.num_classes`` when omitted.

    Returns:
        An integer class id in ``[0, num_classes)``.
    """
    if num_classes is None:
        num_classes = datasetAM.num_classes
    return np.random.choice(num_classes)

In [None]:
# Score the random baseline three times and report mean / std of the accuracy.
acc = []
test_targets = dataAM.test_y.numpy()
num_test = len(dataAM.test_idx)
for _ in range(3):
    # One independent random guess per test node.
    guesses = np.array([predict(node) for node in dataAM.test_idx])
    correct = (test_targets == guesses).sum()
    acc.append(correct / num_test)


print((np.mean(acc), np.std(acc)))
print(acc)


(0.0033670033670033673, 0.004761661826172038)
[0.010101010101010102, 0.0, 0.0]


### Node Prediction baseline using self-made features

In [None]:
# Hand-crafted node features: for every node, count its edges per relation,
# separately for the source role (first half of the columns) and the
# destination role (second half of the columns).
src, dst = dataAM.edge_index.detach().numpy()
edge_type = dataAM.edge_type.detach().numpy()

num_rel = dataAM.num_edge_types
featuresAM = np.zeros((dataAM.num_nodes, 2 * num_rel))

# np.add.at accumulates correctly when the same (row, column) pair repeats.
np.add.at(featuresAM, (src, edge_type), 1)
np.add.at(featuresAM, (dst, num_rel + edge_type), 1)


In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score

# Train / test design matrices taken from the hand-crafted feature table.
X = featuresAM[dataAM.train_idx]
y = dataAM.train_y
X_test = featuresAM[dataAM.test_idx]

acc = []
for _ in range(3):
    # Fit a linear-kernel SVM on the training nodes.
    clf = svm.SVC(kernel='linear').fit(X, y)

    # Predict the test nodes and record the accuracy of this run.
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(dataAM.test_y, y_pred)
    acc.append(accuracy)


print((np.mean(acc), np.std(acc)))
print(acc)


(0.6717171717171717, 0.0)
[0.6717171717171717, 0.6717171717171717, 0.6717171717171717]


### GNN

Notes:
- number of edge types influences the model size significantly

In [None]:
# Every node gets the same single constant feature (a column of ones).
# Alternative kept for reference: use the hand-crafted relation counts instead.
# dataAM.x = torch.Tensor(featuresAM)
dataAM.x = torch.ones((dataAM.num_nodes, 1))

In [None]:
#https://github.com/pyg-team/pytorch_geometric/blob/master/examples/rgcn.py

# Since our model does only make use of a rather small receptive field, we
# filter the graph to only contain the nodes that are at most 2-hop neighbors
# away from any training/test node.

# from torch_geometric.utils import k_hop_subgraph

# node_idx = torch.cat([dataAM.train_idx, dataAM.test_idx], dim=0)
# node_idx, edge_index, mapping, edge_mask = k_hop_subgraph(node_idx, 2, dataAM.edge_index, relabel_nodes=True)

# dataAM.num_nodes = node_idx.size(0)
# dataAM.edge_index = edge_index
# dataAM.edge_type = dataAM.edge_type[edge_mask]
# dataAM.train_idx = mapping[:dataAM.train_idx.size(0)]
# dataAM.test_idx = mapping[dataAM.train_idx.size(0):]
# dataAM.x = dataAM.x[node_idx]


In [None]:
from torch_geometric.data import Data
from torch_geometric.loader import NeighborLoader

# Dense per-node label vector: train / test nodes receive their class label,
# every other node keeps the fill value 0.
node_labels = torch.zeros(dataAM.num_nodes, dtype=torch.long)
node_labels[dataAM.train_idx] = dataAM.train_y
node_labels[dataAM.test_idx] = dataAM.test_y
dataAM.y = node_labels

# Plain `Data` object carrying everything the neighbour sampler needs.
am_data = Data(
    x=dataAM.x,
    edge_index=dataAM.edge_index,
    edge_type=dataAM.edge_type,
    y=dataAM.y,
    train_idx=dataAM.train_idx,
    test_idx=dataAM.test_idx,
)


def _make_neighbor_loader(seed_nodes):
    """Build a shuffled 3-hop NeighborLoader over ``am_data`` for ``seed_nodes``."""
    return NeighborLoader(
        am_data,
        num_neighbors=[30, 15, 10],
        batch_size=64,
        input_nodes=seed_nodes,
        num_workers=1,
        shuffle=True,
    )


loader = _make_neighbor_loader(dataAM.train_idx)
test_loader = _make_neighbor_loader(dataAM.test_idx)

In [None]:
class RGCNWrapper(pl.LightningModule):
    """Lightning module wrapping a stack of ``FastRGCNConv`` layers for node classification.

    The network consists of one input ``FastRGCNConv`` layer, followed by
    ``num_hidden_layers`` further ``FastRGCNConv`` layers (each preceded by
    dropout and followed by ``norm()`` and ReLU), and a final linear layer
    whose outputs are turned into log-probabilities.

    Args:
        in_channels: Size of the input node features.
        out_channels: Width of every convolution layer.
        num_relations: Number of relation (edge) types. May be an ``int`` or a
            0-dim integer tensor; it is converted to ``int``.
        num_hidden_layers: Number of additional convolution layers after the
            input layer.
        norm: Zero-argument factory that returns a normalisation module; it is
            called once per hidden layer. Defaults to ``nn.Identity``.
        num_bases: Forwarded to ``FastRGCNConv`` as its ``num_bases`` argument.
        num_classes: Number of output classes. Defaults to
            ``datasetAM.num_classes`` when omitted.
    """

    def __init__(self, in_channels, out_channels, num_relations, num_hidden_layers, norm: nn.Module = nn.Identity, num_bases=None, num_classes=None):
        super().__init__()

        # Callers may pass `edge_type.max() + 1`, which is a tensor.
        num_relations = int(num_relations)
        if num_classes is None:
            num_classes = datasetAM.num_classes

        self.num_hidden_layers = num_hidden_layers
        self.norms = nn.ModuleList()
        self.conv = pyg_nn.FastRGCNConv(in_channels, out_channels, num_relations, num_bases)
        for i in range(num_hidden_layers):
            self.norms.append(norm())
            # Registered as attributes `conv0`, `conv1`, ... so the parameter
            # names of the module stay the same as before.
            setattr(self, f'conv{i}', pyg_nn.FastRGCNConv(out_channels, out_channels, num_relations, num_bases))
        self.lin = torch.nn.Linear(out_channels, num_classes)

    def forward(self, x, edge_index, edge_type):
        """Return per-node log-probabilities over the classes."""
        x = self.conv(x, edge_index, edge_type)
        x = F.relu(x)

        for i in range(self.num_hidden_layers):
            x = F.dropout(x, training=self.training)
            x = getattr(self, f'conv{i}')(x, edge_index, edge_type)
            x = self.norms[i](x)
            x = F.relu(x)

        x = F.dropout(x, training=self.training)
        x = self.lin(x)
        return F.log_softmax(x, dim=1)

    def training_step(self, batch, batch_idx=None):
        """Compute the NLL loss on the seed nodes of a neighbour-sampled batch.

        The mini-batch contains the seed (training) nodes plus their sampled
        neighbours; the loader places the seed nodes in the first
        ``batch.batch_size`` rows, so only those rows contribute to the loss.
        Indexing with a membership mask over ``batch.train_idx`` is not valid
        here because that mask has one entry per training node, not one per
        node in the batch.
        """
        out = self(batch.x, batch.edge_index, batch.edge_type)

        num_seeds = batch.batch_size
        loss = F.nll_loss(out[:num_seeds], batch.y[:num_seeds])
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """Adam with lr=0.001 and weight decay 5e-4 over all parameters."""
        optimizer = optim.Adam(self.parameters(), lr=0.001, weight_decay=5e-4)
        return optimizer

# Log the run to Weights & Biases.
logger = WandbLogger(name="AM", project="adlg-2")

trainer = pl.Trainer(
    accelerator='cpu',
    max_epochs=2,
    logger=logger,
)

# `edge_type.max()` is a tensor; pass the relation count as a plain int.
model_wrapper = RGCNWrapper(dataAM.x.shape[1], 36, int(dataAM.edge_type.max()) + 1, 1, num_bases=40)

trainer.fit(model_wrapper, loader)

# Evaluation loader over the test nodes; the order is irrelevant for the
# aggregated accuracy, so no shuffling is needed.
test_loader = NeighborLoader(
    am_data,
    num_neighbors=[30,15,10],
    batch_size=64,
    input_nodes=dataAM.test_idx,
    num_workers=1,
    shuffle=False,
)

model_wrapper.eval()

preds = []
labels = []
# Inference only: no gradients required.
with torch.no_grad():
    for batch in test_loader:
        out = model_wrapper(batch.x, batch.edge_index, batch.edge_type)
        # Only the first `batch.batch_size` rows are the test (seed) nodes;
        # the remaining rows are sampled neighbours and must not be scored.
        # Selecting by "label != 0" instead would drop class-0 test nodes and
        # count labelled neighbours (including training nodes).
        num_seeds = batch.batch_size
        preds.append(out[:num_seeds].argmax(1))
        labels.append(batch.y[:num_seeds])

labels = torch.cat(labels)
preds = torch.cat(preds)

correct = float(preds.eq(labels).sum().item())
acc = correct / len(labels)

print(f'Test Accuracy: {acc:.4f}')


In [None]:
# NOTE(review): `RGCN` is not defined in the cells shown here -- it must be
# defined before this cell runs.
# `edge_type.max()` is a tensor; pass the relation count as a plain int.
model = RGCN(dataAM.x.shape[1], 36, int(dataAM.edge_type.max()) + 1, 1)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

# if torch.cuda.is_available():
#     device = torch.device('cuda')
# elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
#     device = torch.device('mps')
# else:
#     device = torch.device('cpu')

# model = model.to(device)
# dataAM = dataAM.to(device)

# Train the model: one pass over the neighbour-sampled training batches.
model.train()
for batch in loader:
    optimizer.zero_grad()

    out = model(batch.x, batch.edge_index, batch.edge_type)

    # The seed (training) nodes are the first `batch.batch_size` rows of the
    # batch. Their labels live in `batch.y`; the batches are drawn from
    # `am_data`, which has no `train_y` attribute.
    num_seeds = batch.batch_size
    loss = criterion(out[:num_seeds], batch.y[:num_seeds])
    loss.backward()
    optimizer.step()

# Evaluate the model
model.eval()
# pred = model(dataAM.x, dataAM.edge_index, dataAM.edge_type)
# correct = float(pred[dataAM.test_idx].argmax(1).eq(dataAM.test_y).sum().item())
# acc = correct / len(dataAM.test_idx)
# print(f'Test Accuracy: {acc:.4f}')


: 