In [11]:
from IPython.display import display
import torch
import os
import pandas as pd
from torch_geometric.data import InMemoryDataset
from urllib.request import urlopen
from torch_geometric.utils import negative_sampling
import numpy as np
from torch_geometric.nn import GCNConv
from sklearn.metrics import roc_auc_score
import torch_geometric.transforms as T
from torch_geometric.transforms import ToUndirected, RandomLinkSplit
from torch_geometric.data import Data
from tqdm.notebook import tqdm
from torch_geometric.loader import DataLoader

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
# Define Neo4j connections
import os
import pandas as pd
from neo4j import GraphDatabase
# Connection settings are read from the environment (NEO4J_URI, NEO4J_USER,
# NEO4J_PASSWORD) so real credentials do not have to be stored in the notebook.
# The fallbacks are the values that were previously hardcoded here.
host = os.environ.get('NEO4J_URI', 'bolt://188.230.149.51:7687')
user = os.environ.get('NEO4J_USER', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD', 'letmein')
# Shared driver used by every query helper in this notebook.
driver = GraphDatabase.driver(host, auth=(user, password))

def run_query(query):
    """
    Runs a query through the module-level Neo4j driver.
    :param query: the query text handed to ``session.run``
    :return: a DataFrame with one row per result record and the result keys as columns
    """
    with driver.session() as session:
        result = session.run(query)
        # Materialise the records while the session is still open.
        rows = [record.values() for record in result]
        return pd.DataFrame(rows, columns=result.keys())

In [3]:
def load_frames():
    """
    Fetches the User nodes and the FRIEND relationships from Neo4j.
    :return: tuple of (nodes frame indexed by ``id`` with ``age`` and ``gender`` columns,
             edges frame with ``source_id`` and ``target_id`` columns)
    """
    node_query = """
    MATCH (u:User)
    RETURN u.id AS id, u.age AS age, u.gender AS gender
    """
    edge_query = """
    MATCH (s:User)-[:FRIEND]->(t:User)
    RETURN s.id as source_id, t.id as target_id
    """

    # The node id becomes the frame index so it can later be mapped to a row position.
    nodes = run_query(node_query).set_index("id")
    edges = run_query(edge_query)
    return nodes, edges


In [4]:
# Pull both frames from Neo4j and preview the first rows (edges first, then nodes).
dfn, dfe = load_frames()
display(dfe.head(), dfn.head())

Unnamed: 0,source_id,target_id
0,1,16
1,1,10
2,1,12
3,1,8
4,1,7


Unnamed: 0_level_0,age,gender
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,26,1
16,23,1
3,29,1
4,26,0
17,27,0


In [5]:
class IdentityEncoder:
    """
    Encoder that passes raw column values through unchanged.

    Calling an instance wraps the values of the given pandas object in a
    PyTorch tensor, reshapes it to a single column and converts it with
    ``Tensor.to(dtype)``.
    """

    def __init__(self, dtype=None):
        # Target passed to ``Tensor.to`` when the encoder is applied.
        self.dtype = dtype

    def __call__(self, df):
        raw = torch.from_numpy(df.values)
        # One row per entry, one feature column.
        column = raw.view(-1, 1)
        return column.to(self.dtype)

In [6]:
def transform_nodes(df, encoders = None, **kwargs):
    """
    Builds the node feature tensor and the lookup from frame index value to row position.
    :param df: the nodes frame; its index holds the original node ids
    :param encoders: optional dict of column name -> callable turning that column into a tensor
    :param kwargs: accepted but not used
    :return: tuple (x, mapping); x is None when no encoders are given, otherwise the
             encoded columns concatenated along the last dimension
    """
    # Position of every distinct index value, in order of first appearance.
    unique_ids = df.index.unique()
    mapping = dict(zip(unique_ids, range(len(unique_ids))))

    if encoders is None:
        return None, mapping

    encoded_columns = []
    for col, encoder in encoders.items():
        encoded_columns.append(encoder(df[col]))
    return torch.cat(encoded_columns, dim = -1), mapping

In [28]:
# Encode both feature columns as float tensors; each column gets its own encoder instance.
nodes_x, nodes_mapping = transform_nodes(
    dfn,
    encoders = {column: IdentityEncoder(torch.float) for column in ("age", "gender")},
)

In [29]:
def transform_edges(df_edges, nodes_mapping, encoders = None, **kwargs):
    """
    Transforms the edge frame to a Pyg compatible index and payload.
    :param df_edges: the edge frame with ``source_id`` and ``target_id`` columns
    :param nodes_mapping: the node index map (original node id -> row position)
    :param encoders: optional dict of column name -> callable turning that column into a tensor
    :param kwargs: accepted but not used
    :return: tuple (edge_index, edge_attr); edge_index is an integer tensor with one
             row of sources and one row of targets, edge_attr is None without encoders
    :raises KeyError: when ``nodes_mapping`` is a dict and an edge references an id it lacks
    """
    src = [nodes_mapping[src_id] for src_id in df_edges.source_id]
    dst = [nodes_mapping[tgt_id] for tgt_id in df_edges.target_id]
    # Pin the dtype: without it an empty edge frame would produce a float32
    # tensor, which is not usable as an index.
    edge_index = torch.tensor([src, dst], dtype = torch.long)

    edge_attr = None
    if encoders is not None:
        edge_attrs = [encoder(df_edges[col]) for col, encoder in encoders.items()]
        edge_attr = torch.cat(edge_attrs, dim = -1)

    return edge_index, edge_attr

In [30]:
# No edge encoders are passed, so the second return value (edges_label) is None.
edges_index, edges_label = transform_edges(dfe, nodes_mapping)

In [31]:
def create_data(nodes_payload, edges_index):
    """
    Wraps the node payload and the edge index in a ``Data`` object.
    :param nodes_payload: stored as ``x``
    :param edges_index: stored as ``edge_index``
    :return: the ``Data`` instance
    """
    return Data(x = nodes_payload, edge_index = edges_index)

data = create_data(nodes_x, edges_index)

In [32]:
# Preprocessing pipeline applied to the full graph before training.
transform = T.Compose([
    T.NormalizeFeatures(),
    T.ToDevice(device),
    # The edges come from a directed (s)-[:FRIEND]->(t) match, while the split
    # below is configured with is_undirected = True. Add the reverse edges first
    # so the graph really is undirected when it is split; this is a no-op if
    # both directions are already present.
    T.ToUndirected(),
    T.RandomLinkSplit(num_val = 0.05, num_test = 0.1, is_undirected = True,
                      add_negative_train_samples = False),
])

In [33]:
# Run the preprocessing pipeline on the full graph; its final RandomLinkSplit
# step yields the train, validation and test splits.
train_data, val_data, test_data = transform(data)


In [34]:
class Net(torch.nn.Module):
    """
    Link-prediction model: a two-layer GCN encoder with an inner-product decoder.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):
        """Returns node embeddings: conv1 -> ReLU -> conv2 (no activation on the output)."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

    def decode(self, z, edge_label_index):
        """
        Scores each candidate edge with the dot product of its two endpoint embeddings.
        The embeddings are not normalised, so the result is a raw logit rather than
        a cosine similarity.
        """
        source_emb = z[edge_label_index[0]]
        target_emb = z[edge_label_index[1]]
        return (source_emb * target_emb).sum(dim = -1)

    def decode_all(self, z):
        """
        Scores every pair of nodes at once and returns, as a two-row index tensor,
        the pairs whose dot product is strictly positive.
        """
        pair_scores = torch.matmul(z, z.t())
        positive_pairs = torch.nonzero(pair_scores > 0, as_tuple = False)
        return positive_pairs.t()

In [35]:
# GCN encoder with 128 hidden channels and 64-dimensional output embeddings.
model = Net(in_channels = data.num_features, hidden_channels = 128, out_channels = 64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)
# The decoder emits raw logits, so the sigmoid is folded into the loss.
criterion = torch.nn.BCEWithLogitsLoss()


def train():
    """
    Runs one optimisation step on the training split.
    :return: the loss tensor of this step
    """
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index)

    positive_edges = train_data.edge_label_index
    positive_labels = train_data.edge_label

    # Draw a fresh set of negative edges on every call, one per positive edge.
    negative_edges = negative_sampling(
        edge_index = train_data.edge_index,
        num_nodes = train_data.num_nodes,
        num_neg_samples = positive_edges.size(1),
        method = 'sparse',
    )
    negative_labels = positive_labels.new_zeros(negative_edges.size(1))

    # Positives first, negatives after; labels follow the same order.
    edge_label_index = torch.cat([positive_edges, negative_edges], dim = -1)
    edge_label = torch.cat([positive_labels, negative_labels], dim = 0)

    logits = model.decode(z, edge_label_index).view(-1)
    loss = criterion(logits, edge_label)
    loss.backward()
    optimizer.step()
    return loss

In [36]:
@torch.no_grad()
def test(d):
    """
    Evaluates the model on one split.
    :param d: split providing ``x``, ``edge_index``, ``edge_label_index`` and ``edge_label``
    :return: ROC AUC of the sigmoid edge scores against ``edge_label``
    """
    model.eval()
    embeddings = model.encode(d.x, d.edge_index)
    scores = model.decode(embeddings, d.edge_label_index).view(-1).sigmoid()
    y_true = d.edge_label.cpu().numpy()
    y_score = scores.cpu().numpy()
    return roc_auc_score(y_true, y_score)

In [None]:
final_test_auc = 0
best_val_auc = 0
epochs = 3
for epoch in range(1, epochs + 1):
    loss = train()
    # validation AUC
    val_auc = test(val_data)
    # test AUC
    test_auc = test(test_data)
    # Model selection: remember the test AUC of the epoch with the best
    # validation AUC seen so far. The running best must be updated here,
    # otherwise every epoch with a non-zero validation AUC would overwrite it.
    if val_auc > best_val_auc:
        best_val_auc = val_auc
        final_test_auc = test_auc
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_auc:.4f}, '
          f'Test: {test_auc:.4f}')

print(f'Final Test: {final_test_auc:.4f}')
# Predict links for all node pairs with the trained model. This is inference
# only, so switch to eval mode and skip building the autograd graph.
model.eval()
with torch.no_grad():
    z = model.encode(test_data.x, test_data.edge_index)
    final_edge_index = model.decode_all(z)