In [1]:
import torch
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import Node2Vec

from sklearn.linear_model import LogisticRegressionCV
from collections import defaultdict
import random
from tqdm import tqdm
from utils import *

from sklearn.metrics import precision_recall_fscore_support

In [2]:
# Name of the Planetoid citation dataset used throughout the notebook.
data_name = 'Pubmed'

# Seed the RNGs of the libraries imported above so that a Restart & Run All
# repeats the same shuffling / sampling as far as those libraries allow.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

### Node classification with node2vec

In [3]:
# Load the Planetoid dataset selected by `data_name`, rooted at ./data,
# and keep its first (index 0) graph object.
dataset = Planetoid(root='./data', name=data_name)
data = dataset[0]

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [8]:
# Indices of the nodes flagged in the test mask (one row per node).
torch.nonzero(data.test_mask)

tensor([[18717],
        [18718],
        [18719],
        [18720],
        [18721],
        [18722],
        [18723],
        [18724],
        [18725],
        [18726],
        [18727],
        [18728],
        [18729],
        [18730],
        [18731],
        [18732],
        [18733],
        [18734],
        [18735],
        [18736],
        [18737],
        [18738],
        [18739],
        [18740],
        [18741],
        [18742],
        [18743],
        [18744],
        [18745],
        [18746],
        [18747],
        [18748],
        [18749],
        [18750],
        [18751],
        [18752],
        [18753],
        [18754],
        [18755],
        [18756],
        [18757],
        [18758],
        [18759],
        [18760],
        [18761],
        [18762],
        [18763],
        [18764],
        [18765],
        [18766],
        [18767],
        [18768],
        [18769],
        [18770],
        [18771],
        [18772],
        [18773],
        [18774],
        [18775

In [8]:
# node2vec embedding model built on the graph's edge list.
node2vec = Node2Vec(
    data.edge_index,
    embedding_dim=128,       # size of each node embedding
    walk_length=20,
    context_size=10,
    walks_per_node=10,
    num_negative_samples=1,
    p=1,                     # return parameter
    q=1,                     # in-out parameter
    sparse=True,             # sparse embedding gradients; paired with SparseAdam below
)
node2vec = node2vec.to(device)

In [9]:
# Loader yielding (positive walks, negative walks) batches for training.
loader = node2vec.loader(
    batch_size=128,
    shuffle=True,
    num_workers=4,
)

# The model was created with sparse=True, hence the sparse variant of Adam.
optimizer = torch.optim.SparseAdam(
    params=list(node2vec.parameters()),
    lr=0.01,
)

In [10]:
epochs = 100
print_every = 5
# Iteration cap of the logistic-regression probe; shared by the validation and
# the final test evaluation so both are fit under the same settings.
clf_max_iter = 150

for t in range(1, epochs + 1):
    # train: one pass over all batches of positive / negative random walks
    node2vec.train()
    total_loss = 0.0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        batch_loss = node2vec.loss(pos_rw.to(device), neg_rw.to(device))
        batch_loss.backward()
        optimizer.step()
        total_loss += batch_loss.item()
    # mean batch loss of this epoch
    loss = total_loss / len(loader)

    # val: fitting the probe is only needed on epochs that are reported,
    # so skip it on all other epochs
    if t == 1 or t % print_every == 0:
        node2vec.eval()
        with torch.no_grad():
            node_embeddings = node2vec()
            acc = node2vec.test(
                node_embeddings[data.train_mask], data.y[data.train_mask],
                node_embeddings[data.val_mask], data.y[data.val_mask],
                max_iter=clf_max_iter,
            )
        # the reported quantity is the value returned by node2vec.loss, i.e. a loss
        print('Epoch {:3d} | Loss: {:.6f}; Accuracy: {:.4f}'.format(t, loss, acc))

# test: fit the probe on train + val nodes, score it on the held-out test nodes
node2vec.eval()
with torch.no_grad():
    node_embeddings = node2vec()
    train_val_mask = data.train_mask | data.val_mask
    acc = node2vec.test(
        node_embeddings[train_val_mask], data.y[train_val_mask],
        node_embeddings[data.test_mask], data.y[data.test_mask],
        max_iter=clf_max_iter,
    )
print('Test Accuracy: {:.3f}'.format(acc))

Epoch   1 | Log-likelihood: 8.094423; Accuracy: 0.1600
Epoch   5 | Log-likelihood: 3.490892; Accuracy: 0.2540
Epoch  10 | Log-likelihood: 1.735913; Accuracy: 0.4060
Epoch  15 | Log-likelihood: 1.161114; Accuracy: 0.5380
Epoch  20 | Log-likelihood: 0.969538; Accuracy: 0.6220
Epoch  25 | Log-likelihood: 0.900347; Accuracy: 0.6520
Epoch  30 | Log-likelihood: 0.869082; Accuracy: 0.6740
Epoch  35 | Log-likelihood: 0.852614; Accuracy: 0.6960
Epoch  40 | Log-likelihood: 0.843454; Accuracy: 0.6880
Epoch  45 | Log-likelihood: 0.837430; Accuracy: 0.6920
Epoch  50 | Log-likelihood: 0.833483; Accuracy: 0.7040
Epoch  55 | Log-likelihood: 0.829448; Accuracy: 0.7180
Epoch  60 | Log-likelihood: 0.829054; Accuracy: 0.7000
Epoch  65 | Log-likelihood: 0.827198; Accuracy: 0.6960
Epoch  70 | Log-likelihood: 0.825836; Accuracy: 0.7000
Epoch  75 | Log-likelihood: 0.825624; Accuracy: 0.7080
Epoch  80 | Log-likelihood: 0.824754; Accuracy: 0.7160
Epoch  85 | Log-likelihood: 0.825802; Accuracy: 0.6980
Epoch  90 

### Link prediction with node2vec

In [14]:
# Sample positive / negative node pairs for link prediction: one set restricted
# by the train + val mask, one by the test mask.
# The masks are combined with `|` (logical OR), matching how train and val are
# merged for the node-classification test above.
# NOTE(review): `sample_edges` comes from `utils`; its exact sampling rules and
# the meaning of `strict` are not visible here - confirm against that module.
pos_samples_tr, neg_samples_tr = sample_edges(
    data.edge_index, data.train_mask | data.val_mask, strict=True
)
pos_samples_te, neg_samples_te = sample_edges(
    data.edge_index, data.test_mask, strict=True
)
len(pos_samples_tr), len(neg_samples_tr), len(pos_samples_te), len(neg_samples_te)

100%|██████████| 640/640 [00:29<00:00, 21.60it/s]
100%|██████████| 1000/1000 [00:50<00:00, 19.93it/s]


(382, 640, 692, 1000)

In [15]:
# Calling the model without arguments yields the embedding matrix that is
# indexed by node id in the cells below; show its dimensions.
node_embeddings = node2vec()
node_embeddings.size()

torch.Size([2708, 128])

In [16]:
def hadamard_edge_features(embeddings, node_pairs):
    """Return one feature vector per node pair: the element-wise (Hadamard)
    product of the embeddings of the pair's two nodes.

    Parameters
    ----------
    embeddings : tensor indexed by node id along its first axis.
    node_pairs : index object used as ``embeddings[node_pairs, :]``; the result
        is read at positions 0 and 1 of its second axis, so each entry is
        expected to hold two node ids (presumably the two endpoints of an
        edge - confirm against ``sample_edges``).

    Returns
    -------
    Tensor with one row per pair.
    """
    # gather both endpoints once instead of once per operand
    pair_embeddings = embeddings[node_pairs, :]
    return pair_embeddings[:, 0, :] * pair_embeddings[:, 1, :]


# apply Hadamard as binary operator to embeddings
embeddings_pos_hdmd_tr = hadamard_edge_features(node_embeddings, pos_samples_tr)
embeddings_neg_hdmd_tr = hadamard_edge_features(node_embeddings, neg_samples_tr)
embeddings_pos_hdmd_te = hadamard_edge_features(node_embeddings, pos_samples_te)
embeddings_neg_hdmd_te = hadamard_edge_features(node_embeddings, neg_samples_te)

In [17]:
# Stack the edge features (positives first, then negatives) and build the
# matching label vectors (1 for positive pairs, 0 for negative pairs),
# converting everything to NumPy arrays for scikit-learn.
embeddings_hdmd_tr = (
    torch.cat((embeddings_pos_hdmd_tr, embeddings_neg_hdmd_tr))
    .detach()
    .cpu()
    .numpy()
)
targets_tr = (
    torch.cat((
        torch.ones(len(embeddings_pos_hdmd_tr)),
        torch.zeros(len(embeddings_neg_hdmd_tr)),
    ))
    .detach()
    .cpu()
    .numpy()
)

embeddings_hdmd_te = (
    torch.cat((embeddings_pos_hdmd_te, embeddings_neg_hdmd_te))
    .detach()
    .cpu()
    .numpy()
)
targets_te = (
    torch.cat((
        torch.ones(len(embeddings_pos_hdmd_te)),
        torch.zeros(len(embeddings_neg_hdmd_te)),
    ))
    .detach()
    .cpu()
    .numpy()
)

# sanity check: features and targets must agree on the number of rows
(
    embeddings_hdmd_tr.shape,
    targets_tr.shape,
    embeddings_hdmd_te.shape,
    targets_te.shape,
)

((1022, 128), (1022,), (1692, 128), (1692,))

In [18]:
%%time
# Cross-validated logistic regression on the Hadamard edge features;
# class_weight='balanced' compensates for the unequal number of positive and
# negative pairs.
clf = LogisticRegressionCV(class_weight='balanced', max_iter=1000, n_jobs=-1)
clf.fit(embeddings_hdmd_tr, targets_tr)
tr_outputs = clf.predict(embeddings_hdmd_tr)
te_outputs = clf.predict(embeddings_hdmd_te)

# average='binary' reports precision / recall of the positive class (label 1).
# With average='micro' on single-label targets both numbers collapse to the
# accuracy and are therefore always identical.
tr_prec, tr_recall, _, _ = precision_recall_fscore_support(
    targets_tr, tr_outputs, average='binary', pos_label=1
)
te_prec, te_recall, _, _ = precision_recall_fscore_support(
    targets_te, te_outputs, average='binary', pos_label=1
)

print('tr prec: {:.4f}; tr recall: {:.4f}'.format(tr_prec, tr_recall))
print('te prec: {:.4f}; te recall: {:.4f}'.format(te_prec, te_recall))

Accuracy on combined train, val: 0.990215
Accuracy on test: 0.985225
CPU times: user 42.1 ms, sys: 88.1 ms, total: 130 ms
Wall time: 1.96 s


In [296]:
# Display the training labels: ones for the positive pairs followed by zeros
# for the negative pairs.
targets_tr

array([1., 1., 1., ..., 0., 0., 0.], dtype=float32)