In [5]:
import torch
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from torch_geometric.utils import negative_sampling
from torch_geometric.nn import GCNConv
from torch_geometric.utils import train_test_split_edges
from torch_geometric.data import Data
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

import pandas as pd
import numpy as np

In [6]:
doc2vec_model = Doc2Vec.load("test_doc2vec.model")

200

In [7]:
word_feature_array = []

for i in range(len(doc2vec_model.docvecs)):
    word_feature_array.append(doc2vec_model.docvecs[i])

word_feature_array = np.array(word_feature_array)

In [9]:
node_type = pd.read_csv("./datasets/Training/node_classification.csv")
node_type_onehot = np.zeros([len(node_type), 4])
for i in range(len(node_type)):
    node_type_onehot[i][node_type.iloc[i][1]-1] = 1

Unnamed: 0,id,page_type
0,0,1
1,1,2
2,2,3
3,3,2
4,4,4


In [118]:
all_feature_array = np.concatenate((word_feature_array, node_type_onehot), axis=1)

In [119]:
edge = pd.read_csv("./datasets/Training/training_graph.csv")
edge = edge.to_numpy().T

In [121]:
all_data = Data(x=torch.tensor(all_feature_array, dtype=torch.float32),
    edge_index=torch.tensor(edge, dtype=torch.long), edge_attr=None)

all_data.num_nodes = len(all_feature_array)
all_data.num_features = len(all_feature_array[0])

In [122]:
train_test_data = train_test_split_edges(input)

In [123]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(data.num_features, 128)
        self.conv2 = GCNConv(128, 64)

    def encode(self):
        x = self.conv1(data.x, data.train_pos_edge_index)
        x = x.relu()
        return self.conv2(x, data.train_pos_edge_index)

    def decode(self, z, pos_edge_index, neg_edge_index):
        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=-1)
        logits = (z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)
        return logits

    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()

    def edgepred(self,z,te):
        prob_adj = z @ z.t()
        for k in range(len(te)):
            probability = prob_adj[te[k][0]][te[k][1]]
            probability = probability.detach().numpy()
            te[k].append(probability)
        
        return te
    
    def pred_one_edge(self, z, new_node, top_k = 5, remove_negative = True):
        prob_adj = z @ z.t()
        rank_list = np.zeros(data.num_nodes)
        for k in range(data.num_nodes):
            if k == new_node:
                rank_list[k] = -np.inf
            else:
                probability = prob_adj[new_node][k]
                probability = probability.detach().numpy()
                rank_list[k] = probability

        index_list = rank_list.argsort()
        top_k_index = index_list[-1:-top_k-1:-1]
        top_k_score = rank_list[top_k_index]

        if remove_negative:
            for neg_point, inde in enumerate(top_k_score):
                if inde < 0:
                    break
            top_k_index = top_k_index[:neg_point]
            top_k_score = top_k_score[:neg_point]

        return top_k_index, top_k_score


# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'
model, data = Net().to(device), data.to(device)
model = model.float()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)


def get_link_labels(pos_edge_index, neg_edge_index):
    E = pos_edge_index.size(1) + neg_edge_index.size(1)
    link_labels = torch.zeros(E, dtype=torch.float, device=device)
    link_labels[:pos_edge_index.size(1)] = 1.
    return link_labels


def train():
    model.train()
    neg_edge_index = negative_sampling(
        edge_index=data.train_pos_edge_index, num_nodes=data.num_nodes,
        num_neg_samples=data.train_pos_edge_index.size(1),
        force_undirected=True,
    )
    optimizer.zero_grad()
    z = model.encode()
    link_logits = model.decode(z, data.train_pos_edge_index, neg_edge_index)
    link_labels = get_link_labels(data.train_pos_edge_index, neg_edge_index)
    loss = F.binary_cross_entropy_with_logits(link_logits, link_labels)
    loss.backward()
    optimizer.step()
    return loss


@torch.no_grad()
def test():
    model.eval()
    perfs = []
    for prefix in ["val", "test"]:
        pos_edge_index = data[f'{prefix}_pos_edge_index']
        neg_edge_index = data[f'{prefix}_neg_edge_index']
        z = model.encode()
        link_logits = model.decode(z, pos_edge_index, neg_edge_index)
        link_probs = link_logits.sigmoid()
        link_labels = get_link_labels(pos_edge_index, neg_edge_index)
        perfs.append(roc_auc_score(link_labels.cpu(), link_probs.cpu()))
    return perfs

In [126]:
best_val_perf = test_perf = 0
for epoch in range(1, 30):
    train_loss = train()
    val_perf, tmp_test_perf = test()
    if val_perf > best_val_perf:
        best_val_perf = val_perf
        test_perf = tmp_test_perf
    log = 'Epoch: {:03d}, Loss: {:.4f}, Val: {:.4f}, Test: {:.4f}'
    print(log.format(epoch, train_loss, best_val_perf, test_perf))

Epoch: 001, Loss: 0.4684, Val: 0.9271, Test: 0.9289
Epoch: 002, Loss: 0.4653, Val: 0.9287, Test: 0.9304
Epoch: 003, Loss: 0.4658, Val: 0.9303, Test: 0.9320
Epoch: 004, Loss: 0.4649, Val: 0.9313, Test: 0.9330
Epoch: 005, Loss: 0.4625, Val: 0.9320, Test: 0.9336
Epoch: 006, Loss: 0.4617, Val: 0.9331, Test: 0.9345
Epoch: 007, Loss: 0.4624, Val: 0.9345, Test: 0.9357
Epoch: 008, Loss: 0.4625, Val: 0.9356, Test: 0.9369
Epoch: 009, Loss: 0.4590, Val: 0.9361, Test: 0.9375
Epoch: 010, Loss: 0.4585, Val: 0.9367, Test: 0.9382
Epoch: 011, Loss: 0.4576, Val: 0.9372, Test: 0.9389
Epoch: 012, Loss: 0.4578, Val: 0.9375, Test: 0.9395
Epoch: 013, Loss: 0.4569, Val: 0.9380, Test: 0.9403
Epoch: 014, Loss: 0.4563, Val: 0.9392, Test: 0.9417
Epoch: 015, Loss: 0.4550, Val: 0.9399, Test: 0.9426
Epoch: 016, Loss: 0.4534, Val: 0.9402, Test: 0.9431
Epoch: 017, Loss: 0.4530, Val: 0.9410, Test: 0.9439
Epoch: 018, Loss: 0.4513, Val: 0.9425, Test: 0.9454
Epoch: 019, Loss: 0.4514, Val: 0.9425, Test: 0.9454
Epoch: 020, 

In [127]:
z = model.encode()

In [137]:
test_data = pd.read_csv("datasets/Test Dataset/test_edges.csv")
test_data = test_data.values.tolist()
test_data_nodes_score = model.edgepred(z, test_data)

In [148]:
test_data_score = [float(test_data_nodes_score[i][2]) for i in range(len(test_data))]
test_data_score[:5]

[2.958502769470215,
 -0.3188187777996063,
 -0.2500545382499695,
 2.3628921508789062,
 0.08081190288066864]

In [149]:
test_prediction = [1 if score >0 else 0 for score in test_data_score]
test_prediction[:5]

[1, 0, 0, 1, 1]

In [151]:
recommend_node, node_score = model.pred_one_edge(z, 1, top_k = 3)
recommend_node

array([14497, 16895], dtype=int64)

In [152]:
node_score

array([8.49590302, 8.32092667])