In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import time
# NOTE(review): hard-coded, user-specific absolute path. Presumably the working
# directory is switched to the project root so that the local `sheaf_kg`
# package imported on the next line resolves -- confirm, and consider reading
# the project root from a configurable variable instead.
os.chdir('/home/gebhart/projects/sheaf_kg')
import sheaf_kg.harmonic_extension as harmonic_extension
# from sheaf_kg.train_sheafE_nonconstant_diag import ModifiedSE

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pykeen
import torch
from pykeen.pipeline import pipeline

In [2]:
# Experiment configuration: benchmark name, number of random walks to
# evaluate, number of hops per walk, and the training run whose model is loaded.
dataset = 'FB15k'
num_test = 1000
path_len = 2
model_name = 'StructuredEmbedding_1000epochs_64dim_SoftplusLossloss_42seed_20210128-1346'
save_loc = '/home/gebhart/projects/sheaf_kg/data/{}/{}/trained_model.pkl'.format(dataset,model_name)
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints that come from a trusted source.
# map_location='cpu' remaps tensors that were saved from a GPU, so the load
# also succeeds on a machine without CUDA (the trailing .to('cpu') alone runs
# too late: deserialisation has already failed by then).
model = torch.load(save_loc, map_location='cpu').to('cpu')

In [3]:
# Resolve the benchmark named in `dataset` through pykeen and keep handles on
# the id-mapped triples of its training and testing splits.
ds = pykeen.datasets.get_dataset(dataset=dataset)
training, testing = ds.training.mapped_triples, ds.testing.mapped_triples

In [4]:
def create_multi_hop_dataset(training, test_size, path_length, generator=None):
    ''' Function to random walk on knowledge graph triplets to generate
    multi-hop testing dataset. This function assumes `training` is of the 
    form provided by pykeen. That is, a tensor of size (d x 3) where d 
    is the number of triplets, column 1 is the head entity id, column 2 
    is the relation id, and column 3 is the tail entity id. 

    Each walk starts at an entity drawn at random from the entities that
    occur in `training` (as head or tail). At every step one triplet is 
    drawn at random from all triplets that have the current entity as head 
    or as tail, and the walk moves to the entity on its other side.

    Parameters
    ----------
    training : tensor (d x 3) of triplets, laid out as described above.
    test_size : number of random walks to generate.
    path_length : number of steps (triplets crossed) per walk.
    generator : optional `torch.Generator` used for every random draw. 
        Pass a seeded generator to make the walks reproducible and 
        independent of torch's global random state. With the default 
        `None` the global random state is used.

    Returns
    -------
    path_ents : int64 tensor (test_size x path_length). Entry [p, s] is 
        the entity walk p stands on after step s.
    comp_relations : int64 tensor (test_size x path_length). Entry [p, s] 
        is the relation id of the triplet crossed at step s.
    inv_relations : int64 tensor (test_size x path_length). Entry [p, s] 
        is 1 when that triplet was crossed head -> tail and -1 when it was 
        crossed tail -> head.

    NOTE(review): the entity a walk starts from is NOT returned. Column 0 
    of `path_ents` is the entity reached after the first step, so a caller 
    that needs the origin of the first crossed relation does not get it. 
    Returning it would require a (test_size x path_length+1) tensor, which 
    changes the shape callers rely on -- confirm what is intended.

    There is likely a much faster way to generate this dataset with better 
    preprocessing and a better representation for the triplet graph. But 
    this should do for now.
    '''
    # every entity that occurs as a head or as a tail is a candidate start node
    unq_ents = torch.unique(torch.cat([training[:,0], training[:,2]]))
    random_start_idxs = torch.randint(unq_ents.shape[0], (test_size,), generator=generator)

    path_ents = torch.zeros((test_size,path_length), dtype=torch.int64) # track nodes along random walk
    comp_relations = torch.zeros((test_size,path_length), dtype=torch.int64) # track the relations crossed along each walk
    inv_relations = torch.zeros((test_size,path_length), dtype=torch.int64) # track whether the crossed relations are inverted
    for pidx in range(test_size):
        ent = unq_ents[random_start_idxs[pidx]]
        for step in range(path_length):
            # all triplets touching the current entity: as head first, then as tail.
            # Never empty, because `ent` always comes from a triplet in `training`.
            head_instances = training[training[:,0] == ent]
            tail_instances = training[training[:,2] == ent]
            instances = torch.cat([head_instances, tail_instances])
            random_step_idx = torch.randint(instances.shape[0], (1,), generator=generator)
            step_edge = instances[random_step_idx[0]]
            if step_edge[0] == ent:
                # forward relation
                inv_relations[pidx,step] = 1
                ent = step_edge[2]
            else:
                # inverse relation
                inv_relations[pidx,step] = -1
                ent = step_edge[0]
            path_ents[pidx,step] = ent
            comp_relations[pidx,step] = step_edge[1]
    return path_ents, comp_relations, inv_relations

In [5]:
# The walks are drawn from torch's global random state; seed it so that the
# generated test set, and every metric computed from it, is the same on each
# Restart & Run All.
torch.manual_seed(42)
path_ents, comp_relations, inv_relations = create_multi_hop_dataset(training, num_test, path_len)

In [6]:
# Sanity check: both tensors should be (num_test x path_len).
# NOTE(review): the stored output of this cell reports 5000 rows although
# num_test is set to 1000 in the configuration cell -- the saved outputs were
# presumably produced with a different num_test; re-run the notebook from a
# fresh kernel before trusting the reported numbers.
path_ents.shape, comp_relations.shape

(torch.Size([5000, 2]), torch.Size([5000, 2]))

In [7]:
# Display the loaded model's module summary (loss, regularizer and the
# entity / left-relation / right-relation embedding tables).
model

StructuredEmbedding(
  (loss): SoftplusLoss(
    (softplus): Softplus(beta=1, threshold=20)
  )
  (regularizer): NoRegularizer()
  (entity_embeddings): Embedding(
    (_embeddings): Embedding(14951, 64)
  )
  (left_relation_embeddings): Embedding(
    (_embeddings): Embedding(1345, 4096)
  )
  (right_relation_embeddings): Embedding(
    (_embeddings): Embedding(1345, 4096)
  )
)

In [8]:
# Hits@k evaluation of the multi-hop queries: for every random walk, score all
# entities as candidate end points with harmonic_extension and record whether
# the walk's true end entity is among the k lowest-cost candidates.
hits_at = [1,3,5,10]
# only the max(hits_at) best candidates are ever inspected, so the cutoff is
# derived from hits_at instead of being hard-coded
top_k = max(hits_at)
results = np.zeros((path_ents.shape[0],len(hits_at)))

# --- quantities that are identical for every walk, built once ---
num_path_ents = path_ents.shape[1]
# create edge indices as required by harmonic_extension.py, these are linear chains:
# row 0 holds vertices 0..n-1, row 1 holds vertices 1..n, so edge j joins vertex j and vertex j+1
edge_indices = np.stack([np.arange(0,num_path_ents), np.arange(1,num_path_ents+1)], axis=0)
source_vertices = np.arange(num_path_ents-1)
target_vertices = [num_path_ents]
# embeddings of all entities: every entity is a candidate answer
target_embeddings = model.entity_embeddings(indices=None).view(-1, model.embedding_dim).detach().numpy()

for i in range(path_ents.shape[0]):
    path_ent = path_ents[i,:]
    comp_rel = comp_relations[i,:]
    inverses = inv_relations[i,:]

    # NOTE(review): column 0 of `path_ents` holds the entity reached AFTER the
    # first hop, not the entity the walk started from (see
    # create_multi_hop_dataset), while comp_rel[0] is the relation crossed
    # during that first hop. Confirm that using path_ent[:-1] as the sources of
    # the chain is what is intended.
    source_ents = path_ent[:-1]
    source_embeddings = model.entity_embeddings(indices=source_ents).view(-1, model.embedding_dim).detach().numpy()

    # one (embedding_dim x embedding_dim) matrix per crossed relation and side
    left_restrictions = model.left_relation_embeddings(indices=comp_rel).view(-1, model.embedding_dim, model.embedding_dim).detach().numpy()
    right_restrictions = model.right_relation_embeddings(indices=comp_rel).view(-1, model.embedding_dim, model.embedding_dim).detach().numpy()

    # restrictions[j,0] / restrictions[j,1] are the maps attached to the two
    # ends of edge j; a relation crossed tail -> head swaps its left and right maps
    restrictions = np.empty((comp_rel.shape[0], 2, left_restrictions.shape[2], left_restrictions.shape[1]))
    for j in range(inverses.shape[0]):
        if inverses[j] == -1:
            restrictions[j,0] = right_restrictions[j]
            restrictions[j,1] = left_restrictions[j]
        else:
            restrictions[j,0] = left_restrictions[j]
            restrictions[j,1] = right_restrictions[j]

    L = harmonic_extension.Laplacian(edge_indices, restrictions)

    Q = harmonic_extension.compute_costs(L,source_vertices,target_vertices,source_embeddings.flatten(),target_embeddings.T,source_embeddings.shape[1])
    # indices of the top_k smallest costs (unordered), then ordered by cost
    ind = np.argpartition(Q, top_k)[:top_k]
    sorted_ind = ind[np.argsort(Q[ind])]
    for kix in range(len(hits_at)):
        if np.isin(path_ent[-1], sorted_ind[:hits_at[kix]]):
            results[i,kix] = 1.


In [9]:
# Print every Hits@k as the percentage of walks whose row in `results` is
# flagged for that k.
num_walks = results.shape[0]
for k, hit_flags in zip(hits_at, results.T):
    print('Hits@{}: {}%'.format(k, np.sum(hit_flags)/num_walks*100))

Hits@1: 3.6999999999999997%
Hits@3: 6.1%
Hits@5: 7.76%
Hits@10: 10.26%
