In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import time
import pickle
from tqdm import tqdm
# for some reason, need to go to the sheaf_kg directory in order for torch.load to work
os.chdir('/home/gebhart/projects/sheaf_kg/sheaf_kg')

import sheaf_kg.tensor_harmonic_extension as harmonic_extension
from sheaf_kg.sheafE_models import SheafE_Multisection, SheafE_Diag

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pykeen
import torch
from pykeen.pipeline import pipeline

In [2]:
# Fix the global PyTorch and NumPy random seeds for this notebook run.
torch.manual_seed(0)
np.random.seed(0)

In [3]:
# --- Experiment configuration ---------------------------------------------
dataset = 'FB15k-237'
# Maximum number of queries evaluated per query structure.
num_test = 200
# Section index; only referenced by the commented-out single-section variant
# of `target_embeddings` in the evaluation cell.
use_section = 0
# Which BetaE query split to load: 'test' or 'train'.
train_test_queries = 'test'
model_name = 'SheafE_Multisection_64embdim_64esdim_64sec_2norm_1000epochs_SoftplusLossloss_20210301-2201'
save_loc = '/home/gebhart/projects/sheaf_kg/data/{}/{}/trained_model.pkl'.format(dataset,model_name)
betae_path = '/home/gebhart/projects/sheaf_kg/data/{}-betae'.format(dataset)
# map_location='cpu' remaps storages while unpickling, so a checkpoint that was
# saved from a GPU can still be loaded on a machine without CUDA. The trailing
# .to('cpu') is kept so the resulting object is identical to before.
model = torch.load(save_loc, map_location='cpu').to('cpu')

In [4]:
# Query structures that are evaluated, in evaluation order.
# 'e' marks an anchor entity slot, 'r' a relation slot.
query_structures = [
    ('e', ('r', 'r')),                                  # 2p
    ('e', ('r', 'r', 'r')),                             # 3p
    (('e', ('r',)), ('e', ('r',))),                     # 2i
    (('e', ('r',)), ('e', ('r',)), ('e', ('r',))),      # 3i
    (('e', ('r', 'r')), ('e', ('r',))),                 # pi
    ((('e', ('r',)), ('e', ('r',))), ('r',)),           # ip
]

# Lookup from a query structure to its short name. Written as
# (name, structure) pairs so the names line up in a single column.
query_name_dict = {
    structure: name
    for name, structure in [
        ('1p', ('e', ('r',))),
        ('2p', ('e', ('r', 'r'))),
        ('3p', ('e', ('r', 'r', 'r'))),
        ('2i', (('e', ('r',)), ('e', ('r',)))),
        ('3i', (('e', ('r',)), ('e', ('r',)), ('e', ('r',)))),
        ('ip', ((('e', ('r',)), ('e', ('r',))), ('r',))),
        ('pi', (('e', ('r', 'r')), ('e', ('r',)))),
        ('2in', (('e', ('r',)), ('e', ('r', 'n')))),
        ('3in', (('e', ('r',)), ('e', ('r',)), ('e', ('r', 'n')))),
        ('inp', ((('e', ('r',)), ('e', ('r', 'n'))), ('r',))),
        ('pin', (('e', ('r', 'r')), ('e', ('r', 'n')))),
        ('pni', (('e', ('r', 'r', 'n')), ('e', ('r',)))),
        ('2u-DNF', (('e', ('r',)), ('e', ('r',)), ('u',))),
        ('up-DNF', ((('e', ('r',)), ('e', ('r',)), ('u',)), ('r',))),
        ('2u-DM', ((('e', ('r', 'n')), ('e', ('r', 'n'))), ('n',))),
        ('up-DM', ((('e', ('r', 'n')), ('e', ('r', 'n'))), ('n', 'r'))),
    ]
}

In [5]:
# Fetch the PyKEEN dataset with inverse triples enabled
# (plain alternative: pykeen.datasets.get_dataset(dataset=dataset)).
ds = pykeen.datasets.get_dataset(
    dataset=dataset,
    dataset_kwargs={'create_inverse_triples': True},
)
training = ds.training.mapped_triples

# id -> label tables taken from the training triples factory ...
relid2label = ds.training.relation_id_to_label
entid2label = ds.training.entity_id_to_label

# ... and the reverse label -> id lookups.
label2relid = {label: rel_id for rel_id, label in relid2label.items()}
label2entid = {label: ent_id for ent_id, label in entid2label.items()}

You're trying to map triples with 30 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 28 from 20466 triples were filtered out


In [6]:
# Load the BetaE query/answer files and the BetaE id -> name tables.
# NOTE(security): pickle.load executes arbitrary code embedded in the file;
# only point betae_path at data from a trusted source.

# Resolve the answers file up front so an unsupported split fails here with a
# clear message instead of leaving `test_answers` undefined for later cells.
if train_test_queries == 'test':
    answers_filename = f'{train_test_queries}-easy-answers.pkl'
elif train_test_queries == 'train':
    answers_filename = f'{train_test_queries}-answers.pkl'
else:
    raise ValueError(
        "train_test_queries must be 'test' or 'train', got {!r}".format(train_test_queries)
    )

with open(os.path.join(betae_path,f'{train_test_queries}-queries.pkl'), 'rb') as f:
    test_queries = pickle.load(f)

with open(os.path.join(betae_path, answers_filename), 'rb') as f:
    test_answers = pickle.load(f)

with open(os.path.join(betae_path,'id2rel.pkl'), 'rb') as f:
    id2rel = pickle.load(f)

with open(os.path.join(betae_path,'id2ent.pkl'), 'rb') as f:
    id2ent = pickle.load(f)

In [7]:
def map_ent(e):
    '''Translate a BetaE entity id into the PyKEEN training entity id.

    The id is resolved to its label through `id2ent`, and that label is then
    looked up in `label2entid`.
    '''
    entity_label = id2ent[e]
    return label2entid[entity_label]
def map_rel(r):
    '''Translate a BetaE relation id into (PyKEEN relation id, orientation).

    The BetaE relation name carries a one-character prefix: a leading '-'
    yields orientation -1, anything else yields +1. The prefix character is
    dropped before the name is looked up in `label2relid`.
    '''
    prefixed_name = id2rel[r]
    direction = -1 if prefixed_name[0] == '-' else 1
    return label2relid[prefixed_name[1:]], direction

In [8]:
def L_p(query, model):
    '''Build the reduced Laplacian for a path query.

    query has the form ('e', ('r', 'r', ... , 'r')): an anchor entity id
    followed by a tuple of relation ids, both in the BetaE id space.
    Here we assume 2 or more relations are present, so 2p or greater.

    The query is laid out as a chain 0 -> 1 -> ... -> n with one edge per
    relation; vertex 0 is the anchor and vertex n is the answer vertex.

    Returns a tuple (LSchur, source_vertices, target_vertices, source_embeddings):
      LSchur            -- result of harmonic_extension.Kron_reduction with
                           boundary B = [0, n] and interior U = [1, ..., n-1].
      source_vertices   -- np.array([0])
      target_vertices   -- np.array([1])
                           (both presumably index positions within B -- TODO confirm
                           against Kron_reduction)
      source_embeddings -- the anchor's row of model.ent_embeddings viewed as
                           (-1, model.embedding_dim, model.num_sections).
    '''
    ent = map_ent(query[0])
    invs = []
    rels = []
    for r in query[1]:
        mapped_id, orientation = map_rel(r)
        rels.append(mapped_id)
        invs.append(orientation)
    n_path_ents = len(rels)

    # Row 0 holds the first endpoint of every edge, row 1 the second: edge k is (k, k+1).
    edge_indices = torch.stack([torch.arange(0, n_path_ents, dtype=torch.long),
                                torch.arange(1, n_path_ents + 1, dtype=torch.long)])

    rel_idx_tensor = torch.LongTensor(rels)
    left_restrictions = torch.index_select(model.left_embeddings, 0, rel_idx_tensor)
    right_restrictions = torch.index_select(model.right_embeddings, 0, rel_idx_tensor)

    restrictions = torch.empty((len(rels), 2, left_restrictions.shape[1], left_restrictions.shape[2]))

    # Slot 0 / slot 1 pair with the edge's first / second endpoint. An inverse
    # relation (orientation -1) swaps which of left/right goes into which slot.
    for invix in range(len(invs)):
        if invs[invix] == -1:
            restrictions[invix,0,:,:] = right_restrictions[invix]
            restrictions[invix,1,:,:] = left_restrictions[invix]
        else:
            restrictions[invix,0,:,:] = left_restrictions[invix]
            restrictions[invix,1,:,:] = right_restrictions[invix]

    ent_idx_tensor = torch.LongTensor([ent])
    source_embeddings = torch.index_select(model.ent_embeddings, 0, ent_idx_tensor).view(-1, model.embedding_dim, model.num_sections)

    # Index tensors are built directly in torch: the former np.int alias was
    # removed in NumPy 1.24 and raises AttributeError there.
    B = torch.tensor([0, n_path_ents], dtype=torch.long)
    U = torch.arange(1, n_path_ents, dtype=torch.long)
    source_vertices = np.array([0])
    target_vertices = np.array([1])
    LSchur = harmonic_extension.Kron_reduction(edge_indices, restrictions, B, U)
    return LSchur, source_vertices, target_vertices, source_embeddings

def L_i(query, model):
    '''Build the Laplacian for an intersection query.

    query has the form (('e', ('r',)), ('e', ('r',)), ... , ('e', ('r',))):
    one (anchor entity, single relation) pair per branch, in BetaE ids.

    Vertices 0..n-1 are the anchors and vertex n is the shared answer vertex;
    every edge is stored as (n, k), i.e. answer vertex first.

    Returns (L, source_vertices, target_vertices, source_embeddings) where L
    is harmonic_extension.Laplacian(edge_indices, restrictions), the sources
    are np.arange(n) and the single target is np.array([n]).
    '''
    ents, rels, invs = [], [], []
    for branch in query:
        ents.append(map_ent(branch[0]))
        rel_id, orientation = map_rel(branch[1][0])
        rels.append(rel_id)
        invs.append(orientation)
    n_ents = len(ents)

    # 2 x n index array: first row is the answer vertex, second row the anchors.
    edge_indices = torch.LongTensor(np.vstack([np.full(n_ents, n_ents), np.arange(n_ents)]))

    rel_idx_tensor = torch.LongTensor(rels)
    left_restrictions = torch.index_select(model.left_embeddings, 0, rel_idx_tensor)
    right_restrictions = torch.index_select(model.right_embeddings, 0, rel_idx_tensor)

    n_rows, n_cols = left_restrictions.shape[1], left_restrictions.shape[2]
    restrictions = torch.empty((len(rels), 2, n_rows, n_cols))

    # Edges run answer -> anchor here, so the swap happens for orientation +1
    # (the opposite test from the path-style builders).
    for k, orientation in enumerate(invs):
        if orientation == 1:
            first, second = right_restrictions[k], left_restrictions[k]
        else:
            first, second = left_restrictions[k], right_restrictions[k]
        restrictions[k, 0] = first
        restrictions[k, 1] = second

    ent_idx_tensor = torch.LongTensor(ents)
    anchor_rows = torch.index_select(model.ent_embeddings, 0, ent_idx_tensor)
    source_embeddings = anchor_rows.view(-1, model.embedding_dim, model.num_sections)

    L = harmonic_extension.Laplacian(edge_indices, restrictions)
    source_vertices = np.arange(n_ents)
    target_vertices = np.array([n_ents])
    return L, source_vertices, target_vertices, source_embeddings

def L_ip(query, model):
    '''Build the reduced Laplacian for an intersect-then-project query.

    query has the form ((('e', ('r',)), ('e', ('r',))), ('r',)): two
    (anchor entity, relation) branches followed by one projection relation,
    all in BetaE ids.

    Graph layout: anchors are vertices 0 and 1, both joined to the
    intermediate vertex 2, which is joined to the answer vertex 3
    (edges 0->2, 1->2, 2->3).

    Returns (LSchur, source_vertices, target_vertices, source_embeddings):
      LSchur            -- harmonic_extension.Kron_reduction over boundary
                           B = [0, 1, 3] with interior U = [2].
      source_vertices   -- np.array([0, 1])
      target_vertices   -- np.array([2])
                           (both presumably positions within B -- TODO confirm
                           against Kron_reduction)
      source_embeddings -- the two anchors' rows of model.ent_embeddings viewed
                           as (-1, model.embedding_dim, model.num_sections).
    '''
    ents = [map_ent(t[0]) for t in query[0]]
    rel0, inv0 = map_rel(query[0][0][1][0])
    rel1, inv1 = map_rel(query[0][1][1][0])
    rel2, inv2 = map_rel(query[1][0])
    rels = [rel0, rel1, rel2]
    invs = [inv0, inv1, inv2]
    # Edges (0,2), (1,2), (2,3), stored column-wise: row 0 = first endpoints,
    # row 1 = second endpoints.
    edge_indices = torch.tensor([[0, 1, 2],
                                 [2, 2, 3]], dtype=torch.long)

    rel_idx_tensor = torch.LongTensor(rels)
    left_restrictions = torch.index_select(model.left_embeddings, 0, rel_idx_tensor)
    right_restrictions = torch.index_select(model.right_embeddings, 0, rel_idx_tensor)

    restrictions = torch.empty((len(rels), 2, left_restrictions.shape[1], left_restrictions.shape[2]))

    # An inverse relation (orientation -1) swaps the left/right maps on its edge.
    for invix in range(len(invs)):
        if invs[invix] == -1:
            restrictions[invix,0,:,:] = right_restrictions[invix]
            restrictions[invix,1,:,:] = left_restrictions[invix]
        else:
            restrictions[invix,0,:,:] = left_restrictions[invix]
            restrictions[invix,1,:,:] = right_restrictions[invix]

    ent_idx_tensor = torch.LongTensor(ents)
    source_embeddings = torch.index_select(model.ent_embeddings, 0, ent_idx_tensor).view(-1, model.embedding_dim, model.num_sections)

    # Keep both anchors (0, 1) and the answer vertex (3); eliminate only the
    # intermediate vertex 2. The previous B=[0,2,3] / U=[1] dropped anchor 1 and
    # kept the intermediate vertex, which does not match source_vertices /
    # target_vertices below.
    # Built directly in torch: the np.int alias was removed in NumPy 1.24.
    B = torch.tensor([0, 1, 3], dtype=torch.long)
    U = torch.tensor([2], dtype=torch.long)
    source_vertices = np.array([0,1])
    target_vertices = np.array([2])
    LSchur = harmonic_extension.Kron_reduction(edge_indices, restrictions, B, U)
    return LSchur, source_vertices, target_vertices, source_embeddings
    
def L_pi(query, model):
    '''Build the reduced Laplacian for a project-then-intersect query.

    query has the form (('e', ('r', 'r')), ('e', ('r',))): a two-hop branch
    and a one-hop branch, all in BetaE ids.

    Graph layout: anchors are vertices 0 and 1; the two-hop branch runs
    0 -> 2 -> 3 through the intermediate vertex 2, and the one-hop branch runs
    1 -> 3, where 3 is the answer vertex.

    Returns (LSchur, source_vertices, target_vertices, source_embeddings):
      LSchur            -- harmonic_extension.Kron_reduction over boundary
                           B = [0, 1, 3] with interior U = [2].
      source_vertices   -- np.array([0, 1])
      target_vertices   -- np.array([2])
                           (both presumably positions within B -- TODO confirm
                           against Kron_reduction)
      source_embeddings -- the two anchors' rows of model.ent_embeddings viewed
                           as (-1, model.embedding_dim, model.num_sections).
    '''
    ents = [map_ent(t[0]) for t in query]
    rel0, inv0 = map_rel(query[0][1][0])
    rel1, inv1 = map_rel(query[0][1][1])
    rel2, inv2 = map_rel(query[1][1][0])
    rels = [rel0, rel1, rel2]
    invs = [inv0, inv1, inv2]
    # Edges (0,2), (2,3), (1,3), stored column-wise: row 0 = first endpoints,
    # row 1 = second endpoints.
    edge_indices = torch.tensor([[0, 2, 1],
                                 [2, 3, 3]], dtype=torch.long)

    rel_idx_tensor = torch.LongTensor(rels)
    left_restrictions = torch.index_select(model.left_embeddings, 0, rel_idx_tensor)
    right_restrictions = torch.index_select(model.right_embeddings, 0, rel_idx_tensor)

    restrictions = torch.empty((len(rels), 2, left_restrictions.shape[1], left_restrictions.shape[2]))

    # An inverse relation (orientation -1) swaps the left/right maps on its edge.
    for invix in range(len(invs)):
        if invs[invix] == -1:
            restrictions[invix,0,:,:] = right_restrictions[invix]
            restrictions[invix,1,:,:] = left_restrictions[invix]
        else:
            restrictions[invix,0,:,:] = left_restrictions[invix]
            restrictions[invix,1,:,:] = right_restrictions[invix]

    ent_idx_tensor = torch.LongTensor(ents)
    source_embeddings = torch.index_select(model.ent_embeddings, 0, ent_idx_tensor).view(-1, model.embedding_dim, model.num_sections)

    # Built directly in torch: the np.int alias was removed in NumPy 1.24.
    B = torch.tensor([0, 1, 3], dtype=torch.long)
    U = torch.tensor([2], dtype=torch.long)
    source_vertices = np.array([0,1])
    target_vertices = np.array([2])
    LSchur = harmonic_extension.Kron_reduction(edge_indices, restrictions, B, U)
    return LSchur, source_vertices, target_vertices, source_embeddings

# Dispatch table: short query name -> function that builds the Laplacian for that
# query shape. L_p serves both path lengths and L_i both intersection sizes.
query_name_fn_dict = {'2p': L_p, '3p':L_p, '2i': L_i, '3i':L_i, 'ip':L_ip, 'pi': L_pi}

In [9]:
%%time
allhits1 = []
allhits3 = []
allhits5 = []
allhits10 = []
allmrr = []
query_names = []
# target_embeddings = model.ent_embeddings.view(-1, model.embedding_dim, model.num_sections)[:,:,use_section].T
target_embeddings = torch.mean(model.ent_embeddings.view(-1, model.embedding_dim, model.num_sections), 2).T
for query_structure in query_structures:
    print('Running query : {}'.format(query_structure))
    query_name = query_name_dict[query_structure]
    query_names.append(query_name)
    fn = query_name_fn_dict[query_name]
    hits1 = 0.
    hits3 = 0.
    hits5 = 0.
    hits10 = 0.
    mrr = 0.
    cnt = 0
    # the len() > 0 part is to determine whether we have an "easy" query
    queries = [q for q in test_queries[query_structure] if len(test_answers[q]) > 0] 
    for query in tqdm(queries[:num_test]):
        # we have a non-trivial "easy" query
        answers = [map_ent(a) for a in test_answers[query]]
        L, source_vertices, target_vertices, source_embeddings = fn(query, model)
        Q = harmonic_extension.compute_costs(L,source_vertices,target_vertices,torch.mean(source_embeddings, 2).flatten(),target_embeddings,source_embeddings.shape[1])
        sortd,_ = torch.sort(Q)
        idxleft = torch.searchsorted(sortd, Q[answers], right=False) + 1
        idxright = torch.searchsorted(sortd, Q[answers], right=True) + 1
        nl = idxleft.shape[0]
        nr = idxright.shape[0]
        idxright = idxleft # throw this for optimistic ranking
        hits1 += ((torch.sum(idxleft <= 1)/nl + torch.sum(idxright <= 1)/nr) / 2.)
        hits3 += ((torch.sum(idxleft <= 3)/nl + torch.sum(idxright <= 3)/nr) / 2.)
        hits5 += ((torch.sum(idxleft <= 5)/nl + torch.sum(idxright <= 5)/nr) / 2.)
        hits10 += ((torch.sum(idxleft <= 10)/nl + torch.sum(idxright <= 10)/nr) / 2.)
        mrr += ((torch.sum(1./idxleft)/nl + torch.sum(1./idxright)/nr) / 2.)
        cnt += 1
    if cnt > 0:
        allhits1.append(hits1/cnt)
        allhits3.append(hits3/cnt)
        allhits5.append(hits5/cnt)
        allhits10.append(hits10/cnt)
        allmrr.append(mrr/cnt)
    else:
        default = 0.
        allhits1.append(default)
        allhits3.append(default)
        allhits5.append(default)
        allhits10.append(default)
        allmrr.append(default)


 10%|▉         | 19/200 [00:00<00:00, 185.25it/s]

Running query : ('e', ('r', 'r'))


100%|██████████| 200/200 [00:01<00:00, 185.86it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

Running query : ('e', ('r', 'r', 'r'))


100%|██████████| 200/200 [00:01<00:00, 162.74it/s]
 12%|█▏        | 24/200 [00:00<00:00, 234.50it/s]

Running query : (('e', ('r',)), ('e', ('r',)))


100%|██████████| 200/200 [00:00<00:00, 233.63it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

Running query : (('e', ('r',)), ('e', ('r',)), ('e', ('r',)))


100%|██████████| 200/200 [00:00<00:00, 230.14it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]

Running query : (('e', ('r', 'r')), ('e', ('r',)))
Running query : ((('e', ('r',)), ('e', ('r',))), ('r',))
CPU times: user 16.6 s, sys: 43.5 ms, total: 16.7 s
Wall time: 4.45 s





In [10]:
# Assemble the per-structure metrics into a table: the five result lists are
# stacked as rows and transposed, giving one row per query name and one column
# per metric.
cols = ['hits@1', 'hits@3', 'hits@5', 'hits@10', 'mrr']
df = pd.DataFrame(np.array([allhits1, allhits3, allhits5, allhits10, allmrr]).T, columns=cols, index=query_names) 

In [11]:
# Show which checkpoint the numbers belong to, then display the metrics table
# scaled from fractions to percentages.
print(model_name)
df * 100 # for percents...

SheafE_Multisection_64embdim_64esdim_64sec_2norm_1000epochs_SoftplusLossloss_20210301-2201


Unnamed: 0,hits@1,hits@3,hits@5,hits@10,mrr
2p,5.942549,8.798885,11.330973,12.967964,8.413564
3p,7.612195,10.887084,13.127328,14.633913,10.059942
2i,1.320742,1.849611,3.009926,5.222957,2.709301
3i,4.014133,5.515484,6.016835,7.613242,5.572514
pi,0.0,0.0,0.0,0.0,0.0
ip,0.0,0.0,0.0,0.0,0.0
