In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import time

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pykeen
import torch
from pykeen.pipeline import pipeline

In [2]:
# Name of the benchmark dataset; handed to pykeen.datasets.get_dataset in the next cell.
dataset = 'FB15k-237'

In [3]:
# Resolve the dataset by name, then keep the integer-mapped triples of the
# training and testing splits for the cells below.
ds = pykeen.datasets.get_dataset(dataset=dataset)
training, testing = ds.training.mapped_triples, ds.testing.mapped_triples

# Cell output: the relation id -> relation label mapping of the training split.
ds.training.relation_id_to_label

You're trying to map triples with 30 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 28 from 20466 triples were filtered out


{0: '/american_football/football_team/current_roster./sports/sports_team_roster/position',
 1: '/award/award_category/category_of',
 2: '/award/award_category/disciplines_or_subjects',
 3: '/award/award_category/nominees./award/award_nomination/nominated_for',
 4: '/award/award_category/winners./award/award_honor/award_winner',
 5: '/award/award_category/winners./award/award_honor/ceremony',
 6: '/award/award_ceremony/awards_presented./award/award_honor/award_winner',
 7: '/award/award_ceremony/awards_presented./award/award_honor/honored_for',
 8: '/award/award_nominated_work/award_nominations./award/award_nomination/nominated_for',
 9: '/award/award_nominee/award_nominations./award/award_nomination/award',
 10: '/award/award_nominee/award_nominations./award/award_nomination/award_nominee',
 11: '/award/award_nominee/award_nominations./award/award_nomination/nominated_for',
 12: '/award/award_winner/awards_won./award/award_honor/award_winner',
 13: '/award/award_winning_work/awards_won

In [4]:
# Cell output: the entity id -> entity label mapping of the training split
# (one entry per entity, so the printed dict is long).
ds.training.entity_id_to_label

{0: '/m/010016',
 1: '/m/0100mt',
 2: '/m/0102t4',
 3: '/m/0104lr',
 4: '/m/0105y2',
 5: '/m/0106dv',
 6: '/m/0108xl',
 7: '/m/0109vk',
 8: '/m/010bnr',
 9: '/m/010bxh',
 10: '/m/010cw1',
 11: '/m/010dft',
 12: '/m/010h9y',
 13: '/m/010hn',
 14: '/m/010m55',
 15: '/m/010nlt',
 16: '/m/010p3',
 17: '/m/010r6f',
 18: '/m/010rvx',
 19: '/m/010t4v',
 20: '/m/010tkc',
 21: '/m/010v8k',
 22: '/m/010xjr',
 23: '/m/010y34',
 24: '/m/010z5n',
 25: '/m/0113sg',
 26: '/m/0114m0',
 27: '/m/0118d3',
 28: '/m/011_3s',
 29: '/m/011_6p',
 30: '/m/011_vz',
 31: '/m/011hdn',
 32: '/m/011hq1',
 33: '/m/011j5x',
 34: '/m/011k11',
 35: '/m/011k1h',
 36: '/m/011k4g',
 37: '/m/011k_j',
 38: '/m/011kn2',
 39: '/m/011lpr',
 40: '/m/011lvx',
 41: '/m/011pcj',
 42: '/m/011s0',
 43: '/m/011s9r',
 44: '/m/011v3',
 45: '/m/011vx3',
 46: '/m/011w20',
 47: '/m/011w4n',
 48: '/m/011w54',
 49: '/m/011wdm',
 50: '/m/011wtv',
 51: '/m/011x_4',
 52: '/m/011xg5',
 53: '/m/011xhx',
 54: '/m/011xjd',
 55: '/m/011xy1',
 56: '

In [5]:
def create_multi_hop_dataset(training, test_size, path_length, generator=None):
    '''Sample random walks over a triple graph to build a multi-hop test set.

    Every walk starts at an entity drawn uniformly (with replacement) from the
    entities that occur in `training`, then takes `path_length` steps. At each
    step one triple touching the current entity (as head or as tail) is drawn
    uniformly and the walk moves to the entity on the other side of it.

    Parameters
    ----------
    training : torch.Tensor
        Integer tensor of shape (d, 3): column 0 is the head entity id,
        column 1 the relation id, column 2 the tail entity id (the layout of
        pykeen's `mapped_triples`).
    test_size : int
        Number of walks to sample (0 yields empty outputs).
    path_length : int
        Number of edges per walk; must be at least 1.
    generator : torch.Generator, optional
        Source of randomness for both the start-entity draw and every step
        draw. Pass a seeded CPU generator to make the sampled dataset
        reproducible; `None` (the default) uses torch's global RNG, which is
        the original behaviour.

    Returns
    -------
    start_end_ents : torch.Tensor, int32, shape (test_size, 2)
        First and last entity id of each walk.
    interior_ents : torch.Tensor, int32, shape (test_size, path_length - 1)
        Entities passed through between the first and the last one.
    comp_relations : torch.Tensor, int32, shape (test_size, path_length)
        Relation id of the triple crossed at each step.
    inv_relations : torch.Tensor, int32, shape (test_size, path_length)
        +1 where the triple was crossed head -> tail, -1 where it was crossed
        tail -> head.

    Raises
    ------
    ValueError
        If `training` is not a non-empty (d, 3) tensor, if `path_length < 1`,
        or if `test_size < 0`.

    Notes
    -----
    Each step scans all of `training` twice, so the cost is
    O(test_size * path_length * d). A precomputed adjacency structure would
    be faster, but this is adequate for small test sets.

    A self-loop triple (head == tail) matches both the head scan and the tail
    scan, so it is twice as likely to be drawn as any other incident triple,
    and it is always recorded as a forward (+1) step.
    '''
    # Fail early with a readable message instead of torch's negative-dimension
    # or "from >= to" errors further down.
    if training.dim() != 2 or training.shape[1] != 3 or training.shape[0] == 0:
        raise ValueError(
            'training must be a non-empty tensor of shape (d, 3), got shape {}'.format(
                tuple(training.shape)))
    if path_length < 1:
        raise ValueError('path_length must be >= 1, got {}'.format(path_length))
    if test_size < 0:
        raise ValueError('test_size must be >= 0, got {}'.format(test_size))

    # torch.unique flattens its input and returns sorted values, so this is
    # the sorted set of every entity seen as a head or as a tail.
    unq_ents = torch.unique(training[:, [0, 2]])
    random_start_idxs = torch.randint(unq_ents.shape[0], (test_size,), generator=generator)

    start_end_ents = torch.zeros((test_size, 2), dtype=torch.int)  # starting and ending entity of each walk
    interior_ents = torch.zeros((test_size, path_length - 1), dtype=torch.int)  # pass-through entities along each walk
    comp_relations = torch.zeros((test_size, path_length), dtype=torch.int)  # relations crossed along each walk
    inv_relations = torch.zeros((test_size, path_length), dtype=torch.int)  # +1 forward / -1 inverse per step
    for pidx in range(test_size):
        ent = unq_ents[random_start_idxs[pidx]]
        start_end_ents[pidx, 0] = ent
        for step in range(path_length):
            # All triples incident to the current entity. Never empty: `ent`
            # always comes out of `training` itself.
            head_instances = training[training[:, 0] == ent]
            tail_instances = training[training[:, 2] == ent]
            instances = torch.cat([head_instances, tail_instances])
            random_step_idx = torch.randint(instances.shape[0], (1,), generator=generator)
            step_edge = instances[random_step_idx[0]]
            if step_edge[0] == ent:
                # forward relation: we sit on the head, move to the tail
                inv_relations[pidx, step] = 1
                ent = step_edge[2]
            else:
                # inverse relation: we sit on the tail, move to the head
                inv_relations[pidx, step] = -1
                ent = step_edge[0]
            # The entity reached by the final step is the walk's end point,
            # not an interior node.
            if step < path_length - 1:
                interior_ents[pidx, step] = ent
            comp_relations[pidx, step] = step_edge[1]
        start_end_ents[pidx, 1] = ent
    return start_end_ents, interior_ents, comp_relations, inv_relations

In [6]:
# create_multi_hop_dataset returns four tensors (start/end entities, interior
# entities, crossed relations, direction flags); unpacking only three raises
# "ValueError: too many values to unpack" on a fresh kernel.
start_end_ents, interior_ents, comp_relations, inv_relations = create_multi_hop_dataset(training, 1000, 4)

In [7]:
# Cell output: one row per sampled walk, holding its starting and ending entity id.
start_end_ents

tensor([[ 7426, 12094],
        [11854,  7079],
        [13464,  9447],
        ...,
        [ 3785, 10517],
        [ 8116,  7689],
        [ 8309,  4148]], dtype=torch.int32)

In [8]:
# Cell output: one row per sampled walk, holding the relation id crossed at each step.
comp_relations

tensor([[200, 209, 199, 200],
        [199, 209, 201, 201],
        [ 17,  17,  47,  47],
        ...,
        [ 89,  89,  68,  68],
        [161, 234, 166,  73],
        [119, 188, 195, 189]], dtype=torch.int32)

In [9]:
# Cell output: one row per sampled walk; 1 marks a step taken head -> tail
# (forward relation), -1 a step taken tail -> head (inverse relation).
inv_relations

tensor([[ 1,  1,  1, -1],
        [ 1,  1, -1,  1],
        [ 1, -1, -1,  1],
        ...,
        [ 1, -1, -1,  1],
        [-1,  1,  1, -1],
        [ 1, -1,  1,  1]], dtype=torch.int32)