# look up initial neighbourhood


In [23]:
# load dataset
import json
conversations_path = './data/train_set/train_set_ALL.json'

# JSON text is UTF-8; pin the encoding so that loading the conversations does
# not depend on the platform's locale default encoding.
with open(conversations_path, "r", encoding="utf-8") as data:
    conversations = json.load(data)
print("%d conversations loaded"%len(conversations))

# load graph
from hdt import HDTDocument, TripleComponentRole
from settings import *

# File name of the HDT dump that backs the knowledge graph. It is appended to
# `hdt_path`, which is not defined in this cell (presumably provided by
# `from settings import *` -- verify).
hdt_file = 'wikidata2018_09_11.hdt'
# Plain string concatenation: `hdt_path` is expected to already end with a
# path separator.
kg = HDTDocument(hdt_path+hdt_file)
# Passed to configure_hops below.
# NOTE(review): its exact meaning is defined by the HDT binding -- confirm.
namespace = 'predef-wikidata2018-09-all'
# URI prefix that is prepended to entity ids before they are translated to
# global graph ids with kg.string_to_global_id.
PREFIX_E = 'http://www.wikidata.org/entity/'

# prepare to retrieve all adjacent nodes including literals
# An empty predicate list is passed here.
# NOTE(review): presumably 1 is the number of hops and the empty list means
# "no predicate filter"; the meaning of the two boolean flags is not visible
# in this notebook -- confirm against the binding's configure_hops signature.
predicates_ids = []
kg.configure_hops(1, predicates_ids, namespace, True, False)

6720 conversations loaded


In [37]:
# load all predicate labels
from predicates import properties

# Map every property id (the last path segment of the property URI, e.g. 'P50')
# to its human-readable label. If an id occurs more than once, the last
# binding wins.
relationid2label = {
    binding['property']['value'].split('/')[-1]: binding['propertyLabel']['value']
    for binding in properties['results']['bindings']
}

print(relationid2label)

{'P6': 'head of government', 'P10': 'video', 'P14': 'traffic sign', 'P15': 'route map', 'P16': 'highway system', 'P17': 'country', 'P18': 'image', 'P19': 'place of birth', 'P20': 'place of death', 'P21': 'sex or gender', 'P22': 'father', 'P25': 'mother', 'P26': 'spouse', 'P27': 'country of citizenship', 'P30': 'continent', 'P31': 'instance of', 'P35': 'head of state', 'P36': 'capital', 'P37': 'official language', 'P38': 'currency', 'P39': 'position held', 'P40': 'child', 'P41': 'flag image', 'P47': 'shares border with', 'P50': 'author', 'P51': 'audio', 'P53': 'family', 'P54': 'member of sports team', 'P57': 'director', 'P58': 'screenwriter', 'P59': 'constellation', 'P61': 'discoverer or inventor', 'P65': 'site of astronomical discovery', 'P66': 'ancestral home', 'P69': 'educated at', 'P78': 'top-level Internet domain', 'P81': 'connecting line', 'P84': 'architect', 'P85': 'anthem', 'P86': 'composer', 'P87': 'librettist', 'P88': 'commissioned by', 'P91': 'sexual orientation', 'P92': 'mai

In [73]:
from collections import Counter, defaultdict


def lookup_predicate_labels(predicate_ids):
    """Group predicate ids by a readable label.

    Each id is translated to its URI through the graph. The last path
    segment of the URI is looked up in ``relationid2label``; when it is not
    found there, the part after the last ``#`` of that segment is used as the
    label instead.

    Args:
        predicate_ids: iterable of global predicate ids of the graph.

    Returns:
        A ``defaultdict(list)`` mapping each label to the list of predicate
        ids that resolved to it, in input order.
    """
    ids_by_label = defaultdict(list)
    for pid in predicate_ids:
        uri = kg.global_id_to_string(pid, TripleComponentRole.PREDICATE)
        tail = uri.rsplit('/', 1)[-1]
        if tail in relationid2label:
            name = relationid2label[tail]
        else:
            # no known label: fall back to the fragment (or the whole segment)
            name = tail.rsplit('#', 1)[-1]
        ids_by_label[name].append(pid)
    return ids_by_label


rdfsLabelURI='http://www.w3.org/2000/01/rdf-schema#label'

def lookup_entity_labels(entity_ids):
    """Group entity ids by the text of their first rdfs:label.

    Each id is translated to its URI and the graph is searched for
    ``(uri, rdfs:label, ?)`` triples. Only the first returned label is used;
    ids without any label triple are skipped.

    The label is expected to be a quoted literal such as ``"1Q84"@en``. The
    text between the first and the last double quote is kept, so a label
    that itself contains double quotes is no longer cut at the first inner
    quote. A value that does not start with a double quote is kept as is.

    Args:
        entity_ids: iterable of global entity ids of the graph.

    Returns:
        A ``defaultdict(list)`` mapping each label text to the list of
        entity ids carrying it, in input order.
    """
    e_labels_map = defaultdict(list)
    for e_id in entity_ids:
        e_uri = kg.global_id_to_string(e_id, TripleComponentRole.OBJECT)
        (triples, cardinality) = kg.search_triples(e_uri, rdfsLabelURI, "")
        if cardinality > 0:
            label = triples.next()[2]
            # strip the surrounding quotes and whatever follows the closing
            # quote (language marker); use the LAST quote as the closing one
            # so that quotes inside the label text survive
            if label.startswith('"'):
                closing = label.rfind('"')
                label = label[1:closing] if closing > 0 else label[1:]
            e_labels_map[label].append(e_id)
    return e_labels_map



# running tally of how many entity answers were / were not found in the subgraph
answers_in_subgraph = Counter()

def check_answer_in_subgraph(conversation, subgraph):
    """Locate the answer of the first question among the subgraph entities.

    Only answers that are entities (the answer string contains
    'www.wikidata.org') are considered. For those, the module-level
    ``answers_in_subgraph`` counter is updated with whether the answer was
    found.

    Args:
        conversation: dict with a 'questions' list; the 'answer' of the first
            question is inspected.
        subgraph: list of global entity ids to search in.

    Returns:
        The position of the answer entity in ``subgraph``, or ``None`` when
        the answer is not an entity or is not contained in ``subgraph``.
        Position 0 is a valid result, so callers must compare against
        ``None`` rather than rely on truthiness.
    """
    answer1 = conversation['questions'][0]['answer']
    # consider only answers which are entities
    if ('www.wikidata.org' in answer1):
        answer1_id = kg.string_to_global_id(PREFIX_E+answer1.split('/')[-1], TripleComponentRole.OBJECT)
        # search the list that was passed in, not whatever global happens to
        # be called entity_ids at call time
        in_subgraph = answer1_id in subgraph
        answers_in_subgraph.update([in_subgraph])
        # consider only answer entities that are in the subgraph
        if in_subgraph:
            return subgraph.index(answer1_id)
    return None


# arguments forwarded to kg.compute_hops for every seed entity
max_triples = 50000000
offset = 0

# collect only samples where the answer is entity and it is adjacent to the seed entity
# each sample is [(question, predicate label map, seed index, subgraph), answer index]
dataset = []
for conversation in conversations:
    question1 = conversation['questions'][0]['question']
    # use oracle for the correct initial entity
    seed_entity = conversation['seed_entity'].split('/')[-1]
    seed_entity_id = kg.string_to_global_id(PREFIX_E+seed_entity, TripleComponentRole.OBJECT)

    # retrieve all adjacent nodes including literals
    subgraph1 = kg.compute_hops([seed_entity_id], max_triples, offset)
    entity_ids, predicate_ids, adjacencies = subgraph1

    # check that the answer is in the subgraph
    answer1_idx = check_answer_in_subgraph(conversation, entity_ids)
    # compare against None explicitly: index 0 is a valid answer position and
    # would be dropped by a plain truthiness test
    if answer1_idx is not None:
        seed_entity_idx = entity_ids.index(seed_entity_id)
        p_labels_map = lookup_predicate_labels(predicate_ids)
        dataset.append([(question1, p_labels_map, seed_entity_idx, subgraph1), answer1_idx])

print(answers_in_subgraph)
print("Compiled dataset with %d samples"%len(dataset))

Counter({True: 4320, False: 576})
Compiled dataset with 4320 samples


In [75]:
# check one sample from the dataset
sample = dataset[0]

# The dataset stores positions within entity_ids (not graph ids) for the seed
# and the answer, so they are unpacked into *_idx names, matching the cell
# that builds the dataset and leaving the global seed_entity_id untouched.
(question1, p_labels_map, seed_entity_idx, subgraph), answer1_idx = sample
entity_ids, predicate_ids, adjacencies = subgraph

print(question1)

# look up all labels for entities and predicates
e_labels_map = lookup_entity_labels(entity_ids)
e_labels = list(e_labels_map.keys())
p_labels = list(p_labels_map.keys())
print(e_labels)
print(p_labels)

# note: these are counts of distinct labels; ids sharing a label are grouped
print("%d entities and %d predicates in the subgraph"%(len(e_labels), len(p_labels)))

Which author wrote the novel 1Q84?
['Commons margmiðlunarskrá', '1Q84', 'Shinchosha', 'Alternativweltgeschichte-Roman', '1984 (roman)', 'Aomame', 'Tengo Kawana', 'Ushikawa', 'Komatsu', 'Fuka-Eri', 'Tamaru', 'Die alte Dame', 'Professeur Ebisuno', 'Der Leader', 'Ayumi', 'Cena Ignotus za nejlepší zahraniční román', 'Aile içi şiddet', 'Parallelwelt', 'Ji̍t-pún Hòng-sàng Hia̍p-hoē', 'Erzählung', 'A-Témoin ti Jéhovah', "Chekhov's gun", 'Historie i historien', '1984 m.', '2001 : ਖਲਾਅ ਵਿਚ ਇੱਕ ਖੋਜ', 'Sinfonietta', "Mother's Little Helper", 'Macbeth (Shakespeare)', 'Concert per a violí', 'Das letzte Ufer', 'El cas de Thomas Crown', 'Oliver Tvistin macəraları', 'Brott och straff', 'Den afrikanska farmen', 'A Karamazov testvérek', 'A Metamorfose', 'Die Insel Sachalin', 'Bontemperita klavarinstrumento', 'Lady Jane', 'D Mattäus-Bassioon vom Johann Sebastian Bach', 'Tokyo Nikki', "It's Only a Paper Moon", 'Louis Armstrong Plays W.C. Handy', 'Sweet Lorraine', 'A la recerca del temps perdut', 'A sikoly