In [38]:
# load graph
import json
from collections import Counter, defaultdict
import numpy as np

from hdt import HDTDocument, TripleComponentRole

from settings import *
from predicates import properties


# Knowledge graph settings: HDT dump file name, namespace and entity URI prefix
# (hdt_path is not defined in this cell -- presumably provided by the star import from settings)
hdt_file = 'wikidata2018_09_11.hdt'
namespace = 'predef-wikidata2018-09-all'
PREFIX_E = 'http://www.wikidata.org/entity/'
kg = HDTDocument(hdt_path + hdt_file)

# prepare to retrieve all adjacent nodes including literals
# NOTE(review): the meaning of the positional arguments (1, True, False) is not visible here;
# presumably 1 is the number of hops and the empty list means "no predicate filter" -- confirm
predicates_ids = []
kg.configure_hops(1, predicates_ids, namespace, True, False)

# load all predicate labels: property id (last segment of the property URI) -> label
relationid2label = {}
for p in properties['results']['bindings']:
    _id = p['property']['value'].rsplit('/', 1)[-1]
    label = p['propertyLabel']['value']
    relationid2label[_id] = label
    
def check_answer_in_subgraph(answer, entity_ids):
    """Find the position of an answer entity inside a subgraph's entity list.

    Only answers containing 'www.wikidata.org' are treated as entities. For those,
    the segment after the last '/' is prefixed with PREFIX_E and converted to a
    global id (object role) through the knowledge graph.

    Args:
        answer: answer string, either an entity URI or some other value.
        entity_ids: sequence of global ids that make up the subgraph.

    Returns:
        The index of the answer's global id within `entity_ids`, or None when the
        answer is not an entity or is not part of the subgraph. Index 0 is a valid
        result, so the return value should be compared against None rather than
        tested for truthiness.
    """
    # consider only answers which are entities
    if 'www.wikidata.org' not in answer:
        return None

    entity_uri = PREFIX_E + answer.split('/')[-1]
    global_id = kg.string_to_global_id(entity_uri, TripleComponentRole.OBJECT)

    # consider only answer entities that are in the subgraph
    return entity_ids.index(global_id) if global_id in entity_ids else None

# load the training dataset
train_conversations_path = './data/train_set/train_set_ALL.json'

# JSON text is UTF-8 by specification; pass the encoding explicitly so that loading
# does not depend on the platform's default locale encoding
with open(train_conversations_path, "r", encoding="utf-8") as data:
    conversations = json.load(data)
print("%d conversations loaded"%len(conversations))

6720 conversations loaded


In [39]:
# count, separately for each position in the question sequence, how often the answer
# to the question falls into the initial (seed) subgraph

max_triples = 50000000
offset = 0

# collect only samples where the answer is entity and it is adjacent to the seed entity
train_dataset = []

graph_sizes = []
max_n_edges = 2409 # max size of the graph allowed in the number of edges

# consider a sample of the dataset
n_limit = None
if n_limit:
    conversations = conversations[:n_limit]

counts = Counter()  # question position -> number of answers found in the seed subgraph
n_entities = []     # subgraph size (number of entities), one entry per question
for conversation in conversations:
    # use oracle for the correct initial entity
    seed_entity = conversation['seed_entity'].split('/')[-1]
    seed_entity_id = kg.string_to_global_id(PREFIX_E+seed_entity, TripleComponentRole.OBJECT)

    # retrieve all adjacent nodes including literals; the seed entity is fixed for the
    # whole conversation, so the subgraph is computed once instead of once per question
    subgraph = kg.compute_hops([seed_entity_id], max_triples, offset)
    entity_ids, predicate_ids, adjacencies = subgraph
    assert len(predicate_ids) == len(adjacencies)

    for i, turn in enumerate(conversation['questions']):
        question = turn['question']
        answer = turn['answer']

        # check that the answer is in the subgraph
        answer_idx = check_answer_in_subgraph(answer, entity_ids)
        # index 0 is a valid position, so test against None instead of truthiness
        if answer_idx is not None:
            counts[i] += 1
        n_entities.append(len(entity_ids))

print(counts)
# show distribution stats
print("Min: %d Mean: %.2f Max: %d"%(min(n_entities), np.mean(n_entities), max(n_entities)))

Counter({0: 4320, 1: 3424, 2: 3104, 3: 3104, 4: 2464})
Min: 37 Mean: 700.87 Max: 12605
