# Sample Conversation

In [69]:
import json

import numpy as np
import scipy.sparse as sp
train_conversations_path = './data/train_set/train_set_ALL.json'
# Namespace used for entity URIs when they are looked up in the knowledge graph
PREFIX_E = 'http://www.wikidata.org/entity/'

# Load the training set.
# The file is decoded explicitly as UTF-8 (the encoding JSON is exchanged in)
# instead of relying on the platform's default locale encoding, which can
# fail or garble non-ASCII question text on some systems.
with open(train_conversations_path, "r", encoding="utf-8") as data:
    conversations = json.load(data)
print("%d conversations loaded" % len(conversations))

6720 conversations loaded


In [115]:
# Walk through one sample conversation and collect its entities:
# the seed entity first, then every answer that is a Wikidata entity.
conversation = conversations[4]
turns = conversation['questions']
n_questions = len(turns)

question = turns[0]['question']
print(question)

seed_entity = conversation['seed_entity']
# re-base the URI onto the entity namespace, keeping only the trailing id
entities = [PREFIX_E + seed_entity.split('/')[-1]]
print(seed_entity)
print('\n')

for turn in turns:
    question = turn['question']
    print(question)

    answer = turn['answer']
    # answers that do not point to Wikidata are not entities: skip them
    if 'www.wikidata.org' not in answer:
        continue
    print(answer)
    entity = PREFIX_E + answer.split('/')[-1]
    entities.append(entity)

print('\n')
# the seed entity is not an answer, hence the -1
print("%d questions %d answer-entities"%(n_questions, len(entities)-1))

# intermediate entities: Karen Carpenter https://www.wikidata.org/wiki/Q1250861
# TODO retrieve relations between these entities

Which famous author wrote the fantasy book series Harry Potter?
https://www.wikidata.org/wiki/Q8337


Which famous author wrote the fantasy book series Harry Potter?
https://www.wikidata.org/wiki/Q34660
Which book was the first one written?
https://www.wikidata.org/wiki/Q43361
What novel was the final one?
https://www.wikidata.org/wiki/Q46758
What character joins Harry Potter after being saved by him?
https://www.wikidata.org/wiki/Q174009
The series consists of which amount of books?


5 questions 4 answer-entities


# Retrieve Relations

In [116]:
# load KG
from hdt import HDTDocument, TripleComponentRole
from settings import *

# Paging parameters passed to the subgraph extraction further down
max_triples, offset = 50000, 0

# Open the knowledge-graph dump; `hdt_path` is provided by the settings module
hdt_file = 'wikidata2018_09_11.hdt'
kg = HDTDocument(hdt_path + hdt_file)

In [117]:
# Map every conversation entity URI to its global id in the KG
matched_entity_ids = [
    kg.string_to_global_id(uri, TripleComponentRole.OBJECT)
    for uri in entities
]
print(matched_entity_ids)

[50438799, 25203200, 34545680, 38167359, 7397250]


In [118]:
# Extract the subgraph around the matched entities
subgraph = kg.compute_hops(matched_entity_ids, max_triples, offset)
entity_ids, predicate_ids, adjacencies = subgraph

n_entities, n_relations = len(entity_ids), len(predicate_ids)

# exactly one adjacency list is expected per predicate
assert len(adjacencies) == n_relations

print("Subgraph with %d entities and %d relation types"%(n_entities, n_relations))

Subgraph with 4153 entities and 415 relation types


In [119]:
# Build the activation matrix: row i carries a single `score` entry in the
# column of the i-th matched entity (its position inside `entity_ids`).
score = 1
n_matched = len(matched_entity_ids)
row = list(range(n_matched))
col = [entity_ids.index(e) for e in matched_entity_ids]
data = [score] * n_matched
x = sp.csr_matrix((data, (row, col)), shape=(n_matched, n_entities))
print("%d entities activated"%len(matched_entity_ids))

5 entities activated


In [120]:
# load adjacencies
def generate_adj_sp(adjacencies, n_entities, include_inverse):
    '''
    Build one sparse adjacency matrix per predicate.

    Parameters
    ----------
    adjacencies : sequence
        One entry per predicate; each entry is a sequence of
        (subject_index, object_index) pairs. An entry may be empty.
    n_entities : int
        Number of nodes; every matrix has shape (n_entities, n_entities).
    include_inverse : bool
        If true, every edge is additionally inserted in the opposite
        direction.

    Returns
    -------
    numpy.ndarray
        1-D array holding one scipy CSR matrix per predicate, in input
        order. Repeated (row, col) pairs are summed by the CSR constructor,
        so an entry can be larger than 1.
    '''
    adj_shape = (n_entities, n_entities)

    # collect all predicate matrices separately into a list
    sp_adjacencies = []
    for edges in adjacencies:
        edge_array = np.asarray(edges)
        if edge_array.size == 0:
            # a predicate without edges cannot be unpacked into row/col;
            # use an explicit empty (0, 2) index array so it yields an
            # all-zero matrix instead of raising
            edge_array = np.empty((0, 2), dtype=np.intp)

        # split subject (row) and object (col) node indices
        row, col = edge_array.T

        # duplicate edges in the opposite direction
        if include_inverse:
            row, col = np.hstack([row, col]), np.hstack([col, row])

        # create adjacency matrix for this predicate (weight 1 per edge)
        data = np.ones(len(row))
        adj = sp.csr_matrix((data, (row, col)), shape=adj_shape)
        sp_adjacencies.append(adj)

    return np.asarray(sp_adjacencies)

# one adjacency matrix per relation type, with inverse edges included
A = generate_adj_sp(adjacencies=adjacencies, n_entities=n_entities, include_inverse=True)
print("%d adjacency matrices for each of the relation types" % len(A))

415 adjacency matrices for each of the relation types


In [121]:
# MP separately for each relation type
from sklearn.preprocessing import normalize, binarize

# Propagate the activations of the matched entities one hop along every
# relation type and accumulate the result.
# Rows: matched entities, columns: entities of the subgraph.
activations = sp.csr_matrix((len(matched_entity_ids), n_entities))

for _A in A:
    # note: per-relation clipping of values above 1 is currently disabled
    activations = activations + x @ _A

# Total activation every entity received, summed over all matched entities.
# The matrix-level reduction replaces the builtin sum(), which iterates over
# the sparse matrix row by row in Python.
sum_a = activations.sum(axis=0)
# NOTE(review): the divisor 2 is hard-coded -- confirm what it is meant to
# normalise by. Clipping the normalised sum to 1 is currently disabled.
sum_a_norm = np.asarray(sum_a).ravel() / 2

# Number of matched entities that reached each entity at least once
y_counts = binarize(activations, threshold=0.0)
count_a = np.asarray(y_counts.sum(axis=0)).ravel()

# final scores: blend of the damped activation sum and the activation count
# NOTE(review): (2 + 1) is hard-coded as well -- confirm the intended weights
y = (sum_a_norm + count_a) / (2 + 1)

# check output size: one score per subgraph entity
assert y.shape[0] == n_entities

In [122]:
# TODO find all edges between the matched nodes
# there is an edge to the node if it remains activated after MP

# Always (re)define the results up front, so that a run without any
# activation leaves empty results instead of undefined or stale names.
answer_uris = []
top_idx = np.flatnonzero(y > 0)   # positions of all entities with a positive score
top = top_idx.tolist()
activated_ids = np.asarray(entity_ids)[top_idx]

matched_uris = set(entities)      # constant-time membership test inside the loop
for a in activated_ids:
    uri = kg.global_id_to_string(a, TripleComponentRole.SUBJECT)
    if uri:
        answer_uris.append(uri)
        # only report the activated nodes that are matched entities themselves
        if uri in matched_uris:
            print(uri)

# directly connected matched nodes

http://www.wikidata.org/entity/Q8337
http://www.wikidata.org/entity/Q34660
http://www.wikidata.org/entity/Q43361
http://www.wikidata.org/entity/Q46758
http://www.wikidata.org/entity/Q174009


In [123]:
# find entities that were activated more than once
# NOTE(review): y is the blended score from the message-passing step, so
# `y > 1` is a score threshold rather than a literal activation count --
# confirm that it selects the intended entities.

# Always (re)define the results up front: previously an empty selection left
# `answer_uris` holding the list produced by the preceding cell.
answer_uris = []
top_idx = np.flatnonzero(y > 1)
top = top_idx.tolist()
activated_ids = np.asarray(entity_ids)[top_idx]

for a in activated_ids:
    uri = kg.global_id_to_string(a, TripleComponentRole.SUBJECT)
    if uri:
        answer_uris.append(uri)
        print(uri)

# those are the entities located at the intersection of the matched nodes
# TODO explain relations to these nodes
# TODO are they necessary to connect the matched nodes or is there already an edge between this pair of nodes

http://www.wikidata.org/entity/Q8337
http://www.wikidata.org/entity/Q34660
http://www.wikidata.org/entity/Q43361
http://www.wikidata.org/entity/Q46758
http://www.wikidata.org/entity/Q174009
http://www.wikidata.org/entity/Q5410773
http://www.wikidata.org/entity/Q190125
http://www.wikidata.org/entity/Q568642
http://www.wikidata.org/entity/Q452283
http://www.wikidata.org/entity/Q3244512
http://www.wikidata.org/entity/Q102438
http://www.wikidata.org/entity/Q161678
http://www.wikidata.org/entity/Q20711488
http://www.wikidata.org/entity/Q216930
http://www.wikidata.org/entity/Q232009
http://www.wikidata.org/entity/Q1250951
http://www.wikidata.org/entity/Q173998
http://www.wikidata.org/entity/Q176132
http://www.wikidata.org/entity/Q176772
http://www.wikidata.org/entity/Q177439
http://www.wikidata.org/entity/Q179641
http://www.wikidata.org/entity/Q187923
http://www.wikidata.org/entity/Q190366
http://www.wikidata.org/entity/Q192179
http://www.wikidata.org/entity/Q27924622
http://www.wikidata.org

In [None]:
# get connection for each of these entities to each of the matched entities