In [21]:
# run configuration -- every tunable of this notebook lives in this cell
kg = 'wikidata2020'  # name of the background KG (HDT serialisation)
split = 'train_set'  # dataset split whose subgraphs are read
k = 10  # how many best-matching predicates are kept for MP
limit = 2  # upper bound on the number of sample files taken from the split

In [39]:
# load subgraphs
import os
import json
from collections import defaultdict

import numpy as np
import scipy.sparse as sp
import scipy.spatial.distance

from sentence_transformers import SentenceTransformer
from hdt_utils import HDT_Graph

from predicates_dictionary import predicates
from settings import data_path

# Directory holding the subgraph files of the selected split. The trailing
# slash is required: file names are appended to this prefix by plain string
# concatenation when the samples are opened.
_dir = '%s/subgraphs/%s/' % (data_path, split)

In [None]:
# load graph and transformer
# NOTE(review): both constructors presumably load large resources from disk;
# run this cell once per kernel -- confirm the actual cost.
# Handle on the background KG named by `kg`; used further down to resolve
# entity ids to URIs.
wikidata = HDT_Graph(kg)
# Sentence encoder used further down to embed both the predicate labels and
# the question text so they can be compared by cosine distance.
model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')  # predicate/question scoring function

In [8]:
# load adjacencies
def generate_adj_sp(adjacencies, n_entities, include_inverse):
    '''
    Build one sparse adjacency matrix per predicate.

    Parameters
    ----------
    adjacencies : iterable
        One entry per predicate. Each entry is a sequence of
        ``(row, col)`` integer index pairs, one pair per edge. An entry may
        be empty, in which case an all-zero matrix is produced for it.
    n_entities : int
        Number of nodes; every matrix has shape ``(n_entities, n_entities)``.
    include_inverse : bool
        If true, every edge ``(row, col)`` is also inserted as ``(col, row)``.

    Returns
    -------
    numpy.ndarray
        1-D object array with one ``scipy.sparse.csr_matrix`` per predicate,
        in the order of ``adjacencies``. Repeated index pairs are summed by
        the CSR constructor, so an entry can be larger than 1.

    Raises
    ------
    ValueError
        If a non-empty entry does not consist of index pairs.
    '''
    adj_shape = (n_entities, n_entities)

    # collect all predicate matrices separately into a list
    sp_adjacencies = []
    for edges in adjacencies:
        pairs = np.asarray(edges, dtype=np.int64)
        if pairs.size == 0:
            # an empty edge list has shape (0,) and could not be unpacked
            # into (row, col) below; give it an explicit pair layout
            pairs = pairs.reshape(0, 2)

        # split subject (row) and object (col) node indices
        row, col = pairs.T

        # duplicate edges in the opposite direction
        if include_inverse:
            row, col = np.hstack([row, col]), np.hstack([col, row])

        # create adjacency matrix for this predicate
        data = np.ones(len(row))
        sp_adjacencies.append(sp.csr_matrix((data, (row, col)), shape=adj_shape))

    # Fill an object array element by element instead of np.asarray(list):
    # this never lets NumPy try to interpret the sparse matrices as nested
    # sequences and always yields a 1-D array, even for 0 or 1 predicates.
    stacked = np.empty(len(sp_adjacencies), dtype=object)
    for pos, adj in enumerate(sp_adjacencies):
        stacked[pos] = adj
    return stacked

In [50]:
# iterate over samples
# For every sample: build the per-predicate adjacency matrices of its subgraph,
# activate the seed entity, and for each question that has answer entities
# propagate that activation one hop along the k predicates whose labels are
# closest to the question (cosine distance of sentence embeddings).
for file_name in os.listdir(_dir)[:limit]:
    # keep the file open only while it is being parsed
    with open(_dir + file_name, "r") as data:
        conversation = json.load(data)

    seed_entity = conversation['seed_entity']
    print(seed_entity)

    entity_ids = conversation['entities']
    predicate_ids = conversation['predicates']
    adjacencies = conversation['adjacencies']
    n_entities = len(entity_ids)
    print("Subgraph with %d entities and %d predicates" % (n_entities, len(predicate_ids)))

    A = generate_adj_sp(adjacencies, n_entities, include_inverse=True)

    # seed activation: one-hot vector over the subgraph entities
    x = np.zeros(n_entities)
    x[entity_ids.index(conversation['seed_entity_id'])] = 1

    # get all labels for predicates in the graph
    # (label -> position of the predicate in predicate_ids, i.e. its index in A;
    # predicates without a known label are skipped)
    ps = {predicates[p_id]: pos for pos, p_id in enumerate(predicate_ids) if p_id in predicates}
    p_labels = list(ps.keys())
    # encode all predicates
    p_vectors = model.encode(p_labels)

    print('\n')

    for q_idx, es in enumerate(conversation['answer_entities']):
        question = conversation['questions'][q_idx]
        print(question)
        print(conversation['answer_texts'][q_idx])

        # only questions with answer entities are evaluated
        if not es:
            continue
        print(es)

        # encode question
        q_vector = model.encode([question])[0]

        # compare question to all predicates in the graph
        dists = [scipy.spatial.distance.cosine(q_vector, p_vector) for p_vector in p_vectors]
        assert len(dists) == len(p_labels)

        # get top-k scored predicates (smallest cosine distance first)
        top_k = sorted(range(len(dists)), key=lambda pos: dists[pos])[:k]
        top_ids = []
        p = []
        for label_pos in top_k:
            top_label = p_labels[label_pos]
            print(top_label)
            top_ids.append(ps[top_label])
            # cosine similarity is used as the weight of the predicate
            p.append(1 - dists[label_pos])
        p = np.array(p)

        # select only the adjacency matrices for the top-k predicates in the subgraph
        # Index with the flat list: the nested form A[[top_ids]] is deprecated and
        # newer NumPy versions read it as a (1, k) index array, which breaks the
        # weighted sum below.
        _A = A[top_ids]
        # one propagation step over the similarity-weighted sum of the matrices
        _y = x @ sum(p * _A)
        top = np.flatnonzero(_y > 0).tolist()
        results = defaultdict(list)
        if len(top) > 0:
            activations = np.asarray(entity_ids)[top]
            for entity_pos, _id in zip(top, activations):
                uri = wikidata.look_up_uri(_id, 'entity')
                if uri:
                    # group URIs by their activation score
                    results[_y[entity_pos]].append(uri)

            # sort results and report only the best-scoring group
            results = sorted(results.items(), reverse=True)
            if results:
                score, uris = results[0]
                print(score)
                for u in uris:
                    print(u)
        else:
            print('No answer')
        print('\n')

    # NOTE(review): this stops after the first sample, so `limit` currently has
    # no effect beyond 1 -- presumably left in for debugging; confirm.
    break

http://www.wikidata.org/entity/Q185449
Subgraph with 10999 entities and 494 predicates


Genre of the book Eragon?
young adult literature;speculative fiction novel
['http://www.wikidata.org/entity/Q1233720', 'http://www.wikidata.org/entity/Q10992055']
The Encyclopedia of Science Fiction ID
ISFDB title ID
ISFDB author ID
ISFDB publisher ID
Discogs artist ID
genre
Encyclopædia Britannica Online ID
narrative location
Brockhaus Enzyklopädie online ID
Library of Congress authority ID
0.4277104437351227
http://www.wikidata.org/entity/Q10992055
http://www.wikidata.org/entity/Q1233720


and who wrote it ?
Christopher Paolini
['http://www.wikidata.org/entity/Q93620']
creator
author
ISFDB author ID
narrative location
country of origin
occupation
place of birth
Babelio author ID
after a work by
country of citizenship
0.436457097530365
http://www.wikidata.org/entity/Q93620


and when was he born ?
17 November 1983
When was the book published ?
26 August 2003
and who published the book ?
Alfred A. 

  _A = A[[top_ids]]
