In [10]:
# load a subgraph
import json

path = '/ivi/ilps/personal/svakule'
split = 'train_set'
conv_id = 1

# Read the whole conversation first so the file handle is released before any
# processing. The encoding is given explicitly instead of relying on the
# platform default -- assumes the subgraph files are UTF-8 (the JSON standard
# encoding); TODO confirm.
with open('%s/subgraphs/%s/%d.json' % (path, split, conv_id), "r", encoding="utf-8") as data:
    conversation = json.load(data)

seed_entity = conversation['seed_entity']
print(conversation['seed_entity_text'])
print(seed_entity)
print('\n')

# Collect one [seed entity, question, answer entity] triple for every
# non-empty answer entity; questions without one contribute nothing.
qa_pairs = []
for i, es in enumerate(conversation['answer_entities']):
    print(conversation['questions'][i])
    print(conversation['answer_texts'][i])
    for e in es:
        if e:
            print(e)
            qa_pairs.append([seed_entity, conversation['questions'][i], e])

# ids of the seed entity plus every non-empty answer id, flattened over all questions
matched_entity_ids = [conversation['seed_entity_id']] + [a for _as in conversation['answer_ids'] for a in _as if a]
print("\n%d matched entities" % len(matched_entity_ids))

# subgraph description used by the following cells
entity_ids = conversation['entities']
predicate_ids = conversation['predicates']
adjacencies = conversation['adjacencies']
n_entities = len(entity_ids)
print("Subgraph with %d entities and %d predicates" % (n_entities, len(predicate_ids)))


American Hustle
http://www.wikidata.org/entity/Q9013673


Does Bradley Cooper star in American Hustle?
Yes
When was he born?
5 January 1975
And who composed?
Danny Elfman
http://www.wikidata.org/entity/Q193338
Did the movie win a Golden Globe award?
Yes
How long is the movie?
138 minute

2 matched entities
Subgraph with 1160 entities and 410 predicates


# Forward pass

## Check connectivity

In [11]:
# check the answer entity is reachable from the seed entity
import numpy as np
import scipy.sparse as sp

# load adjacencies
def generate_adj_sp(adjacencies, n_entities, include_inverse):
    '''
    Build one sparse adjacency matrix per predicate.

    Parameters
    ----------
    adjacencies : sequence
        One entry per predicate; each entry is a sequence of
        (subject index, object index) pairs. An entry may be empty, in which
        case the predicate gets an all-zero matrix.
    n_entities : int
        Number of nodes; every matrix has shape (n_entities, n_entities).
    include_inverse : bool
        If true, every edge is also added in the opposite direction.

    Returns
    -------
    numpy.ndarray
        1-D object array with one scipy.sparse.csr_matrix per predicate, in
        the same order as `adjacencies`. Repeated (row, col) pairs are summed
        by the csr_matrix constructor, so entries can be larger than 1.
    '''
    adj_shape = (n_entities, n_entities)

    # collect all predicate matrices separately into a list
    sp_adjacencies = []
    for edges in adjacencies:
        edge_array = np.asarray(edges)
        if edge_array.size == 0:
            # a predicate without edges cannot be unpacked into two columns
            row = col = np.empty(0, dtype=int)
        else:
            # split subject (row) and object (col) node indices
            row, col = edge_array.T

        # duplicate edges in the opposite direction
        if include_inverse:
            row, col = np.hstack([row, col]), np.hstack([col, row])

        # create adjacency matrix for this predicate
        data = np.ones(len(row))
        adj = sp.csr_matrix((data, (row, col)), shape=adj_shape)
        sp_adjacencies.append(adj)

    # Fill the object array element by element so the result never depends on
    # how NumPy coerces a list of sparse matrices.
    stacked = np.empty(len(sp_adjacencies), dtype=object)
    for position, adj in enumerate(sp_adjacencies):
        stacked[position] = adj
    return stacked

# Activate every matched entity (seed + answers) in a 0/1 vector over the nodes
x = np.zeros(n_entities)
for e in matched_entity_ids:
    idx = entity_ids.index(e)
    x[idx] = 1

# one adjacency matrix per predicate, with inverse edges included
A = generate_adj_sp(adjacencies, n_entities, include_inverse=True)

for i, _A in enumerate(A):
    # one message-passing step over this predicate
    _y = x @ _A

    # activation that lands back on a matched entity means two matched
    # entities share an edge of this predicate
    z = _y * x
    overlap = int(z.sum())
    if not overlap:
        continue
    print("%d matched entities are connected with an edge" % overlap)
    print(i)
    print(predicate_ids[i])

2 matched entities are connected with an edge
355
14078


## Predict relations

In [12]:
from predicates_dictionary import predicates

# Map each predicate label to the position of that predicate in predicate_ids;
# ids that are missing from the `predicates` dictionary are skipped.
ps = dict(
    (predicates[pid], pos)
    for pos, pid in enumerate(predicate_ids)
    if pid in predicates
)
print(ps)
# labels in the same order as the keys of `ps`
p_labels = [*ps]

{'Nationale Thesaurus voor Auteurs ID': 185, 'film editor': 186, 'occupation': 187, 'Box Office Mojo film ID': 188, 'Rotten Tomatoes ID': 189, 'NNDB people ID': 190, 'AlloCiné film ID': 191, 'AlloCiné person ID': 192, 'CANTIC ID': 193, 'instrument': 194, 'NLA Trove ID': 195, 'described by source': 196, 'winner': 197, 'genre': 198, 'religion': 199, 'nominated for': 200, 'languages spoken, written or signed': 201, 'Encyclopædia Britannica Online ID': 202, 'executive producer': 203, 'published in': 204, 'based on': 205, 'title': 206, 'birth name': 207, 'has quality': 208, 'name in native language': 209, 'AllMovie movie ID': 210, 'University of Barcelona authority ID': 211, 'cast member': 212, 'producer': 213, 'award received': 214, 'country': 215, 'Metacritic ID': 216, 'AllMusic artist ID': 217, 'GTAA ID': 218, 'performer': 219, 'DNF film ID': 220, 'genealogics.org person ID': 221, 'Netflix ID': 222, 'place of birth': 223, 'Spotify artist ID': 224, 'Discogs artist ID': 225, 'MovieMeter fi

In [13]:
# Use the question text of the first collected QA pair as the query.
# Expected predicate: P86 "composer" (original note gives 177).
# NOTE(review): the ranking cell further down prints 355 for "composer" -- confirm which index is current.
print(qa_pairs)
first_pair = qa_pairs[0]
p = first_pair[1]
print(p)

[['http://www.wikidata.org/entity/Q9013673', 'And who composed?', 'http://www.wikidata.org/entity/Q193338']]
And who composed?


In [14]:
# encode all predicates
from sentence_transformers import SentenceTransformer
# sentence encoder shared by the question and the predicate labels
model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

# row 0 is the question; the remaining rows follow the order of p_labels
sentence_embeddings = model.encode([p, *p_labels])

In [15]:
import scipy.spatial.distance

# Cosine distance between the question embedding (row 0) and each label
# embedding (rows 1..), kept in p_labels order.
q_vector = sentence_embeddings[0]
dists = [
    scipy.spatial.distance.cosine(q_vector, p_vector)
    for p_vector in sentence_embeddings[1:]
]
assert len(dists) == len(p_labels)

In [16]:
# number of closest predicate labels to report
k = 1

# Positions into p_labels ordered by ascending cosine distance (closest first).
# The key parameter is deliberately not called `k`, so it cannot be confused
# with the notebook-level `k` above.
top_k = sorted(range(len(dists)), key=lambda pos: dists[pos])[:k]
for i in top_k:
    candidate_label = p_labels[i]
    print(candidate_label)
    print(ps[candidate_label])

# Later cells read `top_id`; pin it to the single best match instead of to
# whichever candidate the loop visited last (for k > 1 that is the weakest one).
if top_k:
    top_label = p_labels[top_k[0]]
    top_id = ps[top_label]

composer
355


In [17]:
# load graph
# `wikidata` is used by the next cell to turn entity ids back into URIs through
# look_up_uri(..., 'entity').
# NOTE(review): presumably 'wikidata2020' names an HDT dump that hdt_utils knows how to locate -- confirm.
from hdt_utils import HDT_Graph
wikidata = HDT_Graph('wikidata2020')

In [18]:
# Message passing restricted to the best-matching predicate: start from the
# seed entity and print the URI of every entity that receives activation.

# one-hot activation of the seed entity
x = np.zeros(n_entities)
idx = entity_ids.index(conversation['seed_entity_id'])
x[idx] = 1

# adjacency matrix of the selected predicate within the subgraph
_A = A[top_id]
# single propagation step
_y = x @ _A

# positions of the entities with non-zero activation
top = np.flatnonzero(_y > 0).tolist()
if top:
    activations = np.asarray(entity_ids)[top]
    for _id in activations:
        uri = wikidata.look_up_uri(_id, 'entity')
        print(uri)


http://www.wikidata.org/entity/Q193338
