In [5]:
# load subgraphs
import json

# NOTE(review): absolute, machine-specific location; adjust when running elsewhere.
path = '/ivi/ilps/personal/svakule'
split = 'train_set'
conv_id = 1

# [seed entity, question, answer entity] triples collected below
qa_pairs = []

# Parse the conversation and release the file handle immediately: everything
# after this only needs the resulting dictionary. The encoding is given
# explicitly because JSON text is UTF-8 and the platform default may differ.
with open('%s/subgraphs/%s/%d.json' % (path, split, conv_id), "r", encoding="utf-8") as data:
    conversation = json.load(data)

seed_entity = conversation['seed_entity']
print(conversation['seed_entity_text'])
print(seed_entity)
print('\n')

# Show every question with its answer text and keep one triple per
# non-empty answer entity.
for i, es in enumerate(conversation['answer_entities']):
    print(conversation['questions'][i])
    print(conversation['answer_texts'][i])
    for e in es:
        if e:
            print(e)
            qa_pairs.append([seed_entity, conversation['questions'][i], e])

# Seed entity id followed by all truthy answer ids.
# NOTE(review): `if a` also drops an id that equals 0 -- confirm 0 is never a valid id.
matched_entity_ids = [conversation['seed_entity_id']] + [a for _as in conversation['answer_ids'] for a in _as if a]
print("\n%d matched entities" % len(matched_entity_ids))

# Subgraph description used by the following cells.
entity_ids = conversation['entities']
predicate_ids = conversation['predicates']
adjacencies = conversation['adjacencies']
n_entities = len(entity_ids)
print("Subgraph with %d entities and %d predicates" % (n_entities, len(predicate_ids)))


American Hustle
http://www.wikidata.org/entity/Q9013673


Does Bradley Cooper star in American Hustle?
Yes
When was he born?
5 January 1975
And who composed?
Danny Elfman
http://www.wikidata.org/entity/Q193338
Did the movie win a Golden Globe award?
Yes
How long is the movie?
138 minute

2 matched entities
Subgraph with 1160 entities and 410 predicates


# Forward pass

## Check connectivity

In [6]:
# check the answer entity is reachable from the seed entity
import numpy as np
import scipy.sparse as sp

# load adjacencies
def generate_adj_sp(adjacencies, n_entities, include_inverse):
    '''
    Build one sparse adjacency matrix per predicate.

    adjacencies: sequence with one edge list per predicate; every edge is a
        (subject, object) pair of node positions used as (row, col).
    n_entities: number of nodes; each matrix has shape (n_entities, n_entities).
    include_inverse: when true, every edge is also added in the opposite direction.

    Returns a 1-D object array with one scipy.sparse.csr_matrix per predicate,
    in the order of `adjacencies`. A predicate without edges yields an
    all-zero matrix instead of raising.
    '''
    adj_shape = (n_entities, n_entities)

    # collect all predicate matrices separately into a list
    sp_adjacencies = []
    for edges in adjacencies:
        n_edges = len(edges)

        # An empty edge list cannot be transposed into (row, col);
        # represent it as an empty matrix of the right shape.
        if n_edges == 0:
            sp_adjacencies.append(sp.csr_matrix(adj_shape))
            continue

        # split subject (row) and object (col) node positions
        row, col = np.transpose(edges)

        # duplicate edges in the opposite direction
        if include_inverse:
            row, col = np.hstack([row, col]), np.hstack([col, row])
            n_edges *= 2

        # create adjacency matrix for this predicate
        # (the COO-style constructor adds up repeated (row, col) entries)
        data = np.ones(n_edges)
        adj = sp.csr_matrix((data, (row, col)), shape=adj_shape)
        sp_adjacencies.append(adj)

    # Fill the object array element by element so that NumPy never tries to
    # interpret the sparse matrices themselves as nested sequences.
    result = np.empty(len(sp_adjacencies), dtype=object)
    for position, adj in enumerate(sp_adjacencies):
        result[position] = adj
    return result

# activation vector: 1 at the position of every matched entity, 0 elsewhere
x = np.zeros(n_entities)
for entity in matched_entity_ids:
    x[entity_ids.index(entity)] = 1

# one adjacency matrix per predicate, with inverse edges included
A = generate_adj_sp(adjacencies, n_entities, include_inverse=True)
for predicate_adj in A:
    # one propagation step along this predicate
    activated = x @ predicate_adj

    # keep only the activation that lands on matched entities and report it
    n_hits = int((activated * x).sum())
    if n_hits:
        print("%d matched entities are connected with an edge" % n_hits)

2 matched entities are connected with an edge


## Predict relations

In [7]:
# load all predicate labels
from predicates import properties

# property id (last segment of the property URI) -> property label
relationid2label = {
    binding['property']['value'].rsplit('/', 1)[-1]: binding['propertyLabel']['value']
    for binding in properties['results']['bindings']
}

# labels in insertion order; positions are reused when ranking predicates
p_labels = list(relationid2label.values())
print(p_labels)

['head of government', 'video', 'traffic sign', 'route map', 'highway system', 'country', 'image', 'place of birth', 'place of death', 'sex or gender', 'father', 'mother', 'spouse', 'country of citizenship', 'continent', 'instance of', 'head of state', 'capital', 'official language', 'currency', 'position held', 'child', 'flag image', 'shares border with', 'author', 'audio', 'family', 'member of sports team', 'director', 'screenwriter', 'constellation', 'discoverer or inventor', 'site of astronomical discovery', 'ancestral home', 'educated at', 'top-level Internet domain', 'connecting line', 'architect', 'anthem', 'composer', 'librettist', 'commissioned by', 'sexual orientation', 'main regulatory text', 'coat of arms image', 'noble title', 'editor', 'field of work', 'member of political party', 'native language', 'taxon rank', 'occupation', 'employer', 'signature', 'illustrator', 'measured physical quantity', 'founded by', 'airline hub', 'airline alliance', 'home venue', 'chemical stru

In [8]:
# Use the question of the first collected QA pair as the text to match;
# the relation expected for it is P86 (composer).
print(qa_pairs)
first_pair = qa_pairs[0]
p = first_pair[1]
print(p)

[['http://www.wikidata.org/entity/Q9013673', 'And who composed?', 'http://www.wikidata.org/entity/Q193338']]
And who composed?


In [28]:
# encode all predicates
from sentence_transformers import SentenceTransformer
# sentence encoder used for both the question and the predicate labels
model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

# position 0 holds the question, the remaining positions follow p_labels
sentence_embeddings = model.encode([p, *p_labels])

In [29]:
import scipy.spatial.distance

# the question embedding comes first, the label embeddings after it
q_vector = sentence_embeddings[0]

# cosine distance from the question to every predicate label, in p_labels order
dists = [
    scipy.spatial.distance.cosine(q_vector, label_vector)
    for label_vector in sentence_embeddings[1:]
]
assert len(dists) == len(p_labels)

In [31]:
# number of closest predicate labels to show
k = 1

# label positions ordered by increasing distance to the question
ranking = sorted(range(len(dists)), key=dists.__getitem__)
top_k = ranking[:k]
for label_idx in top_k:
    print(p_labels[label_idx])

composer
