In [1]:
import json
import requests
from collections import defaultdict

import numpy as np
import scipy.sparse as sp

from elasticsearch import Elasticsearch
from hdt import HDTDocument, TripleComponentRole

from settings import *

# training samples for the disambiguation task (JSON list of dicts)
data_path = './data/wikidata-disambig-train.json'
rdfsLabelURI = 'http://www.w3.org/2000/01/rdf-schema#label'

# Wikidata dump in HDT format; hdt_path is not defined in this cell,
# presumably it comes from the star import of settings -- TODO confirm
hdt_file = 'wikidata20200309.hdt'
# prefix prepended to a bare entity id (e.g. a sample's correct_id) to form its URI
PREFIX_E = 'http://www.wikidata.org/entity/'
namespace = 'predef-wikidata2020-03-all'
kg = HDTDocument(hdt_path+hdt_file)
# empty predicate list is handed to configure_hops below
predicates_ids = []
# NOTE(review): the meaning of the positional arguments (1, ..., True, False)
# is defined by the hdt extension in use and cannot be verified from this file;
# presumably 1 is the number of hops -- confirm against the library
kg.configure_hops(1, predicates_ids, namespace, True, False)

# Elasticsearch index queried for entity labels; the client is created with
# its default connection settings
index_name = 'wikidata_clef'
es = Elasticsearch()


def tagme_get_all_entities(utterance, tagmeToken=tagmeToken):
    '''
    Returns all entities found with TagMe

    utterance: text to annotate
    tagmeToken: gcube token sent to the TagMe service

    Returns the list of 'spot' strings (surface forms) of the annotations
    in the TagMe response. The request is retried every 5 seconds until a
    response that parses as JSON is received; the utterance is printed on
    each failed attempt.
    Raises KeyError if the parsed response has no 'annotations' field.
    '''
    # imported locally: the module-level imports do not provide time
    import time

    # let requests build the query string so that characters such as
    # '&', '#' or '+' inside the utterance are properly URL-encoded
    params = {'lang': 'en', 'gcube-token': tagmeToken, 'text': utterance}
    while True:
        try:
            response = requests.get('https://tagme.d4science.org/tagme/tag', params=params)
            results = json.loads(response.content)
            break
        except (requests.RequestException, ValueError):
            # network failure or a body that is not valid JSON: wait and retry
            # (KeyboardInterrupt is deliberately not caught so the loop can be stopped)
            print(utterance)
            time.sleep(5)

    # parse mentions
    return [mention['spot'] for mention in results['annotations']]


def generate_adj_sp(adjacencies, n_entities, include_inverse):
    '''
    Build adjacency matrix

    adjacencies: iterable of edge lists (one per predicate); every edge list
                 is a sequence of (row, col) index pairs
    n_entities: number of nodes, the result has shape (n_entities, n_entities)
    include_inverse: if true, every edge is also added in the opposite direction

    Returns a single scipy CSR matrix in which each cell holds the number of
    edges (summed over all edge lists) between the two nodes.
    '''
    adj_shape = (n_entities, n_entities)

    rows, cols = [], []
    for edges in adjacencies:
        # an empty edge list contributes nothing (and cannot be unpacked below)
        if len(edges) == 0:
            continue

        # split subject (row) and object (col) node indices
        row, col = np.transpose(edges)
        rows.append(row)
        cols.append(col)

        # duplicate edges in the opposite direction
        if include_inverse:
            rows.append(col)
            cols.append(row)

    if not rows:
        return sp.csr_matrix(adj_shape)

    # build the matrix once; repeated (row, col) pairs are summed together
    # by the constructor, which matches adding up per-predicate matrices
    row = np.concatenate(rows)
    col = np.concatenate(cols)
    data = np.ones(len(row))
    return sp.csr_matrix((data, (row, col)), shape=adj_shape)


def ned(matched_entities, mention, max_triples=50000000, offset=0, mention_score=100):
    '''
    Score the neighbourhood of the matched entities with one step of
    message passing over the knowledge graph

    matched_entities: dict mapping a text span to the list of global entity
                      ids matched for it
    mention: the span (key of matched_entities) whose entities are activated
             with mention_score instead of 1
    max_triples, offset: passed through to kg.compute_hops
    mention_score: activation weight of the entities of the target mention

    Returns the list of URIs of the (at most 500) highest scoring nodes whose
    accumulated score is above mention_score and not a multiple of it,
    excluding URIs with a 'statement' path segment. The list is empty when
    the extracted subgraph has no predicates.
    '''
    # get all adjacent nodes
    all_ids = [v for vs in matched_entities.values() for v in vs]
    subgraph1 = kg.compute_hops(all_ids, max_triples, offset)

    # prepare matrices for MP
    entity_ids, predicate_ids, adjacencies = subgraph1
    n_entities = len(entity_ids)

    # stays empty when the subgraph has no predicates
    answer_uris = []
    if predicate_ids:
        A = generate_adj_sp(adjacencies, n_entities, include_inverse=True)

        # index entity ids global -> local
        entities_dict = {k: v for v, k in enumerate(entity_ids)}

        # activate matched entities: one row per span, one column per node
        row, col, data = [], [], []
        for i, span in enumerate(matched_entities):
            score = mention_score if span == mention else 1
            for entity_id in matched_entities[span]:
                if entity_id in entities_dict:
                    row.append(i)
                    col.append(entities_dict[entity_id])
                    data.append(score)
        x = sp.csr_matrix((data, (row, col)), shape=(len(matched_entities), n_entities))

        # MP: propagate the activations to the neighbours and add up the
        # contributions of all spans per node
        y = np.asarray((x @ A).sum(axis=0)).ravel()

        top = np.flatnonzero(y > mention_score)
        activations = defaultdict(int)
        if len(top) > 0:
            # set membership instead of scanning the array for every entity
            activated = set(np.asarray(entity_ids)[top].tolist())

            # store the activation values per answer id
            for i, e in enumerate(entity_ids):
                if e in activated:
                    activations[e] += y[i]

        # keep the 500 best scores, then drop exact multiples of mention_score
        ranked = sorted(activations.items(), key=lambda item: item[1], reverse=True)[:500]
        answers_ids = [a_id for a_id, a_score in ranked if a_score % mention_score != 0]

        for a in answers_ids:
            uri = kg.global_id_to_string(a, TripleComponentRole.SUBJECT)
            if uri:
                answer_uris.append(uri)

    # filter out statement nodes e.g.
    # http://www.wikidata.org/entity/statement/Q271189-081D418E-7709-4074-9864-EDD6B4C46601
    answers = [uri for uri in answer_uris if 'statement' not in uri.split('/')]
    print("%d answers found"%len(answers))
    return answers


# number of candidate entities requested from Elasticsearch per mention
top = 20

# load the samples and release the file before the network / graph work starts
with open(data_path) as train_file:
    train_set = json.load(train_file)
print("%d training examples"%len(train_set))

# only the second training sample is processed
for sample in train_set[1:2]:
    print(sample['string'])
    correct_id = sample['correct_id']
    print(correct_id)
    text_doc = sample['text'].strip()
    print(text_doc)
    # get mentions annotated by TagMe
    mentions = tagme_get_all_entities(text_doc)
    print(mentions)

    # link mentions to Wikidata
    top_entities = {}
    candidate_entities = []
    candidate_uris = []
    for m in mentions:
        m = m.lower()
        results = es.search(index=index_name,
                            body={"query": {"multi_match": {"query": m,
                                                            "fields": ["label.ngrams", "label.snowball^20"],
                                                           }}},
                            size=top)['hits']['hits']
        uris = [entity['_source']['uri'] for entity in results]
        ids = [kg.string_to_global_id(uri, TripleComponentRole.OBJECT) for uri in uris]

        if m == sample['string']:
            # candidates for the mention that has to be disambiguated
            candidate_uris.extend(uris)
            candidate_entities.extend(ids)
            # evaluate: check correct entity uri is in the candidate set
            print(candidate_uris)
            print(PREFIX_E+correct_id in candidate_uris)
        else:
            # context mentions keep all their matches
            top_entities[m] = ids

    # NED: try each candidate on its own together with the context mentions
    # and score it by the number of answers it produces
    scores = []
    for uri, c in zip(candidate_uris, candidate_entities):
        print(uri)
        top_entities[sample['string']] = [c]
        result_entities = ned(top_entities, sample['string'])
        print(result_entities)
        scores.append(len(result_entities))

    # evaluate: compare the best scoring candidate with the correct entity id
    if scores:
        print(candidate_uris[np.argmax(scores)])
    else:
        # TagMe did not return the target mention, so there is nothing to rank
        print("no candidates found for the target mention")
    print(correct_id)
    print(scores)

100000 training examples
corfu
Q121378
the city of Corfu Greece, Empress of Austria Elisabeth of Bavaria also known as Sissi built in 1890 a summer palace with Achilles as its central theme and
['the city', 'city of Corfu', 'Corfu', 'Greece', 'Austria', 'Elisabeth of Bavaria', 'Bavaria', 'Sissi', 'summer palace', 'Achilles', 'central', 'theme']
['http://www.wikidata.org/entity/Q205832', 'http://www.wikidata.org/entity/Q238325', 'http://www.wikidata.org/entity/Q121378', 'http://www.wikidata.org/entity/Q3453113', 'http://www.wikidata.org/entity/Q10750017', 'http://www.wikidata.org/entity/Q20464115', 'http://www.wikidata.org/entity/Q1132682', 'http://www.wikidata.org/entity/Q25162044', 'http://www.wikidata.org/entity/Q25168424', 'http://www.wikidata.org/entity/Q14204125', 'http://www.wikidata.org/entity/Q20657766', 'http://www.wikidata.org/entity/Q16322753', 'http://www.wikidata.org/entity/Q5170491', 'http://www.wikidata.org/entity/Q26321630', 'http://www.wikidata.org/entity/Q21555048', '

5 answers found
['http://www.wikidata.org/entity/Q4167410', 'http://www.wikidata.org/entity/Q10750017', 'http://www.wikidata.org/entity/Q14206921', 'http://www.wikidata.org/entity/Q2561172', 'http://www.wikidata.org/entity/Q3693533']
http://www.wikidata.org/entity/Q121378
291 answers found
['http://www.wikidata.org/entity/Q1144233', 'http://www.wikidata.org/entity/Q11794876', 'http://www.wikidata.org/entity/Q205832', 'http://www.wikidata.org/entity/Q12810411', 'http://www.wikidata.org/entity/Q170295', 'http://www.wikidata.org/entity/Q13496', 'http://www.wikidata.org/entity/Q2028', 'http://www.wikidata.org/entity/Q3900', 'http://www.wikidata.org/entity/Q7905902', 'http://www.wikidata.org/entity/Q24262483', 'http://www.wikidata.org/entity/Q333229', 'http://www.wikidata.org/entity/Q12884653', 'http://www.wikidata.org/entity/Q12886041', 'http://www.wikidata.org/entity/Q15135101', 'http://www.wikidata.org/entity/Q15732398', 'http://www.wikidata.org/entity/Q15957071', 'http://www.wikidata.or

6 answers found
['http://www.wikidata.org/entity/Q30', 'http://www.wikidata.org/entity/Q59789566', 'http://www.wikidata.org/entity/Q115014', 'http://www.wikidata.org/entity/Q23020463', 'http://www.wikidata.org/entity/Q8048283', 'http://www.wikidata.org/entity/Q751708']
http://www.wikidata.org/entity/Q10750017
1 answers found
['http://www.wikidata.org/entity/Q238325']
http://www.wikidata.org/entity/Q20464115
2 answers found
['http://www.wikidata.org/entity/Q705551', 'http://www.wikidata.org/entity/Q3305213']
http://www.wikidata.org/entity/Q1132682
6 answers found
['http://www.wikidata.org/entity/Q1491836', 'http://www.wikidata.org/entity/Q145', 'http://www.wikidata.org/entity/Q617597', 'http://www.wikidata.org/entity/Q39804', 'http://www.wikidata.org/entity/Q697196', 'http://www.wikidata.org/entity/Q84']
http://www.wikidata.org/entity/Q25162044
2 answers found
['http://www.wikidata.org/entity/Q41', 'http://www.wikidata.org/entity/Q12875675']
http://www.wikidata.org/entity/Q25168424
2 an