## Usage: 
* Run on the same node as Elasticsearch (for now)
* **Specify input and output paths in the config dictionary below**
* Be patient – the pipeline is very slow (I ran several notebooks in parallel to get the results ready on time; a smarter way to speed it up is planned)

In [2]:
# Paths used by this notebook: DATA_FILE is the file that gets read in
# Step 0.0, OUTPUT_FILE is where the results are written in the last cell.
config = dict(
    DATA_FILE='../HIPE_2020/data/training-v1.1/en/HIPE-data-v1.1-dev-en.tsv',
    OUTPUT_FILE='mp_bundle2_en_2.tsv',
)

In [3]:
# Expose the configured paths as module-level names for the cells below.
DATA_FILE, OUTPUT_FILE = config['DATA_FILE'], config['OUTPUT_FILE']

## Step 0.0: prepare the data

The input data should be in CLEF HIPE format and contain entity mentions

In [3]:
from utils.data_processing import read_data_to_dfs_sentences, merge_dfs

In [4]:
# Parse the file at DATA_FILE with the project helper. Judging by the helper's
# name it yields one dataframe per sentence -- confirm in utils.data_processing.
raw_dfs = read_data_to_dfs_sentences(DATA_FILE)

HBox(children=(FloatProgress(value=0.0, max=33174.0), HTML(value='')))




In [5]:
# Post-process the parsed frames with merge_dfs; `dfs` is the sequence the
# entity-linking loop in Step 3 iterates over.
dfs = merge_dfs(raw_dfs)

In [6]:
# Sanity check: display the first element of `dfs`.
dfs[0]

Unnamed: 0,TOKEN,NE-COARSE-LIT,NE-COARSE-METO,NE-FINE-LIT,NE-FINE-METO,NE-FINE-COMP,NE-NESTED,NEL-LIT,NEL-METO,MISC


## Step 0.1: prepare Elasticsearch for candidate extraction

In [7]:
from elasticsearch import Elasticsearch
# No host is passed, so the client falls back to its default address
# (presumably the local node -- the usage notes ask to run on the same node
# as Elasticsearch). Timed-out requests are retried, up to max_retries times.
es = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)
# Displays cluster/version information; doubles as a connectivity check.
es.info()

{'name': 'ilps-cn007',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'vU7MO42lR2KzA6bPvSP_pA',
 'version': {'number': '7.7.0',
  'build_flavor': 'default',
  'build_type': 'tar',
  'build_hash': '81a1e9eda8e6183f5237786246f6dced26a10eaf',
  'build_date': '2020-05-12T02:01:37.602180Z',
  'build_snapshot': False,
  'lucene_version': '8.5.1',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

## Step 0.2: prepare HDT for message passing

In [8]:
import json
import requests
from collections import defaultdict

import numpy as np
import scipy.sparse as sp

from hdt import HDTDocument, TripleComponentRole

# from settings import *

# NOTE(review): `data_path` and `rdfsLabelURI` are not referenced by any other
# cell visible in this notebook.
data_path = './data/wikidata-disambig-train.json'
rdfsLabelURI = 'http://www.w3.org/2000/01/rdf-schema#label'

# Location of the HDT file. The two parts are joined by plain string
# concatenation below, so `hdt_path` must keep its trailing slash.
hdt_file = 'wikidata20200309.hdt'
hdt_path = '/ivi/ilps/personal/vprovat/'

PREFIX_E = 'http://www.wikidata.org/entity/'
namespace = 'predef-wikidata2020-03-all'
# Open the knowledge graph; `kg` is used as a global by the functions below
# (ned, get_top_candidates_for_all_mentions).
kg = HDTDocument(hdt_path+hdt_file)
# Empty predicate list passed to configure_hops.
predicates_ids = []
# NOTE(review): the meaning of the positional arguments is defined by the hdt
# library's configure_hops -- presumably the leading 1 is the number of hops
# and the empty predicate list means "no predicate filter"; confirm against
# the library documentation.
kg.configure_hops(1, predicates_ids, namespace, True, False)

## Step 1: searching for candidates

In [9]:
from utils.elastic_getters import wikidata_search_, wikidata_search_precise, wikidata_search_fuzzy

In [10]:
def get_candidates(es, entity):
    '''
    Look an entity string up in Elasticsearch and return the retained hits.

    Three searches are run (general, precise, fuzzy); their hits are
    concatenated in the order precise -> general -> fuzzy, de-duplicated on
    the ``label_exact`` field of ``_source`` (first occurrence wins) and then
    thresholded on ``_score``:

    * the reference score is the ``_score`` of the first de-duplicated hit,
      or 20 when that score is missing/zero;
    * a hit with a score is kept only if it exceeds 0.6 * reference score;
    * a hit without a score is always kept (such results are assumed to be
      sorted already).

    Is very much suboptimal and will be updated in the future – now left as
    it is for reproducibility.

    Returns a list of hit dicts; an empty list when nothing is found.
    '''
    # Run the three searches in the same order as before.
    general_hits = wikidata_search_(es, entity)
    exact_hits = wikidata_search_precise(es, entity)
    fuzzy_hits = wikidata_search_fuzzy(es, entity)

    # Drop duplicates, keeping the first hit seen for every exact label.
    unique_hits = []
    seen_labels = set()
    for hit in exact_hits + general_hits + fuzzy_hits:
        label = hit['_source']['label_exact']
        if label in seen_labels:
            continue
        seen_labels.add(label)
        unique_hits.append(hit)

    # Nothing at all was found (happens very rarely).
    if not unique_hits:
        return unique_hits

    # Keep un-scored hits plus those scoring above 60% of the reference score.
    threshold = 0.6 * (unique_hits[0]['_score'] or 20)
    return [
        hit for hit in unique_hits
        if not hit['_score'] or hit['_score'] > threshold
    ]

#### Uncomment the cell below to test candidate search:


In [11]:
# get_candidates(es, 'Portugal')

## Step 2: getting the top candidate using MP

In [12]:
def generate_adj_sp(adjacencies, n_entities, include_inverse):
    '''
    Build a single sparse adjacency matrix from per-predicate edge lists.

    Parameters
    ----------
    adjacencies : iterable
        One edge list per predicate; every edge list is a sequence of
        (row, col) index pairs. Empty edge lists are skipped.
    n_entities : int
        Number of nodes; the result has shape (n_entities, n_entities).
    include_inverse : bool
        When true, every edge (row, col) is also added as (col, row).

    Returns
    -------
    scipy.sparse.csr_matrix
        Entry [r, c] holds the number of edges r -> c summed over all
        predicates (parallel edges accumulate).
    '''
    adj_shape = (n_entities, n_entities)

    rows, cols = [], []
    for edges in adjacencies:
        edge_array = np.asarray(edges)
        # A predicate without edges contributes nothing; unpacking the
        # transpose of an empty array would raise.
        if edge_array.size == 0:
            continue

        # split subject (row) and object (col) node indices
        row, col = edge_array.reshape(-1, 2).T
        rows.append(row)
        cols.append(col)

        # duplicate edges in the opposite direction
        if include_inverse:
            rows.append(col)
            cols.append(row)

    if not rows:
        return sp.csr_matrix(adj_shape)

    # Build the matrix once: duplicate (row, col) entries are summed by the
    # constructor, which matches adding one matrix per predicate but avoids
    # re-allocating the accumulated matrix on every iteration.
    all_rows = np.concatenate(rows)
    all_cols = np.concatenate(cols)
    data = np.ones(len(all_rows))
    return sp.csr_matrix((data, (all_rows, all_cols)), shape=adj_shape)


def ned(matched_entities, mention, max_triples=50000000, offset=0, mention_score=100):
    '''
    Run one round of message passing from the matched entities and return
    the URIs of the nodes that end up activated.

    Parameters
    ----------
    matched_entities : dict
        Maps a span (string) to a list of entity ids in the global `kg`.
    mention : str
        The span being disambiguated. Its entities are activated with
        `mention_score`; entities of every other span with 1.
    max_triples, offset : int
        Passed through to `kg.compute_hops`.
    mention_score : int
        Activation weight of the mention's entities; also the selection
        threshold (see below).

    Returns
    -------
    list of str
        URIs of the (at most 500) highest-scoring nodes whose summed
        activation is greater than `mention_score` and not an exact multiple
        of it, excluding ids that have no URI and URIs containing a
        'statement' path segment. Returns an empty list when the retrieved
        subgraph has no predicates.
    '''
    # get all adjacent nodes
    all_ids = [v for vs in matched_entities.values() for v in vs]
    entity_ids, predicate_ids, adjacencies = kg.compute_hops(all_ids, max_triples, offset)

    # Without predicates there is nothing to propagate over. Return early
    # instead of falling through to code that needs the propagation results.
    if not predicate_ids:
        return []

    # prepare matrices for MP
    n_entities = len(entity_ids)
    A = generate_adj_sp(adjacencies, n_entities, include_inverse=True)

    # index entity ids global -> local
    entities_dict = {k: v for v, k in enumerate(entity_ids)}

    # activate matched entities: one row per span, one column per node
    row, col, data = [], [], []
    for i, span in enumerate(matched_entities):
        score = mention_score if span == mention else 1
        for entity_id in matched_entities[span]:
            if entity_id in entities_dict:
                row.append(i)
                col.append(entities_dict[entity_id])
                data.append(score)
    x = sp.csr_matrix((data, (row, col)), shape=(len(matched_entities), n_entities))

    # MP: propagate one step and add up the activation each node receives
    y = np.asarray((x @ A).sum(axis=0)).ravel()

    # nodes whose total activation exceeds the mention weight
    top = np.flatnonzero(y > mention_score)
    activations = defaultdict(int)
    if len(top) > 0:
        # set membership keeps the scan over entity_ids linear
        activated_ids = set(np.asarray(entity_ids)[top].tolist())

        # store the activation values per answer id
        for i, e in enumerate(entity_ids):
            if e in activated_ids:
                activations[e] += y[i]

    # rank by activation, keep the best 500, and drop scores that are an
    # exact multiple of mention_score
    ranked = sorted(activations.items(), key=lambda item: item[1], reverse=True)[:500]
    answers_ids = [a_id for a_id, a_score in ranked if a_score % mention_score != 0]

    answers = []
    for a_id in answers_ids:
        uri = kg.global_id_to_string(a_id, TripleComponentRole.SUBJECT)
        # skip ids without a URI and statement nodes, e.g.
        # http://www.wikidata.org/entity/statement/Q271189-081D418E-7709-4074-9864-EDD6B4C46601
        if uri and 'statement' not in uri.split('/'):
            answers.append(uri)
    return answers

In [13]:
import string

# Translation table that deletes every character of string.punctuation.
table = str.maketrans('', '', string.punctuation)


def remove_punctuation(s):
    '''
    Removing punctuation.

    Deletes all characters listed in string.punctuation from `s`, then
    collapses runs of whitespace into single spaces and trims both ends.
    '''
    without_punctuation = s.translate(table)
    # split() with no arguments discards leading/trailing/repeated whitespace
    return ' '.join(without_punctuation.split())

# Translation table mapping every character of string.punctuation, plus the
# typographic apostrophe ’, to a space.
translator = dict.fromkeys(map(ord, string.punctuation + '’'), ord(' '))


def replace_punctuation_with_spaces(s):
    '''
    Replacing punctuation with whitespaces - same as above basically.

    Every punctuation character (including ’) becomes a space; afterwards
    runs of whitespace are collapsed into single spaces and both ends are
    trimmed.
    '''
    spaced = s.translate(translator)
    # split() with no arguments discards leading/trailing/repeated whitespace
    return ' '.join(spaced.split())

In [14]:
from utils.data_processing import df_to_sentence, extract_entity_mentions

In [15]:
def get_top_candidates_for_all_mentions(mentions, es, top=10, sentence_df=None):
    '''
    Collect the top Elasticsearch candidates for every mention of a sentence.

    Parameters
    ----------
    mentions : iterable of (pos, raw_label)
        Start row of the mention in the sentence frame and its raw label;
        the number of space-separated words in the label determines how
        many rows the mention spans.
    es :
        Elasticsearch client, handed to `get_candidates`.
    top : int
        Maximum number of candidates kept per mention.
    sentence_df :
        The sentence frame the mentions were extracted from. When omitted,
        the notebook-global `df` is used, which is what existing callers
        rely on; pass it explicitly to avoid depending on that hidden state.

    Returns
    -------
    (top_entities, top_candidate_uris) : tuple of dict
        Both are keyed by the lower-cased mention label. `top_entities`
        maps to the candidates' ids in `kg`, `top_candidate_uris` to the
        candidates' URIs, in the same order.
    '''
    if sentence_df is None:
        # backward-compatible fallback to the loop variable of the notebook
        sentence_df = df

    top_entities = {}  # context entities
    top_candidate_uris = {}
    for pos, raw_label in mentions:
        num_words = len(raw_label.split(" "))
        # rebuild the label from the rows, in case it has word wrapping
        label = df_to_sentence(sentence_df[pos:pos+num_words])
        m = label.lower()
        results = get_candidates(es, m)[:top]

        entity_uris = [entity['_source']['uri'] for entity in results]
        entity_ids = [
            kg.string_to_global_id(entity_uri, TripleComponentRole.OBJECT)
            for entity_uri in entity_uris
        ]
        top_entities[m] = entity_ids
        top_candidate_uris[m] = entity_uris

    return top_entities, top_candidate_uris

In [16]:
def link_one_mention(mention, cur_candidates, cur_uris, top_entities_):
    '''
    Pick the best candidate for one mention using message passing.

    Each candidate id in `cur_candidates` is tried in turn: a copy of
    `top_entities_` gets the mention bound to that single candidate, `ned`
    is run on it, and the number of returned answers is the candidate's
    score. The candidate with the highest score wins (the first one on
    ties).

    Returns the last path segment of the winning entry of `cur_uris`, or
    'NIL' when no winner can be determined (e.g. there are no candidates).
    The input dict `top_entities_` is not modified.
    '''
    context = top_entities_.copy()
    answer_counts = []
    for candidate_id in cur_candidates:
        context[mention] = [candidate_id]  # entity id in HDT
        answer_counts.append(len(ned(context, mention)))

    try:
        best_uri = cur_uris[np.argmax(answer_counts)]
        return best_uri.split('/')[-1]
    except Exception:
        return 'NIL'

## Step 3: performing entity linking for all sentences in our dataset

In [17]:
from tqdm.auto import tqdm

In [18]:
# Link every mention of every sentence frame and collect the annotated copies.
dfs_with_links = []
# NOTE: the loop variable has to stay named `df` -- when called without an
# explicit frame, get_top_candidates_for_all_mentions reads the global `df`.
for df in tqdm(dfs):
    mentions = extract_entity_mentions(df)
    all_candidates, candidate_uris = get_top_candidates_for_all_mentions(mentions, es)

    # Entity linking starts:
    df_with_links = df.copy()
    for pos, raw_label in mentions:
        num_words = len(raw_label.split(" "))
        # rebuilt from the rows in case the label has word wrapping too
        label = df_to_sentence(df[pos:pos+num_words])
        key = label.lower()

        answer = link_one_mention(key, all_candidates[key], candidate_uris[key],
                                  all_candidates)

        # Adding the links to every token row of the mention. A single .loc
        # call writes into the frame itself; chained indexing
        # (frame[col][row] = ...) may write to a temporary copy instead.
        for cur_pos in range(pos, pos + num_words):
            df_with_links.loc[cur_pos, ['NEL-LIT', 'NEL-METO']] = answer

    dfs_with_links.append(df_with_links)

HBox(children=(FloatProgress(value=0.0, max=1084.0), HTML(value='')))




In [19]:
# Inspect the annotated frame left over from the final iteration of the loop above.
df_with_links

Unnamed: 0,TOKEN,NE-COARSE-LIT,NE-COARSE-METO,NE-FINE-LIT,NE-FINE-METO,NE-FINE-COMP,NE-NESTED,NEL-LIT,NEL-METO,MISC
0,But,O,O,O,O,O,O,_,_,_
1,nothing,O,O,O,O,O,O,_,_,_
2,can,O,O,O,O,O,O,_,_,_
3,measure,O,O,O,O,O,O,_,_,_
4,the,O,O,O,O,O,O,_,_,_
5,spir,O,O,O,O,O,O,_,_,NoSpaceAfter
6,¬,O,O,O,O,O,O,_,_,EndOfLine|NoSpaceAfter
7,itual,O,O,O,O,O,O,_,_,_
8,power,O,O,O,O,O,O,_,_,_
9,for,O,O,O,O,O,O,_,_,_


## Step 4: saving the results

In [20]:
from utils.data_processing import write_results

In [21]:
# Hand all annotated sentence frames to the project helper together with the
# configured output path.
write_results(dfs_with_links, OUTPUT_FILE)