## Usage: 
* Run on the same node as Elasticsearch (for now)
* **Specify input and output paths in the config dictionary below**
* Relax and hope for the best – it's terribly slow (I used parallel notebooks to get the solution ready on time; a smarter way to speed it up will be used in the future)

In [32]:
# Input/output locations for this run; edit these two paths before executing the notebook.
config = {
    # Input TSV; per the notes below it should be in CLEF HIPE format and contain entity mentions.
    'DATA_FILE': 'data_HIPE/training-v1.1/en/HIPE-data-v1.1-dev-en.tsv',
    # Destination passed to write_results at the end of the notebook.
    'OUTPUT_FILE': 'submissions/UvA.ILPS_bundle5_EN_DEV.tsv'
}

In [33]:
# Expose the configured paths as module-level names used by the later cells.
DATA_FILE=config['DATA_FILE']
OUTPUT_FILE=config['OUTPUT_FILE']

## Step 0.0: prepare the data

The input data should be in CLEF HIPE format and contain entity mentions

In [34]:
from utils.data_processing import read_data_to_dfs_sentences, merge_dfs

In [35]:
# Load the input file; the result is handed to merge_dfs in the next cell
# (presumably a collection of per-sentence DataFrames -- confirm in utils.data_processing).
raw_dfs = read_data_to_dfs_sentences(DATA_FILE)

HBox(children=(IntProgress(value=0, max=33174), HTML(value='')))

In [36]:
# Merge the raw frames into `dfs`, the sequence the entity-linking loop iterates over
# (the merge criteria live in utils.data_processing.merge_dfs and are not visible here).
dfs = merge_dfs(raw_dfs)

## Step 0.1: prepare the embeddings

In [9]:
from flair.embeddings import StackedEmbeddings, FlairEmbeddings, TransformerWordEmbeddings

# Word-level embeddings: the forward and backward "en-impresso-hipe-v1" Flair models
# stacked together with "bert-large-cased" transformer embeddings.
en_embeddings = StackedEmbeddings([FlairEmbeddings("en-impresso-hipe-v1-forward"), 
                                   FlairEmbeddings("en-impresso-hipe-v1-backward"),
                                   TransformerWordEmbeddings("bert-large-cased")])


In [10]:
from flair.data import Sentence, segtok_tokenizer


In [12]:
from flair.embeddings import DocumentRNNEmbeddings

import torch

# DocumentRNNEmbeddings creates a freshly (randomly) initialised LSTM, and nothing in
# the visible notebook trains it. The document vectors -- and therefore every candidate
# score and ranking derived from them -- depend on torch's RNG state at construction
# time. Seed torch first so that Restart & Run All reproduces the same output.
EMBEDDING_SEED = 42
torch.manual_seed(EMBEDDING_SEED)

# document embedding is an LSTM over stacked word embeddings
lstm_embeddings_en = DocumentRNNEmbeddings([en_embeddings], rnn_type='lstm')

## Step 0.2: prepare Elasticsearch for candidate extraction

In [13]:
from elasticsearch import Elasticsearch
# Connect without specifying hosts, i.e. using the client's default address (the usage
# notes at the top require running on the same node as Elasticsearch). Requests use
# timeout=30 and are retried up to 10 times, including after timeouts.
es = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)
# Fetch cluster info as a connectivity check; the result is shown as the cell output.
es.info()

{'name': 'ilps-cn007',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'vU7MO42lR2KzA6bPvSP_pA',
 'version': {'number': '7.7.0',
  'build_flavor': 'default',
  'build_type': 'tar',
  'build_hash': '81a1e9eda8e6183f5237786246f6dced26a10eaf',
  'build_date': '2020-05-12T02:01:37.602180Z',
  'build_snapshot': False,
  'lucene_version': '8.5.1',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

## Step 1: searching for candidates

In [14]:
from utils.elastic_getters import wikidata_search_, wikidata_search_precise, wikidata_search_fuzzy

In [15]:
def combine_variants(cur_sequence, cur_len, target_len, remaining_variants_per_word):
    '''
    Builds every full spelling of a multi-word entity from per-word spelling variants.

    Starting from the already assembled prefix, one word position is consumed per step
    and each partial spelling is extended with every variant of that word. The result
    is ordered with the earliest word varying slowest.

    Params:
        cur_sequence: string holding the words assembled so far (space-separated)
        cur_len: number of words already contained in cur_sequence
        target_len: total number of words in the entity
        remaining_variants_per_word: list of lists; entry k holds the spelling
            variants of the k-th word that has not been processed yet

    Returns:
        list of strings, one per spelling of the whole entity, with outer
        whitespace stripped
    '''
    partial_spellings = [cur_sequence]
    words_done = cur_len
    pending = remaining_variants_per_word

    while words_done != target_len:
        options = pending[0]
        partial_spellings = [prefix + ' ' + option
                             for prefix in partial_spellings
                             for option in options]
        if not partial_spellings:
            # a word without any variant leaves nothing to extend
            return []
        words_done += 1
        pending = pending[1:]

    return [spelling.strip() for spelling in partial_spellings]
    
                       

In [16]:
import natas
from fuzzywuzzy import fuzz

def gen_word_variants(entity):
    '''
    Produces spelling variants of a (possibly multi-word) entity.

    The entity is split on single spaces, each word is passed to
    natas.normalize_words (called with n_best=3), and the original word is always
    kept as one of its own variants. The per-word variants are then combined into
    full entity strings by combine_variants.

    Returns a list of strings, one per combined spelling.
    '''
    tokens = [piece.strip() for piece in entity.split(' ') if piece]
    suggestions_per_token = natas.normalize_words(tokens, n_best=3)

    options_per_token = []
    for idx, suggestions in enumerate(suggestions_per_token):
        original = tokens[idx]
        # fall back to the word itself when natas proposes nothing
        options = suggestions or [original]
        if original not in options:
            options.append(original)
        options_per_token.append(options)

    assert len(options_per_token) == len(tokens)

    return combine_variants('', 0, len(tokens), options_per_token)
            

In [17]:
def get_candidates(es, entity):
    '''
    Collects candidate Wikidata entries for an entity mention from Elasticsearch.
    Known to be suboptimal; the logic is deliberately kept as it is for reproducibility.

    Steps:
        1. run the plain, precise and fuzzy searches on the mention itself;
        2. mentions of fewer than 4 words: precise search on every spelling variant;
        3. mentions of more than 4 words without a precise hit: fuzzy search on the
           spelling variants of the first two words;
        4. drop repeated entries (same `label_exact`), keeping the first occurrence;
        5. keep hits scoring above 60% of the first hit's score; hits without a
           score are always kept.

    Returns a list of Elasticsearch hit dicts (empty if nothing was found).
    '''
    hits = wikidata_search_(es, entity)
    hits_exact = wikidata_search_precise(es, entity)
    hits_fuzzy = wikidata_search_fuzzy(es, entity)

    word_count = len(entity.split(' '))

    # historical spelling variants
    hits_variants = []
    if word_count < 4:  # short entities only, otherwise it takes forever
        for spelling in gen_word_variants(entity):
            hits_variants.extend(wikidata_search_precise(es, spelling))

    if word_count > 4 and not hits_exact:  # very long entity: retry with its first 2 words
        head = " ".join(entity.split(' ')[:2])
        for spelling in gen_word_variants(head):
            hits_variants.extend(wikidata_search_fuzzy(es, spelling))

    merged = hits_exact + hits + hits_fuzzy + hits_variants

    # de-duplicate on the `label_exact` field, first occurrence wins
    unique_hits = []
    seen_ids = set()
    for hit in merged:
        qid = hit['_source']['label_exact']
        if qid in seen_ids:
            continue
        seen_ids.add(qid)
        unique_hits.append(hit)

    if not unique_hits:
        return unique_hits  # nothing at all was found (happens very rarely)

    # relevance cut-off relative to the first hit; 20 stands in when it has no score
    top_score = unique_hits[0]['_score'] or 20
    cutoff = 0.6 * top_score

    # a missing score means the results were sorted already, so such hits are kept
    return [hit for hit in unique_hits
            if not hit['_score'] or hit['_score'] > cutoff]

#### Uncomment the cell below to test candidate search:


In [18]:
# get_candidates(es, 'Portugal')

[{'_index': 'wikidata_clef',
  '_type': '_doc',
  '_id': 'eRo5LXIByThoYigYu24R',
  '_score': None,
  '_source': {'uri': 'http://www.wikidata.org/entity/Q45',
   'label': 'portugal',
   'count': 140530,
   'id': 36166617,
   'label_exact': 'Q45'},
  'sort': [140530]},
 {'_index': 'wikidata_clef',
  '_type': '_doc',
  '_id': 'B7XRLHIByThoYigY1BFA',
  '_score': None,
  '_source': {'uri': 'http://www.wikidata.org/entity/Q14110517',
   'label': 'portugal',
   'count': 1161,
   'id': 4043769,
   'label_exact': 'Q14110517'},
  'sort': [1161]},
 {'_index': 'wikidata_clef',
  '_type': '_doc',
  '_id': 'KIqyLXIByThoYigYwOhW',
  '_score': None,
  '_source': {'uri': 'http://www.wikidata.org/entity/Q7232531',
   'label': 'portugal',
   'count': 267,
   'id': 64240553,
   'label_exact': 'Q7232531'},
  'sort': [267]},
 {'_index': 'wikidata_clef',
  '_type': '_doc',
  '_id': 'I-glLXIByThoYigY4VSE',
  '_score': None,
  '_source': {'uri': 'http://www.wikidata.org/entity/Q415558',
   'label': 'portugal',

## Step 2: ranking the candidates using their wikidata descriptions

In [19]:
from utils.elastic_getters import wikidata_get_description

In [20]:
import requests
def get_description(es, Q='Q14110517', lang='en'):
    '''
    Looks up the description of item `Q` through wikidata_get_description.

    Returns the 'description' field of the first hit, or an empty string when
    the lookup yields nothing. `lang` is accepted but not used by this function.
    '''
    hits = wikidata_get_description(es, Q)
    if not hits:
        return ''
    return hits[0]['_source']['description']

In [22]:
'''
Removing punctuation
'''

import string
# Translation table that deletes every character of string.punctuation
table = str.maketrans('', '', string.punctuation)


def remove_punctuation(s):
    '''
    Returns `s` with all string.punctuation characters deleted and every run of
    whitespace collapsed to a single space (no leading or trailing whitespace).
    '''
    without_punct = s.translate(table)
    # split()/join also takes care of extra whitespace left behind
    return ' '.join(without_punct.split())

In [23]:
'''
Replacing punctuation with whitespaces - same as above basically
'''

# Translation table mapping every string.punctuation character, plus the
# typographic apostrophe ’, to a space
translator = {ord(symbol): ord(' ') for symbol in string.punctuation + '’'}


def replace_punctuation_with_spaces(s):
    '''
    Returns `s` with each punctuation character (string.punctuation and ’) turned
    into a space, then every run of whitespace collapsed to a single space
    (no leading or trailing whitespace).
    '''
    spaced_out = s.translate(translator)
    # split()/join also takes care of extra whitespace left behind
    return ' '.join(spaced_out.split())

In [26]:
'''
Calculating Levenstein similarity between strings
'''

from fuzzywuzzy import fuzz

def lev_similarity(s1, s2):
    '''
    Case-insensitive string similarity: fuzzywuzzy's fuzz.ratio of the two
    lower-cased strings, divided by 100.
    '''
    return fuzz.ratio(s1.lower(), s2.lower()) / 100

In [27]:
import math
from torch.nn import CosineSimilarity
cos = CosineSimilarity(dim=1)

# Score given to the NIL pseudo-candidate: candidates scoring below it rank after NIL.
THRESHOLD = 0.7


def _embed_words(words, embeddings):
    '''Embeds the space-joined `words` and returns the vector with a leading batch dimension.'''
    snippet = Sentence(" ".join(words))
    embeddings.embed(snippet)
    return snippet.embedding.unsqueeze(0)


def _insert_nil(ranked, threshold):
    '''
    Returns a copy of the score-descending list `ranked` with ('NIL', threshold, "")
    placed directly before the first entry scoring below `threshold`, or at the very
    end when no entry does (which includes an empty list).
    '''
    nil_entry = ('NIL', threshold, "")
    for idx, (_, score, _) in enumerate(ranked):
        if score < threshold:
            return ranked[:idx] + [nil_entry] + ranked[idx:]
    return ranked + [nil_entry]


def get_similarities_replacing(sentence, entity, start_pos, es, embeddings=lstm_embeddings_en,
                               window_size=5, candidate_window_size=5, lang='en'):
    '''
    Takes a sentence and an entity mention inside it, returns a ranked list of candidates.

    The context window around the mention is embedded once. For every candidate
    returned by get_candidates, the mention is replaced by the first words of the
    candidate's description (its label if there is no description), the window is
    embedded again and compared to the original via cosine similarity. The
    similarity is weighted by the Levenshtein ratio between candidate label and
    mention and by a small preference for shorter Q ids.

    Params:
        sentence: a sentence containing the entity mention; string
        entity: the mentioned entity; string
        start_pos: number of the first word of the entity in the sentence
        es: Elasticsearch client
        embeddings: contextualised document embeddings from Flair
        window_size: number of words of context taken on each side of the entity
            (window_size=5 -> up to 5 words to the left and 5 to the right)
        candidate_window_size: maximum number of description words inserted
            into the sentence in place of the entity
        lang: forwarded to get_description; only English is supported for now

    Returns:
        list of (Q id, score, label + description snippet) tuples sorted by
        descending score. It always contains exactly one ('NIL', THRESHOLD, "")
        entry, placed before the first candidate scoring below THRESHOLD, or last
        if there is none. With no candidates at all the result is just the NIL entry.
    '''
    # Embedding the part of the sentence surrounding the entity.
    # NOTE(review): punctuation is deleted from the sentence but replaced by spaces
    # in the entity, so start_pos/end_pos may not line up with `sent` when
    # punctuation occurs before or inside the mention. Left unchanged; verify
    # against how the caller computes start_pos.
    sent = remove_punctuation(sentence).split(' ')
    entity = replace_punctuation_with_spaces(entity)

    end_pos = start_pos + len(entity.split(' '))

    target_left = sent[max(0, start_pos - window_size):start_pos]
    target_right = sent[end_pos:end_pos + window_size]  # slicing clamps at the sentence end

    target_vector = _embed_words(target_left + sent[start_pos:end_pos] + target_right, embeddings)

    # Substituting candidate descriptions and calculating their scores:
    res = []
    for candidate in get_candidates(es, entity):
        Q = candidate['_source']['label_exact']
        label = candidate['_source']['label']
        desc_raw = get_description(es, Q, lang)
        if not desc_raw:
            desc_raw = label  # no description available: fall back to the label

        desc = remove_punctuation(desc_raw).split(' ')
        desc_context = desc[:candidate_window_size]

        # Same window, with the mention swapped for the description snippet
        candidate_vector = _embed_words(target_left + desc_context + target_right, embeddings)

        distance_context = cos(target_vector, candidate_vector).item()

        w_short = 0.53 + math.sqrt(1 / math.log(100 + len(Q)))  # a heuristic to ever-so-slightly prefer shorter Q labels
        w_lev = lev_similarity(label, entity)  # Levenshtein ratio used as a weight
        distance = distance_context * w_lev * w_short

        res.append((Q, distance, label + ' ' + " ".join(desc_context)))

    # Drop exact duplicates while keeping candidate order (a set would make the
    # order of equally scored candidates depend on hashing), then sort by score;
    # sorted() is stable, so ties keep the candidate order.
    ranked = sorted(dict.fromkeys(res), key=lambda item: -item[1])

    # Now, adding NIL -- always, so that callers never end up with an empty answer:
    return _insert_nil(ranked, THRESHOLD)

#### Uncomment the cell below to test candidate ranking:


In [28]:
# get_similarities_replacing("We went to England for a business trip", "England", 3, es)

[('Q21', 0.920724974151172, 'england country in northwest Europe part'),
 ('Q20398466', 0.8923825737585285, 'england painting by Richard Ansdell'),
 ('Q17629554', 0.891240048642801, 'england album'),
 ('Q79282', 0.8849781282991566, 'england city in Lonoke County Arkansas'),
 ('Q27881912', 0.8774320394566534, 'england painting by Thomas Creswick'),
 ('Q2131751', 0.8757769562449264, 'england British progressive rock band'),
 ('Q257294', 0.8716176641365393, 'england Wikimedia disambiguation page'),
 ('Q11111401', 0.8612603645698638, 'england family name'),
 ('Q17653976', 0.836010612710725, 'england Wikinews article'),
 ('Q62061800', 0.825158486056637, 'england manuscript map drawn by Erwin'),
 ('NIL', 0.7, ''),
 ('Q138046', 0.5779662168727834, 'england  england novel'),
 ('Q60410910', 0.4829642873290349, ' england  england  1998 edition'),
 ('Q5377954',
  0.46817721307320653,
  'england  my england 1995 film by Tony Palmer'),
 ('Q5377956',
  0.4199796295839431,
  'england  their england b

## Step 4: performing entity linking for all sentences in our dataset

In [29]:
from tqdm.auto import tqdm

In [30]:
from utils.data_processing import df_to_sentence, extract_entity_mentions

In [37]:
# Link every entity mention of every sentence and store the answers in copies of the
# input frames (the originals in `dfs` are left untouched).
dfs_with_links = []
for df in tqdm(dfs):
    # Handling word wrapping and NoSpaceAfter flags while preserving the data format:
    sentence = df_to_sentence(df)
    mentions = extract_entity_mentions(df)
    misc_flags = df['MISC'].tolist()  # same for every mention of this sentence

    # Entity linking starts:
    df_with_links = df.copy()
    for pos, raw_label in mentions:
        num_words = len(raw_label.split(" "))
        label = df_to_sentence(df[pos:pos+num_words])  # in case the label has word wrapping too

        # Convert the row position into a word position within `sentence` by
        # subtracting the rows flagged NoSpaceAfter that precede the mention
        # (presumably df_to_sentence glues those tokens to their successor -- confirm).
        num_removed_spaces = sum(1 for flag in misc_flags[:pos] if flag == 'NoSpaceAfter')
        pos_in_sentence = pos - num_removed_spaces

        res = get_similarities_replacing(sentence, label, pos_in_sentence, es)

        # Keep at most the five best-ranked ids, joined with '|'
        found_qs = [item[0] for item in res]
        answer = "|".join(found_qs[:5])

        # Adding the links. `.loc` writes into df_with_links itself; chained
        # indexing (df['col'][row] = ...) may write to a temporary copy instead and
        # does nothing at all under pandas Copy-on-Write.
        for cur_pos in range(pos, pos + num_words):
            df_with_links.loc[cur_pos, 'NEL-LIT'] = answer
            df_with_links.loc[cur_pos, 'NEL-METO'] = answer

    dfs_with_links.append(df_with_links)

HBox(children=(IntProgress(value=0, max=1084), HTML(value='')))

In [38]:
# Display the first linked frame as a quick sanity check of the NEL-LIT / NEL-METO columns.
dfs_with_links[0]

Unnamed: 0,TOKEN,NE-COARSE-LIT,NE-COARSE-METO,NE-FINE-LIT,NE-FINE-METO,NE-FINE-COMP,NE-NESTED,NEL-LIT,NEL-METO,MISC


## Step 5: saving the results

In [39]:
from utils.data_processing import write_results

In [40]:
# Write all linked frames to OUTPUT_FILE (the output format is defined by
# utils.data_processing.write_results, which is not visible here).
write_results(dfs_with_links, OUTPUT_FILE)