# Annotation tool

This tool is used to annotate links. The tool is configured to annotate fb15k and dbpedia.

First, specify the paths of the test files

In [20]:
import pickle as pkl
import numpy
import torch
import json
import ipywidgets as widgets
from ipywidgets import interact, Layout, ButtonStyle
from IPython.display import Markdown
import requests
from time import sleep
import random
import datetime
import os
import urllib
from functools import cmp_to_key

In [21]:
main_folder = '/home/jurbani/data2/binary-embeddings/dbpedia50/'
annotations_file = main_folder + "annotations/gold-annotations.json"
testdata_raw_path = '/home/uji300/OpenKE/benchmarks/dbpedia50/test2id.txt'
unmesh_annotations_path = '/var/scratch2/uji300/ijcai2021/binary-embeddings/dbpedia50/misc/annotations_unmesh_dbpedia50.json'
annotator = 'U'
#main_folder = '/Users/jacopo/Desktop/binary-embeddings/dbpedia50/'
# The file where all the annotations are stored
#annotations_file = '/Users/jacopo/Desktop/binary-embeddings/dbpedia50/annotations/gold-annotations.json'
#testdata_raw_path = '/Users/jacopo/prj/OpenKE-unmesh/benchmarks/dbpedia50/test2id.txt'
#unmesh_annotations_path = '/Users/jacopo/Desktop/binary-embeddings/dbpedia50/annotations/gold-annotations-unmesh.json'
#annotator = 'J'

In [22]:
testdata_folder = main_folder + 'answers/'
testdata_transe_path_head = testdata_folder + 'dbpedia50-answers-transe-test-10-head.pkl'
testdata_transe_path_tail = testdata_folder + 'dbpedia50-answers-transe-test-10-tail.pkl'
testdata_complex_path_head = testdata_folder + 'dbpedia50-answers-complex-test-10-head.pkl'
testdata_complex_path_tail = testdata_folder + 'dbpedia50-answers-complex-test-10-tail.pkl'
testdata_rotate_path_head = testdata_folder + 'dbpedia50-answers-rotate-test-10-head.pkl'
testdata_rotate_path_tail = testdata_folder + 'dbpedia50-answers-rotate-test-10-tail.pkl'

Load the dictionary

In [23]:
ent_labels_path = '/home/uji300/OpenKE/benchmarks/dbpedia50/entity2id.txt'
rel_labels_path = '/home/uji300/OpenKE/benchmarks/dbpedia50/relation2id.txt'

In [24]:
ent_labels = {}
with open(ent_labels_path, 'rt') as f:
    nents = int(f.readline())
    for line in f:
        tkns = line.split('\t')
        ent_labels[int(tkns[1])] = tkns[0]
    assert(len(ent_labels) == nents)
rel_labels = {}
with open(rel_labels_path, 'rt') as f:
    nrels = int(f.readline())
    for line in f:
        tkns = line.split('\t')
        rel_labels[int(tkns[1])] = tkns[0]
    assert(len(rel_labels) == nrels)

Load the raw test triples

In [25]:
raw_test_triples = set()
with open(testdata_raw_path, 'rt') as f:
    nfacts = int(f.readline())
    for l in f:
        tkns = l.split(' ')
        h = int(tkns[0])
        t = int(tkns[1])
        r = int(tkns[2])
        raw_test_triples.add((h, t, r))

Load the test files

In [26]:
with open(testdata_transe_path_head, 'rb') as fin:
    testdata_transe_head = pkl.load(fin) # json.load(open(testdata_transe_path, 'rt'))
with open(testdata_transe_path_tail, 'rb') as fin:
    testdata_transe_tail = pkl.load(fin)
with open(testdata_complex_path_head, 'rb') as fin:
    testdata_complex_head = pkl.load(fin) # json.load(open(testdata_complex_path, 'rt'))
with open(testdata_complex_path_tail, 'rb') as fin:
    testdata_complex_tail = pkl.load(fin)
with open(testdata_rotate_path_head, 'rb') as fin:
    testdata_rotate_head = pkl.load(fin) # json.load(open(testdata_rotate_path, 'rt'))
with open(testdata_rotate_path_tail, 'rb') as fin:
    testdata_rotate_tail = pkl.load(fin)

Compute all the head and tail queries

In [27]:
queries_tail = {}
for name, testset in [("transe", testdata_transe_tail), ("complex", testdata_complex_tail), ("rotate", testdata_rotate_tail)]:
    for t in testset:
        ent = t['ent']
        rel = t['rel']
        if (ent, rel) in queries_tail:
            answers = queries_tail[(ent, rel)]
            if name not in answers:
                answers[name] = t['answers_fil']
        else:
            a = { name : t['answers_fil'] }            
            queries_tail[(ent, rel)] = a

queries_head = {}
for name, testset in [("transe", testdata_transe_head), ("complex", testdata_complex_head), ("rotate", testdata_rotate_head)]:
    for t in testset:
        ent = t['ent']
        rel = t['rel']    
        if (ent, rel) in queries_head:
            answers = queries_head[(ent, rel)]
            if name not in answers:
                answers[name] = t['answers_fil']
        else:
            a = { name : t['answers_fil'] }           
            queries_head[(ent, rel)] = a

Copy all the queries into a single list. Also, load all the queries annotated by Unmesh

In [29]:
queries = []
counter = 0
for q, a in queries_head.items():
    queries.append({'id': counter, 'type': 0, 'ent' : q[0], 'rel' : q[1], 'answers' : a})
    counter += 1    
for q, a in queries_tail.items():
    queries.append({'id': counter, 'type': 1, 'ent' : q[0], 'rel' : q[1], 'answers' : a})    
    counter += 1
    
if os.path.exists(unmesh_annotations_path):
    print(unmesh_annotations_path)
    unmesh_queries = json.load(open(unmesh_annotations_path))
else:
    unmesh_queries = []
    
#unmesh_queries = []
#if os.path.exists(unmesh_annotations_path):
#    unmesh_queries_by_id = json.load(open(unmesh_annotations_path))
#    for key, q in unmesh_queries_by_id.items():
#        unmesh_queries.append(q)
unmesh_queries_by_query_id = {}
for q in unmesh_queries:
    query = q['query']
    # Give the same ID used for all queries
    found = False
    for q2 in queries:
        if q2['type'] == query['type'] and q2['ent'] == query['ent'] and q2['rel'] == query['rel']:
            found = True
            query['id'] = q2['id']
            # Compare the annotated answers by Unmesh and the total numbers.
            unmesh_answers = q['annotated_answers']
            total_answers = q2['answers']
            col_answers = set()
            for method, ans in total_answers.items():
                for a in ans:
                    #col_answers.add(a)
                    col_answers.add(a['entity_id'])
            overlap = 0
            for a in unmesh_answers:
                if a['entity_id'] in col_answers:
                    overlap += 1
            q['not_annotated_answers'] = len(col_answers) - overlap
            assert(len(col_answers) - overlap >= 0)
            break    
    assert(found == True)
    unmesh_queries_by_query_id[query['id']] = q
#random.shuffle(unmesh_queries)
unmesh_queries = sorted(unmesh_queries, key=cmp_to_key(lambda i1, i2: i1['not_annotated_answers'] - i2['not_annotated_answers']))

print("# Queries:", len(queries))
print("# Queries annotated by Unmesh:", len(unmesh_queries))

/var/scratch2/uji300/ijcai2021/binary-embeddings/dbpedia50/misc/annotations_unmesh_dbpedia50.json
# Queries: 3411
# Queries annotated by Unmesh: 600


Global data structures

In [30]:
out = widgets.Output(layout={'padding': '5px', 'border': '1px solid black'})
array_answers = []
valid_annotations = True
current_query_id = None
processed_queries = {}
if os.path.exists(annotations_file):
    print("Loading annotations from file", annotations_file)
    processed_queries = json.load(open(annotations_file, 'rt'))
    new_processed_queries = {}
    for k, v in processed_queries.items():
        new_processed_queries[int(k)] = v
    processed_queries = new_processed_queries
print("# Processed Queries:", len(processed_queries))

Loading annotations from file /home/jurbani/data2/binary-embeddings/dbpedia50/annotations/gold-annotations.json
# Processed Queries: 110


### Auxiliary functions

In [31]:
def pick_next_query():
    global current_query_id
    global unmesh_queries
    if len(processed_queries) < len(queries):
        # First select the first query by Unmesh that has not yet been processed
        found = False
        for i, q in enumerate(unmesh_queries):
            if q['query']['id'] not in processed_queries:
                found = True
                idx = q['query']['id']
                break
        #print("Select a random query that is not yet processed ...")
        while not found:
            idx = random.randint(0, len(queries) - 1)
            if idx not in processed_queries:
                break
        current_query_id = idx
        return True        
    else:
        return None

In [32]:
def on_change_checkbox(b):
    owner = b['owner']
    desc = owner.description
    id_answer = int(owner.description[0:desc.find('.')])
    value = b['new']
    if value is True:
        array_answers[id_answer]['checked']= True
    else:
        array_answers[id_answer]['checked']= False

In [33]:
def dump_on_file():
    # First check if the file exist
    if os.path.exists(annotations_file):
        now = str(datetime.datetime.now())
        old_file = annotations_file + '-' + now
        os.rename(annotations_file, old_file)
    json.dump(processed_queries, open(annotations_file, 'wt'), indent = 6)

In [34]:
def on_click_button(b):
    global processed_queries
    global current_query_id
    global valid_annotations
    
    out.clear_output()
    # Store the annotation
    query = queries[current_query_id]
    print(array_answers)
    processed_queries[current_query_id] = {'query' : query, 'valid_annotations' : valid_annotations, 'annotated_answers' : array_answers, 'annotator' : annotator, 'date': str(datetime.datetime.now())}
    dump_on_file()
    
    # Move to the next query
    with out:
        ok = pick_next_query()
        if ok is not None:
            query = queries[current_query_id]
            print_query_answers(query['id'], query['type'], query['ent'], query['rel'], query['answers'])

In [35]:
def on_click_skip_button(b):
    global valid_annotations
    valid_annotations = False
    on_click_button(b)

In [36]:
def print_query_answers(query_id, typ, ent, rel, answers):
    global processed_queries
    global array_answers
    global valid_annotations
    global unmesh_queries_by_query_id
    valid_annotations = True
    n_skipped = 0
    n_ok = 0
    n_annotated_answers = 0
    n_tail_queries = 0
    n_head_queries = 0
    for _, q in processed_queries.items():
        if q['valid_annotations']:
            n_ok += 1
            n_annotated_answers += len(q['annotated_answers'])
            if q['query']['type'] == 1:
                n_tail_queries += 1
            else:
                n_head_queries += 1
        else:
            n_skipped += 1
    print("Processed queries: {} Skipped: {} Ok: {} Head: {} Tail: {}".format(len(processed_queries), n_skipped, n_ok, n_head_queries, n_tail_queries))
    print("Annnotated answers: {}\n".format(n_annotated_answers))
    typ_str = 'HEAD'
    if typ == 1:
        typ_str = 'TAIL'
    display(Markdown("***Query #{} Type {}***".format(query_id, typ_str)))
    lbl, link_wikidata = (ent_labels[ent], "") # TODO retrieve_wikidata_label(ent_labels[ent])
    ent_str = lbl
    if typ == 0:
        print("?", rel_labels[rel], ent_str)
    else:
        print(ent_str, rel_labels[rel], "?")
    #print("\n")
    
    unmesh_annotations = None
    if query_id in unmesh_queries_by_query_id:
        print("\nThis query was previously annotated by Unmesh")
        unmesh_annotations = unmesh_queries_by_query_id[query_id]['annotated_answers']

    lbl_google = urllib.parse.urlencode({"q" : lbl})
    google_link = "https://www.google.com/search?hl=en&" + lbl_google
    display(Markdown("***Search on Google:*** {}".format(google_link)))
    print("\nAnswers (striked answers are the ones that are already annotated):")
    array_answers = []
    for method, answers_method in answers.items():
        for i, answer in enumerate(answers_method):
            a = answer['entity_id']
            # Should I add it?
            found = False
            for j, array_answer in enumerate(array_answers):
                if array_answer['entity_id'] == a:
                    found = True
                    array_answer['methods'].append(method)
                    break                    
            if not found:
                # Is the answer known to be true?
                found = False
                if typ == 0 and (a, ent, rel) in raw_test_triples:
                    found = True
                if typ == 1 and (ent, a, rel) in raw_test_triples:
                    found = True
                if found:
                    array_answers.append({'entity_id' : a, 'checked' : True, 'methods': [method], 'enabled' : False})
                else:
                    if unmesh_annotations is not None:
                        # Search if the entity is mentioned
                        for unmesh_annotation in unmesh_annotations:
                            if unmesh_annotation['entity_id'] == a:
                                found = True
                                if unmesh_annotation['checked'] == True:
                                    array_answers.append({'entity_id' : a, 'checked' : True, 'methods': [method], 'enabled' : False})
                                else:
                                    array_answers.append({'entity_id' : a, 'checked' : False, 'methods': [method], 'enabled' : False})
                                break                                
                    if not found:
                        array_answers.append({'entity_id' : a, 'checked' : False, 'methods': [method], 'enabled' : True})
                        
    for i, a in enumerate(array_answers):
        #sleep(1) # Some sleeping is necessary for wikidata
        lbl, link_wikidata = (ent_labels[a['entity_id']],'') # TODO retrieve_wikidata_label(ent_labels[a['entity_id']])
        #a_str = '[' + lbl + ' <a href=' + link_wikidata + '>' + link_wikidata + '</a> (' + ent_labels[a['entity_id']] + ')]'
        a_str = lbl
        desc = "{}. {} ({}) methods={}".format(i, a_str, a['entity_id'], a['methods'])
        if a['enabled'] == False:
            box = widgets.Checkbox(a['checked'], id=len(array_answers), description="<strike>" + desc + "</strike>", layout=Layout(width='2000px', height='20px'), indent=False, disabled=True)
        else:
            box = widgets.Checkbox(False, id=len(array_answers), description=desc, layout=Layout(width='2000px', height='20px'), indent=False)
        box.observe(on_change_checkbox, names="value")
        display(box)
        if a['enabled'] == True:
            lbl_google = urllib.parse.urlencode({"q" : lbl})
            google_link = "https://www.google.com/search?hl=en&" + lbl_google
            lbl_wikipedia = urllib.parse.urlencode({"search" : lbl})
            wikipedia_link = "https://en.wikipedia.org/w/index.php?" + lbl_wikipedia
            display(Markdown("&ensp;&ensp;&ensp;{} {}".format(google_link, wikipedia_link)))

    print("\n")
    display(Markdown("***Known answers from the testset:***"))
    known_answers = []
    for triple in raw_test_triples: #(h,t,r)
        if triple[2] == rel:
            if typ == 0 and triple[1] == ent:
                known_answers.append(triple[0])
            if typ == 1 and triple[0] == ent:
                known_answers.append(triple[1])
    assert(len(known_answers) > 0)
    for known_answer in known_answers:
        #sleep(1) # Some sleeping is necessary for wikidata
        lbl, link_wikidata = (ent_labels[known_answer],'') # retrieve_wikidata_label(ent_labels[known_answer])
        a_str = lbl# '[' + lbl + ' ' + link_wikidata + ' (' + ent_labels[known_answer] + ')]'   
        desc = "{} ({})".format(a_str, known_answer)
        print(desc)

### Start the annotation process

In [37]:
out.clear_output()
ok = pick_next_query()
with out:    
    if ok is not None:
        query = queries[current_query_id]
        print_query_answers(query['id'], query['type'], query['ent'], query['rel'], query['answers'])
b = widgets.Button(description='Submit', style=ButtonStyle(font_weight='bf'))
b.on_click(on_click_button)
b_skip = widgets.Button(description='Skip', style=ButtonStyle(font_weight='bf'))
b_skip.on_click(on_click_skip_button)
display(out)
display(b)
display(b_skip)

Output(layout=Layout(border='1px solid black', padding='5px'))

Button(description='Submit', style=ButtonStyle(font_weight='bf'))

Button(description='Skip', style=ButtonStyle(font_weight='bf'))

[{'entity_id': 2539, 'checked': True, 'methods': ['transe', 'complex'], 'enabled': False}, {'entity_id': 20240, 'checked': True, 'methods': ['transe', 'complex'], 'enabled': False}, {'entity_id': 11185, 'checked': True, 'methods': ['transe', 'complex'], 'enabled': False}, {'entity_id': 9993, 'checked': False, 'methods': ['transe'], 'enabled': True}, {'entity_id': 8701, 'checked': True, 'methods': ['transe', 'complex', 'rotate'], 'enabled': False}, {'entity_id': 3197, 'checked': True, 'methods': ['transe', 'complex', 'rotate'], 'enabled': False}, {'entity_id': 7205, 'checked': True, 'methods': ['transe'], 'enabled': False}, {'entity_id': 22440, 'checked': False, 'methods': ['transe', 'complex'], 'enabled': True}, {'entity_id': 9870, 'checked': False, 'methods': ['transe'], 'enabled': True}, {'entity_id': 7877, 'checked': False, 'methods': ['transe'], 'enabled': True}, {'entity_id': 7329, 'checked': True, 'methods': ['complex'], 'enabled': True}, {'entity_id': 6679, 'checked': True, 'met

PermissionError: [Errno 13] Permission denied: '/home/jurbani/data2/binary-embeddings/dbpedia50/annotations/gold-annotations.json' -> '/home/jurbani/data2/binary-embeddings/dbpedia50/annotations/gold-annotations.json-2021-01-27 22:46:23.003586'