# Important imports

In [None]:
# import owlready2
import prodigy
import spacy 
import csv
import os

from pathlib import Path
from prodigy.models.ner import EntityRecognizer

In [None]:
# !python -m spacy download en_core_web_lg

### Read BRAT annotations

In [None]:
# !pip install mendelai-brat-parser

In [None]:
from brat_parser import get_entities_relations_attributes_groups

In [None]:
# Parse the brat ``.ann`` file of every recipe. Entities and relations are kept
# in two lists indexed by recipe number; the attributes and groups returned by
# the parser are discarded.
ANNOTATION_DIR = Path("0-1-2")  # pathlib keeps the path valid on POSIX and Windows
NUM_RECIPES = 300  # files are named 0.ann .. 299.ann

entities_list, relations_list = [], []

for i in range(NUM_RECIPES):
    ann_path = ANNOTATION_DIR / f"{i}.ann"
    entities, relations, _, _ = get_entities_relations_attributes_groups(str(ann_path))
    entities_list.append(entities)
    relations_list.append(relations)

In [None]:
# Sanity check: show the entities parsed for recipe 3.
print(entities_list[3])

In [None]:
# Sanity check: show the relations parsed for recipe 3.
print(relations_list[3])

### Candidate generation from ontology

In [None]:
def load_food_entities(entities_loc=Path("food_product_entities.csv")):
    """Load entity names and descriptions from a comma-separated file.

    Every non-empty row is read as ``id, name, description`` (columns 0-2;
    further columns are ignored). Blank lines are skipped.

    Args:
        entities_loc: Path (``str`` or ``Path``) of the CSV file. Defaults to
            ``food_product_entities.csv`` relative to the working directory.

    Returns:
        A ``(names, descriptions)`` tuple of dicts, both keyed by entity id.

    Raises:
        IndexError: If a non-empty row has fewer than three columns.
    """
    names = {}
    descriptions = {}

    # newline="" lets the csv module do its own line-ending handling, so
    # quoted fields containing line breaks are parsed correctly.
    with Path(entities_loc).open("r", encoding="utf-8", newline="") as csvfile:
        for row in csv.reader(csvfile, delimiter=","):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing one)
                continue

            qid, name, desc = row[0], row[1], row[2]
            names[qid] = name
            descriptions[qid] = desc

    return names, descriptions

In [None]:
# Load the spaCy pipeline, the entity names/descriptions, and create an empty
# knowledge base sharing the pipeline's vocabulary.
nlp = spacy.load("en_core_web_lg")
name_dict, desc_dict = load_food_entities()
# NOTE(review): 300 presumably has to match the width of the vectors produced
# by `nlp` (entity vectors are later taken from `doc.vector`) - confirm.
kb = spacy.kb.KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)

In [None]:
import re

# Compiled once; both patterns are applied to every entity name below.
_NON_ALNUM = re.compile(r'[^a-zA-Z0-9]')
_WHITESPACE_RUN = re.compile(r'\s+')

# Register every entity, using the vector of its description as embedding.
# NOTE(review): freq=342 is the same constant for all entities - confirm that
# no real frequency is available.
for qid, desc in desc_dict.items():
    desc_doc = nlp(desc)
    desc_enc = desc_doc.vector
    kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)

# Register aliases: each word of a name points to the entity with probability
# 1 / (number of words), and the full normalised name with probability 1.0.
# NOTE(review): an alias shared by several entities is added once per entity
# here; spaCy presumably keeps only the first registration - confirm.
for qid, name in name_dict.items():
    name = _NON_ALNUM.sub(' ', name)
    # strip() so that a name ending in punctuation does not leave a trailing
    # space in the alias or an empty token in the word count
    name = _WHITESPACE_RUN.sub(' ', name).strip()

    tokens = name.split()
    if not tokens:
        # nothing alphanumeric left to use as an alias
        continue

    token_prob = 1 / len(tokens)
    for elem in tokens:
        kb.add_alias(alias=elem, entities=[qid], probabilities=[token_prob])

    # For a single-word name the loop above already added this exact alias
    # with probability 1.0.
    if len(tokens) > 1:
        kb.add_alias(alias=name, entities=[qid], probabilities=[1.0])

# Catch-all alias: "food" maps to every entity with uniform probability.
qids = list(name_dict)
probs = [1 / len(qids) for _ in qids]
kb.add_alias(alias="food", entities=qids, probabilities=probs)

# Debugging aids (uncomment to inspect the knowledge base):
# print(f"Entities in the KB: {kb.get_entity_strings()}")
# print(f"Aliases in the KB: {kb.get_alias_strings()}")
#
# print(f"Candidates for 'Roy Stanley Emerson': {[c.entity_ for c in kb.get_alias_candidates('Roy Stanley Emerson')]}")
# print(f"Candidates for 'Emerson': {[c.entity_ for c in kb.get_alias_candidates('Emerson')]}")
# print(f"Candidates for 'food': {[c.entity_ for c in kb.get_alias_candidates('food')]}")

In [None]:
# Save the KnowledgeBase and the pipeline under ./output_food
output_dir = Path.cwd() / "output_food"

# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory in between.
output_dir.mkdir(exist_ok=True)

kb.to_disk(output_dir / "my_kb")
nlp.to_disk(output_dir / "my_nlp")

### Manual Entity Linking

In [None]:
def _add_option(stream, kb, id_dict):
    """Yield tasks from *stream* with answer options looked up in *kb*.

    For each span of each task the mention text is split on whitespace and
    every token is looked up with ``kb.get_alias_candidates``; the first
    candidate of each token that has any contributes one option. Options are
    unique per entity id and sorted by id, followed by two fixed "NIL"
    options. Spans for which no token has a candidate yield nothing.

    Args:
        stream: Iterable of task dicts with a ``"text"`` string and a
            ``"spans"`` list whose items carry ``"start"`` and ``"end"``
            character offsets into that text.
        kb: Object providing ``get_alias_candidates(alias)``; the returned
            candidates must expose an ``entity_`` attribute.
        id_dict: Mapping handed on to ``_print_url`` to render each option.

    Yields:
        The task dict with ``task["options"]`` set, once per span that
        produced options. The same dict object is mutated and yielded again
        for every further span of that task, so consume each item before
        advancing the generator.
    """
    for task in stream:
        text = task["text"]

        for span in task["spans"]:
            mention = text[int(span["start"]):int(span["end"])]

            # split() (any whitespace) rather than split(' ') so tokens
            # separated by tabs/newlines are looked up cleanly and no empty
            # tokens are produced.
            entity_ids = set()
            for elem in mention.split():
                res = kb.get_alias_candidates(elem)
                if res:
                    # A set keeps option ids unique when several tokens
                    # resolve to the same entity.
                    entity_ids.add(res[0].entity_)

            if not entity_ids:
                continue

            options = [
                {"id": entity_id, "html": _print_url(entity_id, id_dict)}
                for entity_id in sorted(entity_ids)
            ]
            options.append({"id": "NIL_otherLink", "text": "Link not in options"})
            options.append({"id": "NIL_ambiguous", "text": "Need more context"})
            task["options"] = options
            yield task

In [None]:
def _print_url(entity_id, id_dict):
    """Return an HTML snippet linking *entity_id*, followed by its description.

    Args:
        entity_id: Entity identifier; appended to the Wikidata URL prefix and
            used as the link text.
        id_dict: Mapping of entity id to a ``(name, description)`` pair.

    Returns:
        A string of the form ``"<a href='URL'>ID</a>: DESCRIPTION"``.

    Raises:
        TypeError: If *entity_id* is not a key of *id_dict* (the lookup
            returns ``None``, which cannot be unpacked).
    """
    # The trailing slash is required: without it the id is glued onto "wiki"
    # and the link points to a non-existent page.
    url_prefix = "https://www.wikidata.org/wiki/"
    _, descr = id_dict.get(entity_id)
    return "<a href='" + url_prefix + entity_id + "'>" + entity_id + "</a>: " + descr

In [None]:
@prodigy.recipe(
    "entity_linker.manual",
    dataset=("The dataset to use", "positional", None, str),
    source=("The source data as a .txt file", "positional", None, Path),
    nlp_dir=("Path to the NLP model with a pretrained NER component", "positional", None, Path),
    kb_loc=("Path to the KB", "positional", None, Path),
    # NOTE(review): "he" in the help text below looks like a typo for "the".
    entity_loc=("Path to the file with additional information about he entities", "positional", None, Path),
)
def entity_linker_manual(dataset, source, nlp_dir, kb_loc, entity_loc):
    """Prodigy recipe presenting KB candidates for each recognised entity.

    Loads the pipeline from *nlp_dir* and the knowledge base from *kb_loc*,
    reads ``id -> (name, description)`` rows from the CSV at *entity_loc*,
    runs the entity recogniser over the text loaded from *source*, attaches
    options via ``_add_option`` and filters duplicate inputs.

    Returns:
        The recipe components dict: ``dataset``, ``stream``, the ``"choice"``
        view id and a config enabling ``choice_auto_accept``.
    """
    nlp = spacy.load(nlp_dir)
    # NOTE(review): the vector length of 1 is presumably a placeholder that
    # from_disk() replaces with the stored value - confirm.
    kb = spacy.kb.KnowledgeBase(vocab=nlp.vocab, entity_vector_length=1)
    kb.from_disk(kb_loc)
    model = EntityRecognizer(nlp)

    # Columns 0-2 of every row: entity id, name, description.
    with entity_loc.open("r", encoding="utf8") as csvfile:
        id_dict = {
            row[0]: (row[1], row[2])
            for row in csv.reader(csvfile, delimiter=",")
        }

    # Hash the raw tasks first, then let the recogniser add spans.
    hashed_tasks = [
        prodigy.util.set_hashes(eg)
        for eg in prodigy.components.loaders.TXT(source)
    ]
    tasks_with_spans = (eg for _, eg in model(hashed_tasks))

    tasks_with_options = _add_option(tasks_with_spans, kb, id_dict)
    stream = prodigy.components.filters.filter_duplicates(
        tasks_with_options, by_input=True, by_task=False
    )

    components = {
        "dataset": dataset,
        "stream": stream,
        "view_id": "choice",
        "config": {"choice_auto_accept": True},
    }
    return components

In [None]:
# Run the `entity_linker.manual` recipe through the Prodigy CLI, taking the
# recipe code from `sample.py` (-F) and storing answers in `food_sandbox`.
# NOTE(review): the source argument is written as `.\0-1-2\ 3`, which splits
# into two arguments and gives six positionals where the recipe declares
# five; presumably `.\0-1-2\3.txt` was intended - confirm.
!python -m prodigy entity_linker.manual food_sandbox .\0-1-2\ 3 ./output_food/my_nlp ./output_food/my_kb food_product_entities.csv -F sample.py

In [None]:
# Build, outside of Prodigy, the task stream for one recipe: load its text,
# hash the tasks and attach the brat-annotated entities as spans.
recipe_number = 3
file = Path(r".\0-1-2\{0}.txt".format(recipe_number))
model = EntityRecognizer(nlp)

entity_loc = Path("food_product_entities.csv")
kb = spacy.kb.KnowledgeBase(vocab=nlp.vocab, entity_vector_length=1)
kb.from_disk(Path("./output_food/my_kb"))

# Entity id -> (name, description), taken from columns 0-2 of each row.
with entity_loc.open("r", encoding="utf8") as csvfile:
    id_dict = {
        row[0]: (row[1], row[2])
        for row in csv.reader(csvfile, delimiter=",")
    }

stream = [
    prodigy.util.set_hashes(eg)
    for eg in prodigy.components.loaders.TXT(file)
]

# TODO: change it - spans currently come from the brat annotations instead of
# the recogniser, i.e. instead of:
#     stream = (eg for score, eg in model(stream))
# NOTE(review): span offsets are copied unchanged from the annotation while
# membership is tested against each task's own text; if a task holds only
# part of the file the offsets may not index into its text - confirm.
for task in stream:
    task['spans'] = [
        {
            'start': entity.span[0][0],
            'end': entity.span[0][1],
            'text': entity.text,
            'label': entity.type,
        }
        for entity in entities_list[recipe_number].values()
        if entity.text in task['text']
    ]
print(stream)

In [None]:
# Attach KB options to the tasks. The generator is materialised into a list
# before printing: iterating it just for the debug print would exhaust it and
# leave nothing for the duplicate filter below.
stream = list(_add_option(stream, kb, id_dict))
print(stream)

# Same here: collect the filtered tasks once, then print the collected list.
stream = list(
    prodigy.components.filters.filter_duplicates(stream, by_input=True, by_task=False)
)
print('final stream:', stream)

# Look up all entities whose name or description mentions "strawberry".
result = [
    entry
    for entry in id_dict.values()
    if 'strawberry' in entry[0] or 'strawberry' in entry[1]
]
print(result)