# Important imports

In [15]:
# import owlready2
import prodigy
import spacy 
import csv
import os

from rdflib import Graph
from rdflib.namespace import OWL, RDF, RDFS
from pathlib import Path
from prodigy.models.ner import EntityRecognizer

### Read FoodOn Ontology 

In [35]:
# Parse the FoodOn ontology (RDF/XML) directly from the project's GitHub repository.
g = Graph()
g.parse('https://raw.githubusercontent.com/FoodOntology/foodon/master/foodon.owl', format='xml')

# Map every rdfs:label literal to the IRI of the resource that carries it.
# The predicate is compared with the RDFS.label term itself instead of the text
# after '#', so a predicate from another namespace that merely ends in "#label"
# cannot be picked up by accident.
# If several resources share the same label text, the one visited last wins.
labels = {}

for s, p, o in g:
    if p == RDFS.label:
        labels[str(o)] = str(s)

print(labels)

{'cattle steer': 'http://purl.obolibrary.org/obo/FOODON_00002531', 'laminate paper coextruded with plastic': 'http://purl.obolibrary.org/obo/FOODON_03490356', 'japanese huchen': 'http://purl.obolibrary.org/obo/FOODON_03414048', 'seed, skin removed': 'http://purl.obolibrary.org/obo/FOODON_03420134', 'food object quality': 'http://purl.obolibrary.org/obo/FOODON_00002470', 'ironwort plant': 'http://purl.obolibrary.org/obo/FOODON_03415141', 'carcharodon': 'http://purl.obolibrary.org/obo/FOODON_03413523', 'cut into short pieces': 'http://purl.obolibrary.org/obo/FOODON_03430111', 'belly meat': 'http://purl.obolibrary.org/obo/FOODON_03530063', 'low density polyethylene container': 'http://purl.obolibrary.org/obo/FOODON_03490370', 'extent of food heat treatment': 'http://purl.obolibrary.org/obo/FOODON_03440011', 'one component adhesive, water soluble': 'http://purl.obolibrary.org/obo/FOODON_03490228', 'marine fish': 'http://purl.obolibrary.org/obo/FOODON_03413362', 'Williams bon chrétien pear 

In [17]:
# !python -m spacy download en_core_web_lg

### READ BRAT ANNOTATION  

In [18]:
# !pip install mendelai-brat-parser

In [19]:
from brat_parser import get_entities_relations_attributes_groups

In [20]:
entities_list, relations_list = [], []

# One brat annotation file per document, named <index>.ann. The loop index selects
# the file, so entities_list[i] / relations_list[i] describe document i.
# NOTE(review): assumes 0.ann ... 299.ann all exist in ./0-1-2 — TODO confirm.
for i in range(300):
    ann_path = os.path.join(".", "0-1-2", f"{i}.ann")
    entities, relations, _, _ = get_entities_relations_attributes_groups(ann_path)
    entities_list.append(entities)
    relations_list.append(relations)

In [21]:
# Inspect the entities stored in the first entry of entities_list.
print(entities_list[0])

{'T1': Entity(id='T1', type='quantity', span=((0, 1),), text='5'), 'T2': Entity(id='T2', type='unit', span=((2, 8),), text='ounces'), 'T3': Entity(id='T3', type='food_product_with_unit', span=((9, 12),), text='rum'), 'T4': Entity(id='T4', type='quantity', span=((33, 34),), text='3'), 'T5': Entity(id='T5', type='unit', span=((35, 41),), text='ounces'), 'T6': Entity(id='T6', type='trade_name', span=((42, 51),), text='Tia Maria'), 'T7': Entity(id='T7', type='food_product_with_unit', span=((42, 51),), text='Tia Maria'), 'T8': Entity(id='T8', type='food_product_with_unit', span=((22, 32),), text='triple sec'), 'T9': Entity(id='T9', type='unit', span=((15, 21),), text='ounces'), 'T10': Entity(id='T10', type='quantity', span=((13, 14),), text='4'), 'T11': Entity(id='T11', type='food_product_with_unit', span=((62, 74),), text='orange juice'), 'T12': Entity(id='T12', type='unit', span=((55, 61),), text='ounces'), 'T13': Entity(id='T13', type='quantity', span=((52, 54),), text='20')}


In [22]:
# Inspect the relations stored in the first entry of relations_list.
print(relations_list[0])

{'R1': Relation(id='R1', type='refers', subj='T3', obj='T2'), 'R3': Relation(id='R3', type='refers', subj='T7', obj='T5'), 'R2': Relation(id='R2', type='refers', subj='T2', obj='T1'), 'R4': Relation(id='R4', type='refers', subj='T5', obj='T4'), 'R5': Relation(id='R5', type='refers', subj='T8', obj='T9'), 'R6': Relation(id='R6', type='refers', subj='T9', obj='T10'), 'R7': Relation(id='R7', type='refers', subj='T11', obj='T12'), 'R8': Relation(id='R8', type='refers', subj='T12', obj='T13')}


### Candidate generation from ontology

In [102]:
def load_food_entities(label_to_iri=None):
    """Build the two id-keyed lookup tables used to populate the knowledge base.

    Args:
        label_to_iri: Optional mapping of ontology label -> IRI. When omitted,
            the notebook-level ``labels`` dictionary is used, so existing
            zero-argument calls keep working.

    Returns:
        tuple[dict, dict]: ``(names, descriptions)``. Both are keyed by a string
        id; ``names[id]`` is the label and ``descriptions[id]`` is the value the
        mapping stores for that label.
    """
    if label_to_iri is None:
        label_to_iri = labels

    names = {}
    descriptions = {}

    # TODO: right indexes
    # For now an id is just the position of the label in the mapping's iteration order.
    for i, (label, iri) in enumerate(label_to_iri.items()):
        qid = str(i)
        names[qid] = label
        descriptions[qid] = iri

    return names, descriptions

In [54]:
# Load the spaCy pipeline; its vocab is shared with the knowledge base created below.
nlp = spacy.load("en_core_web_lg")
# Id-keyed label and description tables derived from the ontology labels.
name_dict, desc_dict = load_food_entities()
# NOTE(review): entity_vector_length=300 presumably matches the width of the
# en_core_web_lg vectors used for the entity encodings — TODO confirm.
kb = spacy.kb.KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)

In [55]:
# Register every ontology entry as a KB entity. The entity vector is the spaCy
# document vector of the text stored for that id in desc_dict.
for entity_id, description in desc_dict.items():
    kb.add_entity(
        entity=entity_id,
        entity_vector=nlp(description).vector,
        freq=342,
    )

# Every label is an alias that points at exactly one entity.
for entity_id, label in name_dict.items():
    kb.add_alias(alias=label, entities=[entity_id], probabilities=[1])

# The generic alias "food" can refer to any entity, each with the same prior.
qids = name_dict.keys()
probs = [1 / len(qids) for _ in qids]
kb.add_alias(alias="food", entities=qids, probabilities=probs)

# Summary of what ended up in the knowledge base.
print(f"Entities in the KB: {kb.get_entity_strings()}")
print(f"Aliases in the KB: {kb.get_alias_strings()}")

# Spot-check candidate lookup for a few aliases.
print(f"Candidates for 'Roy Stanley Emerson': {[c.entity_ for c in kb.get_alias_candidates('Roy Stanley Emerson')]}")
print(f"Candidates for 'Emerson': {[c.entity_ for c in kb.get_alias_candidates('Emerson')]}")
print(f"Candidates for 'food': {[c.entity_ for c in kb.get_alias_candidates('food')]}")

Entities in the KB: ['958', '2224', '2405', '1627', '3477', '2414', '3003', '1254', '1900', '1767', '2012', '860', '3402', '3244', '429', '3015', '2859', '1883', '2922', '2206', '1428', '1985', '1937', '3374', '529', '3602', '2698', '1726', '326', '2960', '3022', '2988', '196', '2435', '1348', '2854', '942', '3230', '3582', '3401', '2495', '788', '3659', '1062', '546', '2538', '1379', '1056', '3577', '1884', '2999', '290', '3653', '1816', '1851', '1772', '404', '2117', '726', '391', '3525', '3591', '2355', '2701', '2397', '2494', '799', '1416', '1928', '2761', '1417', '703', '684', '1262', '1664', '1943', '2152', '2593', '722', '690', '2503', '2740', '3510', '550', '1750', '2994', '46', '3446', '3033', '756', '2514', '3585', '1364', '453', '2298', '1695', '1347', '1849', '2631', '1006', '2525', '1376', '3377', '393', '3353', '586', '1002', '1382', '1113', '286', '936', '2535', '1160', '1033', '3663', '1166', '1648', '2200', '3355', '9', '1676', '1783', '3161', '2775', '2860', '981', '2

In [56]:
# Save the KnowledgeBase and the pipeline it was built with
output_dir = Path.cwd() / "output_food"

# exist_ok=True makes the cell re-runnable without a separate existence check.
output_dir.mkdir(exist_ok=True)

kb.to_disk(output_dir / "my_kb")
nlp.to_disk(output_dir / "my_nlp")

### Manual Entity Linking

In [75]:
def _add_option(stream, kb, id_dict):
    """Attach the KB candidates of each span's mention as choice options.

    For every task in ``stream`` and every span in ``task["spans"]``, the span
    text is looked up with ``kb.get_alias_candidates``. When candidates exist,
    ``task["options"]`` is set to one HTML option per candidate (ordered by the
    numeric part of the candidate id) followed by the two NIL options, and the
    task is yielded. Spans without candidates yield nothing.

    Args:
        stream: Iterable of task dicts with "text" and "spans" keys.
        kb: Object exposing ``get_alias_candidates(mention)``.
        id_dict: Lookup handed through to ``_print_url``.

    Yields:
        dict: The task (mutated in place), once per span that has candidates.
    """

    def _id_sort_key(option):
        # Ids can be plain numbers ("958") or carry a letter prefix ("Q958").
        # Always dropping the first character mis-orders plain numeric ids and
        # raises ValueError for one-character ids such as "7".
        digits = "".join(ch for ch in option["id"] if ch in "0123456789")
        return int(digits) if digits else 0

    for task in stream:
        text = task["text"]

        for span in task["spans"]:
            start_char = int(span["start"])
            end_char = int(span["end"])
            mention = text[start_char:end_char]

            candidates = kb.get_alias_candidates(mention)
            if candidates:
                options = [{"id": c.entity_, "html": _print_url(c.entity_, id_dict)} for c in candidates]
                options = sorted(options, key=_id_sort_key)
                # Escape hatches for the annotator when no candidate fits.
                options.append({"id": "NIL_otherLink", "text": "Link not in options"})
                options.append({"id": "NIL_ambiguous", "text": "Need more context"})
                task["options"] = options
                yield task

In [76]:
def _print_url(entity_id, id_dict):
    """Render one KB entity as the HTML snippet shown for a choice option.

    Args:
        entity_id: Id of the entity, used as the visible link text.
        id_dict: Mapping of entity id -> ``(name, description)``.

    Returns:
        str: ``<a href='...'>id</a>: caption``. When the stored description is
        itself an http(s) IRI (as for the ontology-derived entries), the link
        points at that IRI and the caption is the entity name. Otherwise the
        link points at the Wikidata page for the id and the caption is the
        description. An id missing from ``id_dict`` is rendered with an empty
        caption instead of raising.
    """
    from html import escape

    name, descr = id_dict.get(entity_id) or (entity_id, "")

    if descr.startswith(("http://", "https://")):
        href, caption = descr, name
    else:
        # The trailing slash is required; without it the id is glued to "wiki".
        href, caption = "https://www.wikidata.org/wiki/" + entity_id, descr

    # Labels come from external data, so escape them before embedding in HTML.
    return "<a href='" + escape(href) + "'>" + escape(entity_id) + "</a>: " + escape(caption)

In [77]:
@prodigy.recipe(
    "entity_linker.manual",
    dataset=("The dataset to use", "positional", None, str),
    source=("The source data as a .txt file", "positional", None, Path),
    nlp_dir=("Path to the NLP model with a pretrained NER component", "positional", None, Path),
    kb_loc=("Path to the KB", "positional", None, Path),
    entity_loc=("Path to the file with additional information about he entities", "positional", None, Path),
)
def entity_linker_manual(dataset, source, nlp_dir, kb_loc, entity_loc):
    """Prodigy recipe: choose the KB entity for each mention found by the NER model.

    Loads the pipeline from ``nlp_dir`` and the knowledge base from ``kb_loc``,
    reads ``source`` as plain text, lets the entity recognizer propose spans and
    turns the KB candidates of every span into "choice" options.

    NOTE(review): ``entity_loc`` is accepted but never read here; the id lookup
    is built from the module-level ``labels`` mapping instead.

    Returns:
        dict: Prodigy recipe components (dataset, stream, view_id, config).
    """
    nlp = spacy.load(nlp_dir)
    kb = spacy.kb.KnowledgeBase(vocab=nlp.vocab, entity_vector_length=1)
    kb.from_disk(kb_loc)
    model = EntityRecognizer(nlp)

    # id -> (label, stored value), with ids assigned by position in ``labels``.
    id_dict = {
        str(position): (label, value)
        for position, (label, value) in enumerate(labels.items())
    }

    # Hash the raw examples, then keep only the example from each (score, example) pair.
    hashed_examples = [
        prodigy.util.set_hashes(eg)
        for eg in prodigy.components.loaders.TXT(source)
    ]
    stream = (eg for score, eg in model(hashed_examples))

    stream = _add_option(stream, kb, id_dict)
    stream = prodigy.components.filters.filter_duplicates(stream, by_input=True, by_task=False)

    recipe_components = {
        "dataset": dataset,
        "stream": stream,
        "view_id": "choice",
        "config": {"choice_auto_accept": True},
    }
    return recipe_components

In [78]:
!python -m prodigy entity_linker.manual food_sandbox .\0-1-2\3.txt ./output_food/my_nlp ./output_food/my_kb entity_food.csv -F sample.py


[x] Error while validating stream: no first example
This likely means that your stream is empty.This can also mean all the examples
in your stream have been annotated in datasets included in your --exclude recipe
parameter.



In [103]:
file = Path(r".\0-1-2\3.txt") # Path("emerson_input_text.txt") # Path(r".\0-1-2\0.txt")
model = EntityRecognizer(nlp)

# id -> (label, IRI), with ids assigned by position in ``labels``.
id_dict = {str(i): (label, iri) for i, (label, iri) in enumerate(labels.items())}

stream = prodigy.components.loaders.TXT(file)
stream = [prodigy.util.set_hashes(eg) for eg in stream]
# Materialise the scored examples as a list: a generator would be drained by the
# printing loop below, leaving _add_option with an empty stream.
stream = [eg for score, eg in model(stream)]

# Show which mentions the entity recognizer proposes.
for task in stream:
    text = task["text"]

    for span in task["spans"]:
        start_char = int(span["start"])
        end_char = int(span["end"])
        mention = text[start_char:end_char]
        print('mention:', mention)

stream = _add_option(stream, kb, id_dict)
stream = prodigy.components.filters.filter_duplicates(stream, by_input=True, by_task=False)

# Which ontology entries mention "strawberry" in either their label or their IRI?
result = [entry for entry in id_dict.values() if 'strawberry' in entry[0] or 'strawberry' in entry[1]]
print(result)

mention: 1
mention: 1
mention: 2
mention: 2
mention: 6 -8
mention: 6 -8
mention: half
mention: strawberry
mention: strawberry
mention: graham cracker
[('strawberry-tomato plant', 'http://purl.obolibrary.org/obo/FOODON_03414932'), ('strawberry plant', 'http://purl.obolibrary.org/obo/FOODON_03411393'), ('alpine strawberry plant', 'http://purl.obolibrary.org/obo/FOODON_03413339'), ('hautbois strawberry plant', 'http://purl.obolibrary.org/obo/FOODON_03413337'), ('european strawberry plant', 'http://purl.obolibrary.org/obo/FOODON_03412948'), ('green strawberry plant', 'http://purl.obolibrary.org/obo/FOODON_03413335')]


# Old - Read FoodOn Ontology

In [None]:
# Legacy loader: reads a local copy of the ontology with owlready2.
# NOTE(review): `owlready2` only appears as a commented-out import in the import
# cell, so this cell presumably raises NameError unless that import is restored.
path = "foodon.owl" # https://raw.githubusercontent.com/FoodOntology/foodon/master/foodon.owl
ontology = owlready2.get_ontology(f"file://{path}").load()

In [12]:
""" Generate properties associated with a list of tuples with arguments used with it
    Returns:
        dict: properties associated with a list of tuples with arguments used with it
"""
# property -> list of (subject, object) pairs it is used with. The object slot is
# None when the value is not an owlready2 ThingClass.
usages: dict = dict()

for subject in ontology.classes():
    for prop in subject.get_class_properties():
        # Make sure every property seen gets an entry, even if reading its values fails.
        pairs = usages.setdefault(prop, [])

        try:
            for obj in prop[subject]:
                if isinstance(obj, owlready2.entity.ThingClass):
                    pairs.append((subject, obj))
                else:
                    pairs.append((subject, None))
        except Exception:
            # Catch Exception rather than everything, so a kernel interrupt
            # (KeyboardInterrupt) can still stop this long loop.
            print('Cant read literal')

Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read 

Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read literal
Cant read 

In [13]:
# Dump the collected property usages.
print(usages)

{rdf-schema.label: [(obo.FOODON_00002403, None), (obo.FOODON_00002403, None), (obo.FOODON_00002403, None), (obo.FOODON_00001002, None), (obo.BFO_0000024, None), (obo.BFO_0000024, None), (obo.BFO_0000024, None), (obo.BFO_0000024, None), (obo.BFO_0000029, None), (obo.BFO_0000029, None), (obo.BFO_0000029, None), (obo.BFO_0000029, None), (obo.BFO_0000031, None), (obo.BFO_0000031, None), (obo.BFO_0000031, None), (obo.FOODON_03420127, None), (obo.FOODON_03411041, None), (obo.CEPH_0000256, None), (obo.CEPH_0000285, None), (obo.UBERON_0000463, None), (obo.CHEBI_10607, None), (obo.FOODON_03412972, None), (obo.CHEBI_113451, None), (obo.CHEBI_113455, None), (obo.CHEBI_114786, None), (obo.CHEBI_12777, None), (obo.CHEBI_12777, None), (obo.FOODON_03413751, None), (obo.CHEBI_131526, None), (obo.CHEBI_132106, None), (obo.CHEBI_133354, None), (obo.CHEBI_15040, None), (obo.CHEBI_15366, None), (obo.CHEBI_15366, None), (obo.CHEBI_15366, None), (obo.CHEBI_15428, None), (obo.CHEBI_15428, None), (obo.CHEBI_1