In [1]:
import os, re
from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import treebank
from nltk.corpus import dependency_treebank
from nltk.grammar import DependencyGrammar
from nltk.parse import DependencyGraph
import pprint
from nltk.parse.corenlp import CoreNLPDependencyParser
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec

In [2]:
# Shared NLP helper instances used by the phrase-processing functions in this notebook.
ps = PorterStemmer()
wnl = WordNetLemmatizer()
# Dependency-parser client pointed at a CoreNLP server on localhost:9000.
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

In [3]:
class Habitat:
    """Placeholder record holding an identifier (`oid`) and a `name`."""
    # Class-level default identifier.
    oid = -1
    # Class-level default name.
    name = ""

In [4]:
# Corpus directories (relative to the notebook's working directory).
train_path = "./BioNLP-OST-2019_BB-norm_train/"
test_path = "./BioNLP-OST-2019_BB-norm_test/"
dev_path = "./BioNLP-OST-2019_BB-norm_dev/"
# Raw directory listings; os.listdir returns names in arbitrary order, so
# they are sorted when split by suffix further down.
BB_train = os.listdir(train_path)
BB_test = os.listdir(test_path)
BB_dev = os.listdir(dev_path)
# Ontology file, read in full by the next cell.
obp_file = "OntoBiotope_BioNLP-OST-2019.obo"

In [5]:
# Read the whole ontology file into memory.  The context manager closes the
# file handle as soon as the text has been read.
with open(obp_file, encoding="utf8") as obo_handle:
    ontology = obo_handle.read()

In [6]:
# Split every directory listing by file-name suffix.  Each list is sorted;
# later cells pair the lists by position.
# NOTE(review): positional pairing assumes each document has one file per
# suffix -- confirm against the corpus.

# Files ending in "a1"
BB_train_a1 = sorted(fn for fn in BB_train if fn.endswith("a1"))
BB_dev_a1 = sorted(fn for fn in BB_dev if fn.endswith("a1"))
BB_test_a1 = sorted(fn for fn in BB_test if fn.endswith("a1"))

# Files ending in "a2" (none are collected for the test split)
BB_train_a2 = sorted(fn for fn in BB_train if fn.endswith("a2"))
BB_dev_a2 = sorted(fn for fn in BB_dev if fn.endswith("a2"))

# Files ending in "txt"
BB_train_txt = sorted(fn for fn in BB_train if fn.endswith("txt"))
BB_dev_txt = sorted(fn for fn in BB_dev if fn.endswith("txt"))
BB_test_txt = sorted(fn for fn in BB_test if fn.endswith("txt"))

In [7]:
def original_phrase(phrase):
    """Identity processor: return `phrase` unchanged."""
    return phrase

In [8]:
def stem_phrase(phrase):
    """Return `phrase` with each space-separated token stemmed by `ps`.

    The phrase is split on single space characters, every piece is passed
    through `ps.stem`, and the pieces are joined back with single spaces.
    """
    return " ".join(ps.stem(token) for token in phrase.split(" "))

In [9]:
def lemmatize_phrase(phrase):
    """Return `phrase` with each space-separated token lemmatized by `wnl`.

    The phrase is split on single space characters, every piece is passed
    through `wnl.lemmatize`, and the pieces are joined back with single
    spaces.
    """
    return " ".join(wnl.lemmatize(token) for token in phrase.split(" "))

In [10]:
def get_headword(phrase):
    """Return the governor word of the first dependency triple of `phrase`.

    Parameters
    ----------
    phrase : str
        Phrase to parse; it is split on whitespace and sent to `dep_parser`.

    Returns
    -------
    str
        * "" when `phrase` equals its own upper-cased form (this includes
          the empty string and strings without cased characters);
        * the word of the governor in the first triple of the first parse;
        * `phrase` itself when there is no parse, no triple, or the triple
          does not have the expected shape.

    Errors raised by the parser call itself are not caught here.
    """
    if phrase == phrase.upper():
        return ""

    # Only the first triple of the first parse is used, so fetch just those
    # instead of materialising the triples of every parse.
    first_parse = next(iter(dep_parser.parse(phrase.split())), None)
    if first_parse is None:
        return phrase
    first_triple = next(iter(first_parse.triples()), None)

    try:
        # A triple is (governor, relation, dependent); governor[0] is the
        # value returned as the head word.
        if first_triple is not None and len(first_triple) > 0:
            governor = first_triple[0]
            if len(governor) > 0 and governor[0] is not None:
                return governor[0]
    except (IndexError, TypeError):
        # Unexpected triple shape: fall back to the phrase.  Only shape
        # errors are swallowed so that interrupts and real bugs surface.
        pass
    return phrase
    
    

In [11]:
# Phrase-normalisation function passed as `processor` to the cells below.
phrase_reducer = lemmatize_phrase

In [12]:
def generate_habitat_map(processor):
    """Build name/identifier lookup tables from the global `ontology` text.

    The ontology text is split on "\\n\\n[Term]\\n" and the chunk before the
    first term is dropped.  For each remaining chunk the identifier is read
    from its first line and the name from its second line.

    Labels (names and synonyms) that differ from their upper-cased form are
    lower-cased and passed through `processor`; other labels are kept as-is.

    Parameters
    ----------
    processor : callable
        Function taking and returning a string.

    Returns
    -------
    tuple of (dict, dict)
        * processed label -> identifier.  Term names always overwrite an
          existing entry; a synonym is added only when its line contains
          "EXACT" and the label is not present yet.
        * identifier -> processed term name.
    """
    def normalise(label):
        # Labels identical to their upper-cased form are left untouched.
        if label == label.upper():
            return label
        return processor(label.lower())

    label_to_oid = {}
    oid_to_name = {}

    for term in ontology.split("\n\n[Term]\n")[1:]:
        term_lines = term.split("\n")
        oid = re.findall(r"(?<=id: OBT:)[0-9]+", term_lines[0])[0]
        name = normalise(re.findall(r"(?<=name: ).+", term_lines[1])[0])

        label_to_oid[name] = oid
        oid_to_name[oid] = name

        for term_line in term_lines:
            if "synonym" not in term_line:
                continue
            # The synonym text is extracted and normalised for every
            # synonym line, even those that end up not being stored.
            synonym = normalise(
                re.findall(r"(?<=synonym: \").+(?=\")", term_line)[0]
            )
            if "EXACT" in term_line and synonym not in label_to_oid:
                label_to_oid[synonym] = oid

    return label_to_oid, oid_to_name

In [13]:
# Build the label -> id map and the id -> name map from the ontology,
# normalising labels with the selected phrase reducer.
habitat_map, habitat_map_originals = generate_habitat_map(phrase_reducer)

In [87]:
def add_habitats(a1, a2, processor):
    """Add annotated habitat mentions of one document to the global `habitat_map`.

    Parameters
    ----------
    a1 : str
        Contents of an .a1 file: one entity per line, tab-separated as
        id, "Type offsets", text.
    a2 : str
        Contents of the matching .a2 file; only lines containing
        "OntoBiotope" are used.
    processor : callable
        String normaliser applied to lower-cased mention text.  Mentions
        equal to their upper-cased form are kept verbatim.

    Side effects
    ------------
    Inserts mention -> identifier into `habitat_map` when the mention is not
    already a key, and prints one "<ontology name>  ----  <mention>" line
    per habitat annotation.
    """
    # Index entity lines by their exact id (first tab-separated field).  A
    # substring search would let "T3" match text such as "T3 secretion
    # system" or the id "T30" and pick the wrong line.  The first line seen
    # for an id wins.
    entity_lines = {}
    for a1_line in a1.split("\n"):
        entity_lines.setdefault(a1_line.split("\t")[0], a1_line)

    for annotation in a2.split("\n"):
        if "OntoBiotope" not in annotation:
            continue
        t = re.findall(r"(?<=Annotation:)T[0-9]+", annotation)[0]
        oid = re.findall(r"(?<=Referent:OBT:)[0-9]+", annotation)[0]

        fields = entity_lines.get(t, "").split("\t")
        # Require a complete entity line whose type column is "Habitat".
        if len(fields) < 3 or fields[1].split(" ")[0] != "Habitat":
            continue

        named_entity = fields[2]
        if named_entity != named_entity.upper():
            named_entity = processor(named_entity.lower())
        if named_entity not in habitat_map:
            habitat_map[named_entity] = oid

        # Fall back to the raw id when the ontology has no name for it
        # instead of raising KeyError.
        print(habitat_map_originals.get(oid, oid), " ---- ", named_entity)

In [88]:
# Try add_habitats on the first training document.  Both files are opened in
# a context manager so their handles are closed after reading.
with open(train_path + BB_train_a1[0], encoding="utf8") as a1_handle, \
        open(train_path + BB_train_a2[0], encoding="utf8") as a2_handle:
    add_habitats(a1_handle.read(), a2_handle.read(), phrase_reducer)

nutrient broth  ----  selective broth based on hypertonic strontium chloride
nutrient broth  ----  selective broth based on the bi-selenite ion
egg and egg product  ----  egg product
whole egg  ----  egg
whole egg  ----  egg
egg and egg product  ----  egg pulp
nutrient broth  ----  strontium chloride m broth
nutrient broth  ----  strontium selenite broth
nutrient broth  ----  strontium selenite a broth
nutrient broth  ----  bi-selenite based medium
egg and egg product  ----  egg product
whole egg  ----  egg
nutrient broth  ----  strontium chloride m broth


In [15]:
# Extend habitat_map with the annotated mentions of every training document.
# The loop pairs the .a1/.a2 lists directly (the .txt list is not needed
# here) and closes each file after reading.
for a1_name, a2_name in zip(BB_train_a1, BB_train_a2):
    with open(train_path + a1_name, encoding="utf8") as a1_handle, \
            open(train_path + a2_name, encoding="utf8") as a2_handle:
        add_habitats(a1_handle.read(), a2_handle.read(), phrase_reducer)

In [16]:
def predict_habitats_exact_matching(a1, txt, a1_name, processor, pred_file_name):
    """Predict ontology ids for the Habitat entities of one document.

    Each Habitat mention is normalised (lower-cased and passed through
    `processor` unless it equals its upper-cased form) and looked up in the
    global `habitat_map`.  When that fails, the result of `get_headword` on
    the mention is processed and looked up instead.

    Parameters
    ----------
    a1 : str
        Contents of an .a1 file (tab-separated: id, "Type offsets", text).
    txt : str
        Unused; kept so existing callers keep working.
    a1_name : str
        Unused; kept so existing callers keep working.
    processor : callable
        String normaliser.
    pred_file_name : str
        Path of the prediction file to (over)write.

    Returns
    -------
    dict
        entity id -> [oid, normalised mention, "exact"] for direct hits, or
        [oid, mention, "headwordexact", normalised head word] for head-word
        hits.  Entities without a hit are absent.

    Side effects
    ------------
    Writes one "N<k>\\tOntoBiotope Annotation:<id> Referent:OBT:<oid>" line
    per hit to `pred_file_name` and prints every head-word hit.
    """
    match_list = {}
    count = 1  # running number for the "N<count>" ids in the output file

    # The context manager closes the prediction file even when parsing or
    # the head-word lookup raises.
    with open(pred_file_name, "w") as pred_file:
        for line in a1.split("\n"):
            fields = line.split("\t")
            # Test the type column itself: a substring test on the whole
            # line would also fire on Title/Paragraph text containing the
            # word "Habitat".
            if len(fields) < 3 or fields[1].split(" ")[0] != "Habitat":
                continue
            entity_id, name = fields[0], fields[2]

            named_entity = name
            if name != name.upper():
                name = name.lower()
                named_entity = processor(name)

            if named_entity in habitat_map:
                match_list[entity_id] = [habitat_map[named_entity], named_entity, "exact"]
            else:
                # Second chance: look up the processed head word.
                named_entity = processor(get_headword(name))
                if named_entity not in habitat_map:
                    continue
                print(name + " - " + habitat_map[named_entity] + "---headwordexact")
                match_list[entity_id] = [habitat_map[named_entity], name, "headwordexact", named_entity]

            pred_file.write("N" + str(count) + "\tOntoBiotope Annotation:" + entity_id + " Referent:OBT:" + match_list[entity_id][0] + "\n")
            count += 1
    return match_list

In [17]:
def add_documents(txt, processor):
    """Turn raw text into a list of token lists, one per sentence.

    `txt` is first passed through `processor`, then split into sentences
    with `sent_tokenize`.  Each sentence is tokenised with `word_tokenize`;
    tokens that are English stop words or not alphanumeric are dropped, the
    rest are passed through `processor`, and numeric tokens are replaced by
    the placeholder "##number##".

    Parameters
    ----------
    txt : str
        Document text.
    processor : callable
        String normaliser applied to the whole text and to each kept token.

    Returns
    -------
    list of list of str
        One token list per sentence (possibly empty lists).
    """
    # Build the stop-word set once per call: calling stopwords.words() for
    # every token reloads the list and scans it linearly each time.
    stop_words = set(stopwords.words("english"))

    document = []
    for sentence in sent_tokenize(processor(txt)):
        kept = [
            processor(word)
            for word in word_tokenize(sentence)
            if word not in stop_words and word.isalnum()
        ]
        document.append(
            ["##number##" if word.isnumeric() else word for word in kept]
        )
    return document

In [32]:
# Collect tokenised sentences from the train, dev and test texts (in that
# order).  One loop replaces three copies of the same code, and every file
# is closed right after it is read.
documents = []
for corpus_path, corpus_files in (
    (train_path, BB_train_txt),
    (dev_path, BB_dev_txt),
    (test_path, BB_test_txt),
):
    for txt_name in corpus_files:
        with open(corpus_path + txt_name, encoding="utf8") as txt_handle:
            documents.extend(add_documents(txt_handle.read(), phrase_reducer))
    

In [76]:
# Inspect the raw lines of the first training .a1 file.
open(train_path + BB_train_a1[0],encoding="utf8").read().split('\n')

['T1\tTitle 0 141\tAn evaluation of selective broths based on the bi-selenite ion and on hypertonic strontium chloride in Salmonellae detection in egg products.',
 'T2\tParagraph 142 579\tOf the 104 isolations of Salmonella sp. from egg pulp, 97 were obtained from strontium chloride M broth, 42 from strontium selenite broth and 57 from strontium selenite A broth. The results suggest that the first medium may be used more successfully than bi-selenite based media for enrichment and subsequent detection of salmonellae in egg products; however, the growth of S. pullorum was not satisfactory in strontium chloride M broth.',
 'T3\tHabitat 17 39;67 99\tselective broths based on hypertonic strontium chloride',
 'T4\tHabitat 17 62\tselective broths based on the bi-selenite ion',
 'T5\tMicroorganism 103 114\tSalmonellae',
 'T6\tHabitat 128 140\tegg products',
 'T7\tHabitat 128 131\tegg',
 'T8\tMicroorganism 167 177\tSalmonella',
 'T9\tHabitat 187 190\tegg',
 'T10\tHabitat 187 195\tegg pulp',
 '

In [77]:
# Inspect the raw lines of the first training .a2 file.
open(train_path + BB_train_a2[0],encoding="utf8").read().split('\n')

['N1\tOntoBiotope Annotation:T3 Referent:OBT:000360',
 'N2\tOntoBiotope Annotation:T4 Referent:OBT:000360',
 'N3\tNCBI_Taxonomy Annotation:T5 Referent:590',
 'N4\tOntoBiotope Annotation:T6 Referent:OBT:001086',
 'N5\tOntoBiotope Annotation:T7 Referent:OBT:001847',
 'N6\tNCBI_Taxonomy Annotation:T8 Referent:599',
 'N7\tOntoBiotope Annotation:T9 Referent:OBT:001847',
 'N8\tOntoBiotope Annotation:T10 Referent:OBT:001086',
 'N9\tOntoBiotope Annotation:T11 Referent:OBT:000360',
 'N10\tOntoBiotope Annotation:T12 Referent:OBT:000360',
 'N11\tOntoBiotope Annotation:T13 Referent:OBT:000360',
 'N12\tOntoBiotope Annotation:T14 Referent:OBT:000360',
 'N13\tNCBI_Taxonomy Annotation:T15 Referent:590',
 'N14\tOntoBiotope Annotation:T16 Referent:OBT:001086',
 'N15\tOntoBiotope Annotation:T17 Referent:OBT:001847',
 'N16\tNCBI_Taxonomy Annotation:T18 Referent:605',
 'N17\tOntoBiotope Annotation:T19 Referent:OBT:000360',
 '']

In [78]:
# Display the full label -> id map (the output is long).
habitat_map

{'fermented cheese': '003381',
 'enriched dough': '002017',
 'lean dough': '002100',
 'fermented cereal-based product': '003144',
 'crusty bread': '002417',
 'fresh cheese': '002043',
 'whey cheese': '003440',
 'brined cheese': '003406',
 'processed cheese': '002202',
 'feta': '003448',
 'streched curd cheese': '003432',
 'pasta filata': '003432',
 'plastic curd cheese': '003432',
 'pulled-curd cheese': '003432',
 'mozzarella': '003452',
 'caciocavallo': '003467',
 'ricotta': '003456',
 'brocciu': '003443',
 'schabziger': '003457',
 'sapsago': '003521',
 'sérac': '003457',
 'ripened cheese': '003428',
 'soft cheese': '003459',
 'semi soft cheese': '003458',
 'semi hard cheese': '003458',
 'hard cheese': '003450',
 'extra hard cheese': '003489',
 'american cheese': '002319',
 'velveeta': '002633',
 'the laughing cow': '002329',
 'la vache qui rit': '002329',
 'fromage blanc': '003561',
 'quark': '002568',
 'quarg': '002568',
 'cottage cheese': '002408',
 'chhena': '002393',
 'chhana': '

In [60]:
# Number of tokenised sentences collected from all three corpus splits.
len(documents)

2079

In [54]:
# Train Word2Vec on the tokenised sentences; min_count=1 keeps words that
# occur only once.
model = Word2Vec(documents, min_count=1)


In [56]:
# Vocabulary size of the trained model.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; gensim 4 replaced it
# with `wv.key_to_index` -- confirm the installed version.
len(model.wv.vocab)

5721

In [66]:
# Embedding coverage of the habitat vocabulary:
#   a     -- number of space-separated tokens over all habitat_map keys
#   count -- how many of those tokens have no vector in the model
count = 0
a = 0
for hb in habitat_map:
    for w in hb.split(' '):
        a += 1
        # Membership test on the keyed vectors instead of indexing the model
        # inside a bare `except:`, which would also hide unrelated errors.
        if w not in model.wv:
            count += 1

  


In [68]:
# Total number of tokens counted over all habitat_map keys.
a

9583

In [67]:
# Number of habitat tokens for which the model lookup failed.
count

4815

In [73]:
# Shape of a single word vector.  Index the keyed vectors (`model.wv`)
# rather than the model itself: indexing the model is deprecated and emits
# the warning visible in the original output.
model.wv['cheese'].shape

  """Entry point for launching an IPython kernel.


(100,)

In [None]:
# Predict habitat ids for every dev document and write one prediction file
# per document under ./dev_preds/.
annotations_test = {}
# open(..., "w") does not create missing directories.
os.makedirs("./dev_preds/", exist_ok=True)
for a1_name, a2_name, txt_name in zip(BB_dev_a1, BB_dev_a2, BB_dev_txt):
    with open(dev_path + a1_name, encoding="utf8") as a1_handle, \
            open(dev_path + txt_name, encoding="utf8") as txt_handle:
        a1 = a1_handle.read()
        txt = txt_handle.read()
    # The prediction file reuses the name of the gold .a2 file.
    pred_file_name = "./dev_preds/" + a2_name
    # `predict_habitats` is not defined anywhere in this notebook; the
    # predictor defined above is `predict_habitats_exact_matching`.
    annotations_test[a1_name] = predict_habitats_exact_matching(
        a1, txt, a1_name, phrase_reducer, pred_file_name
    )

In [None]:
# Display the predictions collected for the dev documents.
annotations_test

In [None]:
def test_habitats(a1, a2, processor):
    
    a2 = a2.split("\n")
    a1 = a1.split("\n")
    match_list = {}
    for annotation in a2:
        a1_line = 0
        t = ""
        oid = ""
        #print(annotation)
        if "OntoBiotope" in annotation:
            t = re.findall(r"(?<=Annotation:)T[0-9]+", annotation)[0]
            oid = re.findall(r"(?<=Referent:OBT:)[0-9]+", annotation)[0]
            named_entity = ""
            while t not in a1[a1_line] and a1_line < len(a1)-1:
                a1_line += 1
            if "Habitat" in a1[a1_line]:
                named_entity = (a1[a1_line].split("\t")[2])
                if named_entity != named_entity.upper():
                    named_entity = named_entity.lower()
                    named_entity = processor(named_entity)
                match_list[t] = [oid, named_entity]
    return match_list

In [None]:
# Gold habitat annotations of every dev document, keyed by .a1 file name so
# they line up with the keys of annotations_test.  Files are closed right
# after they are read.
dev_a2 = {}
for a1_name, a2_name in zip(BB_dev_a1, BB_dev_a2):
    with open(dev_path + a1_name, encoding="utf8") as a1_handle, \
            open(dev_path + a2_name, encoding="utf8") as a2_handle:
        dev_a2[a1_name] = test_habitats(
            a1_handle.read(), a2_handle.read(), phrase_reducer
        )

In [None]:
# Score the dev predictions against the gold annotations.
#   total_count      -- number of gold habitat annotations
#   true_pred_count  -- predictions whose id equals the gold id
#   false_pred_count -- all other predictions, including predictions for an
#                       entity that has no gold entry
true_pred_count = 0
false_pred_count = 0
total_count = 0
for key, gold in dev_a2.items():
    total_count += len(gold)
    for pred, predicted in annotations_test.get(key, {}).items():
        predicted_oid = predicted[0]
        # .get() keeps the loop alive when an entity was predicted but has
        # no gold annotation; it is then counted as a false prediction.
        gold_entry = gold.get(pred)
        gold_oid = gold_entry[0] if gold_entry is not None else None
        if predicted_oid == gold_oid:
            true_pred_count += 1
            continue

        false_pred_count += 1
        print(key)
        print("Prediction:", predicted, " Real:", gold_entry)
        # Ontology names for both ids; None when an id is not in the ontology.
        print("Prediction:", habitat_map_originals.get(predicted_oid), " Real:", habitat_map_originals.get(gold_oid))

In [None]:
# Recall = correct predictions / gold annotations.  Reports 0.0 instead of
# raising ZeroDivisionError when there are no gold annotations.
recall = true_pred_count / total_count if total_count else 0.0
recall

In [None]:
# Precision = correct predictions / all predictions.  Reports 0.0 instead of
# raising ZeroDivisionError when nothing was predicted.
predicted_total = true_pred_count + false_pred_count
precision = true_pred_count / predicted_total if predicted_total else 0.0
precision