In [1]:
import os, re
from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import treebank
from nltk.corpus import dependency_treebank
from nltk.grammar import DependencyGrammar
from nltk.parse import DependencyGraph
import pprint
from nltk.parse.corenlp import CoreNLPDependencyParser
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
import numpy as np
import re
from numpy.linalg import norm
from nltk.stem import LancasterStemmer

from nltk.stem.snowball import SnowballStemmer




In [2]:
# Shared text-normalisation objects used by the phrase-reducer functions below.
ps = PorterStemmer()
wnl = WordNetLemmatizer()
lancaster=LancasterStemmer()
sb = SnowballStemmer("english")
# Dependency-parser client pointed at the URL below; get_headword() calls it,
# so a CoreNLP server has to be reachable there when that function runs.
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

In [3]:
class Habitat:
    """Container with two class-level defaults.

    The only visible reference to this class is a commented-out instantiation
    inside generate_habitat_map.
    """
    oid = -1  # default identifier
    name = ""  # default name

In [4]:
# Directories of the three dataset splits, relative to the working directory.
train_path = "./BioNLP-OST-2019_BB-norm_train/"
test_path = "./BioNLP-OST-2019_BB-norm_test/"
dev_path = "./BioNLP-OST-2019_BB-norm_dev/"
# Raw directory listings; they are filtered by suffix and sorted in a later cell.
BB_train = os.listdir(train_path)
BB_test = os.listdir(test_path)
BB_dev = os.listdir(dev_path)
# Ontology file; its "[Term]" stanzas are parsed by generate_habitat_map.
obp_file = "OntoBiotope_BioNLP-OST-2019.obo"

In [5]:
# Read the whole ontology file into memory. The context manager closes the
# handle as soon as the text has been read instead of leaving it to the
# garbage collector.
with open(obp_file, encoding="utf8") as ontology_file:
    ontology = ontology_file.read()

In [6]:
# Split every directory listing into sorted per-suffix file lists. The lists
# are later indexed in parallel, so position i in each list is expected to
# belong to the same document.
# Note: the suffix test has no leading dot, i.e. any name ending in "a1",
# "a2" or "txt" is accepted.
BB_train_a1, BB_train_a2, BB_train_txt = (
    sorted(fname for fname in BB_train if fname.endswith(suffix))
    for suffix in ("a1", "a2", "txt")
)

BB_dev_a1, BB_dev_a2, BB_dev_txt = (
    sorted(fname for fname in BB_dev if fname.endswith(suffix))
    for suffix in ("a1", "a2", "txt")
)

# The test split is only listed for .a1 and .txt files here.
BB_test_a1, BB_test_txt = (
    sorted(fname for fname in BB_test if fname.endswith(suffix))
    for suffix in ("a1", "txt")
)

In [7]:
def l_stem_phrase(phrase):
    """Lancaster-stem every space-separated token of ``phrase``.

    Tokens that equal their own upper-cased form (all-caps tokens, digit-only
    tokens, empty strings) are passed through untouched.

    Returns the tokens re-joined with single spaces.
    """
    reduced = [
        token if token == token.upper() else lancaster.stem(token)
        for token in phrase.split(" ")
    ]
    return " ".join(reduced)

In [8]:
def s_stem_phrase(phrase):
    """Snowball-stem every space-separated token of ``phrase``.

    Tokens that equal their own upper-cased form (all-caps tokens, digit-only
    tokens, empty strings) are passed through untouched.

    Returns the tokens re-joined with single spaces.
    """
    reduced = [
        token if token == token.upper() else sb.stem(token)
        for token in phrase.split(" ")
    ]
    return " ".join(reduced)

In [9]:
def original_phrase(phrase):
    """Identity reducer: return ``phrase`` unchanged.

    Has the same one-argument signature as the stemming/lemmatizing reducers,
    so it can be assigned to ``phrase_reducer`` in their place.
    """
    return phrase

In [10]:
def p_stem_phrase(phrase):
    """Porter-stem every space-separated token of ``phrase``.

    Tokens that equal their own upper-cased form (all-caps tokens, digit-only
    tokens, empty strings) are passed through untouched.

    Returns the tokens re-joined with single spaces.
    """
    reduced = [
        token if token == token.upper() else ps.stem(token)
        for token in phrase.split(" ")
    ]
    return " ".join(reduced)

In [11]:
def lemmatize_phrase(phrase):
    """Lemmatize every space-separated token of ``phrase`` with WordNet.

    Tokens that equal their own upper-cased form (all-caps tokens, digit-only
    tokens, empty strings) are passed through untouched.

    Returns the tokens re-joined with single spaces.
    """
    reduced = [
        token if token == token.upper() else wnl.lemmatize(token)
        for token in phrase.split(" ")
    ]
    return " ".join(reduced)

In [12]:
def switch_to_lower(phrase):
    """Lower-case the tokens of ``phrase`` while keeping all-caps tokens.

    The phrase is split on single spaces; a token is lower-cased only when it
    differs from its own upper-cased form, so tokens such as "MALT" or "16S"
    survive unchanged.
    """
    lowered = [
        token if token == token.upper() else token.lower()
        for token in phrase.split(" ")
    ]
    return " ".join(lowered)

In [13]:
# Reducer applied everywhere a `processor` argument is passed further down;
# swap in one of the *_stem_phrase functions or original_phrase to compare.
phrase_reducer = lemmatize_phrase

In [14]:
def get_headword(phrase):
    """Return the head word of ``phrase`` taken from its dependency parse.

    The phrase is split on whitespace and sent to the module-level
    ``dep_parser``. The head word is the word of the governor in the first
    triple of the first parse.

    Parameters
    ----------
    phrase : str
        Phrase to analyse.

    Returns
    -------
    str
        * ``""`` when the phrase equals its own upper-cased form (all-caps
          phrases, including the empty string); the parser is not called.
        * the governor word of the first dependency triple when one exists.
        * ``phrase`` itself when the parse yields no usable triple.

    Errors raised by the parser call itself (e.g. the server being
    unreachable) are not caught here.
    """
    if phrase == phrase.upper():
        return ""

    parses = dep_parser.parse(phrase.split())
    triples = [list(parse.triples()) for parse in parses]

    # Only the lookups can legitimately fail (no parse, no triple, malformed
    # governor), so only those errors trigger the fallback. A bare ``except``
    # would also swallow KeyboardInterrupt and SystemExit.
    try:
        head = triples[0][0][0][0]
    except (IndexError, KeyError, TypeError):
        return phrase

    return head if head is not None else phrase
    
    

In [15]:
def add_documents(txt, processor):
    """Turn raw text into tokenised sentences for Word2Vec training.

    Each sentence found by ``sent_tokenize`` is processed as follows:

    1. hyphens are replaced by spaces and the sentence is word-tokenised;
    2. tokens that are English stop words (case-sensitive comparison) or
       that are not purely alphanumeric are dropped;
    3. purely numeric tokens become the placeholder ``"##number##"``;
    4. every token that differs from its upper-cased form is lower-cased and
       passed through ``processor``. The number placeholder also takes this
       path, because it is not equal to its upper-cased form.

    Parameters
    ----------
    txt : str
        Document text.
    processor : callable
        One-argument reducer applied to each lower-cased token.

    Returns
    -------
    list[list[str]]
        One token list per sentence.
    """
    # Build the stop-word lookup once per call; the corpus reader returns a
    # fresh list on every call, so asking for it per token is needlessly slow.
    stop_words = set(stopwords.words("english"))

    document = []
    for sentence in sent_tokenize(txt):
        tokens = word_tokenize(sentence.replace('-', ' '))
        tokens = [tok for tok in tokens if tok not in stop_words and tok.isalnum()]
        for i, tok in enumerate(tokens):
            if tok.isnumeric():
                tok = "##number##"
            if tok != tok.upper():
                tok = processor(tok.lower())
            tokens[i] = tok
        document.append(tokens)
    return document

In [16]:
# Build the Word2Vec training corpus from the raw text of all three splits.
# One loop over (directory, file list) pairs replaces three copy-pasted loops,
# and every file is closed as soon as it has been read.
documents = []
for corpus_dir, txt_names in (
    (train_path, BB_train_txt),
    (dev_path, BB_dev_txt),
    (test_path, BB_test_txt),
):
    for txt_name in txt_names:
        with open(corpus_dir + txt_name, encoding="utf8") as txt_file:
            txt = txt_file.read()
        documents += add_documents(txt, phrase_reducer)

In [17]:
# Train word vectors on the tokenised sentences; min_count=1 keeps every
# token, including ones that occur only once.
# NOTE(review): no seed is passed here, so the vectors (and every score
# derived from them) may differ between runs — confirm whether that matters.
model = Word2Vec(documents, min_count=1)

In [18]:
def get_phrase_embedding(phrase, model, processor):
    """Embed ``phrase`` as the mean of the word vectors of its content words.

    The phrase is normalised in the same way as the training sentences in
    ``add_documents``, except that the stop-word comparison here is done on
    the lower-cased token:

    1. hyphens are replaced by spaces and the phrase is word-tokenised;
    2. English stop words and non-alphanumeric tokens are dropped;
    3. purely numeric tokens become ``"##number##"``;
    4. tokens that differ from their upper-cased form are lower-cased and
       passed through ``processor``.

    Parameters
    ----------
    phrase : str
        Text to embed.
    model : object
        Trained embedding model; only ``model.wv`` is used (membership test
        and vector lookup by token).
    processor : callable
        One-argument reducer applied to each lower-cased token.

    Returns
    -------
    numpy.ndarray or int
        The averaged vector, or the int sentinel ``-1`` when no token
        survives filtering or when any surviving token is missing from the
        vocabulary. Callers distinguish the two cases by type.
    """
    # Build the stop-word lookup once instead of once per token.
    stop_words = set(stopwords.words("english"))

    tokens = nltk.word_tokenize(phrase.replace('-', ' '))
    tokens = [tok for tok in tokens if tok.lower() not in stop_words and tok.isalnum()]
    for i, tok in enumerate(tokens):
        if tok.isnumeric():
            tok = "##number##"
        if tok != tok.upper():
            tok = processor(tok.lower())
        tokens[i] = tok

    # Membership is tested directly on the keyed vectors; copying the whole
    # vocabulary into a list for every token made each lookup linear in the
    # vocabulary size.
    keyed_vectors = model.wv
    if not tokens or any(tok not in keyed_vectors for tok in tokens):
        return -1

    # Accumulate in float64, sized from the first token's vector.
    embedded_sum = np.zeros(len(keyed_vectors[tokens[0]]))
    for tok in tokens:
        embedded_sum = embedded_sum + np.array(keyed_vectors[tok])

    # A head-word weighted variant was sketched here earlier but was disabled;
    # the plain average is what is returned.
    return embedded_sum / len(tokens)
    

In [19]:
def generate_habitat_map(processor, model):
    """Build the lookup tables for ontology terms from the global ``ontology`` text.

    The text is split into ``[Term]`` stanzas. The first line of a stanza is
    expected to carry the ``id: OBT:<digits>`` field and the second line the
    ``name:`` field.

    Parameters
    ----------
    processor : callable
        One-argument phrase reducer applied to lower-cased names and synonyms.
    model : object
        Embedding model forwarded to ``get_phrase_embedding``.

    Returns
    -------
    tuple[dict, dict, dict]
        ``habitat_map``           : normalised name or EXACT synonym -> term id
        ``habitat_map_originals`` : term id -> normalised term name (the same
                                    string that keys the other two dicts)
        ``habitat_embeddings``    : normalised name or EXACT synonym -> vector,
                                    present only when the term *name* could be
                                    embedded

    Names and synonyms equal to their upper-cased form are stored verbatim;
    all others are lower-cased with ``switch_to_lower`` and then reduced with
    ``processor``.
    """
    habitat_map = {}
    habitat_map_originals = {}
    habitat_embeddings = {}

    for stanza in ontology.split("\n\n[Term]\n")[1:]:
        stanza_lines = stanza.split("\n")
        oid = re.findall(r"(?<=id: OBT:)[0-9]+", stanza_lines[0])[0]
        name = re.findall(r"(?<=name: ).+", stanza_lines[1])[0]
        if name != name.upper():
            name = processor(switch_to_lower(name))

        habitat_map[name] = oid
        habitat_map_originals[oid] = name
        embedding = get_phrase_embedding(name, model, processor)
        has_embedding = isinstance(embedding, np.ndarray)
        if has_embedding:
            habitat_embeddings[name] = embedding

        # Original author's note (translated): "word2vec should be added here".
        # NOTE(review): synonyms currently reuse the embedding of the term
        # name rather than getting one of their own — confirm this is intended.
        for stanza_line in stanza_lines:
            if "synonym" not in stanza_line:
                continue
            quoted = re.search(r"synonym: \"(.+)\"", stanza_line)
            if quoted is None:
                # The word "synonym" appears (e.g. inside a comment or
                # definition) but the line is not a quoted synonym field.
                continue
            synonym_name = quoted.group(1)
            if synonym_name != synonym_name.upper():
                synonym_name = processor(switch_to_lower(synonym_name))
            if "EXACT" in stanza_line and synonym_name not in habitat_map:
                habitat_map[synonym_name] = oid
                if has_embedding:
                    habitat_embeddings[synonym_name] = embedding

    return habitat_map, habitat_map_originals, habitat_embeddings

In [20]:
# Module-level lookup tables built from the ontology; the functions defined
# further down read these globals, and add_habitats also extends
# habitat_map and habitat_embeddings in place.
habitat_map, habitat_map_originals, habitat_embeddings = generate_habitat_map(phrase_reducer, model)

  


In [21]:
def add_habitats(a1, a2, processor, model):
    """Extend the global lookup tables with the gold annotations of one document.

    For every ``OntoBiotope`` line in ``a2`` whose referenced entity is a
    ``Habitat`` line in ``a1``:

    * the normalised entity text (hyphens replaced by spaces, lower-cased and
      reduced unless it is all upper-case) is added to the global
      ``habitat_map`` if it is not already there;
    * if the referenced ontology term has no embedding of its own, the
      embedding of the entity text is appended to a list stored under the
      term's name in the global ``habitat_embeddings``. These lists are
      averaged by a later cell.

    Parameters
    ----------
    a1 : str
        Content of the entity file; tab-separated ``id``, ``type offsets``
        and ``text`` fields per line.
    a2 : str
        Content of the normalisation file.
    processor : callable
        One-argument phrase reducer.
    model : object
        Embedding model forwarded to ``get_phrase_embedding``.

    Returns
    -------
    None
        The function only mutates ``habitat_map`` and ``habitat_embeddings``.
    """
    # Index entity lines by their exact id. A substring search over the whole
    # line would let "T3" match a title such as "T3 secretion system" (or the
    # id "T30"), and would fall through to the last line when the id is absent.
    entities = {}
    for entity_line in a1.split("\n"):
        fields = entity_line.split("\t")
        if len(fields) >= 3:
            entities.setdefault(fields[0], fields)

    for annotation in a2.split("\n"):
        if "OntoBiotope" not in annotation:
            continue
        t = re.findall(r"(?<=Annotation:)T[0-9]+", annotation)[0]
        oid = re.findall(r"(?<=Referent:OBT:)[0-9]+", annotation)[0]

        fields = entities.get(t)
        if fields is None or fields[1].split(" ")[0] != "Habitat":
            continue

        named_entity = fields[2].replace('-', ' ')
        if named_entity != named_entity.upper():
            named_entity = processor(switch_to_lower(named_entity))
        if named_entity not in habitat_map:
            habitat_map[named_entity] = oid

        concept_key = habitat_map_originals[oid]
        stored = habitat_embeddings.get(concept_key)
        if stored is not None and not isinstance(stored, list):
            # The ontology name already has its own embedding; leave it alone.
            continue

        emb = get_phrase_embedding(named_entity, model, processor)
        if not isinstance(emb, np.ndarray):
            # Do not store the -1 sentinel: it would be summed into (or crash)
            # the averaging step that runs after all documents are processed.
            print("no embedding for", named_entity)
            continue
        habitat_embeddings.setdefault(concept_key, []).append(emb)

In [22]:
# Feed the gold annotations of every training document into the lookup
# tables. The .a1/.a2 lists are sorted, so zipping them pairs the two files
# of the same document; each file is closed right after it is read.
for a1_name, a2_name in zip(BB_train_a1, BB_train_a2):
    with open(train_path + a1_name, encoding="utf8") as a1_file:
        a1 = a1_file.read()
    with open(train_path + a2_name, encoding="utf8") as a2_file:
        a2 = a2_file.read()
    add_habitats(a1, a2, phrase_reducer, model)

  


In [23]:
# Collapse every list of collected entity vectors into a single mean vector.
# Non-array entries (the -1 "no embedding" sentinel) are ignored instead of
# being summed into the mean, and keys left without any usable vector are
# removed so that later similarity code only ever sees arrays.
for key in list(habitat_embeddings):
    collected = habitat_embeddings[key]
    if not isinstance(collected, list):
        continue
    usable = [vec for vec in collected if isinstance(vec, np.ndarray)]
    if usable:
        habitat_embeddings[key] = np.mean(usable, axis=0)
    else:
        del habitat_embeddings[key]

In [24]:
def predict_habitats_exact_matching(a1, txt, a1_name, processor, pred_file_name):
    """Predict ontology ids for Habitat entities by dictionary lookup only.

    Every ``Habitat`` line of ``a1`` is normalised (lower-cased and reduced
    unless it is all upper-case) and looked up in the global ``habitat_map``.
    If that fails, the head word returned by ``get_headword`` is reduced and
    looked up instead. Entities matched by neither step get no prediction.

    Parameters
    ----------
    a1 : str
        Content of the entity file; tab-separated ``id``, ``type offsets``
        and ``text`` fields per line.
    txt : str
        Unused; kept for signature compatibility.
    a1_name : str
        Unused; kept for signature compatibility.
    processor : callable
        One-argument phrase reducer.
    pred_file_name : str
        Path of the prediction file to write (overwritten).

    Returns
    -------
    dict
        entity id -> ``[oid, normalised_entity, "exact"]`` or
        ``[oid, lowered_name, "headwordexact", head_word]``.

    Side effects: writes one ``N<k>\\tOntoBiotope Annotation:<id>
    Referent:OBT:<oid>`` line per prediction and prints every head-word match.
    """
    match_list = {}
    count = 1
    # The context manager guarantees the file is closed even if a lookup raises.
    with open(pred_file_name, "w") as pred_file:
        for line in a1.split("\n"):
            fields = line.split("\t")
            # Test the type field itself; a substring test on the whole line
            # would also pick up titles or entities whose text mentions
            # "Habitat".
            if len(fields) < 3 or fields[1].split(" ")[0] != "Habitat":
                continue
            entity_id = fields[0]
            name = fields[2]
            named_entity = name
            if name != name.upper():
                name = switch_to_lower(name)
                named_entity = processor(name)

            if named_entity in habitat_map:
                match_list[entity_id] = [habitat_map[named_entity], named_entity, "exact"]
            else:
                named_entity = processor(get_headword(name))
                if named_entity not in habitat_map:
                    continue
                print(name + " - " + habitat_map[named_entity] + "---headwordexact")
                match_list[entity_id] = [habitat_map[named_entity], name, "headwordexact", named_entity]

            pred_file.write("N" + str(count) +"\tOntoBiotope Annotation:" + entity_id + " Referent:OBT:" + match_list[entity_id][0] + "\n")
            count += 1
    return match_list

In [25]:
def predict_habitats(a1, txt, a1_name, processor, pred_file_name, model):
    """Predict an ontology id for every Habitat entity of one document.

    Three strategies are tried in order for each ``Habitat`` line of ``a1``:

    1. **exact** – the normalised entity text is a key of the global
       ``habitat_map``;
    2. **headwordexact** – the reduced head word (``get_headword``) is a key
       of ``habitat_map``;
    3. **w2v** – the entity embedding is compared by cosine similarity with
       every vector in the global ``habitat_embeddings`` and the most similar
       key wins (the first key wins ties).

    Entities that cannot be embedded in step 3 are reported on stdout and get
    no prediction.

    Parameters
    ----------
    a1 : str
        Content of the entity file; tab-separated ``id``, ``type offsets``
        and ``text`` fields per line.
    txt : str
        Unused; kept for signature compatibility.
    a1_name : str
        Document name, used only in the "no embedding" message.
    processor : callable
        One-argument phrase reducer.
    pred_file_name : str
        Path of the prediction file to write (overwritten).
    model : object
        Embedding model forwarded to ``get_phrase_embedding``.

    Returns
    -------
    dict
        entity id -> one of
        ``[oid, normalised_entity, "exact"]``,
        ``[oid, lowered_name, "headwordexact", head_word]``,
        ``[oid, entity_text, "w2v", cosine_similarity]``.
    """
    match_list = {}
    count = 1
    # The context manager guarantees the file is closed even if a lookup raises.
    with open(pred_file_name, "w") as pred_file:
        for line in a1.split("\n"):
            fields = line.split("\t")
            # Test the type field itself; a substring test on the whole line
            # would also pick up titles or entities whose text mentions
            # "Habitat".
            if len(fields) < 3 or fields[1].split(" ")[0] != "Habitat":
                continue
            entity_id, raw_name = fields[0], fields[2]

            name = raw_name
            named_entity = name
            if name != name.upper():
                name = switch_to_lower(name)
                named_entity = processor(name)

            if named_entity in habitat_map:
                prediction = [habitat_map[named_entity], named_entity, "exact"]
            else:
                named_entity = processor(get_headword(name))
                if named_entity in habitat_map:
                    prediction = [habitat_map[named_entity], name, "headwordexact", named_entity]
                else:
                    # Fall back to embedding similarity on the original text.
                    named_entity = raw_name.replace("-", " ")
                    entity_vec = get_phrase_embedding(named_entity, model, processor)
                    if isinstance(entity_vec, int):
                        print("Doc:", a1_name," no embedding for:", named_entity)
                        continue

                    entity_norm = norm(entity_vec)
                    best_habitat = None
                    largest_prod = None
                    for habitat, habitat_vec in habitat_embeddings.items():
                        prod = np.dot(entity_vec, habitat_vec)/(entity_norm*norm(habitat_vec))
                        if best_habitat is None or prod > largest_prod:
                            best_habitat = habitat
                            largest_prod = prod
                    if best_habitat is None:
                        # No reference vectors to compare against.
                        continue
                    prediction = [habitat_map[best_habitat], named_entity, "w2v", largest_prod]

            match_list[entity_id] = prediction
            pred_file.write("N" + str(count) +"\tOntoBiotope Annotation:" + entity_id + " Referent:OBT:" + prediction[0] + "\n")
            count += 1
    return match_list

In [26]:
# Predict every development document and keep the per-document results,
# keyed by .a1 file name, for the evaluation cell.
annotations_test = {}
# predict_habitats opens its output for writing, which fails when the target
# directory is missing, so create it up front.
os.makedirs("./dev_preds/", exist_ok=True)
for a1_name, a2_name, txt_name in zip(BB_dev_a1, BB_dev_a2, BB_dev_txt):
    with open(dev_path + a1_name, encoding="utf8") as a1_file:
        a1 = a1_file.read()
    with open(dev_path + txt_name, encoding="utf8") as txt_file:
        txt = txt_file.read()
    # The prediction file reuses the gold .a2 file name inside ./dev_preds/.
    pred_file_name = "./dev_preds/" + a2_name
    annotations_test[a1_name] = predict_habitats(a1, txt, a1_name, phrase_reducer, pred_file_name, model)

  


In [27]:
def test_habitats(a1, a2, processor):
    
    a2 = a2.split("\n")
    a1 = a1.split("\n")
    match_list = {}
    for annotation in a2:
        a1_line = 0
        t = ""
        oid = ""
        #print(annotation)
        if "OntoBiotope" in annotation:
            t = re.findall(r"(?<=Annotation:)T[0-9]+", annotation)[0]
            oid = re.findall(r"(?<=Referent:OBT:)[0-9]+", annotation)[0]
            named_entity = ""
            while t not in a1[a1_line] and a1_line < len(a1)-1:
                a1_line += 1
            if "Habitat" in a1[a1_line]:
                named_entity = (a1[a1_line].split("\t")[2])
                if named_entity != named_entity.upper():
                    named_entity = switch_to_lower(named_entity)
                    named_entity = processor(named_entity)
                match_list[t] = [oid, named_entity]
    return match_list

In [28]:
# Gold annotations of the development split, keyed by .a1 file name so they
# line up with the keys of annotations_test. Each file is closed right after
# it is read.
dev_a2 = {}
for a1_name, a2_name in zip(BB_dev_a1, BB_dev_a2):
    with open(dev_path + a1_name, encoding="utf8") as a1_file:
        a1 = a1_file.read()
    with open(dev_path + a2_name, encoding="utf8") as a2_file:
        a2 = a2_file.read()
    dev_a2[a1_name] = test_habitats(a1, a2, phrase_reducer)

In [29]:
# Compare predictions with the gold annotations of the development split.
#   total_count      - number of gold annotations over all documents
#   true_pred_count  - predictions whose ontology id equals the gold id
#   false_pred_count - every other prediction
# Details are printed for predictions made by the embedding ("w2v") strategy.
true_pred_count = 0
false_pred_count = 0
total_count = 0
for key in dev_a2:
    total_count += len(dev_a2[key])
    if key not in annotations_test:
        continue
    for pred in annotations_test[key]:
        predicted = annotations_test[key][pred]
        # A predicted entity without a gold entry counts as a wrong
        # prediction instead of raising KeyError.
        gold = dev_a2[key].get(pred)
        is_correct = gold is not None and predicted[0] == gold[0]
        if is_correct:
            true_pred_count += 1
        else:
            false_pred_count += 1

        if gold is not None and predicted[2] == "w2v":
            print(key)
            print(is_correct)
            print("Word:", predicted[1]," Prediction:", habitat_map_originals[predicted[0]], " Real:", habitat_map_originals[gold[0]], " Type:", predicted[2])
            print("Cosine Similarity:", predicted[3])
            print()

BB-norm-10496597.a1
False
Word: gastric mucosal  Prediction: gastric acid  Real: gastric mucosa  Type: w2v
Cosine Similarity: 0.9963446853800444

BB-norm-10496597.a1
False
Word: MALT  Prediction: blood serum  Real: lymphatic system part  Type: w2v
Cosine Similarity: 0.7174385242722474

BB-norm-10496597.a1
False
Word: gastric MALT  Prediction: gastric body  Real: lymphatic system part  Type: w2v
Cosine Similarity: 0.9949707402309331

BB-norm-10496597.a1
False
Word: MALT  Prediction: blood serum  Real: lymphatic system part  Type: w2v
Cosine Similarity: 0.7174385242722474

BB-norm-10496597.a1
False
Word: CD20  Prediction: young adult  Real: lymphocyte  Type: w2v
Cosine Similarity: 0.8136720998929088

BB-norm-10496597.a1
False
Word: CD3  Prediction: vancomycin tolerant  Real: lymphocyte  Type: w2v
Cosine Similarity: 0.7110639146621243

BB-norm-10496597.a1
False
Word: MALT  Prediction: blood serum  Real: lymphatic system part  Type: w2v
Cosine Similarity: 0.7174385242722474

BB-norm-104965

Word: surface of Gobbeen cheese  Prediction: surface of cheese  Real: surface of cheese  Type: w2v
Cosine Similarity: 0.9998629884280106

BB-norm-F-20167385-011.a1
True
Word: Fontina cheese surface  Prediction: surface of cheese  Real: surface of cheese  Type: w2v
Cosine Similarity: 0.9998785167485225

BB-norm-F-20167385-011.a1
True
Word: Gorgonzola rind  Prediction: cheese rind  Real: cheese rind  Type: w2v
Cosine Similarity: 0.996326724455542

BB-norm-F-20167385-011.a1
False
Word: Taleggio rind  Prediction: food rind  Real: cheese rind  Type: w2v
Cosine Similarity: 0.9960254750872725

BB-norm-F-20167385-011.a1
True
Word: surface of Gorgonzola cheeses  Prediction: surface of cheese  Real: surface of cheese  Type: w2v
Cosine Similarity: 0.9998769304406441

BB-norm-F-20167385-011.a1
True
Word: surface of Scimudin cheeses  Prediction: surface of cheese  Real: surface of cheese  Type: w2v
Cosine Similarity: 0.999860689314432

BB-norm-F-20167385-011.a1
False
Word: Scimudin surfaces  Predic

In [30]:
# Recall: correct predictions divided by the number of gold annotations
# accumulated in total_count by the evaluation loop.
recall = true_pred_count/total_count
recall

0.5295081967213114

In [31]:
# Precision: correct predictions divided by all predictions that were scored
# (correct + incorrect) in the evaluation loop.
precision = true_pred_count/(true_pred_count + false_pred_count)
precision

0.5295081967213114

## Denemeler (Experiments)

In [32]:
# Scratch check: cosine similarity between the embeddings of two phrases that
# differ only in their last word's suffix.
# NOTE(review): get_phrase_embedding returns the int -1 when a phrase cannot
# be embedded; this expression does not guard against that case.
qwe = get_phrase_embedding("gastric mucosal",model,phrase_reducer)
ewq = get_phrase_embedding("gastric mucosa",model,phrase_reducer)
np.dot(qwe,ewq)/(norm(qwe)*norm(ewq))

  


0.9950891712699284