In [9]:
import os, re
from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import treebank
from nltk.corpus import dependency_treebank
from nltk.grammar import DependencyGrammar
from nltk.parse import DependencyGraph
import pprint
from nltk.parse.corenlp import CoreNLPDependencyParser
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec

In [10]:
ps = PorterStemmer()
wnl = WordNetLemmatizer()
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

In [11]:
class Habitat:
    oid = -1
    name = ""

In [12]:
train_path = "./BioNLP-OST-2019_BB-norm_train/"
test_path = "./BioNLP-OST-2019_BB-norm_test/"
dev_path = "./BioNLP-OST-2019_BB-norm_dev/"
BB_train = os.listdir(train_path)
BB_test = os.listdir(test_path)
BB_dev = os.listdir(dev_path)
obp_file = "OntoBiotope_BioNLP-OST-2019.obo"

In [13]:
ontology = open(obp_file,encoding="utf8").read()

In [14]:
BB_train_a1 = sorted([name for name in BB_train if name.endswith("a1")])
BB_train_a2 = sorted([name for name in BB_train if name.endswith("a2")])
BB_train_txt = sorted([name for name in BB_train if name.endswith("txt")])

BB_dev_a1 = sorted([name for name in BB_dev if name.endswith("a1")])
BB_dev_a2 = sorted([name for name in BB_dev if name.endswith("a2")])
BB_dev_txt = sorted([name for name in BB_dev if name.endswith("txt")])

BB_test_a1 = sorted([name for name in BB_test if name.endswith("a1")])
BB_test_txt = sorted([name for name in BB_test if name.endswith("txt")])

In [15]:
def original_phrase(phrase):
    return phrase

In [16]:
def stem_phrase(phrase):
    #print(type(phrase))
    #print(phrase)
    
    phrase = phrase.split(" ")
    for i in range(len(phrase)):
        phrase[i] = ps.stem(phrase[i])
    phrase = " ".join(phrase)
    return phrase

In [17]:
def lemmatize_phrase(phrase):
    #print(type(phrase))
    #print(phrase)
    
    phrase = phrase.split(" ")
    for i in range(len(phrase)):
        phrase[i] = wnl.lemmatize(phrase[i])
    phrase = " ".join(phrase)
    return phrase

In [18]:
def get_headword(phrase):
    #print(phrase)
    if phrase == phrase.upper():
        return ""
    
    parses = dep_parser.parse(phrase.split())
    triples = [[(governor, dep, dependent) for governor, dep, dependent in parse.triples()] for parse in parses]
    
    try:
        if(len(triples) > 0 and len(triples[0]) > 0 and len(triples[0][0]) > 0 and len(triples[0][0][0]) > 0 and triples[0][0][0][0] is not None):
            return triples[0][0][0][0]
        else:
            return phrase
    except:
        return phrase
    
    

In [19]:
phrase_reducer = lemmatize_phrase

In [20]:
def generate_habitat_map(processor):
    habitat_list = ontology.split("\n\n[Term]\n")[1:]
    habitat_map = {}
    habitat_map_originals = {}
    for h in habitat_list:
        #new_habitat = Habitat()
        h = h.split("\n")
        oid = re.findall(r"(?<=id: OBT:)[0-9]+", h[0])[0]
        name = (re.findall(r"(?<=name: ).+", h[1])[0])
        if name != name.upper():
            name = name.lower()
            name = processor(name)
    
        habitat_map[name] = oid
        habitat_map_originals[oid] = name
    
        for h_line in h:
            if "synonym" in h_line:
                synonym_name = (re.findall(r"(?<=synonym: \").+(?=\")", h_line)[0])
                if synonym_name != synonym_name.upper():
                    synonym_name = synonym_name.lower()
                    synonym_name = processor(synonym_name)
                if "EXACT" in h_line and synonym_name not in habitat_map:
                    habitat_map[synonym_name] = oid
    return habitat_map, habitat_map_originals

In [21]:
habitat_map, habitat_map_originals = generate_habitat_map(phrase_reducer)

In [22]:
def add_habitats(a1, a2, processor):
    a2 = a2.split("\n")
    a1 = a1.split("\n")
    
    for annotation in a2:
        a1_line = 0
        t = ""
        oid = ""
        #print(annotation)
        if "OntoBiotope" in annotation:
            t = re.findall(r"(?<=Annotation:)T[0-9]+", annotation)[0]
            oid = re.findall(r"(?<=Referent:OBT:)[0-9]+", annotation)[0]
            named_entity = ""
            
            while t not in a1[a1_line] and a1_line < len(a1)-1:
                a1_line += 1
            if "Habitat" in a1[a1_line]: 
                named_entity = (a1[a1_line].split("\t")[2])
                if named_entity != named_entity.upper():
                    named_entity = named_entity.lower()
                    named_entity = processor(named_entity)
                if named_entity not in habitat_map:
                    habitat_map[named_entity] = oid
                    #print("Newly added entity:", named_entity, oid)
                    #print("Original entity:", habitat_map_originals[oid])

In [23]:
for i in range(len(BB_train_txt)):    
    a1 = open(train_path + BB_train_a1[i],encoding="utf8").read()
    a2 = open(train_path + BB_train_a2[i],encoding="utf8").read()
    add_habitats(a1, a2, phrase_reducer)

In [24]:
def predict_habitats_exact_matching(a1, txt, a1_name, processor, pred_file_name):
    pred_file = open(pred_file_name, "w")
    cands = 0
    matches = 0
    match_list = {}
    a1 = a1.split("\n")
    count = 1
    for line in a1:
        found = False
        if "Habitat" in line:
            cands += 1
            name = (line.split("\t")[2])
            named_entity = name
            if name != name.upper():
                name = name.lower()
                named_entity = processor(name)
            if named_entity in habitat_map:
                matches += 1
                match = name + " - " + habitat_map[named_entity] + "---exact"
                match_list[line.split("\t")[0]] = [habitat_map[named_entity], named_entity, "exact"]
                found = True
            else:
                #print(name)
                pre_named_entity = get_headword(name)
                #print(pre_named_entity)
                named_entity = processor(pre_named_entity)
                if named_entity in habitat_map:
                    matches += 1
                    match = name + " - " + habitat_map[named_entity] + "---headwordexact"
                    print(match)
                    match_list[line.split("\t")[0]] = [habitat_map[named_entity], name, "headwordexact", named_entity]
                    found = True
        if found:
            pred_file.write("N" + str(count) +"\tOntoBiotope Annotation:" + line.split("\t")[0] + " Referent:OBT:" + match_list[line.split("\t")[0]][0] + "\n")
            count += 1
    #print(a1_name)
    #print("Out of", cands, "candidates,", matches, "matches found")
    #print()
    pred_file.close()
    return match_list

In [25]:
def add_documents(txt, processor):
    document = []
    txt = processor(txt)
    sentence_list = sent_tokenize(txt)
    
    for sentence in sentence_list:
        word_list = word_tokenize(sentence)
        word_list = [processor(word) for word in word_list if word not in stopwords.words("english") and word.isalnum()]
        for i in range(len(word_list)):
            if word_list[i].isnumeric():
                word_list[i] = "##number##"
        document.append(word_list)
    #print(document)
    return document

In [31]:
documents = []
for i in range(len(BB_train_txt)):
    txt = open(train_path + BB_train_txt[i],encoding="utf8").read()
    documents += add_documents(txt, phrase_reducer)

In [34]:
len(documents)

892

In [33]:
model = Word2Vec(documents)


In [37]:
print(model['yavuz'])

  """Entry point for launching an IPython kernel.


KeyError: "word 'yavuz' not in vocabulary"

In [61]:
annotations_test = {}
for i in range(len(BB_dev_txt)):
    a1 = open(dev_path + BB_dev_a1[i],encoding="utf8").read()
    txt = open(dev_path + BB_dev_txt[i],encoding="utf8").read()
    pred_file_name = "./dev_preds/" + BB_dev_a2[i]
    annotations_test[BB_dev_a1[i]] = predict_habitats(a1, txt, BB_dev_a1[i], phrase_reducer, pred_file_name)

gastric mucosal-associated lymphoid tissue - 000196---headwordexact
monoclonal b cells - 000061---headwordexact
gastric malt - 003215---headwordexact
patients with atypical lymphoid infiltrates - 003220---headwordexact
patients with low-grade malt lymphoma - 003220---headwordexact
patients with helicobacter pylori-chronic active gastritis - 003220---headwordexact
patients with high-grade primary gastric lymphoma - 003220---headwordexact
patients with chronic active gastritis - 003220---headwordexact
cd20-positive cells - 000061---headwordexact
lymphoepithelial lesions - 000332---headwordexact
monoclonal b cells - 000061---headwordexact
gastric malt - 003215---headwordexact
chicken nugget processing plant - 000393---headwordexact
chicken nugget - 002729---headwordexact
chicken nugget - 002729---headwordexact
chicken nugget processing plant - 000393---headwordexact
selective agars - 000031---headwordexact
h. pylori-positive subjects - 002488---headwordexact
h. pylori-negative subjects - 

In [62]:
annotations_test

{'BB-norm-10496597.a1': {'T3': ['001792', 'gastric', 'exact'],
  'T4': ['001577', 'gastric mucosa', 'exact'],
  'T5': ['000196',
   'gastric mucosal-associated lymphoid tissue',
   'headwordexact',
   'tissue'],
  'T7': ['001792', 'gastric', 'exact'],
  'T9': ['000061', 'monoclonal b cells', 'headwordexact', 'cell'],
  'T11': ['001792', 'gastric', 'exact'],
  'T12': ['003215', 'gastric malt', 'headwordexact', 'malt'],
  'T13': ['003220',
   'patients with atypical lymphoid infiltrates',
   'headwordexact',
   'patient'],
  'T14': ['003220',
   'patients with low-grade malt lymphoma',
   'headwordexact',
   'patient'],
  'T15': ['003220',
   'patients with helicobacter pylori-chronic active gastritis',
   'headwordexact',
   'patient'],
  'T16': ['003220',
   'patients with high-grade primary gastric lymphoma',
   'headwordexact',
   'patient'],
  'T17': ['001792', 'gastric', 'exact'],
  'T23': ['003220',
   'patients with chronic active gastritis',
   'headwordexact',
   'patient'],
  

In [63]:
def test_habitats(a1, a2, processor):
    
    a2 = a2.split("\n")
    a1 = a1.split("\n")
    match_list = {}
    for annotation in a2:
        a1_line = 0
        t = ""
        oid = ""
        #print(annotation)
        if "OntoBiotope" in annotation:
            t = re.findall(r"(?<=Annotation:)T[0-9]+", annotation)[0]
            oid = re.findall(r"(?<=Referent:OBT:)[0-9]+", annotation)[0]
            named_entity = ""
            while t not in a1[a1_line] and a1_line < len(a1)-1:
                a1_line += 1
            if "Habitat" in a1[a1_line]:
                named_entity = (a1[a1_line].split("\t")[2])
                if named_entity != named_entity.upper():
                    named_entity = named_entity.lower()
                    named_entity = processor(named_entity)
                match_list[t] = [oid, named_entity]
    return match_list

In [64]:
dev_a2 = {}
for i in range(len(BB_dev_txt)):
    a1 = open(dev_path + BB_dev_a1[i],encoding="utf8").read()
    a2 = open(dev_path + BB_dev_a2[i],encoding="utf8").read()
    dev_a2[BB_dev_a1[i]] = test_habitats(a1,a2, phrase_reducer)

In [65]:
true_pred_count = 0
false_pred_count = 0
total_count = 0
for key in dev_a2:
    """print()
    print(key)
    print(dev_a2[key])
    print("**************")"""
    total_count += len(dev_a2[key])
    if key in annotations_test:
        for pred in annotations_test[key]:            
            if annotations_test[key][pred][0] == dev_a2[key][pred][0]:
                true_pred_count += 1
            else:
                print(key)
                print("Prediction:", annotations_test[key][pred]," Real:", dev_a2[key][pred])
                print("Prediction:", habitat_map_originals[annotations_test[key][pred][0]], " Real:", habitat_map_originals[dev_a2[key][pred][0]])
                false_pred_count += 1

BB-norm-10496597.a1
Prediction: ['000196', 'gastric mucosal-associated lymphoid tissue', 'headwordexact', 'tissue']  Real: ['000334', 'gastric mucosal-associated lymphoid tissue']
Prediction: animal tissue  Real: lymphatic system part
BB-norm-10496597.a1
Prediction: ['000061', 'monoclonal b cells', 'headwordexact', 'cell']  Real: ['001623', 'monoclonal b cell']
Prediction: cell  Real: lymphocyte
BB-norm-10496597.a1
Prediction: ['003215', 'gastric malt', 'headwordexact', 'malt']  Real: ['000334', 'gastric malt']
Prediction: malt  Real: lymphatic system part
BB-norm-10496597.a1
Prediction: ['003220', 'patients with helicobacter pylori-chronic active gastritis', 'headwordexact', 'patient']  Real: ['003269', 'patient with helicobacter pylori-chronic active gastritis']
Prediction: patient  Real: patient with infectious disease
BB-norm-10496597.a1
Prediction: ['000061', 'cd20-positive cells', 'headwordexact', 'cell']  Real: ['001623', 'cd20-positive cell']
Prediction: cell  Real: lymphocyte


In [39]:
recall = true_pred_count/total_count
recall

0.4967213114754098

In [40]:
precision = true_pred_count/(true_pred_count + false_pred_count)
precision

0.6703539823008849

## Denemeler

In [None]:
len(treebank.parsed_sents())

In [None]:
tb_prod = set([production for parsed_sent in treebank.parsed_sents() for production in parsed_sent.productions()])

In [None]:
tb_prod

In [None]:
bb = dependency_treebank.parsed_sents()[1].to_conll(3)
print(bb)

In [None]:
dg = DependencyGraph(bb)
dg.tree().pprint()

In [None]:
from nltk.parse import CoreNLPParser

In [None]:
parser = CoreNLPParser(url='http://localhost:9000')

In [None]:
list(parser.parse('What is the airspeed of an unladen swallow ?'.split()))

In [None]:
from nltk.parse.corenlp import CoreNLPDependencyParser

In [None]:
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

In [None]:
sentence1 = "birds supplemented with the n-hexane extract"

In [None]:
parses1 = dep_parser.parse(sentence1.split())

In [None]:
aaaa1 = [[(governor, dep, dependent) for governor, dep, dependent in parse.triples()] for parse in parses1]

In [None]:
aaaa1[0]
