In [5]:
import os, re
from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import treebank
from nltk.corpus import dependency_treebank
from nltk.grammar import DependencyGrammar
from nltk.parse import DependencyGraph
import pprint
from nltk.parse.corenlp import CoreNLPDependencyParser

In [6]:
# Shared NLP helpers used by the functions defined in later cells.
ps = PorterStemmer()  # used by stem_phrase
wnl = WordNetLemmatizer()  # used by lemmatize_phrase
# Client pointed at a CoreNLP server on localhost:9000; used by get_headword.
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

In [7]:
class Habitat:
    """Simple holder for a habitat's ontology id and name.

    NOTE(review): the only reference to this class in the visible notebook is
    a commented-out line in generate_habitat_map — confirm it is still needed.
    """
    oid = -1  # class-level default id
    name = ""  # class-level default name

In [8]:
# Dataset directories, relative to the notebook's working directory.
train_path = "./BioNLP-OST-2019_BB-norm_train/"
test_path = "./BioNLP-OST-2019_BB-norm_test/"
dev_path = "./BioNLP-OST-2019_BB-norm_dev/"
# Raw directory listings; split by file extension in a later cell.
BB_train = os.listdir(train_path)
BB_test = os.listdir(test_path)
BB_dev = os.listdir(dev_path)
# OntoBiotope ontology file, read into `ontology` in the next cell.
obp_file = "OntoBiotope_BioNLP-OST-2019.obo"

In [9]:
# Load the whole ontology file as one string. The context manager closes the
# file handle deterministically instead of leaving it to the garbage collector.
with open(obp_file, encoding="utf8") as ontology_file:
    ontology = ontology_file.read()

In [10]:
# Split each directory listing by file suffix. Every list is sorted, and the
# loops in later cells index these lists in parallel by position.
BB_train_a1 = sorted(fname for fname in BB_train if fname.endswith("a1"))
BB_train_a2 = sorted(fname for fname in BB_train if fname.endswith("a2"))
BB_train_txt = sorted(fname for fname in BB_train if fname.endswith("txt"))

BB_dev_a1 = sorted(fname for fname in BB_dev if fname.endswith("a1"))
BB_dev_a2 = sorted(fname for fname in BB_dev if fname.endswith("a2"))
BB_dev_txt = sorted(fname for fname in BB_dev if fname.endswith("txt"))

# No a2 list is built for the test split.
BB_test_a1 = sorted(fname for fname in BB_test if fname.endswith("a1"))
BB_test_txt = sorted(fname for fname in BB_test if fname.endswith("txt"))

In [11]:
def original_phrase(phrase):
    """Identity processor: return ``phrase`` exactly as received.

    Has the same call shape as stem_phrase / lemmatize_phrase so it can be
    used wherever a phrase processor is expected.
    """
    unchanged = phrase
    return unchanged

In [12]:
def stem_phrase(phrase):
    """Stem each token of ``phrase`` with the shared Porter stemmer ``ps``.

    The phrase is split on single spaces, every piece is passed through
    ``ps.stem``, and the pieces are joined back with single spaces.
    """
    stemmed_tokens = [ps.stem(token) for token in phrase.split(" ")]
    return " ".join(stemmed_tokens)

In [13]:
def lemmatize_phrase(phrase):
    """Lemmatize each token of ``phrase`` with the shared WordNet lemmatizer ``wnl``.

    The phrase is split on single spaces, every piece is passed through
    ``wnl.lemmatize``, and the pieces are joined back with single spaces.
    """
    lemmas = [wnl.lemmatize(token) for token in phrase.split(" ")]
    return " ".join(lemmas)

In [14]:
def get_headword(phrase):
    """Return the head word of ``phrase`` taken from its dependency parse.

    The phrase is whitespace-tokenised and sent to the shared ``dep_parser``.
    The head word is the governor word of the first dependency triple of the
    first parse.

    Returns:
        ``""`` when ``phrase`` equals its own upper-cased form (acronyms,
        digits, the empty string); the governor word of the first triple when
        one is available; otherwise ``phrase`` unchanged (for example when the
        parse yields no triples).

    Errors raised by the parser call itself are not caught here.
    """
    # Phrases without any lowercase character are not sent to the parser.
    if phrase == phrase.upper():
        return ""

    parses = dep_parser.parse(phrase.split())

    # Only the first triple of the first parse is needed, so stop there
    # instead of materialising every triple of every parse.
    first_parse = next(iter(parses), None)
    if first_parse is None:
        return phrase
    first_triple = next(iter(first_parse.triples()), None)
    if not first_triple:
        return phrase

    # A triple is (governor, relation, dependent) and governor is (word, tag).
    # Only malformed shapes are tolerated; anything else is a real error.
    try:
        head_word = first_triple[0][0]
    except (IndexError, TypeError):
        return phrase
    return phrase if head_word is None else head_word
    
    

In [15]:
# Phrase processor used for the rest of the notebook; any callable with the
# same shape (original_phrase, stem_phrase) can be assigned here instead.
phrase_reducer = lemmatize_phrase

In [16]:
def generate_habitat_map(processor, ontology_text=None):
    """Build lookup tables from the OntoBiotope ontology text.

    Args:
        processor: callable applied to each lower-cased label (e.g. a stemmer
            or lemmatizer). Labels that equal their upper-cased form are kept
            verbatim and never passed to ``processor``.
        ontology_text: ontology file contents. Defaults to the module-level
            ``ontology`` string when omitted.

    Returns:
        ``(habitat_map, habitat_map_originals)`` where ``habitat_map`` maps a
        normalised term name or EXACT synonym to its id string, and
        ``habitat_map_originals`` maps an id string to the term's normalised
        name (the processed form, not the raw ontology spelling).

    Raises:
        IndexError: if a ``[Term]`` stanza does not start with an
            ``id: OBT:<digits>`` line followed by a ``name: ...`` line.
    """
    if ontology_text is None:
        ontology_text = ontology

    def normalize(label):
        # All-uppercase labels (acronyms) are kept untouched.
        if label != label.upper():
            return processor(label.lower())
        return label

    habitat_map = {}
    habitat_map_originals = {}
    # Everything before the first "[Term]" stanza is header material.
    for stanza in ontology_text.split("\n\n[Term]\n")[1:]:
        lines = stanza.split("\n")
        oid = re.findall(r"(?<=id: OBT:)[0-9]+", lines[0])[0]
        name = normalize(re.findall(r"(?<=name: ).+", lines[1])[0])

        # Primary names always win, even over an earlier term's synonym.
        habitat_map[name] = oid
        habitat_map_originals[oid] = name

        for line in lines:
            # Only EXACT synonyms are used; check that first so other
            # synonym kinds are never run through the processor.
            if "EXACT" not in line:
                continue
            quoted = re.search(r"(?<=synonym: \").+(?=\")", line)
            if quoted is None:
                # Not a well-formed synonym line; skip instead of crashing.
                continue
            synonym_name = normalize(quoted.group(0))
            # Synonyms never override an existing entry.
            if synonym_name not in habitat_map:
                habitat_map[synonym_name] = oid
    return habitat_map, habitat_map_originals

In [17]:
# Build the module-level lookup tables. habitat_map is extended later by
# add_habitats and read by predict_habitats.
habitat_map, habitat_map_originals = generate_habitat_map(phrase_reducer)

In [18]:
def add_habitats(a1, a2, processor, habitat_lookup=None):
    """Add annotated habitat mentions of one document to the lookup table.

    For each OntoBiotope annotation in ``a2``, the referenced entity is looked
    up in ``a1``. If it is a Habitat whose normalised text is not yet a key of
    the lookup table, it is added with the annotated id.

    Args:
        a1: contents of the entity file; tab-separated lines read as
            ``<id>\\t<type and offsets>\\t<text>``.
        a2: contents of the normalisation file; relevant lines contain
            ``OntoBiotope``, ``Annotation:T<n>`` and ``Referent:OBT:<digits>``.
        processor: callable applied to the lower-cased entity text. Text that
            equals its upper-cased form is kept verbatim.
        habitat_lookup: dict to update in place. Defaults to the module-level
            ``habitat_map`` when omitted.

    Returns:
        None; the lookup table is mutated in place.
    """
    target = habitat_map if habitat_lookup is None else habitat_lookup

    # Index entity lines by their exact ID. The previous substring scan let
    # "T1" match a "T10" line, and fell through to the last line of the file
    # when the ID was absent.
    entities = {}
    for line in a1.split("\n"):
        fields = line.split("\t")
        if len(fields) >= 3:
            entities.setdefault(fields[0], fields)

    for annotation in a2.split("\n"):
        if "OntoBiotope" not in annotation:
            continue
        t_found = re.search(r"(?<=Annotation:)T[0-9]+", annotation)
        oid_found = re.search(r"(?<=Referent:OBT:)[0-9]+", annotation)
        if t_found is None or oid_found is None:
            continue

        fields = entities.get(t_found.group(0))
        # Test the type field itself, not the whole line, so entity text that
        # merely contains the word "Habitat" is not mistaken for one.
        if fields is None or fields[1].split(" ")[0] != "Habitat":
            continue

        named_entity = fields[2]
        if named_entity != named_entity.upper():
            named_entity = processor(named_entity.lower())
        # Existing entries (ontology names/synonyms, earlier documents) win.
        if named_entity not in target:
            target[named_entity] = oid_found.group(0)

In [19]:
# Extend habitat_map with every annotated habitat mention of the training set.
# The a1/a2 lists are sorted, so position i refers to the same document.
for i in range(len(BB_train_txt)):
    # Context managers close each file promptly instead of leaking the handle.
    with open(train_path + BB_train_a1[i], encoding="utf8") as a1_file:
        a1 = a1_file.read()
    with open(train_path + BB_train_a2[i], encoding="utf8") as a2_file:
        a2 = a2_file.read()
    add_habitats(a1, a2, phrase_reducer)

In [45]:
def predict_habitats(a1, txt, a1_name, processor, pred_file_name, habitat_lookup=None):
    """Predict an ontology id for each Habitat entity of one document.

    Each Habitat entity is first looked up by its normalised text ("exact").
    If that fails, the normalised head word from ``get_headword`` is looked up
    ("headwordexact"). Every match is written to ``pred_file_name`` as one
    ``OntoBiotope Annotation:<id>\\tReferent:OBT:<oid>`` line, and head-word
    matches are also printed.

    Args:
        a1: contents of the entity file; tab-separated lines read as
            ``<id>\\t<type and offsets>\\t<text>``.
        txt: unused; kept for call compatibility.
        a1_name: unused; kept for call compatibility.
        processor: callable applied to lower-cased text before lookup. Text
            that equals its upper-cased form is looked up verbatim.
        pred_file_name: path of the prediction file to (over)write.
        habitat_lookup: dict mapping normalised text to id. Defaults to the
            module-level ``habitat_map`` when omitted.

    Returns:
        dict mapping entity id to ``[oid, normalised text, "exact"]`` or
        ``[oid, lower-cased text, "headwordexact", normalised head word]``.
    """
    lookup = habitat_map if habitat_lookup is None else habitat_lookup
    match_list = {}

    # The context manager closes the file even if parsing or lookup raises.
    with open(pred_file_name, "w") as pred_file:
        for line in a1.split("\n"):
            fields = line.split("\t")
            # Test the type field itself so that, for instance, a Title line
            # whose text contains "Habitat" is not treated as a candidate.
            if len(fields) < 3 or fields[1].split(" ")[0] != "Habitat":
                continue

            entity_id = fields[0]
            name = fields[2]
            named_entity = name
            if name != name.upper():
                name = name.lower()
                named_entity = processor(name)

            if named_entity in lookup:
                match_list[entity_id] = [lookup[named_entity], named_entity, "exact"]
            else:
                # Fall back to the head word of the mention.
                named_entity = processor(get_headword(name))
                if named_entity not in lookup:
                    continue
                match = name + " - " + lookup[named_entity] + "---headwordexact"
                print(match)
                match_list[entity_id] = [lookup[named_entity], name, "headwordexact", named_entity]

            pred_file.write("OntoBiotope Annotation:" + entity_id + "\tReferent:OBT:" + match_list[entity_id][0] + "\n")
    return match_list

In [46]:
# Predict habitats for every dev document. Results are keyed by a1 file name,
# and one prediction file per document is written under ./dev_preds/.
annotations_test = {}
# Create the output directory up front so a fresh checkout does not fail on open().
os.makedirs("./dev_preds/", exist_ok=True)
for i in range(len(BB_dev_txt)):
    # Context managers close each input file promptly instead of leaking the handle.
    with open(dev_path + BB_dev_a1[i], encoding="utf8") as a1_file:
        a1 = a1_file.read()
    with open(dev_path + BB_dev_txt[i], encoding="utf8") as txt_file:
        txt = txt_file.read()
    pred_file_name = "./dev_preds/" + BB_dev_a2[i]
    annotations_test[BB_dev_a1[i]] = predict_habitats(a1, txt, BB_dev_a1[i], phrase_reducer, pred_file_name)

gastric mucosal-associated lymphoid tissue - 000196---headwordexact
monoclonal b cells - 000061---headwordexact
gastric malt - 003215---headwordexact
patients with atypical lymphoid infiltrates - 003220---headwordexact
patients with low-grade malt lymphoma - 003220---headwordexact
patients with helicobacter pylori-chronic active gastritis - 003220---headwordexact
patients with high-grade primary gastric lymphoma - 003220---headwordexact
patients with chronic active gastritis - 003220---headwordexact
cd20-positive cells - 000061---headwordexact
lymphoepithelial lesions - 000332---headwordexact
monoclonal b cells - 000061---headwordexact
gastric malt - 003215---headwordexact
chicken nugget processing plant - 000393---headwordexact
chicken nugget - 002729---headwordexact
chicken nugget - 002729---headwordexact
chicken nugget processing plant - 000393---headwordexact
selective agars - 000031---headwordexact
h. pylori-positive subjects - 002488---headwordexact
h. pylori-negative subjects - 

In [25]:
# Show the predictions collected per dev file (entity id -> match details).
annotations_test

{'BB-norm-10496597.a1': {'T3': ['001792', 'gastric', 'exact'],
  'T4': ['001577', 'gastric mucosa', 'exact'],
  'T5': ['000196',
   'gastric mucosal-associated lymphoid tissue',
   'headwordexact',
   'tissue'],
  'T7': ['001792', 'gastric', 'exact'],
  'T9': ['000061', 'monoclonal b cells', 'headwordexact', 'cell'],
  'T11': ['001792', 'gastric', 'exact'],
  'T12': ['003215', 'gastric malt', 'headwordexact', 'malt'],
  'T13': ['003220',
   'patients with atypical lymphoid infiltrates',
   'headwordexact',
   'patient'],
  'T14': ['003220',
   'patients with low-grade malt lymphoma',
   'headwordexact',
   'patient'],
  'T15': ['003220',
   'patients with helicobacter pylori-chronic active gastritis',
   'headwordexact',
   'patient'],
  'T16': ['003220',
   'patients with high-grade primary gastric lymphoma',
   'headwordexact',
   'patient'],
  'T17': ['001792', 'gastric', 'exact'],
  'T23': ['003220',
   'patients with chronic active gastritis',
   'headwordexact',
   'patient'],
  

In [1]:
def test_habitats(a1, a2, processor):
    
    a2 = a2.split("\n")
    a1 = a1.split("\n")
    match_list = {}
    for annotation in a2:
        a1_line = 0
        t = ""
        oid = ""
        #print(annotation)
        if "OntoBiotope" in annotation:
            t = re.findall(r"(?<=Annotation:)T[0-9]+", annotation)[0]
            oid = re.findall(r"(?<=Referent:OBT:)[0-9]+", annotation)[0]
            named_entity = ""
            while t not in a1[a1_line] and a1_line < len(a1)-1:
                a1_line += 1
            if "Habitat" in a1[a1_line]:
                named_entity = (a1[a1_line].split("\t")[2])
                if named_entity != named_entity.upper():
                    named_entity = named_entity.lower()
                    named_entity = processor(named_entity)
                match_list[t] = [oid, named_entity]
    return match_list

In [195]:
# Gold annotations for every dev document, keyed by a1 file name so they line
# up with the keys of annotations_test.
dev_a2 = {}
for i in range(len(BB_dev_txt)):
    # Context managers close each file promptly instead of leaking the handle.
    with open(dev_path + BB_dev_a1[i], encoding="utf8") as a1_file:
        a1 = a1_file.read()
    with open(dev_path + BB_dev_a2[i], encoding="utf8") as a2_file:
        a2 = a2_file.read()
    dev_a2[BB_dev_a1[i]] = test_habitats(a1, a2, phrase_reducer)

In [201]:
# Compare predictions with gold annotations, document by document.
#   total_count      - number of gold-annotated habitat entities
#   true_pred_count  - predictions whose id equals the gold id
#   false_pred_count - every other prediction (printed for inspection)
true_pred_count = 0
false_pred_count = 0
total_count = 0
for key in dev_a2:
    total_count += len(dev_a2[key])
    if key in annotations_test:
        for pred in annotations_test[key]:
            predicted = annotations_test[key][pred]
            # A predicted entity may have no gold entry; count it as a wrong
            # prediction instead of failing with KeyError.
            gold = dev_a2[key].get(pred)
            if gold is not None and predicted[0] == gold[0]:
                true_pred_count += 1
            else:
                gold_name = habitat_map_originals.get(gold[0]) if gold is not None else None
                print(key)
                print("Prediction:", predicted, " Real:", gold)
                print("Prediction:", habitat_map_originals.get(predicted[0]), " Real:", gold_name)
                false_pred_count += 1

BB-norm-10496597.a1
Prediction: ['000196', 'gastric mucosal-associated lymphoid tissue', 'headwordexact', 'tissue']  Real: ['000334', 'gastric mucosal-associated lymphoid tissue']
Prediction: animal tissue  Real: lymphatic system part
BB-norm-10496597.a1
Prediction: ['000061', 'monoclonal b cells', 'headwordexact', 'cell']  Real: ['001623', 'monoclonal b cell']
Prediction: cell  Real: lymphocyte
BB-norm-10496597.a1
Prediction: ['003215', 'gastric malt', 'headwordexact', 'malt']  Real: ['000334', 'gastric malt']
Prediction: malt  Real: lymphatic system part
BB-norm-10496597.a1
Prediction: ['003220', 'patients with helicobacter pylori-chronic active gastritis', 'headwordexact', 'patient']  Real: ['003269', 'patient with helicobacter pylori-chronic active gastritis']
Prediction: patient  Real: patient with infectious disease
BB-norm-10496597.a1
Prediction: ['000061', 'cd20-positive cells', 'headwordexact', 'cell']  Real: ['001623', 'cd20-positive cell']
Prediction: cell  Real: lymphocyte


In [197]:
# Share of gold-annotated habitats that were predicted correctly.
# Reported as 0.0 when there are no gold annotations, to avoid ZeroDivisionError.
recall = true_pred_count / total_count if total_count else 0.0
recall

0.4967213114754098

In [198]:
# Share of emitted predictions that were correct.
# Reported as 0.0 when nothing was predicted, to avoid ZeroDivisionError.
prediction_count = true_pred_count + false_pred_count
precision = true_pred_count / prediction_count if prediction_count else 0.0
precision

0.6703539823008849

## Denemeler

In [23]:
# Number of parsed sentences exposed by the NLTK treebank corpus reader.
len(treebank.parsed_sents())

3914

In [24]:
# Distinct grammar productions used across all treebank parse trees.
tb_prod = {
    rule
    for tree in treebank.parsed_sents()
    for rule in tree.productions()
}

In [25]:
# Display the collected productions (large output).
tb_prod

{SBAR -> WHADVP-3 S,
 NP-SBJ -> NNP NNP NNP NNP NNP NNP,
 VBZ -> 'factors',
 JJ -> 'sweeping',
 NN -> 'lesson',
 VP -> VBG PP-CLR , SBAR-PRP,
 VP -> VBZ SBAR , S-ADV,
 ADJP -> JJ RB,
 NN -> 'information',
 NN -> 'subsidiary',
 NP-PRD -> NP CC NP,
 CD -> '3.80',
 NNS -> 'Bonds',
 VP -> VB ADVP-MNR NP,
 NP -> DT NNP CD NNP,
 JJ -> 'Washington-based',
 VB -> 'say',
 VBZ -> 'reports',
 NN -> 'month',
 -NONE- -> '*-94',
 NNP -> 'Roosevelt',
 VBN -> 'rationed',
 VBD -> 'hired',
 S-1 -> NP-SBJ PRN VP . '',
 NNP -> 'Hartford',
 CD -> '170,000',
 JJ -> 'larger-than-normal',
 PP-DIR -> IN NP-TMP,
 CD -> '1920s',
 NN -> 'cardboard',
 PP-CLR -> IN `` ADJP,
 NNS -> 'clothes',
 NP-SBJ-2 -> NP , VP ,,
 NN -> 'commodity',
 VB -> 'Put',
 JJ -> 'secondary',
 JJ -> 'enlarged',
 VBN -> 'UPHELD',
 -NONE- -> '*T*-13',
 -NONE- -> '*-111',
 CD -> '149',
 -NONE- -> '*-130',
 VP -> VBD PP-DIR PP,
 JJ -> 'six-bottle',
 VBD -> 'included',
 NNP -> 'N.J.',
 VP -> VBP NP-2 PP,
 CD -> '77.70',
 VB -> 'write',
 VB -> 

In [26]:
# Second sentence of the dependency treebank in 3-column CoNLL form
# (word, tag, head index), as the printed output shows.
bb = dependency_treebank.parsed_sents()[1].to_conll(3)
print(bb)

Mr.	NNP	2
Vinken	NNP	3
is	VBZ	0
chairman	NN	3
of	IN	4
Elsevier	NNP	7
N.V.	NNP	12
,	,	12
the	DT	12
Dutch	NNP	12
publishing	VBG	12
group	NN	5
.	.	3



In [27]:
# Rebuild a dependency graph from the CoNLL text and pretty-print it as a tree.
dg = DependencyGraph(bb)
dg.tree().pprint()

(is
  (Vinken Mr.)
  (chairman (of (group (N.V. Elsevier) , the Dutch publishing)))
  .)


In [28]:
from nltk.parse import CoreNLPParser

In [29]:
# Parser client pointed at the same localhost:9000 server URL as dep_parser.
parser = CoreNLPParser(url='http://localhost:9000')

In [30]:
# Smoke test: parse one whitespace-tokenised sentence and show the resulting tree(s).
list(parser.parse('What is the airspeed of an unladen swallow ?'.split()))

[Tree('ROOT', [Tree('SBARQ', [Tree('WHNP', [Tree('WP', ['What'])]), Tree('SQ', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['airspeed'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['an']), Tree('JJ', ['unladen'])])]), Tree('S', [Tree('VP', [Tree('VB', ['swallow'])])])])]), Tree('.', ['?'])])])]

In [31]:
from nltk.parse.corenlp import CoreNLPDependencyParser

In [32]:
# NOTE(review): repeats the dep_parser assignment from the setup cell near the
# top of the notebook with the same URL — looks redundant; confirm and remove.
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

In [49]:
# Sample phrase for inspecting the dependency triples the parser produces.
sentence1 = "birds supplemented with the n-hexane extract"

In [50]:
# Parse the whitespace-tokenised sample phrase, as get_headword does.
parses1 = dep_parser.parse(sentence1.split())

In [51]:
# One list of (governor, relation, dependent) triples per parse; same
# expression as the one used inside get_headword.
aaaa1 = [[(governor, dep, dependent) for governor, dep, dependent in parse.triples()] for parse in parses1]

In [52]:
# Triples of the first parse; the first triple's governor word ('birds' in the
# output below) is what get_headword returns for this phrase.
aaaa1[0]


[(('birds', 'NNS'), 'acl', ('supplemented', 'VBN')),
 (('supplemented', 'VBN'), 'nmod', ('extract', 'NN')),
 (('extract', 'NN'), 'case', ('with', 'IN')),
 (('extract', 'NN'), 'det', ('the', 'DT')),
 (('extract', 'NN'), 'compound', ('n-hexane', 'NN'))]