In [1]:
import os, re
from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import treebank
from nltk.corpus import dependency_treebank
from nltk.grammar import DependencyGrammar
from nltk.parse import DependencyGraph
import pprint

In [2]:
ps = PorterStemmer()
wnl = WordNetLemmatizer()

In [3]:
class Habitat:
    oid = -1
    name = ""

In [4]:
train_path = "./BioNLP-OST-2019_BB-norm_train/"
test_path = "./BioNLP-OST-2019_BB-norm_test/"
dev_path = "./BioNLP-OST-2019_BB-norm_dev/"
BB_train = os.listdir(train_path)
BB_test = os.listdir(test_path)
BB_dev = os.listdir(dev_path)
obp_file = "OntoBiotope_BioNLP-OST-2019.obo"

In [5]:
ontology = open(obp_file,encoding="utf8").read()

In [6]:
BB_train_a1 = sorted([name for name in BB_train if name.endswith("a1")])
BB_train_a2 = sorted([name for name in BB_train if name.endswith("a2")])
BB_train_txt = sorted([name for name in BB_train if name.endswith("txt")])

BB_dev_a1 = sorted([name for name in BB_dev if name.endswith("a1")])
BB_dev_a2 = sorted([name for name in BB_dev if name.endswith("a2")])
BB_dev_txt = sorted([name for name in BB_dev if name.endswith("txt")])

BB_test_a1 = sorted([name for name in BB_test if name.endswith("a1")])
BB_test_txt = sorted([name for name in BB_test if name.endswith("txt")])

In [7]:
def original_phrase(phrase):
    return phrase

In [8]:
def stem_phrase(phrase):
    #print(type(phrase))
    #print(phrase)
    
    phrase = phrase.split(" ")
    for i in range(len(phrase)):
        phrase[i] = ps.stem(phrase[i])
    phrase = " ".join(phrase)
    return phrase

In [9]:
def lemmatize_phrase(phrase):
    #print(type(phrase))
    #print(phrase)
    
    phrase = phrase.split(" ")
    for i in range(len(phrase)):
        phrase[i] = wnl.lemmatize(phrase[i])
    phrase = " ".join(phrase)
    return phrase

In [10]:
phrase_reducer = lemmatize_phrase

In [11]:
def generate_habitat_map(processor):
    habitat_list = ontology.split("\n\n[Term]\n")[1:]
    habitat_map = {}
    habitat_map_originals = {}
    for h in habitat_list:
        #new_habitat = Habitat()
        h = h.split("\n")
        oid = re.findall(r"(?<=id: OBT:)[0-9]+", h[0])[0]
        name = (re.findall(r"(?<=name: ).+", h[1])[0])
        if name != name.upper():
            name = name.lower()
            name = processor(name)
    
        habitat_map[name] = oid
        habitat_map_originals[oid] = name
    
        for h_line in h:
            if "synonym" in h_line:
                synonym_name = (re.findall(r"(?<=synonym: \").+(?=\")", h_line)[0])
                if synonym_name != synonym_name.upper():
                    synonym_name = synonym_name.lower()
                    synonym_name = processor(synonym_name)
                if "EXACT" in h_line and synonym_name not in habitat_map:
                    habitat_map[synonym_name] = oid
    return habitat_map, habitat_map_originals

In [12]:
habitat_map, habitat_map_originals = generate_habitat_map(phrase_reducer)

In [13]:
def add_habitats(a1, a2, processor):
    a2 = a2.split("\n")
    a1 = a1.split("\n")
    
    for annotation in a2:
        a1_line = 0
        t = ""
        oid = ""
        #print(annotation)
        if "OntoBiotope" in annotation:
            t = re.findall(r"(?<=Annotation:)T[0-9]+", annotation)[0]
            oid = re.findall(r"(?<=Referent:OBT:)[0-9]+", annotation)[0]
            named_entity = ""
            
            while t not in a1[a1_line] and a1_line < len(a1)-1:
                a1_line += 1
            if "Habitat" in a1[a1_line]: 
                named_entity = (a1[a1_line].split("\t")[2])
                if named_entity != named_entity.upper():
                    named_entity = named_entity.lower()
                    named_entity = processor(named_entity)
                if named_entity not in habitat_map:
                    habitat_map[named_entity] = oid
                    print("Newly added entity:", named_entity, oid)
                    print("Original entity:", habitat_map_originals[oid])

In [14]:
for i in range(len(BB_train_txt)):    
    a1 = open(train_path + BB_train_a1[i],encoding="utf8").read()
    a2 = open(train_path + BB_train_a2[i],encoding="utf8").read()
    add_habitats(a1, a2, phrase_reducer)

Newly added entity: selective broth based on hypertonic strontium chloride 000360
Original entity: nutrient broth
Newly added entity: selective broth based on the bi-selenite ion 000360
Original entity: nutrient broth
Newly added entity: egg product 001086
Original entity: egg and egg product
Newly added entity: egg pulp 001086
Original entity: egg and egg product
Newly added entity: strontium chloride m broth 000360
Original entity: nutrient broth
Newly added entity: strontium selenite broth 000360
Original entity: nutrient broth
Newly added entity: strontium selenite a broth 000360
Original entity: nutrient broth
Newly added entity: bi-selenite based medium 000360
Original entity: nutrient broth
Newly added entity: ozone 000477
Original entity: additive
Newly added entity: O3 000477
Original entity: additive
Newly added entity: food industry 001557
Original entity: food processing factory
Newly added entity: acid 000477
Original entity: additive
Newly added entity: surfactant 000477


Newly added entity: fish farm sediment 001430
Original entity: aquatic sediment
Newly added entity: polyclonal rabbit serum 000524
Original entity: blood serum
Newly added entity: intestinal tract of healthy fish 000641
Original entity: gastrointestinal tract
Newly added entity: healthy fish 002793
Original entity: fish
Newly added entity: sediment sample from diseased farm 001430
Original entity: aquatic sediment
Newly added entity: diseased farm 000630
Original entity: fish farm
Newly added entity: sediment sample from a disease-free fish farm 001430
Original entity: aquatic sediment
Newly added entity: disease-free fish farm 000630
Original entity: fish farm
Newly added entity: fish farming 000630
Original entity: fish farm
Newly added entity: rabbit serum 000524
Original entity: blood serum
Newly added entity: clinical material (amniotic fluid, intrauterine secretions, exudate of the pelvic dead space) of patient with various infection 000039
Original entity: animal part
Newly adde

Newly added entity: cheese  from dairy c 001480
Original entity: cheese
Newly added entity: cheese  from dairy a 001480
Original entity: cheese
Newly added entity: starter  culture 001030
Original entity: cheese starter culture
Newly added entity: cheese from dairy c 001480
Original entity: cheese
Newly added entity: cheese from dairy a 001480
Original entity: cheese
Newly added entity: cheese from  dairy b 001480
Original entity: cheese
Newly added entity: cheese from  dairy d 001480
Original entity: cheese
Newly added entity: cheese from dairy  b 001480
Original entity: cheese
Newly added entity: cheese from dairy d 001480
Original entity: cheese
Newly added entity: cheese from dairy b 001480
Original entity: cheese
Newly added entity: cheese from the farmhouse 001480
Original entity: cheese
Newly added entity: surface yeast microbiota 000113
Original entity: microflora
Newly added entity: yeast surface  microbiota 000113
Original entity: microflora
Newly added entity: yeast microbio

In [15]:
def predict_habitats(a1, txt, a1_name, processor):
    cands = 0
    matches = 0
    match_list = {}
    a1 = a1.split("\n")
    for line in a1:
        if "Habitat" in line:
            cands += 1
            name = (line.split("\t")[2])
            named_entity = name
            if name != name.upper():
                named_entity = name.lower()
                named_entity = processor(named_entity)
            if named_entity in habitat_map:
                matches += 1
                match = name + " - " + habitat_map[named_entity]
                match_list[line.split("\t")[0]] = [habitat_map[named_entity], named_entity]
    print(a1_name)
    print("Out of", cands, "candidates,", matches, "matches found")
    print()
    return match_list

In [16]:
annotations_test = {}
for i in range(len(BB_dev_txt)):
    a1 = open(dev_path + BB_dev_a1[i],encoding="utf8").read()
    txt = open(dev_path + BB_dev_txt[i],encoding="utf8").read() 
    annotations_test[BB_dev_a1[i]] = predict_habitats(a1, txt, BB_dev_a1[i], phrase_reducer)

BB-norm-10496597.a1
Out of 33 candidates, 9 matches found

BB-norm-11989773.a1
Out of 16 candidates, 5 matches found

BB-norm-12109661.a1
Out of 25 candidates, 16 matches found

BB-norm-1214327.a1
Out of 6 candidates, 3 matches found

BB-norm-1356998.a1
Out of 8 candidates, 7 matches found

BB-norm-14633026.a1
Out of 2 candidates, 0 matches found

BB-norm-15293611.a1
Out of 22 candidates, 14 matches found

BB-norm-15618837.a1
Out of 7 candidates, 2 matches found

BB-norm-16273411.a1
Out of 6 candidates, 3 matches found

BB-norm-16458564.a1
Out of 27 candidates, 13 matches found

BB-norm-16514151.a1
Out of 1 candidates, 0 matches found

BB-norm-18094887.a1
Out of 49 candidates, 27 matches found

BB-norm-18694716.a1
Out of 4 candidates, 0 matches found

BB-norm-18789156.a1
Out of 17 candidates, 10 matches found

BB-norm-19004249.a1
Out of 1 candidates, 1 matches found

BB-norm-19339076.a1
Out of 26 candidates, 18 matches found

BB-norm-19552770.a1
Out of 18 candidates, 2 matches found

B

In [17]:
annotations_test

{'BB-norm-10496597.a1': {'T3': ['001792', 'gastric'],
  'T4': ['001577', 'gastric mucosa'],
  'T7': ['001792', 'gastric'],
  'T11': ['001792', 'gastric'],
  'T17': ['001792', 'gastric'],
  'T26': ['001623', 'b cell'],
  'T27': ['001623', 't cell'],
  'T31': ['001792', 'gastric'],
  'T38': ['001792', 'gastric']},
 'BB-norm-11989773.a1': {'T6': ['003314', 'chicken'],
  'T12': ['000008', 'food'],
  'T13': ['003314', 'chicken'],
  'T21': ['000008', 'food'],
  'T22': ['000008', 'food']},
 'BB-norm-12109661.a1': {'T4': ['001792', 'gastric'],
  'T5': ['001792', 'gastric'],
  'T8': ['001792', 'gastric'],
  'T13': ['001792', 'gastric'],
  'T14': ['001577', 'gastric mucosa'],
  'T19': ['003188', 'child'],
  'T20': ['001577', 'gastric mucosa'],
  'T21': ['001792', 'gastric'],
  'T23': ['001792', 'gastric'],
  'T24': ['001792', 'gastric'],
  'T28': ['001792', 'gastric'],
  'T31': ['001792', 'gastric'],
  'T34': ['001792', 'gastric'],
  'T37': ['001792', 'gastric'],
  'T38': ['001577', 'gastric muc

In [18]:
def test_habitats(a1, a2, processor):
    a2 = a2.split("\n")
    a1 = a1.split("\n")
    match_list = {}
    for annotation in a2:
        a1_line = 0
        t = ""
        oid = ""
        #print(annotation)
        if "OntoBiotope" in annotation:
            t = re.findall(r"(?<=Annotation:)T[0-9]+", annotation)[0]
            oid = re.findall(r"(?<=Referent:OBT:)[0-9]+", annotation)[0]
            named_entity = ""
            while t not in a1[a1_line] and a1_line < len(a1)-1:
                a1_line += 1
            if "Habitat" in a1[a1_line]:
                named_entity = (a1[a1_line].split("\t")[2])
                if named_entity != named_entity.upper():
                    named_entity = named_entity.lower()
                    named_entity = processor(named_entity)
                match_list[t] = [oid, named_entity]
    return match_list

In [19]:
dev_a2 = {}
for i in range(len(BB_dev_txt)):
    a1 = open(dev_path + BB_dev_a1[i],encoding="utf8").read()
    a2 = open(dev_path + BB_dev_a2[i],encoding="utf8").read()
    dev_a2[BB_dev_a1[i]] = test_habitats(a1,a2, phrase_reducer)

In [20]:
true_pred_count = 0
false_pred_count = 0
total_count = 0
for key in dev_a2:
    """print()
    print(key)
    print(dev_a2[key])
    print("**************")"""
    total_count += len(dev_a2[key])
    if key in annotations_test:
        for pred in annotations_test[key]:            
            if annotations_test[key][pred][0] == dev_a2[key][pred][0]:
                true_pred_count += 1
            else:
                print(key)
                print("Prediction:", annotations_test[key][pred]," Real:", dev_a2[key][pred])
                false_pred_count += 1

BB-norm-11989773.a1
Prediction: ['003314', 'chicken']  Real: ['002394', 'chicken']
BB-norm-11989773.a1
Prediction: ['000008', 'food']  Real: ['000094', 'food']
BB-norm-11989773.a1
Prediction: ['003314', 'chicken']  Real: ['002394', 'chicken']
BB-norm-11989773.a1
Prediction: ['000008', 'food']  Real: ['000094', 'food']
BB-norm-11989773.a1
Prediction: ['000008', 'food']  Real: ['000094', 'food']
BB-norm-1356998.a1
Prediction: ['002488', 'human']  Real: ['003269', 'human']
BB-norm-1356998.a1
Prediction: ['002488', 'human']  Real: ['003269', 'human']
BB-norm-16273411.a1
Prediction: ['002488', 'human']  Real: ['003493', 'human']
BB-norm-16273411.a1
Prediction: ['000193', 'animal']  Real: ['001103', 'animal']
BB-norm-16458564.a1
Prediction: ['003188', 'child']  Real: ['003269', 'child']
BB-norm-16458564.a1
Prediction: ['003188', 'child']  Real: ['003269', 'child']
BB-norm-16458564.a1
Prediction: ['003188', 'child']  Real: ['003269', 'child']
BB-norm-18094887.a1
Prediction: ['001863', 'animal

In [21]:
recall = true_pred_count/total_count
recall

0.3950819672131147

In [22]:
precision = true_pred_count/(true_pred_count + false_pred_count)
precision

0.8546099290780141

In [None]:
len(treebank.parsed_sents())

In [None]:
tb_prod = set([production for parsed_sent in treebank.parsed_sents() for production in parsed_sent.productions()])

In [None]:
tb_prod

In [None]:
bb = dependency_treebank.parsed_sents()[1].to_conll(3)
print(bb)

In [None]:
dg = DependencyGraph(bb)
dg.tree().pprint()

In [None]:
from nltk.parse import CoreNLPParser

In [None]:
parser = CoreNLPParser(url='http://localhost:9000')

In [None]:
list(parser.parse('What is the airspeed of an unladen swallow ?'.split()))

In [None]:
from nltk.parse.corenlp import CoreNLPDependencyParser

In [None]:
dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')

In [None]:
parses = dep_parser.parse(sentence.split())

In [None]:
aaaa = [[(governor, dep, dependent) for governor, dep, dependent in parse.triples()] for parse in parses]

In [None]:
aaaa[0]

In [None]:
sentence = "birds supplemented with the n-hexane extract"