# Recognition of Biomedical and Temporal entities

## Pre-processing stage

### Open the PDF using Fitz

In [None]:
import fitz

# Read the text of every page and concatenate it into a single string.
# The join runs inside the `with` block because the pages can only be read
# while the document is still open.
with fitz.open("17L0002_anon.pdf") as doc:
    text = "".join(page.get_text() for page in doc)

print(text)

### Tokenize the content of the file into sentences and words

In [None]:
import nltk

from nltk.tokenize import word_tokenize, sent_tokenize

# The tokenizers below need the Punkt models; without them NLTK fails with
# "Resource punkt not found", so download them first.
# NOTE(review): recent NLTK releases may ask for the 'punkt_tab' resource
# instead -- confirm against the installed version.
nltk.download('punkt')

# Split the document into sentences. The Spanish Punkt model is selected so
# that tokenization agrees with the Spanish stop-word list used by this
# notebook.
sentences = sent_tokenize(text, language='spanish')
sentences

# Split every sentence into word tokens: one list of tokens per sentence.
tokenized_sentence = [word_tokenize(sent, language='spanish') for sent in sentences]
tokenized_sentence

### Obtain the stop-words and punctuation marks, and implement a function to reformat dates written with dots (e.g. 12.05.2019 → 12/05/2019)

In [None]:
import re
from nltk.corpus import stopwords
from string import punctuation

# Fetch the NLTK stop-word corpus before reading from it.
nltk.download('stopwords')

# Tokens to discard during cleaning: the Spanish stop-words plus every
# character of string.punctuation (each character is added individually).
customStopWords = set(stopwords.words('spanish'))
customStopWords.update(punctuation)

# A date token: one or two digits, a literal dot, one or two digits, a literal
# dot and four digits (e.g. "12.05.2019"). The dots are escaped so that only
# real dots count as separators, and the pattern is compiled once rather than
# on every call.
_DOTTED_DATE_RE = re.compile(r"\d{1,2}\.\d{1,2}\.\d{4}")


def format_word(word):
    """Return *word* with a dot-separated date rewritten to use slashes.

    Args:
        word: A single token.

    Returns:
        The token with every "." replaced by "/" when the whole token is a
        date of the form ``d.m.yyyy`` / ``dd.mm.yyyy``; otherwise the token
        unchanged.
    """
    # fullmatch: the entire token must be the date, so decimal numbers such
    # as "123.4567" are left untouched.
    if _DOTTED_DATE_RE.fullmatch(word):
        return word.replace(".", "/")
    return word

### Clean the tokens: remove the stop-words and punctuation marks, and normalize the format of dates

In [None]:
# Re-tokenize every sentence, drop stop-words and punctuation marks, and pass
# each remaining token through format_word.
# Stop-word membership is tested on the lower-cased token because the NLTK
# stop-word list is lower-case; otherwise capitalized stop-words (e.g. at the
# start of a sentence) would slip through. Kept tokens retain their case.
tokenized_sentence = [
    [
        format_word(word)
        for word in nltk.word_tokenize(sent, language='spanish')
        if word.lower() not in customStopWords
    ]
    for sent in sentences
]
tokenized_sentence

## Annotate the temporal expressions

### Call the service to tag and normalize all temporal expressions present in the tokens

In [None]:
import requests
import json

# Base URL of the service that exposes the "/ner" endpoint.
coreNLPApi = "http://localhost:8080"

# Upper bound (seconds) on each request so an unresponsive service cannot
# hang the notebook forever.
# NOTE(review): 60 s is a guess -- tune it to the service's real latency.
NER_TIMEOUT_SECONDS = 60

# The headers are the same for every request, so build them once.
headers = {'Content-Type': 'application/json'}

for sentence in tokenized_sentence:
    singleSentence = " ".join(sentence)
    if not singleSentence:
        # Every token was removed during cleaning; there is nothing to tag.
        continue
    print("==> << "+singleSentence+" >>")
    query = {'text': singleSentence}
    response = requests.get(
        coreNLPApi + "/ner",
        params=query,
        headers=headers,
        timeout=NER_TIMEOUT_SECONDS,
    )
    # Surface HTTP errors directly instead of failing later while decoding
    # an error page as JSON.
    response.raise_for_status()
    data = response.json()
    print(json.dumps(data, indent=4, sort_keys=True, ensure_ascii=False))