In [1]:
from datasets import load_dataset
import pandas as pd
import huggingface_hub
import matplotlib.pyplot as plt
import numpy as np
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from spacy.scorer import Scorer
import json

##### Helper functions

In [2]:
def get_ground_truth(file_name):
    ''' Read the ground truth from the json file and return the entities
        Input: file_name: the name of the json file'''
    
    file = open(file_name, "r")
    annotations = json.load(file)
    return annotations['annotations'][0][1]

In [24]:
def compute_ner_metrics(predicted_entities, ground_truth_entities):
    ''' Compute the precision, recall, and f1-score for the predicted entities
        Input: predicted_entities: the list of predicted entities, tuples (word, entity)
               ground_truth_entities: the list of ground truth entities, tuples (word, entity)'''
    
    # Initialize counters for TP, FP, and FN
    tp, fp, fn = 0, 0, 0

    # Create sets of predicted entities and ground truth entities
    predicted_entities_set = set(predicted_entities)
    ground_truth_entities_set = set(ground_truth_entities)

    # Calculate True Positives (TP)
    tp = len(predicted_entities_set.intersection(ground_truth_entities_set))

    # Calculate False Positives (FP)
    fp = len(predicted_entities_set - ground_truth_entities_set)

    # Calculate False Negatives (FN)
    fn = len(ground_truth_entities_set - predicted_entities_set)

    # Calculate Precision, Recall, and F1-Score
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return precision, recall, f1_score

In [19]:
def get_ner_ent(index, dataset_p, nlp, med7):
    ''' Get the entities from the text and display them
        Input: index: the index of the text in the dataset
               dataset_p: the dataset
               nlp: the nlp model
               med7: the med7 model '''
    
    print('ID: ', index, '\n')
    # List for the colours of the entities to be displayed
    col_dict = {}
    colours = ['#e6194B', '#3cb44b', '#ffe119', '#ffd8b1', '#f58231', '#f032e6', '#42d4f4', '#fabebe', '#469990', 
                    '#e6beff', '#9A6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#a9a9a9',
                    '#ffffff', '#000000', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c']

    # Get the entities list from the models
    nlp_ner = list(nlp.pipe_labels['ner'])
    med7_ner = list(med7.pipe_labels['ner'])
    combined_ner = nlp_ner + med7_ner

    # Create a dictionary with the entities and the colours
    l_combined_ner = len(combined_ner)
    for label, colour in zip(combined_ner, colours[:l_combined_ner]):
        col_dict[label] = colour

    # Create the options for the display
    options = {'entities': combined_ner, 'colors':col_dict}

    # Get the text entities from the dataset
    text = dataset_p.loc[index, 'tokens']
    doc_web_sm = nlp(text)

    doc_med7 = med7(text)
    docs = [doc_web_sm, doc_med7]

    # Display the entities
    spacy.displacy.render(docs, style='ent', jupyter=True, options=options)

    #  Get the entities from the text
    combined_entities = []
    for doc in docs:
        combined_entities.extend([(ent.text, ent.label_) for ent in doc.ents])

    return combined_entities

In [11]:
def process_text(text):
    ''' Process the text: lowercasing, lemmatization, stopwords removal, 
        and punctuation removal
        Input: text: the text to be processed'''
    text = text.lower()
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.replace('\r', ' ')
    text = text.replace('  ', ' ')

    # Word tokenization 
    tokens = word_tokenize(text)

    # Normalization (lowercasing and lemmatization)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Token filtering (stopwords removal)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove punctuation tokens
    tokens = [token for token in tokens if token.isalpha()]

    # Join the tokens back into a string
    text = ' '.join(tokens)

    return text

### Task 2

##### Task 1 recap

First we are going to set the environment that we created in Task 1 for the preprocessing of the text using the pipeline provided in our Task 1 notebook. 

In [12]:
# Load the dataset
dataset = load_dataset("argilla/medical-domain")

# preprocess the data
dataset_p = pd.DataFrame(columns=['tokens'])
for i in range(len(dataset['train'])):
    text = dataset['train'][i]['text']
    text_p = process_text(text)
    dataset_p.loc[i] = [text_p]

In [14]:
dataset_p.head()

Unnamed: 0,tokens
0,preoperative diagnosis iron deficiency postope...
1,clinical indication normal stress procedure pe...
2,finding scan performed reformatted image obtai...
3,preoperative diagnosis blood loss postoperativ...
4,reason visit elevated psa nocturia occasional ...


The standard NER types can be obtained from the "en_core_web_sm" model. And are the following:

CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART.

In [13]:
# Load nlp english model
nlp = spacy.load("en_core_web_sm")
list(nlp.pipe_labels['ner'])



['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

Our documents are medical related, so we should look for NER classes that can explain some medical terms as medication or treatments to extend the standard NER types.
By manual inspection we identified following potential new NER types:
- Healthcare profession 
- Administrative event (e.g. hospitalization)
- Care environment
- Demographic
- Anatomy
- Diseases
- Symptom
- Examination
- Measurement unit
- Measurement value
- Treatment
- Medication
- Medication Dosage
- Medication form (e.g. tablet)
- Medication route

To extend our NER classification we can't use all of them, because we are restricted to labels which our classifiers are trained on. In order to extend our classes we are going to use the med7 NER model (https://github.com/kormilitzin/med7) composed from the following NER labels:
DOSAGE, DRUG, DURATION, FORM, FREQUENCY, ROUTE, STRENGTH

In [15]:
med7 = spacy.load("en_core_med7_lg")
list(med7.pipe_labels['ner'])

['DOSAGE', 'DRUG', 'DURATION', 'FORM', 'FREQUENCY', 'ROUTE', 'STRENGTH']

##### Apply NER types and evaluate.

To apply the NER classifier to our data we have implemented the get_ner_ent function. Will be applied after together with the evaluation of our samples.

To evaluate the NER classifier we first need to create our GT data. For that we are going to manually annonate 4 texts with the standard NER types and the extended classes. 

The GT annotations are been done exporting the NER types from the four text using NER Annotator (https://tecoholic.github.io/ner-annotator/). 

In [16]:
'''Select 4 random samples: these random samples id are then stored 
in the idx array for further replication.'''
# l = len(dataset_p)
# select 10 random samples
# idx = np.random.randint(0, l, 9)
idx = np.array([2906, 4146, 4671, 1074])

In [17]:
# test data txt: ground truth annotations
for i in idx:
    print('idx: ', i, ' --> ', dataset_p.loc[i]['tokens'])

idx:  2906  -->  preoperative diagnosis tympanic membrane atelectasis chronic eustachian tube postoperative diagnosis tympanic membrane atelectasis chronic eustachian tube operative procedure bilateral myringotomy insertion santa barbara anesthesia general finding patient white female chronic eustachian tube dysfunction tm atelectasis wa taken operating room tube time surgery ha extruding right santa barbara severe left tm atelectasis retraction wa scant amount fluid middle ear description procedure patient wa taken operating room placed supine position general mask anesthesia wa established right ear wa draped normal sterile fashion cerumen wa removed external canal extruding santa barbara wa identified atraumatically removed fresh santa barbara wa atraumatically inserted ciloxan drop attention wa directed left side severe tm atelectasis wa identified mask anesthetic eardrum elevated radial incision wa made inferior aspect tympanic membrane middle ear fluid aspirated santa barbara wa 

In [25]:
# retrieve all test data metrics
precission_all = []
recall_all = []
f1_score_all = []
for i in idx:
    # ner entities: spacy model
    ents_ne = get_ner_ent(i, dataset_p, nlp, med7)

    # gt annotations
    file_name = 'gt_annotations/{:d}_annotations.json'.format(i)
    ents_gt = get_ground_truth(file_name)

    list_gt = []
    for ent in ents_gt['entities']:
        list_gt.append((dataset_p.loc[i, 'tokens'][ent[0]:ent[1]], ent[2]))
    
    print('NER Entities: ', ents_ne)
    print('Ground Truth: ', list_gt)
    precision, recall, f1_score = compute_ner_metrics(list_gt, ents_ne)

    print()

    precission_all.append(precision)
    recall_all.append(recall)
    f1_score_all.append(f1_score)

print("Precision:", np.round(np.mean(precission_all),2), '(+/-)', np.round(np.std(precission_all),2))
print("Recall:", np.round(np.mean(recall_all),2), '(+/-)', np.round(np.std(recall_all),2))
print("F1-Score:", np.round(np.mean(f1_score_all),2), '(+/-)', np.round(np.std(f1_score_all),2))

ID:  2906 



NER Entities:  [('tympanic', 'CARDINAL'), ('santa barbara anesthesia', 'GPE'), ('santa barbara', 'GPE'), ('santa barbara', 'GPE'), ('santa barbara', 'GPE'), ('tympanic', 'CARDINAL'), ('santa barbara', 'GPE'), ('anesthesia', 'GPE'), ('inserted', 'FORM'), ('ciloxan', 'DRUG'), ('ciloxan', 'DRUG'), ('anesthesia', 'DRUG')]
Ground Truth:  [('santa barbara', 'GPE'), ('time', 'TIME'), ('ciloxan', 'DRUG'), ('santa barbara', 'GPE'), ('ciloxan', 'DRUG')]

ID:  4146 



NER Entities:  [('mm', 'PERSON'), ('four', 'CARDINAL'), ('mm', 'PERSON'), ('tritome', 'DRUG')]
Ground Truth:  [('ct', 'PRODUCT'), ('medication', 'DRUG'), ('medication', 'DRUG'), ('ercp', 'PRODUCT'), ('pylorus', 'DRUG'), ('ampulla', 'DOSAGE'), ('ampulla', 'DOSAGE'), ('ampulla', 'DOSAGE'), ('ampulla', 'DOSAGE'), ('tritome', 'DRUG'), ('mm', 'QUANTITY'), ('ampulla', 'DOSAGE'), ('ampulla', 'DOSAGE'), ('ampulla', 'DOSAGE'), ('four', 'CARDINAL'), ('mm', 'QUANTITY')]

ID:  4671 



NER Entities:  [('anesthesia', 'GPE'), ('anesthesia', 'GPE'), ('two', 'CARDINAL'), ('nasopharynx', 'DATE'), ('nasopharynx', 'DATE'), ('choana', 'ORG'), ('linear', 'ORG'), ('endotracheal', 'ROUTE'), ('two', 'DOSAGE'), ('cortisporin', 'DRUG'), ('ear', 'ROUTE')]
Ground Truth:  [('oral', 'ROUTE'), ('nasal', 'ROUTE'), ('oral', 'ROUTE'), ('cortisporin', 'DRUG'), ('drop', 'DOSAGE')]

ID:  1074 



NER Entities:  [('week day', 'DATE'), ('week day', 'DATE'), ('single', 'DOSAGE'), ('ac consistent week day', 'FREQUENCY'), ('fl', 'DOSAGE')]
Ground Truth:  [('bpm', 'QUANTITY'), ('day', 'DURATION'), ('week day', 'DATE'), ('week day', 'DATE')]

Precision: 0.37 (+/-) 0.17
Recall: 0.33 (+/-) 0.21
F1-Score: 0.3 (+/-) 0.09


For evaluation we focus on two approaches:
- Manual evaluation: comparation of NER entities list and GT entities list.
- Automatic evaluation: use Precission, recall and F1-Score.

For what can see here the automatic metrics give us very low performance. And double checking with manual evaluation we can see that automatic NER classification would need more fine-tunning of the model by retraining it with large annotated corpus. Althouh the NER types we obtain for the first text is quite similar to the one we thought about. There are some entities that our model finds as ('mm', 'PERSON') that is clearly not correct. Or ('single', 'DOSAGE') in the last text where looking to the context we can say it is more a frequency as it is not given a medicament. We can also state that it generally works correclty for drugs recognition. And it would be interesting to have a model that can detect body parts or illnesses. 

Another limitations that we should have into account to understand the results is that the GT data could be not as accurate as it should. We faced some complications with what exaclty entities refer to exactly and we found some entities are worth to annotate that the model fails to find.

##### Describe how NER type information could help in other NLP use cases.

NER can improve the understanding of texts and enable more advanced processing of text. The following are some examples where NER type information can benefit NLP applications:

1. **Information Extraction**: NER can identify and classify entities, such as people, organizations, locations, dates, and more. This information is crucial for extracting structured data from unstructured text (legal documents, forms, news).

2. **Sentiment Analysis**: identifying that a positive sentiment is expressed towards a particular brand or product in customer reviews can be valuable for businesses.

3. **Question Answering**: NER can assist in locating entities within a text that are relevant to user queries.

4. **Machine Translation**: NER can help identify and preserve the entities during the translation process, ensuring that names, places, and other important entities remain consistent across languages.
 NER can help identify sensitive information, such as personal names, addresses, or credit card numbers, and facilitate data protection measures.

In summary, NER type information serves as a foundational element in many NLP applications, enhancing their ability to understand, extract, and act on information in text, making them more powerful and context-aware.