## Imports

In [93]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, BertConfig, TrainingArguments, Trainer
from transformers import pipeline
import datasets
import fitz
import re
import spacy

## Extracting text from pdf 

In [94]:
def extract_text_from_pdf(pdf_path):
    with fitz.open(pdf_path) as pdf_document:
        text = ""
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            page_text = page.get_text()
            text += page_text
    return text

pdf_path = 'medicalreport9.pdf' 
extracted_text = extract_text_from_pdf(pdf_path)
print(extracted_text)

John Smith, born the 12/07/1980, presented to 
our hospital called François Mitterrand for a 
consultation for persistent chest pain associated 
with shortness of breath. Here is a summary of 
his medical history and recent consultation done 
by doctor Doe:  
Mr. Smith reports chest pain for the past three 
days, described as a pressure sensation. This 
pain is accompanied by shortness of breath, 
particularly during moderate exertion. He has a 
history of hypertension diagnosed in 2015, 
currently on Lisinopril 10 mg once a day, Type 2 
diabetes diagnosed in 2018, on Metformin 500 
mg twice a day, hyperlipidemia diagnosed in 
2017, treated with Atorvastatin 20 mg once a 
day, and asthma since childhood, requiring the 
use of Ventolin (albuterol) during attacks.  
On physical examination, his temperature was 
36.8°C, heart rate 88 bpm, blood pressure 
145/90 mmHg, and oxygen saturation 98%. 
Cardiac auscultation revealed systolic murmurs, 
and pulmonary auscultation noted bilateral 
wh

## Loading model for name(s) of patient(s) only / Classification of the tokens

In [95]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": extracted_text, "ents": entities}

colors = {
    "PATIENT" : "blue"
}

options = {"colors" : colors}

In [96]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
model = AutoModelForTokenClassification.from_pretrained("obi/deid_roberta_i2b2")

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
ner_results = nlp(extracted_text)

patients = [result['word'] for result in ner_results if result['entity_group'] == 'PATIENT']
patients = [patient.strip() for patient in patients]

spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [97]:
patients

['John', 'Smith', 'Smith']

## Loading second model for all name(s) recognition

In [98]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity"]
        })
    return {"text": extracted_text, "ents": entities}

colors = {
    "B-PER" : "blue",
    "I-PER" : "blue"
}

options = {"colors" : colors}

In [99]:
config = BertConfig.from_pretrained("dslim/bert-base-NER")

config.hidden_dropout_prob = 0.2
config.attention_probs_dropout_prob = 0.2
config.num_hidden_layers = 12

model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER", config=config)
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, ignore_labels=[])
#example = "My name is John Do and I live in Berlin with Maria Martinez who is the sister of Mario Hernandez"
ner_results = nlp(extracted_text)

spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Name(s) being seen in the text

In [100]:
person_entities = [result['word'] for result in ner_results if result['entity'] in ['B-PER', 'I-PER']]

person_names = ' '.join(person_entities)

print(person_names)

John Smith François Mi ##tter ##rand Do Smith


## Name(s) / ID(s) association and Pseudonymized text 

In [101]:
#CELLULE POUR GERER REPETITION DU MEME NOMS
name_to_id = {}
current_person_id = None
pseudonymized_text = []
noms = []

for result in ner_results:
    if result['entity'] == 'B-PER' and result['word'] in patients:
        person_name = result['word']
        if person_name not in noms:
            noms.append(person_name)
            current_person_id = 'ID{}'.format(len(name_to_id) + 1)
            name_to_id[current_person_id] = person_name
            pseudonymized_text.append(current_person_id)
        else:
            pseudonymized_text.append("he")
    elif result['entity'] == 'I-PER' and result['word'] in patients:
        noms.append(result['word'])
        person_name += ' ' + result['word']  
        name_to_id[current_person_id] = person_name.replace('##', '')
        pseudonymized_text.append(current_person_id)
    else:
        pseudonymized_text.append(result['word'])

pseudonymized_text = ' '.join(pseudonymized_text)
print(pseudonymized_text)

ID1 ID1 , born the 12 / 07 / 1980 , presented to our hospital called François Mi ##tter ##rand for a consultation for persistent chest pain associated with short ##ness of breath . Here is a summary of his medical history and recent consultation done by doctor Do ##e : Mr . he reports chest pain for the past three days , described as a pressure sensation . This pain is accompanied by short ##ness of breath , particularly during moderate ex ##ert ##ion . He has a history of h ##yper ##tens ##ion diagnosed in 2015 , currently on Li ##sin ##op ##ril 10 mg once a day , Type 2 diabetes diagnosed in 2018 , on Met ##form ##in 500 mg twice a day , h ##yper ##lip ##ide ##mia diagnosed in 2017 , treated with At ##or ##vas ##tat ##in 20 mg once a day , and as ##th ##ma since childhood , requiring the use of V ##ent ##olin ( al ##but ##ero ##l ) during attacks . On physical examination , his temperature was 36 . 8 ##° ##C , heart rate 88 b ##pm , blood pressure 145 / 90 mm ##H ##g , and oxygen sat

In [102]:
cleaned_text = re.sub(r'\s*##\s*', '', pseudonymized_text)
print(cleaned_text)

ID1 ID1 , born the 12 / 07 / 1980 , presented to our hospital called François Mitterrand for a consultation for persistent chest pain associated with shortness of breath . Here is a summary of his medical history and recent consultation done by doctor Doe : Mr . he reports chest pain for the past three days , described as a pressure sensation . This pain is accompanied by shortness of breath , particularly during moderate exertion . He has a history of hypertension diagnosed in 2015 , currently on Lisinopril 10 mg once a day , Type 2 diabetes diagnosed in 2018 , on Metformin 500 mg twice a day , hyperlipidemia diagnosed in 2017 , treated with Atorvastatin 20 mg once a day , and asthma since childhood , requiring the use of Ventolin ( albuterol ) during attacks . On physical examination , his temperature was 36 . 8°C , heart rate 88 bpm , blood pressure 145 / 90 mmHg , and oxygen saturation 98 % . Cardiac auscultation revealed systolic murmurs , and pulmonary auscultation noted bilatera

In [103]:
print(name_to_id)

{'ID1': 'John Smith'}


## Third model for problem(s) / treatment(s) recognition

In [104]:
config = BertConfig.from_pretrained("samrawal/bert-base-uncased_clinical-ner")

#config.hidden_dropout_prob = 0.2
#config.attention_probs_dropout_prob = 0.2
#config.num_hidden_layers = 12

tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner", config=config)

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
#example = "My name is John Do and I live in Berlin with Maria Martinez who is the sister of Mario"
ner_results = nlp(cleaned_text)
print(ner_results)




[{'entity': 'B-problem', 'score': 0.9972057, 'index': 27, 'word': 'persistent', 'start': 112, 'end': 122}, {'entity': 'I-problem', 'score': 0.99801886, 'index': 28, 'word': 'chest', 'start': 123, 'end': 128}, {'entity': 'I-problem', 'score': 0.99866617, 'index': 29, 'word': 'pain', 'start': 129, 'end': 133}, {'entity': 'B-problem', 'score': 0.99838734, 'index': 32, 'word': 'short', 'start': 150, 'end': 155}, {'entity': 'I-problem', 'score': 0.9971258, 'index': 33, 'word': '##ness', 'start': 155, 'end': 159}, {'entity': 'I-problem', 'score': 0.9991912, 'index': 34, 'word': 'of', 'start': 160, 'end': 162}, {'entity': 'I-problem', 'score': 0.9990127, 'index': 35, 'word': 'breath', 'start': 163, 'end': 169}, {'entity': 'B-problem', 'score': 0.9967152, 'index': 57, 'word': 'chest', 'start': 274, 'end': 279}, {'entity': 'I-problem', 'score': 0.9978436, 'index': 58, 'word': 'pain', 'start': 280, 'end': 284}, {'entity': 'B-problem', 'score': 0.99423826, 'index': 67, 'word': 'a', 'start': 324, 

In [105]:
problem = []
compteur = -1
for result in ner_results:
    if (result['entity'] == 'B-problem'):
        compteur+=1    
        problem.append(result['word'])
    if (result['entity'] == 'I-problem'):
        problem[compteur] = problem[compteur] + ' '+result['word']

In [106]:
problem

['persistent chest pain',
 'short ##ness of breath',
 'chest pain',
 'a pressure sensation',
 'this pain',
 'short ##ness of breath',
 'hyper ##tension',
 'type 2 diabetes',
 'hyper ##lip ##ide ##mia',
 'asthma',
 'attacks',
 'sy ##sto',
 '##lic murmurs',
 'bilateral w ##hee ##zing ec ##g',
 'abnormalities',
 'my ##oca ##rdial in ##far ##ction',
 'diffuse pulmonary infiltrate ##s',
 'congestion',
 'elevated tr ##op ##oni ##n levels',
 'acute corona ##ry syndrome',
 'a my ##oca ##rdial in ##far ##ction',
 'an asthma ex ##ace ##rba ##tion glucose',
 'hyper ##tension',
 'the my ##oca ##rdial in ##far ##ction',
 'chest pain',
 'cardiac load']

In [107]:
problems = [word.replace('#', '') for word in problem]
problems

['persistent chest pain',
 'short ness of breath',
 'chest pain',
 'a pressure sensation',
 'this pain',
 'short ness of breath',
 'hyper tension',
 'type 2 diabetes',
 'hyper lip ide mia',
 'asthma',
 'attacks',
 'sy sto',
 'lic murmurs',
 'bilateral w hee zing ec g',
 'abnormalities',
 'my oca rdial in far ction',
 'diffuse pulmonary infiltrate s',
 'congestion',
 'elevated tr op oni n levels',
 'acute corona ry syndrome',
 'a my oca rdial in far ction',
 'an asthma ex ace rba tion glucose',
 'hyper tension',
 'the my oca rdial in far ction',
 'chest pain',
 'cardiac load']

In [108]:
treatment = []
compteur = -1
for result in ner_results:
    if (result['entity'] == 'B-treatment'):
        compteur+=1    
        treatment.append(result['word'])
    if (result['entity'] == 'I-treatment'):
        treatment[compteur] = treatment[compteur] +result['word']

In [109]:
treatments = [word.replace('#', '') for word in treatment]
treatments

['li',
 'sinopril',
 'metformin',
 'at',
 'orvastatin',
 'ventolin(albuterol)',
 'subopti',
 'malcontroloftreatment',
 'management',
 'ivadministration',
 'heparin',
 'ni',
 'troglycerin',
 'metoprolol',
 'adjustment',
 'diabetestreatment',
 'insulin',
 'ventolininhaler']

## Fourth model for dates recognition

In [113]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": cleaned_text, "ents": entities}

colors = {
    "DATE" : "blue"
}

options = {"colors" : colors}

In [114]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner-with-dates")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner-with-dates")



from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple", ignore_labels=[])
ner_results = nlp(cleaned_text)


spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [115]:
dates = []
extracted_text = []
date_dic = {}

year_pattern = re.compile(r'\b\d{4}\b')

for result in ner_results:
    if result['entity_group'] == 'DATE':
        dates.append(result['word'])
        match = year_pattern.search(result['word'])
        if match:
            date_dic[int(match.group())] = result['word']

for result in ner_results:
    if result['word'] == date_dic[min(date_dic)]:
        extracted_text.append("DATE DE NAISSANCE")
    else:
        extracted_text.append(result['word'])
        
extracted_text = ' '.join(extracted_text)
print(extracted_text)

ID1 ID1 , born the DATE DE NAISSANCE , presented to our hospital called François Mitterrand for a consultation for persistent chest pain associated with shortness of breath . Here is a summary of his medical history and recent consultation done by doc tor Doe : Mr . he reports chest pain for the past three days , described as a pressure sensation . This pain is accompanied by shortness of breath , particularly during moderate exertion . He has a history of hypertension diagnosed in 2015 , currently on Lisinopril 10 mg once a day , Type 2 diabetes diagnosed in 2018 , on Metformin 500 mg twice a day , hyperlipidemia diagnosed in 2017 , treated with Atorvastatin 20 mg once a day , and asthma since childhood , requiring the use of Ventolin ( albuterol ) during attacks . On physical examination , his temperature was 36 . 8°C , heart rate 88 bpm , blood pressure 145 / 90 mmHg , and oxygen saturation 98 % . Cardiac auscultation revealed systolic murmurs , and pulmonary auscultation noted bila

## Comparing results with other models

In [129]:
config = BertConfig.from_pretrained("Dr-BERT/DrBERT-7GB")

config.hidden_dropout_prob = 0.1
config.attention_probs_dropout_prob = 0.2
config.num_hidden_layers = 25

model = AutoModelForTokenClassification.from_pretrained("Dr-BERT/DrBERT-7GB", config=config)
tokenizer = AutoTokenizer.from_pretrained("Dr-BERT/DrBERT-7GB")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, ignore_labels=[])
ner_results = nlp(extracted_text)
print(ner_results)

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

You are using a model of type camembert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at Dr-BERT/DrBERT-7GB and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weig

tokenizer_config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/791k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/353 [00:00<?, ?B/s]

[{'entity': 'LABEL_1', 'score': 0.51489985, 'index': 1, 'word': '▁John', 'start': 0, 'end': 4}, {'entity': 'LABEL_0', 'score': 0.51016515, 'index': 2, 'word': '▁Smith', 'start': 4, 'end': 10}, {'entity': 'LABEL_0', 'score': 0.54650736, 'index': 3, 'word': ',', 'start': 10, 'end': 11}, {'entity': 'LABEL_0', 'score': 0.5420798, 'index': 4, 'word': '▁aged', 'start': 11, 'end': 16}, {'entity': 'LABEL_1', 'score': 0.5408063, 'index': 5, 'word': '▁44,', 'start': 16, 'end': 20}, {'entity': 'LABEL_0', 'score': 0.5772229, 'index': 6, 'word': '▁presented', 'start': 20, 'end': 30}, {'entity': 'LABEL_1', 'score': 0.5394204, 'index': 7, 'word': '▁to', 'start': 30, 'end': 33}, {'entity': 'LABEL_0', 'score': 0.5419339, 'index': 8, 'word': '▁our', 'start': 33, 'end': 37}, {'entity': 'LABEL_0', 'score': 0.50280106, 'index': 9, 'word': '▁consultation', 'start': 37, 'end': 50}, {'entity': 'LABEL_0', 'score': 0.5121327, 'index': 10, 'word': '▁for', 'start': 50, 'end': 54}, {'entity': 'LABEL_0', 'score': 0

In [280]:
problem_model2 = []
compteur = -1
for result in ner_results:
    if (result['entity'] == 'B_problem'):
        compteur+=1    
        problem_model2.append(result['word'])
    if (result['entity'] == 'I_problem'):
        problem_model2[compteur] = problem_model2[compteur] + ' '+result['word']
print(problem_model2)

['chest pain', 'short ##ness of breath', 'the pain', 'a pressure - sensation', 'hyper ##tension', 'type 2 diabetes mel ##lit ##us', 'hyper ##lip ##ide ##mia', 'a my ##oca ##rdial in ##far ##ction', 'murmurs', 'acute changes', 'elevated', 'elevated cardiac markers', 'acute corona ##ry syndrome', 'chest pain']


In [281]:
problem_model2 = [word.replace('#', '') for word in problem_model2]
problems

['chest pain',
 'short',
 'ness of breath',
 'the pain',
 'a pressure - like sensation',
 'hyper tension',
 'type 2 diabetes mel lit us',
 'hyper lip ide mia',
 'a my oca rdial in far ction',
 'uncomfortable',
 'murmurs',
 'acute changes',
 'elevated',
 'elevated cardiac markers',
 'acute corona ry syndrome',
 'chest pain']

In [282]:
problems

['chest pain',
 'short',
 'ness of breath',
 'the pain',
 'a pressure - like sensation',
 'hyper tension',
 'type 2 diabetes mel lit us',
 'hyper lip ide mia',
 'a my oca rdial in far ction',
 'uncomfortable',
 'murmurs',
 'acute changes',
 'elevated',
 'elevated cardiac markers',
 'acute corona ry syndrome',
 'chest pain']