## Imports

In [616]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, BertConfig, TrainingArguments, Trainer
from transformers import pipeline
import datasets
import fitz
import re
import spacy

## Extracting text from pdf 

In [696]:
def extract_text_from_pdf(pdf_path):
    with fitz.open(pdf_path) as pdf_document:
        text = ""
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            page_text = page.get_text()
            text += page_text
    return text

pdf_path = 'medicalreport3.pdf' 
extracted_text = extract_text_from_pdf(pdf_path)
extracted_text="Marc Dupont, born on 08/11/1975 in Marseille, 13001, and with the phone number 07 23 45 67 89, presented himself to our hospital named Charles de Gaulle, located in 75014, for a consultation regarding persistent chest pains associated with shortness of breath. Here is a summary of his medical history and recent consultation performed by Dr. Martin: Mr. Dupont reports having chest pains for five days, described as a burning sensation. These pains are accompanied by shortness of breath, especially when climbing stairs. He has a history of hypertension diagnosed in 2012, currently treated with Enalapril 20 mg once a day, type 2 diabetes diagnosed in 2014, on Glucophage 850 mg twice a day, hyperlipidemia diagnosed in 2016, treated with Rosuvastatin 10 mg once a day, and asthma since childhood, requiring the use of Bricanyl during attacks. "
print(extracted_text)

Marc Dupont, born on 08/11/1975 in Marseille, 13001, and with the phone number 07 23 45 67 89, presented himself to our hospital named Charles de Gaulle, located in 75014, for a consultation regarding persistent chest pains associated with shortness of breath. Here is a summary of his medical history and recent consultation performed by Dr. Martin: Mr. Dupont reports having chest pains for five days, described as a burning sensation. These pains are accompanied by shortness of breath, especially when climbing stairs. He has a history of hypertension diagnosed in 2012, currently treated with Enalapril 20 mg once a day, type 2 diabetes diagnosed in 2014, on Glucophage 850 mg twice a day, hyperlipidemia diagnosed in 2016, treated with Rosuvastatin 10 mg once a day, and asthma since childhood, requiring the use of Bricanyl during attacks. 


## Loading model for name(s) of patient(s) only / Classification of the tokens

In [697]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": extracted_text, "ents": entities}

colors = {
    "PATIENT" : "blue"
}

options = {"colors" : colors}

In [698]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
model = AutoModelForTokenClassification.from_pretrained("obi/deid_roberta_i2b2")

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
ner_results = nlp(extracted_text)

patients = [result['word'] for result in ner_results if result['entity_group'] == 'PATIENT']
patients = [patient.strip() for patient in patients]

spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [699]:
patients

['Marc', 'Dupont', 'Dupont']

## Loading second model for all name(s) recognition

In [700]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity"]
        })
    return {"text": extracted_text, "ents": entities}

colors = {
    "B-PER" : "blue",
    "I-PER" : "blue"
}

options = {"colors" : colors}

In [701]:
config = BertConfig.from_pretrained("dslim/bert-base-NER")

config.hidden_dropout_prob = 0.2
config.attention_probs_dropout_prob = 0.2
config.num_hidden_layers = 12

model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER", config=config)
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, ignore_labels=[])
ner_results = nlp(extracted_text)
print(ner_results)
spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

[{'entity': 'B-PER', 'score': 0.9997781, 'index': 1, 'word': 'Marc', 'start': 0, 'end': 4}, {'entity': 'I-PER', 'score': 0.9986689, 'index': 2, 'word': 'Du', 'start': 5, 'end': 7}, {'entity': 'I-PER', 'score': 0.99878055, 'index': 3, 'word': '##pont', 'start': 7, 'end': 11}, {'entity': 'O', 'score': 0.99997425, 'index': 4, 'word': ',', 'start': 11, 'end': 12}, {'entity': 'O', 'score': 0.9999305, 'index': 5, 'word': 'born', 'start': 13, 'end': 17}, {'entity': 'O', 'score': 0.9999709, 'index': 6, 'word': 'on', 'start': 18, 'end': 20}, {'entity': 'O', 'score': 0.9999591, 'index': 7, 'word': '08', 'start': 21, 'end': 23}, {'entity': 'O', 'score': 0.99992585, 'index': 8, 'word': '/', 'start': 23, 'end': 24}, {'entity': 'O', 'score': 0.9997733, 'index': 9, 'word': '11', 'start': 24, 'end': 26}, {'entity': 'O', 'score': 0.9999012, 'index': 10, 'word': '/', 'start': 26, 'end': 27}, {'entity': 'O', 'score': 0.99996173, 'index': 11, 'word': '1975', 'start': 27, 'end': 31}, {'entity': 'O', 'score

## Name(s) being seen in the text

In [702]:
person_entities = [result['word'] for result in ner_results if result['entity'] in ['B-PER', 'I-PER']]

person_names = ' '.join(person_entities)

print(person_names)

Marc Du ##pont Charles Martin Du ##pont


## Name(s) / ID(s) association and Pseudonymized text 

In [703]:
name_to_id = {}
current_person_id = None
pseudonymized_text = []
noms = []

for result in ner_results:
    if result['entity'] == 'B-PER':
        if result['word'] in patients:
            person_name = result['word']
            if person_name not in noms:
                noms.append(person_name)
                current_person_id = 'ID{}'.format(len(name_to_id) + 1)
                name_to_id[current_person_id] = person_name
                pseudonymized_text.append(current_person_id)
            else:
                for cle, valeur in name_to_id.items():
                    if person_name in valeur:
                        pseudonymized_text.append(cle)
        else:
            for verif in ner_results:
                if int(verif['index'])==result['index']+1:
                    if "#" in verif['word']:
                        person = result['word']+verif['word'].replace('##','')
                        if person in noms:
                            for cle, valeur in name_to_id.items():
                                if person in valeur:
                                    pseudonymized_text.append(cle)
                    else:
                        pseudonymized_text.append(result['word'])
                    
    
    elif result['entity'] == 'I-PER':
        if result['word'] in patients:
            noms.append(result['word'])
            person_name += ' ' + result['word']  
            name_to_id[current_person_id] = person_name.replace('##', '')
            pseudonymized_text.append(current_person_id)
            
        if "#" in result['word']:
            for a in ner_results:
                if a['index'] == int(result['index'])-1:
                    b = a['word'] + result['word'].replace('##','')
                    if b in patients:
                        temp = []
                        for cle, valeur in name_to_id.items():
                            temp.append(valeur)
                        if b not in valeur:
                            noms.append(b)
                            person_name = person_name+" "+b
                            name_to_id[current_person_id] = person_name
                            pseudonymized_text.append(current_person_id)
                        else:
                            pseudonymized_text.append(current_person_id)

                    else:
                        pseudonymized_text.append(b)
    else:
        pseudonymized_text.append(result['word'])

pseudonymized_text = ' '.join(pseudonymized_text)
print(pseudonymized_text)

ID1 ID1 , born on 08 / 11 / 1975 in Marseille , 1300 ##1 , and with the phone number 07 23 45 67 89 , presented himself to our hospital named Charles de G ##aul ##le , located in 750 ##14 , for a consultation regarding persistent chest pains associated with short ##ness of breath . Here is a summary of his medical history and recent consultation performed by Dr . Martin : Mr . ID1 ID1 reports having chest pains for five days , described as a burning sensation . These pains are accompanied by short ##ness of breath , especially when climbing stairs . He has a history of h ##yper ##tens ##ion diagnosed in 2012 , currently treated with En ##ala ##p ##ril 20 mg once a day , type 2 diabetes diagnosed in 2014 , on G ##lu ##co ##pha ##ge 850 mg twice a day , h ##yper ##lip ##ide ##mia diagnosed in 2016 , treated with R ##os ##u ##vas ##tat ##in 10 mg once a day , and as ##th ##ma since childhood , requiring the use of B ##rica ##ny ##l during attacks .


In [704]:
cleaned_text = re.sub(r'\s*##\s*', '', pseudonymized_text)
print(cleaned_text)

ID1 ID1 , born on 08 / 11 / 1975 in Marseille , 13001 , and with the phone number 07 23 45 67 89 , presented himself to our hospital named Charles de Gaulle , located in 75014 , for a consultation regarding persistent chest pains associated with shortness of breath . Here is a summary of his medical history and recent consultation performed by Dr . Martin : Mr . ID1 ID1 reports having chest pains for five days , described as a burning sensation . These pains are accompanied by shortness of breath , especially when climbing stairs . He has a history of hypertension diagnosed in 2012 , currently treated with Enalapril 20 mg once a day , type 2 diabetes diagnosed in 2014 , on Glucophage 850 mg twice a day , hyperlipidemia diagnosed in 2016 , treated with Rosuvastatin 10 mg once a day , and asthma since childhood , requiring the use of Bricanyl during attacks .


In [705]:
print(name_to_id)

{'ID1': 'Marc Dupont'}


## Third model for problem(s) / treatment(s) recognition

In [257]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity"]
        })
    return {"text": cleaned_text, "ents": entities}

colors = {
    "B-problem" : "red",
    "I-problem" : "red",
    "B-treatment" : "green",
    "I-treatment" : "green"
}

options = {"colors" : colors}

In [258]:
config = BertConfig.from_pretrained("samrawal/bert-base-uncased_clinical-ner")

#config.hidden_dropout_prob = 0.2
#config.attention_probs_dropout_prob = 0.2
#config.num_hidden_layers = 12

tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner", config=config)

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
#example = "My name is John Do and I live in Berlin with Maria Martinez who is the sister of Mario"
ner_results = nlp(cleaned_text)

spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [259]:
problem = []
compteur = -1
for result in ner_results:
    if (result['entity'] == 'B-problem'):
        compteur+=1    
        problem.append(result['word'])
    if (result['entity'] == 'I-problem'):
        problem[compteur] = problem[compteur] + ' '+result['word']

In [260]:
problem

['persistent chest pain',
 'short ##ness of breath',
 'chest pain',
 'a pressure sensation',
 'this pain',
 'short ##ness of breath',
 'hyper',
 '##tension',
 'type 2 diabetes',
 'hyper ##lip ##ide ##mia',
 'asthma',
 'attacks',
 'sy ##sto ##lic murmurs',
 'bilateral w ##hee ##zing',
 'abnormalities',
 'my ##oca ##rdial in ##far ##ction',
 'diffuse pulmonary infiltrate ##s',
 'congestion',
 'elevated tr ##op ##oni ##n levels',
 'a fast ##ing',
 'acute corona ##ry syndrome',
 'a my ##oca ##rdial in ##far ##ction',
 'an asthma ex ##ace ##rba ##tion glucose',
 'hyper ##tension',
 'the my ##oca ##rdial in ##far ##ction',
 'chest pain',
 'cardiac load']

In [261]:
problems = [word.replace('#', '') for word in problem]
problems

['persistent chest pain',
 'short ness of breath',
 'chest pain',
 'a pressure sensation',
 'this pain',
 'short ness of breath',
 'hyper',
 'tension',
 'type 2 diabetes',
 'hyper lip ide mia',
 'asthma',
 'attacks',
 'sy sto lic murmurs',
 'bilateral w hee zing',
 'abnormalities',
 'my oca rdial in far ction',
 'diffuse pulmonary infiltrate s',
 'congestion',
 'elevated tr op oni n levels',
 'a fast ing',
 'acute corona ry syndrome',
 'a my oca rdial in far ction',
 'an asthma ex ace rba tion glucose',
 'hyper tension',
 'the my oca rdial in far ction',
 'chest pain',
 'cardiac load']

In [262]:
treatment = []
compteur = -1
for result in ner_results:
    if (result['entity'] == 'B-treatment'):
        compteur+=1    
        treatment.append(result['word'])
    if (result['entity'] == 'I-treatment'):
        treatment[compteur] = treatment[compteur] +result['word']

In [263]:
treatments = [word.replace('#', '') for word in treatment]
treatments

['li',
 'sinopril',
 'metformin',
 'atorvastatin',
 'ventolin(albuterol)',
 'subopti',
 'malcontrol',
 'management',
 'ivadministration',
 'heparin',
 'ni',
 'troglycerin',
 'metoprolol',
 'adjustment',
 'diabetestreatment',
 'insulin',
 'ventolininhaler']

## Fourth model for dates recognition

In [394]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": extracted_text, "ents": entities}

colors = {
    "DATE" : "blue"
}

options = {"colors" : colors}

In [433]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("tner/roberta-large-ontonotes5")
model = AutoModelForTokenClassification.from_pretrained("tner/roberta-large-ontonotes5")

extracted_text = "John Smith, born on 07/12/1980 in Pontault Combault 77340 with his phone number being 01 02 03 04 05, presented to our hospital named François Mitterrand located in 75015 for a consultation for persistent chest pain associated with shortness of breath. Here is a summary of his medical history and recent consultation performed by Dr. Doe: Mr. Smith reports chest pain for the past three days, described as a pressure sensation. This pain is accompanied by shortness of breath, particularly during moderate exertion. He has a history of hypertension diagnosed in 2015, currently on Lisinopril 10 mg once daily, type 2 diabetes diagnosed in 2018, on Metformin 500 mg twice daily, hyperlipidemia diagnosed in 2017, treated with Atorvastatin 20 mg once daily, and asthma since childhood, requiring the use of Ventolin (albuterol) during attacks. During the physical examination, his temperature was 36.8°C, heart rate 88 bpm, blood pressure 145/90 mmHg, and oxygen saturation 98%. Cardiac auscultation revealed systolic murmurs, and lung auscultation noted bilateral wheezing. Investigations showed an ECG with abnormalities suggestive of a myocardial infarction, a chest X-ray revealing diffuse pulmonary infiltrates indicating congestion, and blood tests indicating elevated troponin levels, a fasting blood glucose of 140 mg/dL. The diagnosis is acute coronary syndrome, likely a myocardial infarction, asthma exacerbation, and suboptimal control of blood glucose and hypertension. The treatment plan includes immediate hospitalization for the management of the myocardial infarction, intravenous administration of heparin, nitroglycerin to relieve chest pain, metoprolol to reduce cardiac workload, and an adjustment of diabetes treatment with insulin. An increased frequency of use of the Ventolin inhaler is also recommended. I recommend regular follow-up in cardiology and endocrinology after hospital discharge to ensure optimal management."

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple", ignore_labels=[])
ner_results = nlp(extracted_text)
spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [434]:
dates = []
date_dic = {}

year_pattern = re.compile(r'\b\d{4}\b')

for result in ner_results:
    if result['entity_group'] == 'DATE':
        dates.append(result['word'])
        match = year_pattern.search(result['word'])
        if match:
            date_dic[int(match.group())] = result['word']

extracted_text = extracted_text.replace(date_dic[min(date_dic)]," DATE DE NAISSANCE")
print(extracted_text)

John Smith, born on DATE DE NAISSANCE in Pontault Combault 77340 with his phone number being 01 02 03 04 05, presented to our hospital named François Mitterrand located in 75015 for a consultation for persistent chest pain associated with shortness of breath. Here is a summary of his medical history and recent consultation performed by Dr. Doe: Mr. Smith reports chest pain for the past three days, described as a pressure sensation. This pain is accompanied by shortness of breath, particularly during moderate exertion. He has a history of hypertension diagnosed in 2015, currently on Lisinopril 10 mg once daily, type 2 diabetes diagnosed in 2018, on Metformin 500 mg twice daily, hyperlipidemia diagnosed in 2017, treated with Atorvastatin 20 mg once daily, and asthma since childhood, requiring the use of Ventolin (albuterol) during attacks. During the physical examination, his temperature was 36.8°C, heart rate 88 bpm, blood pressure 145/90 mmHg, and oxygen saturation 98%. Cardiac auscult

## City recognition

In [319]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
ner_results = nlp(cleaned_text)

spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [270]:
destination = []
for result in ner_results:
    if result["entity_group"] == "LOC":
        destination.append(result["word"])
print(destination)

[]


## Phone number recognition

In [315]:
from gliner import GLiNER

model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")

labels = ["phone number"]
entities = model.predict_entities(cleaned_text, labels)

for entity in entities:
    phone_number = entity["text"]
                
cleaned_text = cleaned_text.replace(phone_number,"NUMERO_DE_TELEPHONE")
print(cleaned_text)

John Smith, born the 12/07/1980 at Pontault 
Combault 77340 with his phone number being 
NUMERO_DE_TELEPHONE, presented to our hospital called 
François Mitterrand for a consultation for 
persistent chest pain associated with shortness 
of breath. Here is a summary of his medical 
history and recent consultation done by doctor 
Doe:  
Mr. Smith reports chest pain for the past three 
days, described as a pressure sensation. This 
pain is accompanied by shortness of breath, 
particularly during moderate exertion. He has a 
history of hypertension diagnosed in 2015, 
currently on Lisinopril 10 mg once a day, Type 2 
diabetes diagnosed in 2018, on Metformin 500 
mg twice a day, hyperlipidemia diagnosed in 
2017, treated with Atorvastatin 20 mg once a 
day, and asthma since childhood, requiring the 
use of Ventolin (albuterol) during attacks.  
On physical examination, his temperature was 
36.8°C, heart rate 88 bpm, blood pressure 
145/90 mmHg, and oxygen saturation 98%. 
Cardiac auscultati

## Postal Code recognitionn

In [332]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": cleaned_text, "ents": entities}

colors = {
    "ID_NUM" : "blue"
}

options = {"colors" : colors}

In [454]:
pdf_path = 'medicalreport2.pdf' 
extracted_text = extract_text_from_pdf(pdf_path)
extracted_text = "Jane Doe, born on 09/25/1985 in Toulouse, 31000, with her phone number being 06 12 34 56 78, presented herself to our hospital called Victor Hugo located in  13008 for a consultation regarding persistent chest pains associated with  shortness of breath. Here is a summary of her medical history and her recent  consultation performed by Dr. Dupont: Ms. Doe reports having chest pains for the last five days, described as a burning  sensation. This pain is accompanied by shortness of breath, especially during  intense exertion. She has a history of hypertension diagnosed in 2012, currently  on Ramipril 5 mg once a day, type 1 diabetes diagnosed in 2016, on Insulin 10 units three times a day, hypercholesterolemia diagnosed in 2019, treated with Rosuvastatin 10 mg once a day, and asthma since adolescence, requiring the use of Salbutamol during attacks.The diagnosis is acute coronary syndrome, likely angina, an exacerbation of asthma, and inadequate control of blood sugar and hypertension.The treatment plan includes immediate hospitalization for the management of angina, intravenous administration of diltiazem, morphine for chest pain relief, bisoprolol to reduce cardiac workload, and an adjustment of diabetes treatment with insulin. An increased frequency of Salbutamol inhaler use is also recommended.I recommend regular follow-ups in cardiology and endocrinology after discharge from the hospital to ensure optimal management.During the physical examination, her temperature was 37.2°C, heart rate 92 bpm, blood pressure 135/85 mmHg, and oxygen saturation 97%. Cardiac auscultation revealed abnormal sounds, and pulmonary auscultation noted bilateral crepitant rales. Investigations showed an ECG with abnormalities suggestive of myocardial ischemia, a chest X-ray revealing diffuse pulmonary infiltrates indicating pneumonia, and blood tests showing elevated CRP levels and a fasting blood sugar of 150 mg/dL."

In [455]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("zmilczarek/pii-detection-roberta-v2")
model = AutoModelForTokenClassification.from_pretrained("zmilczarek/pii-detection-roberta-v2")

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
ner_results = nlp(extracted_text)
print(ner_results)
postal_code = []
for result in ner_results:
    if result["entity_group"] == "ID_NUM":
        postal_code.append(result["word"])

postal_code = [s.replace(" ", "") for s in postal_code]

compteur = 0
cleaned_postal_codes = []
code = []
for number in postal_code:
    compteur += len(number)
    code+=number
    if compteur==5:
        code = "".join(code)
        cleaned_postal_codes.append(code)
        compteur=0
        code=[]

for postal_code in cleaned_postal_codes:
    extracted_text = extracted_text.replace(postal_code,"CODE_POSTAL")
print(extracted_text)

[{'entity_group': 'NAME_STUDENT', 'score': 0.7860918, 'word': 'Jane Doe', 'start': 0, 'end': 8}, {'entity_group': 'ID_NUM', 'score': 0.6035127, 'word': ' 34', 'start': 83, 'end': 85}]
Jane Doe, born on 09/25/1985 in Toulouse, 31000, with her phone number being 06 12 34 56 78, presented herself to our hospital called Victor Hugo located in  13008 for a consultation regarding persistent chest pains associated with  shortness of breath. Here is a summary of her medical history and her recent  consultation performed by Dr. Dupont: Ms. Doe reports having chest pains for the last five days, described as a burning  sensation. This pain is accompanied by shortness of breath, especially during  intense exertion. She has a history of hypertension diagnosed in 2012, currently  on Ramipril 5 mg once a day, type 1 diabetes diagnosed in 2016, on Insulin 10 units three times a day, hypercholesterolemia diagnosed in 2019, treated with Rosuvastatin 10 mg once a day, and asthma since adolescence, requir