## Imports

In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, BertConfig, TrainingArguments, Trainer
from transformers import pipeline
import datasets
import fitz
import re
import spacy

## Extracting text from pdf 

In [61]:
def extract_text_from_pdf(pdf_path):
    with fitz.open(pdf_path) as pdf_document:
        text = ""
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            page_text = page.get_text()
            text += page_text
    return text

pdf_path = 'medicalreport3.pdf' 
extracted_text = extract_text_from_pdf(pdf_path)
extracted_text="Jean Du Jardin, who got a consultation with John Doe"
print(extracted_text)

Jean Du Jardin, who got a consultation with John Doe


## Loading model for name(s) of patient(s) only / Classification of the tokens

In [62]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": extracted_text, "ents": entities}

colors = {
    "PATIENT" : "blue"
}

options = {"colors" : colors}

In [63]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
model = AutoModelForTokenClassification.from_pretrained("obi/deid_roberta_i2b2")

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
ner_results = nlp(extracted_text)

patients = [result['word'] for result in ner_results if result['entity_group'] == 'PATIENT']
patients = [patient.strip() for patient in patients]

spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [64]:
patients

['Jean', 'Jardin']

In [65]:
print(ner_results)

[{'entity_group': 'PATIENT', 'score': 0.44048473, 'word': ' Jean', 'start': 0, 'end': 4}, {'entity_group': 'HOSP', 'score': 0.49537244, 'word': ' Du', 'start': 5, 'end': 7}, {'entity_group': 'PATIENT', 'score': 0.5329712, 'word': ' Jardin', 'start': 8, 'end': 14}, {'entity_group': 'STAFF', 'score': 0.999835, 'word': ' John', 'start': 44, 'end': 48}, {'entity_group': 'STAFF', 'score': 0.999848, 'word': ' Doe', 'start': 49, 'end': 52}]


In [66]:
name_to_id = {}
index = []
for result in ner_results:
    if result['entity_group'] == 'PATIENT':
        if len(index)==0 and str(result['word']) not in str(name_to_id.values()):
            index.append(result['end'])
            current_person_id = 'ID{}'.format(len(name_to_id) + 1)
            name_to_id[current_person_id] = result['word']
        elif len(index)!=0:
            if result['start'] == int(index[-1])+1:
                index.append(result['end'])
                name_to_id[current_person_id] += result['word']
                index = []

for cle in name_to_id:
    name_to_id[cle] = name_to_id[cle].strip()

for person in patients:
    for cle in name_to_id:
        if person in str(name_to_id[cle]):
            extracted_text = extracted_text.replace(person,cle)
            
print(extracted_text)

ID1 Du Jardin, who got a consultation with John Doe


## Loading second model for all name(s) recognition

In [67]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity"]
        })
    return {"text": extracted_text, "ents": entities}

colors = {
    "B-PER" : "blue",
    "I-PER" : "blue"
}

options = {"colors" : colors}

In [68]:
config = BertConfig.from_pretrained("dslim/bert-base-NER")

config.hidden_dropout_prob = 0.2
config.attention_probs_dropout_prob = 0.2
config.num_hidden_layers = 12

model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER", config=config)
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, ignore_labels=[])
ner_results = nlp(extracted_text)
print(ner_results)
spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'O', 'score': 0.6165615, 'index': 1, 'word': 'ID', 'start': 0, 'end': 2}, {'entity': 'O', 'score': 0.9855903, 'index': 2, 'word': '##1', 'start': 2, 'end': 3}, {'entity': 'B-PER', 'score': 0.98333794, 'index': 3, 'word': 'Du', 'start': 4, 'end': 6}, {'entity': 'I-PER', 'score': 0.87076205, 'index': 4, 'word': 'J', 'start': 7, 'end': 8}, {'entity': 'I-PER', 'score': 0.83386534, 'index': 5, 'word': '##ard', 'start': 8, 'end': 11}, {'entity': 'I-ORG', 'score': 0.48710266, 'index': 6, 'word': '##in', 'start': 11, 'end': 13}, {'entity': 'O', 'score': 0.9999664, 'index': 7, 'word': ',', 'start': 13, 'end': 14}, {'entity': 'O', 'score': 0.9999758, 'index': 8, 'word': 'who', 'start': 15, 'end': 18}, {'entity': 'O', 'score': 0.9999747, 'index': 9, 'word': 'got', 'start': 19, 'end': 22}, {'entity': 'O', 'score': 0.9999769, 'index': 10, 'word': 'a', 'start': 23, 'end': 24}, {'entity': 'O', 'score': 0.9999739, 'index': 11, 'word': 'consultation', 'start': 25, 'end': 37}, {'entity': 'O'

## Name(s) being seen in the text

In [8]:
person_entities = [result['word'] for result in ner_results if result['entity'] in ['B-PER', 'I-PER']]

person_names = ' '.join(person_entities)

print(person_names)

Laurent Du ##pu ##is Emmanuel ##le More ##au Du ##pu ##is


## Name(s) / ID(s) association and Pseudonymized text 

In [9]:
name_to_id = {}
current_person_id = None
pseudonymized_text = []
noms = []

for result in ner_results:
    if result['entity'] == 'B-PER':
        if result['word'] in patients:
            person_name = result['word']
            if person_name not in noms:
                noms.append(person_name)
                current_person_id = 'ID{}'.format(len(name_to_id) + 1)
                name_to_id[current_person_id] = person_name
                pseudonymized_text.append(current_person_id)
            else:
                for cle, valeur in name_to_id.items():
                    if person_name in valeur:
                        pseudonymized_text.append(cle)
        else:
            for verif in ner_results:
                if int(verif['index'])==result['index']+1:
                    if "#" in verif['word']:
                        person = result['word']+verif['word'].replace('##','')
                        if person in noms:
                            for cle, valeur in name_to_id.items():
                                if person in valeur:
                                    pseudonymized_text.append(cle)
                    else:
                        pseudonymized_text.append(result['word'])
                    
    
    elif result['entity'] == 'I-PER':
        if result['word'] in patients:
            noms.append(result['word'])
            person_name += ' ' + result['word']  
            name_to_id[current_person_id] = person_name.replace('##', '')
            pseudonymized_text.append(current_person_id)
            
        if "#" in result['word']:
            for a in ner_results:
                if a['index'] == int(result['index'])-1:
                    b = a['word'] + result['word'].replace('##','')
                    if b in patients:
                        temp = []
                        for cle, valeur in name_to_id.items():
                            temp.append(valeur)
                        if b not in valeur:
                            noms.append(b)
                            person_name = person_name+" "+b
                            name_to_id[current_person_id] = person_name
                            pseudonymized_text.append(current_person_id)
                        else:
                            pseudonymized_text.append(current_person_id)

                    else:
                        pseudonymized_text.append(b)
    else:
        pseudonymized_text.append(result['word'])

pseudonymized_text = ' '.join(pseudonymized_text)
print(pseudonymized_text)

Mr . ID1 Dupu ##puis , born on October 10 , 1982 , presented himself at the Saint - Antoine Hospital on June 25 , 202 ##4 , complaining of persistent chest pain for three days . After a thorough clinical examination by Dr . ##le Moreau , as well as additional tests including a chest X - ray and an electro ##card ##io ##gram , it was determined that the patient , residing at 55 Rue du F ##au ##bourg Saint - Honor ##é , 750 ##0 ##8 Paris , and reach ##able at 01 22 33 44 55 , is suffering from stable an ##gin ##a . The recommended treatment includes taking ni ##tro ##gly ##cer ##in in case of acute pain , as well as starting a beta - block ##er treatment to prevent future episodes . Regular follow - up with a card ##iol ##ogist is strongly advised . Mr . Dupu ##puis has been informed of the lifestyle changes he needs to make , including quit ##ting smoking , adopting a balanced diet , and engaging in regular physical activity . The patient understood the recommendations and was referred 

In [10]:
cleaned_text = re.sub(r'\s*##\s*', '', pseudonymized_text)
print(cleaned_text)

Mr . ID1 Dupupuis , born on October 10 , 1982 , presented himself at the Saint - Antoine Hospital on June 25 , 2024 , complaining of persistent chest pain for three days . After a thorough clinical examination by Dr .le Moreau , as well as additional tests including a chest X - ray and an electrocardiogram , it was determined that the patient , residing at 55 Rue du Faubourg Saint - Honoré , 75008 Paris , and reachable at 01 22 33 44 55 , is suffering from stable angina . The recommended treatment includes taking nitroglycerin in case of acute pain , as well as starting a beta - blocker treatment to prevent future episodes . Regular follow - up with a cardiologist is strongly advised . Mr . Dupupuis has been informed of the lifestyle changes he needs to make , including quitting smoking , adopting a balanced diet , and engaging in regular physical activity . The patient understood the recommendations and was referred to the cardiology department for specialized follow - up care .


In [11]:
print(name_to_id)

{'ID1': 'Laurent'}


## Third model for problem(s) / treatment(s) recognition

In [257]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity"]
        })
    return {"text": cleaned_text, "ents": entities}

colors = {
    "B-problem" : "red",
    "I-problem" : "red",
    "B-treatment" : "green",
    "I-treatment" : "green"
}

options = {"colors" : colors}

In [258]:
config = BertConfig.from_pretrained("samrawal/bert-base-uncased_clinical-ner")

#config.hidden_dropout_prob = 0.2
#config.attention_probs_dropout_prob = 0.2
#config.num_hidden_layers = 12

tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner", config=config)

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
#example = "My name is John Do and I live in Berlin with Maria Martinez who is the sister of Mario"
ner_results = nlp(cleaned_text)

spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [259]:
problem = []
compteur = -1
for result in ner_results:
    if (result['entity'] == 'B-problem'):
        compteur+=1    
        problem.append(result['word'])
    if (result['entity'] == 'I-problem'):
        problem[compteur] = problem[compteur] + ' '+result['word']

In [260]:
problem

['persistent chest pain',
 'short ##ness of breath',
 'chest pain',
 'a pressure sensation',
 'this pain',
 'short ##ness of breath',
 'hyper',
 '##tension',
 'type 2 diabetes',
 'hyper ##lip ##ide ##mia',
 'asthma',
 'attacks',
 'sy ##sto ##lic murmurs',
 'bilateral w ##hee ##zing',
 'abnormalities',
 'my ##oca ##rdial in ##far ##ction',
 'diffuse pulmonary infiltrate ##s',
 'congestion',
 'elevated tr ##op ##oni ##n levels',
 'a fast ##ing',
 'acute corona ##ry syndrome',
 'a my ##oca ##rdial in ##far ##ction',
 'an asthma ex ##ace ##rba ##tion glucose',
 'hyper ##tension',
 'the my ##oca ##rdial in ##far ##ction',
 'chest pain',
 'cardiac load']

In [261]:
problems = [word.replace('#', '') for word in problem]
problems

['persistent chest pain',
 'short ness of breath',
 'chest pain',
 'a pressure sensation',
 'this pain',
 'short ness of breath',
 'hyper',
 'tension',
 'type 2 diabetes',
 'hyper lip ide mia',
 'asthma',
 'attacks',
 'sy sto lic murmurs',
 'bilateral w hee zing',
 'abnormalities',
 'my oca rdial in far ction',
 'diffuse pulmonary infiltrate s',
 'congestion',
 'elevated tr op oni n levels',
 'a fast ing',
 'acute corona ry syndrome',
 'a my oca rdial in far ction',
 'an asthma ex ace rba tion glucose',
 'hyper tension',
 'the my oca rdial in far ction',
 'chest pain',
 'cardiac load']

In [262]:
treatment = []
compteur = -1
for result in ner_results:
    if (result['entity'] == 'B-treatment'):
        compteur+=1    
        treatment.append(result['word'])
    if (result['entity'] == 'I-treatment'):
        treatment[compteur] = treatment[compteur] +result['word']

In [263]:
treatments = [word.replace('#', '') for word in treatment]
treatments

['li',
 'sinopril',
 'metformin',
 'atorvastatin',
 'ventolin(albuterol)',
 'subopti',
 'malcontrol',
 'management',
 'ivadministration',
 'heparin',
 'ni',
 'troglycerin',
 'metoprolol',
 'adjustment',
 'diabetestreatment',
 'insulin',
 'ventolininhaler']

## Fourth model for dates recognition

In [394]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": extracted_text, "ents": entities}

colors = {
    "DATE" : "blue"
}

options = {"colors" : colors}

In [433]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("tner/roberta-large-ontonotes5")
model = AutoModelForTokenClassification.from_pretrained("tner/roberta-large-ontonotes5")

extracted_text = "John Smith, born on 07/12/1980 in Pontault Combault 77340 with his phone number being 01 02 03 04 05, presented to our hospital named François Mitterrand located in 75015 for a consultation for persistent chest pain associated with shortness of breath. Here is a summary of his medical history and recent consultation performed by Dr. Doe: Mr. Smith reports chest pain for the past three days, described as a pressure sensation. This pain is accompanied by shortness of breath, particularly during moderate exertion. He has a history of hypertension diagnosed in 2015, currently on Lisinopril 10 mg once daily, type 2 diabetes diagnosed in 2018, on Metformin 500 mg twice daily, hyperlipidemia diagnosed in 2017, treated with Atorvastatin 20 mg once daily, and asthma since childhood, requiring the use of Ventolin (albuterol) during attacks. During the physical examination, his temperature was 36.8°C, heart rate 88 bpm, blood pressure 145/90 mmHg, and oxygen saturation 98%. Cardiac auscultation revealed systolic murmurs, and lung auscultation noted bilateral wheezing. Investigations showed an ECG with abnormalities suggestive of a myocardial infarction, a chest X-ray revealing diffuse pulmonary infiltrates indicating congestion, and blood tests indicating elevated troponin levels, a fasting blood glucose of 140 mg/dL. The diagnosis is acute coronary syndrome, likely a myocardial infarction, asthma exacerbation, and suboptimal control of blood glucose and hypertension. The treatment plan includes immediate hospitalization for the management of the myocardial infarction, intravenous administration of heparin, nitroglycerin to relieve chest pain, metoprolol to reduce cardiac workload, and an adjustment of diabetes treatment with insulin. An increased frequency of use of the Ventolin inhaler is also recommended. I recommend regular follow-up in cardiology and endocrinology after hospital discharge to ensure optimal management."

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple", ignore_labels=[])
ner_results = nlp(extracted_text)
spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [434]:
dates = []
date_dic = {}

year_pattern = re.compile(r'\b\d{4}\b')

for result in ner_results:
    if result['entity_group'] == 'DATE':
        dates.append(result['word'])
        match = year_pattern.search(result['word'])
        if match:
            date_dic[int(match.group())] = result['word']

extracted_text = extracted_text.replace(date_dic[min(date_dic)]," DATE DE NAISSANCE")
print(extracted_text)

John Smith, born on DATE DE NAISSANCE in Pontault Combault 77340 with his phone number being 01 02 03 04 05, presented to our hospital named François Mitterrand located in 75015 for a consultation for persistent chest pain associated with shortness of breath. Here is a summary of his medical history and recent consultation performed by Dr. Doe: Mr. Smith reports chest pain for the past three days, described as a pressure sensation. This pain is accompanied by shortness of breath, particularly during moderate exertion. He has a history of hypertension diagnosed in 2015, currently on Lisinopril 10 mg once daily, type 2 diabetes diagnosed in 2018, on Metformin 500 mg twice daily, hyperlipidemia diagnosed in 2017, treated with Atorvastatin 20 mg once daily, and asthma since childhood, requiring the use of Ventolin (albuterol) during attacks. During the physical examination, his temperature was 36.8°C, heart rate 88 bpm, blood pressure 145/90 mmHg, and oxygen saturation 98%. Cardiac auscult

## City recognition

In [319]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
ner_results = nlp(cleaned_text)

spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [270]:
destination = []
for result in ner_results:
    if result["entity_group"] == "LOC":
        destination.append(result["word"])
print(destination)

[]


## Phone number recognition

In [315]:
from gliner import GLiNER

model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")

labels = ["phone number"]
entities = model.predict_entities(cleaned_text, labels)

for entity in entities:
    phone_number = entity["text"]
                
cleaned_text = cleaned_text.replace(phone_number,"NUMERO_DE_TELEPHONE")
print(cleaned_text)

John Smith, born the 12/07/1980 at Pontault 
Combault 77340 with his phone number being 
NUMERO_DE_TELEPHONE, presented to our hospital called 
François Mitterrand for a consultation for 
persistent chest pain associated with shortness 
of breath. Here is a summary of his medical 
history and recent consultation done by doctor 
Doe:  
Mr. Smith reports chest pain for the past three 
days, described as a pressure sensation. This 
pain is accompanied by shortness of breath, 
particularly during moderate exertion. He has a 
history of hypertension diagnosed in 2015, 
currently on Lisinopril 10 mg once a day, Type 2 
diabetes diagnosed in 2018, on Metformin 500 
mg twice a day, hyperlipidemia diagnosed in 
2017, treated with Atorvastatin 20 mg once a 
day, and asthma since childhood, requiring the 
use of Ventolin (albuterol) during attacks.  
On physical examination, his temperature was 
36.8°C, heart rate 88 bpm, blood pressure 
145/90 mmHg, and oxygen saturation 98%. 
Cardiac auscultati

## Postal Code recognitionn

In [706]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": cleaned_text, "ents": entities}

colors = {
    "ID_NUM" : "blue"
}

options = {"colors" : colors}

In [709]:
pdf_path = 'medicalreport2.pdf' 
extracted_text = extract_text_from_pdf(pdf_path)
extracted_text = "Marc Dupont, born on 08/11/1975 in Marseille, 13001, and with the phone number 07 23 45 67 89, presented himself to our hospital named Charles de Gaulle, located in 75014, for a consultation regarding persistent chest pains associated with shortness of breath. Here is a summary of his medical history and his recent consultation performed by Dr. Martin: Mr. Dupont reports having chest pains for five days, described as a burning sensation. These pains are accompanied by shortness of breath, especially when climbing stairs. He has a history of hypertension diagnosed in 2012, currently treated with Enalapril 20 mg once a day, type 2 diabetes diagnosed in 2014, on Glucophage 850 mg twice a day, hyperlipidemia diagnosed in 2016, treated with Rosuvastatin 10 mg once a day, and asthma since childhood, requiring the use of Bricanyl during attacks. "

In [710]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("zmilczarek/pii-detection-roberta-v2")
model = AutoModelForTokenClassification.from_pretrained("zmilczarek/pii-detection-roberta-v2")

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
ner_results = nlp(extracted_text)
print(ner_results)
postal_code = []
for result in ner_results:
    if result["entity_group"] == "ID_NUM":
        postal_code.append(result["word"])

postal_code = [s.replace(" ", "") for s in postal_code]

compteur = 0
cleaned_postal_codes = []
code = []
for number in postal_code:
    compteur += len(number)
    code+=number
    if compteur==5:
        code = "".join(code)
        cleaned_postal_codes.append(code)
        compteur=0
        code=[]

for postal_code in cleaned_postal_codes:
    extracted_text = extracted_text.replace(postal_code,"CODE_POSTAL")
print(extracted_text)

[{'entity_group': 'NAME_STUDENT', 'score': 0.66099393, 'word': 'Marc Dupont', 'start': 0, 'end': 11}, {'entity_group': 'ID_NUM', 'score': 0.6839845, 'word': ' 23', 'start': 82, 'end': 84}, {'entity_group': 'ID_NUM', 'score': 0.7832932, 'word': ' 45', 'start': 85, 'end': 87}, {'entity_group': 'ID_NUM', 'score': 0.75750965, 'word': ' 67', 'start': 88, 'end': 90}, {'entity_group': 'ID_NUM', 'score': 0.63185424, 'word': ' 89', 'start': 91, 'end': 93}]
Marc Dupont, born on 08/11/1975 in Marseille, 13001, and with the phone number 07 23 45 67 89, presented himself to our hospital named Charles de Gaulle, located in 75014, for a consultation regarding persistent chest pains associated with shortness of breath. Here is a summary of his medical history and his recent consultation performed by Dr. Martin: Mr. Dupont reports having chest pains for five days, described as a burning sensation. These pains are accompanied by shortness of breath, especially when climbing stairs. He has a history of h

## Adress recognition

In [5]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": text, "ents": entities}

colors = {
    "DATE" : "blue"
}

options = {"colors" : colors}

In [11]:
text = "Mr. Pierre Martin, born on March 12, 1980, presented himself at the Hôpital Saint-Antoine on June 25, 2024, complaining of persistent chest pains for three days. After a thorough clinical examination by Dr. Sophie Bernard, as well as additional tests including a chest X-ray and electrocardiogram, it was determined that the patient, residing at 45 Avenue des Champs-Élysées, 75008 Paris, and reachable at 01 98 76 54 32, is suffering from stable angina."
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("lakshyakh93/deberta_finetuned_pii")
model = AutoModelForTokenClassification.from_pretrained("lakshyakh93/deberta_finetuned_pii")

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
ner_results = nlp(text)
spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)
print(ner_results)

[{'entity_group': 'PREFIX', 'score': 0.9214648, 'word': ' Mr.', 'start': 0, 'end': 3}, {'entity_group': 'FIRSTNAME', 'score': 0.7392576, 'word': ' Pierre', 'start': 3, 'end': 10}, {'entity_group': 'MIDDLENAME', 'score': 0.5092601, 'word': ' Martin', 'start': 10, 'end': 17}, {'entity_group': 'DATE', 'score': 0.9143021, 'word': ' March 12, 1980', 'start': 26, 'end': 41}, {'entity_group': 'CITY', 'score': 0.9780513, 'word': ' Saint-Antoine', 'start': 75, 'end': 89}, {'entity_group': 'DATE', 'score': 0.99826324, 'word': ' June 25, 2024', 'start': 92, 'end': 106}, {'entity_group': 'FIRSTNAME', 'score': 0.8639211, 'word': ' Sophie', 'start': 206, 'end': 213}, {'entity_group': 'MIDDLENAME', 'score': 0.53551745, 'word': ' Bernard', 'start': 213, 'end': 221}, {'entity_group': 'STREETADDRESS', 'score': 0.9531714, 'word': ' 45 Avenue des Champs-Élysées', 'start': 345, 'end': 374}, {'entity_group': 'BUILDINGNUMBER', 'score': 0.9741322, 'word': ' 7', 'start': 375, 'end': 377}, {'entity_group': 'BUI