## Imports

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, BertConfig, TrainingArguments, Trainer
from transformers import pipeline
import datasets
import fitz
import re
import spacy
from datetime import datetime

## Extracting text from pdf 

In [137]:
def extract_text_from_pdf(pdf_path):
    with fitz.open(pdf_path) as pdf_document:
        text = ""
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            page_text = page.get_text()
            text += page_text
    return text

pdf_path = 'CR 18.pdf' 
extracted_text = extract_text_from_pdf(pdf_path)
extracted_text = """
--------------------------------------------------------------------- REPORT from the University Hospital of Champignac --------------------------------------------------------------------- Examination on 10/02/2020 (patient Le Pic Celine, 11 years old) Requesting service: HEMATO Abdomino-Pelvic MRI Performed on MR 450 W MRI installed on 06/16/2018 - 1.5 Tesla INDICATION: Monitoring of an abdominal infectious involvement in an 11-year-old child in the context of leukemia. TECHNIQUE: Axial T2, IP OP, diffusion, T1 sequences without and then after gadolinium injection at arterial, portal, and late times on the liver. Contrast agent Dotarem Vial 10 ml - Injected quantity 8 ml - Batch: 14gd001c (01-2017). RESULT: Exam compared to the CT scan of July 21, 2020: Identification of heterogeneous hepatic areas replacing previously described hepatic lesions on the CT scan. Identification of two nodular formations with a peripheral T1 hypersignal in segments VII and VI without pathological contrast uptake, compatible with sequelae. The lesion in segment VI shows a b1000 diffusion hypersignal but without ADC restriction on mapping. Patency of the hepatic veins, portal trunk, and its branches. Normal appearance of the spleen, adrenal glands, pancreas, and gallbladder. Patency of the vessels to visceral and digestive destinations explored. Appearance of renal cortical scars in the middle third of the right kidney and left kidney, and in the lower pole of the left kidney replacing previously described infectious lesions. No intra-peritoneal peri-hepatic or peri-splenic fluid effusion. CONCLUSION: Exam compared to the CT scan of July 21, 2020: Heterogeneity of the hepatic parenchyma sequelae replacing previously described hepatic lesions without active-looking lesions. Bilateral renal cortical scars. Dr. Franck I."""
#extracted_text = extracted_text.replace("patient","The patient is : ")
#extracted_text = extracted_text.replace("DR","The examination has been done by the doctor")

print(extracted_text)



--------------------------------------------------------------------- REPORT from the University Hospital of Champignac --------------------------------------------------------------------- Examination on 10/02/2020 (patient Le Pic Celine, 11 years old) Requesting service: HEMATO Abdomino-Pelvic MRI Performed on MR 450 W MRI installed on 06/16/2018 - 1.5 Tesla INDICATION: Monitoring of an abdominal infectious involvement in an 11-year-old child in the context of leukemia. TECHNIQUE: Axial T2, IP OP, diffusion, T1 sequences without and then after gadolinium injection at arterial, portal, and late times on the liver. Contrast agent Dotarem Vial 10 ml - Injected quantity 8 ml - Batch: 14gd001c (01-2017). RESULT: Exam compared to the CT scan of July 21, 2020: Identification of heterogeneous hepatic areas replacing previously described hepatic lesions on the CT scan. Identification of two nodular formations with a peripheral T1 hypersignal in segments VII and VI without pathological contra

In [138]:
def standardize_dates(text):
    date_patterns = [
        (r'\b(\d{1,2})[/\-](\d{1,2})[/\-](\d{2,4})\b', '%d/%m/%Y'),
        (r'\b(\d{2,4})[/\-](\d{1,2})[/\-](\d{1,2})\b', '%Y/%m/%d'),
        (r'\b(\d{1,2}) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) (\d{2,4})\b', '%d %b %Y')
    ]
    
    for pattern, date_format in date_patterns:
        matches = re.findall(pattern, text)
        for match in matches:
            try:
                date_str = ' '.join(match)
                standardized_date = datetime.strptime(date_str, date_format).strftime('%Y-%m-%d')
                text = text.replace(date_str, standardized_date)
            except ValueError:
                continue

    return text

In [139]:
def clean_special_characters(text):
    special_chars = {
        '\n': ' ',  
        '\t': ' ', 
    }

    for char, replacement in special_chars.items():
        text = text.replace(char, replacement)

    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [140]:
def segment_text(text):
    segments = text.split('\n\n')
    
    segments = [clean_special_characters(segment) for segment in segments]
    
    return segments

In [141]:
def preprocess_text(text):
    text = standardize_dates(text)
    text = clean_special_characters(text)
    segments = segment_text(text)
    return segments

In [142]:
processed_segments = preprocess_text(extracted_text)
processed_segments = " ".join(processed_segments)

print(processed_segments)

--------------------------------------------------------------------- REPORT from the University Hospital of Champignac --------------------------------------------------------------------- Examination on 10/02/2020 (patient Le Pic Celine, 11 years old) Requesting service: HEMATO Abdomino-Pelvic MRI Performed on MR 450 W MRI installed on 06/16/2018 - 1.5 Tesla INDICATION: Monitoring of an abdominal infectious involvement in an 11-year-old child in the context of leukemia. TECHNIQUE: Axial T2, IP OP, diffusion, T1 sequences without and then after gadolinium injection at arterial, portal, and late times on the liver. Contrast agent Dotarem Vial 10 ml - Injected quantity 8 ml - Batch: 14gd001c (01-2017). RESULT: Exam compared to the CT scan of July 21, 2020: Identification of heterogeneous hepatic areas replacing previously described hepatic lesions on the CT scan. Identification of two nodular formations with a peripheral T1 hypersignal in segments VII and VI without pathological contras

## Loading model for name(s) of patient(s) only / Classification of the tokens

In [143]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": processed_segments, "ents": entities}

colors = {
    "PATIENT" : "blue"
}

options = {"colors" : colors}

In [145]:
import nltk
import math

tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
model = AutoModelForTokenClassification.from_pretrained("obi/deid_roberta_i2b2")

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple",ignore_labels=[])

max_chunk_size = 80

chunks = []
current_chunk = ""

for word in processed_segments.split():
    if len(current_chunk) + len(word) + 1 <= max_chunk_size:
        if current_chunk:
            current_chunk += " " + word
        else:
            current_chunk = word
    else:
        chunks.append(current_chunk)
        current_chunk = word

if current_chunk:
    chunks.append(current_chunk)

results = []
for chunk in chunks:
    chunk_results = nlp(chunk)
    results.extend(chunk_results)

for entity in results:

    patients = [result['word'] for result in results if result['entity_group'] == 'PATIENT' and result['score']>0.8]
    patients = [patient.strip() for patient in patients]
    print(entity)
print(patients)

{'entity_group': 'O', 'score': 0.9990838, 'word': ' --------------------------------------------------------------------- REPORT', 'start': 0, 'end': 76}
{'entity_group': 'O', 'score': 0.98796415, 'word': ' from the', 'start': 0, 'end': 8}
{'entity_group': 'HOSP', 'score': 0.98960525, 'word': ' University Hospital of', 'start': 9, 'end': 31}
{'entity_group': 'HOSP', 'score': 0.8303835, 'word': ' Champignac', 'start': 32, 'end': 42}
{'entity_group': 'O', 'score': 0.98570436, 'word': ' ---------------------------------------------------------------------', 'start': 0, 'end': 69}
{'entity_group': 'O', 'score': 0.99999607, 'word': ' Examination on', 'start': 0, 'end': 14}
{'entity_group': 'DATE', 'score': 0.9289859, 'word': ' 10/02/', 'start': 15, 'end': 21}
{'entity_group': 'DATE', 'score': 0.85902995, 'word': '2020', 'start': 21, 'end': 25}
{'entity_group': 'O', 'score': 0.9999977, 'word': ' (patient', 'start': 26, 'end': 34}
{'entity_group': 'PATIENT', 'score': 0.99923825, 'word': ' Le 

In [146]:
patients

['Le Pic', 'Celine']

In [147]:
name_to_id = {}
index = []
for result in results:
    if result['entity_group'] == 'PATIENT':
        if len(index)==0 and str(result['word']) not in str(name_to_id.values()):
            index.append(result['end'])
            current_person_id = 'ID{}'.format(len(name_to_id) + 1)
            name_to_id[current_person_id] = result['word']
        elif len(index)!=0:
            if result['start'] == int(index[-1])+1:
                index.append(result['end'])
                name_to_id[current_person_id] += result['word']
                index = []

for cle in name_to_id:
    name_to_id[cle] = name_to_id[cle].strip()

for person in patients:
    for cle in name_to_id:
        if person in str(name_to_id[cle]):
            extracted_text = extracted_text.replace(person,cle)
            
print(extracted_text)


--------------------------------------------------------------------- REPORT from the University Hospital of Champignac --------------------------------------------------------------------- Examination on 10/02/2020 (patient ID1 ID1, 11 years old) Requesting service: HEMATO Abdomino-Pelvic MRI Performed on MR 450 W MRI installed on 06/16/2018 - 1.5 Tesla INDICATION: Monitoring of an abdominal infectious involvement in an 11-year-old child in the context of leukemia. TECHNIQUE: Axial T2, IP OP, diffusion, T1 sequences without and then after gadolinium injection at arterial, portal, and late times on the liver. Contrast agent Dotarem Vial 10 ml - Injected quantity 8 ml - Batch: 14gd001c (01-2017). RESULT: Exam compared to the CT scan of July 21, 2020: Identification of heterogeneous hepatic areas replacing previously described hepatic lesions on the CT scan. Identification of two nodular formations with a peripheral T1 hypersignal in segments VII and VI without pathological contrast upt

In [148]:
name_to_id

{'ID1': 'Le Pic Celine'}

## Loading second model for all name(s) recognition

In [70]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity"]
        })
    return {"text": extracted_text, "ents": entities}

colors = {
    "B-PER" : "blue",
    "I-PER" : "blue"
}

options = {"colors" : colors}

In [71]:
config = BertConfig.from_pretrained("dslim/bert-base-NER")

config.hidden_dropout_prob = 0.2
config.attention_probs_dropout_prob = 0.2
config.num_hidden_layers = 12

model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER", config=config)
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, ignore_labels=[])
ner_results = nlp(extracted_text)
print(ner_results)
spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity': 'O', 'score': 0.9998043, 'index': 1, 'word': 'R', 'start': 1, 'end': 2}, {'entity': 'O', 'score': 0.99991786, 'index': 2, 'word': '##EP', 'start': 2, 'end': 4}, {'entity': 'O', 'score': 0.9969331, 'index': 3, 'word': '##OR', 'start': 4, 'end': 6}, {'entity': 'O', 'score': 0.9996519, 'index': 4, 'word': '##T', 'start': 6, 'end': 7}, {'entity': 'O', 'score': 0.9999136, 'index': 5, 'word': 'of', 'start': 8, 'end': 10}, {'entity': 'O', 'score': 0.9999516, 'index': 6, 'word': 'the', 'start': 11, 'end': 14}, {'entity': 'B-MISC', 'score': 0.9836367, 'index': 7, 'word': 'MR', 'start': 15, 'end': 17}, {'entity': 'O', 'score': 0.9545875, 'index': 8, 'word': '##I', 'start': 17, 'end': 18}, {'entity': 'I-MISC', 'score': 0.6985221, 'index': 9, 'word': 'G', 'start': 19, 'end': 20}, {'entity': 'O', 'score': 0.9751133, 'index': 10, 'word': '##IE', 'start': 20, 'end': 22}, {'entity': 'O', 'score': 0.9996331, 'index': 11, 'word': 'of', 'start': 23, 'end': 25}, {'entity': 'B-LOC', 'score': 0.

## Name(s) being seen in the text

In [72]:
person_entities = [result['word'] for result in ner_results if result['entity'] in ['B-PER', 'I-PER']]

person_names = ' '.join(person_entities)

print(person_names)

Le Pi Ce


## Name(s) / ID(s) association and Pseudonymized text 

In [73]:
name_to_id = {}
current_person_id = None
pseudonymized_text = []
noms = []

for result in ner_results:
    if result['entity'] == 'B-PER':
        if result['word'] in patients:
            person_name = result['word']
            if person_name not in noms:
                noms.append(person_name)
                current_person_id = 'ID{}'.format(len(name_to_id) + 1)
                name_to_id[current_person_id] = person_name
                pseudonymized_text.append(current_person_id)
            else:
                for cle, valeur in name_to_id.items():
                    if person_name in valeur:
                        pseudonymized_text.append(cle)
        else:
            for verif in ner_results:
                if int(verif['index'])==result['index']+1:
                    if "#" in verif['word']:
                        person = result['word']+verif['word'].replace('##','')
                        if person in noms:
                            for cle, valeur in name_to_id.items():
                                if person in valeur:
                                    pseudonymized_text.append(cle)
                    else:
                        pseudonymized_text.append(result['word'])
                    
    
    elif result['entity'] == 'I-PER':
        if result['word'] in patients:
            noms.append(result['word'])
            person_name += ' ' + result['word']  
            name_to_id[current_person_id] = person_name.replace('##', '')
            pseudonymized_text.append(current_person_id)
            
        if "#" in result['word']:
            for a in ner_results:
                if a['index'] == int(result['index'])-1:
                    b = a['word'] + result['word'].replace('##','')
                    if b in patients:
                        temp = []
                        for cle, valeur in name_to_id.items():
                            temp.append(valeur)
                        if b not in valeur:
                            noms.append(b)
                            person_name = person_name+" "+b
                            name_to_id[current_person_id] = person_name
                            pseudonymized_text.append(current_person_id)
                        else:
                            pseudonymized_text.append(current_person_id)

                    else:
                        pseudonymized_text.append(b)
    else:
        pseudonymized_text.append(result['word'])

pseudonymized_text = ' '.join(pseudonymized_text)
print(pseudonymized_text)

R ##EP ##OR ##T of the MR ##I G ##IE of Mo ##ulin ##sar ##t MR ##I examination of 25 / 08 / 2017 ( The patient that has been examined is : Le ##c ##line , 8 years old ) I . R . M . E ##NC ##EP ##HA ##L ##I ##Q ##UE and CE ##R ##VI ##CA ##L Examination performed on IN ##TE ##RA NO ##VA 1 . 5 P ##H ##IL ##IP ##S installed on 20 / 11 / 2016 - 1 . 5 T ##ES ##LA IN ##DI ##CA ##TI ##ON : Extension assessment of a l ##ytic les ##ion of the skull base , suspected of his ##ti ##oc ##yt ##osis , o ##ste ##oar ##th ##rit ##is or secondary location . T ##EC ##H ##NI ##Q ##UE : Sa ##git ##tal T ##1 of the brain , a ##xial T ##2 and a ##xial FL ##A ##IR of the brain , co ##rona ##l T ##2 * of the brain , a ##xial T ##1 , T ##2 FA ##T SAT fine cuts of the skull base and o ##cci ##pit ##o - c ##er ##vic ##al junction , a ##xial T ##1 g ##ado FA ##T SAT and co ##rona ##l T ##1 g ##ado FA ##T SAT on the same region , sa ##git ##tal T ##1 g ##ado , sa ##git ##tal T ##2 of the c ##er ##vic ##al spine and 

In [74]:
cleaned_text = re.sub(r'\s*##\s*', '', pseudonymized_text)
print(cleaned_text)

REPORT of the MRI GIE of Moulinsart MRI examination of 25 / 08 / 2017 ( The patient that has been examined is : Lecline , 8 years old ) I . R . M . ENCEPHALIQUE and CERVICAL Examination performed on INTERA NOVA 1 . 5 PHILIPS installed on 20 / 11 / 2016 - 1 . 5 TESLA INDICATION : Extension assessment of a lytic lesion of the skull base , suspected of histiocytosis , osteoarthritis or secondary location . TECHNIQUE : Sagittal T1 of the brain , axial T2 and axial FLAIR of the brain , coronal T2 * of the brain , axial T1 , T2 FAT SAT fine cuts of the skull base and occipito - cervical junction , axial T1 gado FAT SAT and coronal T1 gado FAT SAT on the same region , sagittal T1 gado , sagittal T2 of the cervical spine and sagittal T2 of the left occipito - cervical junction . Contrast agent : Dotarem 5 ml vial - Quantity injected : 5 ml - Lot : 11gdo25a ( 03 / 2014 ) RESULT : Analysis of the skull base : demonstration of a tissue process corresponding to the lytic areas visualized on the bo

In [11]:
print(name_to_id)

{'ID1': 'Laurent'}


## Third model for problem(s) / treatment(s) recognition

In [257]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity"]
        })
    return {"text": cleaned_text, "ents": entities}

colors = {
    "B-problem" : "red",
    "I-problem" : "red",
    "B-treatment" : "green",
    "I-treatment" : "green"
}

options = {"colors" : colors}

In [258]:
config = BertConfig.from_pretrained("samrawal/bert-base-uncased_clinical-ner")

#config.hidden_dropout_prob = 0.2
#config.attention_probs_dropout_prob = 0.2
#config.num_hidden_layers = 12

tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner", config=config)

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
#example = "My name is John Do and I live in Berlin with Maria Martinez who is the sister of Mario"
ner_results = nlp(cleaned_text)

spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [259]:
problem = []
compteur = -1
for result in ner_results:
    if (result['entity'] == 'B-problem'):
        compteur+=1    
        problem.append(result['word'])
    if (result['entity'] == 'I-problem'):
        problem[compteur] = problem[compteur] + ' '+result['word']

In [260]:
problem

['persistent chest pain',
 'short ##ness of breath',
 'chest pain',
 'a pressure sensation',
 'this pain',
 'short ##ness of breath',
 'hyper',
 '##tension',
 'type 2 diabetes',
 'hyper ##lip ##ide ##mia',
 'asthma',
 'attacks',
 'sy ##sto ##lic murmurs',
 'bilateral w ##hee ##zing',
 'abnormalities',
 'my ##oca ##rdial in ##far ##ction',
 'diffuse pulmonary infiltrate ##s',
 'congestion',
 'elevated tr ##op ##oni ##n levels',
 'a fast ##ing',
 'acute corona ##ry syndrome',
 'a my ##oca ##rdial in ##far ##ction',
 'an asthma ex ##ace ##rba ##tion glucose',
 'hyper ##tension',
 'the my ##oca ##rdial in ##far ##ction',
 'chest pain',
 'cardiac load']

In [261]:
problems = [word.replace('#', '') for word in problem]
problems

['persistent chest pain',
 'short ness of breath',
 'chest pain',
 'a pressure sensation',
 'this pain',
 'short ness of breath',
 'hyper',
 'tension',
 'type 2 diabetes',
 'hyper lip ide mia',
 'asthma',
 'attacks',
 'sy sto lic murmurs',
 'bilateral w hee zing',
 'abnormalities',
 'my oca rdial in far ction',
 'diffuse pulmonary infiltrate s',
 'congestion',
 'elevated tr op oni n levels',
 'a fast ing',
 'acute corona ry syndrome',
 'a my oca rdial in far ction',
 'an asthma ex ace rba tion glucose',
 'hyper tension',
 'the my oca rdial in far ction',
 'chest pain',
 'cardiac load']

In [262]:
treatment = []
compteur = -1
for result in ner_results:
    if (result['entity'] == 'B-treatment'):
        compteur+=1    
        treatment.append(result['word'])
    if (result['entity'] == 'I-treatment'):
        treatment[compteur] = treatment[compteur] +result['word']

In [263]:
treatments = [word.replace('#', '') for word in treatment]
treatments

['li',
 'sinopril',
 'metformin',
 'atorvastatin',
 'ventolin(albuterol)',
 'subopti',
 'malcontrol',
 'management',
 'ivadministration',
 'heparin',
 'ni',
 'troglycerin',
 'metoprolol',
 'adjustment',
 'diabetestreatment',
 'insulin',
 'ventolininhaler']

## Fourth model for dates recognition

In [153]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": extracted_text, "ents": entities}

colors = {
    "DATE" : "blue"
}

options = {"colors" : colors}

In [154]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("tner/roberta-large-ontonotes5")
model = AutoModelForTokenClassification.from_pretrained("tner/roberta-large-ontonotes5")

extracted_text = "John Smith, born on 07/12/1980 in Pontault Combault 77340 with his phone number being 01 02 03 04 05, presented to our hospital named François Mitterrand located in 75015 for a consultation for persistent chest pain associated with shortness of breath. Here is a summary of his medical history and recent consultation performed by Dr. Doe: Mr. Smith reports chest pain for the past three days, described as a pressure sensation. This pain is accompanied by shortness of breath, particularly during moderate exertion. He has a history of hypertension diagnosed in 2015, currently on Lisinopril 10 mg once daily, type 2 diabetes diagnosed in 2018, on Metformin 500 mg twice daily, hyperlipidemia diagnosed in 2017, treated with Atorvastatin 20 mg once daily, and asthma since childhood, requiring the use of Ventolin (albuterol) during attacks. During the physical examination, his temperature was 36.8°C, heart rate 88 bpm, blood pressure 145/90 mmHg, and oxygen saturation 98%. Cardiac auscultation revealed systolic murmurs, and lung auscultation noted bilateral wheezing. Investigations showed an ECG with abnormalities suggestive of a myocardial infarction, a chest X-ray revealing diffuse pulmonary infiltrates indicating congestion, and blood tests indicating elevated troponin levels, a fasting blood glucose of 140 mg/dL. The diagnosis is acute coronary syndrome, likely a myocardial infarction, asthma exacerbation, and suboptimal control of blood glucose and hypertension. The treatment plan includes immediate hospitalization for the management of the myocardial infarction, intravenous administration of heparin, nitroglycerin to relieve chest pain, metoprolol to reduce cardiac workload, and an adjustment of diabetes treatment with insulin. An increased frequency of use of the Ventolin inhaler is also recommended. I recommend regular follow-up in cardiology and endocrinology after hospital discharge to ensure optimal management."

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple", ignore_labels=[])
ner_results = nlp(extracted_text)
spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [434]:
dates = []
date_dic = {}

year_pattern = re.compile(r'\b\d{4}\b')

for result in ner_results:
    if result['entity_group'] == 'DATE':
        dates.append(result['word'])
        match = year_pattern.search(result['word'])
        if match:
            date_dic[int(match.group())] = result['word']

extracted_text = extracted_text.replace(date_dic[min(date_dic)]," DATE DE NAISSANCE")
print(extracted_text)

John Smith, born on DATE DE NAISSANCE in Pontault Combault 77340 with his phone number being 01 02 03 04 05, presented to our hospital named François Mitterrand located in 75015 for a consultation for persistent chest pain associated with shortness of breath. Here is a summary of his medical history and recent consultation performed by Dr. Doe: Mr. Smith reports chest pain for the past three days, described as a pressure sensation. This pain is accompanied by shortness of breath, particularly during moderate exertion. He has a history of hypertension diagnosed in 2015, currently on Lisinopril 10 mg once daily, type 2 diabetes diagnosed in 2018, on Metformin 500 mg twice daily, hyperlipidemia diagnosed in 2017, treated with Atorvastatin 20 mg once daily, and asthma since childhood, requiring the use of Ventolin (albuterol) during attacks. During the physical examination, his temperature was 36.8°C, heart rate 88 bpm, blood pressure 145/90 mmHg, and oxygen saturation 98%. Cardiac auscult

## City recognition

In [319]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
ner_results = nlp(cleaned_text)

spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [270]:
destination = []
for result in ner_results:
    if result["entity_group"] == "LOC":
        destination.append(result["word"])
print(destination)

[]


## Phone number recognition

In [315]:
from gliner import GLiNER

model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")

labels = ["phone number"]
entities = model.predict_entities(cleaned_text, labels)

for entity in entities:
    phone_number = entity["text"]
                
cleaned_text = cleaned_text.replace(phone_number,"NUMERO_DE_TELEPHONE")
print(cleaned_text)

John Smith, born the 12/07/1980 at Pontault 
Combault 77340 with his phone number being 
NUMERO_DE_TELEPHONE, presented to our hospital called 
François Mitterrand for a consultation for 
persistent chest pain associated with shortness 
of breath. Here is a summary of his medical 
history and recent consultation done by doctor 
Doe:  
Mr. Smith reports chest pain for the past three 
days, described as a pressure sensation. This 
pain is accompanied by shortness of breath, 
particularly during moderate exertion. He has a 
history of hypertension diagnosed in 2015, 
currently on Lisinopril 10 mg once a day, Type 2 
diabetes diagnosed in 2018, on Metformin 500 
mg twice a day, hyperlipidemia diagnosed in 
2017, treated with Atorvastatin 20 mg once a 
day, and asthma since childhood, requiring the 
use of Ventolin (albuterol) during attacks.  
On physical examination, his temperature was 
36.8°C, heart rate 88 bpm, blood pressure 
145/90 mmHg, and oxygen saturation 98%. 
Cardiac auscultati

## Postal Code recognitionn

In [36]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": extracted_text, "ents": entities}

colors = {
    "ID_NUM" : "blue"
}

options = {"colors" : colors}

In [56]:
pdf_path = 'medicalreport2.pdf' 
extracted_text = "Marc Dupont, born on 08/11/1975 in Marseille, 13001, and with the phone number 07 23 45 67 89, presented himself to our hospital named Charles de Gaulle, located in 75014, for a consultation regarding persistent chest pains associated with shortness of breath. Here is a summary of his medical history and his recent consultation performed by Dr. Martin: Mr. Dupont reports having chest pains for five days, described as a burning sensation. These pains are accompanied by shortness of breath, especially when climbing stairs. He has a history of hypertension diagnosed in 2012, currently treated with Enalapril 20 mg once a day, type 2 diabetes diagnosed in 2014, on Glucophage 850 mg twice a day, hyperlipidemia diagnosed in 2016, treated with Rosuvastatin 10 mg once a day, and asthma since childhood, requiring the use of Bricanyl during attacks. "

In [57]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, RobertaConfig, pipeline

config = RobertaConfig.from_pretrained("zmilczarek/pii-detection-roberta-v2")
config.hidden_dropout_prob = 0.2
config.attention_probs_dropout_prob = 0.2
config.num_hidden_layers = 27

tokenizer = AutoTokenizer.from_pretrained("zmilczarek/pii-detection-roberta-v2")
model = AutoModelForTokenClassification.from_pretrained("zmilczarek/pii-detection-roberta-v2", config=config)

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
ner_results = nlp(extracted_text)
spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

print(ner_results)
postal_code = []
for result in ner_results:
    if result["entity_group"] == "ID_NUM":
        postal_code.append(result["word"])

postal_code = [s.replace(" ", "") for s in postal_code]

compteur = 0
cleaned_postal_codes = []
code = []
for number in postal_code:
    compteur += len(number)
    code+=number
    if compteur==5:
        code = "".join(code)
        cleaned_postal_codes.append(code)
        compteur=0
        code=[]

for postal_code in cleaned_postal_codes:
    extracted_text = extracted_text.replace(postal_code,"CODE_POSTAL")
    

print(extracted_text)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at zmilczarek/pii-detection-roberta-v2 and are newly initialized: ['roberta.encoder.layer.12.attention.output.LayerNorm.bias', 'roberta.encoder.layer.12.attention.output.LayerNorm.weight', 'roberta.encoder.layer.12.attention.output.dense.bias', 'roberta.encoder.layer.12.attention.output.dense.weight', 'roberta.encoder.layer.12.attention.self.key.bias', 'roberta.encoder.layer.12.attention.self.key.weight', 'roberta.encoder.layer.12.attention.self.query.bias', 'roberta.encoder.layer.12.attention.self.query.weight', 'roberta.encoder.layer.12.attention.self.value.bias', 'roberta.encoder.layer.12.attention.self.value.weight', 'roberta.encoder.layer.12.intermediate.dense.bias', 'roberta.encoder.layer.12.intermediate.dense.weight', 'roberta.encoder.layer.12.output.LayerNorm.bias', 'roberta.encoder.layer.12.output.LayerNorm.weight', 'roberta.encoder.layer.12.output.dense.bias', 'roberta.encoder.layer.1

[{'entity_group': 'NAME_STUDENT', 'score': 0.22703944, 'word': 'Marc', 'start': 0, 'end': 4}, {'entity_group': 'URL_PERSONAL', 'score': 0.122942775, 'word': ' Dup', 'start': 5, 'end': 8}, {'entity_group': 'PHONE_NUM', 'score': 0.11556338, 'word': 'ont', 'start': 8, 'end': 11}, {'entity_group': 'ID_NUM', 'score': 0.26391494, 'word': ' 13', 'start': 46, 'end': 48}, {'entity_group': 'ID_NUM', 'score': 0.22211309, 'word': '001', 'start': 48, 'end': 51}, {'entity_group': 'ID_NUM', 'score': 0.2983673, 'word': ' 07', 'start': 79, 'end': 81}, {'entity_group': 'ID_NUM', 'score': 0.29338244, 'word': ' 23', 'start': 82, 'end': 84}, {'entity_group': 'ID_NUM', 'score': 0.29582024, 'word': ' 45', 'start': 85, 'end': 87}, {'entity_group': 'ID_NUM', 'score': 0.2813172, 'word': ' 67', 'start': 88, 'end': 90}, {'entity_group': 'ID_NUM', 'score': 0.2355655, 'word': ' 89', 'start': 91, 'end': 93}, {'entity_group': 'ID_NUM', 'score': 0.2319983, 'word': ' 75', 'start': 165, 'end': 167}, {'entity_group': 'ID

## Adress recognition

In [128]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": text, "ents": entities}

colors = {
    "STREETADDRESS" : "blue"
}

options = {"colors" : colors}

In [129]:
text = "Mr. Julien Lefebvre, born April 22, 1985, presented himself at Hôpital Saint-Antoine on June 25, 2024, complaining of persistent chest pain for three days. After a thorough clinical examination by Dr. Claire Dubois, as well as additional tests including a chest X-ray and electrocardiogram, it was determined that the patient, residing at 78 Rue de la Paix, 75002 Paris, and reachable at 01 87 65 43 21, is suffering from stable angina. The recommended treatment includes taking nitroglycerin in case of acute pain, as well as starting beta-blocker therapy to prevent future episodes. Regular follow-up with a cardiologist is strongly advised. Mr. Lefebvre has been informed of the measures to modify his lifestyle, including smoking cessation, a balanced diet, and regular physical activity. The patient understood the recommendations and was referred to the cardiology department for specialized follow-up care."
from transformers import AutoTokenizer, AutoModelForTokenClassification, DebertaConfig

config = DebertaConfig.from_pretrained("lakshyakh93/deberta_finetuned_pii")
config.hidden_dropout_prob = 0.3
config.attention_probs_dropout_prob = 0.2
config.num_hidden_layers = 25

model = AutoModelForTokenClassification.from_pretrained("lakshyakh93/deberta_finetuned_pii", config=config)
from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
ner_results = nlp(text)
spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)
print(ner_results)

Some weights of DebertaForTokenClassification were not initialized from the model checkpoint at lakshyakh93/deberta_finetuned_pii and are newly initialized: ['deberta.encoder.layer.12.attention.output.LayerNorm.bias', 'deberta.encoder.layer.12.attention.output.LayerNorm.weight', 'deberta.encoder.layer.12.attention.output.dense.bias', 'deberta.encoder.layer.12.attention.output.dense.weight', 'deberta.encoder.layer.12.attention.self.in_proj.weight', 'deberta.encoder.layer.12.attention.self.pos_proj.weight', 'deberta.encoder.layer.12.attention.self.pos_q_proj.bias', 'deberta.encoder.layer.12.attention.self.pos_q_proj.weight', 'deberta.encoder.layer.12.attention.self.q_bias', 'deberta.encoder.layer.12.attention.self.v_bias', 'deberta.encoder.layer.12.intermediate.dense.bias', 'deberta.encoder.layer.12.intermediate.dense.weight', 'deberta.encoder.layer.12.output.LayerNorm.bias', 'deberta.encoder.layer.12.output.LayerNorm.weight', 'deberta.encoder.layer.12.output.dense.bias', 'deberta.encode

[{'entity_group': 'PREFIX', 'score': 0.18814906, 'word': ' Mr.', 'start': 0, 'end': 3}, {'entity_group': 'FIRSTNAME', 'score': 0.09770989, 'word': ' Jul', 'start': 4, 'end': 7}, {'entity_group': 'FIRSTNAME', 'score': 0.09952756, 'word': 'ien', 'start': 7, 'end': 10}, {'entity_group': 'LASTNAME', 'score': 0.09798784, 'word': ' L', 'start': 11, 'end': 12}, {'entity_group': 'LASTNAME', 'score': 0.10504931, 'word': 'ef', 'start': 12, 'end': 14}, {'entity_group': 'LASTNAME', 'score': 0.0921215, 'word': 'eb', 'start': 14, 'end': 16}, {'entity_group': 'LASTNAME', 'score': 0.086102545, 'word': 'vre', 'start': 16, 'end': 19}, {'entity_group': 'SSN', 'score': 0.06493821, 'word': ' April', 'start': 26, 'end': 31}, {'entity_group': 'AMOUNT', 'score': 0.075393595, 'word': ' 22, 1985', 'start': 32, 'end': 40}, {'entity_group': 'URL', 'score': 0.051511087, 'word': 'pital', 'start': 65, 'end': 70}, {'entity_group': 'CITY', 'score': 0.059554584, 'word': ' Saint', 'start': 71, 'end': 76}, {'entity_group