## Imports

In [305]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, BertConfig, TrainingArguments, Trainer
from transformers import pipeline
import datasets
import fitz
import re
import spacy

## Extracting text from pdf 

In [389]:
def extract_text_from_pdf(pdf_path):
    with fitz.open(pdf_path) as pdf_document:
        text = ""
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            page_text = page.get_text()
            text += page_text
    return text

pdf_path = 'medicalreport15.pdf' 
extracted_text = extract_text_from_pdf(pdf_path)
print(extracted_text)

John Smith, born the 12/07/1980 at 
Pontault Combault 77340 with his 
phone number being 01 02 03 04 
05, presented to our hospital called 
François Mitterrand situated in 
75015 for a consultation for 
persistent chest pain associated 
with shortness of breath. Here is a 
summary of his medical history and 
recent consultation done by doctor 
Doe:  
Mr. Smith reports chest pain for the 
past three days, described as a 
pressure sensation. This pain is 
accompanied by shortness of 
breath, particularly during moderate 
exertion. He has a history of 
hypertension diagnosed in 2015, 
currently on Lisinopril 10 mg once a 
day, Type 2 diabetes diagnosed in 
2018, on Metformin 500 mg twice a 
day, hyperlipidemia diagnosed in 
2017, treated with Atorvastatin 20 
mg once a day, and asthma since 
childhood, requiring the use of 
Ventolin (albuterol) during attacks.  
On physical examination, his 
temperature was 36.8°C, heart rate 
88 bpm, blood pressure 145/90 
mmHg, and oxygen saturation 98%

## Loading model for name(s) of patient(s) only / Classification of the tokens

In [309]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": extracted_text, "ents": entities}

colors = {
    "PATIENT" : "blue"
}

options = {"colors" : colors}

In [310]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
model = AutoModelForTokenClassification.from_pretrained("obi/deid_roberta_i2b2")

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
ner_results = nlp(extracted_text)

patients = [result['word'] for result in ner_results if result['entity_group'] == 'PATIENT']
patients = [patient.strip() for patient in patients]

spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [311]:
patients

['John', 'Smith', 'Smith']

## Loading second model for all name(s) recognition

In [312]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity"]
        })
    return {"text": extracted_text, "ents": entities}

colors = {
    "B-PER" : "blue",
    "I-PER" : "blue"
}

options = {"colors" : colors}

In [313]:
config = BertConfig.from_pretrained("dslim/bert-base-NER")

config.hidden_dropout_prob = 0.2
config.attention_probs_dropout_prob = 0.2
config.num_hidden_layers = 12

model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER", config=config)
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer, ignore_labels=[])
ner_results = nlp(extracted_text)

spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

## Name(s) being seen in the text

In [253]:
person_entities = [result['word'] for result in ner_results if result['entity'] in ['B-PER', 'I-PER']]

person_names = ' '.join(person_entities)

print(person_names)

John Smith François Mi ##tter ##rand Do Smith


## Name(s) / ID(s) association and Pseudonymized text 

In [254]:
name_to_id = {}
current_person_id = None
pseudonymized_text = []
noms = []

for result in ner_results:
    if result['entity'] == 'B-PER' and result['word'] in patients:
        person_name = result['word']
        if person_name not in noms:
            noms.append(person_name)
            current_person_id = 'ID{}'.format(len(name_to_id) + 1)
            name_to_id[current_person_id] = person_name
            pseudonymized_text.append(current_person_id)
        else:
            for cle, valeur in name_to_id.items():
                if name_to_id[cle] in valeur:
                    pseudonymized_text.append(cle)
    elif result['entity'] == 'I-PER' and result['word'] in patients:
        noms.append(result['word'])
        person_name += ' ' + result['word']  
        name_to_id[current_person_id] = person_name.replace('##', '')
        pseudonymized_text.append(current_person_id)
    else:
        pseudonymized_text.append(result['word'])

pseudonymized_text = ' '.join(pseudonymized_text)
print(pseudonymized_text)

ID1 ID1 , born the 12 / 07 / 1980 at Pont ##ault Co ##mba ##ult 77 ##34 ##0 with his phone number being 01 02 03 04 05 , presented to our hospital called François Mi ##tter ##rand for a consultation for persistent chest pain associated with short ##ness of breath . Here is a summary of his medical history and recent consultation done by doctor Do ##e : Mr . ID1 reports chest pain for the past three days , described as a pressure sensation . This pain is accompanied by short ##ness of breath , particularly during moderate ex ##ert ##ion . He has a history of h ##yper ##tens ##ion diagnosed in 2015 , currently on Li ##sin ##op ##ril 10 mg once a day , Type 2 diabetes diagnosed in 2018 , on Met ##form ##in 500 mg twice a day , h ##yper ##lip ##ide ##mia diagnosed in 2017 , treated with At ##or ##vas ##tat ##in 20 mg once a day , and as ##th ##ma since childhood , requiring the use of V ##ent ##olin ( al ##but ##ero ##l ) during attacks . On physical examination , his temperature was 36 . 

In [255]:
cleaned_text = re.sub(r'\s*##\s*', '', pseudonymized_text)
print(cleaned_text)

ID1 ID1 , born the 12 / 07 / 1980 at Pontault Combault 77340 with his phone number being 01 02 03 04 05 , presented to our hospital called François Mitterrand for a consultation for persistent chest pain associated with shortness of breath . Here is a summary of his medical history and recent consultation done by doctor Doe : Mr . ID1 reports chest pain for the past three days , described as a pressure sensation . This pain is accompanied by shortness of breath , particularly during moderate exertion . He has a history of hypertension diagnosed in 2015 , currently on Lisinopril 10 mg once a day , Type 2 diabetes diagnosed in 2018 , on Metformin 500 mg twice a day , hyperlipidemia diagnosed in 2017 , treated with Atorvastatin 20 mg once a day , and asthma since childhood , requiring the use of Ventolin ( albuterol ) during attacks . On physical examination , his temperature was 36 . 8°C , heart rate 88 bpm , blood pressure 145 / 90 mmHg , and oxygen saturation 98 % . Cardiac auscultatio

In [256]:
print(name_to_id)

{'ID1': 'John Smith'}


## Third model for problem(s) / treatment(s) recognition

In [257]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity"]
        })
    return {"text": cleaned_text, "ents": entities}

colors = {
    "B-problem" : "red",
    "I-problem" : "red",
    "B-treatment" : "green",
    "I-treatment" : "green"
}

options = {"colors" : colors}

In [258]:
config = BertConfig.from_pretrained("samrawal/bert-base-uncased_clinical-ner")

#config.hidden_dropout_prob = 0.2
#config.attention_probs_dropout_prob = 0.2
#config.num_hidden_layers = 12

tokenizer = AutoTokenizer.from_pretrained("samrawal/bert-base-uncased_clinical-ner")
model = AutoModelForTokenClassification.from_pretrained("samrawal/bert-base-uncased_clinical-ner", config=config)

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
#example = "My name is John Do and I live in Berlin with Maria Martinez who is the sister of Mario"
ner_results = nlp(cleaned_text)

spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [259]:
problem = []
compteur = -1
for result in ner_results:
    if (result['entity'] == 'B-problem'):
        compteur+=1    
        problem.append(result['word'])
    if (result['entity'] == 'I-problem'):
        problem[compteur] = problem[compteur] + ' '+result['word']

In [260]:
problem

['persistent chest pain',
 'short ##ness of breath',
 'chest pain',
 'a pressure sensation',
 'this pain',
 'short ##ness of breath',
 'hyper',
 '##tension',
 'type 2 diabetes',
 'hyper ##lip ##ide ##mia',
 'asthma',
 'attacks',
 'sy ##sto ##lic murmurs',
 'bilateral w ##hee ##zing',
 'abnormalities',
 'my ##oca ##rdial in ##far ##ction',
 'diffuse pulmonary infiltrate ##s',
 'congestion',
 'elevated tr ##op ##oni ##n levels',
 'a fast ##ing',
 'acute corona ##ry syndrome',
 'a my ##oca ##rdial in ##far ##ction',
 'an asthma ex ##ace ##rba ##tion glucose',
 'hyper ##tension',
 'the my ##oca ##rdial in ##far ##ction',
 'chest pain',
 'cardiac load']

In [261]:
problems = [word.replace('#', '') for word in problem]
problems

['persistent chest pain',
 'short ness of breath',
 'chest pain',
 'a pressure sensation',
 'this pain',
 'short ness of breath',
 'hyper',
 'tension',
 'type 2 diabetes',
 'hyper lip ide mia',
 'asthma',
 'attacks',
 'sy sto lic murmurs',
 'bilateral w hee zing',
 'abnormalities',
 'my oca rdial in far ction',
 'diffuse pulmonary infiltrate s',
 'congestion',
 'elevated tr op oni n levels',
 'a fast ing',
 'acute corona ry syndrome',
 'a my oca rdial in far ction',
 'an asthma ex ace rba tion glucose',
 'hyper tension',
 'the my oca rdial in far ction',
 'chest pain',
 'cardiac load']

In [262]:
treatment = []
compteur = -1
for result in ner_results:
    if (result['entity'] == 'B-treatment'):
        compteur+=1    
        treatment.append(result['word'])
    if (result['entity'] == 'I-treatment'):
        treatment[compteur] = treatment[compteur] +result['word']

In [263]:
treatments = [word.replace('#', '') for word in treatment]
treatments

['li',
 'sinopril',
 'metformin',
 'atorvastatin',
 'ventolin(albuterol)',
 'subopti',
 'malcontrol',
 'management',
 'ivadministration',
 'heparin',
 'ni',
 'troglycerin',
 'metoprolol',
 'adjustment',
 'diabetestreatment',
 'insulin',
 'ventolininhaler']

## Fourth model for dates recognition

In [394]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": extracted_text, "ents": entities}

colors = {
    "DATE" : "blue"
}

options = {"colors" : colors}

In [395]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("tner/roberta-large-ontonotes5")
model = AutoModelForTokenClassification.from_pretrained("tner/roberta-large-ontonotes5")



from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple", ignore_labels=[])
ner_results = nlp(extracted_text)
spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [397]:
dates = []
extracted_text = []
date_dic = {}

year_pattern = re.compile(r'\b\d{4}\b')

for result in ner_results:
    if result['entity_group'] == 'DATE':
        dates.append(result['word'])
        match = year_pattern.search(result['word'])
        if match:
            date_dic[int(match.group())] = result['word']

for result in ner_results:
    if result['word'] == date_dic[min(date_dic)]:
        extracted_text.append("DATE DE NAISSANCE")
    else:
        extracted_text.append(result['word'])
        
extracted_text = ' '.join(extracted_text)
print(extracted_text)

[]


## City recognition

In [319]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
ner_results = nlp(cleaned_text)

spacy_format = convert_to_spacy_format(ner_results)
spacy.displacy.render(spacy_format, options=options, style="ent", manual=True, jupyter=True)

In [270]:
destination = []
for result in ner_results:
    if result["entity_group"] == "LOC":
        destination.append(result["word"])
print(destination)

[]


## Phone number recognition

In [315]:
from gliner import GLiNER

model = GLiNER.from_pretrained("urchade/gliner_multi_pii-v1")

labels = ["phone number"]
entities = model.predict_entities(cleaned_text, labels)

for entity in entities:
    phone_number = entity["text"]
                
cleaned_text = cleaned_text.replace(phone_number,"NUMERO_DE_TELEPHONE")
print(cleaned_text)

John Smith, born the 12/07/1980 at Pontault 
Combault 77340 with his phone number being 
NUMERO_DE_TELEPHONE, presented to our hospital called 
François Mitterrand for a consultation for 
persistent chest pain associated with shortness 
of breath. Here is a summary of his medical 
history and recent consultation done by doctor 
Doe:  
Mr. Smith reports chest pain for the past three 
days, described as a pressure sensation. This 
pain is accompanied by shortness of breath, 
particularly during moderate exertion. He has a 
history of hypertension diagnosed in 2015, 
currently on Lisinopril 10 mg once a day, Type 2 
diabetes diagnosed in 2018, on Metformin 500 
mg twice a day, hyperlipidemia diagnosed in 
2017, treated with Atorvastatin 20 mg once a 
day, and asthma since childhood, requiring the 
use of Ventolin (albuterol) during attacks.  
On physical examination, his temperature was 
36.8°C, heart rate 88 bpm, blood pressure 
145/90 mmHg, and oxygen saturation 98%. 
Cardiac auscultati

## Postal Code recognitionn

In [332]:
def convert_to_spacy_format(ner_results):
    entities = []
    for entity in ner_results:
        entities.append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"]
        })
    return {"text": cleaned_text, "ents": entities}

colors = {
    "ID_NUM" : "blue"
}

options = {"colors" : colors}

In [398]:
pdf_path = 'medicalreport15.pdf' 
extracted_text = extract_text_from_pdf(pdf_path)


In [399]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("zmilczarek/pii-detection-roberta-v2")
model = AutoModelForTokenClassification.from_pretrained("zmilczarek/pii-detection-roberta-v2")

from transformers import pipeline

nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
ner_results = nlp(extracted_text)

postal_code = []
for result in ner_results:
    if result["entity_group"] == "ID_NUM":
        postal_code.append(result["word"])

postal_code = [s.replace(" ", "") for s in postal_code]

compteur = 0
cleaned_postal_codes = []
code = []
for number in postal_code:
    compteur += len(number)
    code+=number
    if compteur==5:
        code = "".join(code)
        cleaned_postal_codes.append(code)
        compteur=0
        code=[]

for postal_code in cleaned_postal_codes:
    extracted_text = extracted_text.replace(postal_code,"CODE_POSTAL")
print(extracted_text)

John Smith, born the 12/07/1980 at 
Pontault Combault CODE_POSTAL with his 
phone number being 01 02 03 04 
05, presented to our hospital called 
François Mitterrand situated in 
CODE_POSTAL for a consultation for 
persistent chest pain associated 
with shortness of breath. Here is a 
summary of his medical history and 
recent consultation done by doctor 
Doe:  
Mr. Smith reports chest pain for the 
past three days, described as a 
pressure sensation. This pain is 
accompanied by shortness of 
breath, particularly during moderate 
exertion. He has a history of 
hypertension diagnosed in 2015, 
currently on Lisinopril 10 mg once a 
day, Type 2 diabetes diagnosed in 
2018, on Metformin 500 mg twice a 
day, hyperlipidemia diagnosed in 
2017, treated with Atorvastatin 20 
mg once a day, and asthma since 
childhood, requiring the use of 
Ventolin (albuterol) during attacks.  
On physical examination, his 
temperature was 36.8°C, heart rate 
88 bpm, blood pressure 145/90 
mmHg, and oxygen sa