## Basic medical application of LLMs with Hugging Face
This project focuses on using different methods to analyze medical transcription, including summarization.

In [1]:
from datasets import load_dataset
from transformers import pipeline
import pandas as pd

## Summarization

In [2]:
## Data from Hugging Face "rungalileo/medical_transcription_4"

In [3]:
med_dataset = load_dataset("rungalileo/medical_transcription_4", cache_dir="../datasets/"
)  
med_dataset  

Found cached dataset parquet (C:/Users/Sealion/Desktop/SOS_JOB/datasets/rungalileo___parquet/rungalileo--medical_transcription_4-cc509920750fa75b/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 4499
    })
    test: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 500
    })
})

In [4]:
med_sample = pd.DataFrame.from_dict(med_dataset['train'])
med_sample.head(5)

Unnamed: 0,id,text,label
0,0,"2-D STUDY,1. Mild aortic stenosis, widely calc...",0
1,1,"PREOPERATIVE DIAGNOSES: , Dysphagia and esopha...",3
2,2,"CHIEF COMPLAINT:, The patient comes for three...",0
3,3,"PROCEDURE: , Bilateral L5, S1, S2, and S3 radi...",3
4,4,"DISCHARGE DIAGNOSES:,1. Chronic obstructive pu...",0


In [5]:
set(med_sample['label'])

{0, 1, 2, 3}

In [6]:
class_dict = {0: "Medical Records", 1:"Other", 2:"Internal Medicine", 3: "Surgery"}
med_sample['class'] = med_sample['label'].map(class_dict)
med_sample = med_sample[['id', 'text', 'class']]
med_sample.columns = ['id', 'document', 'class']
med_sample

Unnamed: 0,id,document,class
0,0,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Medical Records
1,1,"PREOPERATIVE DIAGNOSES: , Dysphagia and esopha...",Surgery
2,2,"CHIEF COMPLAINT:, The patient comes for three...",Medical Records
3,3,"PROCEDURE: , Bilateral L5, S1, S2, and S3 radi...",Surgery
4,4,"DISCHARGE DIAGNOSES:,1. Chronic obstructive pu...",Medical Records
...,...,...,...
4494,4494,"CHIEF COMPLAINT:, Headaches.,HEADACHE HISTORY:...",Medical Records
4495,4495,"DATE OF INJURY : October 4, 2000,DATE OF EXAMI...",Medical Records
4496,4496,"PREOPERATIVE DIAGNOSES:,1. Lumbar osteomyelit...",Other
4497,4497,"PREOPERATIVE DIAGNOSIS:, Prostate cancer.,POS...",Surgery


In [7]:
med_sample['class'].value_counts()

Surgery              1442
Medical Records      1126
Internal Medicine    1040
Other                 891
Name: class, dtype: int64

In [8]:
summarizer = pipeline(
    task="summarization",
    model="t5-small",
    min_length=20,
    max_length=100,
    truncation=True,
    model_kwargs={"cache_dir": "../datasets/"},
)  

In [9]:
med_sample['document'][1]

'PREOPERATIVE DIAGNOSES: , Dysphagia and esophageal spasm.,POSTOPERATIVE DIAGNOSES: , Esophagitis and esophageal stricture.,PROCEDURE:,  Gastroscopy.,MEDICATIONS:,  MAC.,DESCRIPTION OF PROCEDURE: , The Olympus gastroscope was introduced into the oropharynx and passed carefully through the esophagus, stomach, and duodenum, to the third portion of the duodenum.  The hypopharynx was normal and the upper esophageal sphincter was unremarkable.  The esophageal contour was normal, with the gastroesophageal junction located at 38 cm from the incisors.  At this point, there were several linear erosions and a sense of stricturing at 38 cm.  Below this, there was a small hiatal hernia with the hiatus noted at 42 cm from the incisors.  The mucosa within the hernia was normal.  The gastric lumen was normal with normal mucosa throughout.  The pylorus was patent permitting passage of the scope into the duodenum, which was normal through the third portion.  During withdrawal of the scope, additional v

In [35]:
# med_sample['document'][1].split(',SUMMARY,')[0]

In [36]:
# med_sample['document'][1]

In [12]:
med_sample['document']

0       2-D STUDY,1. Mild aortic stenosis, widely calc...
1       PREOPERATIVE DIAGNOSES: , Dysphagia and esopha...
2       CHIEF COMPLAINT:,  The patient comes for three...
3       PROCEDURE: , Bilateral L5, S1, S2, and S3 radi...
4       DISCHARGE DIAGNOSES:,1. Chronic obstructive pu...
                              ...                        
4494    CHIEF COMPLAINT:, Headaches.,HEADACHE HISTORY:...
4495    DATE OF INJURY : October 4, 2000,DATE OF EXAMI...
4496    PREOPERATIVE DIAGNOSES:,1.  Lumbar osteomyelit...
4497    PREOPERATIVE DIAGNOSIS:,  Prostate cancer.,POS...
4498    IDENTIFYING DATA:  ,Mr. T is a 45-year-old whi...
Name: document, Length: 4499, dtype: object

In [13]:
summarizer(med_sample["document"][1])



[{'summary_text': 'hypopharynx was normal and upper esophageal sphincter was unremarkable . below this, there was a small hiatal hernia with the hiatus noted at 42 cm from the incisors . the gastric lumen was normal with normal mucosa throughout .'}]

In [14]:
med_sample = med_sample.iloc[:20, :]                  # only select first 20 rows to reduce expensive computation (save time).
results = summarizer(med_sample["document"].to_list())
results

[{'summary_text': 'Mild aortic stenosis, widely calcified, minimally restricted.,2. Mild left ventricular hypertrophy but normal systolic function.,3. Moderate biatrial enlargement .'},
 {'summary_text': 'hypopharynx was normal and upper esophageal sphincter was unremarkable . below this, there was a small hiatal hernia with the hiatus noted at 42 cm from the incisors . the gastric lumen was normal with normal mucosa throughout .'},
 {'summary_text': 'the patient comes for three-week postpartum checkup complaining of allergies . she is breastfeeding and feels like her milk is adequate .'},
 {'summary_text': 'fluoroscopy was used to identify bony landmarks of the sacrum and the sacroiliac joints . the skin, subcutaneous tissue, and muscle within the planned needle approach were anesthetized with 1% Lidocaine .'},
 {'summary_text': 'the patient is a 71-year-old lady who came in with increased shortness of breath of one day duration . she denied history of chest pain or fevers or cough wi

In [15]:
newmed_sample =  pd.DataFrame.from_dict(results).rename({"summary_text": "generated_summary"}, axis=1).join(pd.DataFrame.from_dict(med_sample))
[["generated_summary", "class", "document"]]

[['generated_summary', 'class', 'document']]

In [16]:
display(newmed_sample)

Unnamed: 0,generated_summary,id,document,class
0,"Mild aortic stenosis, widely calcified, minima...",0,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Medical Records
1,hypopharynx was normal and upper esophageal sp...,1,"PREOPERATIVE DIAGNOSES: , Dysphagia and esopha...",Surgery
2,the patient comes for three-week postpartum ch...,2,"CHIEF COMPLAINT:, The patient comes for three...",Medical Records
3,fluoroscopy was used to identify bony landmark...,3,"PROCEDURE: , Bilateral L5, S1, S2, and S3 radi...",Surgery
4,the patient is a 71-year-old lady who came in ...,4,"DISCHARGE DIAGNOSES:,1. Chronic obstructive pu...",Medical Records
5,the patient agreed to proceed and informed con...,5,"INDICATION:, Coronary artery disease, severe ...",Internal Medicine
6,the patient is a 2-year-old girl who comes in ...,6,"SUBJECTIVE:, The patient is a 2-year-old litt...",Medical Records
7,the patient has been suffering from a chronic ...,7,"REASON FOR EVALUATION: , The patient is a 37-y...",Medical Records
8,76 y/o male suddenly became anosmic following ...,8,"CC:, Progressive visual loss.,HX:, 76 y/o male...",Medical Records
9,the patient was advised to protect the knee by...,9,The patient was told that the injection may ca...,Internal Medicine


In [17]:
newmed_sample["document"][1]

'PREOPERATIVE DIAGNOSES: , Dysphagia and esophageal spasm.,POSTOPERATIVE DIAGNOSES: , Esophagitis and esophageal stricture.,PROCEDURE:,  Gastroscopy.,MEDICATIONS:,  MAC.,DESCRIPTION OF PROCEDURE: , The Olympus gastroscope was introduced into the oropharynx and passed carefully through the esophagus, stomach, and duodenum, to the third portion of the duodenum.  The hypopharynx was normal and the upper esophageal sphincter was unremarkable.  The esophageal contour was normal, with the gastroesophageal junction located at 38 cm from the incisors.  At this point, there were several linear erosions and a sense of stricturing at 38 cm.  Below this, there was a small hiatal hernia with the hiatus noted at 42 cm from the incisors.  The mucosa within the hernia was normal.  The gastric lumen was normal with normal mucosa throughout.  The pylorus was patent permitting passage of the scope into the duodenum, which was normal through the third portion.  During withdrawal of the scope, additional v

In [18]:
newmed_sample["generated_summary"][1]

'hypopharynx was normal and upper esophageal sphincter was unremarkable . below this, there was a small hiatal hernia with the hiatus noted at 42 cm from the incisors . the gastric lumen was normal with normal mucosa throughout .'

## Models
## (a). With AutoTokenizer and AutoModelForSeq2SqeLM

In [19]:
# Comparisons
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the pre-trained tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained("t5-small", cache_dir="../datasets/")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small", cache_dir="../datasets/")

# For summarization, T5-small expects a prefix "summarize: ", so we prepend that to each cases as a prompt.
cases = list(map(lambda cases: "summarize: " + cases, newmed_sample["document"]))
a = pd.DataFrame(cases, columns=["prompts"])
a

Unnamed: 0,prompts
0,"summarize: 2-D STUDY,1. Mild aortic stenosis, ..."
1,"summarize: PREOPERATIVE DIAGNOSES: , Dysphagia..."
2,"summarize: CHIEF COMPLAINT:, The patient come..."
3,"summarize: PROCEDURE: , Bilateral L5, S1, S2, ..."
4,"summarize: DISCHARGE DIAGNOSES:,1. Chronic obs..."
5,"summarize: INDICATION:, Coronary artery disea..."
6,"summarize: SUBJECTIVE:, The patient is a 2-ye..."
7,"summarize: REASON FOR EVALUATION: , The patien..."
8,"summarize: CC:, Progressive visual loss.,HX:, ..."
9,summarize: The patient was told that the injec...


In [20]:
# cases[1]

In [21]:
# Tokenize the input
inputs = tokenizer(
    cases, max_length=1024, return_tensors="pt", padding=True, truncation=True
)
print("input_ids:")
print(inputs["input_ids"])
print("attention_mask:")
print(inputs["attention_mask"])

input_ids:
tensor([[21603,    10,  8401,  ...,     0,     0,     0],
        [21603,    10, 22694,  ...,     0,     0,     0],
        [21603,    10,     3,  ...,     0,     0,     0],
        ...,
        [21603,    10,  3388,  ...,     0,     0,     0],
        [21603,    10,     3,  ...,     0,     0,     0],
        [21603,    10,     3,  ...,     6,   150,     1]])
attention_mask:
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])


In [22]:
# Generate summaries
summary_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    num_beams=2,
    min_length=0,
    max_length=40,
)
print(summary_ids)

tensor([[    0,   132,    19,   209,    12,   204,  1220,     3,     9,   127,
          1225,  5925, 17753,  6821,  1153,   894,     6,    68,   150,     3,
             9,   127,  1225,     3,  1913,    32,     7,   159,     3,     5,
           132,    19,   209,    12,   204,  1220,     3,     9,   127,  1225],
        [    0,     8,     3,    15,     7, 10775,   545,   138, 17643,    47,
          1389,     6,    28,     8, 24038,    15,     7, 10775,   545,   138,
         23704,  1069,    44,  6654,  2446,    45,     8,     3, 11542,     7,
           127,     7,     3,     5,   666,    48,     6,   132,    47,     3],
        [    0,     8,  1868,   639,    21,   386,    18,  8041,   442,  2274,
           440,   691,   413,     6, 24184,    13, 18500,     3,     5,   255,
            19,   338,    81,     3,     9,  3016,  8950,  4394,     3,     9,
           239,     6,    59,   136, 24268,    53,    42,     3,  3903,  6031],
        [    0,  6720,    32,  1859, 15652,    47

In [23]:
# Decode the generated summaries
decoded_summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
med_sample_modelA = pd.DataFrame(decoded_summaries, columns=["decoded_summariesA"])
display(med_sample_modelA)

Unnamed: 0,decoded_summariesA
0,there is 1 to 2+ aortic regurgitation easily s...
1,"the esophageal contour was normal, with the ga..."
2,the patient comes for three-week postpartum ch...
3,fluoroscopy was used to identify bony landmark...
4,chronic atrial fibrillation with prior ablatio...
5,"coronary artery disease, severe aortic stenosi..."
6,the patient is a 2-year-old girl who comes in ...
7,the patient has been suffering from a chronic ...
8,76 y/o male suddenly became anosmic following ...
9,the patient was told that the injection may ca...


In [24]:
med_sample_modelA["decoded_summariesA"][1]

'the esophageal contour was normal, with the gastroesophageal junction located at 38 cm from the incisors. below this, there was '

## (b). Another method with T5Tokenizer and T5ForConditionalGeneration

In [25]:
# Alternative method
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5-small", cache_dir="../working/cache/")
model = T5ForConditionalGeneration.from_pretrained(
    "t5-small", cache_dir="../datasets/"
)

In [26]:
inputs = tokenizer(
    cases, max_length=1024, return_tensors="pt", padding=True, truncation=True
)
summary_ids = model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    num_beams=2,
    min_length=0,
    max_length=40,
)
decoded_summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

In [27]:
med_sample_modelB = pd.DataFrame(decoded_summaries, columns=["decoded_summariesB"])
display(med_sample_modelB)

Unnamed: 0,decoded_summariesB
0,there is 1 to 2+ aortic regurgitation easily s...
1,"the esophageal contour was normal, with the ga..."
2,the patient comes for three-week postpartum ch...
3,fluoroscopy was used to identify bony landmark...
4,chronic atrial fibrillation with prior ablatio...
5,"coronary artery disease, severe aortic stenosi..."
6,the patient is a 2-year-old girl who comes in ...
7,the patient has been suffering from a chronic ...
8,76 y/o male suddenly became anosmic following ...
9,the patient was told that the injection may ca...


In [28]:
med_sample_modelB["decoded_summariesB"][1]

'the esophageal contour was normal, with the gastroesophageal junction located at 38 cm from the incisors. below this, there was'

## Translation (Chinese to English)

In [29]:
## Chinese to English (Only one model availble from Hugging Face now)
en_to_es_translation_pipeline = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-zh-en",
    model_kwargs={"cache_dir": "../datasets/"},
)

In [30]:
en_to_es_translation_pipeline(
    "她突然生病了，需要马上看医生，根据现有的症状我们需要尽快选择对口的医院和医生"
)

[{'translation_text': 'She suddenly got sick, she needed to see a doctor right away, and according to the symptoms we had, we had to choose the counterpart hospital and the doctor as soon as possible.'}]

## Generation

In [31]:
t5_small_pipeline = pipeline(
    task="text2text-generation",
    model="t5-small",
    max_length=50,
    model_kwargs={"cache_dir": "../datasets/"},
)

In [32]:
t5_small_pipeline(
    "translate English to Romanian: She suddenly got sick, she needed to see a doctor right away, and according to the symptoms we had, we had to choose the counterpart hospital and the doctor as soon as possible."
)

[{'generated_text': 'A fost brusc bolnavă, a trebuit să vadă un medic imediat şi, potrivit simptomelor pe care le-am avut, a trebuit să alegem cât mai curând spitalul omolog şi medicul.'}]

In [38]:
# t5_small_pipeline(
#     "translate English to Chinese: She suddenly got sick, she needed to see a doctor right away, and according to the symptoms we had, we had to choose the counterpart hospital and the doctor as soon as possible."
# )

In [33]:
## Many thanks to HuggingFace (References).