In [22]:
import pandas as pd

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

import re
from itertools import combinations
import matplotlib.pyplot as plt

from itertools import combinations
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
from collections import Counter
from pandarallel import pandarallel
import multiprocessing
pandarallel.initialize(nb_workers = multiprocessing.cpu_count()-2, use_memory_fs=False)
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, set_seed
from datasets import Dataset

INFO: Pandarallel will run on 14 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [3]:
data = pd.read_csv('medical_samples.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           4999 non-null   int64 
 1   Pseudo_Patient_Name  4999 non-null   object
 2   description          4999 non-null   object
 3   medical_specialty    4999 non-null   object
 4   sample_name          4999 non-null   object
 5   transcription        4966 non-null   object
 6   keywords             3931 non-null   object
dtypes: int64(1), object(6)
memory usage: 273.5+ KB


1. Assigning a Patient_ID to make it easier to navigate

In [4]:
df = data.copy()

# Create a dictionary to map each unique name to a unique ID
id_map = {}
for i, name in enumerate(df['Pseudo_Patient_Name'].unique()):
    id_map[name] = i+1

# Add a new 'ID' column based on the name-to-ID mapping
df['Patient_ID'] = df['Pseudo_Patient_Name'].map(id_map)
df = df[['Patient_ID', 'Pseudo_Patient_Name', 'description', 'medical_specialty', 'sample_name', 'transcription', 'keywords']]
df.head(2)

Unnamed: 0,Patient_ID,Pseudo_Patient_Name,description,medical_specialty,sample_name,transcription,keywords
0,1,James,A 23-year-old white female presents with complaint of allergies.,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female presents with complaint of allergies. She used to have allergies when she lived in Seattle but she thinks they are worse here. In the past, she has tried Claritin, and Zyrtec. Both worked for short time but then seemed to lose effectiveness. She has used Allegra also. She used that last summer and she began using it again two weeks ago. It does not appear to be working very well. She has used over-the-counter sprays but no prescription nasal...","allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, erythematous, allegra, sprays, allergic,"
1,2,Chester,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to public seating, and lifting objects off the floor. He exercises three times a week at home and does cardio. He has difficulty walking two blocks or five flights of stairs. Difficulty with snoring. He has muscle and joint pains including knee pain, back pain, foot and ankle pain, and swelling. He has gastroesophageal reflux disease.,PAST SURGICAL HISTORY:, Includes reconstructive...","bariatrics, laparoscopic gastric bypass, weight loss programs, gastric bypass, atkin's diet, weight watcher's, body weight, laparoscopic gastric, weight loss, pounds, months, weight, laparoscopic, band, loss, diets, overweight, lost"


1. Get Patient information

In [33]:

def extract_age(text):
    import re
    age_pattern = r'(\d{1,2})\s?-?\s?(?:years?|yrs?)-?(?:old)?'
    age_match = re.search(age_pattern, text.lower())
    if age_match:
        age = int(age_match.group(1))
        return age
    else:
        return 0

def extract_gender(text):
    import re
    male_patterns = [r'\b(he|him|his)\b', r'\b(male|man|boy)\b']
    female_patterns = [r'\b(she|her|hers)\b', r'\b(female|woman|girl)\b']
    
    # Check for male patterns
    for pattern in male_patterns:
        if re.search(pattern, text.lower()):
            return 'male'
    
    # Check for female patterns
    for pattern in female_patterns:
        if re.search(pattern, text.lower()):
            return 'female'
    
    # If no patterns found, return None
    return None

def extract_weight(text):
    import re
    weight_patterns = [
        r'weight\s+is\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'weighing\s+in\s+at\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'tipping\s+the\s+scales\s+at\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'measured\s+at\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'carrying\s+a\s+weight\s+of\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'mass\s+is\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'weighs\s+approximately\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'weighs\s+around\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'weighs\s+between\s+(\d+(?:\.\d+)?)\s+and\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'weighs\s+less\s+than\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'weighs\s+more\s+than\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'weighs\s+a\s+solid\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'currently\s+weighs\s+in\s+at\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'weighs\s+a\s+healthy\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'weight\s+has\s+been\s+consistent\s+at\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'weight\s+was\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'weighs\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'weight\s+is\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'currently\s+weigh\s+(\d+(?:\.\d+)?)\s+(kilograms|kg|pounds|lbs)',
        r'weight\s*:\s*(\d+(?:\.\d+)?)\s*(kilograms|kg|pounds|lbs)'
    ]
    
    for pattern in weight_patterns:
        weight_match = re.search(pattern, text.lower())
        if weight_match:
            weight_value = float(weight_match.group(1))
            weight_unit = weight_match.group(2)
            if weight_unit in ['kilograms', 'kg']:
                weight_value_pounds = weight_value * 2.20462
            else:
                weight_value_pounds = weight_value
            return weight_value_pounds
    
    return 0

df['combined'] = df['description'] = df['transcription'].astype(str)
df['age'] = df['combined'].parallel_apply(extract_age)
df['gender'] = df['combined'].parallel_apply(extract_gender)
df['weight'] = df['combined'].parallel_apply(extract_weight)


patient_id = df['Patient_ID'].unique()

In [34]:
# df.groupby('Patient_ID').agg({'age': 'first', 'gender': 'first', 'weight': 'first'})
# group the DataFrame by ID
grouped = df[['Patient_ID', 'Pseudo_Patient_Name', 'age', 'gender', 'weight']].groupby('Patient_ID')

# define a function to get the first non-null value in a Series
def first_non_null(series):
    index = series.first_valid_index()
    if index is not None:
        return series[index]

# apply the function to each group and reset the index
patient_information = grouped.apply(lambda x: x.apply(first_non_null)).reset_index(drop=True)
patient_information.head()

Unnamed: 0,Patient_ID,Pseudo_Patient_Name,age,gender,weight
0,1,James,23,female,130.0
1,2,Chester,13,male,312.0
2,3,Shannon,42,male,160.0
3,4,Domingo,0,,0.0
4,5,Eugene,0,female,0.0


In [35]:
patient_information.shape

(1280, 5)

Patients with no information are ones who only did a surgical procedure and pre and post surgery examinations at the hospital as seen below.

In [36]:
x = patient_information[(patient_information['age'].isnull()) & (patient_information['gender'].isnull()) & (patient_information['weight'].isnull())]
x = pd.merge(df[['Patient_ID', 'Pseudo_Patient_Name']], x, how='left', on='Patient_ID',suffixes=('_left', '_right'))
x = x[~x['Pseudo_Patient_Name_right'].isnull()]
x = pd.merge(x, df[['Patient_ID', 'Pseudo_Patient_Name', 'description', 'transcription']], how='left', on='Patient_ID')
x[['Patient_ID', 'Pseudo_Patient_Name', 'age', 'weight', 'gender', 'description', 'transcription']]

Unnamed: 0,Patient_ID,Pseudo_Patient_Name,age,weight,gender,description,transcription


In [37]:
df1 = df[['Patient_ID', 'description', 'medical_specialty', 'sample_name', 'transcription']].copy()
df1.head()

Unnamed: 0,Patient_ID,description,medical_specialty,sample_name,transcription
0,1,"SUBJECTIVE:, This 23-year-old white female presents with complaint of allergies. She used to have allergies when she lived in Seattle but she thinks they are worse here. In the past, she has tried Claritin, and Zyrtec. Both worked for short time but then seemed to lose effectiveness. She has used Allegra also. She used that last summer and she began using it again two weeks ago. It does not appear to be working very well. She has used over-the-counter sprays but no prescription nasal...",Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female presents with complaint of allergies. She used to have allergies when she lived in Seattle but she thinks they are worse here. In the past, she has tried Claritin, and Zyrtec. Both worked for short time but then seemed to lose effectiveness. She has used Allegra also. She used that last summer and she began using it again two weeks ago. It does not appear to be working very well. She has used over-the-counter sprays but no prescription nasal..."
1,2,"PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to public seating, and lifting objects off the floor. He exercises three times a week at home and does cardio. He has difficulty walking two blocks or five flights of stairs. Difficulty with snoring. He has muscle and joint pains including knee pain, back pain, foot and ankle pain, and swelling. He has gastroesophageal reflux disease.,PAST SURGICAL HISTORY:, Includes reconstructive...",Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to public seating, and lifting objects off the floor. He exercises three times a week at home and does cardio. He has difficulty walking two blocks or five flights of stairs. Difficulty with snoring. He has muscle and joint pains including knee pain, back pain, foot and ankle pain, and swelling. He has gastroesophageal reflux disease.,PAST SURGICAL HISTORY:, Includes reconstructive..."
2,3,"HISTORY OF PRESENT ILLNESS: , I have seen ABC today. He is a very pleasant gentleman who is 42 years old, 344 pounds. He is 5'9"". He has a BMI of 51. He has been overweight for ten years since the age of 33, at his highest he was 358 pounds, at his lowest 260. He is pursuing surgical attempts of weight loss to feel good, get healthy, and begin to exercise again. He wants to be able to exercise and play volleyball. Physically, he is sluggish. He gets tired quickly. He does not go out...",Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC today. He is a very pleasant gentleman who is 42 years old, 344 pounds. He is 5'9"". He has a BMI of 51. He has been overweight for ten years since the age of 33, at his highest he was 358 pounds, at his lowest 260. He is pursuing surgical attempts of weight loss to feel good, get healthy, and begin to exercise again. He wants to be able to exercise and play volleyball. Physically, he is sluggish. He gets tired quickly. He does not go out..."
3,4,"2-D M-MODE: , ,1. Left atrial enlargement with left atrial diameter of 4.7 cm.,2. Normal size right and left ventricle.,3. Normal LV systolic function with left ventricular ejection fraction of 51%.,4. Normal LV diastolic function.,5. No pericardial effusion.,6. Normal morphology of aortic valve, mitral valve, tricuspid valve, and pulmonary valve.,7. PA systolic pressure is 36 mmHg.,DOPPLER: , ,1. Mild mitral and tricuspid regurgitation.,2. Trace aortic and pulmonary regurgitation.",Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement with left atrial diameter of 4.7 cm.,2. Normal size right and left ventricle.,3. Normal LV systolic function with left ventricular ejection fraction of 51%.,4. Normal LV diastolic function.,5. No pericardial effusion.,6. Normal morphology of aortic valve, mitral valve, tricuspid valve, and pulmonary valve.,7. PA systolic pressure is 36 mmHg.,DOPPLER: , ,1. Mild mitral and tricuspid regurgitation.,2. Trace aortic and pulmonary regurgitation."
4,5,1. The left ventricular cavity size and wall thickness appear normal. The wall motion and left ventricular systolic function appears hyperdynamic with estimated ejection fraction of 70% to 75%. There is near-cavity obliteration seen. There also appears to be increased left ventricular outflow tract gradient at the mid cavity level consistent with hyperdynamic left ventricular systolic function. There is abnormal left ventricular relaxation pattern seen as well as elevated left atrial pr...,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall thickness appear normal. The wall motion and left ventricular systolic function appears hyperdynamic with estimated ejection fraction of 70% to 75%. There is near-cavity obliteration seen. There also appears to be increased left ventricular outflow tract gradient at the mid cavity level consistent with hyperdynamic left ventricular systolic function. There is abnormal left ventricular relaxation pattern seen as well as elevated left atrial pr...


In [38]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Patient_ID         4999 non-null   int64 
 1   description        4999 non-null   object
 2   medical_specialty  4999 non-null   object
 3   sample_name        4999 non-null   object
 4   transcription      4966 non-null   object
dtypes: int64(1), object(4)
memory usage: 195.4+ KB


If the transcription is null, put the description in its place.

In [39]:
# Load and preprocess your medical dataset
# data = pd.read_csv('medical_data.csv')
questions = df['transcription'].tolist()
answers = df['age'].tolist()

# Split your dataset into train and test sets
train_size = int(0.8 * len(questions))
train_questions = questions[:train_size]
train_answers = answers[:train_size]
test_questions = questions[train_size:]
test_answers = answers[train_size:]

In [49]:
train_pd = pd.DataFrame({'text':train_questions,'label':train_answers})
test_pd = pd.DataFrame({'text':test_questions,'label':test_answers})

train_pd.dropna(inplace=True)
test_pd.dropna(inplace=True)

In [55]:
train_dataset = Dataset.from_pandas(train_pd)
test_dataset = Dataset.from_pandas(test_pd)

In [56]:
train_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 3967
})

In [57]:
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

set_seed(42)
def tokenize_text(text):
    return tokenizer(text['text'],truncation=True,padding=True,return_tensors='pt')

tokenized_train_dataset = train_dataset.map(tokenize_text,batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_text,batched=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

Map:   0%|          | 0/3967 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

In [58]:
# Fine-tune the language model on your medical dataset
training_args = TrainingArguments(
    output_dir='results',
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1000,
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    seed=42,
    fp16=True
)

In [61]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer
)

In [62]:
trainer.train()

  0%|          | 0/2480 [00:00<?, ?it/s]

TypeError: BertForQuestionAnswering.forward() got an unexpected keyword argument 'labels'

In [None]:
# Save the fine-tuned model
trainer.save_model('./medical_qa_model')

# Test the model on a sample question
nlp = pipeline('question-answering', model='./medical_qa_model', tokenizer=tokenizer)
result = nlp({
    'question': 'What is the patient\'s age?',
    'context': 'The patient is a 35-year-old male with a history of diabetes and hypertension.'
})
print(result)

In [63]:
# Import required libraries
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, set_seed

# Load and preprocess your medical dataset
questions = df['transcription'].tolist()
answers = df['age'].tolist()

# Split your dataset into train and test sets
train_size = int(0.8 * len(questions))
train_questions = questions[:train_size]
train_answers = answers[:train_size]
test_questions = questions[train_size:]
test_answers = answers[train_size:]

# Load a pre-trained language model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Fine-tune the language model on your medical dataset
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1000,
    max_steps=train_size,
    num_train_epochs=3,
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    seed=42
)

set_seed(42)

train_dataset = [{'transcription': q, 'age': a} for q, a in zip(train_questions, train_answers)]
eval_dataset = [{'transcription': q, 'age': a} for q, a in zip(test_questions, test_answers)]

data_collator = lambda data: {
    'input_ids': tokenizer(
        [item['transcription'] for item in data],
        [item['age'] for item in data],
        truncation=True,
        padding=True,
        return_tensors='pt'
    )['input_ids'],
    'attention_mask': tokenizer(
        [item['transcription'] for item in data],
        [item['age'] for item in data],
        truncation=True,
        padding=True,
        return_tensors='pt'
    )['attention_mask'],
    'start_positions': tokenizer(
        [item['transcription'] for item in data],
        [item['age'] for item in data],
        truncation=True,
        padding=True,
        return_offsets_mapping=True,
        return_tensors='pt'
    ).offset_mapping[:, 0],
    'end_positions': tokenizer(
        [item['transcription'] for item in data],
        [item['age'] for item in data],
        truncation=True,
        padding=True,
        return_offsets_mapping=True,
        return_tensors='pt'
    ).offset_mapping[:, 1] - 1
}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)

trainer.train()

# Save the fine-tuned model
trainer.save_model('./medical_qa_model')

# Test the model on a sample question
nlp = pipeline('question-answering', model='./medical_qa_model', tokenizer=tokenizer)
result = nlp({
    'question': 'What is the patient\'s age?',
    'context': 'The patient is a 35-year-old male with a history of diabetes and hypertension.'
})
print(result)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

KeyError: ignored

In [53]:
train_questions.len()

AttributeError: ignored