In [1]:
import numpy as np
import pandas as pd
import os,sys

In [2]:
import re

In [3]:
# Folder holding the MIMIC-IV-Note 2.2 CSV files. Set the MIMIC_NOTE_DIR
# environment variable to point at another copy of the data; when it is unset
# the original shared-storage location is used.
dir_root = os.environ.get(
    'MIMIC_NOTE_DIR',
    '/nfs/turbo/med-kayvan-lab/Projects/HeartFailure/Data/Raw/physionet.org/files/mimic-iv-note/2.2/note',
)

In [4]:
# Load the discharge table from dir_root; later cells read the note body from
# its 'text' column.
discharge = pd.read_csv(os.path.join(dir_root,'discharge.csv'))

## Calculate the average length of claims

In [6]:
def rough_process(text):
    """Crudely strip layout noise from a note before measuring its length.

    Three literal substrings are deleted, in this order: newline characters,
    pairs of consecutive spaces, and the three-underscore marker ``___``.
    Nothing is inserted in their place, so the text on either side of a
    removed substring is joined directly.

    Parameters
    ----------
    text : str
        Raw note text.

    Returns
    -------
    str
        The text with the substrings above removed.
    """
    # Order matters: removing newlines first can create new space pairs.
    for noise in ('\n', '  ', '___'):
        text = text.replace(noise, '')
    return text

# p rough_process(discharge['text'].iloc[0])

In [7]:
# Add a 'rough' column to `discharge` (in place) holding the cleaned version of
# each note produced by rough_process.
discharge['rough'] = discharge['text'].apply(rough_process)

In [8]:
# Mean number of characters per cleaned note.
discharge['rough'].str.len().mean()

9868.705968160968

## DRG extraction (Optional)

In [258]:
# Reference note kept as a bare string literal: as the last expression of the
# cell it is echoed back as the cell output rather than acting as a comment.
"""
https://github.com/hanyin88/DRG-LLaMA/blob/main/data/MIMIC_Preprocessing.py
Can be used as one of the reference for text pre-processing, but it does little help
"""

'\nhttps://github.com/hanyin88/DRG-LLaMA/blob/main/data/MIMIC_Preprocessing.py\nCan be used as one of the reference for text pre-processing, but it does little help\n'

## Extract information useful for MEDIQA-CORR task

In [49]:
def generate_summary(text):
    """Build a short numbered digest from selected sections of a note.

    For each section header below, only the single line that directly follows
    the header is captured (and that line must itself end with a newline).
    Sections that are not found are skipped; their index is simply absent
    from the result.

    Parameters
    ----------
    text : str
        Full note text.

    Returns
    -------
    str
        One ``"<index>. <first line of section>"`` entry per section found,
        joined with newlines; an empty string when nothing matches.
    """
    # (summary index, pattern) pairs, in output order.
    section_patterns = (
        (0, re.compile(r"Chief Complaint:\n(.+?)\n")),
        (1, re.compile(r"History of Present Illness:\n(.+?)\n")),
        (2, re.compile(r"Medications on Admission:\n(.+?)\n")),
        (3, re.compile(r"Discharge Medications:\n(.+?)\n")),
        (4, re.compile(r"Discharge Diagnosis:\n(.+?)\n")),
    )

    entries = []
    for summary_index, pattern in section_patterns:
        found = pattern.search(text)
        if found is not None:
            entries.append(f"{summary_index}. {found.group(1).strip()}")
    return "\n".join(entries)

### chief complaint and gender

In [75]:
# chief complaint
# Captures the single line that directly follows the "Chief Complaint:" header.
chief_complaint_pattern1 = re.compile(r"Chief Complaint:\n(.+?)\n")
def find_complaints(note):
    """Return the chief-complaint line of a note.

    Parameters
    ----------
    note : str
        Full note text.

    Returns
    -------
    str
        The line after the "Chief Complaint:" header with surrounding
        whitespace removed, or 'No chief complaint' when the header (followed
        by a non-empty line) is not present.
    """
    found = chief_complaint_pattern1.search(note)
    return found.group(1).strip() if found else 'No chief complaint'

In [92]:
# Matches a "Sex:" field whose value is exactly one word character at the end
# of the line.
gender_pattern = re.compile(r'Sex:\s*(\w)\n')
def find_gender(note):
    """Return the one-character value of the "Sex:" field of a note.

    Parameters
    ----------
    note : str
        Full note text.

    Returns
    -------
    str
        The single character captured after "Sex:", or 'Not reported' when no
        such field is found.
    """
    found = gender_pattern.search(note)
    if found is None:
        return 'Not reported'
    # The capture is a single word character, so there is nothing to trim.
    return found.group(1)

### History of Present Illness:

In [254]:
def condense_text(text):
    """Collapse every run of whitespace (including newlines) into one space.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        The text on a single line, with leading and trailing whitespace
        removed.
    """
    # `\s+` already covers newlines, so one substitution is sufficient.
    return re.sub(r'\s+', ' ', text).strip()

In [255]:
def find_history_present_illness(note):
    """Return the "History of Present Illness" narrative of a note on one line.

    The section is taken from the header up to whichever terminator is found,
    tried in this order:

    1. a line starting with "REVIEW OF SYSTEMS:";
    2. a whitespace-only line followed by "Past Medical History:".

    Parameters
    ----------
    note : str
        Full note text.

    Returns
    -------
    str
        The section text with every run of whitespace collapsed to a single
        space, or 'Not reported' when the section cannot be located or is
        empty.
    """
    section_patterns = (
        re.compile(r"History of Present Illness:\s*(.*?)(?=\nREVIEW OF SYSTEMS:)", re.DOTALL),
        re.compile(r"History of Present Illness:\s*(.*?)(?=\n\s+\nPast Medical History:)", re.DOTALL),
    )
    for pattern in section_patterns:
        found = pattern.search(note)
        if not found:
            continue
        matched = found.group(1)
        if not matched:
            return 'Not reported'
        # Whitespace is collapsed here rather than through the notebook-level
        # condense_text: that name is defined twice in this notebook with
        # different behaviour, so calling it would make the result depend on
        # which cell was executed last.
        return re.sub(r'\s+', ' ', matched).strip()
    return 'Not reported'

In [256]:
# Spot-check the extractor on the note at position 30.
print(find_history_present_illness(discharge['text'].iloc[30]))

Ms. ___ is a ___ y/o woman with a past medical history of MS, and a right parietal brain abscess which was discovered approxiamtely one month ago, when she presented with left arm and face numbness. The abscess was drained in the OR on ___, and she was initially started on broad spectrum antibiotics until culture data returned with S. anginosus and fusobacterium, she was then transitioned to Ceftriaxone 2g IV q12h, and flagyl 500mg TID, which she has been on since through her PICC line. On ___, she was seen in ___ clinic and a repeat MRI was performed which revealed increased edema with persistent ring enhancing abnormality at the right parietal surgical site, concerning for ongoing abscess. She was therefore scheduled for repeat drainage on ___. She was seen as an outpatient in the infectious disease office today, ___, and it was recommended that she be admitted to the hospital one day early for broadening of her antibiotic regimen prior to drainage. She states that over the past mont

### vital signs

In [49]:
# vital signs
# Candidate layouts for the vitals line; each captures the remainder of a
# single line. find_vitals tries them in this order and uses the first hit.
vital_sign_pattern1 = re.compile(r"\nVS(.+?)\n")
vital_sign_pattern2 = re.compile(r"\nVitals(.+?)\n")
vital_sign_pattern3 = re.compile(r"\nPhysical Exam:\nO:(.+?)\n")
vital_sign_pattern4 = re.compile(r"\nPHYSICAL EXAM ON ADMISSION:\nO:(.+?)\n")

def customize_BP(string):
    """Split a vitals line on whitespace while keeping each BP reading whole.

    A reading of the form ``<digits>/<digits>`` may contain spaces around the
    slash; it is masked before splitting so that it survives as one token.
    Any token that contains a masked reading is replaced wholesale by the
    next reading, so a label glued to it (such as a "BP:" prefix) is dropped.

    Parameters
    ----------
    string : str
        The captured vitals line.

    Returns
    -------
    list of str
        Whitespace-separated tokens, with blood-pressure readings restored.
    """
    bp_pattern = re.compile(r'\d+\s*/\s*\d+')
    bp_readings = bp_pattern.findall(string)
    for bp in bp_readings:
        string = string.replace(bp, "BP_PLACEHOLDER")
    splits = string.split()
    for i, s in enumerate(splits):
        if 'BP_PLACEHOLDER' in s:
            splits[i] = bp_readings.pop(0)
    return splits

def find_vitals(note):
    """Extract the admission vital signs of a note as one sentence.

    The first vitals line matched by the module-level patterns is tokenised,
    and the first five numeric tokens are read positionally as temperature,
    blood pressure, heart rate, respiratory rate and SpO2. No check is made
    that the values really appear in that order.

    Parameters
    ----------
    note : str
        Full note text.

    Returns
    -------
    str
        A sentence listing the five values, or 'No vital signs at admission
        date' when no vitals line is found or when it yields fewer than five
        numeric tokens.
    """
    matched = None
    for candidate in (vital_sign_pattern1, vital_sign_pattern2,
                      vital_sign_pattern3, vital_sign_pattern4):
        found = candidate.search(note)
        if found:
            matched = found.group(1)
            break
    if matched is None:
        return 'No vital signs at admission date'

    splits = customize_BP(matched)
    # The "O2" label contains a digit and would otherwise pass the numeric
    # filter below, so tokens containing it are dropped first.
    splits = [s for s in splits if 'O2' not in s]
    splits = [s for s in splits if re.search(r'\d', s)]
    # Strip unit letters, then anything that is not a digit, '/' or '.'.
    splits = [re.sub(r'[a-zA-Z]', '', s) for s in splits]
    pattern = re.compile(r'[^\w/\.]|_')
    splits = [pattern.sub('', s) for s in splits]
    if len(splits) < 5:
        # Incomplete vitals line: report it as missing instead of raising
        # IndexError while formatting.
        return 'No vital signs at admission date'
    return f'The Temperature is {splits[0]}, BP is {splits[1]}, HR is {splits[2]}, RR is {splits[3]}, SpO2 is {splits[4]}.'

In [50]:
# Spot-check the vitals extractor on the first five notes.
for i in range(5):
    print(find_vitals(discharge['text'].iloc[i]))

The Temperature is 98.1, BP is 107/61, HR is 78, RR is 18, SpO2 is 97.
The Temperature is 98.1, BP is 105/57, HR is 79, RR is 20, SpO2 is 97.
The Temperature is 98.7, BP is 84/48, HR is 91, RR is 24, SpO2 is 98.
The Temperature is 97, BP is 98/65, HR is 103, RR is 18, SpO2 is 94.
No vital signs at admission date


### main diagnosis

In [420]:
def condense_text(text):
    """Flatten a multi-line diagnosis block into one punctuated line.

    NOTE: this definition replaces the whitespace-only ``condense_text``
    defined in an earlier cell of this notebook.

    Parameters
    ----------
    text : str
        Section text, possibly spanning several lines.

    Returns
    -------
    str
        The text on a single line, with leading and trailing whitespace
        removed.
    """
    # Applied strictly in this order; each step works on the previous result.
    substitutions = (
        (r'\=+', ''),          # drop runs of '='
        (r'\n\n', '. '),       # double newline -> sentence break
        (r'(?<!:)\n', ', '),   # newline not preceded by ':' -> list separator
        (r'\n', ' '),          # remaining newlines (those after ':') -> space
        (r'\-|_', ''),         # drop hyphens and underscores
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [421]:
def find_diagnosis(note):
    """Return the "Discharge Diagnosis" section of a note on one line.

    The section is the text between the "Discharge Diagnosis:" header line
    and a whitespace-only line that is followed by "Discharge Condition:".

    Parameters
    ----------
    note : str
        Full note text.

    Returns
    -------
    str
        The section passed through ``condense_text``, or 'Not found!' when
        the section cannot be located.
    """
    diagnosis_pattern1 = re.compile(r"Discharge Diagnosis:\n(.*?)\n\s+\nDischarge Condition:", re.DOTALL)
    matched = diagnosis_pattern1.search(note)
    if matched is None:
        return 'Not found!'
    return condense_text(matched.group(1).strip())
    

In [422]:
# Spot-check the diagnosis extractor on the note at position 3.
print(find_diagnosis(discharge['text'].iloc[3]))

PRIMARY DIAGNOSIS. Decompensated HCV cirrhosis, Hyperkalemia. SECONDARY DIAGNOSES. HIV, COPD, Hyponatremia


In [423]:
# Print the extracted diagnosis for the first five notes, each preceded by its
# position and followed by a separator line.
# NOTE(review): the saved output of this cell shows no separator lines, so it
# appears to predate this version of the code; re-run the cell to refresh it.
for i in range(5):
    print(i)
    print(find_diagnosis(discharge['text'].iloc[i]))
    print('===========================')

0
Ascites from Portal HTN
1
Primary:  diuretic refractory ascites, Secondary: HCV cirrhosis, HIV, hyponatremia, COPD
2
Primary: Hypotension, Hyperkalemia, Acute Kidney Injury. Secondary: HIV, Cirrhosis, COPD
3
PRIMARY DIAGNOSIS. Decompensated HCV cirrhosis, Hyperkalemia. SECONDARY DIAGNOSES. HIV, COPD, Hyponatremia
4
Dementia
