In [1]:
import os
import csv
import pandas as pd
import pprint as pp

# Adjust this directory for your local setup
mimic_dir = '/Users/vince/Documents/MIMIC/'
os.listdir(mimic_dir)

['.DS_Store', 'mimic-iv-note', 'mimic-iv-2.2']

In [2]:
mimic_note_dir = os.path.join(mimic_dir, 'mimic-iv-note/note')
mimic_icu_dir = os.path.join(mimic_dir, 'mimic-iv-2.2/icu')
mimic_hosp_dir = os.path.join(mimic_dir, 'mimic-iv-2.2/hosp')

### Dataset Construction

A subset of patients (10k at for now) will be used to create the initial dataset

The goal is to create chronological semi-structured journal entries for each patient

The LLM will be trained to output a sentiment score based on the current clinical data

Fine tuning can be accomplished based on discharge type. Some patients were sent to hospice or died, so sentiment should be negative.

Some lab results have the priority STAT which implies a dire situation. Abnormal range values will also be a key indicator of a health problem.



In [3]:
patients_df = pd.read_csv(mimic_hosp_dir + '/patients.csv', nrows=10000)
patients_df.head()

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000032,F,52,2180,2014 - 2016,2180-09-09
1,10000048,F,23,2126,2008 - 2010,
2,10000068,F,19,2160,2008 - 2010,
3,10000084,M,72,2160,2017 - 2019,2161-02-13
4,10000102,F,27,2136,2008 - 2010,


In [5]:
admissions_df = pd.read_csv(mimic_hosp_dir + '/admissions.csv', nrows=10000)
admissions_df.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P874LG,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P09Q6Y,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P60CC5,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P30KEH,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P51VDL,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0


In [13]:
admissions_df['hadm_id'].unique().shape

(10000,)

In [12]:
admissions_df['discharge_location'].value_counts()[:5]

discharge_location
HOME                        3613
HOME HEALTH CARE            1785
SKILLED NURSING FACILITY     982
REHAB                        217
DIED                         192
Name: count, dtype: int64

**Merge lab item codes**

In [14]:
d_labitems_df = pd.read_csv(mimic_hosp_dir + '/d_labitems.csv', nrows=10000)
d_labitems_df.head()

Unnamed: 0,itemid,label,fluid,category
0,50801,Alveolar-arterial Gradient,Blood,Blood Gas
1,50802,Base Excess,Blood,Blood Gas
2,50803,"Calculated Bicarbonate, Whole Blood",Blood,Blood Gas
3,50804,Calculated Total CO2,Blood,Blood Gas
4,50805,Carboxyhemoglobin,Blood,Blood Gas


In [15]:
labevents_df = pd.read_csv(mimic_hosp_dir + '/labevents.csv', nrows=10000)
labevents_df.head()

Unnamed: 0,labevent_id,subject_id,hadm_id,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments
0,1,10000032,,45421181,51237,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,1.4,1.4,,0.9,1.1,abnormal,ROUTINE,
1,2,10000032,,45421181,51274,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,___,15.1,sec,9.4,12.5,abnormal,ROUTINE,VERIFIED.
2,3,10000032,,52958335,50853,P28Z0X,2180-03-23 11:51:00,2180-03-25 11:06:00,___,15.0,ng/mL,30.0,60.0,abnormal,ROUTINE,NEW ASSAY IN USE ___: DETECTS D2 AND D3 25-OH ...
3,4,10000032,,52958335,50861,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,102,102.0,IU/L,0.0,40.0,abnormal,ROUTINE,
4,5,10000032,,52958335,50862,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,3.3,3.3,g/dL,3.5,5.2,abnormal,ROUTINE,


In [25]:
labevents_df['comments'].isna().sum()

8080

In [94]:
labmerge_df = pd.merge(labevents_df, d_labitems_df, on='itemid', how='left')
labmerge_df.head()

Unnamed: 0,labevent_id,subject_id,hadm_id,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments,label,fluid,category
0,1,10000032,,45421181,51237,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,1.4,1.4,,0.9,1.1,abnormal,ROUTINE,,INR(PT),Blood,Hematology
1,2,10000032,,45421181,51274,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,___,15.1,sec,9.4,12.5,abnormal,ROUTINE,VERIFIED.,PT,Blood,Hematology
2,3,10000032,,52958335,50853,P28Z0X,2180-03-23 11:51:00,2180-03-25 11:06:00,___,15.0,ng/mL,30.0,60.0,abnormal,ROUTINE,NEW ASSAY IN USE ___: DETECTS D2 AND D3 25-OH ...,25-OH Vitamin D,Blood,Chemistry
3,4,10000032,,52958335,50861,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,102,102.0,IU/L,0.0,40.0,abnormal,ROUTINE,,Alanine Aminotransferase (ALT),Blood,Chemistry
4,5,10000032,,52958335,50862,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,3.3,3.3,g/dL,3.5,5.2,abnormal,ROUTINE,,Albumin,Blood,Chemistry


In [152]:
cols = ['subject_id', 'hadm_id', 'specimen_id', 'charttime', 'value', 'valuenum',
       'valueuom', 'ref_range_lower', 'ref_range_upper', 'flag', 'priority',
       'comments', 'label', 'fluid', 'category']

labmerge_df = labmerge_df[cols]
labmerge_df['charttime'] = pd.to_datetime(labmerge_df['charttime'])
labmerge_df = labmerge_df.sort_values(by='charttime')
labmerge_df.head()

Unnamed: 0,subject_id,hadm_id,specimen_id,charttime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments,label,fluid,category
5439,10000883,,48550065,2124-05-14 15:58:00,,,mg/dL,2.0,25.0,,STAT,NEG.,Salicylate,Blood,Chemistry
5444,10000883,,48550065,2124-05-14 15:58:00,17,17.0,mg/dL,6.0,20.0,,STAT,,Urea Nitrogen,Blood,Chemistry
5443,10000883,,48550065,2124-05-14 15:58:00,NEG,,,,,,STAT,POSITIVE TRICYCLIC RESULTS REPRESENT POTENTIAL...,Tricyclic Antidepressant Screen,Blood,Chemistry
5442,10000883,23334588.0,48550065,2124-05-14 15:58:00,1.4,1.4,uIU/mL,0.27,4.2,,STAT,,Thyroid Stimulating Hormone,Blood,Chemistry
5441,10000883,23334588.0,48550065,2124-05-14 15:58:00,1.4,1.4,uIU/mL,0.27,4.2,,STAT,,Thyroid Stimulating Hormone,Blood,Chemistry


In [154]:
def create_grouped_text_entry(group):
    specimen_id = group.name
    for index, row in group.iterrows():
        subject_id = row['subject_id']
        hadm_id = row['hadm_id']
        charttime = row['charttime']
        i = index
        break

    text_entries = f"Time: {charttime}, Subject ID: {subject_id}, HAdm ID: {hadm_id}, Specimen ID: {specimen_id}, Lab Tests:"
    # Append details for each test in the group
    for index, row in group.iterrows():
        test_details = (f"\nTest [{index-i+1}]: {row['label']} ({row['category']} - {row['fluid']}), "
                        f"Value: {row['value']} {row['valueuom'] if pd.notna(row['valueuom']) else ''} "
                        f"(Range: [{row['ref_range_lower']}, {row['ref_range_upper']}]), "
                        f"Flag: {row['flag']}, Priority: {row['priority']}{', Comments: ' + row['comments'] if pd.notna(row['comments']) else ''};")
        text_entries += test_details
    
    return specimen_id, subject_id, hadm_id, charttime, text_entries

# Assuming labmerge_filtered is your original dataframe
grouped_data = labmerge_df.groupby('specimen_id').apply(create_grouped_text_entry)

# Convert the series of tuples into a DataFrame
lab_journal_df = pd.DataFrame(grouped_data.tolist(), columns=['specimen_id', 'subject_id', 'hadm_id', 'time', 'text'])

# Display the resulting DataFrame
lab_journal_df.head()


  grouped_data = labmerge_df.groupby('specimen_id').apply(create_grouped_text_entry)


Unnamed: 0,specimen_id,subject_id,hadm_id,time,text
0,30901,10000826,21086876.0,2146-12-24 14:40:00,"Time: 2146-12-24 14:40:00, Subject ID: 1000082..."
1,120307,10000980,,2191-08-21 11:20:00,"Time: 2191-08-21 11:20:00, Subject ID: 1000098..."
2,313687,10001319,,2138-09-27 22:22:00,"Time: 2138-09-27 22:22:00, Subject ID: 1000131..."
3,422899,10000980,,2190-11-16 10:54:00,"Time: 2190-11-16 10:54:00, Subject ID: 1000098..."
4,498884,10000980,25242409.0,2191-04-08 08:46:00,"Time: 2191-04-08 08:46:00, Subject ID: 1000098..."


In [158]:
# pd.set_option('display.max_columns', 50)

microbiologyevents_df = pd.read_csv(mimic_hosp_dir + '/microbiologyevents.csv', nrows=10000)
microbiologyevents_df['charttime'] = pd.to_datetime(microbiologyevents_df['charttime'])
microbiologyevents_df = microbiologyevents_df.sort_values(by='charttime')

microbiologyevents_df.head()

Unnamed: 0,microevent_id,subject_id,hadm_id,micro_specimen_id,order_provider_id,chartdate,charttime,spec_itemid,spec_type_desc,test_seq,...,org_name,isolate_num,quantity,ab_itemid,ab_name,dilution_text,dilution_comparison,dilution_value,interpretation,comments
555,556,10001725,,4439807,P35SU0,2110-01-04 00:00:00,2110-01-04 14:33:00,70068,SWAB,2,...,YEAST,1.0,,,,,,,,
556,557,10001725,,4439807,P35SU0,2110-01-04 00:00:00,2110-01-04 14:33:00,70068,SWAB,1,...,,,,,,,,,,GRAM STAIN NEGATIVE FOR BACTERIAL VAGINOSIS. ...
557,558,10001725,25563031.0,3176906,,2110-04-11 00:00:00,2110-04-11 17:51:00,70091,MRSA SCREEN,1,...,,,,,,,,,,No MRSA isolated.
558,559,10001725,25563031.0,743256,,2110-04-14 00:00:00,2110-04-14 12:37:00,70079,URINE,1,...,CITROBACTER FREUNDII COMPLEX,1.0,,90008.0,TRIMETHOPRIM/SULFA,=>16,=>,16.0,R,
559,560,10001725,25563031.0,743256,,2110-04-14 00:00:00,2110-04-14 12:37:00,70079,URINE,1,...,CITROBACTER FREUNDII COMPLEX,1.0,,90010.0,NITROFURANTOIN,<=16,<=,16.0,S,


In [190]:
import numpy as np

def create_microbiology_text_entry(group):
    # Initialize the entry text
    micro_specimen_id = group.name
    group = group.sort_index()
    
    i = np.inf
    hadm_id = None
    charttime = None
    for index, row in group.iterrows():
        subject_id = row['subject_id']
        hadm_id = row['hadm_id'] or hadm_id
        charttime = row['charttime'] or charttime
        i = min(index, i)

    # Start building the journal entry
    text_entries = f"Time: {charttime}, Subject ID: {subject_id}, HAdm ID: {hadm_id}, Micro Specimen ID: {micro_specimen_id}, Microbiology Tests:"
    # Append details for each test in the group
    for index, row in group.iterrows():
            test_details = (f"\nTest [{index-i+1}]: {row['test_name']} {row['spec_type_desc']} {row['org_name']}, "
                            f"Interpretation: {row['interpretation'] if pd.notna(row['interpretation']) else 'Missing'}"
                            f"{', Comments: ' + row['comments'] if (pd.notna(row['comments']) and row['comments'] != '___') else ''};")
            text_entries += test_details

    return micro_specimen_id, subject_id, hadm_id, charttime, text_entries

# Group by 'micro_specimen_id' and apply the function
grouped_micro_data = microbiologyevents_df.groupby('micro_specimen_id').apply(create_microbiology_text_entry)

# Convert the series of tuples into a DataFrame
micro_journal_df = pd.DataFrame(grouped_micro_data.tolist(), columns=['micro_specimen_id', 'subject_id', 'hadm_id', 'time', 'text'])

# Display the resulting DataFrame
micro_journal_df.head()


  grouped_micro_data = microbiologyevents_df.groupby('micro_specimen_id').apply(create_microbiology_text_entry)


Unnamed: 0,micro_specimen_id,subject_id,hadm_id,time,text
0,361,10013653,26666796.0,2182-10-27 09:10:00,"Time: 2182-10-27 09:10:00, Subject ID: 1001365..."
1,3584,10029291,,2123-02-20 01:13:00,"Time: 2123-02-20 01:13:00, Subject ID: 1002929..."
2,3758,10011427,20219031.0,2136-03-22 14:22:00,"Time: 2136-03-22 14:22:00, Subject ID: 1001142..."
3,9243,10005012,28371912.0,2169-10-23 19:30:00,"Time: 2169-10-23 19:30:00, Subject ID: 1000501..."
4,9430,10023771,20044587.0,2113-08-25 12:45:00,"Time: 2113-08-25 12:45:00, Subject ID: 1002377..."


In [191]:
# Filter to find the entry for micro_specimen_id 3575952
entry = micro_journal_df[micro_journal_df['micro_specimen_id'] == 3575952]

# Print the full text entry
if not entry.empty:
    print(entry.iloc[0]['text'])  # Adjust the index if necessary to target the specific entry
else:
    print("No entry found for micro_specimen_id 3575952.")


Time: 2117-10-26 10:00:00, Subject ID: 10021487, HAdm ID: 27112038.0, Micro Specimen ID: 3575952, Microbiology Tests:
Test [1]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [2]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [3]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [4]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [5]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [6]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [7]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [8]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [9]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [10]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [11]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [12]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [13]: WOUND CULTURE ABSCESS KLEBSIELLA PNEUMON

In [194]:
micro_journal_df.iloc[0].text

'Time: 2182-10-27 09:10:00, Subject ID: 10013653, HAdm ID: 26666796.0, Micro Specimen ID: 361, Microbiology Tests:\nTest [1]: GRAM STAIN SPUTUM nan, Interpretation: Missing, Comments: >25 PMNs and >10 epithelial cells/100X field.  Gram stain indicates extensive contamination with upper respiratory secretions. Bacterial culture results are invalid.  PLEASE SUBMIT ANOTHER SPECIMEN.  ;\nTest [2]: RESPIRATORY CULTURE SPUTUM nan, Interpretation: Missing, Comments: TEST CANCELLED, PATIENT CREDITED.  ;'

In [134]:
omr_df = pd.read_csv(mimic_hosp_dir + '/omr.csv', nrows=10000)
omr_df

Unnamed: 0,subject_id,chartdate,seq_num,result_name,result_value
0,10000032,2180-04-27,1,Blood Pressure,110/65
1,10000032,2180-04-27,1,Weight (Lbs),94
2,10000032,2180-05-07,1,BMI (kg/m2),18.0
3,10000032,2180-05-07,1,Height (Inches),60
4,10000032,2180-05-07,1,Weight (Lbs),92.15
...,...,...,...,...,...
9995,10012853,2177-01-05,1,Weight (Lbs),149.8
9996,10012853,2177-02-12,1,BMI (kg/m2),25.6
9997,10012853,2177-02-12,1,Weight (Lbs),149
9998,10012853,2177-11-03,1,BMI (kg/m2),26.1


In [140]:
def create_omr_text_entry(group):
    entries = []
    for index, row in group.iterrows():
        entry = f"{row['result_name']}: {row['result_value']}"
        entries.append(entry)
    combined_entries = ", ".join(entries)
    return combined_entries

omr_df_cleaned = omr_df.drop_duplicates(subset=['subject_id', 'chartdate', 'result_name', 'result_value'])

# Group by 'subject_id' and 'chartdate' and apply the function to each group
grouped_omr_entries = omr_df_cleaned.groupby(['subject_id', 'chartdate']).apply(create_omr_text_entry)

# Convert the grouped entries to a DataFrame
journal_omr_df = grouped_omr_entries.reset_index(name='text')
journal_omr_df.columns = ['subject_id', 'chartdate', 'text']

# Display the resulting DataFrame
journal_omr_df.head()

  grouped_omr_entries = omr_df_cleaned.groupby(['subject_id', 'chartdate']).apply(create_omr_text_entry)


Unnamed: 0,subject_id,chartdate,text
0,10000032,2180-04-27,"Blood Pressure: 110/65, Weight (Lbs): 94"
1,10000032,2180-05-07,"BMI (kg/m2): 18.0, Height (Inches): 60, Weight..."
2,10000032,2180-05-25,"Blood Pressure: 106/60, BMI (kg/m2): 18.6, Wei..."
3,10000032,2180-06-01,"Blood Pressure: 121/77, BMI (kg/m2): 18.7, Wei..."
4,10000032,2180-06-22,"Blood Pressure: 100/60, BMI (kg/m2): 18.9, Hei..."


In [146]:
journal_omr_df['subject_id'].value_counts()

subject_id
10003019    164
10011912    134
10002013    117
10002755    100
10002221     86
           ... 
10008454      1
10010718      1
10003757      1
10006395      1
10010655      1
Name: count, Length: 212, dtype: int64