In [1]:
import os
import csv
import pandas as pd
import pprint as pp

# Adjust this directory for your local setup
mimic_dir = '/Users/vince/Documents/MIMIC/'
os.listdir(mimic_dir)

['.DS_Store', 'mimic-iv-note', 'mimic-iv-2.2']

In [2]:
mimic_note_dir = os.path.join(mimic_dir, 'mimic-iv-note/note')
mimic_icu_dir = os.path.join(mimic_dir, 'mimic-iv-2.2/icu')
mimic_hosp_dir = os.path.join(mimic_dir, 'mimic-iv-2.2/hosp')

### Dataset Construction

A subset of patients (10k at for now) will be used to create the initial dataset

The goal is to create chronological semi-structured journal entries for each patient

The LLM will be trained to output a sentiment score based on the current clinical data

Fine tuning can be accomplished based on discharge type. Some patients were sent to hospice or died, so sentiment should be negative.

Some lab results have the priority STAT which implies a dire situation. Abnormal range values will also be a key indicator of a health problem.



In [213]:
patients_df = pd.read_csv(mimic_hosp_dir + '/patients.csv', nrows=20000)
patients_df.head()

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000032,F,52,2180,2014 - 2016,2180-09-09
1,10000048,F,23,2126,2008 - 2010,
2,10000068,F,19,2160,2008 - 2010,
3,10000084,M,72,2160,2017 - 2019,2161-02-13
4,10000102,F,27,2136,2008 - 2010,


In [214]:
patients_df.to_csv('data/patients_df.csv', index=False)

**Admissions** 

First merge patient gender and age into df

In [265]:
admissions_df = pd.read_csv(mimic_hosp_dir + '/admissions.csv', nrows=20000)
admissions_df['admittime'] = pd.to_datetime(admissions_df['admittime'])
admissions_df['dischtime'] = pd.to_datetime(admissions_df['dischtime'])
admissions_df = pd.merge(admissions_df, patients_df[['subject_id', 'gender', 'anchor_age']], on='subject_id', how='left')
admissions_df.head()


Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag,gender,anchor_age
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P874LG,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0,F,52
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P09Q6Y,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0,F,52
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P60CC5,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0,F,52
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P30KEH,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0,F,52
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P51VDL,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0,F,19


In [270]:
admissions_df['hadm_id'].isna().sum()

0

In [None]:
admissions_df['subject_id'].unique().shape

**Sample patients**

20000 unique hospital admissions

8579 unique patients

In [273]:
sample_patients = admissions_df['subject_id']

# Define a function to process chunks
def process_chunk(chunk):
    return chunk[chunk['subject_id'].isin(sample_patients)]

# Read and filter a large CSV in chunks
chunk_size = 10000  # Define a suitable chunk size

In [220]:
admissions_df['discharge_location'].value_counts()[:5]

discharge_location
HOME                        10194
HOME HEALTH CARE             4859
SKILLED NURSING FACILITY     2711
REHAB                         698
DIED                          580
Name: count, dtype: int64

In [271]:
admissions_df.to_csv('data/admissions_df.csv', index=False)

**Admissions Journal**

In [348]:

# Create the formatted text for journal entries
# Conceal information about final outcome

admissions_df['journal_text'] = (
    "Subject ID: " + admissions_df['subject_id'].astype(str) +
    ", Hospital Admission ID: " + admissions_df['hadm_id'].astype(str) +
    ", Admission Time: " + admissions_df['admittime'].dt.strftime('%Y-%m-%d %H:%M:%S') +
    "\nAdmission Type: " + admissions_df['admission_type'] +
    ", Admission Location: " + admissions_df['admission_location'] +
    ", Insurance: " + admissions_df['insurance'] +
    ", Language: " + admissions_df['language'] +
    ", Marital Status: " + admissions_df['marital_status'] +
    ", Race: " + admissions_df['race'] +
    ", Gender: " + admissions_df['gender'] +
    ", Age: " + admissions_df['anchor_age'].astype(str)
)

# Selecting relevant columns for the final DataFrame
admissions_journal_df = admissions_df[['subject_id', 'hadm_id', 'admittime', 'discharge_location', 'journal_text']]

# Renaming the 'journal_text' column for clarity
admissions_journal_df.rename(columns={'journal_text': 'text'}, inplace=True)

admissions_journal_df


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  admissions_journal_df.rename(columns={'journal_text': 'text'}, inplace=True)


Unnamed: 0,subject_id,hadm_id,admittime,discharge_location,text
0,10000032,22595853,2180-05-06 22:23:00,HOME,"Subject ID: 10000032, Hospital Admission ID: 2..."
1,10000032,22841357,2180-06-26 18:27:00,HOME,"Subject ID: 10000032, Hospital Admission ID: 2..."
2,10000032,25742920,2180-08-05 23:44:00,HOSPICE,"Subject ID: 10000032, Hospital Admission ID: 2..."
3,10000032,29079034,2180-07-23 12:35:00,HOME,"Subject ID: 10000032, Hospital Admission ID: 2..."
4,10000068,25022803,2160-03-03 23:16:00,,"Subject ID: 10000068, Hospital Admission ID: 2..."
...,...,...,...,...,...
19995,10486120,21496368,2113-11-02 01:11:00,OTHER FACILITY,"Subject ID: 10486120, Hospital Admission ID: 2..."
19996,10486130,25382870,2148-09-16 20:12:00,HOME,"Subject ID: 10486130, Hospital Admission ID: 2..."
19997,10486144,20471114,2125-05-13 20:30:00,SKILLED NURSING FACILITY,"Subject ID: 10486144, Hospital Admission ID: 2..."
19998,10486144,21214054,2124-04-04 14:31:00,HOME,"Subject ID: 10486144, Hospital Admission ID: 2..."


In [None]:
admissions_journal_df.to_csv('data/admissions_journal_df.csv', index=False)

**Merge lab item codes**

In [222]:
d_labitems_df = pd.read_csv(mimic_hosp_dir + '/d_labitems.csv')
d_labitems_df

Unnamed: 0,itemid,label,fluid,category
0,50801,Alveolar-arterial Gradient,Blood,Blood Gas
1,50802,Base Excess,Blood,Blood Gas
2,50803,"Calculated Bicarbonate, Whole Blood",Blood,Blood Gas
3,50804,Calculated Total CO2,Blood,Blood Gas
4,50805,Carboxyhemoglobin,Blood,Blood Gas
...,...,...,...,...
1617,53150,Anti Hbs,Blood,Chemistry
1618,53151,Anti-la,Blood,Chemistry
1619,53152,HIV FINAL,Blood,Chemistry
1620,53153,HIV Screen,Blood,Chemistry


In [274]:
# labevents_df = pd.read_csv(mimic_hosp_dir + '/labevents.csv', nrows=10000)
chunks = pd.read_csv(mimic_hosp_dir + '/labevents.csv', chunksize=chunk_size)
filtered_chunks = [process_chunk(chunk) for chunk in chunks]
labevents_df = pd.concat(filtered_chunks, ignore_index=True)
labevents_df['charttime'] = pd.to_datetime(labevents_df['charttime'])
labevents_df['storetime'] = pd.to_datetime(labevents_df['storetime'])
labevents_df

Unnamed: 0,labevent_id,subject_id,hadm_id,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments
0,1,10000032,,45421181,51237,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,1.4,1.4,,0.9,1.1,abnormal,ROUTINE,
1,2,10000032,,45421181,51274,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,___,15.1,sec,9.4,12.5,abnormal,ROUTINE,VERIFIED.
2,3,10000032,,52958335,50853,P28Z0X,2180-03-23 11:51:00,2180-03-25 11:06:00,___,15.0,ng/mL,30.0,60.0,abnormal,ROUTINE,NEW ASSAY IN USE ___: DETECTS D2 AND D3 25-OH ...
3,4,10000032,,52958335,50861,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,102,102.0,IU/L,0.0,40.0,abnormal,ROUTINE,
4,5,10000032,,52958335,50862,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,3.3,3.3,g/dL,3.5,5.2,abnormal,ROUTINE,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5133140,5455735,10486144,,139341,50887,,2125-12-02 11:15:00,NaT,HOLD. DISCARD GREATER THAN 24 HRS OLD.,,,,,,STAT,___
5133141,5455736,10486144,,3756110,50919,,2125-12-02 11:15:00,NaT,,,,,,,STAT,HOLD. DISCARD GREATER THAN 8 HOURS OLD.
5133142,5455737,10486144,,20184886,50979,,2125-12-02 11:15:00,NaT,,,,,,,STAT,HOLD.
5133143,5455738,10486144,,67321103,50955,,2125-12-02 11:15:00,NaT,___,,,,,,STAT,HOLD.


In [275]:
labevents_df.to_csv('data/labevents_df.csv', index=False)

In [276]:
labevents_df['comments'].isna().sum()

4209273

In [277]:
patients_df['subject_id'].unique().shape

(20000,)

In [278]:
labmerge_df = pd.merge(labevents_df, d_labitems_df, on='itemid', how='left')
labmerge_df['charttime'] = pd.to_datetime(labmerge_df['charttime'])
labmerge_df = labmerge_df.sort_values(by='charttime')
labmerge_df.head()


Unnamed: 0,labevent_id,subject_id,hadm_id,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments,label,fluid,category
2862028,3046098,10269629,,84189135,51221,P75CKP,2109-03-20 15:40:00,2109-03-20 16:58:00,33.9,33.9,%,36.0,48.0,abnormal,STAT,,Hematocrit,Blood,Hematology
2862036,3046106,10269629,,84189135,51301,P75CKP,2109-03-20 15:40:00,2109-03-20 16:58:00,9.0,9.0,K/uL,4.0,11.0,,STAT,,White Blood Cells,Blood,Hematology
2862035,3046105,10269629,,84189135,51279,P75CKP,2109-03-20 15:40:00,2109-03-20 16:58:00,3.86,3.86,m/uL,4.2,5.4,abnormal,STAT,,Red Blood Cells,Blood,Hematology
2862034,3046104,10269629,,84189135,51277,P75CKP,2109-03-20 15:40:00,2109-03-20 16:58:00,12.9,12.9,%,10.5,15.5,,STAT,,RDW,Blood,Hematology
2862033,3046103,10269629,,84189135,51265,P75CKP,2109-03-20 15:40:00,2109-03-20 16:58:00,372.0,372.0,K/uL,150.0,440.0,,STAT,,Platelet Count,Blood,Hematology


In [279]:
# Merge labmerge_df with admissions_df on subject_id
merged_df = pd.merge(labmerge_df, admissions_df[['subject_id', 'hadm_id', 'admittime', 'dischtime']], on='subject_id', how='left')
merged_df.head()

Unnamed: 0,labevent_id,subject_id,hadm_id_x,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,...,ref_range_upper,flag,priority,comments,label,fluid,category,hadm_id_y,admittime,dischtime
0,3046098,10269629,,84189135,51221,P75CKP,2109-03-20 15:40:00,2109-03-20 16:58:00,33.9,33.9,...,48.0,abnormal,STAT,,Hematocrit,Blood,Hematology,25570053,2110-01-18 18:25:00,2110-01-21 14:31:00
1,3046106,10269629,,84189135,51301,P75CKP,2109-03-20 15:40:00,2109-03-20 16:58:00,9.0,9.0,...,11.0,,STAT,,White Blood Cells,Blood,Hematology,25570053,2110-01-18 18:25:00,2110-01-21 14:31:00
2,3046105,10269629,,84189135,51279,P75CKP,2109-03-20 15:40:00,2109-03-20 16:58:00,3.86,3.86,...,5.4,abnormal,STAT,,Red Blood Cells,Blood,Hematology,25570053,2110-01-18 18:25:00,2110-01-21 14:31:00
3,3046104,10269629,,84189135,51277,P75CKP,2109-03-20 15:40:00,2109-03-20 16:58:00,12.9,12.9,...,15.5,,STAT,,RDW,Blood,Hematology,25570053,2110-01-18 18:25:00,2110-01-21 14:31:00
4,3046103,10269629,,84189135,51265,P75CKP,2109-03-20 15:40:00,2109-03-20 16:58:00,372.0,372.0,...,440.0,,STAT,,Platelet Count,Blood,Hematology,25570053,2110-01-18 18:25:00,2110-01-21 14:31:00


In [280]:
# Calculate the absolute time differences
merged_df['time_diff_admit'] = (merged_df['charttime'] - merged_df['admittime']).abs()
merged_df['time_diff_disch'] = (merged_df['charttime'] - merged_df['dischtime']).abs()

# Sum the differences to get a single measure of 'closeness'
merged_df['total_time_diff'] = merged_df['time_diff_admit'] + merged_df['time_diff_disch']
merged_df.head()

Unnamed: 0,labevent_id,subject_id,hadm_id_x,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,...,comments,label,fluid,category,hadm_id_y,admittime,dischtime,time_diff_admit,time_diff_disch,total_time_diff
0,3046098,10269629,,84189135,51221,P75CKP,2109-03-20 15:40:00,2109-03-20 16:58:00,33.9,33.9,...,,Hematocrit,Blood,Hematology,25570053,2110-01-18 18:25:00,2110-01-21 14:31:00,304 days 02:45:00,306 days 22:51:00,611 days 01:36:00
1,3046106,10269629,,84189135,51301,P75CKP,2109-03-20 15:40:00,2109-03-20 16:58:00,9.0,9.0,...,,White Blood Cells,Blood,Hematology,25570053,2110-01-18 18:25:00,2110-01-21 14:31:00,304 days 02:45:00,306 days 22:51:00,611 days 01:36:00
2,3046105,10269629,,84189135,51279,P75CKP,2109-03-20 15:40:00,2109-03-20 16:58:00,3.86,3.86,...,,Red Blood Cells,Blood,Hematology,25570053,2110-01-18 18:25:00,2110-01-21 14:31:00,304 days 02:45:00,306 days 22:51:00,611 days 01:36:00
3,3046104,10269629,,84189135,51277,P75CKP,2109-03-20 15:40:00,2109-03-20 16:58:00,12.9,12.9,...,,RDW,Blood,Hematology,25570053,2110-01-18 18:25:00,2110-01-21 14:31:00,304 days 02:45:00,306 days 22:51:00,611 days 01:36:00
4,3046103,10269629,,84189135,51265,P75CKP,2109-03-20 15:40:00,2109-03-20 16:58:00,372.0,372.0,...,,Platelet Count,Blood,Hematology,25570053,2110-01-18 18:25:00,2110-01-21 14:31:00,304 days 02:45:00,306 days 22:51:00,611 days 01:36:00


In [281]:
# Now sort by this total_time_diff and drop duplicates to get the closest admission per lab event
labmerge_df = merged_df.sort_values(by='total_time_diff').drop_duplicates(subset=['labevent_id'], keep='first')

cols = ['labevent_id', 'subject_id', 'specimen_id', 'itemid', 'charttime', 'storetime', 'value', 'valuenum',
       'valueuom', 'ref_range_lower', 'ref_range_upper', 'flag', 'priority',
       'comments', 'label', 'fluid', 'category', 'hadm_id_y']
# Print or view the DataFrame to verify the results
labmerge_df = labmerge_df[cols]
labmerge_df['hadm_id'] = labmerge_df['hadm_id_y']
labmerge_df.drop(columns=['hadm_id_y'], inplace=True)
labmerge_df.head()

Unnamed: 0,labevent_id,subject_id,specimen_id,itemid,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments,label,fluid,category,hadm_id
909003,3058798,10271174,58399214,50862,2117-01-09 02:22:00,2117-01-09 04:04:00,2.6,2.6,g/dL,3.5,5.2,abnormal,STAT,,Albumin,Blood,Chemistry,26331353
909022,3058854,10271174,32059256,50822,2117-01-09 02:32:00,2117-01-09 02:37:00,5.1,5.1,mEq/L,3.3,5.1,,,,"Potassium, Whole Blood",Blood,Blood Gas,26331353
908994,3058808,10271174,58399214,50954,2117-01-09 02:22:00,2117-01-09 04:43:00,___,16590.0,IU/L,94.0,250.0,abnormal,STAT,VERIFIED BY DILUTION.,Lactate Dehydrogenase (LD),Blood,Chemistry,26331353
909023,3058851,10271174,32059256,50818,2117-01-09 02:32:00,2117-01-09 02:36:00,___,73.0,mm Hg,35.0,45.0,abnormal,,VERIFIED. PROVIDER NOTIFIED PER CURRENT LAB P...,pCO2,Blood,Blood Gas,26331353
909024,3058861,10271174,81224702,51087,2117-01-09 02:34:00,NaT,,,,,,,STAT,RANDOM.,Length of Urine Collection,Urine,Chemistry,26331353


In [282]:
def create_grouped_text_entry(group):
    specimen_id = group.name
    group = group.copy().sort_index()
    for index, row in group.iterrows():
        subject_id = row['subject_id']
        hadm_id = row['hadm_id']
        charttime = row['charttime']
        i = index
        break

    text_entries = f"Time: {charttime}, Subject ID: {subject_id}, HAdm ID: {hadm_id}, Specimen ID: {specimen_id}, Lab Tests:"
    # Append details for each test in the group
    for index, row in group.iterrows():
        test_details = (f"\nTest [{index-i+1}]: {row['label']} ({row['category']} - {row['fluid']}), "
                        f"Value: {row['value']} {row['valueuom'] if pd.notna(row['valueuom']) else ''} "
                        f"(Range: [{row['ref_range_lower']}, {row['ref_range_upper']}]), "
                        f"{'Flag: ' + row['flag'] if pd.notna(row['flag']) else ''}, Priority: {row['priority']}{', Comments: ' + row['comments'] if pd.notna(row['comments']) else ''};")
        text_entries += test_details
    
    return specimen_id, subject_id, hadm_id, charttime, text_entries

# Assuming labmerge_filtered is your original dataframe
grouped_data = labmerge_df.groupby('specimen_id').apply(create_grouped_text_entry)

# Convert the series of tuples into a DataFrame
lab_journal_df = pd.DataFrame(grouped_data.tolist(), columns=['specimen_id', 'subject_id', 'hadm_id', 'time', 'text'])

# Display the resulting DataFrame
lab_journal_df.head()


  grouped_data = labmerge_df.groupby('specimen_id').apply(create_grouped_text_entry)


Unnamed: 0,specimen_id,subject_id,hadm_id,time,text
0,40,10326027,28300774,2183-09-15 11:32:00,"Time: 2183-09-15 11:32:00, Subject ID: 1032602..."
1,42,10120037,29159481,2161-03-04 19:00:00,"Time: 2161-03-04 19:00:00, Subject ID: 1012003..."
2,228,10078309,22180474,2175-12-21 17:31:00,"Time: 2175-12-21 17:31:00, Subject ID: 1007830..."
3,396,10262096,27692157,2170-04-18 05:30:00,"Time: 2170-04-18 05:30:00, Subject ID: 1026209..."
4,437,10334535,25122870,2137-11-03 13:50:00,"Time: 2137-11-03 13:50:00, Subject ID: 1033453..."


In [283]:
lab_journal_df.to_csv('data/lab_journal_df.csv', index=False)

In [284]:
lab_journal_df.iloc[20].text

'Time: 2145-04-23 06:33:00, Subject ID: 10440899, HAdm ID: 26609648, Specimen ID: 3372, Lab Tests:\nTest [1]: RDW-SD (Hematology - Blood), Value: 59.4 fL (Range: [35.1, 46.3]), Flag: abnormal, Priority: ROUTINE;\nTest [2]: MCH (Hematology - Blood), Value: 28.3 pg (Range: [26.0, 32.0]), , Priority: ROUTINE;\nTest [3]: Hemoglobin (Hematology - Blood), Value: 8.9 g/dL (Range: [11.2, 15.7]), Flag: abnormal, Priority: ROUTINE;\nTest [4]: Hematocrit (Hematology - Blood), Value: 29.9 % (Range: [34.0, 45.0]), Flag: abnormal, Priority: ROUTINE;\nTest [8]: MCV (Hematology - Blood), Value: 95 fL (Range: [82.0, 98.0]), , Priority: ROUTINE;\nTest [12]: White Blood Cells (Hematology - Blood), Value: 11.6 K/uL (Range: [4.0, 10.0]), Flag: abnormal, Priority: ROUTINE;\nTest [16]: MCHC (Hematology - Blood), Value: 29.8 g/dL (Range: [32.0, 37.0]), Flag: abnormal, Priority: ROUTINE;\nTest [18]: Platelet Count (Hematology - Blood), Value: 655 K/uL (Range: [150.0, 400.0]), Flag: abnormal, Priority: ROUTINE;

In [285]:
# pd.set_option('display.max_columns', 50)

# microbiologyevents_df = pd.read_csv(mimic_hosp_dir + '/microbiologyevents.csv', nrows=10000)
chunks = pd.read_csv(mimic_hosp_dir + '/microbiologyevents.csv', chunksize=chunk_size)
filtered_chunks = [process_chunk(chunk) for chunk in chunks]
microbiologyevents_df = pd.concat(filtered_chunks, ignore_index=True)
microbiologyevents_df['charttime'] = pd.to_datetime(microbiologyevents_df['charttime'])
microbiologyevents_df = microbiologyevents_df.sort_values(by='charttime')

microbiologyevents_df.head()

Unnamed: 0,microevent_id,subject_id,hadm_id,micro_specimen_id,order_provider_id,chartdate,charttime,spec_itemid,spec_type_desc,test_seq,...,org_name,isolate_num,quantity,ab_itemid,ab_name,dilution_text,dilution_comparison,dilution_value,interpretation,comments
28355,30440,10101340,,3448348,P758UK,2109-05-30 00:00:00,2109-05-30 11:55:00,70017,SEROLOGY/BLOOD,1,...,,,,,,,,,,NONREACTIVE. Reference Range: Non-Reactive.
50736,54234,10169933,,1252722,P05SHV,2109-07-06 00:00:00,2109-07-06 16:30:00,70041,VIRAL CULTURE:R/O HERPES SIMPLEX VIRUS,1,...,,,,,,,,,,NO VIRUS ISOLATED.
105472,113417,10352416,,1197370,P9401N,2109-12-08 00:00:00,2109-12-08 10:00:00,70077,URINE,1,...,,,,,,,,,,Negative for Chlamydia trachomatis by PCR.
105473,113418,10352416,,5163298,P9401N,2109-12-08 00:00:00,2109-12-08 10:00:00,70017,SEROLOGY/BLOOD,1,...,,,,,,,,,,POSITIVE BY EIA. A positive IgG result genera...
105474,113419,10352416,,5290473,P9401N,2109-12-08 00:00:00,2109-12-08 10:00:00,70017,SEROLOGY/BLOOD,1,...,,,,,,,,,,NEGATIVE BY EIA. A negative result generally ...


In [286]:
# Merge labmerge_df with admissions_df on subject_id
merged_df = pd.merge(microbiologyevents_df, admissions_df[['subject_id', 'hadm_id', 'admittime', 'dischtime']], on='subject_id', how='left')

# Calculate the absolute time differences
merged_df['time_diff_admit'] = (merged_df['charttime'] - merged_df['admittime']).abs()
merged_df['time_diff_disch'] = (merged_df['charttime'] - merged_df['dischtime']).abs()

# Sum the differences to get a single measure of 'closeness'
merged_df['total_time_diff'] = merged_df['time_diff_admit'] + merged_df['time_diff_disch']

# Now sort by this total_time_diff and drop duplicates to get the closest admission per lab event
microbiologyevents_df = merged_df.sort_values(by='total_time_diff').drop_duplicates(subset=['microevent_id'], keep='first')

cols = ['microevent_id', 'subject_id', 'micro_specimen_id',
       'order_provider_id', 'chartdate', 'charttime', 'spec_itemid',
       'spec_type_desc', 'test_seq', 'storedate', 'storetime', 'test_itemid',
       'test_name', 'org_itemid', 'org_name', 'isolate_num', 'quantity',
       'ab_itemid', 'ab_name', 'dilution_text', 'dilution_comparison',
       'dilution_value', 'interpretation', 'comments', 'hadm_id_y']

microbiologyevents_df = microbiologyevents_df[cols]
microbiologyevents_df['hadm_id'] = microbiologyevents_df['hadm_id_y']
microbiologyevents_df.drop(columns=['hadm_id_y'], inplace=True)
microbiologyevents_df = microbiologyevents_df.dropna(subset=['hadm_id'])
microbiologyevents_df.head()



Unnamed: 0,microevent_id,subject_id,micro_specimen_id,order_provider_id,chartdate,charttime,spec_itemid,spec_type_desc,test_seq,storedate,...,isolate_num,quantity,ab_itemid,ab_name,dilution_text,dilution_comparison,dilution_value,interpretation,comments,hadm_id
21419,84297,10271174,5817577,,2117-01-09 00:00:00,2117-01-09 02:22:00,70012,BLOOD CULTURE,1,2117-01-15 00:00:00,...,,,,,,,,,NO GROWTH.,26331353
762707,92060,10294074,4138126,,2194-08-07 00:00:00,2194-08-07 00:25:00,70079,URINE,1,2194-08-08 00:00:00,...,,,,,,,,,NO GROWTH.,23396294
570294,16749,10049642,440325,,2177-02-20 00:00:00,2177-02-20 03:35:00,70012,BLOOD CULTURE,1,2177-02-26 00:00:00,...,,,,,,,,,NO GROWTH.,26714941
462491,127059,10401098,7505755,,2167-11-01 00:00:00,2167-11-01 00:49:00,70079,URINE,1,2167-11-02 00:00:00,...,,,,,,,,,"MIXED BACTERIAL FLORA ( >= 3 COLONY TYPES), CO...",24527045
762701,92059,10294074,4030808,,2194-08-06 00:00:00,2194-08-06 23:35:00,70012,BLOOD CULTURE,1,2194-08-12 00:00:00,...,,,,,,,,,NO GROWTH.,23396294


In [287]:
microbiologyevents_df['hadm_id'].isna().sum()

0

In [288]:
microbiologyevents_df.to_csv('data/microbiologyevents_df.csv', index=False)

In [289]:
import numpy as np

def create_microbiology_text_entry(group):
    # Initialize the entry text
    micro_specimen_id = group.name
    group = group.sort_index()
    
    i = np.inf
    hadm_id = None
    charttime = None
    for index, row in group.iterrows():
        subject_id = row['subject_id']
        hadm_id = row['hadm_id'] or hadm_id
        charttime = row['charttime'] or charttime
        i = min(index, i)

    # Start building the journal entry
    text_entries = f"Time: {charttime}, Subject ID: {subject_id}, HAdm ID: {hadm_id}, Micro Specimen ID: {micro_specimen_id}, Microbiology Tests:"
    # Append details for each test in the group
    for index, row in group.iterrows():
            test_details = (f"\nTest [{index-i+1}]: {row['test_name']} {row['spec_type_desc']} {row['org_name']}, "
                            f"Interpretation: {row['interpretation'] if pd.notna(row['interpretation']) else 'Missing'}"
                            f"{', Comments: ' + row['comments'] if (pd.notna(row['comments']) and row['comments'] != '___') else ''};")
            text_entries += test_details

    return micro_specimen_id, subject_id, hadm_id, charttime, text_entries

# Group by 'micro_specimen_id' and apply the function
grouped_micro_data = microbiologyevents_df.groupby('micro_specimen_id').apply(create_microbiology_text_entry)

# Convert the series of tuples into a DataFrame
micro_journal_df = pd.DataFrame(grouped_micro_data.tolist(), columns=['micro_specimen_id', 'subject_id', 'hadm_id', 'time', 'text'])

# Display the resulting DataFrame
micro_journal_df.head()


  grouped_micro_data = microbiologyevents_df.groupby('micro_specimen_id').apply(create_microbiology_text_entry)


Unnamed: 0,micro_specimen_id,subject_id,hadm_id,time,text
0,84,10054716,25339060,2136-05-27 21:10:00,"Time: 2136-05-27 21:10:00, Subject ID: 1005471..."
1,146,10291942,23221552,2176-06-25 10:06:00,"Time: 2176-06-25 10:06:00, Subject ID: 1029194..."
2,300,10080961,24032231,2140-02-23 04:58:00,"Time: 2140-02-23 04:58:00, Subject ID: 1008096..."
3,361,10013653,26666796,2182-10-27 09:10:00,"Time: 2182-10-27 09:10:00, Subject ID: 1001365..."
4,645,10455613,23702964,2193-01-27 09:55:00,"Time: 2193-01-27 09:55:00, Subject ID: 1045561..."


In [290]:
micro_journal_df.to_csv('data/micro_journal_df.csv', index=False)

In [291]:
micro_journal_df.shape

(68005, 5)

In [292]:
# Filter to find the entry for micro_specimen_id 3575952
entry = micro_journal_df[micro_journal_df['micro_specimen_id'] == 3575952]

# Print the full text entry
if not entry.empty:
    print(entry.iloc[0]['text'])  # Adjust the index if necessary to target the specific entry
else:
    print("No entry found for micro_specimen_id 3575952.")


Time: 2117-10-26 10:00:00, Subject ID: 10021487, HAdm ID: 27112038, Micro Specimen ID: 3575952, Microbiology Tests:
Test [1]: WOUND CULTURE ABSCESS ENTEROCOCCUS SP., Interpretation: S;
Test [7]: WOUND CULTURE ABSCESS CITROBACTER FREUNDII COMPLEX, Interpretation: S;
Test [13]: WOUND CULTURE ABSCESS CITROBACTER FREUNDII COMPLEX, Interpretation: S;
Test [19]: WOUND CULTURE ABSCESS CITROBACTER FREUNDII COMPLEX, Interpretation: S;
Test [25]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [31]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [37]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [43]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [49]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [55]: WOUND CULTURE ABSCESS CITROBACTER FREUNDII COMPLEX, Interpretation: S;
Test [61]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S;
Test [67]: WOUND CULTURE ABSCESS ESCHERICHIA COLI, Interpretation: S

In [293]:
# omr_df = pd.read_csv(mimic_hosp_dir + '/omr.csv', nrows=10000)
chunks = pd.read_csv(mimic_hosp_dir + '/omr.csv', chunksize=chunk_size)
filtered_chunks = [process_chunk(chunk) for chunk in chunks]
omr_df = pd.concat(filtered_chunks, ignore_index=True)
omr_df['chartdate'] = pd.to_datetime(omr_df['chartdate'])
omr_df = omr_df.drop_duplicates(subset=['subject_id', 'chartdate', 'result_name', 'result_value'])
omr_df

Unnamed: 0,subject_id,chartdate,seq_num,result_name,result_value
0,10000032,2180-04-27,1,Blood Pressure,110/65
1,10000032,2180-04-27,1,Weight (Lbs),94
2,10000032,2180-05-07,1,BMI (kg/m2),18.0
3,10000032,2180-05-07,1,Height (Inches),60
4,10000032,2180-05-07,1,Weight (Lbs),92.15
...,...,...,...,...,...
264999,10486144,2125-01-17,1,BMI (kg/m2),27.9
265000,10486144,2125-01-17,1,Weight (Lbs),138
265001,10486144,2125-07-26,1,Blood Pressure,130/70
265002,10486144,2125-08-30,1,Blood Pressure,100/52


In [294]:
omr_df.reset_index(inplace=True)

In [295]:

# Merge labmerge_df with admissions_df on subject_id
merged_df = pd.merge(omr_df, admissions_df[['subject_id', 'hadm_id', 'admittime', 'dischtime']], on='subject_id', how='left')

# Calculate the absolute time differences
merged_df['time_diff_admit'] = (merged_df['chartdate'] - merged_df['admittime']).abs()
merged_df['time_diff_disch'] = (merged_df['chartdate'] - merged_df['dischtime']).abs()

# Sum the differences to get a single measure of 'closeness'
merged_df['total_time_diff'] = merged_df['time_diff_admit'] + merged_df['time_diff_disch']

# Now sort by this total_time_diff and drop duplicates to get the closest admission per lab event
omr_df = merged_df.sort_values(by='total_time_diff').drop_duplicates(subset=['index'], keep='first')
omr_df




Unnamed: 0,index,subject_id,chartdate,seq_num,result_name,result_value,hadm_id,admittime,dischtime,time_diff_admit,time_diff_disch,total_time_diff
752760,150414,10272619,2162-09-16,1,BMI (kg/m2),29.5,23536707,2162-09-15 23:35:00,2162-09-16 02:03:00,0 days 00:25:00,0 days 02:03:00,0 days 02:28:00
752745,150413,10272619,2162-09-16,1,Blood Pressure,140/82,23536707,2162-09-15 23:35:00,2162-09-16 02:03:00,0 days 00:25:00,0 days 02:03:00,0 days 02:28:00
752775,150415,10272619,2162-09-16,1,Weight (Lbs),194,23536707,2162-09-15 23:35:00,2162-09-16 02:03:00,0 days 00:25:00,0 days 02:03:00,0 days 02:28:00
1104255,225771,10407848,2138-02-28,1,Weight (Lbs),144.60,26151005,2138-02-27 21:31:00,2138-02-28 00:38:00,0 days 02:29:00,0 days 00:38:00,0 days 03:07:00
1104251,225770,10407848,2138-02-28,1,Height (Inches),67,26151005,2138-02-27 21:31:00,2138-02-28 00:38:00,0 days 02:29:00,0 days 00:38:00,0 days 03:07:00
...,...,...,...,...,...,...,...,...,...,...,...,...
493425,101268,10178988,2142-10-09,1,Weight (Lbs),158.4,24401484,2130-09-25 02:01:00,2130-09-29 17:00:00,4396 days 21:59:00,4392 days 07:00:00,8789 days 04:59:00
493422,101265,10178988,2142-10-09,1,Blood Pressure,138/74,24401484,2130-09-25 02:01:00,2130-09-29 17:00:00,4396 days 21:59:00,4392 days 07:00:00,8789 days 04:59:00
1172794,242970,10438388,2139-08-23,1,Blood Pressure,140/88,26140760,2127-07-24 07:57:00,2127-07-28 11:45:00,4412 days 16:03:00,4408 days 12:15:00,8821 days 04:18:00
1172795,242971,10438388,2139-08-23,1,BMI (kg/m2),30.0,26140760,2127-07-24 07:57:00,2127-07-28 11:45:00,4412 days 16:03:00,4408 days 12:15:00,8821 days 04:18:00


In [297]:
admissions_df[admissions_df['subject_id'] == 10438388]

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag,gender,anchor_age
18181,10438388,26140760,2127-07-24 07:57:00,2127-07-28 11:45:00,,URGENT,P72TSX,PHYSICIAN REFERRAL,HOME,Other,ENGLISH,SINGLE,BLACK/AFRICAN AMERICAN,,,0,F,27


In [300]:
def create_omr_text_entry(group):
    entries = []
    for index, row in group.iterrows():
        entry = f"{row['result_name']}: {row['result_value']}"
        entries.append(entry)
    combined_entries = ", ".join(entries)
    return combined_entries

omr_df_cleaned = omr_df.drop_duplicates(subset=['subject_id', 'chartdate', 'result_name', 'result_value'])

# Group by 'subject_id' and 'chartdate' and apply the function to each group
grouped_omr_entries = omr_df_cleaned.groupby(['subject_id', 'chartdate']).apply(create_omr_text_entry)

# Convert the grouped entries to a DataFrame
journal_omr_df = grouped_omr_entries.reset_index(name='text')
journal_omr_df.columns = ['subject_id', 'chartdate', 'text']

# Display the resulting DataFrame
journal_omr_df.head()

  grouped_omr_entries = omr_df_cleaned.groupby(['subject_id', 'chartdate']).apply(create_omr_text_entry)


Unnamed: 0,subject_id,chartdate,text
0,10000032,2180-04-27,"Blood Pressure: 110/65, Weight (Lbs): 94"
1,10000032,2180-05-07,"Weight (Lbs): 92.15, Height (Inches): 60, BMI ..."
2,10000032,2180-05-25,"Weight (Lbs): 95, BMI (kg/m2): 18.6, Blood Pre..."
3,10000032,2180-06-01,"Weight (Lbs): 95.7, Blood Pressure: 121/77, BM..."
4,10000032,2180-06-22,"Height (Inches): 60, Blood Pressure: 100/60, B..."


In [301]:
journal_omr_df.to_csv('data/journal_omr_df.csv', index=False)

In [302]:
journal_omr_df['subject_id'].value_counts()

subject_id
10476869    293
10207476    290
10151713    285
10157674    259
10088966    237
           ... 
10093625      1
10343242      1
10094582      1
10094773      1
10244410      1
Name: count, Length: 5827, dtype: int64

In [341]:
journal_omr_df.to_csv('data/journal_omr_df.csv', index=False)

### Notes

**Radiology**

In [305]:
# radiology_df = pd.read_csv(mimic_note_dir + '/radiology.csv', nrows=10000)
chunks = pd.read_csv(mimic_note_dir + '/radiology.csv', chunksize=chunk_size)
filtered_chunks = [process_chunk(chunk) for chunk in chunks]
radiology_df = pd.concat(filtered_chunks, ignore_index=True)
radiology_df['charttime'] = pd.to_datetime(radiology_df['charttime'])
radiology_df['storetime'] = pd.to_datetime(radiology_df['storetime'])

In [306]:

# Merge labmerge_df with admissions_df on subject_id
merged_df = pd.merge(radiology_df, admissions_df[['subject_id', 'hadm_id', 'admittime', 'dischtime']], on='subject_id', how='left')

# Calculate the absolute time differences
merged_df['time_diff_admit'] = (merged_df['charttime'] - merged_df['admittime']).abs()
merged_df['time_diff_disch'] = (merged_df['charttime'] - merged_df['dischtime']).abs()

# Sum the differences to get a single measure of 'closeness'
merged_df['total_time_diff'] = merged_df['time_diff_admit'] + merged_df['time_diff_disch']

# Now sort by this total_time_diff and drop duplicates to get the closest admission per lab event
radiology_df = merged_df.sort_values(by='total_time_diff').drop_duplicates(subset=['note_id'], keep='first')
radiology_df





Unnamed: 0,note_id,subject_id,hadm_id_x,note_type,note_seq,charttime,storetime,text,hadm_id_y,admittime,dischtime,time_diff_admit,time_diff_disch,total_time_diff
303774,10271174-RR-11,10271174,26331353.0,RR,11,2117-01-09 02:17:00,2117-01-09 10:41:00,INDICATION: History: ___ with post arrest int...,26331353,2117-01-09 02:57:00,2117-01-09 02:19:00,0 days 00:40:00,0 days 00:02:00,0 days 00:42:00
467273,10430608-RR-19,10430608,21229155.0,RR,19,2116-05-24 10:34:00,2116-05-24 12:58:00,TORSO CT PERFORMED ON ___\n\nCLINICAL HISTORY:...,21229155,2116-05-24 10:35:00,2116-05-24 11:41:00,0 days 00:01:00,0 days 01:07:00,0 days 01:08:00
467272,10430608-RR-18,10430608,21229155.0,RR,18,2116-05-24 10:33:00,2116-05-24 12:16:00,CT HEAD PERFORMED ON ___\n\nCOMPARISON: None....,21229155,2116-05-24 10:35:00,2116-05-24 11:41:00,0 days 00:02:00,0 days 01:08:00,0 days 01:10:00
328183,10294074-RR-44,10294074,23396294.0,RR,44,2194-08-07 01:06:00,2194-08-07 11:56:00,INDICATION: Single episode with head strike.\...,23396294,2194-08-07 00:49:00,2194-08-07 00:00:00,0 days 00:17:00,0 days 01:06:00,0 days 01:23:00
328189,10294074-RR-45,10294074,23396294.0,RR,45,2194-08-07 01:06:00,2194-08-07 11:55:00,INDICATION: Fall with head strike. Evaluatio...,23396294,2194-08-07 00:49:00,2194-08-07 00:00:00,0 days 00:17:00,0 days 01:06:00,0 days 01:23:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221354,10206740-RR-28,10206740,,RR,28,2200-04-19 11:16:00,2200-04-19 13:35:00,EXAMINATION: 3 FOOT STANDING EXTREMITYBILAT; ...,20098413,2188-07-23 01:13:00,2188-08-02 20:15:00,4287 days 10:03:00,4276 days 15:01:00,8564 days 01:04:00
23363,10021477-RR-55,10021477,,RR,55,2171-10-06 12:47:00,2171-10-07 11:24:00,EXAMINATION: BILATERAL DIGITAL SCREENING MAMM...,27360495,2159-12-21 00:17:00,2159-12-22 15:40:00,4307 days 12:30:00,4305 days 21:07:00,8613 days 09:37:00
227464,10208624-RR-22,10208624,,RR,22,2133-01-08 20:14:00,2133-01-08 21:09:00,EXAMINATION: CTA HEAD AND CTA NECK Q16 CT NEC...,23202989,2121-03-19 15:38:00,2121-03-20 18:40:00,4313 days 04:36:00,4312 days 01:34:00,8625 days 06:10:00
196285,10178988-RR-72,10178988,,RR,72,2142-07-22 10:13:00,2142-07-29 07:43:00,EXAMINATION: BILATERAL DIGITAL SCREENING MAMM...,24401484,2130-09-25 02:01:00,2130-09-29 17:00:00,4318 days 08:12:00,4313 days 17:13:00,8632 days 01:25:00


In [310]:
radiology_df.columns

Index(['note_id', 'subject_id', 'hadm_id_x', 'note_type', 'note_seq',
       'charttime', 'storetime', 'text', 'hadm_id_y', 'admittime', 'dischtime',
       'time_diff_admit', 'time_diff_disch', 'total_time_diff', 'hadm_id'],
      dtype='object')

In [311]:
radiology_df['hadm_id'] = radiology_df['hadm_id_y']
radiology_df.drop(columns=['hadm_id_x', 'hadm_id_y'], inplace=True)

In [316]:
radiology_df[radiology_df.subject_id == 10377396]

Unnamed: 0,note_id,subject_id,note_type,note_seq,charttime,storetime,text,admittime,dischtime,time_diff_admit,time_diff_disch,total_time_diff,hadm_id
420360,10377396-RR-23,10377396,RR,23,2179-05-16 18:49:00,2179-05-16 19:16:00,EXAMINATION: CT C-SPINE W/O CONTRAST\n\nINDIC...,2179-05-16 19:52:00,2179-05-16 23:12:00,0 days 01:03:00,0 days 04:23:00,0 days 05:26:00,21664687
420357,10377396-AR-21,10377396,AR,21,2179-05-16 18:48:00,2179-05-16 19:17:00,Addendum:\n\nAdditional information has been o...,2179-05-16 19:52:00,2179-05-16 23:12:00,0 days 01:04:00,0 days 04:24:00,0 days 05:28:00,21664687
420358,10377396-RR-21,10377396,RR,21,2179-05-16 18:48:00,2179-05-16 19:17:00,EXAMINATION: CT HEAD W/O CONTRAST\n\nINDICATI...,2179-05-16 19:52:00,2179-05-16 23:12:00,0 days 01:04:00,0 days 04:24:00,0 days 05:28:00,21664687
420359,10377396-RR-22,10377396,RR,22,2179-05-16 18:48:00,2179-05-16 19:34:00,EXAMINATION: CT SINUS/MANDIBLE/MAXILLOFACIAL ...,2179-05-16 19:52:00,2179-05-16 23:12:00,0 days 01:04:00,0 days 04:24:00,0 days 05:28:00,21664687


In [328]:
# Splitting the DataFrame into RR and AR
rr_df = radiology_df[radiology_df['note_type'] == 'RR']
ar_df = radiology_df[radiology_df['note_type'] == 'AR']

# Merging on 'subject_id', 'charttime' and other necessary identifiers
# Assuming note_seq links RR and AR together, otherwise you might need a different logic
merged_df = pd.merge(rr_df, ar_df, on=['subject_id', 'hadm_id', 'note_seq'], how='left', suffixes=('_rr', '_ar'))

# Creating the combined text with subject details
merged_df['combined_text'] = (
    "Subject ID: " + merged_df['subject_id'].astype(str) +
    ", HAdm ID: " + merged_df['hadm_id'].astype(str) +
    ", Chart Time: " + merged_df['charttime_rr'].dt.strftime('%Y-%m-%d %H:%M:%S') +
    "\n\n" + merged_df['text_rr'] +
    np.where(merged_df['text_ar'].notna(), "\n\nAddendum:\n" + merged_df['text_ar'], "")
)

# Selecting relevant columns
radiology_journal_df = merged_df[['note_id_rr', 'subject_id', 'hadm_id', 'charttime_rr', 'combined_text']]

# Renaming columns for clarity
radiology_journal_df.rename(columns={'charttime_rr': 'charttime', 'note_id_rr': 'note_id', 'combined_text': 'text'}, inplace=True)

# View the result
radiology_journal_df



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  radiology_journal_df.rename(columns={'charttime_rr': 'charttime', 'note_id_rr': 'note_id', 'combined_text': 'text'}, inplace=True)


Unnamed: 0,note_id,subject_id,hadm_id,charttime,text
0,10271174-RR-11,10271174,26331353,2117-01-09 02:17:00,"Subject ID: 10271174, HAdm ID: 26331353, Chart..."
1,10430608-RR-19,10430608,21229155,2116-05-24 10:34:00,"Subject ID: 10430608, HAdm ID: 21229155, Chart..."
2,10430608-RR-18,10430608,21229155,2116-05-24 10:33:00,"Subject ID: 10430608, HAdm ID: 21229155, Chart..."
3,10294074-RR-44,10294074,23396294,2194-08-07 01:06:00,"Subject ID: 10294074, HAdm ID: 23396294, Chart..."
4,10294074-RR-45,10294074,23396294,2194-08-07 01:06:00,"Subject ID: 10294074, HAdm ID: 23396294, Chart..."
...,...,...,...,...,...
97490,10206740-RR-28,10206740,20098413,2200-04-19 11:16:00,"Subject ID: 10206740, HAdm ID: 20098413, Chart..."
97491,10021477-RR-55,10021477,27360495,2171-10-06 12:47:00,"Subject ID: 10021477, HAdm ID: 27360495, Chart..."
97492,10208624-RR-22,10208624,23202989,2133-01-08 20:14:00,"Subject ID: 10208624, HAdm ID: 23202989, Chart..."
97493,10178988-RR-72,10178988,24401484,2142-07-22 10:13:00,"Subject ID: 10178988, HAdm ID: 24401484, Chart..."


In [329]:
radiology_journal_df.to_csv('data/radiology_journal_df.csv', index=False)

**Discharge**


In [330]:
chunks = pd.read_csv(mimic_note_dir + '/discharge.csv', chunksize=chunk_size)
filtered_chunks = [process_chunk(chunk) for chunk in chunks]
discharge_df = pd.concat(filtered_chunks, ignore_index=True)
discharge_df['charttime'] = pd.to_datetime(discharge_df['charttime'])
discharge_df['storetime'] = pd.to_datetime(discharge_df['storetime'])
discharge_df

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text
0,10000032-DS-21,10000032,22595853,DS,21,2180-05-07,2180-05-09 15:26:00,\nName: ___ Unit No: _...
1,10000032-DS-22,10000032,22841357,DS,22,2180-06-27,2180-07-01 10:15:00,\nName: ___ Unit No: _...
2,10000032-DS-23,10000032,29079034,DS,23,2180-07-25,2180-07-25 21:42:00,\nName: ___ Unit No: _...
3,10000032-DS-24,10000032,25742920,DS,24,2180-08-07,2180-08-10 05:43:00,\nName: ___ Unit No: _...
4,10000084-DS-17,10000084,23052089,DS,17,2160-11-25,2160-11-25 15:09:00,\nName: ___ Unit No: __...
...,...,...,...,...,...,...,...,...
15223,10486120-DS-3,10486120,21496368,DS,3,2113-11-03,2114-01-18 15:45:00,\nName: ___ Unit No: ___\...
15224,10486130-DS-15,10486130,25382870,DS,15,2148-09-18,2148-09-18 08:33:00,\nName: ___ Unit No: __...
15225,10486144-DS-25,10486144,21214054,DS,25,2124-04-06,2124-04-08 10:14:00,\nName: ___ Unit No: ___\...
15226,10486144-DS-26,10486144,28877892,DS,26,2124-07-14,2124-07-14 17:01:00,\nName: ___ Unit No: ___\...


In [337]:
discharge_df['charttime'] = pd.to_datetime(discharge_df['charttime'])

# Create the formatted text for journal entries
discharge_df['journal_text'] = (
    "Subject ID: " + discharge_df['subject_id'].astype(str) +
    ", HAdm ID: " + discharge_df['hadm_id'].astype(str) +
    ", Chart Time: " + discharge_df['charttime'].dt.strftime('%Y-%m-%d %H:%M:%S') +
    "\n\n" + discharge_df['text']
)

# Selecting relevant columns for the final DataFrame
discharge_journal_df = discharge_df[['note_id', 'subject_id', 'hadm_id', 'charttime', 'journal_text']]

# Renaming the 'journal_text' column for clarity
discharge_journal_df.rename(columns={'journal_text': 'text'}, inplace=True)

# Display the DataFrame to verify the results
discharge_journal_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  discharge_journal_df.rename(columns={'journal_text': 'text'}, inplace=True)


Unnamed: 0,note_id,subject_id,hadm_id,charttime,text
0,10000032-DS-21,10000032,22595853,2180-05-07,"Subject ID: 10000032, HAdm ID: 22595853, Chart..."
1,10000032-DS-22,10000032,22841357,2180-06-27,"Subject ID: 10000032, HAdm ID: 22841357, Chart..."
2,10000032-DS-23,10000032,29079034,2180-07-25,"Subject ID: 10000032, HAdm ID: 29079034, Chart..."
3,10000032-DS-24,10000032,25742920,2180-08-07,"Subject ID: 10000032, HAdm ID: 25742920, Chart..."
4,10000084-DS-17,10000084,23052089,2160-11-25,"Subject ID: 10000084, HAdm ID: 23052089, Chart..."
...,...,...,...,...,...
15223,10486120-DS-3,10486120,21496368,2113-11-03,"Subject ID: 10486120, HAdm ID: 21496368, Chart..."
15224,10486130-DS-15,10486130,25382870,2148-09-18,"Subject ID: 10486130, HAdm ID: 25382870, Chart..."
15225,10486144-DS-25,10486144,21214054,2124-04-06,"Subject ID: 10486144, HAdm ID: 21214054, Chart..."
15226,10486144-DS-26,10486144,28877892,2124-07-14,"Subject ID: 10486144, HAdm ID: 28877892, Chart..."


In [339]:
import pprint as pp 
pp.pprint(discharge_journal_df.iloc[0].text)

('Subject ID: 10000032, HAdm ID: 22595853, Chart Time: 2180-05-07 00:00:00\n'
 '\n'
 ' \n'
 'Name:  ___                     Unit No:   ___\n'
 ' \n'
 'Admission Date:  ___              Discharge Date:   ___\n'
 ' \n'
 'Date of Birth:  ___             Sex:   F\n'
 ' \n'
 'Service: MEDICINE\n'
 ' \n'
 'Allergies: \n'
 'No Known Allergies / Adverse Drug Reactions\n'
 ' \n'
 'Attending: ___\n'
 ' \n'
 'Chief Complaint:\n'
 'Worsening ABD distension and pain \n'
 ' \n'
 'Major Surgical or Invasive Procedure:\n'
 'Paracentesis\n'
 '\n'
 ' \n'
 'History of Present Illness:\n'
 '___ HCV cirrhosis c/b ascites, hiv on ART, h/o IVDU, COPD, \n'
 'bioplar, PTSD, presented from OSH ED with worsening abd \n'
 'distension over past week.  \n'
 'Pt reports self-discontinuing lasix and spirnolactone ___ weeks \n'
 'ago, because she feels like "they don\'t do anything" and that \n'
 'she "doesn\'t want to put more chemicals in her." She does not \n'
 'follow Na-restricted diets. In the past week, she not

In [340]:
discharge_journal_df.to_csv('data/discharge_journal_df.csv', index=False)

In [343]:
admissions_df.discharge_location.value_counts()

discharge_location
HOME                            7186
HOME HEALTH CARE                3499
SKILLED NURSING FACILITY        1922
REHAB                            501
DIED                             412
CHRONIC/LONG TERM ACUTE CARE     330
HOSPICE                          149
AGAINST ADVICE                   141
PSYCH FACILITY                   101
ACUTE HOSPITAL                    74
OTHER FACILITY                    54
ASSISTED LIVING                   29
HEALTHCARE FACILITY                4
Name: count, dtype: int64