In [None]:
import pandas as pd
import json
from random import sample

Load files with discharge notes and info about all admissions

In [None]:
# Root directory of the MIMIC-IV v2.2 export. Both tables live below it, so
# pointing the notebook at another copy of the data only needs this one edit.
MIMIC_ROOT = '/sc-resources/dh-mimic/mimic_iv_2_2'

# Discharge notes (free text).
notes = pd.read_csv(f'{MIMIC_ROOT}/note/discharge.csv')
# Hospital admissions; pandas infers the gzip compression from the file extension.
admissions = pd.read_csv(f'{MIMIC_ROOT}/hosp/admissions.csv.gz')

Merge files

In [None]:
# Outer join on the (subject_id, hadm_id) key pair: rows present in only one of
# the two tables are kept, with the other table's columns left missing.
# validate='one_to_one' makes pandas raise if the key pair repeats in either table.
merged = notes.merge(
    admissions,
    on=['subject_id', 'hadm_id'],
    how='outer',
    validate='one_to_one',
)

Remove patients that only appear once

In [None]:
# Despite its name, this mask is True for every row whose subject_id occurs
# more than once in `merged`: keep=False flags all occurrences of a repeated
# value, not only the second and later ones.
one_visit = merged.loc[:, 'subject_id'].duplicated(keep=False)

In [None]:
# Keep only the rows flagged by the boolean mask, i.e. subjects with more than one row.
multiple_visits = merged.loc[one_visit]

Remove unnecessary columns

In [None]:
# Keep only the identifiers, the admission/discharge timestamps and the note text.
# .copy() turns the selection into an independent frame, so later column
# assignments modify it directly instead of a slice of `multiple_visits`
# (assigning into such a slice triggers pandas' SettingWithCopyWarning).
data_reduced = multiple_visits[['subject_id','hadm_id','admittime','dischtime','text']].copy()

Convert dates to correct data types

In [None]:
# Parse both timestamp columns; errors='raise' aborts on any unparseable value.
# assign() builds a new frame whose columns really carry a datetime64 dtype.
# Writing through `.loc[:, col] = ...` sets the values in place on pandas >= 2.0,
# which can leave the column as object dtype, and it warns when the frame is a
# slice of another frame.
data_reduced = data_reduced.assign(
    dischtime=pd.to_datetime(data_reduced['dischtime'], errors='raise'),
    admittime=pd.to_datetime(data_reduced['admittime'], errors='raise'),
)

# Verify the conversion via the column dtypes rather than by looking up index
# label 0, which is not guaranteed to survive the row filtering done earlier.
print(data_reduced['dischtime'].dtype)
print(data_reduced['admittime'].dtype)

In [None]:
# Spot check: show every row whose admission time equals this exact timestamp.
data_reduced[data_reduced['admittime'].eq(pd.Timestamp('2180-07-23 12:35:00'))]

Sort in chronological order and group by patient (subject)

In [None]:
# Order all rows by discharge time, then bucket them per patient. groupby keeps
# the row order inside each group, so every patient's rows stay time-ordered.
grouped = data_reduced.sort_values(by='dischtime').groupby(by='subject_id')

In [None]:
# Spot check: display all rows belonging to subject 10000032.
grouped.get_group(10000032)

In [None]:
# Spot check: display all rows belonging to subject 10014651.
grouped.get_group(10014651)

In [None]:
# Sanity check: within every patient, discharge times must never decrease.
# Iterating the GroupBy directly yields (key, sub-frame) pairs, which avoids a
# separate get_group() lookup for each patient.
for subject_id, single_group in grouped:

    if not single_group['dischtime'].is_monotonic_increasing:
        raise Exception('Visits NOT sorted in chronological order')

print('Visits sorted in chronological order.')

Compute the 30-day readmission label for every note
- skip each patient's last visit (there is no later admission to compare against)
- skip visits without a note

In [None]:
# Build two parallel collections with one entry per visit that has a note and
# is followed by a later visit of the same patient:
#   notes            - the note text
#   readmission_info - hadm_id, subject_id, visit_no and the 30-day label
#
# NOTE(review): `notes` rebinds the name that held the discharge DataFrame
# loaded at the top of the notebook; re-run the loading cell before re-running
# the merge. The name is kept because later cells read `notes` as a list.

# A visit counts as followed by a readmission when the next admission starts
# at most this long after the current discharge.
READMISSION_WINDOW = pd.Timedelta(days=30)
READMISSION_COLUMNS = ['hadm_id', 'subject_id', 'visit_no', 'thirty_day_readmission']

notes = []
readmission_records = []

for subject_id, single_group in grouped:

    # Pull each column out once instead of doing an .iloc lookup per cell.
    texts = single_group['text'].tolist()
    hadm_ids = single_group['hadm_id'].tolist()
    discharge_times = single_group['dischtime'].tolist()
    # Admission time of the *following* visit, aligned with the current row.
    next_admit_times = single_group['admittime'].tolist()[1:]

    # zip() stops at the shortest input, so the patient's last visit (which has
    # no following admission) is left out automatically.
    visits = zip(texts, hadm_ids, discharge_times, next_admit_times)

    # visit_no counts all visits of the patient, including those without a note.
    for visit_no, (text, hadm_id, dischtime, next_admittime) in enumerate(visits):

        # Visits without a discharge note cannot be used as a sample.
        if pd.isna(text):
            continue

        notes.append(text)
        readmission_records.append({
            'hadm_id': hadm_id,
            'subject_id': subject_id,
            'visit_no': visit_no,
            'thirty_day_readmission': (next_admittime - dischtime) <= READMISSION_WINDOW,
        })

# Number of exported notes; always equal to the number of label rows.
note_counter = len(notes)

# Explicit columns keep the schema intact even if no record was produced.
readmission_info = pd.DataFrame(readmission_records, columns=READMISSION_COLUMNS)

Test if everything worked

In [None]:
# Eyeball the first 20 label rows.
# (To inspect a raw note as well: print(notes[0]); print(type(notes[0])))
print(readmission_info.iloc[:20])

In [None]:
# The two collections are built in lockstep, so both numbers should be equal.
print(len(notes), len(readmission_info), sep='\n')

Export every note individually

In [None]:
#for i in range(len(notes)):
#    with open(f"single_notes/{readmission_info['hadm_id'].iloc[i]}.txt", 'w') as file:
#        file.write(notes[i])

Export readmission_info

In [None]:
#readmission_info.to_csv('single_notes_readmission_info.csv')