In [1]:
import os, sys
import time
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime, date

## Import discharge summaries

In [2]:
path = os.path.join(os.getcwd(), 'NOTEEVENTS.csv')
start = time.time()
df_notes = pd.read_csv(path, encoding='utf8', engine='python')
end = time.time()
print('Read NOTEEVENTS.csv in {} seconds'.format(end-start))

Read NOTEEVENTS.csv in 90.56419706344604 seconds


In [3]:
discharge_notes = df_notes[df_notes['CATEGORY'] == 'Discharge summary']

In [4]:
discharge_notes

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...
5,179,53181,170490.0,2172-03-08,,,Discharge summary,Report,,,Admission Date: [**2172-3-5**] D...
6,180,20646,134727.0,2112-12-10,,,Discharge summary,Report,,,Admission Date: [**2112-12-8**] ...
7,181,42130,114236.0,2150-03-01,,,Discharge summary,Report,,,Admission Date: [**2150-2-25**] ...
8,182,56174,163469.0,2118-08-12,,,Discharge summary,Report,,,Admission Date: [**2118-8-10**] ...
9,183,56174,189681.0,2118-12-09,,,Discharge summary,Report,,,Admission Date: [**2118-12-7**] ...


## Map CHARTDATE to UTC and sort

In [5]:
def to_utc(chartdate):
    utc = datetime.strptime(chartdate+'-0-0-0','%Y-%m-%d-%H-%M-%S').timestamp()
    return utc
discharge_notes['UTC'] = discharge_notes['CHARTDATE'].apply(to_utc)
discharge_notes_utc = discharge_notes.sort_values(by=['UTC'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Only include the first (earliest) visit for each patient

In [6]:
patients = set()
first_visit = pd.DataFrame()
idx_list = []
for i in tqdm(range(len(discharge_notes_utc))):
    instance = discharge_notes.iloc[i, :]
    sub_id = instance['SUBJECT_ID']
    if sub_id in patients:
        pass
    else:
        patients.update({sub_id})
        idx_list.append(i)
first_visit = discharge_notes_utc.iloc[idx_list, :]

100%|██████████| 59652/59652 [00:13<00:00, 4295.64it/s]


In [7]:
first_visit

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,UTC
43658,36983,82574,118464.0,2100-06-09,,,Discharge summary,Report,,,Admission Date: [**2100-6-7**] D...,4.116197e+09
46436,52549,21081,159656.0,2100-06-17,,,Discharge summary,Report,,,Admission Date: [**2100-6-14**] Discharge...,4.116888e+09
3242,3271,12001,173927.0,2100-06-27,,,Discharge summary,Report,,,Admission Date: [**2100-6-14**] Dischar...,4.117752e+09
21574,31369,32096,158366.0,2100-06-30,,,Discharge summary,Report,,,Admission Date: [**2100-6-22**] ...,4.118011e+09
9675,10125,20957,113808.0,2100-07-03,,,Discharge summary,Report,,,Admission Date: [**2100-6-24**] Dischar...,4.118270e+09
4563,4873,4521,167070.0,2100-07-05,,,Discharge summary,Report,,,Admission Date: [**2100-6-28**] Dischar...,4.118443e+09
4564,4874,4521,167070.0,2100-07-06,,,Discharge summary,Report,,,Admission Date: [**2100-6-28**] Dischar...,4.118530e+09
4295,4281,41552,120254.0,2100-07-08,,,Discharge summary,Report,,,Admission Date: [**2100-7-5**] D...,4.118702e+09
44309,51455,6503,114310.0,2100-07-12,,,Discharge summary,Report,,,Admission Date: [**2100-7-11**] ...,4.119048e+09
55027,55618,4521,167070.0,2100-07-12,,,Discharge summary,Addendum,,,"Name: [**Known lastname 1223**], [**Known fir...",4.119048e+09


## Map ICD to each note by HADM_ID

In [8]:
diag_dict = pd.read_csv('DIAGNOSES_ICD.csv')[['HADM_ID', 'SEQ_NUM', 'ICD9_CODE']]
proc_dict = pd.read_csv('PROCEDURES_ICD.csv')[['HADM_ID', 'SEQ_NUM', 'ICD9_CODE']]

In [9]:
diag_dict

Unnamed: 0,HADM_ID,SEQ_NUM,ICD9_CODE
0,172335,1.0,40301
1,172335,2.0,486
2,172335,3.0,58281
3,172335,4.0,5855
4,172335,5.0,4254
5,172335,6.0,2762
6,172335,7.0,7100
7,172335,8.0,2767
8,172335,9.0,7243
9,172335,10.0,45829


In [26]:
def map_icd(note_df, icd_dict):
    import operator
    icd_dict_cp = icd_dict.copy()
    col = ['SUBJECT_ID', 'HADM_ID', 'TEXT', 'ICD9_CODE', 'ICD9_COUNTS']
    df = note_df[['SUBJECT_ID', 'HADM_ID', 'TEXT']]
    icd_lists = []
    icd_counts = []
    for i in tqdm(range(len(df))):
        instance = note_df.iloc[i, :]
        h = instance['HADM_ID']
        icd_list = icd_dict_cp[icd_dict_cp['HADM_ID'] == h]
        icd_sort = sorted(dict(zip(list(icd_list['ICD9_CODE']), list(icd_list['SEQ_NUM']))).items(), 
                      key=operator.itemgetter(1))
        icd_count = len(icd_sort)
        icd_lists.append(icd_sort)
        icd_counts.append(icd_count)
    df['ICD9_CODE'] = icd_lists
    df['ICD9_COUNTS'] = icd_counts
    return df

In [27]:
df_diag = map_icd(first_visit, diag_dict)

100%|██████████| 41127/41127 [01:23<00:00, 489.62it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [29]:
df_proc = map_icd(first_visit, proc_dict)

100%|██████████| 41127/41127 [00:53<00:00, 768.93it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [36]:
df_diag

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE,ICD9_COUNTS
43658,82574,118464.0,Admission Date: [**2100-6-7**] D...,"[(4589, 1.0), (28411, 2.0), (1985, 3.0), (1988...",17
46436,21081,159656.0,Admission Date: [**2100-6-14**] Discharge...,"[(51881, 1.0), (042, 2.0), (30470, 3.0), (5070...",9
3242,12001,173927.0,Admission Date: [**2100-6-14**] Dischar...,"[(431, 1.0), (5990, 2.0), (7070, 3.0), (3310, ...",8
21574,32096,158366.0,Admission Date: [**2100-6-22**] ...,"[(8080, 1.0), (86121, 2.0), (80704, 3.0), (599...",12
9675,20957,113808.0,Admission Date: [**2100-6-24**] Dischar...,"[(486, 1.0), (99662, 2.0), (7907, 3.0), (5559,...",4
4563,4521,167070.0,Admission Date: [**2100-6-28**] Dischar...,"[(44024, 1.0), (73007, 2.0), (25070, 3.0), (25...",9
4564,4521,167070.0,Admission Date: [**2100-6-28**] Dischar...,"[(44024, 1.0), (73007, 2.0), (25070, 3.0), (25...",9
4295,41552,120254.0,Admission Date: [**2100-7-5**] D...,"[(53100, 1.0), (5849, 2.0), (27652, 3.0), (585...",10
44309,6503,114310.0,Admission Date: [**2100-7-11**] ...,"[(78900, 1.0), (25041, 2.0), (40391, 3.0), (25...",8
55027,4521,167070.0,"Name: [**Known lastname 1223**], [**Known fir...","[(44024, 1.0), (73007, 2.0), (25070, 3.0), (25...",9


In [37]:
df_proc

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,ICD9_CODE,ICD9_COUNTS
43658,82574,118464.0,Admission Date: [**2100-6-7**] D...,"[(9229, 1)]",1
46436,21081,159656.0,Admission Date: [**2100-6-14**] Discharge...,[],0
3242,12001,173927.0,Admission Date: [**2100-6-14**] Dischar...,"[(4513, 1), (4311, 2), (966, 3)]",3
21574,32096,158366.0,Admission Date: [**2100-6-22**] ...,"[(7939, 1), (7906, 2), (390, 3), (7909, 4), (8...",7
9675,20957,113808.0,Admission Date: [**2100-6-24**] Dischar...,"[(3323, 1), (9656, 2), (9604, 3), (3893, 4), (...",5
4563,4521,167070.0,Admission Date: [**2100-6-28**] Dischar...,"[(8411, 1), (8415, 2), (3809, 3), (8848, 4), (...",5
4564,4521,167070.0,Admission Date: [**2100-6-28**] Dischar...,"[(8411, 1), (8415, 2), (3809, 3), (8848, 4), (...",5
4295,41552,120254.0,Admission Date: [**2100-7-5**] D...,"[(4443, 1)]",1
44309,6503,114310.0,Admission Date: [**2100-7-11**] ...,"[(3995, 1), (9904, 2)]",2
55027,4521,167070.0,"Name: [**Known lastname 1223**], [**Known fir...","[(8411, 1), (8415, 2), (3809, 3), (8848, 4), (...",5


In [31]:
df_diag.to_csv('diag_icd_all.csv')
df_proc.to_csv('proc_icd_all.csv')