### Objective: Segment the HPI, Day-to-day Hospital Course, and Discharge Plan within the Discharge Summary.

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
#Upload MIMIC-III NOTEEVENTS.CSV Dataset
notes_csv = '../data/mimic-III-dataset/NOTEEVENTS.csv'

In [3]:
def load_data(dataset):
        print('----Started data loading process...')
        df = pd.read_csv(dataset)
        print('----Data loaded.')
        return df

In [4]:
df_notes = load_data(notes_csv)

----Started data loading process...


  df = pd.read_csv(dataset)


----Data loaded.


In [5]:
df_notes_headers = list(df_notes.columns.values)
print(df_notes_headers)

['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CHARTTIME', 'STORETIME', 'CATEGORY', 'DESCRIPTION', 'CGID', 'ISERROR', 'TEXT']


In [6]:
CATEGORY_VALUES = np.unique(df_notes['CATEGORY'].to_numpy())
print(CATEGORY_VALUES)

['Case Management ' 'Consult' 'Discharge summary' 'ECG' 'Echo' 'General'
 'Nursing' 'Nursing/other' 'Nutrition' 'Pharmacy' 'Physician ' 'Radiology'
 'Rehab Services' 'Respiratory ' 'Social Work']


In [7]:
#Filter Notes to only be the Discharge Summaries (not addendums) for segmentation
df_ds = df_notes.loc[(df_notes['CATEGORY'] == 'Discharge summary') & (df_notes['DESCRIPTION'] == 'Report')]

In [8]:
# Extract Hospital Course based on Regex

partition1 = df_ds['TEXT'].str.partition('Brief Hospital Course:')[2].replace(r'^\s*$', np.nan, regex=True)
partition2 = df_ds['TEXT'].str.partition('BRIEF SUMMARY OF HOSPITAL COURSE:')[2].replace(r'^\s*$', np.nan, regex=True)
partition3 = df_ds['TEXT'].str.partition('HOSPITAL COURSE:')[2].replace(r'^\s*$', np.nan, regex=True)
partition4 = df_ds['TEXT'].str.partition('ASSESSMENT:')[2]
df_ds['HOSPITAL_COURSE'] = partition1
df_ds['HOSPITAL_COURSE'] = df_ds["HOSPITAL_COURSE"].fillna(partition2)
df_ds['HOSPITAL_COURSE'] = df_ds["HOSPITAL_COURSE"].fillna(partition3)
df_ds['HOSPITAL_COURSE'] = df_ds["HOSPITAL_COURSE"].fillna(partition4)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ds['HOSPITAL_COURSE'] = partition1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ds['HOSPITAL_COURSE'] = df_ds["HOSPITAL_COURSE"].fillna(partition2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ds['HOSPITAL_COURSE'] = df_ds["HOSPITAL_COURSE"].fillna(partition3)
A value is trying to be s

In [9]:
# Filter out Hospital Courses that are Null
df_hc = df_ds.loc[df_ds['HOSPITAL_COURSE'] != ""]

In [10]:
# Create new column for Extraction
df_hc['REMOVE'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hc['REMOVE'] = True


In [11]:
# Remove the last part of the hospital course after medications

removal = df_hc['HOSPITAL_COURSE'].str.partition('Medications on Admission:')
df_hc['REMOVE'] = pd.isna(removal[1].replace(r'^\s*$', np.nan, regex=True))
df_hc['HOSPITAL_COURSE'] = removal[0]

removal = df_hc['HOSPITAL_COURSE'].str.partition('DISCHARGE MEDICATIONS:')
df_hc.loc[df_hc['REMOVE'] == True, 'REMOVE'] = pd.isna(removal[1].replace(r'^\s*$', np.nan, regex=True))
df_hc['HOSPITAL_COURSE'] = removal[0]

removal = df_hc['HOSPITAL_COURSE'].str.partition('MEDICATIONS ON DISCHARGE:')
df_hc.loc[df_hc['REMOVE'] == True, 'REMOVE'] = pd.isna(removal[1].replace(r'^\s*$', np.nan, regex=True))
df_hc['HOSPITAL_COURSE'] = removal[0]

removal = df_hc['HOSPITAL_COURSE'].str.partition('Discharge Medications:')
df_hc.loc[df_hc['REMOVE'] == True, 'REMOVE'] = pd.isna(removal[1].replace(r'^\s*$', np.nan, regex=True))
df_hc['HOSPITAL_COURSE'] = removal[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hc['REMOVE'] = pd.isna(removal[1].replace(r'^\s*$', np.nan, regex=True))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hc['HOSPITAL_COURSE'] = removal[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hc['HOSPITAL_COURSE'] = removal[0]
A value is trying to be set on a copy of a slice from

In [12]:
df_hc_filter = df_hc.loc[df_hc['REMOVE'] == False]
df_hc_filter = df_hc_filter.drop(columns=['REMOVE'])

In [13]:
df_hc_filter.reset_index(drop=True, inplace=True)

In [14]:
df_hc_filter

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,HOSPITAL_COURSE
0,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...,\n1. COPD/dyspnea/pneumonia: The patient was ...
1,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...,\n82 y/o female admitted [**2119-5-4**] for co...
2,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...,"\n87 yo F with h/o CHF, COPD on 5 L oxygen at ..."
3,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...,\nMr. [**Known lastname 1829**] was seen at [*...
4,179,53181,170490.0,2172-03-08,,,Discharge summary,Report,,,Admission Date: [**2172-3-5**] D...,\nPatient presented electively for meningioma ...
...,...,...,...,...,...,...,...,...,...,...,...,...
45162,53613,43691,147266.0,2147-03-01,,,Discharge summary,Report,,,Admission Date: [**2147-2-25**] ...,\nMs. [**Known lastname **] was admitted to th...
45163,53614,80847,129802.0,2190-06-05,,,Discharge summary,Report,,,Admission Date: [**2190-5-13**] ...,\nThis patient presented to the Emergency Depa...
45164,53615,41074,182558.0,2121-06-14,,,Discharge summary,Report,,,Admission Date: [**2121-6-13**] ...,\n74 y/o male who presents with massive ICH.\n...
45165,53616,76397,184741.0,2182-04-22,,,Discharge summary,Report,,,Admission Date: [**2182-4-19**] ...,\nGiven splenic laceration Mr. [**Known lastna...


In [17]:
from pathlib import Path
Path("../data/hpi-dataset").mkdir(parents=True, exist_ok=True)

In [18]:
df_hc_filter.to_csv('../data/hpi-dataset/HOSPITAL_COURSES.csv', index=False)