In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Relative path to the Hospital Course dataset (CSV)
hc_csv = "../data/hpi-dataset/HOSPITAL_COURSES.csv"

In [3]:
def load_data(dataset):
    """Read a CSV into a DataFrame, printing a progress line before and after.

    Parameters
    ----------
    dataset
        Passed straight through to ``pd.read_csv`` (a path or a file-like
        object).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``dataset``.
    """
    print('----Started data loading process...')
    frame = pd.read_csv(dataset)
    print('----Data loaded.')
    return frame

In [4]:
# Read the Hospital Course CSV into the working DataFrame
df_hc = load_data(dataset=hc_csv)

----Started data loading process...
----Data loaded.


## Strategy 1: First & Last Sentence Approach

In [5]:
# NLTK setup: bring in the sentence tokenizer, then make sure the
# 'punkt' resource it relies on has been downloaded.
import nltk
import nltk.data
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Sanity-check sentence tokenization on the first Hospital Course entry:
# flatten newlines to spaces, trim, tokenize, and print one sentence per
# "-----"-separated chunk.
raw_course = df_hc.loc[0, "HOSPITAL_COURSE"]
data = raw_course.replace('\n', ' ').strip()
sample_sentences = sent_tokenize(data)
print('\n-----\n'.join(sample_sentences))

1.
-----
COPD/dyspnea/pneumonia:  The patient was initially placed on an aggressive steroid taper and admitted to the Medical Intensive Care Unit due to her difficulty with oxygenation despite CPAP machine.
-----
She was also given nebulizer treatments q.4h.
-----
as well as chest PT.
-----
The nebulizers were increased to q.1h.
-----
due to the fact that she continued to have labored breathing.
-----
Due to persistent respiratory failure and labored breathing, the patient was intubated on [**2118-6-7**] in order to improve oxygenation, ventilation, and ability to suction.
-----
A bronchoscopy was performed on [**2118-6-7**], which demonstrated marked narrowing of the airways with expiration consistent with tracheomalacia.
-----
On [**2118-6-9**], two silicone stents were placed, one in the left main stem (12 x 25 and one in the trachea 16 x 40) by Dr. [**First Name (STitle) **] [**Name (STitle) **] under rigid bronchoscopy with general anesthesia.
-----
On [**2118-6-11**], the patient

In [9]:
# Tokenize every hospital course into a list of sentences.
# Only the HOSPITAL_COURSE column is needed, so map over that Series directly
# instead of a row-wise DataFrame.apply(axis=1), which materialises a Series
# for every row just to read one field.
# Newlines are flattened to spaces and surrounding whitespace is stripped
# before tokenizing (same preprocessing as the single-row test cell).
# NOTE(review): assumes HOSPITAL_COURSE has no missing values; a NaN would
# fail on `.replace` — confirm against the dataset.
df_hc["TOKENIZED_HOSPITAL_COURSE"] = df_hc["HOSPITAL_COURSE"].map(
    lambda text: sent_tokenize(text.replace('\n', ' ').strip())
)

In [10]:
# Fix sentence-splitting artefacts with regexes.
#
# pat_1: "sentences" that are only a list marker and should be dropped —
#        a lone ".", digits followed by one character and an optional ")"
#        (e.g. "1.", "2.)"), or "#.".
#        NOTE(review): the "." after \d+ is unescaped, so it matches any
#        character, not just a literal dot — confirm this is intended.
# pat_2: a first "sentence" that is only an abbreviation the tokenizer split
#        on — optional digits + whitespace + "y.o.", or "Pt."/"pt.".
#        NOTE(review): the dots here are unescaped as well.
pat_1 = re.compile(r'^(\.)$|^(\d+.\)?)$|^(#\.)$')
pat_2 = re.compile(r'^([0-9]*\sy.o.)$|^([Pp]t.)$')


def _repair_sentences(sentences):
    """Return a cleaned copy of one tokenized hospital course.

    If the first sentence matches ``pat_2`` it is joined (with a space) to
    the second one; afterwards every sentence matching ``pat_1`` is removed.
    The input list is not modified, and an empty list yields an empty list.
    """
    merged = list(sentences)  # work on a copy; don't mutate the stored list
    if merged and pat_2.search(merged[0]):
        merged[0:2] = [' '.join(merged[0:2])]
    return [sentence for sentence in merged if not pat_1.search(sentence)]


# Series.map replaces the former `Series.iteritems()` loop (iteritems was
# removed in pandas 2.0) and the per-cell `.at` assignment of list values.
df_hc['TOKENIZED_HOSPITAL_COURSE'] = df_hc['TOKENIZED_HOSPITAL_COURSE'].map(_repair_sentences)

In [11]:
# Segment each tokenized hospital course into three text columns.
#
# `r` is applied with `re.match`, which anchors at the start of the string;
# the leading ".*" in each alternative lets the keyword appear anywhere in
# the sentence.
# NOTE(review): with IGNORECASE, ".*AMA" also matches "ama" inside ordinary
# words (e.g. "tramadol") — confirm whether a stricter pattern is wanted.
r = re.compile(".*discharge|.*death|.*deceased|.*died|.*follow-up|.*followup|.*AMA", re.IGNORECASE)
df_firstlast = df_hc.copy(deep=True)
tokenized_courses = df_firstlast['TOKENIZED_HOSPITAL_COURSE']

# HC_HPI: the first sentence. A course can have no sentences left after the
# marker-filtering step, so fall back to "" instead of raising IndexError
# (the other two columns already yield "" for an empty list).
df_firstlast['HC_HPI'] = tokenized_courses.map(lambda x: x[0] if x else "")

# HC_DAY_TO_DAY: everything after the first sentence, excluding the last two.
# NOTE(review): the slice drops the final *two* sentences — confirm that
# `-2` (rather than `-1`) is intended for a "first & last sentence" split.
df_firstlast['HC_DAY_TO_DAY'] = tokenized_courses.map(lambda x: " ".join(x[1:-2]))

# HC_DISCHARGE_PLAN: every sentence (from anywhere in the course) that
# matches one of the discharge/death/follow-up keywords in `r`.
df_firstlast['HC_DISCHARGE_PLAN'] = tokenized_courses.map(lambda x: " ".join(filter(r.match, x)))

In [12]:
# Persist the first/last-sentence segmentation as CSV (row index omitted)
firstlast_csv = '../data/hpi-dataset/HOSPITAL_COURSES_FIRSTLAST.csv'
df_firstlast.to_csv(firstlast_csv, index=False)