In [3]:
# General Python Libraries
import os
import time
import datetime
import string

# Pandas and Numpy
import pandas as pd
import numpy as np
from keras.preprocessing.sequence import pad_sequences

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Gensim
import gensim
from gensim.models import KeyedVectors

# SKLearn
import sklearn
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Download resources
nltk.download("stopwords")
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\winsl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\winsl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 1) MIMIC-III Data Processing
The MIMIC-III data is publically available through <a href="url" target="https://physionet.org/content/mimiciii/1.4/">PhysioNet</a>. These are the relevant tables for this paper.
* DIAGNOSIS_ICD table
    * ROW_ID: (INT) Unique identifier for table
    * SUBJECT_ID: (INT) Unique identifier for patient
    * HADM_ID: (INT) Unique identifier for admission
    * ICD9_CODE: (VARCHAR(10)) Final diagnosis associated to patient admission
* ADMISSIONS table
    * ROW_ID: (INT) Unique identifier for table
    * SUBJECT_ID: (INT) Unique identifier for patient
    * HADM_ID: (INT) Unique identifier for admission
    * ADMITTIME: (TIMESTAMP(0)) Admit time for admission
    * DISCHTIME: (TIMESTAMP(0)) Discharge time for admission
* NOTEEVENTS table
    * ROW_ID: (INT) Unique identifier for table
    * SUBJECT_ID: (INT) Unique identifier for patient
    * HADM_ID: (INT) Unique identifier for admission
    * CATEGORY: (VARCHAR(50)) Type of note recorded ('Discharge summary' for discharge summary notes)
    * DESCRIPTION: (VARCHAR(300)) 'Report' indicates a full note, 'Addendum' indicates additional text to be added to the previous report
    * ISERROR: (CHAR(1)) '1' if physician indicates that the note is an error
    * TEXT: (TEXT) Note text

In addition, the paper references particular ICD-9 codes as related to heart failure:
*398.91, 402.01, 402.11, 402.91, 404.01, 404.03, 404.11, 404.13, 404.91, 404.93, 428.0, 428.1, 428.20, 428.21, 428.22, 428.23, 428.30, 428.31, 428.32, 428.33, 428.40, 428.41, 428.42, 428.43, 428.9*

## 1.1) Loading MIMIC-III Data and ICD-9 HF Set

In [2]:
# File paths to raw data CSV (with gz compression)
CURR_DIRNAME = os.getcwd()
DATA_PATH_DIAGNOSES = os.path.join(CURR_DIRNAME, r'mimic-iii-clinical-database-1.4', r'DIAGNOSES_ICD.csv.gz')
DATA_PATH_ADMISSIONS = os.path.join(CURR_DIRNAME, r'mimic-iii-clinical-database-1.4', r'ADMISSIONS.csv.gz')
DATA_PATH_NOTES = os.path.join(CURR_DIRNAME, r'mimic-iii-clinical-database-1.4', r'NOTEEVENTS.csv.gz')

# Record start time
_START_RUNTIME = time.time()

# Additional manual data
HEART_FAILURE_ICD9 = {'39891', 
					'40201', '40211', '40291', 
					'40401', '40403', '40411', '40413', '40491', '40493', 
					 '4280',  '4281', '42820', '42821', '42822', '42823', '42830', '42831', '42832', '42833', '42840', '42841', '42842', '42843', '4289'}

# Load CSV files
# Load diagnoses data
DATA_DIAGNOSES_RAW = pd.read_csv(DATA_PATH_DIAGNOSES, 
                                compression='gzip',
                                on_bad_lines='skip')
print("Total diagnoses for admissions: ", len(DATA_DIAGNOSES_RAW))
# Load admissions data
DATA_ADMISSIONS_RAW = pd.read_csv(DATA_PATH_ADMISSIONS, 
                                compression='gzip',
                                on_bad_lines='skip')
print("Total admissions: ", len(DATA_ADMISSIONS_RAW))
# Load notes data
DATA_NOTES_RAW = pd.read_csv(DATA_PATH_NOTES, 
                            compression='gzip',
                            on_bad_lines='skip')
print("Total notes for admissions: ", len(DATA_NOTES_RAW))

Total diagnoses for admissions:  651047
Total admissions:  58976
Total notes for admissions:  2083180


  exec(code_obj, self.user_global_ns, self.user_ns)


## 1.2) Retrieving All Heart Failure Admissions

In [3]:
hf_admissions_filter = DATA_DIAGNOSES_RAW['ICD9_CODE'].map(lambda x: x in HEART_FAILURE_ICD9)
HF_ADMISSIONS = DATA_DIAGNOSES_RAW[hf_admissions_filter][['SUBJECT_ID', 'HADM_ID']].drop_duplicates()

print("All Admissions #: ", len(HF_ADMISSIONS))
print(HF_ADMISSIONS.head)

All Admissions #:  14040
<bound method NDFrame.head of         SUBJECT_ID  HADM_ID
51             115   114585
67             117   140784
150            124   138376
211            130   198214
321             68   108329
...            ...      ...
650831       97132   144063
650866       97144   109999
650973       97172   133092
650995       97488   152542
651016       97488   161999

[14040 rows x 2 columns]>


## 1.3) Determining Readmissions

### 1.3.1) Finding Admission Times
The **ADMISSIONS** table contains the times of admit and discharge for each admission.

In [4]:
# Find admission times (admit and discharge)
admissions_wTimes = DATA_ADMISSIONS_RAW[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME']].copy()
admissions_wTimes.loc[:, 'DISCHTIME'] = admissions_wTimes['DISCHTIME'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
admissions_wTimes.loc[:, 'ADMITTIME'] = admissions_wTimes['ADMITTIME'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
hf_admissions_wTimes = HF_ADMISSIONS.merge(admissions_wTimes, how='left', on=['SUBJECT_ID', 'HADM_ID']).sort_values(by=['SUBJECT_ID', 'ADMITTIME'])
hf_admissions_groupedBySubject = hf_admissions_wTimes.groupby('SUBJECT_ID')

print("HF Admissions #: ", len(hf_admissions_wTimes))
#print(hf_admissions_wTimes.head)
print("HF Subjects #: ", len(hf_admissions_groupedBySubject))
#print(hf_admissions_groupedBySubject.head)

HF Admissions #:  14040
HF Subjects #:  10436


### 1.3.2) Finding Time to Next Admission

Time to next admission is determined by discharge time from current admission to the admit time of the subsequent admission. 

In [5]:
# Find readmissions intervals
hf_admissions_wTimes['NEXTADMITTIME'] = hf_admissions_groupedBySubject['ADMITTIME'].shift(-1, axis=0)
get_days = lambda x: x.components.days if hasattr(x, "components") else float("NaN")
hf_admissions_wTimes['READMISSIONINTERVAL'] =  (hf_admissions_wTimes['NEXTADMITTIME'] - hf_admissions_wTimes['DISCHTIME']).map(get_days)

print("HF Admissions # w/ Readmit: ", len(hf_admissions_wTimes))
#print(hf_admissions_wTimes.head)
#print(hf_admissions_wTimes.dtypes)


HF Admissions # w/ Readmit:  14040


#### 1.3.2.A) General Readmissions
General readmissions are defined as admissions that have a subsequent admission. The length of time to the next admission is irrelevant.   

In [6]:
# Find general readmissions
HF_GEN_READMISSIONS = hf_admissions_wTimes.copy().dropna()

print("HF General Readmissions #: ", len(HF_GEN_READMISSIONS))
#print(HF_GEN_READMISSIONS.head)

HF General Readmissions #:  3604


#### 1.3.2.B) 30 Day Readmissions
30-day readmissions are defined as admissions that have a subsequent admission with an admit time that is within 30 days of the current admissions discharge time.   

In [7]:
# Find 30-day readmissions
HF_30DAY_READMISSIONS = HF_GEN_READMISSIONS.copy()[HF_GEN_READMISSIONS['READMISSIONINTERVAL'] <= 30]

print("HF 30-Day Readmissions #: ", len(HF_30DAY_READMISSIONS))
#print(HF_30DAY_READMISSIONS.head)

HF 30-Day Readmissions #:  969


# 1.4) Determining Discharge Summary Notes

## 1.4.1) Loading Discharge Summary Notes
The notes need to meet the following criteria for inclusion:
* The note must not be marked as an error by a provider `(ISERROR != '1')`
* The category is discharge summary `(CATEGORY == 'Discharge summary')`
* Only full notes are included `(DESCRIPTION == 'Report')`

In [8]:
# Find valid discharge summary notes
# - Remove notes marked as error by physician
# - Only keep notes marked as Discharge Summary
dischargeSummary_notes_view = DATA_NOTES_RAW[['SUBJECT_ID', 'HADM_ID', 'CATEGORY', 'DESCRIPTION', 'ISERROR', 'TEXT']]

dischargeSummary_notes_view = dischargeSummary_notes_view[(dischargeSummary_notes_view["ISERROR"] != 1) & 
                                                          (dischargeSummary_notes_view["CATEGORY"] == 'Discharge summary') &
                                                          (dischargeSummary_notes_view["DESCRIPTION"] == 'Report')]
dischargeSummary_notes = dischargeSummary_notes_view[['SUBJECT_ID', 'HADM_ID', 'TEXT']].copy()

print("Total discharge summary notes w/o errors: ", len(dischargeSummary_notes))
#print(dischargeSummary_notes.head)
#print(dischargeSummary_notes['TEXT'][0])

Total discharge summary notes w/o errors:  55177


## 1.4.2) Note Text Processing
Note text is processed in the following manner:
1. Tokenization through NLTK
2. Removal of stopwords in NLTK corpora
3. Removal of punctuation from string defintion list
4. Removal of tokens that are numeric
5. Removal of tokens that contain a number

In [9]:
# Create set of stopwords and punctuation to remove from token list
stopwords_cache = set(stopwords.words("English"))
punctuation_cache = set(string.punctuation)
remove_cache = stopwords_cache.union(punctuation_cache)

# Tokenize note text and remove stopwords and numbers from note text
dischargeSummary_notes_tokenized = dischargeSummary_notes.copy()
dischargeSummary_notes_tokenized.loc[:, ['TEXT']] = dischargeSummary_notes_tokenized['TEXT'].map(
    lambda x: [word.lower() for word in word_tokenize(x) if word.lower() not in remove_cache and not word.isdigit() and not any([token.isdigit() for token in word])])

print(dischargeSummary_notes_tokenized.head)
print(dischargeSummary_notes_tokenized['TEXT'][0])


<bound method NDFrame.head of        SUBJECT_ID   HADM_ID                                               TEXT
0           22532  167853.0  [admission, date, discharge, date, service, ad...
1           13702  107527.0  [admission, date, discharge, date, date, birth...
2           13702  167118.0  [admission, date, discharge, date, service, ca...
3           13702  196489.0  [admission, date, discharge, date, service, me...
4           26880  135453.0  [admission, date, discharge, date, date, birth...
...           ...       ...                                                ...
55970       43691  147266.0  [admission, date, discharge, date, date, birth...
55971       80847  129802.0  [admission, date, discharge, date, date, birth...
55972       41074  182558.0  [admission, date, discharge, date, date, birth...
55973       76397  184741.0  [admission, date, discharge, date, date, birth...
55974       87196  121964.0  [admission, date, discharge, date, date, birth...

[55177 rows x 3 colum

## 1.4.3) Filtering for Longest Note per Admission
The longest note in the admission is determined by the note text that contains the most tokens after processing.

In [10]:
# Only keep largest note per admission
dischargeSummary_notes_tokenized.loc[:, ['NOTELENGTH']] = dischargeSummary_notes_tokenized['TEXT'].map(lambda x: len(x))
dischargeSummary_admissions = dischargeSummary_notes_tokenized.sort_values('NOTELENGTH', ascending=False).drop_duplicates(['SUBJECT_ID', 'HADM_ID'])

print("Total discharge summary notes w/ only longest: ", len(dischargeSummary_admissions))
print(dischargeSummary_admissions.head)

Total discharge summary notes w/ only longest:  52691
<bound method NDFrame.head of        SUBJECT_ID   HADM_ID  \
17599       43126  124079.0   
31434       42842  162017.0   
51073       93321  115396.0   
1519        66807  166588.0   
54248       51821  197028.0   
...           ...       ...   
11081        3564  117638.0   
12416        7995  190945.0   
27470        6495  139808.0   
36365         158  169433.0   
20658       24855  156368.0   

                                                    TEXT  NOTELENGTH  
17599  [admission, date, discharge, date, date, birth...        4771  
31434  [admission, date, discharge, date, date, birth...        4648  
51073  [admission, date, discharge, date, date, birth...        4552  
1519   [admission, date, discharge, date, date, birth...        4485  
54248  [admission, date, discharge, date, date, birth...        4449  
...                                                  ...         ...  
11081  [admission, date, discharge, date, serv

# 1.5) Finding Heart Failure Admission with Discharge Summary Note

In [11]:
# Find admissions w/ discharge summary notes
hf_admissions_wNotes = HF_ADMISSIONS.merge(dischargeSummary_admissions, how='inner', on=['SUBJECT_ID', 'HADM_ID'])

print("HF Admissions w/ Notes #: ", len(hf_admissions_wNotes))
#print(hf_admissions_wNotes.head)

HF Admissions w/ Notes #:  13746


# 1.6) Replacing Word Tokens with Word Embedding Vector
Word embeddings are taken from the publically available Word2Vec model, <a href="url" target="https://bio.nlplab.org/">bio.nlplab.org</a>, trained on PubMed abstracts and PubMed Central full text articles. 

If the word is not found in the model, the word vector is randomly initialized.

Each word embedding is of shape (200,).

In [12]:
# Load pre-trained PubMed KeyedVectors through gensim
pubmed_wv = KeyedVectors.load_word2vec_format(os.path.join(CURR_DIRNAME, r"PubMed-and-PMC-w2v.bin"), binary=True) 


In [14]:
# Create replace words with word vectors from PubMed Word2Vec
#   If word not present, initialize with random embeddings
hf_wv_admissions = hf_admissions_wNotes.copy()
hf_wv_admissions.loc[:, ['TEXT_EMBEDDING']] = hf_wv_admissions['TEXT'].map(lambda x: [pubmed_wv[word] if word in pubmed_wv else np.random.default_rng(12345).uniform(-1,1,(200,)) for word in x])

print("HF Admissions w/ Notes WV #: ", len(hf_wv_admissions))
print(np.array(hf_wv_admissions['TEXT_EMBEDDING'][0]).shape)


HF Admissions w/ Notes WV #:  13746
(1048, 200)


## 1.7) Positive Readmission Samples

In [15]:
# Find general readmissions w/ discharge summary notes
HF_GEN_READMISSIONS_WNOTES = HF_GEN_READMISSIONS.merge(hf_wv_admissions, how='inner', on=['SUBJECT_ID', 'HADM_ID'])

print("HF General Readmissions w/ Notes #: ", len(HF_GEN_READMISSIONS_WNOTES))
#print(HF_GEN_READMISSIONS_WNOTES.head)

# Find 30-day readmissions w/ discharge summary notes
HF_30DAY_READMISSIONS_WNOTES = HF_30DAY_READMISSIONS.merge(hf_wv_admissions, how='inner', on=['SUBJECT_ID', 'HADM_ID'])

print("HF 30-day Readmissions w/ Notes #: ", len(HF_30DAY_READMISSIONS_WNOTES))
#print(HF_30DAY_READMISSIONS_WNOTES.head)

HF General Readmissions w/ Notes #:  3543
HF 30-day Readmissions w/ Notes #:  962


## 1.8) Negative Sampling
Under-sampling is used to address the imbalance of positive to negative samples in the dataset. Negative samples are chosen in numbers matching the positive samples (general and 30-day readmissions) by selecting admissions without readmission and with discharge summary notes within the heart failure admission population. 

The goal of the model is to predict readmission within the heart failure population.

In [18]:
# Find heart failure admissions with discharge summary notes with no general readmissions
positive_hf_readmissions = hf_wv_admissions.merge(HF_GEN_READMISSIONS_WNOTES, how='left', on=['SUBJECT_ID', 'HADM_ID'], indicator=True)
negative_hf_readmissions = positive_hf_readmissions[positive_hf_readmissions['_merge'] == 'left_only'].copy()
negative_hf_readmissions.rename(columns={"TEXT_x": "TEXT", "NOTELENGTH_x": "NOTELENGTH", "TEXT_EMBEDDING_x": "TEXT_EMBEDDING"}, inplace=True)

print("HF No General Readmissions w/ Notes #: ", len(negative_hf_readmissions))

# Find heart failure admissions with discharge summary notes with no 30-day readmissions
positive_hf_30day_readmissions = hf_wv_admissions.merge(HF_30DAY_READMISSIONS_WNOTES, how='left', on=['SUBJECT_ID', 'HADM_ID'], indicator=True)
negative_hf_30day_readmissions = positive_hf_30day_readmissions[positive_hf_30day_readmissions['_merge'] == 'left_only'].copy()
negative_hf_30day_readmissions.rename(columns={"TEXT_x": "TEXT", "NOTELENGTH_x": "NOTELENGTH", "TEXT_EMBEDDING_x": "TEXT_EMBEDDING"}, inplace=True)

print("HF No 30-Day Readmissions w/ Notes #: ", len(negative_hf_30day_readmissions))

# Randomly sample from negative pool to match positive sample count
readmission_gen_count = len(HF_GEN_READMISSIONS_WNOTES)
no_readmission_gen_count = len(negative_hf_readmissions)
negative_sample_list_gen = np.random.default_rng().choice(no_readmission_gen_count, (readmission_gen_count,), replace=False)
HF_NO_GEN_READMISSIONS_WNOTES = negative_hf_readmissions.iloc[negative_sample_list_gen, :]
print("Sampled HF No General Readmissions w/ Notes #: ", len(HF_NO_GEN_READMISSIONS_WNOTES))

readmission_30day_count = len(HF_30DAY_READMISSIONS_WNOTES)
no_readmission_30day_count = len(negative_hf_30day_readmissions)
negative_sample_list_30day = np.random.default_rng().choice(no_readmission_30day_count, (readmission_30day_count,), replace=False)
HF_NO_30DAY_READMISSIONS_WNOTES = negative_hf_30day_readmissions.iloc[negative_sample_list_30day, :]
print("Sampled HF No 30-Day Readmissions w/ Notes #: ", len(HF_NO_30DAY_READMISSIONS_WNOTES))


HF No General Readmissions w/ Notes #:  10203
HF No 30-Day Readmissions w/ Notes #:  12784
Sampled HF No General Readmissions w/ Notes #:  3543
Sampled HF No 30-Day Readmissions w/ Notes #:  962


## 1.9) Save Processed Data to File

In [2]:
# Save final processed dataframes to file (pickle)
HF_GEN_READMISSIONS_WNOTES[['TEXT_EMBEDDING', 'NOTELENGTH', 'TEXT']].to_pickle("./positive_gen_readmissions.pkl.gz")
HF_30DAY_READMISSIONS_WNOTES[['TEXT_EMBEDDING', 'NOTELENGTH', 'TEXT']].to_pickle("./positive_30day_readmissions.pkl.gz")
HF_NO_GEN_READMISSIONS_WNOTES[['TEXT_EMBEDDING', 'NOTELENGTH', 'TEXT']].to_pickle("./negative_gen_readmissions.pkl.gz")
HF_NO_30DAY_READMISSIONS_WNOTES[['TEXT_EMBEDDING', 'NOTELENGTH', 'TEXT']].to_pickle("./negative_30day_readmissions.pkl.gz")

NameError: name 'HF_GEN_READMISSIONS_WNOTES' is not defined

# 2) Dataset Creation (TODO: Uncomment General Admission)

In [2]:
# Load processed datasets
#HF_GEN_READMISSIONS_WNOTES = pd.read_pickle("./positive_gen_readmissions.pkl.gz", compression="gzip")
#HF_NO_GEN_READMISSIONS_WNOTES = pd.read_pickle("./negative_gen_readmissions.pkl.gz", compression="gzip")

HF_30DAY_READMISSIONS_WNOTES = pd.read_pickle("./positive_30day_readmissions.pkl.gz", compression="gzip")
HF_NO_30DAY_READMISSIONS_WNOTES = pd.read_pickle("./negative_30day_readmissions.pkl.gz", compression="gzip")

## 2.1) Convert Dataframes to Tensors

### 2.1.1) Combine positive and negative samples into one dataframe per type

In [3]:
#readmission_gen_df = pd.concat([HF_GEN_READMISSIONS_WNOTES, HF_NO_GEN_READMISSIONS_WNOTES])
readmission_30day_df = pd.concat([HF_30DAY_READMISSIONS_WNOTES, HF_NO_30DAY_READMISSIONS_WNOTES])

#print("General Readmission: ", len(HF_GEN_READMISSIONS_WNOTES), " + ", len(HF_NO_GEN_READMISSIONS_WNOTES), " = ", len(readmission_gen_df))
print("30-Day Readmission: ", len(HF_30DAY_READMISSIONS_WNOTES), " + ", len(HF_NO_30DAY_READMISSIONS_WNOTES), " = ", len(readmission_30day_df))

30-Day Readmission:  962  +  962  =  1924


### 2.1.2) Find max note lengths

In [4]:
#max_words_gen_readmit = int(readmission_gen_df['NOTELENGTH'].max())
max_words_30day_readmit = int(readmission_30day_df['NOTELENGTH'].max())

#print("Max Note Length - General Readmissions: ", max_words_gen_readmit)
print("Max Note Length - 30-Day Readmissions: ", max_words_30day_readmit)

Max Note Length - 30-Day Readmissions:  3839


### 2.1.3) Pad note vectors to max length

In [5]:
# Text Embedding Vectors
#padded_note_wv_gen_readmit = pad_sequences(readmission_gen_df['TEXT_EMBEDDING'], maxlen=max_words_gen_readmit, padding="post", value=0., dtype=np.float32)
padded_note_wv_30day_readmit = pad_sequences(readmission_30day_df['TEXT_EMBEDDING'], maxlen=max_words_30day_readmit, padding="post", value=0., dtype=np.float32)

#print(padded_note_wv_gen_readmit.shape)
print(padded_note_wv_30day_readmit.shape)

(1924, 3839, 200)


### 2.1.4) Convert dataframes to tensors

In [6]:
# Text Embedding Vectors
#readmission_gen_tensor = torch.tensor(padded_note_wv_gen_readmit, dtype=torch.float)
readmission_30day_tensor = torch.tensor(padded_note_wv_30day_readmit, dtype=torch.float)

#print(readmission_gen_tensor.shape)
print(readmission_30day_tensor.shape)

torch.Size([1924, 3839, 200])


In [7]:
# Text Token Vectors
#readmission_gen_tensor_tokens = np.array(readmission_gen_df['TEXT'])
readmission_30day_tensor_tokens = np.array(readmission_30day_df['TEXT'])

#print(readmission_gen_tensor_tokens.shape)
print(readmission_30day_tensor_tokens.shape)

(1924,)


### 2.1.5) Create labels for positive and negative

In [8]:
# readmission_gen_labels = torch.cat(
#     (
#         torch.ones((len(HF_GEN_READMISSIONS_WNOTES),)), 
#         torch.zeros((len(HF_NO_GEN_READMISSIONS_WNOTES),))
#     )
# )
readmission_30day_labels = torch.cat(
    (
        torch.ones((len(HF_30DAY_READMISSIONS_WNOTES),)), 
        torch.zeros((len(HF_NO_30DAY_READMISSIONS_WNOTES),))
    )
)

#print(readmission_gen_labels.shape)
print(readmission_30day_labels.shape)

torch.Size([1924])


## 2.2) Custom Dataset

In [9]:
# Create custom dataset class
class CustomDataset(Dataset):
    
    def __init__(self, wv, labels, tokens):
        """
        Store `seqs`. to `self.x` and `hfs` to `self.y`.
        """
        self.x = wv
        self.y = labels
        self.tokens = tokens
    
    def __len__(self):
        """
        Return the number of samples (i.e. admissions).
        """
        return len(self.y)
    
    def __getitem__(self, index):
        """
        Generates one sample of data.
        """
        return self.x[index], self.y[index] 

    def getToken(self, index):
        """
        Generate one sample token
        """
        return self.tokens[index]

# Create datasets for the general and 30-day readmissions
#readmission_gen_dataset = CustomDataset(readmission_gen_tensor, readmission_gen_labels, readmission_gen_tensor_tokens)
readmission_30day_dataset = CustomDataset(readmission_30day_tensor, readmission_30day_labels, readmission_30day_tensor_tokens)

## 2.3) Create Test and Training Datasets

In [10]:
# Split is 90/10 for training/testing
#split_gen_readmission = int(len(readmission_gen_dataset)*0.9)
split_30day_readmission = int(len(readmission_30day_dataset)*0.9)

# Create general readmission training and testing sets
# lengths_gen_readmission = [split_gen_readmission, len(readmission_gen_dataset) - split_gen_readmission]
# train_dataset_gen, val_dataset_gen = random_split(readmission_gen_dataset, lengths_gen_readmission)

# print("Length of general train dataset:", len(train_dataset_gen))
# print("Length of general val dataset:", len(val_dataset_gen))

# Create 30-day readmission training and testing sets
lengths_30day_readmission = [split_30day_readmission, len(readmission_30day_dataset) - split_30day_readmission]
train_dataset_30day, val_dataset_30day = random_split(readmission_30day_dataset, lengths_30day_readmission)

print("Length of 30-day train dataset:", len(train_dataset_30day))
print("Length of 30-day val dataset:", len(val_dataset_30day))

Length of 30-day train dataset: 1731
Length of 30-day val dataset: 193


## 2.4) Dataloaders

In [12]:
# Define load data function for dataloader creation
def load_data(train_dataset, val_dataset, collate_fn = None, batch_size = 32):
    '''
    Returns the data loader for train and validation dataset. 
    Set batchsize default to 32. 
    Set `shuffle=True` only for train dataloader.
    
    Arguments:
        train dataset: train dataset of type `CustomDataset`
        val dataset: validation dataset of type `CustomDataset`
        collate_fn: collate function
        batch_size: size of batches, default to 32
        
    Outputs:
        train_loader, val_loader: train and validation dataloaders
    '''
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True,
                              collate_fn=collate_fn)
    
    val_loader = DataLoader(val_dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            collate_fn=collate_fn)
    
    return train_loader, val_loader


In [13]:
# Create data loaders for datasets
#train_loader_gen, val_loader_gen = load_data(train_dataset_gen, val_dataset_gen)
train_loader_30day, val_loader_30day = load_data(train_dataset_30day, val_dataset_30day)

# 3) CNN Model

## 3.1) Mask Selection

## 3.2) Build CNN

In [28]:
class NoteCNN(nn.Module):
    def __init__(self, filter_count=100):
        super(NoteCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, filter_count, kernel_size = (1, 200), stride = 1)
        self.conv2 = nn.Conv2d(1, filter_count, kernel_size = (2, 200), stride = 1, padding=(1, 0))
        self.conv3 = nn.Conv2d(1, filter_count, kernel_size = (3, 200), stride = 1, padding=(2, 0))
        self.fc = nn.Linear(filter_count * 3, 2)

    def forward(self, x):
        # Create axis for one channel input
        x = torch.unsqueeze(x, 1)
        # Use convolution layers to create filters for each filter size
        x_1 = self.conv1(x)
        x_2 = self.conv2(x)
        x_3 = self.conv3(x)
        # Find maximum value across filters (max pool)
        x_max_1, _ = torch.max(x_1, dim=2)
        x_max_2, _ = torch.max(x_2, dim=2)
        x_max_3, _ = torch.max(x_3, dim=2)
        # Remove extra dimension
        x_max_1 = torch.squeeze(x_max_1, dim=2)
        x_max_2 = torch.squeeze(x_max_2, dim=2)
        x_max_3 = torch.squeeze(x_max_3, dim=2)
        # Combine filters together
        x_layers = torch.cat([x_max_1, x_max_2, x_max_3], dim=1)
        # Fully connected layer to find most prediction
        pred = F.softmax(self.fc(F.relu(x_layers)), dim=1)
        return pred

# Initialize model
model = NoteCNN()#.cuda()

In [29]:
model_size = sum([param.nelement() * param.element_size() for param in model.parameters()]) / 1e9
print("NoteCNN size in GB:", model_size)
model_parameters_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("NoteCNN parameters: ", model_parameters_count)

NoteCNN size in GB: 0.000483608
NoteCNN parameters:  120902


## 3.3) Model Training

### 3.3.1) Loss and Optimizer

In [30]:
LEARNING_RATE = 0.001

CRITERION = nn.CrossEntropyLoss()
OPTIMIZER = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

### 3.3.2) Evaluate

In [31]:
def eval_model(model, dataloader):
    """
    Evaluate the model using data in data loader
    :return:
        Y_pred: prediction of model on the dataloder.
            Should be an 2D numpy float array where the second dimension has length 2.
        Y_test: truth labels. Should be an numpy array of ints
    """
    model.eval()
    Y_pred = []
    Y_test = []
    for data, target in dataloader:
        output = model(data)
        _, prediction = torch.max(output, 1)
        
        Y_pred.append(prediction)
        Y_test.append(target.long())
        
    Y_pred = np.concatenate(Y_pred, axis=0)
    Y_test = np.concatenate(Y_test, axis=0)

    return Y_pred, Y_test

### 3.3.3) Training and Evaluation

In [36]:
N_EPOCHS = 20

def train_model(model, train_dataloader, n_epoch=N_EPOCHS, optimizer=OPTIMIZER, criterion=CRITERION):
    """
    :param model: A CNN model
    :param train_dataloader: the DataLoader of the training data
    :param n_epoch: number of epochs to train
    :return:
        model: trained model
    """
    model.train() # prep model for training
    
    for epoch in range(n_epoch):
        curr_epoch_loss = []
        epoch_start_time = time.time()
        for data, target in train_dataloader:
            
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target.long())
            loss.backward()
            optimizer.step()
            
            curr_epoch_loss.append(loss.cpu().data.numpy())
        print(f"Epoch {epoch}: curr_epoch_loss={np.mean(curr_epoch_loss)}")
        print(f"Epoch {epoch} running time = {time.time() - epoch_start_time} seconds")
    return model

### 3.3.4) Training the Model

In [37]:
#model_general = train_model(model, train_loader_gen)
model_30day = train_model(model, train_loader_30day)

Epoch 0: curr_epoch_loss=0.41994789242744446
Epoch 0 running time = 37.83294653892517 seconds
Epoch 1: curr_epoch_loss=0.39950308203697205
Epoch 1 running time = 34.07367515563965 seconds
Epoch 2: curr_epoch_loss=0.3843808174133301
Epoch 2 running time = 36.448219776153564 seconds
Epoch 3: curr_epoch_loss=0.37132877111434937
Epoch 3 running time = 36.09221053123474 seconds
Epoch 4: curr_epoch_loss=0.35690054297447205
Epoch 4 running time = 34.35780715942383 seconds
Epoch 5: curr_epoch_loss=0.34782901406288147
Epoch 5 running time = 35.99550127983093 seconds
Epoch 6: curr_epoch_loss=0.3393585681915283
Epoch 6 running time = 41.21418642997742 seconds
Epoch 7: curr_epoch_loss=0.3343295454978943
Epoch 7 running time = 46.1887047290802 seconds
Epoch 8: curr_epoch_loss=0.33052515983581543
Epoch 8 running time = 39.929301500320435 seconds
Epoch 9: curr_epoch_loss=0.3280082046985626
Epoch 9 running time = 35.44194793701172 seconds
Epoch 10: curr_epoch_loss=0.3257336914539337
Epoch 10 running t

### 3.3.5) Evaluation Metrics

In [38]:
#y_pred_general, y_true_general = eval_model(model_general, val_loader_gen)

# prec_general = precision_score(y_true_general, y_pred_general)
# recall_general = recall_score(y_true_general, y_pred_general)
# f1_general = f1_score(y_true_general, y_pred_general)
# acc_general = accuracy_score(y_true_general, y_pred_general)

# print(("Validation Precision: " + str(prec_general)))
# print(("Validation Recall: " + str(recall_general)))
# print(("Validation F1: " + str(f1_general)))
# print(("Validation Accuracy: " + str(acc_general)))

y_pred_30day, y_true_30day = eval_model(model_30day, val_loader_30day)

prec_30day = precision_score(y_true_30day, y_pred_30day)
recall_30day = recall_score(y_true_30day, y_pred_30day)
f1_30day = f1_score(y_true_30day, y_pred_30day)
acc_30day = accuracy_score(y_true_30day, y_pred_30day)

print(("Validation Precision: " + str(prec_30day)))
print(("Validation Recall: " + str(recall_30day)))
print(("Validation F1: " + str(f1_30day)))
print(("Validation Accuracy: " + str(acc_30day)))

Validation Precision: 0.6632653061224489
Validation Recall: 0.6701030927835051
Validation F1: 0.6666666666666666
Validation Accuracy: 0.6632124352331606


In [None]:
print("Total running time = {:.2f} seconds".format(time.time() - _START_RUNTIME))

# 4) Random Forest

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

## 4.1) Datasets for Random Forest
These datasets are the same as used for the CNN model.

### 4.1.1) Load Datasets from Pre-Processed Files 

In [4]:
# Load processed datasets
#HF_GEN_READMISSIONS_WNOTES = pd.read_pickle("./positive_gen_readmissions.pkl.gz", compression="gzip")
#HF_NO_GEN_READMISSIONS_WNOTES = pd.read_pickle("./negative_gen_readmissions.pkl.gz", compression="gzip")

HF_30DAY_READMISSIONS_WNOTES = pd.read_pickle("./positive_30day_readmissions.pkl.gz", compression="gzip")
HF_NO_30DAY_READMISSIONS_WNOTES = pd.read_pickle("./negative_30day_readmissions.pkl.gz", compression="gzip")

### 4.1.2) Combine Positive and Negative Sample Datasets

In [5]:
#readmission_gen_df = pd.concat([HF_GEN_READMISSIONS_WNOTES, HF_NO_GEN_READMISSIONS_WNOTES])
readmission_30day_df = pd.concat([HF_30DAY_READMISSIONS_WNOTES, HF_NO_30DAY_READMISSIONS_WNOTES])

#print("General Readmission: ", len(HF_GEN_READMISSIONS_WNOTES), " + ", len(HF_NO_GEN_READMISSIONS_WNOTES), " = ", len(readmission_gen_df))
print("30-Day Readmission: ", len(HF_30DAY_READMISSIONS_WNOTES), " + ", len(HF_NO_30DAY_READMISSIONS_WNOTES), " = ", len(readmission_30day_df))

30-Day Readmission:  962  +  962  =  1924


### 4.1.3) Transform Dataframe into Numpy Array

In [6]:
# Text Token Vectors
#readmission_gen_tensor_tokens = np.array(readmission_gen_df['TEXT'])
readmission_30day_tensor_tokens = np.array(readmission_30day_df['TEXT'])

#print(readmission_gen_tensor_tokens.shape)
print(readmission_30day_tensor_tokens.shape)

(1924,)


### 4.1.4) Create Labels for Samples

In [7]:
# readmission_gen_labels = torch.cat(
#     (
#         torch.ones((len(HF_GEN_READMISSIONS_WNOTES),)), 
#         torch.zeros((len(HF_NO_GEN_READMISSIONS_WNOTES),))
#     )
# )
readmission_30day_labels = torch.cat(
    (
        torch.ones((len(HF_30DAY_READMISSIONS_WNOTES),)), 
        torch.zeros((len(HF_NO_30DAY_READMISSIONS_WNOTES),))
    )
)

#print(readmission_gen_labels.shape)
print(readmission_30day_labels.shape)

torch.Size([1924])


## 4.2) Feature Weighting through Term Frequency - Inverse Document Frequency (TF-IDF)

### 4.2.1) Setup Hyperparameters and Stopwords

In [18]:
MAX_FEATURES = [10000, 15000, 20000, 25000]

### 4.2.2) Create Function to Use Pre-Tokenized Corpus

In [9]:
def return_original(x):
    return x

### 4.2.3) Create TD-IDF Mapping for Vocabulary

In [19]:
readmission_30day_tfidf = {}
for max_ft in MAX_FEATURES:
    vectorizer = TfidfVectorizer(max_features=max_ft, tokenizer=return_original, preprocessor=return_original)
    readmission_30day_tfidf[max_ft] = vectorizer.fit_transform(readmission_30day_tensor_tokens)

## 4.3) Random Forest Model

### 4.3.1) Create Training and Test Sets

Use same split as CNN model. Features are TD-IDF weights of individual words instead of word vectors from pre-trained Word2Vec model.

In [20]:
rf_datasets = {}
for max_ft in readmission_30day_tfidf:
    X_train, X_test, y_train, y_test = train_test_split(readmission_30day_tfidf[max_ft], readmission_30day_labels, test_size=0.9, shuffle=True)
    rf_datasets[max_ft] = (X_train, X_test, y_train, y_test)

### 4.3.2) Define and Fit Random Forest Classifier

In [21]:
y_pred = {}
for max_ft in MAX_FEATURES: 
    clf = RandomForestClassifier(max_depth=2, random_state=0, max_features=max_ft)
    clf.fit(rf_datasets[max_ft][0], rf_datasets[max_ft][2])
    y_pred[max_ft] = clf.predict(rf_datasets[max_ft][1])

### 4.3.3) Evaluate Random Forest

In [22]:
for max_ft in MAX_FEATURES: 
    prec_30day = precision_score(rf_datasets[max_ft][3], y_pred[max_ft])
    recall_30day = recall_score(rf_datasets[max_ft][3], y_pred[max_ft])
    f1_30day = f1_score(rf_datasets[max_ft][3], y_pred[max_ft])
    acc_30day = accuracy_score(rf_datasets[max_ft][3], y_pred[max_ft])

    print("Max Features: ", max_ft)
    print(("Validation Precision: " + str(prec_30day)))
    print(("Validation Recall: " + str(recall_30day)))
    print(("Validation F1: " + str(f1_30day)))
    print(("Validation Accuracy: " + str(acc_30day)))

Max Features:  10000
Validation Precision: 0.5668629100084104
Validation Recall: 0.7883040935672515
Validation F1: 0.6594911937377691
Validation Accuracy: 0.5981524249422633
Max Features:  15000
Validation Precision: 0.5979667282809612
Validation Recall: 0.7567251461988304
Validation F1: 0.6680433660299432
Validation Accuracy: 0.6287528868360277
Max Features:  20000
Validation Precision: 0.5833333333333334
Validation Recall: 0.5995370370370371
Validation F1: 0.591324200913242
Validation Accuracy: 0.5866050808314087
Max Features:  25000
Validation Precision: 0.6170212765957447
Validation Recall: 0.535796766743649
Validation F1: 0.5735475896168108
Validation Accuracy: 0.6016166281755196


# 5) Chi-squared based Feature Selection

In [23]:
from sklearn.feature_selection import SelectKBest, chi2

## 5.1) Select Correctly Identified Positive Samples from CNN Model

## 5.2) Find Top 20 Features from Correct Predictions

In [None]:
X_new = SelectKBest(chi2, k=20).fit_transform(X)
print(X_new.shape)
print(X_new)