In [None]:
import re
import string
from datetime import datetime
from datetime import timedelta

import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [1]:
def patient_selection(admissions_csv, patients_csv, notes_csv):
    """Build the cohort of notes charted within 24 hours of admission.

    Parameters
    ----------
    admissions_csv, patients_csv, notes_csv : str or path-like
        Locations of the admissions, patients and notes tables (CSV).

    Returns
    -------
    pandas.DataFrame
        One row per retained note, joined with its admission and patient
        columns, plus the derived columns ``AGE``, ``AGERANGE`` and the
        binary ``MORTALITY`` flag (1 when ``DEATHTIME`` is present).

    Notes
    -----
    Rows are removed when the note is a discharge summary, is flagged with
    ``ISERROR == 1``, has no ``HADM_ID``/``CHARTTIME``, belongs to a patient
    with more than one admission, was not charted in ``(ADMITTIME,
    ADMITTIME + 24h]``, or falls in the ``'0-17'`` age range.
    """
    # loading the MIMIC tables as dataframes
    admissions = pd.read_csv(admissions_csv)
    patients = pd.read_csv(patients_csv)
    notes = pd.read_csv(notes_csv)

    # dropping unnecessary columns
    admissions = admissions.drop(columns=[
        'ROW_ID', 'LANGUAGE', 'MARITAL_STATUS', 'RELIGION',
        'ETHNICITY', 'EDREGTIME', 'EDOUTTIME',
        'HOSPITAL_EXPIRE_FLAG', 'HAS_CHARTEVENTS_DATA',
        'DISCHARGE_LOCATION'])
    patients = patients.drop(columns=['ROW_ID', 'DOD', 'DOD_HOSP',
                                      'DOD_SSN', 'EXPIRE_FLAG'])
    notes = notes.drop(columns=['ROW_ID', 'STORETIME', 'DESCRIPTION', 'CGID'])

    # dropping unnecessary rows
    notes = notes[(notes.CATEGORY != 'Discharge summary') & (notes.ISERROR != 1)]
    # keep=False removes every patient that has more than one admission
    admissions = admissions.drop_duplicates(subset='SUBJECT_ID', keep=False)

    ############################
    ### combining the tables ###
    ############################

    # merging notes and admissions + dropping unusable notes
    notes_adm = pd.merge(notes, admissions, on="HADM_ID", how="left")
    usable = notes_adm['HADM_ID'].notna() & notes_adm['CHARTTIME'].notna()
    # .copy() so the column assignments below write to an independent frame
    notes_adm = notes_adm[usable].copy()

    # converting the timedata to usable format
    time_format = '%Y-%m-%d %H:%M:%S'
    for column in ('CHARTDATE', 'CHARTTIME', 'ADMITTIME'):
        notes_adm[column] = pd.to_datetime(notes_adm[column], format=time_format)
    patients['DOB'] = pd.to_datetime(patients['DOB'], format=time_format)

    # filtering out the notes that are needed: charted after admission and
    # at most 24 hours later (rows without a matching admission compare False)
    since_admission = notes_adm.CHARTTIME - notes_adm.ADMITTIME
    notes_24h = notes_adm[(notes_adm.CHARTTIME > notes_adm.ADMITTIME)
                          & (since_admission <= timedelta(hours=24))]

    # both merged tables carried SUBJECT_ID; keep the one coming from the notes
    notes_24h = notes_24h.rename(columns={'SUBJECT_ID_x': 'SUBJECT_ID'})
    notes_24h = pd.merge(notes_24h, patients, on="SUBJECT_ID", how="left")
    notes_24h = notes_24h.drop(columns=['SUBJECT_ID_y'])

    # age in whole years from integer timestamps; the 1e-9 factor below
    # assumes nanosecond ticks, so the resolution is pinned explicitly
    seconds_per_year = 60*60*24*365.242
    admit_ns = notes_24h['ADMITTIME'].astype('datetime64[ns]').astype('int64')
    dob_ns = notes_24h['DOB'].astype('datetime64[ns]').astype('int64')
    age_years = (admit_ns - dob_ns).apply(lambda ns: int((ns * 1e-9) / seconds_per_year))
    # NOTE(review): abs() presumably compensates for int64 wrap-around on
    # very large admission/birth gaps - confirm against the data
    notes_24h['AGE'] = age_years.abs()

    bins = [-1, 17, 34, 49, 64, 89, np.inf]
    cat = ['0-17', '18-34', '35-49', '50-64', '65-89', '90+']
    notes_24h['AGERANGE'] = pd.cut(notes_24h['AGE'], bins, labels=cat)
    notes_24h = notes_24h[notes_24h.AGERANGE != '0-17'].copy()

    # creating binary mortality feature
    notes_24h['MORTALITY'] = notes_24h['DEATHTIME'].notna().astype(int)
    # keyword form: the positional axis argument is rejected by pandas 2.x
    notes_24h = notes_24h.drop(columns=['DEATHTIME'])

    return notes_24h

In [None]:
# Build the note cohort from the three CSV tables; the file names are resolved
# relative to the current working directory.
notes_24h = patient_selection('admissions.csv', 'patients.csv', 'notes.csv')

In [None]:
def truncation(notes, tokenizer=None, percentile=90):
    """Keep each patient's first and last note, tokenize, truncate and aggregate.

    Parameters
    ----------
    notes : pandas.DataFrame
        Note-level frame; needs ``SUBJECT_ID``, ``CHARTTIME``, ``TEXT`` and the
        admission/patient columns listed in ``meta_columns`` below.
    tokenizer : callable, optional
        Maps a string to an iterable of tokens exposing ``.text`` and
        ``.is_stop``. Defaults to the tokenizer of the ``en_core_sci_lg``
        spaCy model, which is loaded on demand.
    percentile : float, default 90
        Percentile of the per-note token counts used as the maximum note
        length (computed separately with and without stop words).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Patient-level frames built from all tokens and from non-stop-word
        tokens respectively: joined text, concatenated token list, token
        count, and the admission/patient columns.
    """
    # sort_values returns a new frame; keep it so that head(1)/tail(1) really
    # are the chronologically first and last note of every patient
    notes = notes.sort_values(by=['SUBJECT_ID', 'CHARTTIME'], ascending=True)
    per_patient = notes.groupby('SUBJECT_ID')
    notes = (pd.concat([per_patient.head(1), per_patient.tail(1)])
             .drop_duplicates()  # a patient with one note yields the same row twice
             .sort_values(['SUBJECT_ID', 'CHARTTIME'])
             .reset_index(drop=True))

    # strip at most one leading and one trailing blank; slicing instead of
    # indexing keeps this safe for empty strings
    notes['TEXT'] = notes['TEXT'].map(lambda x: x[1:] if x[:1] == ' ' else x)
    notes['TEXT'] = notes['TEXT'].map(lambda x: x[:-1] if x[-1:] == ' ' else x)

    if tokenizer is None:
        # only the tokenizer is used, so no pipeline component has to run
        tokenizer = spacy.load('en_core_sci_lg').tokenizer

    # tokenize every note once and derive both token variants from the result
    docs = notes['TEXT'].map(lambda text: list(tokenizer(text)))

    # creating tokens based on spacy tokenizer
    notes['sp_tok'] = docs.map(lambda doc: [token.text for token in doc])
    notes['num_sp_tok'] = notes['sp_tok'].map(len)

    # creating tokens based on spacy tokenizer without stopwords
    notes['sp_nostop_tok'] = docs.map(
        lambda doc: [token.text for token in doc if not token.is_stop])
    notes['num_sp_nostop_tok'] = notes['sp_nostop_tok'].map(len)

    # np.percentile returns a float; slice bounds must be integers
    max_size_note = int(np.percentile(notes['num_sp_tok'], percentile))
    max_size_note_ns = int(np.percentile(notes['num_sp_nostop_tok'], percentile))

    # truncating the notes
    notes['sp_tok_trun'] = notes['sp_tok'].map(lambda x: x[:max_size_note])
    notes['n_sp_tok_trun'] = notes['sp_tok_trun'].map(len)
    notes['sp_nostop_tok_trun'] = notes['sp_nostop_tok'].map(lambda x: x[:max_size_note_ns])
    notes['n_sp_nostop_tok_trun'] = notes['sp_nostop_tok_trun'].map(len)

    notes['TEXT_sp_trun'] = notes['sp_tok_trun'].map(" ".join)
    notes['TEXT_sp_nostop_trun'] = notes['sp_nostop_tok_trun'].map(" ".join)

    note_separator = '\n\n\n newnote \n\n\n'

    # one row per patient: texts joined by the separator, token lists concatenated
    notes_sp_trun = notes.groupby(['SUBJECT_ID'], as_index=False).agg(
        {'TEXT_sp_trun': lambda x: note_separator.join(x), 'sp_tok_trun': 'sum'})
    notes_sp_trun['NUM_sp_trun'] = notes_sp_trun['sp_tok_trun'].map(len)
    notes_sp_trun = notes_sp_trun.rename(columns={'sp_tok_trun': 'TOK_sp_trun'})

    notes_sp_nostop_trun = notes.groupby(['SUBJECT_ID'], as_index=False).agg(
        {'TEXT_sp_nostop_trun': lambda x: note_separator.join(x), 'sp_nostop_tok_trun': 'sum'})
    notes_sp_nostop_trun['NUM_sp_nostop_trun'] = notes_sp_nostop_trun['sp_nostop_tok_trun'].map(len)
    notes_sp_nostop_trun = notes_sp_nostop_trun.rename(
        columns={'sp_nostop_tok_trun': 'TOK_sp_nostop_trun'})

    # admission/patient columns, one row per patient (taken from the last note)
    meta_columns = ['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION',
                    'INSURANCE', 'DIAGNOSIS', 'GENDER', 'DOB', 'AGE', 'AGERANGE', 'MORTALITY']
    not_notes = notes[meta_columns].drop_duplicates(subset=['SUBJECT_ID'], keep='last')

    notes_sp_trun = notes_sp_trun.merge(not_notes, how='left', on='SUBJECT_ID')
    notes_sp_nostop_trun = notes_sp_nostop_trun.merge(not_notes, how='left', on='SUBJECT_ID')

    return notes_sp_trun, notes_sp_nostop_trun

In [None]:
# Patient-level frames from the cohort notes: the first result keeps all tokens,
# the second one is built from the tokens that are not stop words.
notes_trun, notes_nostop_trun = truncation(notes_24h)

In [None]:
def truncation_nopunc(notes, tokenizer=None, percentile=90):
    """Variant of the truncation step that lower-cases and removes punctuation first.

    Parameters
    ----------
    notes : pandas.DataFrame
        Note-level frame; needs ``SUBJECT_ID``, ``CHARTTIME``, ``TEXT`` and the
        admission/patient columns listed in ``meta_columns`` below.
    tokenizer : callable, optional
        Maps a string to an iterable of tokens exposing ``.text`` and
        ``.is_stop``. Defaults to the tokenizer of the ``en_core_sci_lg``
        spaCy model, which is loaded on demand.
    percentile : float, default 90
        Percentile of the per-note token counts used as the maximum note
        length (computed separately with and without stop words).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Patient-level frames built from all tokens and from non-stop-word
        tokens respectively: joined text, concatenated token list, token
        count, and the admission/patient columns.
    """
    # sort_values returns a new frame; keep it so that head(1)/tail(1) really
    # are the chronologically first and last note of every patient
    notes = notes.sort_values(by=['SUBJECT_ID', 'CHARTTIME'], ascending=True)
    per_patient = notes.groupby('SUBJECT_ID')
    notes = (pd.concat([per_patient.head(1), per_patient.tail(1)])
             .drop_duplicates()  # a patient with one note yields the same row twice
             .sort_values(['SUBJECT_ID', 'CHARTTIME'])
             .reset_index(drop=True))

    # raw string: same characters as before, without the invalid '\^' escape
    reject_punc = r'''!"#$%&'()-*+/;<=>?@\^_`{|}~,.[]:'''
    punc_to_space = str.maketrans(dict.fromkeys(reject_punc, ' '))

    def preprocess_text(i):
        # lower-case, blank out punctuation, then collapse "space + line break"
        i = i.lower().translate(punc_to_space)
        i = re.sub(r' \n', ' ', i)
        i = re.sub(r' \r', ' ', i)
        return i

    notes['TEXT'] = notes['TEXT'].map(preprocess_text)

    # strip at most one leading and one trailing blank; slicing instead of
    # indexing keeps this safe for empty strings
    notes['TEXT'] = notes['TEXT'].map(lambda x: x[1:] if x[:1] == ' ' else x)
    notes['TEXT'] = notes['TEXT'].map(lambda x: x[:-1] if x[-1:] == ' ' else x)

    if tokenizer is None:
        # only the tokenizer is used, so no pipeline component has to run
        tokenizer = spacy.load('en_core_sci_lg').tokenizer

    # tokenize every note once and derive both token variants from the result
    docs = notes['TEXT'].map(lambda text: list(tokenizer(text)))

    # creating tokens based on spacy tokenizer
    notes['sp_tok'] = docs.map(lambda doc: [token.text for token in doc])
    notes['num_sp_tok'] = notes['sp_tok'].map(len)

    # creating tokens based on spacy tokenizer without stopwords
    notes['sp_nostop_tok'] = docs.map(
        lambda doc: [token.text for token in doc if not token.is_stop])
    notes['num_sp_nostop_tok'] = notes['sp_nostop_tok'].map(len)

    # np.percentile returns a float; slice bounds must be integers
    max_size_note = int(np.percentile(notes['num_sp_tok'], percentile))
    max_size_note_ns = int(np.percentile(notes['num_sp_nostop_tok'], percentile))

    # truncating the notes
    notes['sp_tok_trun'] = notes['sp_tok'].map(lambda x: x[:max_size_note])
    notes['n_sp_tok_trun'] = notes['sp_tok_trun'].map(len)
    notes['sp_nostop_tok_trun'] = notes['sp_nostop_tok'].map(lambda x: x[:max_size_note_ns])
    notes['n_sp_nostop_tok_trun'] = notes['sp_nostop_tok_trun'].map(len)

    notes['TEXT_sp_trun'] = notes['sp_tok_trun'].map(" ".join)
    notes['TEXT_sp_nostop_trun'] = notes['sp_nostop_tok_trun'].map(" ".join)

    note_separator = '\n\n\n newnote \n\n\n'

    # one row per patient: texts joined by the separator, token lists concatenated
    notes_sp_trun = notes.groupby(['SUBJECT_ID'], as_index=False).agg(
        {'TEXT_sp_trun': lambda x: note_separator.join(x), 'sp_tok_trun': 'sum'})
    notes_sp_trun['NUM_sp_trun'] = notes_sp_trun['sp_tok_trun'].map(len)
    notes_sp_trun = notes_sp_trun.rename(columns={'sp_tok_trun': 'TOK_sp_trun'})

    notes_sp_nostop_trun = notes.groupby(['SUBJECT_ID'], as_index=False).agg(
        {'TEXT_sp_nostop_trun': lambda x: note_separator.join(x), 'sp_nostop_tok_trun': 'sum'})
    notes_sp_nostop_trun['NUM_sp_nostop_trun'] = notes_sp_nostop_trun['sp_nostop_tok_trun'].map(len)
    notes_sp_nostop_trun = notes_sp_nostop_trun.rename(
        columns={'sp_nostop_tok_trun': 'TOK_sp_nostop_trun'})

    # admission/patient columns, one row per patient (taken from the last note)
    meta_columns = ['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION',
                    'INSURANCE', 'DIAGNOSIS', 'GENDER', 'DOB', 'AGE', 'AGERANGE', 'MORTALITY']
    not_notes = notes[meta_columns].drop_duplicates(subset=['SUBJECT_ID'], keep='last')

    notes_sp_trun = notes_sp_trun.merge(not_notes, how='left', on='SUBJECT_ID')
    notes_sp_nostop_trun = notes_sp_nostop_trun.merge(not_notes, how='left', on='SUBJECT_ID')

    return notes_sp_trun, notes_sp_nostop_trun

In [None]:
# Same aggregation as above, on lower-cased text with punctuation replaced by blanks.
notes_trun_nopunc, notes_nostop_trun_nopunc = truncation_nopunc(notes_24h)

In [None]:
def note_selector(notes, n_per_class=25, random_state=None):
    """Draw a class-balanced, shuffled random sample of patients.

    Parameters
    ----------
    notes : pandas.DataFrame
        Frame with a binary ``MORTALITY`` column.
    n_per_class : int, default 25
        Number of rows drawn from each of ``MORTALITY == 1`` and
        ``MORTALITY == 0``.
    random_state : int or numpy random generator, optional
        Forwarded to ``DataFrame.sample`` for both the draws and the shuffle;
        pass a value to make the selection reproducible.

    Returns
    -------
    pandas.DataFrame
        ``2 * n_per_class`` rows in random order with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by pandas when a class has fewer than ``n_per_class`` rows.
    """
    # separating dying and surviving patients
    pos_notes = notes.loc[notes.MORTALITY == 1]
    neg_notes = notes.loc[notes.MORTALITY == 0]

    # sampling pos and neg
    pos_notes_rand = pos_notes.sample(n=n_per_class, random_state=random_state)
    neg_notes_rand = neg_notes.sample(n=n_per_class, random_state=random_state)

    # combining pos and neg + shuffling and index reset
    notes_physician = pd.concat([pos_notes_rand, neg_notes_rand])
    notes_physician = notes_physician.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return notes_physician

In [None]:
# Random sample with equally many MORTALITY == 1 and MORTALITY == 0 patients.
notes_physician = note_selector(notes_trun)

In [None]:
def vectorizer(notes, method='tfidf', column='TOK_sp_trun', max_features=20000):
    """Turn every patient's notes into a bag-of-words vector.

    Parameters
    ----------
    notes : pandas.DataFrame
        Patient-level frame holding the documents in ``column``.
    method : {'tfidf', 'count', 'binary'}, default 'tfidf'
        ``'tfidf'`` uses ``TfidfVectorizer``; ``'count'`` and ``'binary'``
        use ``CountVectorizer`` (the latter with ``binary=True``).
    column : str, default 'TOK_sp_trun'
        Column with the documents, either strings or lists of tokens.
    max_features : int, default 20000
        Vocabulary size limit passed to the vectorizer.

    Returns
    -------
    wm : scipy sparse matrix
        Document-term matrix, one row per row of ``notes``.
    vect_tokens : list
        Vocabulary terms in column order of ``wm``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # the default scikit-learn analyzer works on strings, so token lists are
    # joined with blanks before being vectorized
    documents = [doc if isinstance(doc, str) else " ".join(doc) for doc in notes[column]]

    if method == 'tfidf':
        vect = TfidfVectorizer(lowercase=True, max_features=max_features)
    elif method == 'count':
        vect = CountVectorizer(lowercase=True, max_features=max_features)
    elif method == 'binary':
        vect = CountVectorizer(binary=True, lowercase=True, max_features=max_features)
    else:
        raise ValueError(f"unknown method {method!r}; expected 'tfidf', 'count' or 'binary'")

    wm = vect.fit_transform(documents)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its successor
    if hasattr(vect, 'get_feature_names_out'):
        vect_tokens = list(vect.get_feature_names_out())
    else:
        vect_tokens = vect.get_feature_names()

    return wm, vect_tokens

In [None]:
# wm: fitted document-term matrix; vect_tokens: the vectorizer's feature names.
wm, vect_tokens = vectorizer(notes_trun)

In [None]:
def training_prep(notes, batch1, wm):
    batch1 = batch1.drop(['HADM_ID', 'ADMITTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'INSURANCE', 'DIAGNOSIS', 'DOB', 'AGE'], axis=1)
    subjects1 = list(batch1.SUBJECT_ID.values)
    
    labels = notes.MORTALITY.values
    subjects = notes.SUBJECT_ID.values
    
    #making the vectors usable as input for a model
    def vec_processing(wm):
        input_vecs = []
        for i in wm:
            input_vecs.append(i.toarray().tolist())

        def flatten(l):
            flat_list = []
            for sublist in l:
                for item in sublist:
                    flat_list.append(item)
            return flat_list

        flat_vecs = []
        for i in tqdm(input_vecs):
            flat_i = flatten(i)
            flat_vecs.append(flat_i)
        return flat_vecs

    flat_vecs = vec_processing(wm)
    
    data = pd.DataFrame()
    data['subjects'] = subjects
    data['vecs']  = flat_vecs
    data['labels'] = labels

    batch1_data = data[data.subjects.isin(subjects1)]
    data = data[~data.subjects.isin(subjects1)]
    
    #train and test set
    data = data.sample(n=len(data), random_state=25)
    data = data.reset_index(drop = True)
    data_test = data.sample(frac=0.2, random_state=25)
    data_train_all = data.drop(data_test.index)
    
    #undersampling the negatives in training
    rows_pos = data_train_all.labels == 1
    train_pos = data_train_all.loc[rows_pos]
    train_neg = data_train_all.loc[-rows_pos]

    #merge balance
    data_train = pd.concat([train_pos, train_neg.sample(n=len(train_pos), random_state=25)], axis=0)
    data_train = data_train.sample(n=len(data_train), random_state=25).reset_index(drop=True)
    
    return data_train, data_test, batch1_data

In [None]:
# notes_trun is the frame wm was fitted on (same row order); notes_physician is
# the held-out sample - the bare names `notes` and `batch1` are never defined
# at notebook level.
data_train, data_test, batch1_data = training_prep(notes_trun, notes_physician, wm)

In [None]:
def model_training(data_train, data_test, num_epochs=10000, lr=0.01, seed=None):
    """Fit a logistic-regression classifier with full-batch SGD and report test metrics.

    Parameters
    ----------
    data_train, data_test : pandas.DataFrame
        Frames with a ``vecs`` column (equal-length feature lists) and a
        binary ``labels`` column.
    num_epochs : int, default 10000
        Number of full-batch gradient steps.
    lr : float, default 0.01
        SGD learning rate.
    seed : int, optional
        When given, seeds torch before the model is created so that the
        weight initialisation is reproducible.

    Returns
    -------
    torch.nn.Module
        The trained model; ``model(x)`` returns sigmoid probabilities.

    Prints the loss every 1000 epochs and accuracy, recall, precision and
    F1 on ``data_test`` (predictions rounded to 0/1; a metric with a zero
    denominator is reported as 0).
    """
    def to_tensors(frame):
        # (n_samples, n_features) float32 features and (n_samples, 1) targets
        features = np.array([np.array(vec) for vec in frame.vecs.values]).astype(np.float32)
        targets = frame.labels.values.astype(np.float32)
        return torch.from_numpy(features), torch.from_numpy(targets).view(-1, 1)

    # prepare data
    X_train, y_train = to_tensors(data_train)
    X_test, y_test = to_tensors(data_test)
    # taken from the data itself rather than from a notebook-level vocabulary
    n_features = X_train.shape[1]

    # 1) model
    class LogisticRegression(nn.Module):
        def __init__(self, n_input_features):
            super(LogisticRegression, self).__init__()
            self.linear = nn.Linear(n_input_features, 1)

        def forward(self, x):
            y_predicted = torch.sigmoid(self.linear(x))
            return y_predicted

    if seed is not None:
        torch.manual_seed(seed)
    model = LogisticRegression(n_features)

    # 2) loss and optimizer
    criterion = nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    # 3) training loop
    for epoch in range(num_epochs):
        y_predicted = model(X_train)
        loss = criterion(y_predicted, y_train)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if (epoch+1) % 1000 == 0:
            print(f'epoch: {epoch+1}, loss = {loss.item():.4f}')

    # 4) testing + metrics
    with torch.no_grad():
        y_predicted_cls = model(X_test).round()

        acc = (y_predicted_cls.eq(y_test).sum() / float(y_test.shape[0])).item()

        # confusion counts for the positive class
        predicted_pos = y_predicted_cls == 1
        actual_pos = y_test == 1
        tp = (predicted_pos & actual_pos).sum().item()
        fp = (predicted_pos & ~actual_pos).sum().item()
        fn = (~predicted_pos & actual_pos).sum().item()

    rec = tp / (tp + fn) if (tp + fn) else 0.0
    prec = tp / (tp + fp) if (tp + fp) else 0.0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0

    print(f'accuracy = {acc:.4f}')
    print(f'recall = {rec:.4f}')
    print(f'precision = {prec:.4f}')
    print(f'f1-score = {f1:.4f}')

    return model

In [None]:
# Train on the balanced training frame; test-set metrics are printed by the call.
model = model_training(data_train, data_test)

In [None]:
def LOO_method(model, batch1_data, tokens=None):
    """Leave-one-out token attribution for every held-out patient.

    For each patient vector, every non-zero feature is set to zero in turn
    and the change in the model's predicted probability is recorded.

    Parameters
    ----------
    model : callable
        Maps a 1-D float32 feature tensor to a one-element probability tensor.
    batch1_data : pandas.DataFrame
        Frame with a ``vecs`` column of equal-length feature lists.
    tokens : sequence, optional
        Vocabulary; ``tokens[i]`` is the word belonging to feature ``i``.
        When omitted, a notebook-level ``tfidf_tokens`` or ``vect_tokens``
        is used.

    Returns
    -------
    pandas.DataFrame
        One row per (patient, present token) with the columns ``patient``
        (1-based position in ``batch1_data``), ``token as vect`` (feature
        value), ``token as word``, ``og prob``, ``new prob`` and
        ``prob diff`` (``og prob - new prob``).

    Raises
    ------
    NameError
        If ``tokens`` is omitted and no notebook-level vocabulary exists.
    """
    if tokens is None:
        namespace = globals()
        tokens = namespace.get('tfidf_tokens', namespace.get('vect_tokens'))
        if tokens is None:
            raise NameError("no vocabulary available: pass tokens= or define vect_tokens")

    columns = ['patient', 'token as vect', 'token as word',
               'og prob', 'new prob', 'prob diff']

    features = np.array([np.array(vec) for vec in batch1_data.vecs.values])
    features = torch.from_numpy(features.astype(np.float32))

    # rows are collected in a list and turned into a frame once at the end
    records = []
    with torch.no_grad():  # inference only, no autograd graph needed
        for patient, row in enumerate(features):  # for each note in the batch
            og_prob = model(row)  # predict original probability
            # positions of all the words that are present in this note
            present = row.nonzero(as_tuple=True)[0].tolist()

            for position in present:
                perturbed = row.clone()  # fresh copy, the input stays untouched
                perturbed[position] = 0  # leaves word out
                new_prob = model(perturbed)  # probability of the note without the word

                records.append({
                    'patient': patient + 1,
                    'token as vect': row[position].item(),
                    'token as word': tokens[position],
                    'og prob': og_prob.reshape(-1)[0].item(),
                    'new prob': new_prob.reshape(-1)[0].item(),
                    'prob diff': (og_prob - new_prob).reshape(-1)[0].item(),
                })

    return pd.DataFrame(records, columns=columns)

In [None]:
# Per-token probability changes for the held-out patients in batch1_data.
results = LOO_method(model, batch1_data)