In [1]:
import fasttext
import itertools
import os
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from collections import Counter

### Config

In [None]:
# Folder where the dataset is saved (train, val and test are assumed to live in the same directory).
path = './data/'
# Time window of the notes to use.
time = '24h'
train_path = '{}train_{}.txt'.format(path, time)
val_path = '{}val_{}.txt'.format(path, time)

In [2]:
# Candidate values for each fastText hyper-parameter.
epoch_para = [10, 15, 20, 25, 30]
lr_para = [0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2]
ws_para = [5, 10, 15, 20, 25, 30]
# Full grid: every (epoch, lr, ws) combination, with epoch varying slowest.
x = [epoch_para, lr_para, ws_para]
para_pair = list(itertools.product(*x))

### Helper Function

In [3]:
# Vocabulary and word -> index mapping saved as .npy files.
# allow_pickle=True is needed on NumPy >= 1.16.3 to load object arrays (such as a saved dict).
# Unpickling can execute arbitrary code, so only load files that come from a trusted source.
new_voca = np.load('../model/debugdata/voc_100.npy', allow_pickle=True)

word2idx = np.load('../model/debugdata/word_to_ix.npy', allow_pickle=True)
# The mapping is stored wrapped inside an array; take out its first (only expected) element.
word2idx = word2idx.reshape(1,-1)[0][0]

# Invert the mapping: index -> word.
idx2word = {value: key for key, value in word2idx.items()}

In [4]:
# NOTE: this rebinds `path` (first set in the config cell) to a machine-specific absolute location.
# data_readin treats it as the folder holding the *.npy splits, while save_file uses it as a
# file-name prefix, so the generated .txt files are created next to this folder, not inside it.
path = '/Users/leilei/Documents/DS1011Project/Project1011_Mortality_Predcition_old/model/debugdata/voc_100_downsample'

In [6]:
def data_readin(time, path):
    '''
    Load the train/val/test splits of one time window, turn every patient into a
    fastText supervised-format line and write each split to a text file.

    time: is the time you want to show, i.e. 15m, 12h, 24h, a string
    path: the folder these time data are saved, a string
          (it must contain train_<time>.npy, val_<time>.npy and test_<time>.npy)
    return: (train_data, val_data, test_data), each a list of strings built by notes_formatting
    side effect: every split is also written to disk through save_file
    '''
    formatted = []
    for split in ('train', 'val', 'test'):
        # Each file is expected to wrap one dict-like object with the keys
        # 'MORTALITY_LABEL' and 'DATA'. allow_pickle=True is required on
        # NumPy >= 1.16.3 to load such object arrays; only load trusted files.
        raw = np.load(path + "/{}_{}.npy".format(split, time), allow_pickle=True).reshape(1,-1)[0][0]

        # mortality label: 1 -> DEAD, anything else -> LIVE
        labels = ['__label__DEAD' if flag == 1 else '__label__LIVE' for flag in raw['MORTALITY_LABEL']]

        # one text line per patient, label appended at the end
        lines = notes_formatting(raw['DATA'], labels)

        # save it
        save_file(lines, time, path, split)
        formatted.append(lines)

    train_data, val_data, test_data = formatted
    return train_data, val_data, test_data

In [7]:
def save_file(data, time, path, label):
    '''
    Write the records to the text file "<path>_<time>_<label>.txt", one per line.

    data: iterable of strings; each one is written followed by a newline
    time: the time window string, i.e. 15m, 12h
    path: prefix of the output file name
    label: train/val/test
    '''
    target = '_'.join([path, time, label]) + '.txt'
    with open(target, 'w') as out:
        out.writelines(record + '\n' for record in data)

In [8]:
def notes_formatting(data_array, label_array):
    '''
    Turn every patient into a single line of text ending with its label.

    data_array: one entry per patient; a patient is a list of notes, a note is a
                list of sentences and a sentence is a list of word indices
    label_array: one label string per patient, in the same order as data_array
    return: patient note list; for each patient the words of all notes (looked up
            through idx2word) are joined with spaces, followed by a space and the label
    '''
    def _words(patient):
        # flatten notes -> sentences -> word indices and map each index to its word
        return (idx2word[token] for note in patient for sentence in note for token in sentence)

    return [
        ' '.join(_words(data_array[k])) + ' ' + label_array[k]
        for k in range(len(data_array))
    ]

In [10]:
# Build (and write to disk) the fastText-format data for the 15-minute window;
# only the val and test lists are kept here, the train list is discarded.
_, val, test = data_readin('15m',path)

In [11]:
# Inspect the first formatted validation record (words followed by its label).
val[0]

'venteddid molecular clue thicknesses apancreatitis puppils tounge crazy pheocromocytoma treeinbuds airsp evne artifactual renalfluidsginutrition healthtlow keys hypocapnia slpet brush nonradiating han conmprogest airsp let crazy aspira insexp sobfevertachycardia pheocromocytoma treeinbuds airsp consentable tylenolbenzo mccopen brush airsp nonradiating dnrhospital wheeze caervue conmprogest syphillis thicknesses apancreatitis clue pvdangioplasty greek satifactory pancreatitisno gondal aspira otc snorring pancreatitisno pcvwas namedc conmprogest clipbaord datefibroidsiron intactsm greek uncharacteristic mgand hemiarch ooyzing intersti connect sbpct primarymetastatic piccpicc numberobtained promote digdced clue uncharacteristic drysteristrips sbpct vsarrthymias snorring sbpct sceroderma appreciatedpt numberobtained promote independentlyfoley snorring pancreatitisno feversro pulsesall hydralzine conmprogest sanguinus trachycardic givenno socialdisposister numberobtained marital bleedpleas

### DATA

In [34]:
# Generate and save the fastText-format files for every time window.
# The returned lists are not needed here (data_readin writes the files itself), so they
# are not bound to `_`, which IPython uses for the output of the last executed cell.
for time_window in ('15m', '6h', '12h', '24h'):
    data_readin(time_window, path)

### Train
Refer to the training scripts for the actual training; below is only a rough sketch of building a model with fastText.

In [35]:
# Time window used for training and validation in the cells below.
time = '15m'
label = 'val'
# File names follow the pattern written by save_file: <path>_<time>_<split>.txt
train_path = '{}_{}_train.txt'.format(path, time)
val_path = '{}_{}_val.txt'.format(path, time)

In [None]:
#tune the parameters for above and save these models
for epoch_, lr_, ws_ in para_pair:
    classifier = fasttext.supervised(train_path, 'model_{}_{}_{}_{}'.format(time, epoch_, lr_, ws_), epoch = epoch_, lr = lr_, ws = ws_, label_prefix='__label__')

In [19]:
#input the validation set and split the data and labels(now label and data comes together)
def readin_split(path):
    '''
    path: get the val/test set
    return: list of texts, corresponding labels
    '''
    text_val = []
    with open(path,'r') as f:
        for line in f:
            text_val.append(line)

    labels_val = []
    for i in range(len(text_val)):
        temp = text_val[i].strip('\n').split('__')
        text_val[i] = temp[0]
        labels_val.append(temp[-1])
    return text_val, labels_val

In [31]:
# Read the validation file back and separate the note text from the labels.
text_val, label_val = readin_split(val_path)

In [32]:
# Inspect the first validation text (label removed).
text_val[0]

'venteddid molecular clue thicknesses apancreatitis puppils tounge crazy pheocromocytoma treeinbuds airsp evne artifactual renalfluidsginutrition healthtlow keys hypocapnia slpet brush nonradiating han conmprogest airsp let crazy aspira insexp sobfevertachycardia pheocromocytoma treeinbuds airsp consentable tylenolbenzo mccopen brush airsp nonradiating dnrhospital wheeze caervue conmprogest syphillis thicknesses apancreatitis clue pvdangioplasty greek satifactory pancreatitisno gondal aspira otc snorring pancreatitisno pcvwas namedc conmprogest clipbaord datefibroidsiron intactsm greek uncharacteristic mgand hemiarch ooyzing intersti connect sbpct primarymetastatic piccpicc numberobtained promote digdced clue uncharacteristic drysteristrips sbpct vsarrthymias snorring sbpct sceroderma appreciatedpt numberobtained promote independentlyfoley snorring pancreatitisno feversro pulsesall hydralzine conmprogest sanguinus trachycardic givenno socialdisposister numberobtained marital bleedpleas

In [40]:
label_val[0] # label string of the first validation record; the labels are used for calculating AUC

'LIVE'

### Select

In [None]:
# Collect the saved fastText model files (*.bin) found in `path`.
# The filter must read from `filelist`: `folder_li` does not exist yet at this point.
# sorted() makes the order reproducible, since os.listdir returns entries in arbitrary order.
# NOTE(review): the training cell passes a bare output name to fasttext.supervised, so the
# models are probably written to the working directory rather than `path` - confirm.
filelist = os.listdir(path)
folder_li = sorted(i for i in filelist if i.endswith('.bin'))

In [None]:
# Load the saved models in the same order as para_pair, so that model_li[k] (and the
# score computed from it) belongs to para_pair[k]. Iterating over a directory listing
# gives an arbitrary order, which pairs scores with the wrong hyper-parameters when
# they are attached to para_pair by position.
# The loop variable is deliberately not called `path`, so the data path is not overwritten.
model_li = []
for epoch_, lr_, ws_ in para_pair:
    # the training cell saved each model under this name; fastText appends '.bin'
    model_file = 'model_{}_{}_{}_{}.bin'.format(time, epoch_, lr_, ws_)
    model = fasttext.load_model(model_file, label_prefix='__label__')
    model_li.append(model)

In [None]:
# Encode the validation labels for the AUC computation: LIVE -> 0, anything else -> 1.
binary_label_val = [int(lbl != 'LIVE') for lbl in label_val]

# Predictions of the current `classifier` on the validation texts.
predict_list = classifier.predict(text_val)

In [None]:
# Validation score of every candidate model, in the order of model_li.
score = []
for model in tqdm(model_li):
    # Predict with the model of THIS iteration. Calling the leftover `classifier`
    # here would give every entry of `score` the same value.
    predict_list = model.predict(text_val)
    # flatten the per-text label lists into one flat list of labels
    predict_list = list(itertools.chain.from_iterable(predict_list))
    binary_predict_li = [0 if label == 'LIVE' else 1 for label in predict_list]
    # NOTE(review): the AUC is computed from hard 0/1 predictions, not from
    # probabilities, so it only reflects a single operating point.
    score.append(roc_auc_score(binary_label_val, binary_predict_li))

In [None]:
# Validation score of each model, in the order of model_li.
fig, ax = plt.subplots()
ax.plot(score)
ax.set_xlabel('model index')
ax.set_ylabel('validation ROC AUC')
ax.set_title('First grid search')
plt.show();

In [None]:
# One row per hyper-parameter combination; scores are attached by position,
# so `score` must be in the same order as `para_pair`.
para_pair_df = pd.DataFrame(para_pair, columns=['epoch_para', 'lr_para', 'ws_para'])
para_pair_df['score'] = score

In [None]:
# Rows of the grid with epoch = 10 and ws = 5 (all learning rates).
is_epoch_10 = para_pair_df.epoch_para == 10
is_ws_5 = para_pair_df.ws_para == 5
para_pair_df[is_epoch_10 & is_ws_5]

In [None]:
### Note: the results below are not averaged (it seems the minimum is shown): for any two parameter types
### there are several results, and the result below uses only part of them. It is only meant to give a rough idea.

In [None]:
# Best validation score obtained in the first grid search.
max(score)

In [None]:
# Persist the grid-search results of the current time window.
performance_file = 'performance_{}.csv'.format(time)
para_pair_df.to_csv(performance_file)

### light visual

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go

# Heatmap of the validation score over epoch (x) and learning rate (y).
# Every (epoch, lr) pair occurs once per ws value in para_pair_df.
trace = go.Heatmap(
    x=para_pair_df.epoch_para,
    y=para_pair_df.lr_para,
    z=para_pair_df.score,
    colorscale='rgb(50, 204, 153)',
)
data = [trace]
py.iplot(data, filename='epoch_lr_para', image_width=800, image_height=200)

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go

# Heatmap of the validation score over epoch (x) and window size (y).
# Every (epoch, ws) pair occurs once per learning rate in para_pair_df.
trace = go.Heatmap(
    x=para_pair_df.epoch_para,
    y=para_pair_df.ws_para,
    z=para_pair_df.score,
    colorscale='rgb(50, 204, 153)',
)
data = [trace]
py.iplot(data, filename='epoch_ws_para', image_width=800, image_height=200)

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go

# Heatmap of the validation score over window size (x) and learning rate (y).
# Every (ws, lr) pair occurs once per epoch value in para_pair_df.
trace = go.Heatmap(z=para_pair_df.score,x=para_pair_df.ws_para,y=para_pair_df.lr_para,colorscale='rgb(50, 204, 153)')
data=[trace]
# Use a file name of its own: 'epoch_lr_para' is already used by the epoch / learning-rate
# heatmap, and reusing it would make the two plots share one name.
py.iplot(data, filename='ws_lr_para',image_width=800, image_height=200)

In [None]:
# Therefore, go with epoch = 25-30 (or 10), learning rate = 0.05 and ws = 20, and now train new combinations around these values (see where the heatmap color intensifies).

### retrain and test

In [None]:
#tune the parameters for above and save these models
epoch_para = [25,27,29, 31] #3
lr_para = [0.03,0.05,0.07] #3
ws_para = [18,20,22,24]#7
x = []
x.append(epoch_para)
x.append(lr_para)
x.append(ws_para)
para_pair_new = list(itertools.product(*x))
para_pair_new = [i for i in para_pair_new if i not in para_pair] 
#therefore avoid retrain and easy to get the score
#previously has to find the missing because filtering use the original list

In [None]:
# Train one supervised fastText model per combination of the refined grid.
for epoch_, lr_, ws_ in para_pair_new:
    model_name = 'model_{}_{}_{}_{}'.format(time, epoch_, lr_, ws_)
    classifier = fasttext.supervised(
        train_path,
        model_name,
        epoch=epoch_,
        lr=lr_,
        ws=ws_,
        label_prefix='__label__',
    )

In [None]:
### don't use this, becasue if the above has run, the combination won't be added, but don't know which one is missing
folder_li2 = os.listdir('/Users/leilei/Documents/DS1011Project')
folder_li2 = [i for i in folder_li2 if (i.endswith('.bin')) and (i not in folder_li)]
model_li_new = []
for path in folder_li2: #the models' name corresponding to the name of the path
    model = fasttext.load_model(path, label_prefix='__label__')
    model_li_new.append(model)

In [None]:
# Sanity check: the three lengths should be equal, because the scores are later
# attached to para_pair_new by position.
len(model_li_new),len(para_pair_new),len(folder_li2)

In [None]:
# Validation score of every refined-grid model, in the order of model_li_new.
score_new = []
for candidate in tqdm(model_li_new):
    # flatten the per-text label lists returned by predict into one flat list
    predict_list = list(itertools.chain.from_iterable(candidate.predict(text_val)))
    # LIVE -> 0, anything else -> 1 (same encoding as binary_label_val)
    binary_predict_li = [int(label != 'LIVE') for label in predict_list]
    score_new.append(roc_auc_score(binary_label_val, binary_predict_li))

In [None]:
# Compare the best score of the first grid with the best score of the refined grid.
max(score), max(score_new)

In [None]:
# Validation score of each refined-grid model, in the order of model_li_new.
fig, ax = plt.subplots()
ax.plot(score_new)
ax.set_xlabel('model index')
ax.set_ylabel('validation ROC AUC')
ax.set_title('Refined grid search')
plt.show();

In [None]:
# Position(s) of the best refined-grid score.
# score_new is a plain list: comparing it with a scalar is only elementwise when the
# scalar happens to be a NumPy type (with a Python float the comparison is a single
# False), so convert it to an array explicitly.
np.where(np.asarray(score_new) == max(score_new))

In [None]:
# Hyper-parameters of the best refined-grid model. The index is computed from the scores
# instead of being copied by hand from a previous output, so it stays right on re-runs.
para_pair_new[int(np.argmax(score_new))]

In [None]:
# One row per refined-grid combination; scores are attached by position,
# so `score_new` must be in the same order as `para_pair_new`.
para_pair_df_new = pd.DataFrame(para_pair_new, columns=['epoch_para', 'lr_para', 'ws_para'])
para_pair_df_new['score'] = score_new

In [None]:
# Persist the refined-grid results of the current time window.
performance_file_new = 'performance_{}_2.csv'.format(time)
para_pair_df_new.to_csv(performance_file_new)

In [None]:
# Guess: when the time window is longer, the optimal window size (ws) should be larger, following the length of the notes.
# Not really: the best ws is 5.