In [None]:
import numpy as np
import pandas as pd
from util import *

# NOTE: the code at lines 55 and 59 was modified.

#from config import Config
from patient_data_reader import PatientReader

# A diagnosis group must occur more than this many times to enter the
# vocabulary (see dump_vocab).
rare_word = 100
# Vocabulary entries occurring more than this many times are also written to
# the stop-word file (see dump_vocab).
stop_word = 1e4
# Index substituted for words that are missing from the vocabulary
# (see convert_format).
unknown = 1

# Tab-separated input table read by dump_vocab, convert_format and
# extract_events.
input_file = '/content/drive/MyDrive/S1_File.txt'
# Vocabulary file written by dump_vocab and read by load_vocab.
vocab_file = '/content/vocab.txt'
# Stop-word list written by dump_vocab.
stop_file = '/content/stopwords.txt'
# Pickle written by load_vocab.
vocab_pkl = '/content/vocab.pkl'


def dump_vocab():
    """Write the vocabulary and stop-word files derived from the input table.

    Counts the occurrences of every ``DX_GROUP_DESCRIPTION`` value in
    ``input_file`` and prints a few summary tables along the way.  Groups
    occurring more than ``rare_word`` times are written to ``vocab_file`` as
    ``index<TAB>word`` lines, ordered by ascending frequency, with indices
    starting at 2.  Of those groups, the ones occurring more than
    ``stop_word`` times are written to ``stop_file``, one per line.
    """
    records = pd.read_csv(input_file, sep='\t', header=0)
    print(records.head(3))

    # One row per diagnosis group with its occurrence count in column SIZE.
    group_sizes = records.groupby('DX_GROUP_DESCRIPTION').size()
    hist = group_sizes.to_frame('SIZE').reset_index()
    print(hist.head(3))

    # Summary statistics: most frequent groups, then how many groups share
    # each occurrence count.
    most_frequent_first = hist.sort_values(by='SIZE', ascending=False)
    print(most_frequent_first.head(3))
    size_counts = hist.groupby('SIZE').size().to_frame('COUNT').reset_index()
    print(size_counts)

    # Drop the rare groups.
    frequent = hist[hist['SIZE'] > rare_word]
    print(frequent)

    # Vocabulary: rarest first, renumbered from 0 and then shifted by 2 so
    # that index 1 stays reserved for the unknown word.
    ordered = frequent.sort_values(by='SIZE').reset_index(drop=True)
    vocab = ordered['DX_GROUP_DESCRIPTION']
    vocab.index = vocab.index + 2
    vocab.to_csv(vocab_file, sep='\t', header=False, index=True)

    # Stop words: the very frequent groups among the kept ones.
    stop_words = frequent.loc[frequent['SIZE'] > stop_word, 'DX_GROUP_DESCRIPTION']
    stop_words.to_csv(stop_file, sep='\t', header=False, index=False)

#####################



import tensorflow as tf
##############################

def load_vocab():
    """Load the vocabulary file into a ``word -> index`` dictionary.

    Reads ``vocab_file`` line by line.  A line in the ``index<TAB>word``
    layout that ``dump_vocab`` writes maps the word to its integer index.
    A line holding a single field is stored as a ``token -> token`` entry,
    which is what this function previously did for every line.

    Side effects: prints the dictionary size and pickles the inverse mapping
    (value -> key) to ``vocab_pkl`` through ``save_pkl``.

    Returns:
        dict: vocabulary word mapped to its index.

    Raises:
        ValueError: if a two-field line does not start with an integer.
    """
    word_to_index = {}
    with tf.io.gfile.GFile(vocab_file, mode='r') as f:
        # readline() returns '' at end of file, which ends the iteration.
        for line in iter(f.readline, ''):
            tokens = line.strip().split('\t')
            if len(tokens) >= 2:
                # Layout written by dump_vocab: index first, then the word.
                word_to_index[tokens[1]] = int(tokens[0])
            else:
                # NOTE(review): single-column vocabulary files carry no
                # index, so the token is mapped to itself as before; callers
                # then receive strings instead of integer indices - confirm
                # whether such files should be supported at all.
                word_to_index[tokens[0]] = tokens[0]
    print('dict size: ' + str(len(word_to_index)))
    save_pkl(vocab_pkl, {v: k for k, v in word_to_index.items()})
    return word_to_index


def convert_format(word_to_index, events):
    """Convert the flat input table into nested per-patient sequences.

    Reads ``input_file`` (tab-separated, first line is the header) and groups
    consecutive rows by ``PID`` and then by ``DAY_ID``.  The rows are assumed
    to be ordered by PID and DAY_ID already; a change in either column closes
    the current group.

    Args:
        word_to_index: dict mapping a ``DX_GROUP_DESCRIPTION`` value to its
            vocabulary index; missing words are encoded as ``unknown``.
        events: event table passed through to ``tag`` to label each day.

    Returns:
        tuple: ``(docs, labels)``.  ``docs`` holds one list per patient, each
        containing one list of word indices per day.  ``labels`` holds one
        list per patient with one ``tag`` result per day.  Both are empty
        when the file has no data rows.
    """
    docs = []
    labels = []
    doc = []      # days collected so far for the current patient
    sent = []     # word indices collected so far for the current day
    label = []    # one label per day of the current patient
    pid = None    # None until the first data row has been seen
    day_id = None

    with open(input_file, mode='r') as f:
        header = f.readline().strip().split('\t')
        print(header)
        pos = {name: column for column, name in enumerate(header)}
        print(pos)
        pid_col = pos['PID']
        day_col = pos['DAY_ID']
        word_col = pos['DX_GROUP_DESCRIPTION']

        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. at the end of the file) carries no record.
                continue
            tokens = stripped.split('\t')
            c_pid = tokens[pid_col]
            c_day_id = tokens[day_col]

            if pid is None:
                # First data row: open the first patient and day.
                pid, day_id = c_pid, c_day_id
                label.append(tag(events, pid, day_id))
            elif c_pid != pid:
                # New patient: close the current day and patient.
                doc.append(sent)
                docs.append(doc)
                labels.append(label)
                sent = []
                doc = []
                pid, day_id = c_pid, c_day_id
                label = [tag(events, pid, day_id)]
            elif c_day_id != day_id:
                # Same patient, new day: close the current day only.
                doc.append(sent)
                sent = []
                day_id = c_day_id
                label.append(tag(events, pid, day_id))

            sent.append(word_to_index.get(tokens[word_col], unknown))

    if pid is None:
        # No data rows at all: nothing to close.
        return docs, labels

    # Close the last open day and patient.
    doc.append(sent)
    docs.append(doc)
    labels.append(label)

    return docs, labels


def split_data(docs, labels, n_train=2800, n_valid=100,
               out_dir='/content/drive/MyDrive/Redrabbit RPDCECC/Data Preparation'):
    """Split the converted data into train/validation/test pickles.

    The first ``n_train`` patients form the training set, the next
    ``n_valid`` the validation set and everything after that the test set.
    Each part is saved with ``save_pkl`` as ``X_<part>.pkl`` (from ``docs``)
    and ``Y_<part>.pkl`` (from ``labels``) inside ``out_dir``.

    Only the sizes of ``docs`` and ``labels`` are printed; printing the full
    nested lists overflows the notebook's output rate limit.

    Args:
        docs: per-patient sequences, as returned by ``convert_format``.
        labels: per-patient label lists, aligned with ``docs``.
        n_train: number of leading patients in the training set.
        n_valid: number of following patients in the validation set.
        out_dir: directory the six pickle files are written to.
    """
    # TODO: YY
    print(len(docs))
    print(len(labels))

    valid_end = n_train + n_valid
    parts = (
        ('train', slice(None, n_train)),
        ('valid', slice(n_train, valid_end)),
        ('test', slice(valid_end, None)),
    )
    for part_name, part in parts:
        save_pkl(out_dir + '/X_' + part_name + '.pkl', docs[part])
        save_pkl(out_dir + '/Y_' + part_name + '.pkl', labels[part])


def extract_events():
    """Return the 'INPATIENT HOSPITAL' events found in the input table.

    Reads ``input_file``, keeps the rows whose ``SERVICE_LOCATION`` equals
    the target event and counts them per (PID, DAY_ID, SERVICE_LOCATION).

    Returns:
        pandas.DataFrame: indexed by ``PID`` with columns ``DAY_ID``,
        ``SERVICE_LOCATION`` and ``COUNT``, sorted by PID and DAY_ID.
    """
    target_event = 'INPATIENT HOSPITAL'

    records = pd.read_csv(input_file, sep='\t', header=0)
    is_target = records['SERVICE_LOCATION'] == target_event
    inpatient = records[is_target]

    # Collapse duplicate rows into one row per patient/day with a count.
    per_day = inpatient.groupby(['PID', 'DAY_ID', 'SERVICE_LOCATION']).size()
    events = per_day.to_frame('COUNT').reset_index()
    events = events.sort_values(by=['PID', 'DAY_ID'], ascending=True)

    return events.set_index('PID')


def tag(events, pid, day_id):
    """Return 1 when ``tag_logic`` is truthy for this patient/day, else 0."""
    if tag_logic(events, pid, day_id):
        return 1
    return 0


def tag_logic(events, pid, day_id, window=30):
    """Tell whether a patient has an event in the window starting at a day.

    Args:
        events: DataFrame indexed by integer PID with a ``DAY_ID`` column.
        pid: patient id; converted with ``int``.
        day_id: first day of the window; converted with ``int``.
        window: window length in days; the window is the half-open range
            ``[day_id, day_id + window)``.

    Returns:
        bool: True if at least one event of the patient has a ``DAY_ID``
        inside the window; False otherwise, including when the patient has
        no events at all.
    """
    window_start = int(day_id)
    window_end = window_start + window

    try:
        patient = events.loc[int(pid)]
    except KeyError:
        # The patient id is not in the index: no events for this patient.
        return False

    event_days = patient.DAY_ID
    if isinstance(patient, pd.Series):
        # A single matching row comes back as a Series, so DAY_ID is a scalar.
        return bool(window_start <= event_days < window_end)

    in_window = (window_start <= event_days) & (event_days < window_end)
    return bool(in_window.any())


def main():
    """Run the preprocessing pipeline: vocabulary, events, conversion, split."""
    # dump_vocab() is left disabled here; presumably vocab_file already
    # exists when this runs - TODO confirm.
    # dump_vocab()
    vocabulary = load_vocab()
    inpatient_events = extract_events()

    split_data(*convert_format(vocabulary, inpatient_events))


if __name__ == '__main__':
    main()

dict size: 30522
 [*] save /content/vocab.pkl
['PID', 'DAY_ID', 'DX_GROUP_DESCRIPTION', 'SERVICE_LOCATION', 'OP_DATE']
{'PID': 0, 'DAY_ID': 1, 'DX_GROUP_DESCRIPTION': 2, 'SERVICE_LOCATION': 3, 'OP_DATE': 4}


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 [*] save /content/drive/MyDrive/Redrabbit RPDCECC/Data Preparation/X_train.pkl
 [*] save /content/drive/MyDrive/Redrabbit RPDCECC/Data Preparation/Y_train.pkl
 [*] save /content/drive/MyDrive/Redrabbit RPDCECC/Data Preparation/X_valid.pkl
 [*] save /content/drive/MyDrive/Redrabbit RPDCECC/Data Preparation/Y_valid.pkl
 [*] save /content/drive/MyDrive/Redrabbit RPDCECC/Data Preparation/X_test.pkl
 [*] save /content/drive/MyDrive/Redrabbit RPDCECC/Data Preparation/Y_test.pkl


In [None]:
def prepare_data(seqs, labels, vocabsize, maxlen=None):
    """Build padded multi-hot matrices from per-patient visit sequences.

    Every patient is padded to the same number of visits: the length of the
    longest sequence after optional truncation.  When ``maxlen`` is given,
    sequences (and their labels) with ``maxlen`` or more visits are cut to
    their first ``maxlen`` visits.

    Args:
        seqs: list of patients; each patient is a list of visits and each
            visit a list of 1-based word indices.
        labels: list of per-patient label lists, one label per visit.
        vocabsize: width of the multi-hot vector; word index ``k`` sets
            column ``k - 1``.
        maxlen: optional maximum number of visits kept per patient.

    Returns:
        tuple: ``(x, x_mask, y, lengths, eventLengths)`` where
        ``x`` is an int64 array of shape (n_samples, max_visits, vocabsize),
        ``x_mask`` is a float64 array of shape (n_samples, max_visits) with
        1 on real visits and 0 on padding,
        ``y`` is an int64 array of shape (n_samples, max_visits) holding the
        labels, zero-padded,
        ``lengths`` lists the number of visits per patient after truncation,
        and ``eventLengths`` lists the total number of codes per patient
        before truncation.
        When there is nothing to encode, the three arrays are ``None``.
    """
    # Total number of codes per patient, counted before any truncation.
    eventLengths = [sum(len(visit) for visit in seq) for seq in seqs]

    if maxlen is not None:
        clipped = [
            (s, la) if len(s) < maxlen else (s[:maxlen], la[:maxlen])
            for s, la in zip(seqs, labels)
        ]
        seqs = [s for s, _ in clipped]
        labels = [la for _, la in clipped]

    lengths = [len(s) for s in seqs]

    if not lengths:
        # Same arity as the normal return so callers can always unpack five
        # values; max() below would fail on an empty list.
        return None, None, None, lengths, eventLengths

    n_samples = len(seqs)
    maxlen = max(lengths)

    x = np.zeros((n_samples, maxlen, vocabsize), dtype='int64')
    x_mask = np.zeros((n_samples, maxlen), dtype='float64')
    y = np.zeros((n_samples, maxlen), dtype='int64')
    for idx, visits in enumerate(seqs):
        x_mask[idx, :lengths[idx]] = 1
        for j, visit in enumerate(visits):
            for code in visit:
                # Word indices are 1-based, columns are 0-based.
                x[idx, j, code - 1] = 1
    for idx, t in enumerate(labels):
        y[idx, :lengths[idx]] = t

    return x, x_mask, y, lengths, eventLengths


In [None]:
import numpy as np

In [None]:
import pickle

def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Training split.
datax_train = _load_pickle('/content/drive/MyDrive/Redrabbit RPDCECC/Data Preparation/X_train.pkl')
datay_train = _load_pickle('/content/drive/MyDrive/Redrabbit RPDCECC/Data Preparation/Y_train.pkl')

# Validation split.
datax_valid = _load_pickle('/content/drive/MyDrive/Redrabbit RPDCECC/Data Preparation/X_valid.pkl')
datay_valid = _load_pickle('/content/drive/MyDrive/Redrabbit RPDCECC/Data Preparation/Y_valid.pkl')

# Test split.
datax_test = _load_pickle('/content/drive/MyDrive/Redrabbit RPDCECC/Data Preparation/X_test.pkl')
datay_test = _load_pickle('/content/drive/MyDrive/Redrabbit RPDCECC/Data Preparation/Y_test.pkl')

In [None]:
# Encode each split with a vocabulary width of 100 and at most 50 visits
# per patient.
training = prepare_data(datax_train, datay_train, vocabsize=100, maxlen=50)
testing = prepare_data(datax_test, datay_test, vocabsize=100, maxlen=50)
validation = prepare_data(datax_valid, datay_valid, vocabsize=100, maxlen=50)

In [None]:
# Flatten each sample's (visits, vocabulary) matrix into one feature row.
# The row count comes from the array itself and the column count is inferred,
# because the test split holds every patient after the validation slice and
# prepare_data pads only to the longest sequence actually present.
# NOTE(review): the *_Y targets are the raw per-visit label lists, not the
# padded label matrices returned by prepare_data - confirm this is intended.
train_X = training[0]
train_X = train_X.reshape(train_X.shape[0], -1)
train_Y = datay_train
###################
valid_X = validation[0]
valid_X = valid_X.reshape(valid_X.shape[0], -1)
valid_Y = datay_valid
####################
test_X = testing[0]
test_X = test_X.reshape(test_X.shape[0], -1)
test_Y = datay_test