In [None]:
import sys,os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from google.colab import drive, files
import pickle as pickle


In [None]:
# Mount Google Drive at /content/drive (Colab helper imported from google.colab).
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
# Prefer the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Only announce the choice when this code runs as the main program.
if __name__ == '__main__':
    print('Using device:', device)

Using device: cuda


In [None]:
DATA_PATH = '/content/drive/My Drive/BiteNetProject/data_processing/'

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at a file you produced yourself or otherwise trust.
# The context manager closes the file handle as soon as loading finishes
# (the previous bare open() call never closed it).
with open(os.path.join(DATA_PATH, 'data.pkl'), 'rb') as data_file:
    data = pickle.load(data_file)


In [None]:
## per-patient visit sequences (field 2 of every record); each visit is a list of medical codes
seqs = [record[2] for record in data]

## per-patient readmission target (field 4 of every record)
readmission = [record[4] for record in data]

## vocabulary size: the largest code id found in any visit, plus one
num_codes = max(
    code
    for visits in seqs
    for visit in visits
    for code in visit
) + 1

print(num_codes)

## every patient must have exactly one label
assert len(seqs) == len(readmission)

3874


In [None]:
from torch.utils.data import Dataset


class CustomDataset(Dataset):
    """Pairs each entry of ``seqs`` with the entry of ``readmission`` at the same position."""

    def __init__(self, seqs, readmission):
        """Keep references to the inputs as ``self.x`` and the targets as ``self.y``."""
        self.x, self.y = seqs, readmission

    def __len__(self):
        """Number of samples, taken from the input container."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the ``(input, target)`` tuple stored at ``index``."""
        sample = self.x[index]
        target = self.y[index]
        return sample, target
        
# NOTE: this rebinds `data` from the raw pickled records to the Dataset wrapper;
# cells below that use `data` see the CustomDataset, not the original list.
data = CustomDataset(seqs, readmission)


In [None]:
from torch.utils.data.dataset import random_split

# A dedicated, seeded generator makes the split membership identical on every
# run (random_split otherwise draws from torch's unseeded global RNG).
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)

# Hold out 20% of all samples as the test set.
train_test_split = int(len(data)*0.8)
lengths = [train_test_split, len(data) - train_test_split]
train_data, test_data = random_split(data, lengths, generator=split_generator)

# Split the remaining 80% evenly into a training half and a validation half.
train_val_split = int(len(train_data)*0.5)
lengths = [train_val_split, len(train_data) - train_val_split]
train_data, val_data = random_split(train_data, lengths, generator=split_generator)

print("Length of train dataset:", len(train_data))
print("Length of val dataset:", len(val_data))
print("Length of test dataset:", len(test_data))



<torch.utils.data.dataset.Subset object at 0x7ff3e0e83b10>
Length of train dataset: 2998
Length of val dataset: 2998
Length of test dataset: 1500


In [None]:
def collate_fn(data):
    """Collate ``(sequence, label)`` samples into a pair of batch tensors.

    Each sequence is a list of visits and each visit is a list of integer
    codes. The batch tensor is sized from the longest patient and the longest
    visit in the batch, and shorter entries are right-padded with 0, so ragged
    batches work. For a batch where every patient and visit already has the
    same size the result is the plain stacked tensor.

    Args:
        data: non-empty iterable of ``(sequence, label)`` pairs.

    Returns:
        x: ``torch.long`` tensor of shape (patients, max visits, max codes).
        y: ``torch.float`` tensor of shape (patients,) holding the labels.

    Raises:
        ValueError: if ``data`` is empty (nothing to unzip).
    """
    sequences, labels = zip(*data)

    num_patients = len(sequences)
    max_visits = max(len(patient) for patient in sequences)
    # default=0 covers the corner case where no patient has any visit at all.
    max_codes = max(
        (len(visit) for patient in sequences for visit in patient),
        default=0,
    )

    x = torch.zeros((num_patients, max_visits, max_codes), dtype=torch.long)
    y = torch.tensor(labels, dtype=torch.float)

    for i_patient, patient in enumerate(sequences):
        for j_visit, visit in enumerate(patient):
            # Fill only the leading slots; the tail keeps its zero padding.
            x[i_patient, j_visit, :len(visit)] = torch.tensor(visit, dtype=torch.long)

    return x, y



In [None]:
from torch.utils.data import DataLoader

def load_data(train_data, val_data, test_data, collate_fn, batch_size=32):
    """Wrap the three dataset splits in DataLoaders.

    Args:
        train_data: dataset used for training; its loader shuffles every epoch.
        val_data: dataset used for validation; iterated in order.
        test_data: dataset used for testing; iterated in order.
        collate_fn: callable that turns a list of samples into one batch.
        batch_size: samples per batch for all three loaders (default 32,
            the value that used to be hard-coded).

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(dataset=val_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    return train_loader, val_loader, test_loader


# Build the loaders for the current train/val/test splits using collate_fn for batching.
train_loader, val_loader, test_loader = load_data(train_data, val_data, test_data, collate_fn)






In [None]:
def get_last_visit_state(x, hidden_states):
    """Pick, for every patient, the hidden state at the last non-empty visit.

    A visit counts as non-empty when the sum of its codes is non-zero. The
    selected position is ``(number of non-empty visits) - 1``, so a patient
    with no non-empty visit gets index -1, which selects the final time step.

    The computation is fully tensorised and stays on the devices of its
    inputs (the index used to be allocated on the CPU unconditionally).

    Args:
        x: integer code tensor, reduced over dim 2 (codes) and then dim 1
            (visits); e.g. (32, 10, 39) per the notebook's shape notes.
        hidden_states: per-visit states indexed as [patient, visit, :];
            e.g. (32, 10, 128).

    Returns:
        last_visit_state: ``hidden_states[p, index[p], :]`` for each patient p.
        index: int64 tensor with one selected visit position per patient.
    """
    # True where the visit holds at least one non-zero code sum.
    visit_is_real = torch.sum(x, 2) != 0

    # Summing booleans yields int64 counts; minus one turns a count into a position.
    index = torch.sum(visit_is_real, 1) - 1

    target_device = hidden_states.device
    batch_positions = torch.arange(hidden_states.shape[0], device=target_device)
    last_visit_state = hidden_states[batch_positions, index.to(target_device), :]

    return last_visit_state, index





In [None]:
def mask_sum(new_x, original_x):
    """Sum code embeddings per visit, zeroing visits that are entirely padding.

    A visit is treated as padding when the sum of its codes in ``original_x``
    is 0; all of its embeddings are multiplied by 0 before the reduction.
    Visits with a non-zero code sum are summed unchanged (including the
    embeddings of any zero-valued code slots inside them).

    The mask is built with one broadcasted comparison instead of a Python loop
    over every (patient, visit) pair, and it is created on ``new_x``'s device.

    Args:
        original_x: integer codes, (patients, visits, codes), e.g. (32, 10, 39).
        new_x: embedded codes, (patients, visits, codes, dim), e.g. (32, 10, 39, 128).

    Returns:
        Tensor of shape (patients, visits, dim), e.g. (32, 10, 128).
    """
    visit_is_real = torch.sum(original_x, 2) != 0
    # Shape (patients, visits, 1, 1) broadcasts across the code and embedding axes.
    mask = visit_is_real.to(device=new_x.device, dtype=new_x.dtype)[:, :, None, None]

    return torch.sum(new_x * mask, 2)


In [None]:
class NaiveRNN(nn.Module):
    """GRU baseline that maps a padded batch of visit codes to one probability per patient.

    Pipeline: embed every code (dim 128) -> sum the embeddings of each visit
    (``mask_sum``) -> single-layer GRU over visits -> hidden state at the last
    non-empty visit (``get_last_visit_state``) -> dropout -> linear -> sigmoid.

    Args:
        num_codes: size of the embedding table (largest code id + 1).
        dropout: drop probability applied to the selected hidden state before
            the linear layer. It is no longer passed to ``nn.GRU``: on a
            single-layer GRU that argument has no effect and only triggers a
            warning. In eval mode the layer is the identity.
    """

    def __init__(self, num_codes, dropout = 0.5):
        super().__init__()

        self.embedding_medcode = nn.Embedding(num_embeddings=num_codes, embedding_dim=128)
        self.rnn_medcode = nn.GRU(128, hidden_size=128, bidirectional=False, batch_first=True)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return a 1-D tensor with one sigmoid probability per patient.

        Args:
            x: integer codes, (batch, visits, codes), e.g. (32, 10, 39).
        """
        original_x = x                     # kept to locate padded visits later
        batch_size = x.shape[0]

        x = self.embedding_medcode(x)      # (batch, visits, codes, 128)
        x = mask_sum(x, original_x)        # (batch, visits, 128)

        rnn_medcode_output, _ = self.rnn_medcode(x)   # (batch, visits, 128)

        # The GRU's own final state would include padded visits, so select the
        # output at each patient's last non-empty visit instead.
        rnn_medcode_last_hs, _ = get_last_visit_state(original_x, rnn_medcode_output)

        logits = self.fc(self.dropout(rnn_medcode_last_hs))   # (batch, 1)
        probs = self.sigmoid(logits)

        return probs.view(batch_size)


# Instantiate the baseline; the bare `model` expression makes the cell display its repr.
model = NaiveRNN(num_codes = num_codes)
model


  "num_layers={}".format(dropout, num_layers))


NaiveRNN(
  (embedding_medcode): Embedding(3874, 128)
  (rnn_medcode): GRU(128, 128, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
import torch.optim as optim

# Binary cross-entropy on probabilities (NaiveRNN.forward already applies the sigmoid).
criterion = nn.BCELoss()
# Adam bound to the parameters of `model` only; train() reads both of these module-level names.
optimizer = optim.Adam(model.parameters(), lr=0.001)



In [None]:
from sklearn.utils.validation import indexable
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve, auc

def eval_model(model, loader):
    """Compute the area under the precision-recall curve of ``model`` on ``loader``.

    The model is switched to eval mode (and left there) and the forward passes
    run under ``torch.no_grad()`` so no autograd graph is built. Per-batch
    scores and labels are collected in lists and concatenated once.

    Args:
        model: callable module returning one score per sample for a batch ``x``.
        loader: iterable of ``(x, y)`` batches.

    Returns:
        PR-AUC as computed by ``auc(recall, precision)``.
    """
    model.eval()

    score_batches, label_batches = [], []
    with torch.no_grad():
        for x, y in loader:
            y_hat = model(x)
            score_batches.append(y_hat.detach().to('cpu'))
            label_batches.append(y.detach().to('cpu'))

    # Empty tensors mirror the initial accumulators used when the loader yields nothing.
    y_score = torch.cat(score_batches, dim=0) if score_batches else torch.Tensor()
    y_true = torch.cat(label_batches, dim=0) if label_batches else torch.LongTensor()

    precision, recall, thresholds = precision_recall_curve(y_true, y_score)
    pr_auc = auc(recall, precision)

    return pr_auc

In [None]:
def train(model, train_loader, val_loader, n_epochs, print_train_results=True,
          optimizer=None, criterion=None):
    """Train ``model`` for ``n_epochs`` and evaluate on ``val_loader`` after every epoch.

    Args:
        model: module to optimise.
        train_loader: iterable of ``(x, y)`` training batches; must support ``len()``.
        val_loader: loader handed to ``eval_model`` after each epoch.
        n_epochs: number of passes over ``train_loader``.
        print_train_results: when truthy, print the mean training loss and the
            validation PR-AUC for every epoch.
        optimizer: optimizer to step. When omitted, the module-level
            ``optimizer`` is used, which keeps existing calls working; pass it
            explicitly to be sure it is bound to ``model``'s parameters.
        criterion: loss called as ``criterion(y_hat, y)``. When omitted, the
            module-level ``criterion`` is used.

    Raises:
        NameError: if an omitted ``optimizer``/``criterion`` is not defined at
            module level either.
        ZeroDivisionError: if ``train_loader`` has length 0 and ``n_epochs > 0``.
    """
    module_scope = globals()
    try:
        optimizer = module_scope['optimizer'] if optimizer is None else optimizer
        criterion = module_scope['criterion'] if criterion is None else criterion
    except KeyError as missing:
        raise NameError(
            "name {} is not defined; pass it to train() explicitly".format(missing)
        ) from None

    for epoch in range(n_epochs):
        model.train()   # eval_model leaves the model in eval mode, so reset each epoch
        train_loss = 0
        for x, y in train_loader:
            optimizer.zero_grad()
            y_hat = model(x)
            loss = criterion(y_hat, y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / len(train_loader)   # mean loss per batch

        if print_train_results:
            print('Epoch: {} \t Training Loss: {:.6f}'.format(epoch+1, train_loss))
        pr_auc = eval_model(model, val_loader)
        if print_train_results:
            print('Epoch: {} \t Validation pr_auc:{:.3f}'.format(epoch+1, pr_auc))
      


In [None]:
# Train `model` for 10 epochs; train() prints the mean loss and validation PR-AUC per epoch by default.
n_epochs = 10
train(model, train_loader, val_loader, n_epochs)

Epoch: 1 	 Training Loss: 0.507619
Epoch: 1 	 Validation pr_auc:0.213
Epoch: 2 	 Training Loss: 0.503174
Epoch: 2 	 Validation pr_auc:0.237
Epoch: 3 	 Training Loss: 0.503799
Epoch: 3 	 Validation pr_auc:0.212
Epoch: 4 	 Training Loss: 0.499537
Epoch: 4 	 Validation pr_auc:0.241
Epoch: 5 	 Training Loss: 0.500588
Epoch: 5 	 Validation pr_auc:0.212
Epoch: 6 	 Training Loss: 0.501210
Epoch: 6 	 Validation pr_auc:0.209
Epoch: 7 	 Training Loss: 0.499407
Epoch: 7 	 Validation pr_auc:0.211
Epoch: 8 	 Training Loss: 0.501604
Epoch: 8 	 Validation pr_auc:0.251
Epoch: 9 	 Training Loss: 0.499128
Epoch: 9 	 Validation pr_auc:0.223
Epoch: 10 	 Training Loss: 0.499701
Epoch: 10 	 Validation pr_auc:0.236


In [None]:
def test(model, data, test_number):
    """Evaluate ``model`` on ``data`` and print its PR-AUC.

    Args:
        model: trained model to evaluate.
        data: loader of ``(x, y)`` batches to score. This argument is now
            actually used; previously the function ignored it and read the
            module-level ``test_loader`` instead.
        test_number: zero-based index of the run; printed one-based.
    """
    pr_auc = eval_model(model, data)
    print('Test number: {} \t test pr_auc:{:.3f}'
          .format(test_number+1, pr_auc))
      


In [None]:
test_number = 3
n_epochs = 10
BASE_SEED = 0

# Repeat the whole experiment on fresh random splits with a fresh model each time.
for i in range(test_number):
  # A distinct but fixed seed per repetition: torch's global RNG drives
  # random_split, parameter initialisation and shuffling, so every repetition
  # is reproducible while still differing from the others.
  torch.manual_seed(BASE_SEED + i)

  # 80% train+val / 20% test.
  train_test_split = int(len(data)*0.8)
  lengths = [train_test_split, len(data) - train_test_split]
  train_data, test_data = random_split(data, lengths)

  # Split the 80% evenly into train and validation.
  train_val_split = int(len(train_data)*0.5)
  lengths = [train_val_split, len(train_data) - train_val_split]
  train_data, val_data = random_split(train_data, lengths)

  train_loader, val_loader, test_loader = load_data(train_data, val_data, test_data, collate_fn)

  newmodel = NaiveRNN(num_codes = num_codes)
  # train() reads the module-level `criterion` and `optimizer`, so they must be
  # rebound to the new model before every call.
  criterion = nn.BCELoss()
  optimizer = optim.Adam(newmodel.parameters(), lr=0.001)

  train(newmodel, train_loader, val_loader, n_epochs,print_train_results=False)
  test(newmodel, test_loader, i)

  "num_layers={}".format(dropout, num_layers))


Test number: 1 	 test pr_auc:0.198


  "num_layers={}".format(dropout, num_layers))


Test number: 2 	 test pr_auc:0.205


  "num_layers={}".format(dropout, num_layers))


Test number: 3 	 test pr_auc:0.206
