## !!! Baseline RNN Model !!!

In [20]:
import numpy as np
import pandas as pd
import torch 
import torch.nn as nn

import random
import os

In [21]:
seed = 99
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

### Loading data
* Re-using the saved pre-processed data of patients grouped by visits. 

In [22]:
X = torch.load("processed-data/p_x.pt")
Y = torch.load("processed-data/p_y.pt")

In [23]:
print(len(Y))
print(len(X))
print(set(Y))

184123
184123
{'I10', 'I120', 'E142', 'N189', 'N188', 'N039', 'N083', 'N180', 'N088', 'E102'}


In [24]:
print(X.shape)
print(len(Y))

(184123, 40, 42)
184123


### Number of patients in each target class

In [25]:
for l in set(Y):
    print(f"{l}:", len(np.where(np.array(Y) == l)[0]))

I10: 360
I120: 1189
E142: 178
N189: 2690
N188: 307
N039: 316
N083: 314
N180: 178540
N088: 129
E102: 100


### Converting target lables to one-hot encoding 

In [26]:
y_labels = list(set(Y))
Y_oh = np.zeros((len(Y), len(y_labels)))
for idx, y in enumerate(Y):
    Y_oh[idx][y_labels.index(y)] = 1

In [27]:
print(Y_oh.shape)
Y_oh

(184123, 10)


array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

### Converting patient data (X) and target one-hot encoded data (Y_oh) as tensors to build the model

In [28]:
Xt = torch.tensor(X)
Yt = torch.tensor(Y_oh)
Xt = Xt.type(torch.FloatTensor)
Yt = Yt.type(torch.LongTensor)
print("Xt shape:", Xt.shape)
print("Yt shape:", Yt.shape)

Xt shape: torch.Size([184123, 40, 42])
Yt shape: torch.Size([184123, 10])


### A custom dataset to load data. Data is shuffled by dataloader for training 

In [29]:
from torch.utils.data import Dataset
from torch.utils.data import random_split

class NephrologyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.n = len(X)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x = self.X[idx]
        y = self.y[idx]
        
        return x, y

    def get_splits(self, n_test=0.2):
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        
        return random_split(self, [train_size, test_size])

In [30]:
from torch.utils.data import DataLoader

dataset = NephrologyDataset(Xt, Yt)
train, test = dataset.get_splits()
train_dl = DataLoader(train, batch_size=32, shuffle=True)
test_dl = DataLoader(test, batch_size=32, shuffle=False)
print("# of train batches:", len(train_dl))
print("# of test batches:", len(test_dl))

# of train batches: 4604
# of test batches: 1151


In [31]:
train_iter = iter(train_dl)
x, y = next(train_iter)

print('Shape of a batch x:', x.shape)
print('Shape of a batch y:', y.shape)

Shape of a batch x: torch.Size([32, 40, 42])
Shape of a batch y: torch.Size([32, 10])


### RNN Model

In [32]:
from torch.nn import Module
from torch import nn

class NephrologyRNN(Module):
    def __init__(self):
        super(NephrologyRNN, self).__init__()
        self.rnn = nn.RNN(input_size=42, hidden_size=10, num_layers=1, batch_first=True)

        self.softmax = nn.Softmax(dim=1)
 
    def forward(self, x):
        out, h_n = self.rnn(x)
        h_n = h_n.squeeze()

        h_n = self.softmax(h_n)
        return h_n

In [33]:
model = NephrologyRNN()
print(model)

NephrologyRNN(
  (rnn): RNN(42, 10, batch_first=True)
  (softmax): Softmax(dim=1)
)


In [34]:
from sklearn.metrics import accuracy_score, \
precision_recall_fscore_support, recall_score, f1_score
import warnings
warnings.filterwarnings("ignore")

def model_metrics(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred)
    p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    return acc, p, r, f1

In [35]:
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.01)

In [36]:
from numpy import vstack
from numpy import argmax

def evaluate(model, dl):
    model.eval()
    all_y_pred, all_y_true = list(), list()
    for x, y in dl:
        y_hat = model(x)

        y_true = y
        y_pred = y_hat
        
        y_pred = (y_pred > 0.5).type(torch.FloatTensor)

        all_y_pred.append(y_pred)
        all_y_true.append(y_true)
    all_y_pred, all_y_true = vstack(all_y_pred), vstack(all_y_true)
    acc, p, r, f1 = model_metrics(all_y_true.flatten(), all_y_pred.flatten())
    print(f"acc: {acc:.4f}, precision: {p:.4f}, recall: {r:.4f}, f1: {f1:.4f}")  

In [37]:
def train():
    n_epochs = 10
    model.train()
    train_loss_arr = []
    for epoch in range(n_epochs):
        train_loss = 0
        for x, y in train_dl:
            optimizer.zero_grad()
            y_hat = model(x)

            y_true = y.type(torch.FloatTensor)
            y_pred = y_hat.type(torch.FloatTensor)
            
            loss = criterion(y_pred, y_true)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / len(train_dl)
        print('Epoch: {} \tTraining Loss: {:.4f}'.format(epoch, train_loss))
    evaluate(model, test_dl)

### Training and evaluating the model

In [38]:
%%time
train()

Epoch: 0 	Training Loss: 1.9723
Epoch: 1 	Training Loss: 1.9710
Epoch: 2 	Training Loss: 1.9711
Epoch: 3 	Training Loss: 1.9710
Epoch: 4 	Training Loss: 1.9710
Epoch: 5 	Training Loss: 1.9710
Epoch: 6 	Training Loss: 1.9710
Epoch: 7 	Training Loss: 1.9710
Epoch: 8 	Training Loss: 1.9710
Epoch: 9 	Training Loss: 1.9710
acc: 0.9000, precision: 0.4500, recall: 0.5000, f1: 0.4737
CPU times: user 1min 50s, sys: 1.28 s, total: 1min 52s
Wall time: 1min 52s
