## !!! Baseline CNN model !!!
This is using a 2d convolution

In [1]:
import numpy as np
import pandas as pd
import torch 
import torch.nn as nn

import random
import os

In [2]:
seed = 99
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

### Loading data
* Re-using the saved pre-processed data of patients grouped by visits. 

In [3]:
X = torch.load("processed-data/p_x.pt")
Y = torch.load("processed-data/p_y.pt")

In [4]:
print(len(Y))
print(len(X))
print(set(Y))

184123
184123
{'N180', 'I120', 'I10', 'N039', 'N188', 'N083', 'E102', 'N189', 'E142', 'N088'}


In [5]:
print(X.shape)
print(len(Y))

(184123, 40, 42)
184123


### Number of patients in each target class

In [6]:
for l in set(Y):
    print(f"{l}:", len(np.where(np.array(Y) == l)[0]))

N180: 178540
I120: 1189
I10: 360
N039: 316
N188: 307
N083: 314
E102: 100
N189: 2690
E142: 178
N088: 129


### Converting target lables to one-hot encoding 

In [7]:
y_labels = list(set(Y))
Y_oh = np.zeros((len(Y), len(y_labels)))
for idx, y in enumerate(Y):
    Y_oh[idx][y_labels.index(y)] = 1

In [8]:
print(Y_oh.shape)
Y_oh

(184123, 10)


array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

### Converting patient data (X) and target one-hot encoded data (Y_oh) as tensors to build the model

In [9]:
Xt = torch.tensor(X)
Yt = torch.tensor(Y_oh)
Xt = Xt.type(torch.FloatTensor)
Yt = Yt.type(torch.LongTensor)
print("Xt shape:", Xt.shape)
print("Yt shape:", Yt.shape)

Xt shape: torch.Size([184123, 40, 42])
Yt shape: torch.Size([184123, 10])


### A custom dataset to load data. Data is shuffled by dataloader for training 

In [10]:
from torch.utils.data import Dataset
from torch.utils.data import random_split

class NephrologyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.n = len(X)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x = self.X[idx]
        y = self.y[idx]
        
        return x, y

    def get_splits(self, n_test=0.2):
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        
        return random_split(self, [train_size, test_size])

In [11]:
from torch.utils.data import DataLoader

dataset = NephrologyDataset(Xt, Yt)
train, test = dataset.get_splits()
train_dl = DataLoader(train, batch_size=32, shuffle=True)
test_dl = DataLoader(test, batch_size=32, shuffle=False)
print("# of train batches:", len(train_dl))
print("# of test batches:", len(test_dl))

# of train batches: 4604
# of test batches: 1151


In [12]:
train_iter = iter(train_dl)
x, y = next(train_iter)

print('Shape of a batch x:', x.shape)
print('Shape of a batch y:', y.shape)

Shape of a batch x: torch.Size([32, 40, 42])
Shape of a batch y: torch.Size([32, 10])


### CNN Model

In [13]:
class NephrologyCNN(nn.Module):
    def __init__(self):
        super(NephrologyCNN, self).__init__()
        self.conv = nn.Conv2d(1, 128, (3, 6), stride=2)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d((3, 6), stride=2)
                
        self.fc = nn.Linear(8064, 10)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = x[:, None, :]

        x = self.conv(x)
        x = self.relu(x)
        x = self.maxpool(x)

        f_out = x.flatten(start_dim=1)
        x_out = self.fc(f_out)

        y = self.softmax(x_out)
        
        return y

In [14]:
model = NephrologyCNN()

In [15]:
from sklearn.metrics import accuracy_score, \
precision_recall_fscore_support, recall_score, f1_score
import warnings
warnings.filterwarnings("ignore")

def model_metrics(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred)
    p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    return acc, p, r, f1

In [16]:
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.01)

In [17]:
from numpy import vstack
from numpy import argmax

def evaluate(model, dl):
    model.eval()
    all_y_pred, all_y_true = list(), list()
    for x, y in dl:
        y_hat = model(x)

        y_true = y.type(torch.FloatTensor)
        y_pred = y_hat
        
        y_pred = (y_pred > 0.5).type(torch.FloatTensor)

        all_y_pred.append(y_pred)
        all_y_true.append(y_true)
    all_y_pred, all_y_true = vstack(all_y_pred), vstack(all_y_true)
    acc, p, r, f1 = model_metrics(all_y_true.flatten(), all_y_pred.flatten())
    print(f"acc: {acc:.4f}, precision: {p:.4f}, recall: {r:.4f}, f1: {f1:.4f}")  

In [18]:
def train():
    n_epochs = 10
    model.train()
    train_loss_arr = []
    for epoch in range(n_epochs):
        train_loss = 0
        for x, y in train_dl:
            optimizer.zero_grad()
            y_hat = model(x)

            y_true = y.type(torch.FloatTensor)
            y_pred = y_hat
            
            loss = criterion(y_pred, y_true)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / len(train_dl)
        print('Epoch: {} \tTraining Loss: {:.4f}'.format(epoch, train_loss))
    evaluate(model, test_dl)

### Training and evaluating the model 

In [19]:
%%time
train()

Epoch: 0 	Training Loss: 1.4913
Epoch: 1 	Training Loss: 1.4911
Epoch: 2 	Training Loss: 1.4911
Epoch: 3 	Training Loss: 1.4911
Epoch: 4 	Training Loss: 1.4911
Epoch: 5 	Training Loss: 1.4911
Epoch: 6 	Training Loss: 1.4911
Epoch: 7 	Training Loss: 1.4911
Epoch: 8 	Training Loss: 1.4911
Epoch: 9 	Training Loss: 1.4911
acc: 0.9937, precision: 0.9824, recall: 0.9824, f1: 0.9824
CPU times: user 26min 47s, sys: 2min 8s, total: 28min 56s
Wall time: 15min 16s
