## !!! Baseline MLP model !!!

In [4]:
import torch 
import numpy as np
import pandas as pd

import random
import os

In [5]:
seed = 99
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

### Loading data
* Using normalized data from the pre-processing step.
* Target classes are encoded using lable encoding 

In [6]:
d_f = pd.read_csv("processed-data/data-normlized.csv.gz")
d_f = d_f.drop(["Unnamed: 0"], axis=1)

In [7]:
d_f.head()

Unnamed: 0,HIC_antibodies,HIV,HbsAg,Age,r_f_d,u_red_r,v_f_amount,Vlm,label,p_id,...,State_PR,State_RJ,State_RN,State_RO,State_RR,State_RS,State_SC,State_SE,State_SP,State_TO
0,0,0,0,0.35,0.25,0.596637,0.553054,0.297873,N180,37378,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0.3375,0.25,0.596637,0.553054,0.297873,N180,37378,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0.3625,0.25,0.596637,0.553054,0.297873,N180,37378,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0.325,0.25,0.596637,0.553054,0.297873,N180,37378,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0.35,0.25,0.596637,0.553054,0.297873,N180,37378,...,1,0,0,0,0,0,0,0,0,0


In [8]:
import warnings
warnings.filterwarnings("ignore")

d_f["label"] = d_f["label"].astype('category')
d_f["label_cat"] = d_f["label"].cat.codes

In [9]:
d_f.head()

Unnamed: 0,HIC_antibodies,HIV,HbsAg,Age,r_f_d,u_red_r,v_f_amount,Vlm,label,p_id,...,State_RJ,State_RN,State_RO,State_RR,State_RS,State_SC,State_SE,State_SP,State_TO,label_cat
0,0,0,0,0.35,0.25,0.596637,0.553054,0.297873,N180,37378,...,0,0,0,0,0,0,0,0,0,7
1,0,0,0,0.3375,0.25,0.596637,0.553054,0.297873,N180,37378,...,0,0,0,0,0,0,0,0,0,7
2,0,0,0,0.3625,0.25,0.596637,0.553054,0.297873,N180,37378,...,0,0,0,0,0,0,0,0,0,7
3,0,0,0,0.325,0.25,0.596637,0.553054,0.297873,N180,37378,...,0,0,0,0,0,0,0,0,0,7
4,0,0,0,0.35,0.25,0.596637,0.553054,0.297873,N180,37378,...,0,0,0,0,0,0,0,0,0,7


In [10]:
df_s = d_f.sample(frac=0.1)
print(len(df_s))
print(len(d_f))
Y = df_s["label_cat"]
X = df_s.drop(['label', 'label_cat', 'p_id'], axis=1)

339338
3393385


In [11]:
X.head()

Unnamed: 0,HIC_antibodies,HIV,HbsAg,Age,r_f_d,u_red_r,v_f_amount,Vlm,Procedure_305010026,Procedure_305010069,...,State_PR,State_RJ,State_RN,State_RO,State_RR,State_RS,State_SC,State_SE,State_SP,State_TO
2220386,0,0,0,0.625,0.25,0.596637,0.553068,0.297873,0,0,...,0,0,0,0,0,0,0,0,0,0
2867700,0,0,0,0.9375,0.25,0.596637,0.553068,0.297878,0,0,...,0,1,0,0,0,0,0,0,0,0
1590571,0,0,0,0.575,0.25,0.596637,0.553068,0.297873,0,0,...,0,0,0,0,0,0,0,0,1,0
59167,0,0,0,0.5375,0.25,0.596637,0.553068,0.297873,0,0,...,0,0,0,0,0,0,0,0,0,0
2138291,0,0,0,0.85,0.25,0.596637,0.553068,0.297874,0,0,...,0,1,0,0,0,0,0,0,0,0


In [12]:
num_classes = len(Y.unique())
print("number of classes:", num_classes)

number of classes: 10


### Converting patient data (X) and target (Y) as tensors to build the model

In [13]:
X = torch.tensor(X.values)
Y = torch.tensor(Y.values)
X = X.type(torch.FloatTensor)
Y = Y.type(torch.LongTensor)
print("X shape:", X.shape)
print("Y shape:", Y.shape)

X shape: torch.Size([339338, 42])
Y shape: torch.Size([339338])


In [14]:
from sklearn.metrics import accuracy_score, \
precision_recall_fscore_support, recall_score, f1_score

def model_metrics(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred)
    p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    return acc, p, r, f1

### Custom dataset

In [15]:
from torch.utils.data import Dataset
from torch.utils.data import random_split

class NephrologyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    def get_splits(self, n_test=0.33):
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        
        return random_split(self, [train_size, test_size])

### MLP model

In [16]:
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Softmax
from torch.nn import Module

class MLP(Module):
    def __init__(self, n_inputs):
        super(MLP, self).__init__()
        self.hidden1 = Linear(n_inputs, 64)
        self.act1 = ReLU()

        self.hidden2 = Linear(64, 16)
        self.act2 = ReLU()

        self.hidden3 = Linear(16, 10)
        self.act3 = Softmax(dim=1)
 
    def forward(self, X):
        X = self.hidden1(X)
        X = self.act1(X)
        X = self.hidden2(X)
        X = self.act2(X)
        X = self.hidden3(X)
        X = self.act3(X)
        return X

model = MLP(42)
print(model)

MLP(
  (hidden1): Linear(in_features=42, out_features=64, bias=True)
  (act1): ReLU()
  (hidden2): Linear(in_features=64, out_features=16, bias=True)
  (act2): ReLU()
  (hidden3): Linear(in_features=16, out_features=10, bias=True)
  (act3): Softmax(dim=1)
)


In [17]:
from torch.utils.data import DataLoader

dataset = NephrologyDataset(X, Y)
train, test = dataset.get_splits()
train_dl = DataLoader(train, batch_size=32, shuffle=True)
test_dl = DataLoader(test, batch_size=32, shuffle=False)
print("# of train batches:", len(train_dl))
print("# of test batches:", len(test_dl))

# of train batches: 7105
# of test batches: 3500


In [18]:
train_iter = iter(train_dl)
x, y = next(train_iter)

print('Shape of a batch x:', x.shape)
print('Shape of a batch y:', y.shape)

Shape of a batch x: torch.Size([32, 42])
Shape of a batch y: torch.Size([32])


In [19]:
from torch.optim import SGD
from torch.nn import CrossEntropyLoss

criterion = CrossEntropyLoss()
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)

In [20]:
from numpy import vstack
from numpy import argmax

def evaluate(model, dl):
    model.eval()
    all_y_pred, all_y_true = list(), list()
    for x, y in dl:
        y_pred = model(x)
        y_pred = y_pred.detach().numpy()
        y_true = y.numpy()
        y_pred = argmax(y_pred, axis=1)
        y_true = y_true.reshape((len(y_true), 1))
        y_pred = y_pred.reshape((len(y_pred), 1))
        
        all_y_pred.append(y_pred)
        all_y_true.append(y_true)
    all_y_pred, all_y_true = vstack(all_y_pred), vstack(all_y_true)
    acc, p, r, f1 = model_metrics(all_y_true, all_y_pred)
    print(f"acc: {acc:.4f}, precision: {p:.4f}, recall: {r:.4f}, f1: {f1:.4f}")           

In [21]:
def train():
    n_epochs = 10
    model.train()
    train_loss_arr = []
    for epoch in range(n_epochs):
        train_loss = 0
        for x, y in train_dl:
            optimizer.zero_grad()
            y_pred = model(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / len(train_dl)
        print('Epoch: {} \tTraining Loss: {:.4f}'.format(epoch, train_loss))
    evaluate(model, test_dl)

### Training and evaluating the model

In [22]:
%%time
train()

Epoch: 0 	Training Loss: 1.4887
Epoch: 1 	Training Loss: 1.4788
Epoch: 2 	Training Loss: 1.4788
Epoch: 3 	Training Loss: 1.4788
Epoch: 4 	Training Loss: 1.4788
Epoch: 5 	Training Loss: 1.4788
Epoch: 6 	Training Loss: 1.4788
Epoch: 7 	Training Loss: 1.4788
Epoch: 8 	Training Loss: 1.4788
Epoch: 9 	Training Loss: 1.4788
acc: 0.9829, precision: 0.9661, recall: 0.9829, f1: 0.9744
CPU times: user 49.5 s, sys: 2.46 s, total: 51.9 s
Wall time: 50.7 s
