In [22]:
import torch 
import numpy as np
import pandas as pd

In [23]:
data = pd.read_csv("data-1649721068217.csv.gz")

In [24]:
data.rename(columns={"an_hcv": "HIC_antibodies",
                    "an_hiv": "HIV",
                    "an_hbsag": "HbsAg",
                    "ap_nuidade": "Age", 
                    "ap_coduni": "Hc_u",
                    "ap_pripal": "Procedure",
                    "ap_motsai": "r_f_d",
                    "estado": "State",
                    "an_tru": "u_red_r",
                    "an_intfis": "v_f_amount",
                    "an_diures": "Vlm",
                    "ap_cidpri": "label"}, inplace=True)

In [25]:
data["label"] = data["label"].str.strip()

In [26]:
def to_bool(x):
    if x == 'N':
        x = 0
    else:
        x = 1
    return x

In [27]:
def trim(x):
    x = x.strip(' ')
    x = x.lstrip('0')
    if len(x) == 0:
        x = 0
    else:
        if ',' in x:
            #print(x)
            tokens = x.strip(',')
            if len(tokens) > 0:
                x = tokens[0]
            else:
                x = 0
        if x == '-':
            x = 0
        try:
            x = int(x)
        except:
            #print(x)
            x = 0
    return(x)

In [28]:
data["HIC_antibodies"] = data["HIC_antibodies"].apply(to_bool)
data["HIV"] = data["HIV"].apply(to_bool)
data["HbsAg"] = data["HbsAg"].apply(to_bool)

In [29]:
labels = ["E102", "E142", "I10", "I120", "N039", "N083", "N088", "N180", "N188", "N189"]

In [30]:
data_f = data[data["label"].isin(labels)]

In [31]:
data_f['Vlm'] = data_f['Vlm'].apply(trim)
data_f['v_f_amount'] = data_f['v_f_amount'].apply(trim)
data_f['u_red_r'] = data_f['u_red_r'].apply(trim)

data_f['Vlm'] = data_f['Vlm'].astype(int)
data_f['v_f_amount'] = data_f['v_f_amount'].astype(int)
data_f['u_red_r'] = data_f['u_red_r'].astype(int)

In [32]:
d_f = data_f[(data_f["Vlm"] != 0) | (data_f["v_f_amount"] != 0) | (data_f["u_red_r"] != 0)]

In [33]:
import warnings
warnings.filterwarnings("ignore")

d_f["label"] = d_f["label"].astype('category')
d_f["label_cat"] = d_f["label"].cat.codes
d_f = pd.get_dummies(d_f, columns=["Hc_u"])
d_f = pd.get_dummies(d_f, columns=["Procedure"])
d_f = pd.get_dummies(d_f, columns=["State"])
d_f.head()

Unnamed: 0,HIC_antibodies,HIV,HbsAg,Age,r_f_d,u_red_r,v_f_amount,Vlm,label,label_cat,...,State_PR,State_RJ,State_RN,State_RO,State_RR,State_RS,State_SC,State_SE,State_SP,State_TO
1,0,0,0,24,21,72,0,0,N180,7,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,40,21,71,0,0,N180,7,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,65,21,63,0,0,N180,7,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,39,21,59,0,0,N180,7,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,49,21,63,0,0,N180,7,...,0,0,0,0,0,0,0,0,0,0


In [34]:
df_s = d_f.sample(frac=0.1)
print(len(df_s))
print(len(d_f))
Y = df_s["label_cat"]
X = df_s.drop(['label', 'label_cat'], axis=1)

339342
3393421


In [35]:
X.head()

Unnamed: 0,HIC_antibodies,HIV,HbsAg,Age,r_f_d,u_red_r,v_f_amount,Vlm,Hc_u_004a13c3db6768e6,Hc_u_014bfb51b0b9c3e3,...,State_PR,State_RJ,State_RN,State_RO,State_RR,State_RS,State_SC,State_SE,State_SP,State_TO
2070742,0,0,0,79,21,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
389604,0,0,0,56,21,66,0,2000,0,0,...,0,0,0,0,0,0,0,0,0,0
2136137,1,0,0,42,21,74,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3213221,0,0,0,71,21,49,1,400,0,0,...,0,0,0,0,0,0,0,0,1,0
3725747,0,0,0,68,21,77,1,200,0,0,...,0,0,0,0,0,0,0,0,1,0


In [36]:
num_classes = len(Y.unique())
print("number of classes:", num_classes)

number of classes: 10


In [37]:
X = torch.tensor(X.values)
Y = torch.tensor(Y.values)
X = X.type(torch.FloatTensor)
Y = Y.type(torch.LongTensor)
print("X shape:", X.shape)
print("Y shape:", Y.shape)

X shape: torch.Size([339342, 734])
Y shape: torch.Size([339342])


In [38]:
from sklearn.metrics import accuracy_score, \
precision_recall_fscore_support, recall_score, f1_score

def model_metrics(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred)
    p, r, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    return acc, p, r, f1

In [39]:
from torch.utils.data import Dataset
from torch.utils.data import random_split

class NephrologyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]

    def get_splits(self, n_test=0.33):
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        
        return random_split(self, [train_size, test_size])

In [40]:
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Softmax
from torch.nn import Module

class MLP(Module):
    def __init__(self, n_inputs):
        super(MLP, self).__init__()
        self.hidden1 = Linear(n_inputs, 64)
        self.act1 = ReLU()

        self.hidden2 = Linear(64, 16)
        self.act2 = ReLU()

        self.hidden3 = Linear(16, 10)
        self.act3 = Softmax(dim=1)
 
    def forward(self, X):
        X = self.hidden1(X)
        X = self.act1(X)
        X = self.hidden2(X)
        X = self.act2(X)
        X = self.hidden3(X)
        X = self.act3(X)
        return X

model = MLP(734)
print(model)

MLP(
  (hidden1): Linear(in_features=734, out_features=64, bias=True)
  (act1): ReLU()
  (hidden2): Linear(in_features=64, out_features=16, bias=True)
  (act2): ReLU()
  (hidden3): Linear(in_features=16, out_features=10, bias=True)
  (act3): Softmax(dim=1)
)


In [41]:
from torch.utils.data import DataLoader

dataset = NephrologyDataset(X, Y)
train, test = dataset.get_splits()
train_dl = DataLoader(train, batch_size=32, shuffle=True)
test_dl = DataLoader(test, batch_size=32, shuffle=False)
print("# of train batches:", len(train_dl))
print("# of test batches:", len(test_dl))

# of train batches: 7105
# of test batches: 3500


In [42]:
train_iter = iter(train_dl)
x, y = next(train_iter)

print('Shape of a batch x:', x.shape)
print('Shape of a batch y:', y.shape)

Shape of a batch x: torch.Size([32, 734])
Shape of a batch y: torch.Size([32])


In [43]:
from torch.optim import SGD
from torch.nn import CrossEntropyLoss

criterion = CrossEntropyLoss()
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)

In [44]:
from numpy import vstack
from numpy import argmax

def evaluate(model, dl):
    model.eval()
    all_y_pred, all_y_true = list(), list()
    for x, y in dl:
        y_pred = model(x)
        y_pred = y_pred.detach().numpy()
        y_true = y.numpy()
        y_pred = argmax(y_pred, axis=1)
        y_true = y_true.reshape((len(y_true), 1))
        y_pred = y_pred.reshape((len(y_pred), 1))
        
        all_y_pred.append(y_pred)
        all_y_true.append(y_true)
    all_y_pred, all_y_true = vstack(all_y_pred), vstack(all_y_true)
    acc, p, r, f1 = model_metrics(all_y_true, all_y_pred)
    print(f"acc: {acc:.4f}, precision: {p:.4f}, recall: {r:.4f}, f1: {f1:.4f}")           

In [45]:
def train():
    n_epochs = 2
    model.train()
    train_loss_arr = []
    for epoch in range(n_epochs):
        train_loss = 0
        for x, y in train_dl:
            optimizer.zero_grad()
            y_pred = model(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / len(train_dl)
        print('Epoch: {} \tTraining Loss: {:.4f}'.format(epoch, train_loss))
        evaluate(model, test_dl)

In [46]:
train()

Epoch: 0 	Training Loss: 1.4791
acc: 0.9826, precision: 0.0983, recall: 0.1000, f1: 0.0991
Epoch: 1 	Training Loss: 1.4789
acc: 0.9826, precision: 0.0983, recall: 0.1000, f1: 0.0991
