In [None]:
import numpy as np
import argparse
import os
import imp
import re
import pickle
import datetime
import random
import math
import copy

from sklearn.model_selection import KFold, StratifiedKFold
import torch
from torch import nn
import torch.nn.utils.rnn as rnn_utils
from torch.utils import data
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader,TensorDataset,random_split,SubsetRandomSampler, ConcatDataset, Subset


from utils import utils
from utils.readers import InHospitalMortalityReader
from utils.preprocessing import Discretizer, Normalizer
from utils import metrics
from utils import common_utils

### Base GRU model

In [None]:
# Width of the zero vector used for padding; the GRU cells below are built
# with the same lab-feature dimension.
input_dim = 25
# All-zero feature vector appended to sequences shorter than the longest one.
pad_token = np.zeros(input_dim)


def pad_sents(sents, pad_token):
    """Right-pad every sequence in ``sents`` to the length of the longest one.

    Args:
        sents: Sequence of sequences (one inner sequence per sample).
        pad_token: Value appended to the end of each shorter sequence.

    Returns:
        ``np.ndarray`` built from the padded sequences. An empty ``sents``
        yields an empty array instead of raising ``ValueError`` from ``max``.
    """
    # ``len(...) == 0`` (not ``not sents``) so that numpy arrays are accepted.
    if len(sents) == 0:
        return np.array([])

    max_length = max(len(seq) for seq in sents)
    sents_padded = [
        np.array(list(seq) + [pad_token] * (max_length - len(seq)))
        for seq in sents
    ]
    return np.array(sents_padded)

In [None]:
def get_bce_loss(y_pred, y_true):
    """Return the binary cross-entropy between predictions and targets.

    Uses ``torch.nn.BCELoss`` with its default settings, so ``y_pred`` is
    expected to already hold probabilities (the model ends in a sigmoid).
    """
    criterion = torch.nn.BCELoss()
    bce = criterion(y_pred, y_true)
    return bce

In [None]:
class Dataset(data.Dataset):
    """Indexable triple of lab-test sequences, demographic features and labels.

    NOTE(review): this class shadows the ``Dataset`` name imported from
    ``torch.utils.data`` at the top of the notebook; the name is kept because
    later cells construct it as ``Dataset(...)``.

    Args:
        x_lab: Per-sample lab-test data, indexable along the first axis.
        x_demo: Per-sample demographic data, indexable along the first axis.
        y: Per-sample labels, indexable along the first axis.

    Raises:
        ValueError: If the three inputs do not have the same length.
    """

    def __init__(self, x_lab, x_demo, y):
        # __len__ only looks at x_lab, so a mismatch would otherwise surface
        # later as an IndexError (or silently drop samples) during iteration.
        if not (len(x_lab) == len(x_demo) == len(y)):
            raise ValueError(
                "x_lab, x_demo and y must have the same length, got "
                "{}, {} and {}".format(len(x_lab), len(x_demo), len(y))
            )
        self.x_lab = x_lab
        self.x_demo = x_demo
        self.y = y

    def __getitem__(self, index):
        # Returns the raw stored elements; conversion to tensors is left to
        # whatever consumes the dataset (e.g. a DataLoader's collate step).
        return self.x_lab[index], self.x_demo[index], self.y[index]

    def __len__(self):
        return len(self.x_lab)

In [None]:
# Directory holding the pre-processed pickle files and checkpoint target.
data_path = './dataset/tongji/processed_data/'
file_name = './ckpt/gru.pth'

# Training configuration shared by the cross-validation cells below.
batch_size = 32
num_epochs = 3
device = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
# To force CPU execution instead: device = torch.device('cpu')
print("available device: {}".format(device))


def _load_pickle(path):
    """Load one pickled object from ``path`` and close the file afterwards."""
    # NOTE: pickle can execute arbitrary code on load; only open trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Lab-test sequences can differ in length per sample, hence dtype=object
# until they are padded below.
x_lab = _load_pickle(os.path.join(data_path, 'train_x_labtest_outcome.pkl'))
x_lab = np.array(x_lab, dtype=object)

x_demo = _load_pickle(os.path.join(data_path, 'train_x_demographic_outcome.pkl'))
x_demo = np.array(x_demo)

y = _load_pickle(os.path.join(data_path, 'train_y_outcome.pkl'))
y = np.array(y)

# Pad every lab sequence with zero vectors to the longest sequence length.
x_lab = pad_sents(x_lab, pad_token)
# len(x_lab[7])

In [None]:
# Bundle padded lab sequences, demographics and labels into one dataset;
# DataLoaders are created per fold in the cross-validation cells.
train_dataset = Dataset(x_lab, x_demo, y)

# Sanity check: one shape per line, in the order lab / demographic / label.
print(x_lab.shape, x_demo.shape, y.shape, sep='\n')
# print(y)


In [None]:
class GRU(nn.Module):
    """Single-layer GRU over lab sequences fused with demographic features.

    The lab sequence and the demographic vector are each projected to
    ``hidden_dim``. The projected lab sequence is fed through a one-layer
    GRU, whose final hidden state is concatenated with the demographic
    embedding and mapped through a linear layer and a sigmoid.

    Args:
        input_lab_dim: Feature size of each lab time step.
        input_demo_dim: Feature size of the demographic vector.
        hidden_dim: Width of both projections and of the GRU state.
        output_dim: Number of output units.
        act_layer: Activation module class instantiated without arguments.
        drop: Dropout probability.

    NOTE(review): dropout is also applied to the output of ``fc`` (i.e. to
    the logit) before the sigmoid; with ``drop > 0`` this zeroes predictions
    at random during training — confirm this is intended.
    NOTE(review): the GRU reads the whole padded sequence, so the final
    hidden state is taken after any padding steps — confirm this is intended.
    """

    def __init__(self, input_lab_dim, input_demo_dim, hidden_dim, output_dim, act_layer=nn.GELU, drop=0.):
        super().__init__()

        # Constructor arguments are kept on the instance.
        self.input_lab_dim = input_lab_dim
        self.input_demo_dim = input_demo_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim

        # Sub-modules are created in the same order as before so that seeded
        # parameter initialisation stays reproducible.
        self.demo_proj = nn.Linear(input_demo_dim, hidden_dim)
        self.lab_proj = nn.Linear(input_lab_dim, hidden_dim)
        self.gru = nn.GRU(input_size=hidden_dim, hidden_size=hidden_dim, num_layers=1, batch_first=True)
        self.act = act_layer()
        self.fc = nn.Linear(2 * hidden_dim, output_dim)
        self.drop = nn.Dropout(drop)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x_lab, x_demo):
        """Return sigmoid outputs of shape (batch_size, output_dim)."""
        lab_seq = self.act(self.lab_proj(x_lab))

        # h_n: (1, batch_size, hidden_dim) -> (batch_size, hidden_dim)
        _, h_n = self.gru(lab_seq)
        lab_repr = h_n[0]

        demo_repr = self.act(self.demo_proj(x_demo))  # (batch_size, hidden_dim)

        fused = torch.cat((lab_repr, demo_repr), 1)  # (batch_size, 2*hidden_dim)

        logits = self.drop(self.fc(self.drop(fused)))
        return self.sigmoid(logits)


In [None]:
RANDOM_SEED = 42

# Seed every random number generator used by the notebook.
random.seed(RANDOM_SEED)             # Python stdlib
np.random.seed(RANDOM_SEED)          # numpy
torch.manual_seed(RANDOM_SEED)       # torch (CPU)
torch.cuda.manual_seed(RANDOM_SEED)  # torch (GPU)
torch.backends.cudnn.deterministic = True  # cudnn

# Print numpy arrays in full, with two decimals and no scientific notation.
np.set_printoptions(precision=2, suppress=True, threshold=np.inf)


def train_epoch(model, device, dataloader, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(batch_x_lab, batch_x_demo)``.
        device: Device the batches are moved to before the forward pass.
        dataloader: Iterable yielding ``(x_lab, x_demo, y)`` tensor batches.
        loss_fn: Callable ``loss_fn(output, target)`` returning a scalar tensor.
        optimizer: Optimizer stepping the parameters of ``model``.

    Returns:
        Mean of the per-batch loss values (unweighted by batch size).
    """
    train_loss = []
    model.train()
    for batch_x_lab, batch_x_demo, batch_y in dataloader:
        batch_x_lab = batch_x_lab.float().to(device)
        batch_x_demo = batch_x_demo.float().to(device)
        # Labels must live on the same device as the model output; leaving
        # them on the CPU makes the loss fail when the model runs on CUDA.
        batch_y = batch_y.float().to(device).unsqueeze(-1)

        optimizer.zero_grad()
        output = model(batch_x_lab, batch_x_demo)
        loss = loss_fn(output, batch_y)
        train_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    return np.array(train_loss).mean()

def val_epoch(model, device, dataloader, loss_fn):
    val_loss = []
    y_pred = []
    y_true = []
    evaluation_scores = {}
    model.eval()
    with torch.no_grad():
        for step, data in enumerate(dataloader):   
            batch_x_lab, batch_x_demo, batch_y = data
            batch_x_lab, batch_x_demo, batch_y = batch_x_lab.float().to(device), batch_x_demo.float().to(device), batch_y.float()
            batch_y = batch_y.unsqueeze(-1)
            output = model(batch_x_lab, batch_x_demo)
            loss = loss_fn(output, batch_y)
            val_loss.append(loss.item())

            y_pred += list(output.cpu().numpy().flatten())
            y_true += list(batch_y.cpu().numpy().flatten())
    y_pred = np.array(y_pred)
    y_pred = np.stack([1 - y_pred, y_pred], axis=1)
    evaluation_scores = metrics.print_metrics_binary(y_true, y_pred)
    return np.array(val_loss).mean(), evaluation_scores

def test_epoch(model, device, dataloader, loss_fn):
    test_loss = []
    y_pred = []
    y_true = []
    evaluation_scores = {}
    model.eval()
    with torch.no_grad():
        for step, data in enumerate(dataloader):   
            batch_x_lab, batch_x_demo, batch_y = data
            batch_x_lab, batch_x_demo, batch_y = batch_x_lab.float().to(device), batch_x_demo.float().to(device), batch_y.float()
            batch_y = batch_y.unsqueeze(-1)
            output = model(batch_x_lab, batch_x_demo)
            loss = loss_fn(output, batch_y)
            test_loss.append(loss.item())

            y_pred += list(output.cpu().numpy().flatten())
            y_true += list(batch_y.cpu().numpy().flatten())
    y_pred = np.array(y_pred)
    y_pred = np.stack([1 - y_pred, y_pred], axis=1)
    evaluation_scores = metrics.print_metrics_binary(y_true, y_pred)
    return np.array(test_loss).mean(), evaluation_scores

# train_epoch(model, device, train_loader, get_bce_loss, optimizer)

### Hyperparameter selection (选超参)

In [None]:
num_folds = 10
dataset = train_dataset

# Outer split: take the first stratified fold as a held-out test portion and
# keep the remaining samples for hyperparameter selection.
kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=RANDOM_SEED)
train_and_val_idx, test_idx = next(
    kfold.split(np.arange(len(dataset)), dataset.y)
)

# Dataset restricted to the train+validation samples of the outer split.
sub_dataset = Dataset(
    dataset.x_lab[train_and_val_idx],
    dataset.x_demo[train_and_val_idx],
    dataset.y[train_and_val_idx],
)

# Inner splitter: the remaining data is divided into num_folds - 1 folds.
kfold = StratifiedKFold(n_splits=num_folds - 1, shuffle=True, random_state=RANDOM_SEED)


In [None]:
# Per-fold training history for the hyperparameter search, keyed 'fold1', ...
fold_performance = {}
dataset = sub_dataset

for fold, (train_idx, val_idx) in enumerate(kfold.split(np.arange(len(dataset)), dataset.y)):

    print('Fold {}'.format(fold + 1))

    # Both loaders draw from the same dataset; the samplers restrict each one
    # to its own index subset.
    train_sampler = SubsetRandomSampler(train_idx)
    val_sampler = SubsetRandomSampler(val_idx)
    train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    val_loader = DataLoader(dataset, batch_size=batch_size, sampler=val_sampler)

    # Model hyperparameters can be modified and tuned here.
    # A fresh model and optimizer are built for every fold.
    model = GRU(input_lab_dim=input_dim, input_demo_dim=2, hidden_dim=32, output_dim=1, act_layer=nn.GELU, drop=0.).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = get_bce_loss

    history = {'train_loss': [], 'val_loss': [], 'val_accuracy': [], 'val_auroc': [], 'val_auprc': []}

    for epoch in range(num_epochs):
        train_loss = train_epoch(model, device, train_loader, criterion, optimizer)
        val_loss, val_evaluation_scores = val_epoch(model, device, val_loader, criterion)

        print("Epoch:{}/{} AVG Training Loss:{:.3f} AVG Val Loss:{:.3f}".format(epoch + 1, num_epochs, train_loss, val_loss))
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['val_accuracy'].append(val_evaluation_scores['acc'])
        history['val_auroc'].append(val_evaluation_scores['auroc'])
        history['val_auprc'].append(val_evaluation_scores['auprc'])
    fold_performance['fold{}'.format(fold + 1)] = history

# torch.save(model,'gru.pth')

In [None]:
# print(fold_performance['fold7']['val_auroc'])

# For every epoch, average the validation AUROC over all inner folds and keep
# the epoch (1-based) with the highest mean.
best_epoch = 0
best_performance = 0.0
for step in range(num_epochs):
    auroc_list = [
        fold_performance['fold{}'.format(fold + 1)]['val_auroc'][step]
        for fold in range(num_folds - 1)
    ]
    performance = np.array(auroc_list).mean()
    improved = performance > best_performance
    if improved:
        best_epoch = step + 1
        best_performance = performance
    print('Epoch:{}/{} AVG AUROC:{:.3f}'.format(step+1, num_epochs, performance))
print(f"Best epoch:{best_epoch} Best performance:{best_performance}")


### Hyperparameters fixed: evaluate model performance (已确定超参，评估模型性能)

In [None]:
# kfold=KFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)
num_folds = 10
kfold = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=RANDOM_SEED)

# Per-fold training history for the final evaluation, keyed 'fold1', ...
fold_performance = {}
dataset = train_dataset

# NOTE(review): metrics are computed on the test fold after every epoch and
# the following cell reports the epoch with the best mean test AUROC, which
# makes that figure optimistic — consider fixing the epoch count from the
# hyperparameter search instead.
for fold, (train_idx, test_idx) in enumerate(kfold.split(np.arange(len(dataset)), dataset.y)):

    print('Fold {}'.format(fold + 1))

    # Both loaders draw from the same dataset; the samplers restrict each one
    # to its own index subset.
    train_sampler = SubsetRandomSampler(train_idx)
    test_sampler = SubsetRandomSampler(test_idx)
    train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
    test_loader = DataLoader(dataset, batch_size=batch_size, sampler=test_sampler)

    # A fresh model and optimizer are built for every fold.
    model = GRU(input_lab_dim=input_dim, input_demo_dim=2, hidden_dim=32, output_dim=1, act_layer=nn.GELU, drop=0.).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = get_bce_loss

    history = {'train_loss': [], 'test_loss': [], 'test_accuracy': [], 'test_auroc': [], 'test_auprc': []}

    for epoch in range(num_epochs):
        train_loss = train_epoch(model, device, train_loader, criterion, optimizer)
        test_loss, test_evaluation_scores = test_epoch(model, device, test_loader, criterion)

        print("Epoch:{}/{} AVG Training Loss:{:.3f} AVG Test Loss:{:.3f}".format(epoch + 1, num_epochs, train_loss, test_loss))
        history['train_loss'].append(train_loss)
        history['test_loss'].append(test_loss)
        history['test_accuracy'].append(test_evaluation_scores['acc'])
        history['test_auroc'].append(test_evaluation_scores['auroc'])
        history['test_auprc'].append(test_evaluation_scores['auprc'])
    fold_performance['fold{}'.format(fold + 1)] = history

# torch.save(model,'gru.pth')

In [None]:
# For every epoch, average the test AUROC over all folds and report the epoch
# (1-based) with the highest mean.
best_epoch = 0
best_performance = 0.0
for step in range(num_epochs):
    auroc_list = [
        fold_performance['fold{}'.format(fold + 1)]['test_auroc'][step]
        for fold in range(num_folds)
    ]
    performance = np.array(auroc_list).mean()
    improved = performance > best_performance
    if improved:
        best_epoch = step + 1
        best_performance = performance
    print('Epoch:{}/{} AVG AUROC:{:.3f}'.format(step+1, num_epochs, performance))
print(f"Best epoch:{best_epoch} Best performance:{best_performance}")
