In [None]:
import numpy as np
import argparse
import os
import imp
import re
import pickle
import datetime
import random
import math
import copy


import torch
from torch import nn
import torch.nn.utils.rnn as rnn_utils
from torch.utils import data
from torch.autograd import Variable
import torch.nn.functional as F


from utils import utils
from utils.readers import InHospitalMortalityReader
from utils.preprocessing import Discretizer, Normalizer
from utils import metrics
from utils import common_utils

### Prepare

In [None]:
# Run configuration shared by the cells below.
data_path = './dataset/tongji/processed_data/'  # dataset directory; joined with the test-split paths when the test reader is built
file_name = './ckpt/concare.pth'  # checkpoint file written by torch.save and read back by torch.load
small_part = False  # forwarded to utils.load_data; presumably selects a reduced subset — TODO confirm
arg_timestep = 1.0  # not referenced by any other cell visible in this notebook
batch_size = 256  # mini-batch size handed to the DataLoader constructors
epochs = 100  # intended number of passes over the training set

In [None]:
# Run on the first CUDA GPU when one is visible, otherwise on the CPU.
# (To force CPU execution, pass 'cpu' to torch.device unconditionally.)
_device_name = "cuda:0" if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print("available device: {}".format(device))

### Base GRU model

In [None]:
# Number of features per timestep; also passed as input_dim when the GRU model is built.
input_dim = 27
# All-zero feature vector handed to pad_sents as the filler for shorter sequences.
pad_token = np.zeros(input_dim)
def pad_sents(sents, pad_token):
    """Right-pad variable-length sequences to the length of the longest one.

    Args:
        sents: iterable of sequences; each sequence is a list/array of
            per-timestep feature vectors.
        pad_token: feature vector appended to every sequence that is shorter
            than the longest sequence.

    Returns:
        np.ndarray with the padded sequences stacked along the first axis.
        An empty ``sents`` yields an empty array (shape ``(0,)``) instead of
        the ``ValueError`` that ``max`` raises on an empty collection.
    """
    # Materialise once: allows generators and makes the emptiness test safe
    # for ndarray input (truth-testing an ndarray is ambiguous).
    sents = list(sents)
    if not sents:
        return np.array([])

    max_length = max(len(sent) for sent in sents)
    sents_padded = [
        np.array(list(sent) + [pad_token] * (max_length - len(sent)))
        for sent in sents
    ]
    return np.array(sents_padded)

In [None]:
def get_loss(y_pred, y_true):
    """Return the mean binary cross-entropy between predictions and targets.

    Args:
        y_pred: tensor of predicted probabilities in [0, 1].
        y_true: tensor of target values with the same shape as ``y_pred``.

    Returns:
        Scalar tensor holding the loss averaged over all elements.
    """
    return torch.nn.functional.binary_cross_entropy(y_pred, y_true)

In [None]:
class Dataset(data.Dataset):
    """Map-style dataset over aligned inputs, labels and (optionally) sample names.

    Args:
        x: indexable collection of model inputs.
        y: indexable collection of labels aligned with ``x``.
        names: optional indexable collection of per-sample identifiers aligned
            with ``x``. Omitting it keeps the original two-item behaviour.
    """

    def __init__(self, x, y, names=None):
        self.x = x
        self.y = y
        self.names = names

    def __getitem__(self, index):
        """Return ``(x, y)`` for ``index``, or ``(x, y, name)`` when names were given.

        The stored items are returned unchanged.
        """
        if self.names is None:
            return self.x[index], self.y[index]
        return self.x[index], self.y[index], self.names[index]

    def __len__(self):
        """Return the number of samples, taken from ``x``."""
        return len(self.x)

In [None]:
# Load the pickled training split and pad every sequence to a common length.
# The files are resolved against `data_path` rather than a user-specific absolute
# path, so the notebook runs from any checkout of the project.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open(os.path.join(data_path, 'train_x_outcome_prediction.pkl'), 'rb') as f:
    x = pickle.load(f)

with open(os.path.join(data_path, 'train_y_outcome_prediction.pkl'), 'rb') as f:
    y = np.array(pickle.load(f))

# pad_sents takes the raw list directly. Wrapping a ragged list in np.array first
# is unnecessary and raises on recent NumPy versions (inhomogeneous shape).
x = pad_sents(x, pad_token)

In [None]:
# Wrap the padded inputs and their labels, and serve them as shuffled mini-batches.
train_dataset = Dataset(x, y)
train_loader = data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Sanity check: show the shapes of the input and label arrays.
for split_array in (x, y):
    print(split_array.shape)


In [88]:
class GRU(nn.Module):
    """Single-layer GRU classifier over sequences of feature vectors.

    Every timestep is projected to ``hidden_dim`` by a linear layer, the
    projected sequence is fed to a batch-first GRU, and the GRU's final hidden
    state is passed through a second linear layer followed by a sigmoid.

    Args:
        input_dim: size of each timestep's feature vector.
        hidden_dim: width of the input projection and of the GRU state.
        output_dim: number of sigmoid outputs per sample.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        # Remember the sizes the model was built with.
        self.input_dim, self.hidden_dim, self.output_dim = input_dim, hidden_dim, output_dim

        # Input projection followed by the recurrent layer.
        self.linear1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.gru = nn.GRU(hidden_dim, hidden_dim, num_layers=1, batch_first=True)

        # Parameter-free activations; forward() only applies the sigmoid.
        self.softmax = nn.Softmax()
        self.sigmoid = nn.Sigmoid()

        # Output head.
        self.linear2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Score a batch of sequences.

        Args:
            x: tensor of shape [batch_size, timestep, feature_dim].

        Returns:
            Tensor of sigmoid outputs, one row of ``output_dim`` values per sample.
        """
        projected = self.linear1(x)

        # The per-step outputs are not needed; only the final hidden state is used.
        _, last_hidden = self.gru(projected)

        # Index 0 selects the hidden state of the (only) GRU layer.
        scores = self.linear2(last_hidden[0])
        return self.sigmoid(scores)


### Run

In [93]:
# Seed every random number generator in use so that runs are repeatable.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)                   # Python stdlib
np.random.seed(RANDOM_SEED)                # NumPy
torch.manual_seed(RANDOM_SEED)             # PyTorch, CPU
torch.cuda.manual_seed(RANDOM_SEED)        # PyTorch, GPU
torch.backends.cudnn.deterministic = True  # cuDNN

# Model under training and its optimizer.
model = GRU(input_dim=input_dim, hidden_dim=64, output_dim=1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Best validation scores seen so far.
max_roc = 0
max_prc = 0

# Loss curves and the history of validation metrics.
train_loss, train_model_loss, train_decov_loss = [], [], []
valid_loss, valid_model_loss, valid_decov_loss = [], [], []
history = []

# Print arrays in full, with two decimals and without scientific notation.
np.set_printoptions(threshold=np.inf, precision=2, suppress=True)

# Train for the configured number of epochs (`epochs`), recording the mean
# batch loss of every epoch in train_loss / train_model_loss.
for each_epoch in range(epochs):
    batch_loss = []
    model_batch_loss = []

    model.train()

    for step, (batch_x, batch_y) in enumerate(train_loader):
        optimizer.zero_grad()
        batch_x = batch_x.float().to(device)
        batch_y = batch_y.float().to(device)

        output = model(batch_x)

        # Labels get a trailing axis so they line up with the model's output.
        model_loss = get_loss(output, batch_y.unsqueeze(-1))
        # This model has no auxiliary loss term, so the total equals the model loss.
        loss = model_loss

        batch_loss.append(loss.cpu().detach().numpy())
        model_batch_loss.append(model_loss.cpu().detach().numpy())
        loss.backward()
        optimizer.step()

        # Report the running mean of the epoch's losses every 30 batches.
        if step % 30 == 0:
            print('Epoch %d Batch %d: Train Loss = %.4f'%(each_epoch, step, np.mean(np.array(batch_loss))))
            print('Model Loss = %.4f'%(np.mean(np.array(model_batch_loss))))
    train_loss.append(np.mean(np.array(batch_loss)))
    train_model_loss.append(np.mean(np.array(model_batch_loss)))
    
 

Epoch 0 Batch 0: Train Loss = 0.6931
Model Loss = 0.6931
Epoch 1 Batch 0: Train Loss = 0.6894
Model Loss = 0.6894
Epoch 2 Batch 0: Train Loss = 0.6839
Model Loss = 0.6839
Epoch 3 Batch 0: Train Loss = 0.6757
Model Loss = 0.6757
Epoch 4 Batch 0: Train Loss = 0.6737
Model Loss = 0.6737
Epoch 5 Batch 0: Train Loss = 0.6727
Model Loss = 0.6727
Epoch 6 Batch 0: Train Loss = 0.6584
Model Loss = 0.6584
Epoch 7 Batch 0: Train Loss = 0.6449
Model Loss = 0.6449
Epoch 8 Batch 0: Train Loss = 0.6190
Model Loss = 0.6190
Epoch 9 Batch 0: Train Loss = 0.5937
Model Loss = 0.5937
Epoch 10 Batch 0: Train Loss = 0.5522
Model Loss = 0.5522
Epoch 11 Batch 0: Train Loss = 0.5096
Model Loss = 0.5096
Epoch 12 Batch 0: Train Loss = 0.4438
Model Loss = 0.4438
Epoch 13 Batch 0: Train Loss = 0.3678
Model Loss = 0.3678
Epoch 14 Batch 0: Train Loss = 0.3075
Model Loss = 0.3075
Epoch 15 Batch 0: Train Loss = 0.2305
Model Loss = 0.2305
Epoch 16 Batch 0: Train Loss = 0.1912
Model Loss = 0.1912
Epoch 17 Batch 0: Train 

In [None]:
# Evaluate the model on the validation split and checkpoint it when AUROC improves.
# The GRU defined in this notebook takes only the input batch and returns a single
# tensor, so no demographic input or decorrelation loss is involved here.
# NOTE(review): `valid_loader` is not created in any visible cell; it must exist
# before this cell is run.
batch_loss = []
model_batch_loss = []

y_true = []
y_pred = []
model.eval()
with torch.no_grad():
    for batch in valid_loader:
        # A loader may yield (x, y) or (x, y, name); only x and y are needed.
        batch_x = batch[0].float().to(device)
        batch_y = batch[1].float().to(device)

        output = model(batch_x)

        model_loss = get_loss(output, batch_y.unsqueeze(-1))
        loss = model_loss

        batch_loss.append(loss.cpu().numpy())
        model_batch_loss.append(model_loss.cpu().numpy())
        y_pred += list(output.cpu().numpy().flatten())
        y_true += list(batch_y.cpu().numpy().flatten())

valid_loss.append(np.mean(np.array(batch_loss)))
valid_model_loss.append(np.mean(np.array(model_batch_loss)))

print("\n==>Predicting on validation")
print('Valid Loss = %.4f'%(valid_loss[-1]))
print('valid_model Loss = %.4f'%(valid_model_loss[-1]))
y_pred = np.array(y_pred)
# Two columns per sample: [1 - p, p], the layout handed to print_metrics_binary.
y_pred = np.stack([1 - y_pred, y_pred], axis=1)
ret = metrics.print_metrics_binary(y_true, y_pred)
history.append(ret)
print()

cur_auroc = ret['auroc']

if cur_auroc > max_roc:
    max_roc = cur_auroc
    state = {
        'net': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': each_epoch
    }
    # torch.save does not create missing directories.
    ckpt_dir = os.path.dirname(file_name)
    if ckpt_dir:
        os.makedirs(ckpt_dir, exist_ok=True)
    torch.save(state, file_name)
    print('\n------------ Save best model ------------\n')

### Run for test

In [None]:
# Evaluate the model on the test split and print the binary-classification metrics.
# The GRU defined in this notebook takes only the input batch and returns a single
# tensor, so no demographic input is built here.
# NOTE(review): `test_loader` is created in a later cell of this notebook; run that
# cell before this one.
batch_loss = []
y_true = []
y_pred = []
model.eval()
with torch.no_grad():
    for batch in test_loader:
        # A loader may yield (x, y) or (x, y, name); only x and y are needed.
        batch_x = batch[0].float().to(device)
        batch_y = batch[1].float().to(device)

        output = model(batch_x)

        loss = get_loss(output, batch_y.unsqueeze(-1))
        batch_loss.append(loss.cpu().numpy())
        y_pred += list(output.cpu().numpy().flatten())
        y_true += list(batch_y.cpu().numpy().flatten())

print("\n==>Predicting on test")
print('Test Loss = %.4f'%(np.mean(np.array(batch_loss))))
y_pred = np.array(y_pred)
# Two columns per sample: [1 - p, p], the layout handed to print_metrics_binary.
y_pred = np.stack([1 - y_pred, y_pred], axis=1)
test_res = metrics.print_metrics_binary(y_true, y_pred)

In [None]:
# Restore the saved checkpoint into the model and optimizer, then build the
# test-split loader.
# map_location remaps the stored tensors onto the active device, so a checkpoint
# written on a GPU still loads in a CPU-only session.
checkpoint = torch.load(file_name, map_location=device)
save_epoch = checkpoint['epoch']
model.load_state_dict(checkpoint['net'])
optimizer.load_state_dict(checkpoint['optimizer'])
model.eval()

# NOTE(review): `discretizer` and `normalizer` are not defined in any visible cell;
# they must exist before this cell is run.
test_reader = InHospitalMortalityReader(dataset_dir=os.path.join(data_path, 'test'),
                                            listfile=os.path.join(data_path, 'test_listfile.csv'),
                                            period_length=48.0)
test_raw = utils.load_data(test_reader, discretizer, normalizer, small_part, return_names=True)
# NOTE(review): Dataset is given a third `names` argument here; confirm the Dataset
# class in scope accepts it.
test_dataset = Dataset(test_raw['data'][0], test_raw['data'][1], test_raw['names'])
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Bootstrap
# Resample the collected predictions with replacement K times and report the
# mean and standard deviation of each metric across the resamples.
N = len(y_true)
N_idx = np.arange(N)
K = 1000

auroc, auprc, minpse = [], [], []
y_true_arr = np.array(y_true)  # converted once instead of on every draw
for draw in range(1, K + 1):
    boot_idx = np.random.choice(N_idx, N, replace=True)
    test_ret = metrics.print_metrics_binary(y_true_arr[boot_idx], y_pred[boot_idx, :], verbose=0)
    auroc.append(test_ret['auroc'])
    auprc.append(test_ret['auprc'])
    minpse.append(test_ret['minpse'])
    print('%d/%d'%(draw,K))

print('auroc %.4f(%.4f)'%(np.mean(auroc), np.std(auroc)))
print('auprc %.4f(%.4f)'%(np.mean(auprc), np.std(auprc)))
print('minpse %.4f(%.4f)'%(np.mean(minpse), np.std(minpse)))