In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from pathlib import Path
from sklearn import metrics
import random

import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import torchvision

from datetime import datetime
from collections import OrderedDict

In [2]:
from scipy import stats

In [3]:
import pickle

In [4]:
PATH = Path("/data2/yinterian/multi-task-romain")

## Computing stats and maps

In [5]:
def get_data_gap(PATH, gap="60min"):
    filename = "data_train_{gap}.pickle".format(gap=gap)
    with open(PATH/filename, 'rb') as f:
        train = pickle.load(f)
    filename = "data_valid_{gap}.pickle".format(gap=gap)
    with open(PATH/filename, 'rb') as f:
        valid = pickle.load(f)
    return train, valid

In [6]:
def get_test_data_gap(PATH, gap="60min"):
    filename1 = "data_validation_{gap}.pickle".format(gap=gap)
    with open(PATH/filename1, 'rb') as f:
        test_ext = pickle.load(f)
    
    filename2 = "data_test_{gap}.pickle".format(gap=gap)
    with open(PATH/filename2, 'rb') as f:
        test = pickle.load(f)
    print(filename1, filename2)
    return test_ext, test

In [7]:
def get_mean_std_series(train):
    ss = np.concatenate(train.series.values)
    ss = ss.reshape(-1,5)
    return ss.mean(axis=0), ss.std(axis=0)

In [8]:
def get_mean_std_static(train):
    res = {}
    for name in ["age", "sapsii", "sofa"]:
        values = train[name].values
        res[name] = (values.mean(), values.std())
    res["series"] = get_mean_std_series(train)
    return res

In [9]:
def stats_dict(train):
    subject_id_list = np.sort(np.unique(train.subject_id.values))
    id2index = {v: k+1 for k,v in enumerate(subject_id_list)}
    num_subjects = len(subject_id_list)
    norm_dict = get_mean_std_static(train)
    care2id = {v:k for k,v in enumerate(np.unique(train.care_unit.values))}
    return norm_dict, care2id, id2index, num_subjects

## Dataset

In [10]:
class MultiTask(Dataset):
    def __init__(self, df, norm_dict, id2index, care2id,  k=20, train=True):
        """
        Args:
            df: dataframe with data
            norm_dict: mean and std of all variables to normalize
            
        """
        self.norm_dict = norm_dict
        self.df = df
        self.df["care_unit"] = self.df["care_unit"].apply(lambda x: care2id[x])
        self.names = ["age", "sapsii", "sofa"] ## needs normalization
        self.names_binary = ["gender", "amine", "sedation", "ventilation"]
        self.id2index = id2index
        self.train = train
        self.pick_a_sample(k)
            
    def pick_a_sample(self, k=20):
        """ Picks sample with the same number of observations per patient"""
        #if not self.train: # fix seed for validation and test
        #    np.random.seed(3)
        sample = self.df.groupby("subject_id", group_keys=False).apply(lambda x: x.sample(k, replace=True))
        sample = sample.copy()
        if self.train:
            self.subject_index = [self.id2index[subject_id] for subject_id in sample.subject_id.values]
            self.random = np.random.choice(2, sample.shape[0], p=[0.1, 0.9])
            self.subject_index = self.subject_index*self.random
        self.df_sample = sample
        

    def __getitem__(self, index):
        row = self.df_sample.iloc[index,:]
        x_series = (row.series - self.norm_dict["series"][0])/self.norm_dict["series"][1]
        x_cont = [(row[name]-self.norm_dict[name][0])/self.norm_dict[name][1] for name in self.names]
        x_binary = [row[name] for name in self.names_binary]
        subject_index = 0
        if self.train:
            subject_index = self.subject_index[index]
        x_cat = np.array([row["care_unit"], subject_index])
        x_cont = np.array(x_cont + x_binary)
        return x_series, x_cont, x_cat, row["prediction_mean_HR"], row["prediction_mean_MAP"]

    def __len__(self):
        return self.df_sample.shape[0]

In [11]:
def save_model(m, p): torch.save(m.state_dict(), p)
    
def load_model(m, p): m.load_state_dict(torch.load(p))

## Model 3

In [12]:
class EventModel3(nn.Module):
    def __init__(self, num_subjects, hidden_size=100, num2=50):
        super(EventModel3, self).__init__()
        self.embedding1 = nn.Embedding(5, 1)
        self.embedding2 = nn.Embedding(num_subjects+1, 5)
        
        self.gru = nn.GRU(5, hidden_size, num_layers=2, batch_first=True,
                          dropout=0.3)
        self.num1 = hidden_size + 1 + 5 + 7
        self.num2 = num2
        self.linear1 = nn.Linear(self.num1, self.num2)
        self.linear2 = nn.Linear(self.num2, self.num2)
        self.out1 = nn.Linear(self.num2, 1)
        self.out2 = nn.Linear(self.num2, 1)
        self.bn1 = nn.BatchNorm1d(self.num2)
        self.bn2 = nn.BatchNorm1d(self.num2)
        
    def forward(self, x_series, x_cont, x_cat):
        _, ht = self.gru(x_series)
        x_cat_1 = self.embedding1(x_cat[:,0])
        x_cat_2 = self.embedding2(x_cat[:,1])
        x = torch.cat((ht[-1], x_cat_1, x_cat_2, x_cont), 1)
        x = self.bn1(F.relu(self.linear1(x)))
        x = self.bn2(F.relu(self.linear2(x)))
        return self.out1(x), self.out2(x)

In [13]:
class EventModel3_single(nn.Module):
    def __init__(self, num_subjects, hidden_size=100, num2=50):
        super(EventModel3_single, self).__init__()
        self.embedding1 = nn.Embedding(5, 1)
        self.embedding2 = nn.Embedding(num_subjects+1, 5)
        
        self.gru = nn.GRU(5, hidden_size, num_layers=2, batch_first=True,
                          dropout=0.3)
        self.num1 = hidden_size + 1 + 5 + 7
        self.num2 = num2
        self.linear1 = nn.Linear(self.num1, self.num2)
        self.linear2 = nn.Linear(self.num2, self.num2)
        self.out = nn.Linear(self.num2, 1)
        self.bn1 = nn.BatchNorm1d(self.num2)
        self.bn2 = nn.BatchNorm1d(self.num2)
        
    def forward(self, x_series, x_cont, x_cat):
        _, ht = self.gru(x_series)
        x_cat_1 = self.embedding1(x_cat[:,0])
        x_cat_2 = self.embedding2(x_cat[:,1])
        x = torch.cat((ht[-1], x_cat_1, x_cat_2, x_cont), 1)
        x = self.bn1(F.relu(self.linear1(x)))
        x = self.bn2(F.relu(self.linear2(x)))
        return self.out(x)

In [25]:
class EventModel33(nn.Module):
    def __init__(self, num_subjects, hidden_size=100, num2=50, single=False):
        super(EventModel33, self).__init__()
        self.single = single
        self.embedding1 = nn.Embedding(5, 1)
        self.embedding2 = nn.Embedding(num_subjects+1, 5)

        self.gru = nn.GRU(5, hidden_size, num_layers=2, batch_first=True,
                          dropout=0.3)
        self.num1 = hidden_size + 1 + 5 + 7
        self.num2 = num2
        self.linear1 = nn.Linear(self.num1, self.num2)
        self.linear2 = nn.Linear(self.num2, self.num2)
        self.out1 = nn.Linear(self.num2, 1)
        self.out2 = nn.Linear(self.num2, 1)
        self.bn1 = nn.BatchNorm1d(self.num2)
        self.bn2 = nn.BatchNorm1d(self.num2)

    def forward(self, x_series, x_cont, x_cat):
        _, ht = self.gru(x_series)
        x_cat_1 = self.embedding1(x_cat[:,0])
        x_cat_2 = self.embedding2(x_cat[:,1])
        x = torch.cat((ht[-1], x_cat_1, x_cat_2, x_cont), 1)
        x = self.bn1(F.relu(self.linear1(x)))
        x = self.bn2(F.relu(self.linear2(x)))
        if self.single:
            return self.out1(x)
        else:
            return self.out1(x), self.out2(x)

## Functions for confidence intervals

In [14]:
def test_metrics(model, modelS, test_ds):
    model.eval()
    modelS.eval()
    test_ds.pick_a_sample()
    test_dl = DataLoader(test_ds, batch_size=len(test_ds))
    for x_series, x_cont, x_cat, y1, y2 in test_dl:
        batch = y1.shape[0]
        x_series = x_series.float().cuda()
        x_cont = x_cont.float().cuda()
        x_cat = x_cat.long().cuda()
        y1 = y1.float().cuda()
        out1, out2 = model(x_series, x_cont, x_cat)
        out = modelS(x_series, x_cont, x_cat)
       
    y_hat = out2.view(-1).detach().cpu().numpy()
    y_hatS = out.view(-1).detach().cpu().numpy()
    ys2 = y2.view(-1).cpu().numpy()
    
    r2 = metrics.r2_score(ys2, y_hat)
    r2S = metrics.r2_score(ys2, y_hatS)
    return r2, r2S

In [15]:
def boostrap_test_metrics(model, modelS, test_ds, N=1000):
    r2s = []
    r2Ss = []
    for i in range(N):
        r2, r2S = test_metrics(model, modelS, test_ds)
        r2s.append(r2)
        r2Ss.append(r2S)
    return r2s, r2Ss

## Gap = "60min"

In [65]:
train, valid = get_data_gap(gap="60min")
norm_dict, care2id, id2index, num_subjects = stats_dict(train)

In [66]:
test_ext, test = get_test_data_gap(gap="60min")

data_validation_60min.pickle data_test_60min.pickle


In [68]:
test_ds = MultiTask(test, norm_dict, id2index, care2id, k=13, train=False)
test_ext_ds = MultiTask(test_ext, norm_dict, id2index, care2id, k=13, train=False)
test.shape, len(test_ds), len(test_ext_ds)

((1461, 17), 2184, 559)

In [69]:
path = PATH/"models/model3_60min_r2_58_71.pth"
model = EventModel3(num_subjects).cuda()
load_model(model, path)

In [70]:
path = PATH/"models/model3_single_60min_r2_70.pth"
modelS = EventModel3_single(num_subjects).cuda()
load_model(modelS, path)

In [90]:
r2, r2S = boostrap_test_metrics(model, modelS, test_ds, N=100)
stats.ttest_rel(r2, r2S)

Ttest_relResult(statistic=-27.21127432542325, pvalue=9.458268957079656e-48)

In [91]:
np.quantile(np.array(r2) - np.array(r2S), [0.025, 0.5, 0.975])

array([-0.01420232, -0.0079482 , -0.0017322 ])

In [92]:
r2, r2S = boostrap_test_metrics(model, modelS, test_ext_ds, N=100)
stats.ttest_rel(r2, r2S)

Ttest_relResult(statistic=51.68264094650364, pvalue=1.9553414107002766e-73)

In [93]:
np.quantile(np.array(r2) - np.array(r2S), [0.025, 0.5, 0.975])

array([0.01958162, 0.03161031, 0.04319205])

## Gap = "30min"

In [94]:
gap="30min"
train, valid = get_data_gap(gap=gap)
norm_dict, care2id, id2index, num_subjects = stats_dict(train)
test_ext, test = get_test_data_gap(gap=gap)

test_ds = MultiTask(test, norm_dict, id2index, care2id, k=13, train=False)
test_ext_ds = MultiTask(test_ext, norm_dict, id2index, care2id, k=13, train=False)
test.shape, len(test_ds), len(test_ext_ds)

data_validation_30min.pickle data_test_30min.pickle


((2733, 17), 2379, 637)

In [97]:
path = PATH/"models/model3_30min_r2_87_78.pth"
pathS = PATH/"models/model3_single_30min_r2_78.pth"
model = EventModel3(num_subjects).cuda()
load_model(model, path)
modelS = EventModel3_single(num_subjects).cuda()
load_model(modelS, pathS)

In [98]:
r2, r2S = boostrap_test_metrics(model, modelS, test_ds, N=100)
stats.ttest_rel(r2, r2S)

Ttest_relResult(statistic=-0.11315492642648424, pvalue=0.9101368707014197)

In [99]:
np.quantile(np.array(r2) - np.array(r2S), [0.025, 0.5, 0.975])

array([-0.00443137, -0.00021609,  0.00461935])

In [100]:
r2, r2S = boostrap_test_metrics(model, modelS, test_ext_ds, N=100)
stats.ttest_rel(r2, r2S)

Ttest_relResult(statistic=83.48001787893983, pvalue=1.412715132017392e-93)

In [101]:
np.quantile(np.array(r2) - np.array(r2S), [0.025, 0.5, 0.975])

array([0.04302278, 0.05549248, 0.06590367])

## Gap = "15min"

In [18]:
gap="15min"
train, valid = get_data_gap(PATH, gap=gap)
norm_dict, care2id, id2index, num_subjects = stats_dict(train)
test_ext, test = get_test_data_gap(PATH, gap=gap)

test_ds = MultiTask(test, norm_dict, id2index, care2id, k=13, train=False)
test_ext_ds = MultiTask(test_ext, norm_dict, id2index, care2id, k=13, train=False)
test.shape, len(test_ds), len(test_ext_ds)

data_validation_15min.pickle data_test_15min.pickle


((4442, 17), 2587, 637)

In [24]:
path = PATH/"models/modelX3_15min_r2_91_85.pth"
pathS = path = PATH/"models/modelX3_single_15min_r2_85.pth"
model = EventModel33(num_subjects).cuda()
load_model(model, path)
modelS = EventModel33(num_subjects, single=True).cuda()
load_model(modelS, pathS)

RuntimeError: Error(s) in loading state_dict for EventModel33:
	Missing key(s) in state_dict: "out2.weight", "out2.bias". 