In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from pathlib import Path
from sklearn import metrics
import random

import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import torchvision

from datetime import datetime
from collections import OrderedDict

In [2]:
import pickle

In [3]:
PATH = Path("/data2/yinterian/multi-task-romain")

In [4]:
gap = "5min"
gap

'5min'

In [5]:
filename = "data_train_{gap}.pickle".format(gap=gap)
with open(PATH/filename, 'rb') as f:
    train = pickle.load(f)

In [6]:
filename = "data_valid_{gap}.pickle".format(gap=gap)
with open(PATH/filename, 'rb') as f:
    valid = pickle.load(f)

In [7]:
train.shape, valid.shape

((59742, 14), (7086, 14))

In [8]:
subject_id_list = np.sort(np.unique(train.subject_id.values))
id2index = {v: k+1 for k,v in enumerate(subject_id_list)}
num_subjects = len(subject_id_list)

In [9]:
num_subjects

2295

## Dataset

In [10]:
def get_mean_std_series(train):
    ss = np.concatenate(train.series.values)
    ss = ss.reshape(-1,5)
    return ss.mean(axis=0), ss.std(axis=0)

In [11]:
def get_mean_std_static(train):
    res = {}
    for name in ["age", "sapsii", "sofa"]:
        values = train[name].values
        res[name] = (values.mean(), values.std())
    res["series"] = get_mean_std_series(train)
    return res

In [12]:
norm_dict = get_mean_std_static(train)
norm_dict 

{'age': (64, 15.073998327645949),
 'sapsii': (33, 14.215114554630107),
 'sofa': (4, 3.7687923741651197),
 'series': (array([ 83.25271123,  93.7286662 , 120.81020051,  58.76277023,
          78.52866913]),
  array([16.10279665, 17.32261077, 21.2893833 , 12.28384779, 14.32805636]))}

In [13]:
class MultiTask(Dataset):
    def __init__(self, df, norm_dict, id2index, k=20, train=True):
        """
        Args:
            df: dataframe with data
            norm_dict: mean and std of all variables to normalize
            
        """
        self.norm_dict = norm_dict
        self.df = df
        self.names = ["age", "sapsii", "sofa"] ## needs normalization
        self.names_binary = ["gender", "amine", "sedation", "ventilation"]
        self.id2index = id2index
        self.train = train
        self.df_sample = self.pick_a_sample(k)
            
    def pick_a_sample(self, k=20):
        """ Picks sample with the same number of observations per patient"""
        if not self.train: # fix seed for validation and test
            np.random.seed(3)
        sample = self.df.groupby("subject_id", group_keys=False).apply(lambda x: x.sample(min(len(x), k)))
        sample = sample.copy()
        if self.train:
            self.subject_index = [self.id2index[subject_id] for subject_id in sample.subject_id.values]
            self.random = np.random.choice(2, sample.shape[0], p=[0.1, 0.9])
            self.subject_index = self.subject_index*self.random
        return sample

    def __getitem__(self, index):
        row = self.df_sample.iloc[index,:]
        x_series = (row.series - self.norm_dict["series"][0])/self.norm_dict["series"][1]
        x_cont = [(row[name]-self.norm_dict[name][0])/self.norm_dict[name][1] for name in self.names]
        x_binary = [row[name] for name in self.names_binary]
        subject_index = 0
        if self.train:
            subject_index = self.subject_index[index]
        x_cat = np.array([row["care_unit"], subject_index])
        x_cont = np.array(x_cont + x_binary)
        return x_series, x_cont, x_cat, row["prediction_mean_HR"], row["prediction_mean_MAP"]

    def __len__(self):
        return self.df_sample.shape[0]

In [14]:
train_ds = MultiTask(train, norm_dict, id2index)
valid_ds = MultiTask(valid, norm_dict, id2index, train=False)

## Model

In [15]:
def save_model(m, p): torch.save(m.state_dict(), p)
    
def load_model(m, p): m.load_state_dict(torch.load(p))

In [16]:
def val_metrics(model, valid_dl, which_y="y1"):
    model.eval()
    total = 0
    sum_loss = 0
    y_hat = []
    ys = []
    for x_series, x_cont, x_cat, y1, y2 in valid_dl:
        batch = y1.shape[0]
        x_series = x_series.float().cuda()
        x_cont = x_cont.float().cuda()
        x_cat = x_cat.long().cuda()
        y1 = y1.float().cuda()
        y2 = y2.float().cuda()
        out = model(x_series, x_cont, x_cat)
        if which_y=="y1":
            mse_loss = F.mse_loss(out, y1.unsqueeze(-1))
            ys.append(y1.view(-1).cpu().numpy())
        else:
            mse_loss = F.mse_loss(out, y2.unsqueeze(-1))
            ys.append(y2.view(-1).cpu().numpy())
        sum_loss += batch*(mse_loss.item())
        total += batch
        y_hat.append(out.view(-1).detach().cpu().numpy())
    
    y_hat = np.concatenate(y_hat)
    ys = np.concatenate(ys)
    r2 = metrics.r2_score(ys, y_hat)
    
    return sum_loss/total, r2

In [17]:
def train_epochs(model, train_ds, optimizer, lr=1e-3, epochs = 30, which_y="y1"):
    prev_val_r2 = 0
    for i in range(epochs):
        sum_loss = 0
        total = 0
        train_ds.pick_a_sample()
        train_dl = DataLoader(train_ds, batch_size=5000, shuffle=True)
        for x_series, x_cont, x_cat, y1, y2 in train_dl:
            model.train()
            x_series = x_series.float().cuda()
            x_cont = x_cont.float().cuda()
            x_cat = x_cat.long().cuda()
            y1 = y1.float().cuda()
            y2 = y2.float().cuda()
            out = model(x_series, x_cont, x_cat)
            if which_y=="y1":
                loss = F.mse_loss(out, y1.unsqueeze(-1))
            else:
                loss = F.mse_loss(out, y2.unsqueeze(-1))
            sum_loss += len(y1) * loss.item()
            
            total += len(y1)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if i % 1 == 0:
            val_loss, val_r2= val_metrics(model, valid_dl, which_y=which_y)
            print("\tTrain loss: {:.3f} valid loss: {:.3f} valid r2 {:.3f}".format(
                sum_loss/total, val_loss, val_r2))
        if val_r2 > prev_val_r2:
            prev_val_r2 = val_r2
            if val_r2 > 0.7:
                filename = "single_model_" + which_y
                path = "{0}/models/{1}_r2_{2:.0f}.pth".format(PATH, filename, 100*val_r2) 
                save_model(model, path)
                print(path)

In [18]:
batch_size = 5000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

## Model 3

In [19]:
class EventModel3(nn.Module):
    def __init__(self, hidden_size=100, num2=50):
        super(EventModel3, self).__init__()
        self.embedding1 = nn.Embedding(5, 1)
        self.embedding2 = nn.Embedding(num_subjects+1, 5)
        self.gru = nn.GRU(5, hidden_size, batch_first=True)
        self.num1 = hidden_size + 1 + 5 + 7
        self.num2 = num2
        self.linear1 = nn.Linear(self.num1, self.num2)
        self.linear2 = nn.Linear(self.num2, self.num2)
        self.out = nn.Linear(self.num2, 1)
        self.bn1 = nn.BatchNorm1d(self.num2)
        self.bn2 = nn.BatchNorm1d(self.num2)
        
    def forward(self, x_series, x_cont, x_cat):
        _, ht = self.gru(x_series)
        x_cat_1 = self.embedding1(x_cat[:,0])
        x_cat_2 = self.embedding2(x_cat[:,1])
        x = torch.cat((ht[-1], x_cat_1, x_cat_2, x_cont), 1)
        x = self.bn1(F.relu(self.linear1(x)))
        x = self.bn2(F.relu(self.linear2(x)))
        return self.out(x)

In [26]:
# model for mean_HR
model = EventModel3().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.03, weight_decay=1e-5)
train_epochs(model, train_ds, optimizer, epochs=15)

	Train loss: 7027.162 valid loss: 6767.887 valid r2 -26.826
	Train loss: 6410.198 valid loss: 5251.317 valid r2 -20.591
	Train loss: 5351.167 valid loss: 4357.603 valid r2 -16.916
	Train loss: 3934.455 valid loss: 2791.278 valid r2 -10.476
	Train loss: 2273.082 valid loss: 1171.302 valid r2 -3.816
	Train loss: 797.363 valid loss: 131.669 valid r2 0.459
	Train loss: 81.363 valid loss: 174.506 valid r2 0.283
	Train loss: 118.801 valid loss: 606.704 valid r2 -1.494
	Train loss: 188.583 valid loss: 292.491 valid r2 -0.203
	Train loss: 69.507 valid loss: 29.622 valid r2 0.878
/data2/yinterian/multi-task-romain/models/single_model_y1_r2_88.pth
	Train loss: 11.799 valid loss: 13.491 valid r2 0.945
/data2/yinterian/multi-task-romain/models/single_model_y1_r2_94.pth
	Train loss: 24.767 valid loss: 17.606 valid r2 0.928
	Train loss: 23.798 valid loss: 11.641 valid r2 0.952
/data2/yinterian/multi-task-romain/models/single_model_y1_r2_95.pth
	Train loss: 11.071 valid loss: 6.012 valid r2 0.975
/da

In [27]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.02, weight_decay=1e-5)
train_epochs(model, train_ds, optimizer, epochs=15)

	Train loss: 15.010 valid loss: 38.459 valid r2 0.842
/data2/yinterian/multi-task-romain/models/single_model_y1_r2_84.pth
	Train loss: 9.652 valid loss: 28.744 valid r2 0.882
/data2/yinterian/multi-task-romain/models/single_model_y1_r2_88.pth
	Train loss: 8.770 valid loss: 8.099 valid r2 0.967
/data2/yinterian/multi-task-romain/models/single_model_y1_r2_97.pth
	Train loss: 8.543 valid loss: 9.486 valid r2 0.961
	Train loss: 8.205 valid loss: 7.106 valid r2 0.971
/data2/yinterian/multi-task-romain/models/single_model_y1_r2_97.pth
	Train loss: 7.869 valid loss: 6.949 valid r2 0.971
/data2/yinterian/multi-task-romain/models/single_model_y1_r2_97.pth
	Train loss: 7.743 valid loss: 6.283 valid r2 0.974
/data2/yinterian/multi-task-romain/models/single_model_y1_r2_97.pth
	Train loss: 7.480 valid loss: 6.008 valid r2 0.975
/data2/yinterian/multi-task-romain/models/single_model_y1_r2_98.pth
	Train loss: 7.521 valid loss: 6.173 valid r2 0.975
	Train loss: 7.519 valid loss: 6.513 valid r2 0.973
	

In [28]:
# mean_MAP
model = EventModel3().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.03, weight_decay=1e-5)
train_epochs(model, train_ds, optimizer, epochs=15, which_y="y2")

	Train loss: 6136.813 valid loss: 6055.551 valid r2 -33.875
	Train loss: 5578.118 valid loss: 4739.408 valid r2 -26.295
	Train loss: 4633.098 valid loss: 3592.138 valid r2 -19.688
	Train loss: 3333.216 valid loss: 2427.078 valid r2 -12.978
	Train loss: 1821.558 valid loss: 1263.982 valid r2 -6.279
	Train loss: 544.337 valid loss: 119.247 valid r2 0.313
	Train loss: 50.737 valid loss: 54.172 valid r2 0.688
	Train loss: 149.538 valid loss: 89.306 valid r2 0.486
	Train loss: 161.458 valid loss: 39.973 valid r2 0.770
/data2/yinterian/multi-task-romain/models/single_model_y2_r2_77.pth
	Train loss: 43.642 valid loss: 10.027 valid r2 0.942
/data2/yinterian/multi-task-romain/models/single_model_y2_r2_94.pth
	Train loss: 17.790 valid loss: 39.479 valid r2 0.773
	Train loss: 31.561 valid loss: 37.628 valid r2 0.783
	Train loss: 23.532 valid loss: 14.506 valid r2 0.916
	Train loss: 13.011 valid loss: 11.678 valid r2 0.933
	Train loss: 13.557 valid loss: 13.187 valid r2 0.924


In [29]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.02, weight_decay=1e-5)
train_epochs(model, train_ds, optimizer, epochs=15, which_y="y2")

	Train loss: 15.322 valid loss: 34.684 valid r2 0.800
/data2/yinterian/multi-task-romain/models/single_model_y2_r2_80.pth
	Train loss: 13.476 valid loss: 11.070 valid r2 0.936
/data2/yinterian/multi-task-romain/models/single_model_y2_r2_94.pth
	Train loss: 12.788 valid loss: 10.266 valid r2 0.941
/data2/yinterian/multi-task-romain/models/single_model_y2_r2_94.pth
	Train loss: 12.102 valid loss: 9.983 valid r2 0.943
/data2/yinterian/multi-task-romain/models/single_model_y2_r2_94.pth
	Train loss: 11.832 valid loss: 9.566 valid r2 0.945
/data2/yinterian/multi-task-romain/models/single_model_y2_r2_94.pth
	Train loss: 11.402 valid loss: 9.254 valid r2 0.947
/data2/yinterian/multi-task-romain/models/single_model_y2_r2_95.pth
	Train loss: 11.088 valid loss: 9.242 valid r2 0.947
/data2/yinterian/multi-task-romain/models/single_model_y2_r2_95.pth
	Train loss: 11.048 valid loss: 9.779 valid r2 0.944
	Train loss: 10.700 valid loss: 9.541 valid r2 0.945
	Train loss: 10.631 valid loss: 9.818 valid 

## Test 

In [31]:
path = PATH/"models/single_model_y1_r2_98.pth"
model = EventModel3().cuda()
load_model(model, path)

In [32]:
filename = "data_test_{gap}.pickle".format(gap=gap)
with open(PATH/filename, 'rb') as f:
    test = pickle.load(f)
print(filename)

data_test_5min.pickle


In [33]:
test_ds = MultiTask(test, norm_dict, id2index, k=25, train=False)
test.shape, len(test_ds)

((8233, 14), 4561)

In [34]:
test_dl = DataLoader(test_ds, batch_size=4561)

In [35]:
val_metrics(model, valid_dl, which_y="y1")

(5.7878594398498535, 0.9762034642736148)

In [36]:
path = PATH/"models/single_model_y2_r2_95.pth"
load_model(model, path)

In [37]:
val_metrics(model, valid_dl, which_y="y2")

(9.241517066955566, 0.9467764570505772)

## looking at the data

In [38]:
filename = "data_train_{gap}.pickle".format(gap="5min")
with open(PATH/filename, 'rb') as f:
    train5 = pickle.load(f)

In [39]:
filename = "data_train_{gap}.pickle".format(gap="10min")
with open(PATH/filename, 'rb') as f:
    train10 = pickle.load(f)

In [50]:
cols = ["subject_id", "key", "prediction_mean_HR", "prediction_mean_MAP"]
train5_s = train5.loc[:, cols]
train10_s = train10.loc[:, cols]

In [51]:
train5_s.iloc[:30]

Unnamed: 0,subject_id,key,prediction_mean_HR,prediction_mean_MAP
0,10013,10013_18,95.44,55.88
1,10013,10013_19,107.4,68.4
2,10013,10013_20,102.08,68.28
3,10013,10013_23,99.14,86.14
4,10013,10013_24,108.1,69.96
5,10013,10013_25,106.44,57.74
6,10013,10013_27,95.24,58.78
7,10013,10013_28,91.94,60.92
8,10013,10013_29,88.9,55.74
9,10013,10013_30,89.8,59.42


In [52]:
train10_s.iloc[:30]

Unnamed: 0,subject_id,key,prediction_mean_HR,prediction_mean_MAP
0,10013,10013_16,95.44,55.88
1,10013,10013_17,83.92,70.28
2,10013,10013_18,100.14,65.08
3,10013,10013_21,102.86,70.6
4,10013,10013_22,109.3,61.86
5,10013,10013_25,91.32,60.22
6,10013,10013_26,91.34,56.04
7,10013,10013_27,89.42,57.34
8,10013,10013_28,85.9,59.22
9,10013,10013_29,83.0,52.0
