In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from pathlib import Path
from sklearn import metrics
import random

import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from datetime import datetime
import cv2
from collections import OrderedDict

from tqdm import tqdm
from joblib import Parallel

In [2]:
import pickle

In [3]:
# Root directory of the project data: the raw CSV exports are read from
# PATH/"data-jan-2019" and every pickle produced by this notebook is written
# directly under PATH.
# NOTE(review): absolute, machine-specific location - adjust before running elsewhere.
PATH = Path("/data2/yinterian/multi-task-romain")

In [4]:
filename = "train_val_test_split.pickle"

In [5]:
def get_train_val_test_plit(csv_path=None, seed=3):
    """Randomly assign every patient to the train, validation or test group.

    Parameters
    ----------
    csv_path : path-like, optional
        CSV file with a ``subject_id`` column. Defaults to
        ``PATH/"data-jan-2019"/"data_modelisation_5min_gap.csv"``.
    seed : int, optional
        Seed of the private random generator (default 3, the value that was
        previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``subject_id`` (sorted ascending) with a
        ``group`` column drawn from {0, 1, 2} with probabilities
        0.8 / 0.1 / 0.1. ``split_datasets`` treats 0 as train, 1 as
        validation and 2 as test.
    """
    if csv_path is None:
        csv_path = PATH/"data-jan-2019"/"data_modelisation_5min_gap.csv"
    # Only the patient identifier is needed, so do not parse the other columns.
    data = pd.read_csv(csv_path, usecols=["subject_id"])
    patients_ids = np.sort(data.subject_id.unique())
    # A private legacy generator yields the same draws as
    # np.random.seed(seed) + np.random.choice(...), but leaves the global
    # NumPy random state untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)
    group = rng.choice(3, len(patients_ids), p=[0.8, 0.1, 0.1])
    train_val_test_split = pd.DataFrame({"subject_id": patients_ids, "group": group})
    return train_val_test_split

#train_val_test_split = get_train_val_test_plit()
#with open(PATH/filename, 'wb') as f:
#    pickle.dump(train_val_test_split, f)

In [6]:
def process_periods(df, index):
    """Collapse the rows of one period into a single-row DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single period; must contain the static, target and
        time-series columns listed below and at least one row.
    index : hashable
        Label given to the single row of the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row labelled ``index`` holding the static and target columns taken
        from the first row of ``df`` plus a ``series`` column. ``series`` is a
        list with one array per input row, each containing that row's
        ``hr, spo2, abp_sys, abp_dias, abp_mean`` values in that order.
    """
    time_cols = ["hr", "spo2", "abp_sys", "abp_dias", "abp_mean"]
    static_cols = ["subject_id", "key", "gender", "age", "sapsii", "sofa", "care_unit", "amine", "sedation",
                   "ventilation"]
    y_cols = ["prediction_mean_HR", "prediction_mean_MAP"]
    # Only the first row's static/target values are kept (presumably they are
    # constant within a period - verify against the data export).
    # Copy before adding the extra field so we only ever mutate our own Series.
    summary = df.loc[:, static_cols + y_cols].iloc[0].copy()
    # Iterating the 2-D value matrix yields one array per row; this replaces a
    # Python-level .iloc lookup per row with a single conversion.
    summary["series"] = list(df.loc[:, time_cols].values)
    return summary.to_frame(index).T

In [7]:
def process_whole_list(obs):
    """Build one summary row per period found in ``obs``.

    Parameters
    ----------
    obs : pandas.DataFrame
        Rows of many periods, identified by the ``key`` column.

    Returns
    -------
    list of pandas.DataFrame
        One single-row frame per distinct ``key`` (as produced by
        ``process_periods``), ordered by sorted key; the i-th frame's row is
        labelled ``i``.
    """
    # groupby partitions the frame in one pass. The previous implementation
    # re-scanned every row with a boolean mask for each key, which is
    # quadratic for tens of thousands of periods. sort=True keeps the groups
    # in sorted key order, so the positional label i is unchanged.
    periods = obs.groupby("key", sort=True)
    return [
        process_periods(period, i)
        for i, (_, period) in enumerate(tqdm(periods, total=periods.ngroups))
    ]

In [8]:
def create_dataset(gap):
    """Build and persist the per-period dataset for one gap setting.

    Reads ``data_modelisation_<gap>_gap.csv`` from ``PATH/"data-jan-2019"``,
    keeps the rows whose ``window`` equals ``"obs"``, converts them to one row
    per period with ``process_whole_list`` and pickles the concatenated frame
    to ``PATH/"data_df_<gap>.pickle"``. File names and shapes are printed as
    progress information; nothing is returned.
    """
    csv_name = "data_modelisation_" + gap + "_gap.csv"
    raw = pd.read_csv(PATH / "data-jan-2019" / csv_name)
    print(csv_name, raw.shape)

    # Restrict to the observation window before summarising the periods.
    is_obs = raw["window"] == "obs"
    obs_rows = raw.loc[is_obs]
    print(obs_rows.shape)

    data_df = pd.concat(process_whole_list(obs_rows))

    pickle_name = f"data_df_{gap}.pickle"
    print(pickle_name)
    with (PATH / pickle_name).open("wb") as f:
        pickle.dump(data_df, f)

In [10]:
#create_dataset("5min")

In [11]:
#create_dataset("10min")

In [12]:
#create_dataset("15min")

## Split datasets

In [9]:
# Reload the stored patient-to-group table; split_datasets reads it as a global.
filename = "train_val_test_split.pickle"
with (PATH / filename).open("rb") as f:
    train_val_test_split = pickle.load(f)
train_val_test_split.head()

Unnamed: 0,subject_id,group
0,20,0
1,107,0
2,138,0
3,208,0
4,217,1


In [10]:
def split_datasets(gap):
    """Split the per-period dataset of one gap setting into train/valid/test pickles.

    Loads ``PATH/"data_df_<gap>.pickle"``, attaches the ``group`` column of the
    global ``train_val_test_split`` table by joining on ``subject_id``, and
    writes the rows of group 0, 1 and 2 to ``data_train_<gap>.pickle``,
    ``data_valid_<gap>.pickle`` and ``data_test_<gap>.pickle`` under ``PATH``.
    The input shape and the three output shapes are printed.

    Rows whose ``subject_id`` has no entry in ``train_val_test_split`` cannot
    be assigned to a group and are not written to any file; a warning reports
    how many rows are affected instead of dropping them silently.
    """
    import warnings

    filename = "data_df_{gap}.pickle".format(gap=gap)
    # NOTE: pickle.load can execute arbitrary code - only load files that were
    # produced by this notebook.
    with open(PATH/filename, 'rb') as f:
        data_df = pickle.load(f)
    print(filename, data_df.shape)

    # Inner join: periods of patients missing from the split table disappear here.
    df_with_split = data_df.merge(train_val_test_split, on="subject_id")
    n_unassigned = len(data_df) - len(df_with_split)
    if n_unassigned > 0:
        warnings.warn(
            "{n} of {total} rows in {name} have a subject_id that is missing from "
            "train_val_test_split and are left out of every split".format(
                n=n_unassigned, total=len(data_df), name=filename)
        )
    #care2id = {v:k for k,v in enumerate(np.unique(df_with_split.care_unit.values))}
    #df_with_split["care_unit"] = df_with_split["care_unit"].apply(lambda x: care2id[x])

    # Group ids follow the convention of the split table: 0 train, 1 valid, 2 test.
    parts = [
        (name, df_with_split.loc[df_with_split["group"] == group_id])
        for name, group_id in (("train", 0), ("valid", 1), ("test", 2))
    ]
    print(*(part.shape for _, part in parts))
    for name, part in parts:
        filename = "data_{name}_{gap}.pickle".format(name=name, gap=gap)
        with open(PATH/filename, 'wb') as f:
            pickle.dump(part, f)

In [11]:
split_datasets("5min")

data_df_5min.pickle (75061, 13)
(59742, 14) (7086, 14) (8233, 14)


In [12]:
split_datasets("10min")

data_df_10min.pickle (67239, 13)
(50255, 14) (5926, 14) (7002, 14)


In [13]:
split_datasets("15min")

data_df_15min.pickle (57434, 13)
(42830, 14) (5069, 14) (5933, 14)


## Looking at the data

In [20]:
# Load the raw export for the 5-minute gap to inspect it by hand.
gap = "5min"
filename = f"data_modelisation_{gap}_gap.csv"
data = pd.read_csv(PATH / "data-jan-2019" / filename)
print(filename, data.shape)
data.shape

data_modelisation_5min_gap.csv (3002440, 22)


(3002440, 22)

In [21]:
# Restrict the raw export to the rows of subject 20 for a closer look at one patient.
data_20 = data[data["subject_id"] == 20]

In [22]:
data_20

Unnamed: 0,subject_id,key,gender,age,sapsii,sofa,care_unit,amine,sedation,ventilation,...,time_and_date,hr,spo2,abp_sys,abp_dias,abp_mean,observation_mean_MAP,prediction_mean_MAP,observation_mean_HR,prediction_mean_HR
0,20,20_2,1,76,22,1,2,0,0,1,...,2183-04-28 18:27:59,80.0,96.3,125.9,59.2,77.3,75.81,66.34,80.026667,80.0
1,20,20_2,1,76,22,1,2,0,0,1,...,2183-04-28 18:28:59,80.8,95.9,122.6,57.6,75.1,75.81,66.34,80.026667,80.0
2,20,20_2,1,76,22,1,2,0,0,1,...,2183-04-28 18:29:59,80.0,96.0,119.6,56.4,73.2,75.81,66.34,80.026667,80.0
3,20,20_2,1,76,22,1,2,0,0,1,...,2183-04-28 18:30:59,80.0,95.1,118.6,56.3,72.8,75.81,66.34,80.026667,80.0
4,20,20_2,1,76,22,1,2,0,0,1,...,2183-04-28 18:31:59,80.0,95.0,119.0,56.4,73.0,75.81,66.34,80.026667,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,20,20_20,1,76,22,1,2,0,0,1,...,2183-04-29 07:02:59,80.0,96.0,135.9,52.0,74.7,76.25,74.10,80.003333,80.0
516,20,20_20,1,76,22,1,2,0,0,1,...,2183-04-29 07:03:59,80.0,96.6,133.7,51.6,73.9,76.25,74.10,80.003333,80.0
517,20,20_20,1,76,22,1,2,0,0,1,...,2183-04-29 07:04:59,80.0,96.8,136.7,52.6,75.6,76.25,74.10,80.003333,80.0
518,20,20_20,1,76,22,1,2,0,0,1,...,2183-04-29 07:05:59,80.0,96.2,136.9,52.8,75.9,76.25,74.10,80.003333,80.0


In [24]:
data_20.shape

(520, 22)

In [26]:
np.unique(data_20.key.values)

array(['20_10', '20_11', '20_12', '20_15', '20_16', '20_17', '20_18',
       '20_19', '20_2', '20_20', '20_5', '20_7', '20_9'], dtype=object)

In [28]:
np.unique(data_20.window)

array(['gap', 'obs', 'pred'], dtype=object)

In [23]:
data_20.columns

Index(['subject_id', 'key', 'gender', 'age', 'sapsii', 'sofa', 'care_unit',
       'amine', 'sedation', 'ventilation', 'periode', 'window',
       'time_and_date', 'hr', 'spo2', 'abp_sys', 'abp_dias', 'abp_mean',
       'observation_mean_MAP', 'prediction_mean_MAP', 'observation_mean_HR',
       'prediction_mean_HR'],
      dtype='object')

In [18]:
# Keep only the "obs" rows. `data` is overwritten, so later cells see the
# filtered frame rather than the full export.
data = data.loc[data["window"] == "obs"]
data.shape

(2251830, 22)

In [19]:
# Fraction of rows removed by the "obs" filter:
# (rows in the full export - rows kept) / rows in the full export,
# using the two shapes printed by the cells above.
(3002440 - 2251830)/3002440

0.25

In [16]:
data.head()

Unnamed: 0,subject_id,key,gender,age,sapsii,sofa,care_unit,amine,sedation,ventilation,...,time_and_date,hr,spo2,abp_sys,abp_dias,abp_mean,observation_mean_MAP,prediction_mean_MAP,observation_mean_HR,prediction_mean_HR
0,20,20_2,1,76,22,1,2,0,0,1,...,2183-04-28 18:27:59,80.0,96.3,125.9,59.2,77.3,75.81,66.34,80.026667,80.0
1,20,20_2,1,76,22,1,2,0,0,1,...,2183-04-28 18:28:59,80.8,95.9,122.6,57.6,75.1,75.81,66.34,80.026667,80.0
2,20,20_2,1,76,22,1,2,0,0,1,...,2183-04-28 18:29:59,80.0,96.0,119.6,56.4,73.2,75.81,66.34,80.026667,80.0
3,20,20_2,1,76,22,1,2,0,0,1,...,2183-04-28 18:30:59,80.0,95.1,118.6,56.3,72.8,75.81,66.34,80.026667,80.0
4,20,20_2,1,76,22,1,2,0,0,1,...,2183-04-28 18:31:59,80.0,95.0,119.0,56.4,73.0,75.81,66.34,80.026667,80.0
