In [None]:
import os
import sys
import platform

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Resolve the project checkout directory and the input/output file paths.
#
# An explicit DVLOGGER_PROJECT_DIR environment variable takes precedence, so
# the notebook can run on machines other than the author's. Otherwise the
# per-platform defaults below are used.
system_name = platform.system()
default_github_dirs = {
    'Linux': '/home/gor/codes/dVLogger-Project',
    'Darwin': '/Users/gor/codes/dVLogger-Project',
}
github_dir = os.environ.get('DVLOGGER_PROJECT_DIR') or default_github_dirs.get(system_name)
if github_dir is None:
    # Previously this case left github_dir unbound and surfaced as a
    # NameError on the next line; fail early with an actionable message.
    raise RuntimeError(
        'No default project directory for platform %r; '
        'set the DVLOGGER_PROJECT_DIR environment variable.' % system_name
    )
# Raw Excel workbook that is read, and the compressed archive that is written.
clinical_outcomes_xlsx = os.path.join(github_dir, 'data/raw/Clinical outcomes_0707.xlsx')
clinical_outcomes_npz = os.path.join(github_dir, 'data/clinical_outcomes_0707.npz')

In [None]:
# Load the clinical outcomes workbook, then discard the row with index
# label 0, which contains data_type rather than a case.
raw_outcomes = pd.read_excel(clinical_outcomes_xlsx)
df = raw_outcomes.drop(index=0)

In [None]:
# Side-by-side histograms of the 24-hour surgical drainage volume:
# raw values on the left, natural-log values on the right.
temp = df['Surgical drainage volume (24hr)']
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(14, 4))
drainage_volume = pd.to_numeric(temp)
drainage_volume.hist(bins=50, ax=ax_raw)
np.log(drainage_volume).hist(bins=50, ax=ax_log)

In [None]:
# Case identifiers as 32-bit integers; saved later alongside the targets.
case = df['Case ID'].astype('int32')

In [None]:
y_ot = df['Operation time'].astype(np.float32)
y_log_ot = np.log(y_ot)
y_ot_1qrt = y_ot.quantile(0.25, interpolation='higher')
y_ot_3qrt = y_ot.quantile(0.75, interpolation='lower')
y_cls_ot = y_ot.copy()
y_cls_ot[~ y_cls_ot.isnull()] = 0
y_cls_ot[y_ot > y_ot_1qrt] = 1
y_cls_ot[y_ot > y_ot_3qrt] = 2

In [None]:
y_hs = df['Hospital stay'].astype(np.float32)
y_log_hs = np.log(y_hs)
y_hs_1qrt = y_hs.quantile(0.25, interpolation='higher')
y_hs_3qrt = y_hs.quantile(0.75, interpolation='lower')
y_cls_hs = y_hs.copy()
y_cls_hs[~ y_cls_hs.isnull()] = 0
y_cls_hs[y_hs > y_hs_1qrt] = 1
y_cls_hs[y_hs > y_hs_3qrt] = 2

In [None]:
y_ebl = df['Estimated blood loss'].astype(np.float32)
y_log_ebl = np.log(y_ebl)
y_ebl_1qrt = y_ebl.quantile(0.25, interpolation='higher')
y_ebl_3qrt = y_ebl.quantile(0.75, interpolation='lower')
y_cls_ebl = y_ebl.copy()
y_cls_ebl[~ y_cls_ebl.isnull()] = 0
y_cls_ebl[y_ebl > y_ebl_1qrt] = 1
y_cls_ebl[y_ebl > y_ebl_3qrt] = 2

In [None]:
y_sdv = df['Surgical drainage volume (24hr)'].astype(np.float32)
y_log_sdv = np.log(y_sdv)
y_sdv_1qrt = y_sdv.quantile(0.25, interpolation='higher')
y_sdv_3qrt = y_sdv.quantile(0.75, interpolation='lower')
y_cls_sdv = y_sdv.copy()
y_cls_sdv[~ y_cls_sdv.isnull()] = 0
y_cls_sdv[y_sdv > y_sdv_1qrt] = 1
y_cls_sdv[y_sdv > y_sdv_3qrt] = 2

In [None]:
y_dodt = df['Duration of drainage tube'].astype(np.float32)
y_log_dodt = np.log(y_dodt)
y_dodt_1qrt = y_dodt.quantile(0.25, interpolation='higher')
y_dodt_3qrt = y_dodt.quantile(0.75, interpolation='lower')
y_cls_dodt = y_dodt.copy()
y_cls_dodt[~ y_cls_dodt.isnull()] = 0
y_cls_dodt[y_dodt > y_dodt_1qrt] = 1
y_cls_dodt[y_dodt > y_dodt_3qrt] = 2

In [None]:
y_fcd = df['Foley catheter duration'].astype(np.float32)
y_log_fcd = np.log(y_fcd)
y_fcd_1qrt = y_fcd.quantile(0.25, interpolation='higher')
y_fcd_3qrt = y_fcd.quantile(0.75, interpolation='lower')
y_cls_fcd = y_fcd.copy()
y_cls_fcd[~ y_cls_fcd.isnull()] = 0
y_cls_fcd[y_fcd > y_fcd_1qrt] = 1
y_cls_fcd[y_fcd > y_fcd_3qrt] = 2

In [None]:
# Readmission column cast to 32-bit integers; used as a classification target.
y_readmission = df['Readmission'].astype('int32')

In [None]:
# One row per target column: (column name, series, training task type).
# Y, Y_col and Y_train_task_type are all derived from this table so their
# order and length cannot drift apart.
outcome_table = [
    ('y_ot', y_ot, 'regression'),
    ('y_log_ot', y_log_ot, 'regression'),
    ('y_cls_ot', y_cls_ot, 'classification'),
    ('y_hs', y_hs, 'regression'),
    ('y_log_hs', y_log_hs, 'regression'),
    ('y_cls_hs', y_cls_hs, 'classification'),
    ('y_ebl', y_ebl, 'regression'),
    ('y_log_ebl', y_log_ebl, 'regression'),
    ('y_cls_ebl', y_cls_ebl, 'classification'),
    ('y_sdv', y_sdv, 'regression'),
    ('y_log_sdv', y_log_sdv, 'regression'),
    ('y_cls_sdv', y_cls_sdv, 'classification'),
    ('y_dodt', y_dodt, 'regression'),
    ('y_log_dodt', y_log_dodt, 'regression'),
    ('y_cls_dodt', y_cls_dodt, 'classification'),
    ('y_fcd', y_fcd, 'regression'),
    ('y_log_fcd', y_log_fcd, 'regression'),
    ('y_cls_fcd', y_cls_fcd, 'classification'),
    ('y_readmission', y_readmission, 'classification'),
]

# Target matrix: each series becomes one column (stacked along axis 1).
Y = np.stack([series for _, series, _ in outcome_table], axis=1)
# Case identifiers that accompany Y.
Y_case = case
# Column names of Y, in column order.
Y_col = [name for name, _, _ in outcome_table]
# Task type for each column of Y, in column order.
Y_train_task_type = [task for _, _, task in outcome_table]

In [None]:
# Write the target matrix and its metadata to a compressed .npz archive;
# each dictionary key becomes the name of one array inside the archive.
npz_payload = {
    'Y': Y,
    'Y_case': Y_case,
    'Y_col': Y_col,
    'Y_train_task_type': Y_train_task_type,
}
np.savez_compressed(clinical_outcomes_npz, **npz_payload)