In [1]:
# Small positive constant. No cell visible in this notebook references it --
# presumably a numerical-stability guard carried over from a template
# (TODO confirm it is still needed, otherwise remove).
_EPSILON = 1e-08

import os

import numpy as np
import pandas as pd
import random
import os

In [2]:
# Controls (set, then restart the kernel and re-run the whole notebook).
# Selects which imputed inputs are read and where the outputs are written:
#   False -> "after imputation"        / "version1 (integrated)"
#   True  -> "after imputation (MICE)" / "version1 (integrated) (MICE)"
USE_MICE_VERSION = False

In [3]:
# Choose the input folder (under ./data/processed/) and the output folder
# (under ./data/processed/final/) that match the selected imputation variant.
if USE_MICE_VERSION:
    data_processed_dir = "after imputation (MICE)"
    out_dir = "version1 (integrated) (MICE)"
else:
    data_processed_dir = "after imputation"
    out_dir = "version1 (integrated)"

_out_dir = f'./data/processed/final/{out_dir}/'
# exist_ok=True creates the folder only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(_out_dir, exist_ok=True)

In [4]:
# Baseline table for the selected imputation variant.
df_s = pd.read_csv(f'./data/processed/{data_processed_dir}/baseline.csv')

# Temporal table. When several rows share the same
# ('New ID', 'Days Since Diagnosis') pair only the last one is kept, and the
# index is rebuilt so it is a clean 0..n-1 range afterwards.
df_t = (
    pd.read_csv(f'./data/processed/{data_processed_dir}/temporal.csv')
    .drop_duplicates(subset=['New ID', 'Days Since Diagnosis'], keep='last')
    .reset_index(drop=True)
)

In [5]:
# Turn the absolute 'Days Since Diagnosis' column into 'Delta', the gap to the
# previous row of the same 'New ID'. The first row of every 'New ID' keeps its
# absolute value, since it has no previous row.
#
# NOTE: this cell renames the column, so re-running it on its own raises a
# KeyError -- re-run the loading cell first.
# NOTE(review): rows are assumed to be in chronological order within each
# 'New ID'; nothing visible here sorts them -- confirm for the input CSV.

# Snapshot of the absolute timeline, taken before the column is replaced.
# The explicit copy keeps it independent of any later change to df_t.
time  = df_t[['New ID', 'Days Since Diagnosis']].copy()

_days           = df_t['Days Since Diagnosis']
_is_first_visit = ~df_t.duplicated(subset=['New ID'], keep='first')

first = df_t.loc[_is_first_visit, ['Days Since Diagnosis']]

# Differences are taken per 'New ID', so a row is never compared with a row of
# a different ID even when one ID's rows are not stored contiguously.
diff  = _days.groupby(df_t['New ID'], sort=False).diff()
diff  = diff.where(~_is_first_visit, _days)

# Build a new frame rather than mutating df_t in place.
df_t = (
    df_t
    .assign(**{'Days Since Diagnosis': diff})
    .rename(columns={'Days Since Diagnosis': 'Delta'})
)

In [6]:
# Sanity check: log(1) is 0, which is why zero-valued 'Delta' entries are
# replaced by 1 before the log transform applied later in this notebook.
np.log(1)

0.0

In [7]:
# Columns taken from the baseline table as static features.
feat_static = [
    'Exact age at diagnosis', 'Number of negative biopsies before diagnosis',
    'Number of MRI-visible lesions', 'Ethnicity', 'Family History of Prostate Cancer',
    'CPG', 'PI-RADS score', 'STRATCANS (simplified)',
]

# Columns taken from the temporal table as time-varying features.
feat_timevarying = [
    'Delta', 'Repeat PSA',
    'Repeat Biopsy Core Total', 'Repeat Biopsy Core Positive', 'Repeat Biopsy Primary Gleason',
    'Repeat Biopsy Secondary Gleason', 'Repeat Biopsy Grade Group',
    'Repeat MRI PRECISE Scoring', 'Repeat MRI Stage', 'Repeat MRI Volume', 'Repeat MRI PSAd',
]

# Columns taken from the baseline table as the label and its time value.
feat_label = [
    'Coding.3', 'Days since diagnosis.3'
]

# Index lists saved alongside the arrays.
# NOTE(review): feat_timevarying has 11 entries but only indices 0-9 are listed
# here -- confirm whether one feature (e.g. 'Delta' or the last column) is
# excluded on purpose or whether this list is stale.
xt_bin_list = []
xt_con_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


def _rows_for_ids(frame, ids, key='New ID'):
    """Return the rows of `frame` whose `key` value is in `ids`, ordered like `ids`.

    Rows are ordered by the position of their key in `ids` (stable, so rows
    sharing a key keep their relative order) and the index is reset.
    `ids` must not contain repeated values.
    """
    position = pd.Series(np.arange(len(ids)), index=ids)
    kept = frame[frame[key].isin(ids)]
    order = np.argsort(kept[key].map(position).to_numpy(), kind='stable')
    return kept.iloc[order].reset_index(drop=True)


data_s = df_s[['New ID'] + feat_static]
# Explicit copy: later cells assign into data_t, which must not be a slice of df_t.
data_t = df_t[['New ID'] + feat_timevarying].copy()
label  = df_s[['New ID'] + feat_label]

# Keep only IDs that have temporal rows, and put the static rows and labels in
# the order in which the IDs first appear in the temporal table. The padded
# sequence arrays are built in that same order, so row i refers to the same ID
# in every exported array even if the two CSV files list IDs differently.
_temporal_ids = data_t['New ID'].unique()
data_s = _rows_for_ids(data_s, _temporal_ids)
label  = _rows_for_ids(label, _temporal_ids)

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Log-transform 'Delta' to tame its very skewed distribution. Entries equal to
# 0 are replaced by 1 first, because log(0) would be -inf.
_delta          = data_t['Delta']
data_t['Delta'] = np.log(_delta.where(_delta != 0, 1.))

# Fit one min-max scaler per feature group, then overwrite the feature columns
# with their scaled values. The fitted scalers stay available as
# scaler_s / scaler_t.
scaler_s = MinMaxScaler().fit(data_s[feat_static])
scaler_t = MinMaxScaler().fit(data_t[feat_timevarying])

data_s[feat_static]      = scaler_s.transform(data_s[feat_static])
data_t[feat_timevarying] = scaler_t.transform(data_t[feat_timevarying])

In [9]:
# Static feature matrix, one row per row of data_s. Columns are picked by name
# so the 'New ID' column never enters the array conversion; a non-numeric ID
# would otherwise turn the whole array into dtype=object.
xs     = data_s[feat_static].to_numpy(dtype=float)

In [10]:
# Zero-padded time-varying array xt with shape
# (number of IDs, longest sequence, number of time-varying features).
id_list    = data_t['New ID'].unique()
# Group by the scalar key rather than a one-element list, so get_group()
# accepts a plain ID on every pandas version.
grouped    = data_t.groupby('New ID')
# size() counts rows; count()['Delta'] would skip rows whose Delta is NaN and
# could then be smaller than the sequence written below.
max_length = grouped.size().max()  # original author's note: max_length = 48

xt         = np.zeros([len(id_list), max_length, len(feat_timevarying)])
for i, pid in enumerate(id_list):
    seq = grouped.get_group(pid)[feat_timevarying].to_numpy(dtype=float)
    # Rows past the end of this ID's sequence stay at the zero padding value.
    xt[i, :seq.shape[0], :] = seq

In [11]:
# Zero-padded array t of absolute time values with shape
# (number of IDs, max_length, 1), built from the `time` snapshot.
id_list    = time['New ID'].unique()
# Group by the scalar key rather than a one-element list, so get_group()
# accepts a plain ID on every pandas version.
grouped    = time.groupby('New ID')

t          = np.zeros([len(id_list), max_length, 1])
for i, pid in enumerate(id_list):
    # Everything after the leading 'New ID' column; the ID itself is kept out
    # of the array conversion.
    days = grouped.get_group(pid).iloc[:, 1:].to_numpy(dtype=float)
    # Rows past the end of this ID's sequence stay at the zero padding value.
    t[i, :days.shape[0], :] = days

In [12]:
# Label column and its associated time column as (n_rows, 1) arrays; the
# double brackets keep each selection two-dimensional.
y   = label[['Coding.3']].to_numpy()
tte = label[['Days since diagnosis.3']].to_numpy()

In [13]:
import pickle

# Write all arrays and the feature-name / index lists into one .npz archive.
np.savez(
    f'./data/processed/final/{out_dir}/dataset.npz',
    data_xs   = xs,
    data_xt   = xt,
    data_time = t,
    data_y    = y,
    data_tte  = tte,
    feat_static = feat_static,
    feat_timevarying = feat_timevarying,

    xt_bin_list = xt_bin_list,
    xt_con_list = xt_con_list
)

# Persist the fitted scalers next to the dataset. The `with` blocks make sure
# each file is flushed and closed as soon as the dump finishes.
with open(f'./data/processed/final/{out_dir}/scaler_s.pkl', 'wb') as _scaler_file:
    pickle.dump(scaler_s, _scaler_file)

with open(f'./data/processed/final/{out_dir}/scaler_t.pkl', 'wb') as _scaler_file:
    pickle.dump(scaler_t, _scaler_file)