# Discretizer Transform

In this notebook, we develop and inspect the transform method of our discretizer class. We use data from the MIMIC dataset by importing a single timeseries episode.

In [21]:
from sklearn.preprocessing import OneHotEncoder


import os
import numpy as np
import json
import pandas as pd
import pdb
import time
from pathlib import Path

In [22]:
# Load the raw episode and use its 'Hours' column as the index.
X = pd.read_csv(Path("resources") / "episode1_timeseries.csv").set_index('Hours')

# Load the discretizer configuration from its JSON file.
with (Path("resources") / "discretizer_config.json").open() as file:
    config = json.load(file)

In [23]:
# Preview the first rows of the raw episode.
X.head()

Unnamed: 0_level_0,Capillary refill rate,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,Glascow coma scale motor response,Glascow coma scale total,Glascow coma scale verbal response,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
Hours,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0.305833,,,,,,,,,,,,,,,,52.3,
0.739167,,57.0,,,,,,,,,70.0,,,115.0,,,
0.7725,,,,,,,,,149.0,,,94.0,24.0,,,,
1.239167,,,,,,,,132.0,,,,,,,,,
1.239167,,,,,,,,132.0,,,,,,,,,


## Inputs and Constants

The transform function can be customized through inputs such as an end time (data length) and the config.json.

In [24]:
# Small offset that is subtracted before int() truncation in the bin computations.
eps = 1e-6
# Divisor applied to the 'Hours' values when bin indices are computed.
timestep = 1
# Per-channel settings taken from the discretizer config.
possible_values = config['possible_values']
is_categorical = config['is_categorical_channel']

In [25]:
# Number of input channels (one per dataframe column) and the raw timestamps.
N_channels = X.shape[1]
ts = list(X.index)

# Offset that timestamps are measured against.
start_timestamp = 0

# Bin index of every row. The offset is subtracted here as well, so the bin
# indices stay consistent with max_hour / N_bins when start_timestamp != 0.
# Subtracting eps keeps a timestamp that lies exactly on a bin boundary in the
# lower bin.
tsid_to_bins = [int((t - start_timestamp) / timestep - eps) for t in ts]

# Episode length and the number of bins needed to cover it.
max_hour = ts[-1] - start_timestamp
N_bins = int(max_hour / timestep + 1.0 - eps)

# One start offset per channel plus a trailing entry; all zero at this point.
begin_pos = [0] * (N_channels + 1)
cur_len = begin_pos[-1] + 1

Compute start indices of one-hot encoded columns and id-column mappings.

In [26]:
# Accumulate the start offset of every channel in the discretized layout:
# a categorical channel takes one slot per possible value, any other channel
# takes exactly one slot.
for index, column in enumerate(X.columns):
    width = len(possible_values[column]) if is_categorical[column] else 1
    begin_pos[index + 1] = begin_pos[index] + width

# The last entry is the total width of the discretized frame.
discretized_data_length = begin_pos[-1]

# zip() stops at the shorter input, so the trailing total-width entry of
# begin_pos is not mapped to any column.
column_to_id = dict(zip(X.columns, begin_pos))
id_to_column = dict(zip(begin_pos, X.columns))

# (start, end) offsets per channel, end exclusive. Zipping begin_pos on its own
# only produced 1-tuples holding the start offsets.
ranges = tuple(zip(begin_pos[:-1], begin_pos[1:]))


In [27]:
# Start offset per channel; the final entry is the total discretized width.
begin_pos

[0, 2, 3, 4, 12, 24, 37, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59]

In [28]:
# Mapping from start offset to channel name.
id_to_column

{0: 'Capillary refill rate',
 2: 'Diastolic blood pressure',
 3: 'Fraction inspired oxygen',
 4: 'Glascow coma scale eye opening',
 12: 'Glascow coma scale motor response',
 24: 'Glascow coma scale total',
 37: 'Glascow coma scale verbal response',
 49: 'Glucose',
 50: 'Heart Rate',
 51: 'Height',
 52: 'Mean blood pressure',
 53: 'Oxygen saturation',
 54: 'Respiratory rate',
 55: 'Systolic blood pressure',
 56: 'Temperature',
 57: 'Weight',
 58: 'pH'}

## Binning

This step will not be necessary for the smartmeter dataset, as the binning is already done on load data. For mimic we will need to bin the data into equidistant time ranges.

In [29]:
# Work on a copy of the raw episode that carries each row's bin index.
reduced_data = X.assign(bins=tsid_to_bins)
reduced_data.bins

Hours
0.305833        0
0.739167        0
0.772500        0
1.239167        1
1.239167        1
             ... 
107.305833    107
108.305833    108
108.405833    108
109.305833    109
109.705833    109
Name: bins, Length: 203, dtype: int64

In [30]:
# Forward-fill within each bin, so the last row of a bin holds the most recent
# value of every channel seen in that bin, then attach the bin ids again.
reduced_data = reduced_data.groupby('bins').ffill().assign(bins=reduced_data.bins)          

In [31]:
# Preview the forward-filled rows together with their bin ids.
reduced_data.head()

Unnamed: 0_level_0,Capillary refill rate,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,Glascow coma scale motor response,Glascow coma scale total,Glascow coma scale verbal response,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH,bins
Hours,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0.305833,,,,,,,,,,,,,,,,52.3,,0
0.739167,,57.0,,,,,,,,,70.0,,,115.0,,52.3,,0
0.7725,,57.0,,,,,,,149.0,,70.0,94.0,24.0,115.0,,52.3,,0
1.239167,,,,,,,,132.0,,,,,,,,,,1
1.239167,,,,,,,,132.0,,,,,,,,,,1


In [32]:
# Keep only the last row of every bin.
reduced_data = reduced_data.drop_duplicates(subset='bins', keep='last')

In [33]:
# Index by bin id and reindex over 0..N_bins-1; a bin id without any row
# becomes an all-NaN row.
reduced_data = reduced_data.set_index('bins').reindex(range(N_bins))
reduced_data.head()

Unnamed: 0_level_0,Capillary refill rate,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,Glascow coma scale motor response,Glascow coma scale total,Glascow coma scale verbal response,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,,57.0,,,,,,,149.0,,70.0,94.0,24.0,115.0,,52.3,
1,,51.0,,,,,,113.0,130.0,,-8.0,96.0,20.0,85.0,38.944444,,7.31
2,,52.0,,,,,,,122.0,,61.0,99.0,17.0,89.0,,,
3,,60.0,,Spontaneously,Obeys Commands,,Oriented,,118.0,,74.0,97.0,16.0,99.0,38.166667,,
4,,51.0,,,,,,,116.0,,63.0,97.0,22.0,80.0,,,


In [34]:
# Column labels of the binned frame.
reduced_data.columns

Index(['Capillary refill rate', 'Diastolic blood pressure',
       'Fraction inspired oxygen', 'Glascow coma scale eye opening',
       'Glascow coma scale motor response', 'Glascow coma scale total',
       'Glascow coma scale verbal response', 'Glucose', 'Heart Rate', 'Height',
       'Mean blood pressure', 'Oxygen saturation', 'Respiratory rate',
       'Systolic blood pressure', 'Temperature', 'Weight', 'pH'],
      dtype='object')

## Imputation

We impute the dataframe: missing values are first forward-filled, and anything still missing is replaced with the normal values from the discretizer_config file.

In [35]:
# Carry the last observed value of every channel forward across bins.
reduced_data = reduced_data.ffill()

# Whatever is still missing (nothing was observed earlier) gets the channel's
# normal value from the config.
for column in reduced_data:
    reduced_data[column] = reduced_data[column].replace(np.nan, config['normal_values'][column])

# Preview the imputed frame. This must be the last top-level expression of the
# cell to be displayed; inside the loop body it produced no output.
reduced_data.head()

## One-Hot Encoding
Next we will expand the dataframe columns and one-hot encode the categorical data.

In [36]:
# Column labels of the expanded frame: a slot that starts a channel is named
# after that channel, every other slot is labelled with its position.
disc_columns = [id_to_column.get(position, str(position)) for position in range(discretized_data_length)]
discretized_data = pd.DataFrame(np.zeros((N_bins, discretized_data_length), dtype=float), columns=disc_columns)

# Copy the imputed values into the slots that carry a channel name.
for column in discretized_data:
    if column not in X.columns:
        continue
    discretized_data[column] = reduced_data[column]

mask = np.zeros((N_bins, N_channels), dtype=int)

In [37]:
# Preview the expanded frame before the categorical slots are one-hot encoded.
discretized_data.head()

Unnamed: 0,Capillary refill rate,1,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,5,6,7,8,9,...,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
0,0.0,0.0,57.0,0.21,4 Spontaneously,0.0,0.0,0.0,0.0,0.0,...,128.0,149.0,170.0,70.0,94.0,24.0,115.0,36.6,52.3,7.4
1,0.0,0.0,51.0,0.21,4 Spontaneously,0.0,0.0,0.0,0.0,0.0,...,113.0,130.0,170.0,-8.0,96.0,20.0,85.0,38.944444,52.3,7.31
2,0.0,0.0,52.0,0.21,4 Spontaneously,0.0,0.0,0.0,0.0,0.0,...,113.0,122.0,170.0,61.0,99.0,17.0,89.0,38.944444,52.3,7.31
3,0.0,0.0,60.0,0.21,Spontaneously,0.0,0.0,0.0,0.0,0.0,...,113.0,118.0,170.0,74.0,97.0,16.0,99.0,38.166667,52.3,7.31
4,0.0,0.0,51.0,0.21,Spontaneously,0.0,0.0,0.0,0.0,0.0,...,113.0,116.0,170.0,63.0,97.0,22.0,80.0,38.166667,52.3,7.31


In [38]:
# One-hot encode every categorical channel into its slot range.
for column in reduced_data:
    if is_categorical[column]:
        categ = possible_values[column]
        first_slot = column_to_id[column]
        last_slot = first_slot + len(categ)

        # One encoder per channel, with the category order taken from the
        # config and unknown values ignored by the encoder.
        encoder = OneHotEncoder(categories=list(np.array(categ).reshape(1, len(categ))), handle_unknown='ignore')
        encoded = encoder.fit_transform(reduced_data[column].astype(str).values.reshape(-1, 1))
        discretized_data.iloc[:, first_slot:last_slot] = pd.DataFrame(encoded.toarray(), columns=categ)

In [39]:
# Preview the frame after one-hot encoding.
discretized_data.head()

Unnamed: 0,Capillary refill rate,1,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,5,6,7,8,9,...,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
0,1.0,0.0,57.0,0.21,0.0,0.0,0.0,1.0,0.0,0.0,...,128.0,149.0,170.0,70.0,94.0,24.0,115.0,36.6,52.3,7.4
1,1.0,0.0,51.0,0.21,0.0,0.0,0.0,1.0,0.0,0.0,...,113.0,130.0,170.0,-8.0,96.0,20.0,85.0,38.944444,52.3,7.31
2,1.0,0.0,52.0,0.21,0.0,0.0,0.0,1.0,0.0,0.0,...,113.0,122.0,170.0,61.0,99.0,17.0,89.0,38.944444,52.3,7.31
3,1.0,0.0,60.0,0.21,0.0,0.0,0.0,0.0,0.0,0.0,...,113.0,118.0,170.0,74.0,97.0,16.0,99.0,38.166667,52.3,7.31
4,1.0,0.0,51.0,0.21,0.0,0.0,0.0,0.0,0.0,0.0,...,113.0,116.0,170.0,63.0,97.0,22.0,80.0,38.166667,52.3,7.31


## All at once & Timing

In [40]:
# The complete pipeline from the sections above in a single cell, timed per stage.
start_total = time.time()

# Constants and mappings
start = time.time()
N_channels = X.shape[1]
ts = list(X.index)
start_timestamp = 0
# Bin index per row. The offset is subtracted so the bins stay consistent with
# max_hour / N_bins when start_timestamp is non-zero.
tsid_to_bins = [int((t - start_timestamp) / timestep - eps) for t in ts]
max_hour = ts[-1] - start_timestamp
N_bins = int(max_hour / timestep + 1.0 - eps)
begin_pos = [0] * (N_channels + 1)
cur_len = begin_pos[-1] + 1

# Start offset of every channel in the discretized layout: one slot per
# possible value for categorical channels, one slot otherwise.
for index, column in enumerate(X.columns):
    if is_categorical[column]:
        begin_pos[index + 1] = begin_pos[index] + len(possible_values[column])
    else:
        begin_pos[index + 1] = begin_pos[index] + 1

discretized_data_length = begin_pos[-1]
column_to_id = dict(zip(X.columns, begin_pos))
id_to_column = dict(zip(begin_pos, X.columns))

end = time.time()
# Measured with this stage's own timer, like every other stage.
print(f"Constants: {end-start}")

# Binning
start = time.time()

reduced_data = X.copy()
reduced_data['bins'] = tsid_to_bins

# Forward-fill inside each bin, keep the last row per bin, then reindex over
# all bin ids so that bins without rows become all-NaN rows.
reduced_data = reduced_data.groupby('bins').ffill().assign(bins=reduced_data.bins)
reduced_data = reduced_data.drop_duplicates(subset='bins', keep='last')
reduced_data = reduced_data.set_index('bins').reindex(range(N_bins))

end = time.time()
print(f"Binning: {end-start}")

# Imputation
start = time.time()
# Forward-fill across bins first; what is still missing gets the normal value.
reduced_data = reduced_data.ffill()
for column in reduced_data:
    reduced_data[column] = reduced_data[column].replace(np.nan, config['normal_values'][column])

end = time.time()
print(f"Imputation: {end-start}")

# One-Hot encoding
start = time.time()
disc_columns = [id_to_column[i] if i in id_to_column.keys() else str(i) for i in range(discretized_data_length)]
discretized_data = pd.DataFrame(np.zeros(shape=(N_bins, discretized_data_length), dtype=float), columns=disc_columns)

# Copy the imputed values into the slots that carry a channel name.
for column in discretized_data:
    if column in X.columns:
        discretized_data[column] = reduced_data[column]

mask = np.zeros(shape=(N_bins, N_channels), dtype=int)

# Overwrite the slot range of every categorical channel with its one-hot encoding.
for column in reduced_data:
    if not is_categorical[column]:
        continue
    categ = possible_values[column]
    start_index = column_to_id[column]
    column_range = (start_index, start_index + len(possible_values[column]))

    oe_style = OneHotEncoder(categories=list(np.array(categ).reshape(1, len(categ))), handle_unknown='ignore')
    oe_results = oe_style.fit_transform(reduced_data[column].astype(str).values.reshape(-1, 1))
    discretized_data.iloc[:,column_range[0]:column_range[1]] = pd.DataFrame(oe_results.toarray(), columns=categ)

end_total = time.time()
print(f"One-Hot Encoding: {end_total-start}")
print(f"Total: {end_total-start_total}")

Constants: 0.0004382133483886719
Binning: 0.007100105285644531
Imputation: 0.006123065948486328
One-Hot Encoding: 0.009690523147583008
Total: 0.023807048797607422


# Original Transform Function

In [53]:
# Settings and running counters for the original implementation in the cells below.
eps = 1e-6
timestep = 1
N_channels = X.shape[1]
start_timestamp = 0
# NOTE: `ts` is not defined in this cell; it comes from an earlier cell.
max_hour = ts[-1] - start_timestamp
N_bins = int(max_hour / timestep + 1.0 - eps)
begin_pos = [0 for i in range(N_channels + 1)]
cur_len = begin_pos[-1] + 1
# Per-channel settings read from the discretizer config.
possible_values = config['possible_values']
is_categorical = config['is_categorical_channel']
normal_values = config['normal_values']
id_to_channel = config['id_to_channel']
# Position of every channel inside id_to_channel.
channel_to_id = dict(zip(id_to_channel, range(len(id_to_channel))))
# Expected column layout of the raw rows: 'Hours' followed by the channels.
header = ['Hours'] + id_to_channel
# Running statistics that the transform cell adds to.
done_count = 0
empty_bins_sum = 0
unused_data_sum =  0
# None makes the transform cell use the largest timestamp as the episode end.
end = None
# 'zero' or 'relative'; any other value raises ValueError in the transform cell.
start_time = "zero"
# One of 'zero', 'normal_value', 'previous', 'next'.
impute_strategy = "previous"
# When True, the transform cell appends the mask columns to the data.
store_masks = False

In [54]:
# Reload the episode without setting an index, so 'Hours' stays a regular column.
X = pd.read_csv(Path("resources", "episode1_timeseries.csv"))
# Convert every cell to a string and turn the resulting 'nan' strings into ''.
X = X.astype(str).replace({'nan': ''})
# NOTE: this is not the last expression of the cell, so the preview is not displayed.
X.head()
# From here on X is a plain array of strings and no longer a DataFrame.
X = X.values
X

array([['0.3058333333333333', '', '', ..., '', '52.3', ''],
       ['0.7391666666666666', '', '57.0', ..., '', '', ''],
       ['0.7725000000000001', '', '', ..., '', '', ''],
       ...,
       ['108.40583333333332', '', '53.0', ..., '', '', ''],
       ['109.30583333333334', '', '', ..., '', '', ''],
       ['109.70583333333336', '', '54.0', ..., '', '', '']], dtype=object)

In [55]:
start = time.time()
assert header[0] == "Hours"
eps = 1e-6


# number of basechannels
N_channels = len(id_to_channel)
# timestamps as list
ts = [float(row[0]) for row in X]
# check timesteps are in series
for i in range(len(ts) - 1):
    assert ts[i] < ts[i+1] + eps

if start_time == 'relative':
    first_time = ts[0]
elif start_time == 'zero':
    first_time = 0
else:
    raise ValueError("start_time is invalid")


if end is None:
    max_hours = max(ts) - first_time
else:
    max_hours = end - first_time
# starts at zero, ends at the last hour

# ceil max_hours
N_bins = int(max_hours / timestep + 1.0 - eps)


cur_len = 0
# position of the beginning in the column stace
begin_pos = [0 for i in range(N_channels)]
# unused
end_pos = [0 for i in range(N_channels)]
for i in range(N_channels):
    channel = id_to_channel[i]
    begin_pos[i] = cur_len
    if is_categorical[channel]:
        end_pos[i] = begin_pos[i] + len(possible_values[channel])
    else:
        end_pos[i] = begin_pos[i] + 1
    cur_len = end_pos[i]


data = np.zeros(shape=(N_bins, cur_len), dtype=float)
mask = np.zeros(shape=(N_bins, N_channels), dtype=int)
original_value = [["" for j in range(N_channels)] for i in range(N_bins)]
total_data = 0
unused_data = 0


def write(data, bin_id, channel, value, begin_pos):
    """Write one value of `channel` into row `bin_id` of `data`.

    A categorical channel is written as a one-hot vector over the channel's
    columns (one column per entry of possible_values[channel]); any other
    channel is written as a single float at the channel's start column.
    """
    offset = begin_pos[channel_to_id[channel]]
    if not is_categorical[channel]:
        data[bin_id, offset] = float(value)
        return
    categories = possible_values[channel]
    # list.index raises ValueError when the value is not a listed category.
    hot = categories.index(value)
    for slot in range(len(categories)):
        data[bin_id, offset + slot] = 1.0 if slot == hot else 0.0
# Original transform, part 2: write every observed value into its bin.
# Maps the row index to the bin that row was assigned to.
fill_patern = dict()

for index, row in enumerate(X):
    # time of the current row relative to first_time
    t = float(row[0]) - first_time
    # rows past the end of the episode are skipped
    if t > max_hours + eps:
        continue
    # There might be fewer bins than rows, so several rows can share a bin
    bin_id = int(t / timestep - eps)
    assert 0 <= bin_id < N_bins
    fill_patern[index] = bin_id

    # column 0 holds the time; the remaining columns are the channels
    for j in range(1, len(row)):
        # an empty string marks a missing observation
        if row[j] == "":
            continue
        channel = header[j]
        channel_id = channel_to_id[channel]

        total_data += 1
        # If the mask is already one, a value has been written for this bin and
        # channel before; it gets overwritten below and is counted as unused
        if mask[bin_id][channel_id] == 1:
            unused_data += 1
        mask[bin_id][channel_id] = 1

        # Write or overwrite
        write(data, bin_id, channel, row[j], begin_pos)
        original_value[bin_id][channel_id] = row[j]



# impute missing values

# Original transform, part 3: impute the bins that have no observation.
# For 'zero' neither branch below runs, so those entries keep the zeros that
# `data` was initialised with.
if impute_strategy not in ['zero', 'normal_value', 'previous', 'next']:
    raise ValueError("impute strategy is invalid")

if impute_strategy in ['normal_value', 'previous']:
    # per channel: the raw values observed so far, in bin order
    prev_values = [[] for i in range(len(id_to_channel))]
    for bin_id in range(N_bins):
        for channel in id_to_channel:
            channel_id = channel_to_id[channel]
            # observed bins are only recorded, never imputed
            if mask[bin_id][channel_id] == 1:
                prev_values[channel_id].append(original_value[bin_id][channel_id])
                continue
            if impute_strategy == 'normal_value':
                imputed_value = normal_values[channel]
            if impute_strategy == 'previous':
                # fall back to the normal value while nothing has been observed yet
                if len(prev_values[channel_id]) == 0:
                    imputed_value = normal_values[channel]
                else:
                    imputed_value = prev_values[channel_id][-1]
            write(data, bin_id, channel, imputed_value, begin_pos)


if impute_strategy == 'next':
    # same bookkeeping as above, but walking the bins from last to first, so
    # the "previous" value is the next observation in time
    prev_values = [[] for i in range(len(id_to_channel))]
    for bin_id in range(N_bins-1, -1, -1):
        for channel in id_to_channel:
            channel_id = channel_to_id[channel]
            if mask[bin_id][channel_id] == 1:
                prev_values[channel_id].append(original_value[bin_id][channel_id])
                continue
            if len(prev_values[channel_id]) == 0:
                imputed_value = normal_values[channel]
            else:
                imputed_value = prev_values[channel_id][-1]
            write(data, bin_id, channel, imputed_value, begin_pos)


# Original transform, part 4: statistics, optional mask columns, header, timing.

# Number of bins in which no channel was observed.
empty_bins = np.sum([1 - min(1, np.sum(mask[i, :])) for i in range(N_bins)])
done_count += 1
# eps in the denominators avoids a division by zero.
empty_bins_sum += empty_bins / (N_bins + eps)
unused_data_sum += unused_data / (total_data + eps)

# Optionally append the observation mask as extra columns.
if store_masks:
    data = np.hstack([data, mask.astype(np.float32)])

# create new header: "<channel>-><value>" for every slot of a categorical
# channel, the plain channel name otherwise
new_header = []
for channel in id_to_channel:
    if is_categorical[channel]:
        values = possible_values[channel]
        for value in values:
            new_header.append(channel + "->" + value)
    else:
        new_header.append(channel)

if store_masks:
    for i in range(len(id_to_channel)):
        channel = id_to_channel[i]
        new_header.append("mask->" + channel)

new_header = ",".join(new_header)

# The timer gets its own name: `end` is the episode end-time setting
# (None = no cut-off) that the transform reads on its next run, so storing a
# wall-clock timestamp in it would corrupt a re-run of the transform cell.
elapsed = time.time() - start
print(elapsed)

0.006739616394042969


In [56]:
# Column layout used for the raw rows: 'Hours' followed by the channel names.
header

['Hours',
 'Capillary refill rate',
 'Diastolic blood pressure',
 'Fraction inspired oxygen',
 'Glascow coma scale eye opening',
 'Glascow coma scale motor response',
 'Glascow coma scale total',
 'Glascow coma scale verbal response',
 'Glucose',
 'Heart Rate',
 'Height',
 'Mean blood pressure',
 'Oxygen saturation',
 'Respiratory rate',
 'Systolic blood pressure',
 'Temperature',
 'Weight',
 'pH']

In [57]:
# Preview the output matrix of the original implementation.
pd.DataFrame(data).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,1.0,0.0,57.0,0.21,0.0,0.0,0.0,1.0,0.0,0.0,...,128.0,149.0,170.0,70.0,94.0,24.0,115.0,36.6,52.3,7.4
1,1.0,0.0,51.0,0.21,0.0,0.0,0.0,1.0,0.0,0.0,...,113.0,130.0,170.0,-8.0,96.0,20.0,85.0,38.944444,52.3,7.31
2,1.0,0.0,52.0,0.21,0.0,0.0,0.0,1.0,0.0,0.0,...,113.0,122.0,170.0,61.0,99.0,17.0,89.0,38.944444,52.3,7.31
3,1.0,0.0,60.0,0.21,0.0,0.0,0.0,0.0,0.0,0.0,...,113.0,118.0,170.0,74.0,97.0,16.0,99.0,38.166667,52.3,7.31
4,1.0,0.0,51.0,0.21,0.0,0.0,0.0,0.0,0.0,0.0,...,113.0,116.0,170.0,63.0,97.0,22.0,80.0,38.166667,52.3,7.31


In [58]:
# Element-wise equality between the original output and the pandas-based output.
correspondance_map = data == discretized_data.astype(float).values

In [59]:
# Copy of the pandas-based output with positional column labels, matching the
# default labels of pd.DataFrame(data).
compare_disc = discretized_data.copy()
compare_disc.columns = range(compare_disc.shape[1])

In [60]:
# Preview the relabelled pandas-based output as floats.
compare_disc.astype(float).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,49,50,51,52,53,54,55,56,57,58
0,1.0,0.0,57.0,0.21,0.0,0.0,0.0,1.0,0.0,0.0,...,128.0,149.0,170.0,70.0,94.0,24.0,115.0,36.6,52.3,7.4
1,1.0,0.0,51.0,0.21,0.0,0.0,0.0,1.0,0.0,0.0,...,113.0,130.0,170.0,-8.0,96.0,20.0,85.0,38.944444,52.3,7.31
2,1.0,0.0,52.0,0.21,0.0,0.0,0.0,1.0,0.0,0.0,...,113.0,122.0,170.0,61.0,99.0,17.0,89.0,38.944444,52.3,7.31
3,1.0,0.0,60.0,0.21,0.0,0.0,0.0,0.0,0.0,0.0,...,113.0,118.0,170.0,74.0,97.0,16.0,99.0,38.166667,52.3,7.31
4,1.0,0.0,51.0,0.21,0.0,0.0,0.0,0.0,0.0,0.0,...,113.0,116.0,170.0,63.0,97.0,22.0,80.0,38.166667,52.3,7.31


In [61]:
# Boolean frame that is True wherever the two outputs differ.
comp_df = pd.DataFrame(data) != compare_disc.astype(float)

In [62]:
# True if at least one entry differs between the two outputs.
comp_df.any().any()

False

In [64]:
# Write the pandas-based output to a CSV file without the row index.
discretized_data.to_csv("discretized_data.csv", index=False)