# Normalizer
In this notebook, we develop and inspect the normalizer using the template provided by the MIMIC benchmark. Normalization is the last preprocessing step needed to achieve the expected results with LSTM models.

In [118]:
import json
import os
import pdb
import pickle
import platform
import time
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [14]:
# Read the discretizer output from the resources folder and preview the first rows.
X = pd.read_csv(Path("resources") / "discretized_data.csv")
X.head(n=5)

Unnamed: 0,Capillary refill rate,1,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,5,6,7,8,9,...,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
0,1.0,0.0,57.0,0.21,0.0,0.0,0.0,1.0,0.0,0.0,...,128.0,149.0,170.0,70.0,94.0,24.0,115.0,36.6,52.3,7.4
1,1.0,0.0,51.0,0.21,0.0,0.0,0.0,1.0,0.0,0.0,...,113.0,130.0,170.0,-8.0,96.0,20.0,85.0,38.944444,52.3,7.31
2,1.0,0.0,52.0,0.21,0.0,0.0,0.0,1.0,0.0,0.0,...,113.0,122.0,170.0,61.0,99.0,17.0,89.0,38.944444,52.3,7.31
3,1.0,0.0,60.0,0.21,0.0,0.0,0.0,0.0,0.0,0.0,...,113.0,118.0,170.0,74.0,97.0,16.0,99.0,38.166667,52.3,7.31
4,1.0,0.0,51.0,0.21,0.0,0.0,0.0,0.0,0.0,0.0,...,113.0,116.0,170.0,63.0,97.0,22.0,80.0,38.166667,52.3,7.31


In [15]:
# Parse the discretizer configuration (JSON) stored next to the data.
with open(Path("resources") / "discretizer_config.json") as file:
    config = json.loads(file.read())

## Original Code

In [16]:
class Normalizer:
    """Standardize selected columns using streamed mean / standard deviation.

    Statistics are accumulated batch by batch with ``_feed_data`` (running
    count, sum and sum of squares per column), finalized and pickled with
    ``_save_params``, restored with ``load_params`` and applied with
    ``transform``.

    Parameters
    ----------
    fields : iterable of int, optional
        Column positions that ``transform`` standardizes. When ``None``,
        every column of the input is standardized.
    """

    def __init__(self, fields=None):
        self._means = None
        self._stds = None
        self._fields = None
        if fields is not None:
            # Copy so later changes to the caller's iterable do not affect us.
            self._fields = [col for col in fields]

        # Running statistics filled in by _feed_data.
        self._sum_x = None
        self._sum_sq_x = None
        self._count = 0

    def _feed_data(self, x):
        """Add one batch of rows to the running column statistics.

        Parameters
        ----------
        x : array-like
            Batch whose first axis indexes samples. It is converted to a
            float array so that the accumulators are always floating point
            (integer accumulators would overflow on ``x**2`` for large values
            and cannot be updated in place with a later float batch).
        """
        x = np.array(x, dtype=float)
        self._count += x.shape[0]
        if self._sum_x is None:
            self._sum_x = np.sum(x, axis=0)
            self._sum_sq_x = np.sum(x**2, axis=0)
        else:
            self._sum_x += np.sum(x, axis=0)
            self._sum_sq_x += np.sum(x**2, axis=0)

    def _save_params(self, save_file_path):
        """Finalize means / standard deviations and pickle them.

        The standard deviation uses the ``N - 1`` denominator and is floored
        at ``eps`` so that ``transform`` never divides by zero. The pickled
        object is a dict with the keys ``'means'`` and ``'stds'``.

        Parameters
        ----------
        save_file_path : path-like
            Destination of the pickle file.

        Raises
        ------
        ValueError
            If fewer than two samples have been fed. The check happens before
            the file is opened, so an existing file is not truncated.
        """
        eps = 1e-7
        N = self._count
        if self._sum_x is None or N < 2:
            raise ValueError(
                "At least two samples must be fed before the parameters can be "
                "saved (got {}).".format(N))

        self._means = 1.0 / N * self._sum_x
        sum_sq_dev = self._sum_sq_x - 2.0 * self._sum_x * self._means + N * self._means**2
        # Floating point cancellation can leave a tiny negative value for
        # (near-)constant columns; the square root would then be NaN and slip
        # past the eps floor below.
        sum_sq_dev = np.maximum(sum_sq_dev, 0.0)
        self._stds = np.sqrt(1.0/(N - 1) * sum_sq_dev)
        self._stds = np.maximum(self._stds, eps)

        with open(save_file_path, "wb") as save_file:
            pickle.dump(obj={'means': self._means,
                             'stds': self._stds},
                        file=save_file,
                        protocol=2)

    def load_params(self, load_file_path):
        """Restore means / standard deviations written by ``_save_params``.

        NOTE: ``pickle.load`` can execute arbitrary code; only load parameter
        files from a trusted source.

        Parameters
        ----------
        load_file_path : path-like
            Pickle file holding a dict with the keys ``'means'`` and ``'stds'``.
        """
        with open(load_file_path, "rb") as load_file:
            if platform.python_version()[0] == '2':
                dct = pickle.load(load_file)
            else:
                # latin1 lets Python 3 read pickles that were written by Python 2.
                dct = pickle.load(load_file, encoding='latin1')
            self._means = dct['means']
            self._stds = dct['stds']

    def transform(self, X):
        """Return a standardized copy of ``X``.

        Parameters
        ----------
        X : array supporting ``X[:, col]`` indexing (e.g. a 2-D numpy array)
            Data to normalize; it is not modified.

        Returns
        -------
        A new array (``1.0 * X``) in which each column listed in ``fields``
        (or every column when ``fields`` is ``None``) is replaced by
        ``(X[:, col] - mean[col]) / std[col]``.
        """
        if self._fields is None:
            fields = range(X.shape[1])
        else:
            fields = self._fields
        ret = 1.0 * X
        for col in fields:
            ret[:, col] = (X[:, col] - self._means[col]) / self._stds[col]
        return ret

## Exploration

In [41]:
# Per-channel flag from the discretizer config telling whether a channel is categorical.
is_categorical = config['is_categorical_channel']

# Running statistics start out empty: no batch has been seen yet.
count = 0
sum_x = sum_sq_x = stds = None

In [73]:
# Channels that are not flagged as categorical, and their column positions in X.
continuous_channels = [channel for channel, categorical in is_categorical.items() if not categorical]
fields = [X.columns.get_loc(channel) for channel in continuous_channels]
fields

[2, 3, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58]

### Generating Parameters

In [43]:
# Mirror of Normalizer._feed_data: fold one batch into the running count,
# column sums and column sums of squares.
X_in = X.values
batch_sum = np.sum(X_in, axis=0)
batch_sum_sq = np.sum(X_in**2, axis=0)

count += X_in.shape[0]
if sum_x is not None:
    sum_x += batch_sum
    sum_sq_x += batch_sum_sq
else:
    sum_x, sum_sq_x = batch_sum, batch_sum_sq

In [45]:
# Mirror of Normalizer._save_params (without the pickling): turn the running
# sums into per-column means and N-1 standard deviations, floored at eps.
eps = 1e-7
N = count
means = 1.0 / N * sum_x
sum_sq_dev = sum_sq_x - 2.0 * sum_x * means + N * means**2
stds = np.sqrt(1.0/(N - 1) * sum_sq_dev)
too_small = stds < eps
stds[too_small] = eps

### Transform

In [65]:
# Mirror of Normalizer.transform on a DataFrame: work on a float copy and
# standardize only the continuous columns listed in `fields`.
cols = X.columns
ret = 1.0 * X
for index, column in zip(fields, cols[fields]):
    ret.loc[:, column] = (X[column] - means[index]) / stds[index]

In [85]:
# First rows of the manually normalized frame.
ret.head(n=5)

Unnamed: 0,Capillary refill rate,1,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,5,6,7,8,9,...,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
0,1.0,0.0,-0.376345,-1.835508,0.0,0.0,0.0,1.0,0.0,0.0,...,0.571194,4.179109,0.0,-0.250161,-1.469068,-0.115226,0.250368,-1.482274,-1.134256,1.001994
1,1.0,0.0,-0.793803,-1.835508,0.0,0.0,0.0,1.0,0.0,0.0,...,-0.261377,2.741595,0.0,-5.973833,-0.429856,-0.895219,-2.051064,2.845764,-1.134256,0.927111
2,1.0,0.0,-0.724227,-1.835508,0.0,0.0,0.0,1.0,0.0,0.0,...,-0.261377,2.136325,0.0,-0.910584,1.128962,-1.480213,-1.744206,2.845764,-1.134256,0.927111
3,1.0,0.0,-0.167616,-1.835508,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.261377,1.833691,0.0,0.043361,0.08975,-1.675211,-0.977062,1.409922,-1.134256,0.927111
4,1.0,0.0,-0.793803,-1.835508,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.261377,1.682373,0.0,-0.763823,0.08975,-0.505222,-2.434636,1.409922,-1.134256,0.927111


In [86]:
# First rows of the untouched input, for comparison with the normalized frame.
X.head(n=5)

Unnamed: 0,Capillary refill rate,1,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,5,6,7,8,9,...,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
0,1.0,0.0,57.0,0.21,0.0,0.0,0.0,1.0,0.0,0.0,...,128.0,149.0,170.0,70.0,94.0,24.0,115.0,36.6,52.3,7.4
1,1.0,0.0,51.0,0.21,0.0,0.0,0.0,1.0,0.0,0.0,...,113.0,130.0,170.0,-8.0,96.0,20.0,85.0,38.944444,52.3,7.31
2,1.0,0.0,52.0,0.21,0.0,0.0,0.0,1.0,0.0,0.0,...,113.0,122.0,170.0,61.0,99.0,17.0,89.0,38.944444,52.3,7.31
3,1.0,0.0,60.0,0.21,0.0,0.0,0.0,0.0,0.0,0.0,...,113.0,118.0,170.0,74.0,97.0,16.0,99.0,38.166667,52.3,7.31
4,1.0,0.0,51.0,0.21,0.0,0.0,0.0,0.0,0.0,0.0,...,113.0,116.0,170.0,63.0,97.0,22.0,80.0,38.166667,52.3,7.31


In [84]:
# Summarize each continuous channel: observed range plus the fitted mean / std.
stats = []
for column in continuous_channels:
    position = X.columns.get_loc(column)
    values = X[column]
    stats.append((values.min(), values.max(), means[position], stds[position], column))
pd.DataFrame(stats, columns=["min", "max", "mean", "std", "channel"]).set_index("channel")

Unnamed: 0_level_0,min,max,mean,std
channel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Diastolic blood pressure,48.0,122.0,62.409091,14.37269
Fraction inspired oxygen,0.21,0.4,0.356818,0.07998775
Glucose,91.0,161.0,117.709091,18.01648
Heart Rate,70.0,149.0,93.763636,13.21726
Height,170.0,170.0,170.0,1e-07
Mean blood pressure,-8.0,126.0,73.409091,13.62761
Oxygen saturation,90.0,100.0,96.827273,1.924535
Respiratory rate,15.0,40.0,24.590909,5.128255
Systolic blood pressure,80.0,139.0,111.736364,13.03536
Temperature,36.111111,38.944444,37.402929,0.5416876


### Load and Save Parameters

In [120]:
# The parameters are written to and read back from the same file.
save_file_path = Path("resources") / "normalizer_params.obj"
load_file_path = save_file_path

In [121]:
# Persist the fitted statistics with the same dict layout the Normalizer class uses.
normalizer_params = {'means': means, 'stds': stds}
with open(save_file_path, "wb") as save_file:
    pickle.dump(normalizer_params, save_file, protocol=2)

In [123]:
# Read the statistics back from the file written by the previous cell.
with open(load_file_path, "rb") as load_file:
    dct = pickle.load(load_file)
means, stds = dct['means'], dct['stds']

## Re-Implementation
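Notably, the normalizer as implemented by the Stanford team is essentially a re-implementation of the `StandardScaler` from sklearn. Calling `_feed_data` repeatedly and then `_save_params` corresponds to repeated calls to the `partial_fit` method of the `StandardScaler`. We will therefore simply make use of the sklearn utility. Two differences explain why the numbers below deviate slightly from the exploration above: the `StandardScaler` divides by N when computing the standard deviation, whereas the original divides by N - 1, and the scaler below is fitted on all columns, so the one-hot encoded categorical columns are rescaled as well, whereas the original only transforms the continuous `fields`.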

### normalizer._feed_data & normalizer._save_params

In [100]:
# Fresh scaler with the default settings (centering and scaling both enabled).
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [124]:
# Feed the frame in two chunks to show that partial_fit accumulates the
# statistics across calls.
scaler.partial_fit(X.iloc[:50]).partial_fit(X.iloc[50:])

StandardScaler()

In [104]:
# Per-column standard deviation, derived from the variances the scaler fitted.
np.sqrt(scaler.var_)

array([ 0.        ,  0.        , 14.3072108 ,  0.07962334,  0.        ,
        0.        ,  0.        ,  0.16287703,  0.        ,  0.31175324,
        0.34317429,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.13360853,  0.        ,  0.2082989 ,
        0.        ,  0.16287703,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.16287703,  0.        ,  0.16287703,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        , 17.93439836,
       13.15704255,  0.        , 13.56552814,  1.91576755,  5.10489151,
       12.97597589,  0.53921978,  1.01803733,  1.19640101])

In [105]:
# NOTE(review): this displays the square root of the fitted means, not the
# means themselves - presumably copied from the cell above; inspect
# `scaler.mean_` directly to compare against the means from the exploration.
np.sqrt(scaler.mean_)

array([ 1.        ,  0.        ,  7.89994246,  0.5973426 ,  0.        ,
        0.        ,  0.        ,  0.16514456,  0.        ,  0.33028913,
        0.92932038,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.13483997,  0.        ,  0.97700842,
        0.        ,  0.16514456,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.98626937,  0.        ,  0.16514456,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        , 10.84938205,
        9.68316252, 13.03840481,  8.56791053,  9.840085  ,  4.95892217,
       10.57054226,  6.11579343,  7.31163456,  2.48912179])

### normalizer.transform

In [108]:
# Label the transformed array with the input's own column names. The labels
# are taken from X rather than from `ret`, so this cell does not depend on the
# exploration section having been run.
pd.DataFrame(scaler.transform(X), columns=X.columns).head()

Unnamed: 0,Capillary refill rate,1,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,5,6,7,8,9,...,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
0,0.0,0.0,-0.378067,-1.843909,0.0,0.0,0.0,5.972158,0.0,-0.349927,...,0.573808,4.198236,0.0,-0.251305,-1.475791,-0.115754,0.251514,-1.489058,-1.139447,1.006579
1,0.0,0.0,-0.797436,-1.843909,0.0,0.0,0.0,5.972158,0.0,-0.349927,...,-0.262573,2.754142,0.0,-6.001174,-0.431823,-0.899316,-2.060451,2.858788,-1.139447,0.931354
2,0.0,0.0,-0.727542,-1.843909,0.0,0.0,0.0,5.972158,0.0,-0.349927,...,-0.262573,2.146103,0.0,-0.914752,1.134129,-1.486987,-1.752189,2.858788,-1.139447,0.931354
3,0.0,0.0,-0.168383,-1.843909,0.0,0.0,0.0,-0.167444,0.0,-0.349927,...,-0.262573,1.842083,0.0,0.04356,0.090161,-1.682878,-0.981534,1.416375,-1.139447,0.931354
4,0.0,0.0,-0.797436,-1.843909,0.0,0.0,0.0,-0.167444,0.0,-0.349927,...,-0.262573,1.690073,0.0,-0.767319,0.090161,-0.507535,-2.445779,1.416375,-1.139447,0.931354


### normalizer._save_params & normalizer.load_params

In [139]:
# The scaler parameters are written to and read back from the same file.
save_file_path = Path("resources") / "normalizer_params.obj"
load_file_path = save_file_path

In [140]:
mean = scaler.mean_
# `var_` holds the per-column variances; take the square root so that the
# value stored under 'std' really is a standard deviation.
std = np.sqrt(scaler.var_)

with open(save_file_path, "wb") as save_file:
    pickle.dump(obj={'mean': mean,
                     'std': std},
                file=save_file,
                protocol=2)

In [141]:
# The file is produced by this notebook; do not unpickle files from untrusted sources.
with open(load_file_path, "rb") as load_file:
    load_params = pickle.load(load_file)

# StandardScaler.transform reads `mean_` and `scale_`; an attribute called
# `std_` is never consulted, so the stored values have to go into `scale_`.
# Assumes 'std' holds standard deviations - verify against the cell that
# writes the file.
loaded_std = np.asarray(load_params['std'], dtype=float)

scaler_loaded = StandardScaler()
scaler_loaded.mean_ = np.asarray(load_params['mean'], dtype=float)
# Columns without any spread are divided by 1 instead of 0, which is how
# a fitted StandardScaler treats them as well.
scaler_loaded.scale_ = np.where(loaded_std == 0.0, 1.0, loaded_std)
scaler_loaded.n_features_in_ = scaler_loaded.mean_.shape[0]

In [144]:
# NOTE(review): this transforms with the originally fitted `scaler`, not with
# `scaler_loaded`, so it does not exercise the parameters read back from disk.
pd.DataFrame(scaler.transform(X), columns=ret.columns).head()

Unnamed: 0,Capillary refill rate,1,Diastolic blood pressure,Fraction inspired oxygen,Glascow coma scale eye opening,5,6,7,8,9,...,Glucose,Heart Rate,Height,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
0,0.0,0.0,-0.378067,-1.843909,0.0,0.0,0.0,5.972158,0.0,-0.349927,...,0.573808,4.198236,0.0,-0.251305,-1.475791,-0.115754,0.251514,-1.489058,-1.139447,1.006579
1,0.0,0.0,-0.797436,-1.843909,0.0,0.0,0.0,5.972158,0.0,-0.349927,...,-0.262573,2.754142,0.0,-6.001174,-0.431823,-0.899316,-2.060451,2.858788,-1.139447,0.931354
2,0.0,0.0,-0.727542,-1.843909,0.0,0.0,0.0,5.972158,0.0,-0.349927,...,-0.262573,2.146103,0.0,-0.914752,1.134129,-1.486987,-1.752189,2.858788,-1.139447,0.931354
3,0.0,0.0,-0.168383,-1.843909,0.0,0.0,0.0,-0.167444,0.0,-0.349927,...,-0.262573,1.842083,0.0,0.04356,0.090161,-1.682878,-0.981534,1.416375,-1.139447,0.931354
4,0.0,0.0,-0.797436,-1.843909,0.0,0.0,0.0,-0.167444,0.0,-0.349927,...,-0.262573,1.690073,0.0,-0.767319,0.090161,-0.507535,-2.445779,1.416375,-1.139447,0.931354
