### Normalize From Raw Data

In [26]:
"""This notebook recognizes and replaces the row that 'o:Creatinine' = -0.1
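from the mimic-sepsis output CSV file (this notebook reads it from 'raw_data_path',
i.e. 'sepsis_final_data_RAW_withTimes_dt{timestep}h.csv').
Left as-is, that value makes log(0.1 + x) evaluate log(0), leading to a final
'o:Creatinine' column that is entirely zero.
The notebook then z-score normalizes two columns ('o:Creatinine' and 'o:PaO2_FiO2')
and saves the result as a CSV file.
TODO: Change 'raw_data_path' and 'save_path'.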
"""

import numpy as np
import pandas as pd
from scipy import stats
from pathlib import Path

In [27]:
# Time-step width in hours; it selects both the data sub-directory and the file names.
timestep = 8

# TODO: point DATA_DIR at your local copy of the data. The input and output paths
# are both derived from it, so this is the only location that has to be edited.
DATA_DIR = Path(
    r'F:\time_step\OfflineRL_FactoredActions\RL_mimic_sepsis\data'
    rf'\data_asNormThreshold_dt{timestep}h'
)

# Input: raw (un-normalized) table. Output: same table with the two normalized columns.
raw_data_path = DATA_DIR / f'sepsis_final_data_RAW_withTimes_dt{timestep}h.csv'
save_path = DATA_DIR / f'sepsis_final_data_RAWNORM_withTimes_dt{timestep}h.csv'

# Names of the two columns this notebook normalizes.
COL_CREAT = 'o:Creatinine'
COL_PF_RATIO = 'o:PaO2_FiO2'

# Keep the file contents untouched in `raw_df`; the cells below modify the copy `df`.
raw_df = pd.read_csv(raw_data_path)
df = raw_df.copy(deep=True)

In [28]:
# Locate the rows whose creatinine value is exactly the -0.1 sentinel and report
# their index labels together with how many there are.
mask = df[COL_CREAT].eq(-0.1)
bad_rows = mask.index[mask].tolist()

print('Rows with o:Creatinine = -0.1', bad_rows)
print('Number of rows with o:Creatinine = -0.1:', len(bad_rows))

Rows with o:Creatinine = -0.1 []
Number of rows with o:Creatinine = -0.1: 0


In [None]:
# Nudge the -0.1 sentinel in 'o:Creatinine' up by 1e-12 so that the later
# log(0.1 + x) transform sees a tiny positive number instead of exactly 0.
# Read from `raw_df` (not `df`): `df[COL_CREAT]` is overwritten with z-scores
# further down, so reading it here would make a re-run of this cell operate on
# already-normalized values and silently miss the sentinel.
creat_raw = raw_df[COL_CREAT].astype(float)
creat_filled = creat_raw.mask(creat_raw == -0.1, -0.1 + 1e-12)

In [30]:
# Count the missing PaO2/FiO2 values before the median fill in the next cell.
df[COL_PF_RATIO].isnull().sum()

0

In [31]:
# Fill any missing PaO2/FiO2 entries with the column median, then re-count the
# remaining NaNs as the cell's displayed result.
pf_median = df[COL_PF_RATIO].median()
print(f'pf_median = {pf_median}')
pf_imputed = df[COL_PF_RATIO].fillna(value=pf_median)
df[COL_PF_RATIO] = pf_imputed

df[COL_PF_RATIO].isnull().sum()

pf_median = 274.10545791276274


0

In [32]:
# o:Creatinine: log(0.1 + x), then z-score (NaNs are ignored when computing mean/std).
# The log is stored under a new name instead of back into `creat_filled`, so
# re-running this cell does not apply the log a second time.
# NOTE(review): a row that held the -0.1 sentinel becomes log(~1e-12) ~= -27.6 here,
# an extreme value that feeds into the mean/std of the z-score -- confirm this is intended.
creat_log = np.log(0.1 + creat_filled)
df[COL_CREAT] = stats.zscore(creat_log, nan_policy='omit')

In [33]:
# o:PaO2_FiO2: plain z-score, with no log transform beforehand.
pf_raw = df[COL_PF_RATIO].to_numpy(dtype=float, copy=True)
pf_zscore = stats.zscore(pf_raw, nan_policy='omit')
df[COL_PF_RATIO] = pf_zscore

In [34]:
# Sanity check: list every column of `df` that contains at least one NaN.
nan_in_col = df.isnull().any(axis='index')
nan_columns = [name for name, has_nan in nan_in_col.items() if has_nan]
print('Columns with NaN values:', nan_columns)

Columns with NaN values: []


In [35]:
# Write the frame to `save_path`, leaving the row index out of the file.
df.to_csv(path_or_buf=save_path, index=False)