# Purpose - prepare for machine learning
* drop columns with too many missing values (more than 10% missing among AKI-positive training rows)
* impute the remaining values
* one-hot encode categorical features

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
# Read the train / validation / test CSVs into one DataFrame each.
split_files = (
    '../data/final/train_spliton_pat.csv',
    '../data/final/val_spliton_pat.csv',
    '../data/final/test_spliton_pat.csv',
)
train_df, val_df, test_df = (pd.read_csv(csv_path) for csv_path in split_files)

In [18]:
# Per-column fraction of missing values, measured only on training rows with aki_label == 1.
train_pos = train_df[train_df['aki_label'] == 1]
pos_null_value_fract = train_pos.isnull().sum() / len(train_pos)

# Columns missing in more than 10% of those rows are selected for removal.
to_drop = pos_null_value_fract[pos_null_value_fract > 0.1]
to_drop

dischargeweight                 0.378615
min_result_FiO2                 0.274378
min_result_HCO3                 0.299933
min_result_MCH                  0.151311
min_result_MPV                  0.442502
min_result_RDW                  0.137861
min_result_anion gap            0.212508
min_result_bedside glucose      0.229321
min_result_magnesium            0.209818
min_result_pH                   0.288500
min_result_paCO2                0.300605
min_result_paO2                 0.287828
max_result_FiO2                 0.274378
max_result_HCO3                 0.299933
max_result_MCH                  0.151311
max_result_MPV                  0.442502
max_result_RDW                  0.137861
max_result_anion gap            0.212508
max_result_bedside glucose      0.229321
max_result_magnesium            0.209818
max_result_pH                   0.288500
max_result_paCO2                0.300605
max_result_paO2                 0.287828
delta_result_FiO2               0.274378
delta_result_HCO

In [19]:
# Remove the sparse columns held in to_drop, plus the 'Unnamed: 0' column, from every split.
cols_to_drop = list(to_drop.index) + ['Unnamed: 0']
train_df = train_df.drop(columns=cols_to_drop)
val_df = val_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=cols_to_drop)

In [20]:
# Sanity check: after the drop, does any validation column still exceed 10% missing
# among rows with aki_label == 1?
val_pos = val_df[val_df['aki_label'] == 1]
pos_null_value_fract_val = val_pos.isnull().sum() / len(val_pos)
pos_null_value_fract_val[pos_null_value_fract_val > 0.1]

Series([], dtype: float64)

In [22]:
# Same sanity check on the test split: columns still above 10% missing
# among rows with aki_label == 1.
test_pos = test_df[test_df['aki_label'] == 1]
pos_null_value_fract_test = test_pos.isnull().sum() / len(test_pos)
pos_null_value_fract_test[pos_null_value_fract_test > 0.1]

Series([], dtype: float64)

In [24]:
# Categorical columns have to be filled in and one-hot encoded before imputation;
# list the columns that are left to decide how each one is handled.
list(train_df.columns)

['patienthealthsystemstayid',
 'patientunitstayid',
 'gender',
 'age',
 'ethnicity',
 'unittype',
 'unitadmittime24',
 'unitadmitsource',
 'unitvisitnumber',
 'unitstaytype',
 'admissionweight',
 'unitdischargetime24',
 'unitdischargeoffset',
 'unitdischargestatus',
 'aki_label',
 'aki_offset',
 'organ_system',
 'from_OR',
 'min_sao2',
 'max_sao2',
 'mean_sao2',
 'min_heartrate',
 'max_heartrate',
 'mean_heartrate',
 'min_respiration',
 'max_respiration',
 'mean_respiration',
 'days_of_data',
 'min_result_BUN',
 'min_result_Hct',
 'min_result_Hgb',
 'min_result_MCHC',
 'min_result_MCV',
 'min_result_RBC',
 'min_result_WBC x 1000',
 'min_result_bicarbonate',
 'min_result_calcium',
 'min_result_chloride',
 'min_result_creatinine',
 'min_result_glucose',
 'min_result_platelets x 1000',
 'min_result_potassium',
 'min_result_sodium',
 'max_result_BUN',
 'max_result_Hct',
 'max_result_Hgb',
 'max_result_MCHC',
 'max_result_MCV',
 'max_result_RBC',
 'max_result_WBC x 1000',
 'max_result_bicar

In [26]:
# Additional columns removed from all three splits before encoding and imputation.
more_cols_to_drop = [
    'unitadmittime24',
    'unitadmitsource',
    'unitvisitnumber',
    'unitstaytype',
    'admissionweight',
    'unitdischargetime24',
    'unitdischargeoffset',
]
train_df = train_df.drop(columns=more_cols_to_drop)
val_df = val_df.drop(columns=more_cols_to_drop)
test_df = test_df.drop(columns=more_cols_to_drop)

# Columns treated as categorical: they are checked for missing values and later
# passed to pd.get_dummies once those missing values have been dealt with.
# NOTE(review): 'age' is listed here but also appears in numerical_cols — confirm
# whether it is really meant to be one-hot encoded.
cat_cols = [
    'gender', 'age', 'ethnicity',
    'unittype', 'organ_system', 'from_OR',
]

In [30]:
# Missing-value count per categorical column among training rows with aki_label == 1.
train_df.loc[train_df['aki_label'] == 1, cat_cols].isnull().sum()

gender           0
age              0
ethnicity       12
unittype         0
organ_system     0
from_OR          0
dtype: int64

In [34]:
# Keep only rows whose gender is recorded as 'Male' or 'Female'.
train_df = train_df.loc[train_df.gender.isin(['Male', 'Female'])]

# Rows with aki_label == 1 and no ethnicity are kept and labelled 'Other/Unknown' ...
missing_ethnicity_pos = train_df.ethnicity.isnull() & (train_df.aki_label == 1)
train_df.loc[missing_ethnicity_pos, 'ethnicity'] = 'Other/Unknown'

# ... every other row that still lacks an ethnicity is dropped.
train_df = train_df[train_df.ethnicity.notnull()]
train_df[cat_cols].isnull().sum()

gender          0
age             0
ethnicity       0
unittype        0
organ_system    0
from_OR         0
dtype: int64

In [35]:
# Apply to the validation split the same categorical clean-up used on the training split.
# .copy() makes the filtered frame independent so the .loc assignment below
# does not raise SettingWithCopyWarning.
val_df = val_df.loc[(val_df.gender == 'Male') | (val_df.gender == 'Female')].copy()

# Rows with aki_label == 1 and no ethnicity are kept and labelled 'Other/Unknown'.
val_df.loc[(val_df.ethnicity.isnull()) & (val_df.aki_label == 1), 'ethnicity'] = 'Other/Unknown'

# Drop the remaining rows without ethnicity. This must filter val_df itself:
# the previous version assigned from train_df here, which silently replaced the
# validation split with the training data.
val_df = val_df[~val_df.ethnicity.isnull()]
val_df[cat_cols].isnull().sum()

gender          0
age             0
ethnicity       0
unittype        0
organ_system    0
from_OR         0
dtype: int64

In [37]:
# Apply to the test split the same categorical clean-up used on the training split.
# Both halves of the gender mask must come from test_df; the previous version
# compared val_df.gender for 'Male', i.e. a mask built from a different frame's rows.
# .copy() makes the filtered frame independent so the .loc assignment below
# does not raise SettingWithCopyWarning.
test_df = test_df.loc[(test_df.gender == 'Male') | (test_df.gender == 'Female')].copy()

# Rows with aki_label == 1 and no ethnicity are kept and labelled 'Other/Unknown'.
test_df.loc[(test_df.ethnicity.isnull()) & (test_df.aki_label == 1), 'ethnicity'] = 'Other/Unknown'

# Drop the remaining rows without ethnicity.
test_df = test_df[~test_df.ethnicity.isnull()]
test_df[cat_cols].isnull().sum()

gender          0
age             0
ethnicity       0
unittype        0
organ_system    0
from_OR         0
dtype: int64

In [39]:
# One-hot encode the categorical columns of the training split.
train_cat = pd.get_dummies(train_df, prefix_sep='_', columns=cat_cols)

# pd.get_dummies only creates indicator columns for the levels present in the
# frame it is given, so a level missing from val/test would leave that column
# out and break any later selection by column name. Align both splits to the
# training columns: indicators absent from a split are added as all-zero, and
# indicators for levels never seen in training are discarded.
val_cat = (
    pd.get_dummies(val_df, prefix_sep='_', columns=cat_cols)
    .reindex(columns=train_cat.columns, fill_value=0)
)
test_cat = (
    pd.get_dummies(test_df, prefix_sep='_', columns=cat_cols)
    .reindex(columns=train_cat.columns, fill_value=0)
)

In [41]:
# Indicator (one-hot) columns carried into the final tables, grouped by source column.
# NOTE(review): 'unitdischargestatus' is not in cat_cols, so 'unitdischargestatus_alive'
# is presumably already present in the input CSVs — confirm.
categorical_encoded = (
    ['unitdischargestatus_alive']
    + ['gender_Female', 'gender_Male']
    + [
        'ethnicity_African American',
        'ethnicity_Asian',
        'ethnicity_Caucasian',
        'ethnicity_Hispanic',
        'ethnicity_Native American',
        'ethnicity_Other/Unknown',
    ]
    + [
        'unittype_CCU-CTICU',
        'unittype_CSICU',
        'unittype_CTICU',
        'unittype_Cardiac ICU',
        'unittype_MICU',
        'unittype_Med-Surg ICU',
        'unittype_Neuro ICU',
        'unittype_SICU',
    ]
    + [
        'organ_system_Cardiovascular',
        'organ_system_Gastrointestinal',
        'organ_system_Genitourinary',
        'organ_system_Hematology',
        'organ_system_Metabolic/Endocrine',
        'organ_system_Musculoskeletal/Skin',
        'organ_system_Neurologic',
        'organ_system_Respiratory',
        'organ_system_Transplant',
        'organ_system_Trauma',
    ]
    + ['from_OR_No', 'from_OR_Yes']
)

In [43]:
# Numeric feature columns: age, three summary statistics per vital sign, and
# min / max / delta for each lab result. The order matters because the imputed
# array is later turned back into a DataFrame using this list as column names.
vital_signs = ('sao2', 'heartrate', 'respiration')
vital_stats = ('min', 'max', 'mean')
lab_tests = (
    'BUN', 'Hct', 'Hgb', 'MCHC', 'MCV', 'RBC', 'WBC x 1000', 'bicarbonate',
    'calcium', 'chloride', 'creatinine', 'glucose', 'platelets x 1000',
    'potassium', 'sodium',
)
lab_stats = ('min', 'max', 'delta')

numerical_cols = (
    ['age']
    # vitals: all statistics of one vital sign are adjacent (min, max, mean)
    + [stat + '_' + vital for vital in vital_signs for stat in vital_stats]
    # labs: all labs for one statistic are adjacent (all min_*, then max_*, then delta_*)
    + [stat + '_result_' + lab for stat in lab_stats for lab in lab_tests]
)

In [48]:
# Fraction of missing values per numeric feature among training rows with aki_label == 1.
train_pos = train_df[train_df.aki_label == 1]
pos_num_null = train_pos[numerical_cols].isnull().sum() / len(train_pos)
pos_num_null

age                              0.000000
min_sao2                         0.015467
max_sao2                         0.015467
mean_sao2                        0.015467
min_heartrate                    0.012105
max_heartrate                    0.012105
mean_heartrate                   0.012105
min_respiration                  0.080699
max_respiration                  0.080699
mean_respiration                 0.080699
min_result_BUN                   0.049765
min_result_Hct                   0.063887
min_result_Hgb                   0.064560
min_result_MCHC                  0.091459
min_result_MCV                   0.091459
min_result_RBC                   0.078009
min_result_WBC x 1000            0.074647
min_result_bicarbonate           0.068594
min_result_calcium               0.059180
min_result_chloride              0.053127
min_result_creatinine            0.048420
min_result_glucose               0.051782
min_result_platelets x 1000      0.072629
min_result_potassium             0

In [52]:
# Numeric features missing in more than 10% of validation rows with aki_label == 1.
val_pos = val_df[val_df.aki_label == 1]
pos_num_null = val_pos[numerical_cols].isnull().sum() / len(val_pos)
pos_num_null[pos_num_null > 0.1]

Series([], dtype: float64)

In [51]:
# Numeric features missing in more than 10% of test rows with aki_label == 1.
test_pos = test_df[test_df.aki_label == 1]
pos_num_null = test_pos[numerical_cols].isnull().sum() / len(test_pos)
pos_num_null[pos_num_null > 0.1]

Series([], dtype: float64)

In [54]:
# Numeric features missing in more than 10% of training rows with aki_label == 0.
# Note: the variable keeps the name pos_num_null, but here it describes the
# negative class.
train_neg = train_df[train_df.aki_label == 0]
pos_num_null = train_neg[numerical_cols].isnull().sum() / len(train_neg)
pos_num_null[pos_num_null > 0.1]

min_result_MCHC             0.114348
min_result_MCV              0.114259
min_result_bicarbonate      0.127387
max_result_MCHC             0.114348
max_result_MCV              0.114259
max_result_bicarbonate      0.127387
delta_result_MCHC           0.114348
delta_result_MCV            0.114259
delta_result_bicarbonate    0.127387
dtype: float64

In [55]:
# TODO (next iteration): find a reasonable way to drop missing values for the
# negative class. For now every remaining gap in the numeric features is imputed.
from sklearn.experimental import enable_iterative_imputer  # required before importing IterativeImputer
from sklearn.impute import IterativeImputer

# Fit the imputer on the training split's numeric features only, then fill the
# training split with it. random_state is fixed for reproducibility.
imp_mean = IterativeImputer(random_state=2020)
train_numeric = train_df[numerical_cols]
imp_mean.fit(train_numeric)
train_imp = imp_mean.transform(train_numeric)



In [56]:
# Inspect the imputed training values returned by the imputer.
train_imp

array([[ 77.        ,  93.8718675 ,  98.82602775, ...,  33.5182867 ,
          0.5990682 ,   3.50564428],
       [ 63.        ,  94.        ,  98.        , ...,   0.        ,
          0.        ,   0.        ],
       [ 48.        ,  96.        ,  99.        , ...,   3.        ,
          0.59892947,   3.84896303],
       ...,
       [ 85.        ,  98.        , 100.        , ..., 208.        ,
          1.5       ,   6.        ],
       [ 51.        ,  95.        , 100.        , ...,  18.        ,
          1.        ,   2.        ],
       [ 50.        ,  94.        , 100.        , ..., 160.        ,
          0.7       ,   3.        ]])

In [59]:
# Fill validation and test with the imputer that was fitted on the training
# split (transform only — the imputer is not refitted on these splits).
val_numeric = val_df[numerical_cols]
test_numeric = test_df[numerical_cols]
val_imp = imp_mean.transform(val_numeric)
test_imp = imp_mean.transform(test_numeric)

In [61]:
# True if any NaN is left in the imputed test array.
np.any(np.isnan(test_imp))

False

In [76]:
# Assemble the final training table side by side: imputed numeric features,
# one-hot indicator columns, then the stay id and the label. The pieces are
# matched by position, so each one is given a fresh 0..n-1 index first
# (the frame built from the imputed array already has one).
train_parts = [
    pd.DataFrame(train_imp, columns=numerical_cols),
    train_cat[categorical_encoded].reset_index(drop=True),
    train_df[['patientunitstayid', 'aki_label']].reset_index(drop=True),
]
train_full = pd.concat(train_parts, axis=1)

In [75]:
# Total number of missing cells in the assembled training table.
train_full.isna().sum().sum()

0

In [77]:
# Assemble the final validation table side by side: imputed numeric features,
# one-hot indicator columns, then the stay id and the label. The pieces are
# matched by position, so each one is given a fresh 0..n-1 index first.
val_parts = [
    pd.DataFrame(val_imp, columns=numerical_cols),
    val_cat[categorical_encoded].reset_index(drop=True),
    val_df[['patientunitstayid', 'aki_label']].reset_index(drop=True),
]
val_full = pd.concat(val_parts, axis=1)

# Total number of missing cells in the assembled table.
val_full.isnull().sum().sum()

0

In [78]:
# Assemble the final test table side by side: imputed numeric features,
# one-hot indicator columns, then the stay id and the label. The pieces are
# matched by position, so each one is given a fresh 0..n-1 index first.
test_parts = [
    pd.DataFrame(test_imp, columns=numerical_cols),
    test_cat[categorical_encoded].reset_index(drop=True),
    test_df[['patientunitstayid', 'aki_label']].reset_index(drop=True),
]
test_full = pd.concat(test_parts, axis=1)

# Total number of missing cells in the assembled table.
test_full.isnull().sum().sum()

0

In [None]:
# Save the model-ready training table.
# The previous target was '../', which is a directory, so the write could not succeed.
# NOTE(review): the file name below is a guess placed in the directory the input
# splits are read from — confirm the intended name. val_full and test_full
# presumably need saving the same way.
# index=False keeps the row index out of the file, so reloading it does not
# produce an extra unnamed index column.
train_full.to_csv('../data/final/train_full.csv', index=False)