## Project: Development of a reduced pediatric injury prediction model
Created by: Thomas Hartka, MD, MS  
Date created: 5/17/21
  
This notebook imputes missing values in the pediatric crash data and writes the completed data set to a new CSV file.

In [1]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
import math

## Read in data

In [2]:
# load the unfiltered 2010-2018 pediatric case file into a data frame
source_csv = "../Data/Peds-2010_2018-unfiltered.csv"
peds = pd.read_csv(source_csv)

## Set variables

In [3]:
# candidate predictor columns, one per line
predictors = [
    'sex',
    'age_5_9',
    'age_10_14',
    'age_15_18',
    'prop_restraint',
    'any_restraint',
    'front_row',
    'dvtotal',
    'pdof_rear',
    'pdof_nearside',
    'pdof_farside',
    'rolled',
    'multicoll',
    'ejection',
    'splimit',
    'abdeply',
    'entrapment',
]

# outcome columns
responses = ['iss16', 'target_inj']

# cross-validation fold assignment columns (carried through, not imputed)
folds = ['fold5x', 'fold10x']

# every column handed to the imputer: predictors first, then responses
variables = [*predictors, *responses]

## Impute data using iterative multivariate imputation

In [4]:
# configure the iterative imputer (at most 10 rounds, fixed seed so reruns match)
imp = IterativeImputer(random_state=42, max_iter=10)

# fit the imputation models on the predictor and response columns
imp.fit(peds[variables])

IterativeImputer(random_state=42)

In [5]:
# run the fitted imputer over the modelling columns to fill in missing values
imp_data = imp.transform(peds[variables])

# wrap the result in a data frame, restoring the column labels and reusing the
# row index of peds so that later index-aligned column assignments from peds
# (fold and dataset columns) line up row for row
peds_imp = pd.DataFrame(imp_data, columns=variables, index=peds.index)

## Clean up imputed variables

In [6]:
# constrain imputed values: the imputer returns unbounded floats for every column,
# so each column is either clamped at zero or snapped back to 0/1
# NOTE(review): the saved run labels the response target_inj as continuous, so it is
# only clamped, not thresholded -- confirm the raw column really holds values
# other than 0/1
for var in variables:
    # a column is binary when every observed (non-missing) raw value is 0 or 1
    observed = peds[var].dropna()
    imputed = peds_imp[var]

    if not observed.isin([0, 1]).all():
        # continuous variables must be non-negative: replace anything below zero with 0
        peds_imp[var] = imputed.where(imputed >= 0, 0)
        print(var, " is continuous")
    else:
        # convert binary variables to 0/1 at 0.5 cut off
        peds_imp[var] = (imputed >= 0.5).astype('int64')
        print(var, " is binary")

sex  is binary
age_5_9  is binary
age_10_14  is binary
age_15_18  is binary
prop_restraint  is binary
any_restraint  is binary
front_row  is binary
dvtotal  is continuous
pdof_rear  is binary
pdof_nearside  is binary
pdof_farside  is binary
rolled  is binary
multicoll  is binary
ejection  is binary
splimit  is continuous
abdeply  is binary
entrapment  is binary
iss16  is binary
target_inj  is continuous


In [7]:
# summary statistics for every column of the cleaned, imputed frame
peds_imp.describe(include="all")

Unnamed: 0,sex,age_5_9,age_10_14,age_15_18,prop_restraint,any_restraint,front_row,dvtotal,pdof_rear,pdof_nearside,pdof_farside,rolled,multicoll,ejection,splimit,abdeply,entrapment,iss16,target_inj
count,28373.0,28373.0,28373.0,28373.0,28373.0,28373.0,28373.0,28373.0,28373.0,28373.0,28373.0,28373.0,28373.0,28373.0,28373.0,28373.0,28373.0,28373.0,28373.0
mean,0.481902,0.147958,0.155112,0.532126,0.546541,0.787016,0.528954,24.158504,0.064286,0.079829,0.089381,0.139781,0.356889,0.034399,67.232716,0.386494,0.042223,0.058753,0.061185
std,0.499681,0.355064,0.362018,0.498976,0.497838,0.409424,0.49917,10.982125,0.245267,0.271034,0.285298,0.346766,0.47909,0.182255,20.641583,0.486955,0.201102,0.235166,0.239673
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,1.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,1.0,1.0,1.0,21.898773,0.0,0.0,0.0,0.0,0.0,0.0,64.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,1.0,1.0,1.0,1.0,27.035569,0.0,0.0,0.0,0.0,1.0,0.0,80.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,160.0,1.0,1.0,1.0,1.0,1.0,1.0,121.0,1.0,1.0,1.0,1.0


## Output data

In [8]:
# add the additional columns needed downstream (fold assignments and data source),
# copied unchanged from the original frame
for col in [*folds, 'dataset']:
    peds_imp[col] = peds[col]

In [9]:
# preview the first ten rows of the final frame
peds_imp.head(n=10)

Unnamed: 0,sex,age_5_9,age_10_14,age_15_18,prop_restraint,any_restraint,front_row,dvtotal,pdof_rear,pdof_nearside,...,multicoll,ejection,splimit,abdeply,entrapment,iss16,target_inj,fold5x,fold10x,dataset
0,1,1,0,0,0,1,1,27.499378,0,0,...,1,0,72.0,1,0,0,0.0,1,1,NASS
1,0,0,0,1,1,1,1,19.028221,0,0,...,0,0,89.0,0,0,0,0.0,2,2,NASS
2,1,0,0,1,0,0,1,23.0,0,0,...,1,1,89.0,1,0,1,1.0,3,8,NASS
3,0,0,0,1,0,1,1,24.831082,0,0,...,1,0,72.0,0,0,0,0.0,1,6,NASS
4,1,1,0,0,0,1,0,20.0,1,0,...,1,0,80.0,0,0,0,0.0,4,9,NASS
5,1,0,0,1,1,1,1,19.0,0,0,...,0,0,72.0,1,0,0,0.0,2,2,NASS
6,1,0,0,1,1,1,1,20.0,1,0,...,0,0,72.0,0,0,0,0.0,4,9,NASS
7,0,0,0,1,1,1,1,29.0,0,0,...,1,0,72.0,1,0,0,0.0,4,9,NASS
8,1,0,1,0,1,1,1,21.0,1,0,...,1,0,72.0,0,0,0,0.0,2,2,NASS
9,0,1,0,0,0,1,1,22.618714,0,0,...,1,0,113.0,0,0,0,0.0,0,5,NASS


In [11]:
# write the imputed data set to disk without the row index
output_csv = "../Data/Peds-2010_2018-imputated.csv"
peds_imp.to_csv(output_csv, index=False)