## Project: Development of a reduced pediatric injury prediction model
Created by: Thomas Hartka, MD, MS  
Date created: 12/5/20  
  
This notebook determines overall characteristics of the patient population for Table 1.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
import itertools

## Read in Peds data

In [5]:
# load the occupant records from CSV; the path is relative to the notebook's working directory
peds = pd.read_csv("../Data/Peds-2010_2018-unfiltered.csv")

In [6]:
# columns that must all be populated for an occupant to be kept
missing_cols = ['sex','dvtotal','pdof_front','rolled','prop_restraint','any_restraint','splimit']

# drop every row lacking a value in any of those columns, then renumber the index
peds = peds.dropna(subset=missing_cols).reset_index(drop=True)

In [7]:
# split occupants on the iss16 flag (1 = severe injury, 0 = no severe injury);
#  a row whose flag is missing matches neither comparison and lands in neither group
peds_severe = peds.loc[peds["iss16"] == 1]
peds_nonsevere = peds.loc[peds["iss16"] == 0]

## Functions to describe data

In [8]:
def stat_continuous_median(df, variable, sigdig=2):
    '''
    Summarises a continuous column as "median (Q1-Q3)".
    Inputs:
        df - dataframe holding the column
        variable - name of the column to summarise
        sigdig - number of decimal places each statistic is rounded to
                  (handed straight to round())
    Output:
        Returns the formatted summary string
    '''
    column = df[variable]

    # 25th and 75th percentiles bound the interquartile range
    quartiles = column.quantile([0.25, 0.75])
    stats = (column.median(), quartiles[0.25], quartiles[0.75])

    # round every statistic the same way and render it as text
    mid, lower, upper = (str(round(value, sigdig)) for value in stats)

    return mid + " (" + lower + "-" + upper + ")"

In [9]:
def stat_continuous_mean(df, variable, sigdig=2):
    '''
    Summarises a continuous column as "mean (SD)".
    Inputs:
        df - dataframe holding the column
        variable - name of the column to summarise
        sigdig - number of decimal places each statistic is rounded to
                  (handed straight to round())
    Output:
        Returns the formatted summary string
    '''
    column = df[variable]

    # np.std is called with its defaults, so ddof is 0: this is the population
    #  standard deviation, not the sample (ddof=1) value
    center = round(column.mean(), sigdig)
    spread = round(np.std(column), sigdig)

    return str(center) + " (" + str(spread) + ")"

In [10]:
def stat_binary(df, variable, sigdig=2):
    '''
    Summarises a binary column as "count (percent%)", where count is the
     number of entries equal to 1 and percent is that count relative to the
     number of non-missing entries.
    Inputs:
        df - dataframe holding the column
        variable - name of the column to summarise
        sigdig - number of decimal places the percentage is rounded to
                  (handed straight to round())
    Output:
        Returns the formatted summary string
    '''
    column = df[variable]

    # positives are entries equal to 1; the denominator excludes missing entries
    n_positive = (column == 1).sum()
    n_observed = column.count()
    percent = n_positive / n_observed * 100

    # assemble "count (percent%)"
    pieces = [str(round(n_positive, sigdig)), " (", str(round(percent, sigdig)), "%)"]

    return "".join(pieces)

In [11]:
def analyze_col(variable, sigdig=2):
    '''
    Creates text results for a column of data.  This determines if the data
     is binary or continuous, then gets the results for all patients and 
     those with and without severe injury.  Binary columns are reported as
     count (percent), continuous columns as median (IQR).
    Reads the module-level dataframes peds, peds_nonsevere and peds_severe.
    Inputs:
        variable - column to analyze
        sigdig - number of decimal places used when rounding; applied to
                  all three groups
    Output:
        Returns list of strings with results [variable, total, non-severe, severe]
    '''
    # a column counts as binary when every non-missing value is 0 or 1; missing
    #  values are dropped first because NaN is never "in (0, 1)" and would
    #  otherwise push a 0/1 column with gaps into the continuous branch
    observed = peds[variable].dropna().unique()
    is_binary = all(value in (0, 1) for value in observed)
    summarize = stat_binary if is_binary else stat_continuous_median

    # same statistic and same rounding for every group (the total column
    #  previously ignored sigdig for continuous variables)
    groups = (peds, peds_nonsevere, peds_severe)

    return [variable] + [summarize(group, variable, sigdig) for group in groups]

## Describe occupants - CISS + NASS

In [12]:
# variables to describe; the table rows follow this order
variables = [
    'age', 'age_0_4', 'age_5_9', 'age_10_14', 'age_15_18',
    'sex',
    'front_row', 'any_restraint', 'prop_restraint',
    'dvtotal',
    'pdof_front', 'pdof_rear', 'pdof_nearside', 'pdof_farside',
    'rolled', 'multicoll', 'ejection',
]

In [13]:
# rows of results, one per variable
table_rows = []

# loop through all variables
for var in variables:
    
    # get results for particular variable (rounded to 1 decimal place)
    result = analyze_col(var,1)
    
    # keep the row; the dataframe is built once after the loop
    table_rows.append(result)

# build the table in a single step - DataFrame.append was removed in pandas 2.0
#  and growing a dataframe row by row copies it on every iteration
table = pd.DataFrame(table_rows, columns=['Variable','Total','Nonsevere','Severe'])

# display table
table.set_index('Variable')

Unnamed: 0_level_0,Total,Nonsevere,Severe
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
age,15.0 (7.0-17.0),15.0 (7.0-17.0),16.0 (12.0-17.0)
age_0_4,2376 (17.4%),2291 (17.6%),85 (13.2%)
age_5_9,2152 (15.8%),2106 (16.2%),46 (7.1%)
age_10_14,2178 (16.0%),2090 (16.1%),88 (13.7%)
age_15_18,6947 (50.9%),6522 (50.1%),425 (66.0%)
sex,6783 (49.7%),6500 (50.0%),283 (43.9%)
front_row,6992 (51.2%),6582 (50.6%),410 (63.7%)
any_restraint,10618 (77.8%),10284 (79.1%),334 (51.9%)
prop_restraint,7356 (53.9%),7115 (54.7%),241 (37.4%)
dvtotal,22.0 (15.0-31.0),21.0 (15.0-29.0),43.0 (32.0-56.0)


## Describe occupants - NASS only 

In [16]:
# keep only rows from the NASS dataset; analyze_col reads the module-level
#  names peds / peds_severe / peds_nonsevere, so they are rebound here and
#  every later cell sees the NASS-only data
peds = peds.loc[peds["dataset"] == "NASS"]

# for this table the groups are split on target_inj rather than iss16
peds_severe = peds.loc[peds["target_inj"] == 1]
peds_nonsevere = peds.loc[peds["target_inj"] == 0]

In [17]:
# variables to describe (same list as used for the CISS + NASS table)
variables = [
    'age', 'age_0_4', 'age_5_9', 'age_10_14', 'age_15_18',
    'sex',
    'front_row', 'any_restraint', 'prop_restraint',
    'dvtotal',
    'pdof_front', 'pdof_rear', 'pdof_nearside', 'pdof_farside',
    'rolled', 'multicoll', 'ejection',
]

In [18]:
# rows of results, one per variable
table2_rows = []

# loop through all variables
for var in variables:
    
    # get results for particular variable (rounded to 1 decimal place)
    result = analyze_col(var,1)
    
    # keep the row; the dataframe is built once after the loop
    table2_rows.append(result)

# build the table in a single step - DataFrame.append was removed in pandas 2.0
#  and growing a dataframe row by row copies it on every iteration
table2 = pd.DataFrame(table2_rows, columns=['Variable','Total','Nonsevere','Severe'])

# display table with the column headings used for the NASS-only analysis
table2.set_index('Variable').rename(columns={'Total':'NASS','Severe':'TIL'})

Unnamed: 0_level_0,NASS,Nonsevere,TIL
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
age,15.0 (7.0-17.0),15.0 (7.0-17.0),16.0 (11.0-17.0)
age_0_4,2149 (17.2%),2061 (17.5%),88 (12.9%)
age_5_9,1954 (15.7%),1889 (16.0%),65 (9.6%)
age_10_14,1971 (15.8%),1857 (15.7%),114 (16.8%)
age_15_18,6411 (51.3%),5998 (50.8%),413 (60.7%)
sex,6162 (49.4%),5858 (49.6%),304 (44.7%)
front_row,6425 (51.5%),6027 (51.1%),398 (58.5%)
any_restraint,9643 (77.2%),9302 (78.8%),341 (50.1%)
prop_restraint,6606 (52.9%),6360 (53.9%),246 (36.2%)
dvtotal,22.0 (16.0-31.0),21.0 (15.0-29.0),40.0 (29.0-53.0)


In [20]:
# number of rows in peds, which at this point holds only the NASS cases
len(peds)

12485