## Project: Development of a reduced pediatric injury prediction model
Created by: Thomas Hartka, MD, MS  
Date created: 12/5/20  
  
This notebook determines overall characteristics of the patient population for Table 1.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
import itertools

## Read in Peds data

In [2]:
# combined pediatric occupant file; path is relative to the notebook directory
PEDS_CSV_PATH = "../Data/Peds-2010_2018.csv"
peds = pd.read_csv(PEDS_CSV_PATH)

In [3]:
# split occupants on the iss16 flag: 1 = severe injury, 0 = no severe injury
peds_severe = peds.loc[peds["iss16"] == 1]
peds_nonsevere = peds.loc[peds["iss16"] == 0]

## Functions to describe data

In [4]:
def stat_continuous_median(df, variable, sigdig=2):
    '''
    Summarizes a continuous column as "median (Q1-Q3)".
    Inputs:
        df - dataframe holding the column
        variable - name of the column to summarize
        sigdig - passed to round(), i.e. number of decimal places kept
    Output:
        Returns the formatted text
    '''
    column = df[variable]

    # median and the two quartiles, each rounded before formatting
    center = round(column.median(), sigdig)
    lower = round(column.quantile(0.25), sigdig)
    upper = round(column.quantile(0.75), sigdig)

    return "%s (%s-%s)" % (center, lower, upper)

In [5]:
def stat_continuous_mean(df, variable, sigdig=2):
    '''
    Summarizes a continuous column as "mean (SD)".
    Inputs:
        df - dataframe holding the column
        variable - name of the column to summarize
        sigdig - passed to round(), i.e. number of decimal places kept
    Output:
        Returns the formatted text
    '''
    column = df[variable]

    avg = round(column.mean(), sigdig)
    # np.std defaults to ddof=0 (population SD), unlike Series.std() whose default is ddof=1
    spread = round(np.std(column), sigdig)

    return "%s (%s)" % (avg, spread)

In [6]:
def stat_binary(df, variable, sigdig=2):
    '''
    Creates text with the count and percentage of positive (==1) values
     for binary variables.
    Inputs:
        df - dataframe with data
        variable - column to analyze
        sigdig - number of decimal places for the percentage
    Output:
        Returns text with results, formatted as "count (percent%)"
    '''
    # extract data
    data = df[variable]
    
    # calc stats; count() skips missing values, so the denominator is the
    #  number of non-missing rows rather than the number of rows
    pos_count = data[data==1].count()
    total = data.count()
    # no non-missing rows -> percentage is undefined (avoid a 0/0 warning)
    pos_perc = pos_count / total if total else float('nan')
    
    # convert to text (the count is an integer, so it is not rounded)
    text = str(pos_count) +" (" + str(round(pos_perc*100, sigdig)) + "%)"
    
    return text

In [7]:
def analyze_col(variable, sigdig=2):
    '''
    Creates text results for a column of data.  This determines if the data
     is binary or continuous, then gets the results for all patients and 
     those with and without severe injury.
    Reads the notebook-level dataframes peds, peds_nonsevere and peds_severe.
    Inputs:
        variable - column to analyze
        sigdig - number of decimal places, applied to all three result columns
    Output:
        Returns list of strings with results [variable, total, non-severe, severe]
    '''
    # a column is binary when every distinct value is 0 or 1; a column that
    #  contains missing values fails this check and is summarized as continuous
    is_binary = all([i in [0,1] for i in peds[variable].unique()])
    describe = stat_binary if is_binary else stat_continuous_median
    
    # same statistic and same precision for every cohort (the total column
    #  previously ignored sigdig for continuous variables)
    cohorts = (peds, peds_nonsevere, peds_severe)
    results = [variable] + [describe(cohort, variable, sigdig) for cohort in cohorts]
        
    return results

## Describe occupants - CISS + NASS

In [8]:
# columns summarized in the table, grouped by topic
variables = [
    # age (continuous, then age-band indicators) and sex
    'age', 'age_0_4', 'age_5_9', 'age_10_14', 'age_15_18', 'sex',
    # seating position and restraint use
    'front_row', 'no_restraint', 'prop_restraint',
    # delta-v and principal direction of force
    'dvtotal', 'pdof_front', 'pdof_rear', 'pdof_nearside', 'pdof_farside',
    # other crash characteristics
    'rolled', 'multicoll', 'ejection',
]

In [9]:
# one row of results per variable: [variable, total, non-severe, severe]
rows = [analyze_col(var, 1) for var in variables]

# build the table in one step; DataFrame.append was removed in pandas 2.0
#  and re-copies the whole frame on every call when used inside a loop
table = pd.DataFrame(rows, columns=['Variable','Total','Nonsevere','Severe'])

# print table
table.set_index('Variable')

Unnamed: 0_level_0,Total,Nonsevere,Severe
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
age,15.0 (7.0-17.0),15.0 (7.0-17.0),16.0 (12.0-17.0)
age_0_4,2282 (16.8%),2202 (17.0%),80 (12.7%)
age_5_9,2137 (15.8%),2091 (16.2%),46 (7.3%)
age_10_14,2181 (16.1%),2093 (16.2%),88 (14.0%)
age_15_18,6960 (51.3%),6545 (50.6%),415 (66.0%)
sex,6740 (49.7%),6458 (49.9%),282 (44.8%)
front_row,7007 (51.7%),6607 (51.1%),400 (63.6%)
no_restraint,2975 (21.9%),2677 (20.7%),298 (47.4%)
prop_restraint,7393 (54.5%),7150 (55.3%),243 (38.6%)
dvtotal,22.0 (15.0-31.0),21.0 (15.0-29.0),43.0 (32.0-56.0)


## Describe occupants - NASS only 

In [13]:
# restrict to NASS cases and re-split on target_inj; the three names are
#  rebound rather than renamed because analyze_col reads peds, peds_nonsevere
#  and peds_severe directly (cells above this point now see the NASS subset)
peds = peds.loc[peds["dataset"] == 'NASS']
peds_severe = peds.loc[peds["target_inj"] == 1]
peds_nonsevere = peds.loc[peds["target_inj"] == 0]

In [14]:
# columns summarized in the NASS-only table (same list as the CISS + NASS table)
variables = [
    # age (continuous, then age-band indicators) and sex
    'age', 'age_0_4', 'age_5_9', 'age_10_14', 'age_15_18', 'sex',
    # seating position and restraint use
    'front_row', 'no_restraint', 'prop_restraint',
    # delta-v and principal direction of force
    'dvtotal', 'pdof_front', 'pdof_rear', 'pdof_nearside', 'pdof_farside',
    # other crash characteristics
    'rolled', 'multicoll', 'ejection',
]

In [16]:
# one row of results per variable: [variable, total, non-severe, severe]
rows = [analyze_col(var, 1) for var in variables]

# build the table in one step; DataFrame.append was removed in pandas 2.0
#  and re-copies the whole frame on every call when used inside a loop
table2 = pd.DataFrame(rows, columns=['Variable','Total','Nonsevere','Severe'])

# print table, relabelling the total and severe columns for the NASS-only view
table2.set_index('Variable').rename(columns={'Total':'NASS','Severe':'TIL'})

Unnamed: 0_level_0,NASS,Nonsevere,TIL
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
age,15.0 (7.0-17.0),15.0 (7.0-17.0),16.0 (11.0-17.0)
age_0_4,2110 (16.8%),2025 (17.1%),85 (12.5%)
age_5_9,1962 (15.7%),1897 (16.0%),65 (9.5%)
age_10_14,1988 (15.9%),1874 (15.8%),114 (16.7%)
age_15_18,6468 (51.6%),6051 (51.1%),417 (61.2%)
sex,6179 (49.3%),5873 (49.6%),306 (44.9%)
front_row,6476 (51.7%),6074 (51.3%),402 (59.0%)
no_restraint,2866 (22.9%),2525 (21.3%),341 (50.1%)
prop_restraint,6657 (53.1%),6409 (54.1%),248 (36.4%)
dvtotal,22.0 (16.0-31.0),21.0 (15.0-29.0),40.0 (29.0-53.0)


In [18]:
# number of rows in the severe (target_inj == 1) NASS subset
peds_severe.shape[0]

681