## Project: Development of a reduced pediatric injury prediction model
Created by: Thomas Hartka, MD, MS  
Date created: 12/14/20  
  
This notebook performs ten-fold cross-validation of a logistic regression model for every combination of the candidate predictors of severe injury in pediatric patients.  The AUC and the fitted coefficients are recorded for each fold of each combination.

In [17]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import scipy.stats as st
import matplotlib.pyplot as plt
from itertools import combinations

## Set outcome

In [30]:
# outcome of interest -- must be one of the two codes below:
#  ISS -> ISS>=16
#  TIL -> any injury on target injury list
# This value selects the response column, decides whether the data are
# restricted to NASS cases, and is embedded in the results file name.
outcome = "TIL"

## Read in data

In [31]:
# load the pediatric case data (file name suggests years 2010-2018 -- TODO confirm);
# the path is relative to the notebook's working directory
peds = pd.read_csv("../Data/Peds-2010_2018.csv")

In [32]:
# The target injury list (TIL) is only valid for AIS98 coding, so when TIL is
# the outcome keep just the NASS rows (this drops the CISS cases).
if outcome == "TIL":
    is_nass = peds['dataset'] == 'NASS'
    peds = peds.loc[is_nass]

## Set variables

In [25]:
# candidate predictors; the order here fixes the order in which
# combinations are generated later on
predictors = [
    'sex',
    'age_5_9', 'age_10_14', 'age_15_18',
    'prop_restraint', 'no_restraint', 'front_row',
    'dvtotal',
    'pdof_rear', 'pdof_nearside', 'pdof_farside',
    'rolled', 'multicoll', 'ejection',
]

# map the outcome code onto the column that holds the response
response_by_outcome = {"ISS": 'iss16', "TIL": 'target_inj'}
if outcome not in response_by_outcome:
    raise Exception("Outcome not valid")
response = response_by_outcome[outcome]

## Scale variables

In [4]:
# Build the modelling frame (predictors, response and the fold assignments),
# then standardise every predictor that is not a pure 0/1 indicator to zero
# mean and unit standard deviation.  Indicator columns are left unchanged.
model_cols = predictors + [response] + ['fold5x','fold10x']
peds_scale = peds[model_cols].copy()

for key in predictors:
    # a column counts as binary when its distinct values are all in {0, 1}
    if set(peds[key].unique()).issubset({0, 1}):
        print(key, " is binary")
        continue

    centered = peds_scale[key] - np.mean(peds_scale[key])
    peds_scale[key] = centered / np.std(centered)
    print(key, " is continuous")

sex  is binary
age_5_9  is binary
age_10_14  is binary
age_15_18  is binary
prop_restraint  is binary
no_restraint  is binary
front_row  is binary
dvtotal  is continuous
pdof_rear  is binary
pdof_nearside  is binary
pdof_farside  is binary
rolled  is binary
multicoll  is binary
ejection  is binary


## Cross validation logistic regression function

In [5]:
def log_reg_cv(data, predictors, response, fold_col, num_vars):
    '''
    Cross-validate an unpenalized logistic regression using the folds that
    are pre-assigned in the data set.

    For every distinct value in `fold_col`, the model is fit on all rows
    outside that fold and evaluated (ROC AUC, positive label 1) on the rows
    inside it.

    Parameters:
        data - DataFrame containing the predictor, response and fold columns
        predictors - list of predictor column names
        response - name of the outcome column
        fold_col - name of the column holding each row's fold assignment
        num_vars - value copied unchanged into the 'num_vars' column of
                   every result row (the caller passes the number of
                   predictors in the model)
    Returns:
        DataFrame with one row per fold and columns
        ['num_vars', 'fold'] + predictors + ['AUC']; each predictor column
        holds the coefficient fitted for that predictor in that fold
    '''

    result_columns = ['num_vars','fold'] + predictors + ['AUC']

    # get folds
    folds = np.sort(data[fold_col].unique())

    # set up LR model (the same estimator is refit from scratch for each fold)
    # NOTE: penalty='none' was deprecated in scikit-learn 1.2 and removed in
    # 1.4; on those versions this argument has to be penalty=None instead.
    lr_mod = LogisticRegression(random_state=1819, penalty='none',solver='saga',max_iter=1000)

    # Collect one row per fold and build the DataFrame once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []

    # loop through folds
    for fold in folds:
        # separate fold train/test data
        train = data[data[fold_col]!=fold]
        test = data[data[fold_col]==fold]

        # fit regression model
        lr_fit = lr_mod.fit(train[predictors], train[response])

        # predicted probability of the second class (column 1) on the held-out fold
        pred = lr_fit.predict_proba(test[predictors])

        # calc AUC
        fpr, tpr, thresholds = metrics.roc_curve(test[response], pred[:,1], pos_label=1)
        AUC = metrics.auc(fpr, tpr)

        # coef_[0] holds one coefficient per predictor, in `predictors` order
        rows.append([num_vars, fold] + list(lr_fit.coef_[0]) + [AUC])

    return pd.DataFrame(rows, columns=result_columns)

## Run regressions

In [6]:
%%time

# Exhaustive search: cross-validate a model for every non-empty subset of
# the predictors.  The per-combination result frames are collected in a list
# and concatenated once at the end; appending to a DataFrame inside the loop
# re-copies it on every iteration, and DataFrame.append was removed in
# pandas 2.0.
cv_frames = []

# loop through number of variables
for i in range(1,len(predictors)+1):
    print("Predictors: ", i)

    # loop through all combinations with i variables
    for comb in combinations(predictors,i):
        # run regression on each combination and keep its per-fold results
        cv_frames.append(log_reg_cv(peds_scale, list(comb), response, 'fold10x', i))

if cv_frames:
    # sort=True orders the union of the (differing) column sets alphabetically
    results = pd.concat(cv_frames, ignore_index=True, sort=True)
else:
    # no predictors -> empty frame with the expected columns
    results = pd.DataFrame(columns=['num_vars','fold']+predictors+['AUC'])

# Predictors that were not part of a model have no coefficient (NaN after the
# concat); record those as 0.  Note that fillna applies to every column, so a
# NaN in any other column (e.g. AUC) would be zero-filled as well.
results = results.fillna(0)

Predictors:  1
Predictors:  2
Predictors:  3
Predictors:  4
Predictors:  5
Predictors:  6
Predictors:  7
Predictors:  8
Predictors:  9
Predictors:  10
Predictors:  11
Predictors:  12
Predictors:  13
Predictors:  14
CPU times: user 12h 51min 37s, sys: 1d 35min 3s, total: 1d 13h 26min 40s
Wall time: 3h 15min 43s


In [7]:
# preview the first five result rows (one row per model/fold pair)
results.head(5)

Unnamed: 0,AUC,age_10_14,age_15_18,age_5_9,dvtotal,ejection,fold,front_row,multicoll,no_restraint,num_vars,pdof_farside,pdof_nearside,pdof_rear,prop_restraint,rolled,sex
0,0.500564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.226578
1,0.524273,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.204865
2,0.53415,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.195814
3,0.533901,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.196264
4,0.557143,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.171888


## Store results

In [8]:
# write the per-fold results to a CSV whose name includes the chosen outcome;
# index=False leaves the DataFrame's row index out of the file
results.to_csv("../Results/Model_avg_10x-"+outcome+".csv",index=False)