## Project: Development of a reduced pediatric injury prediction model
Created by: Thomas Hartka, MD, MS  
Date created: 12/14/20  
  
This notebook runs ten-fold cross-validated logistic regression on every combination of candidate predictors of severe injury in pediatric patients.  The model coefficients and the AUC are recorded for each fold of each combination.

In [101]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import scipy.stats as st
import matplotlib.pyplot as plt
from itertools import combinations
import datetime
from multiprocessing import Process, Queue
import multiprocessing

## Set outcome

In [102]:
# outcome of interest
#  ISS -> ISS>=16
#  TIL -> any injury on target injury list
outcome = "ISS"

# when True, cases with missing values in the key columns are dropped before modeling
filter_missing = True

## Read in data

In [122]:
# load the pediatric case data; the path is relative to the notebook's working directory
peds = pd.read_csv("../Data/Peds-2010_2018-unfiltered.csv")

In [123]:
# TIL is only valid for AIS98, so the TIL outcome is restricted to NASS cases
# (this removes the CISS cases)
if outcome == "TIL":
    peds = peds.loc[peds["dataset"] == "NASS"]

## Set variables

In [124]:
# candidate predictors; every combination of these is evaluated
predictors = [
    'sex', 'age_5_9', 'age_10_14', 'age_15_18',
    'prop_restraint', 'any_restraint', 'front_row',
    'dvtotal', 'pdof_rear', 'pdof_nearside', 'pdof_farside',
    'rolled', 'multicoll', 'ejection',
    'splimit', 'abdeply', 'entrapment',
]

# reject unknown outcomes first, then map the outcome to its response column
if outcome not in ("ISS", "TIL"):
    raise Exception("Outcome not valid")
response = 'iss16' if outcome == "ISS" else 'target_inj'

## Filter cases with missing data

In [118]:
# drop cases that are missing any of the key fields
if filter_missing:
    # fields that must be present for a case to be kept
    missing_cols = ['sex','dvtotal','pdof_front','rolled','prop_restraint','any_restraint','splimit', response]

    # keep only complete rows and renumber them from zero
    peds = peds.dropna(subset=missing_cols).reset_index(drop=True)

In [119]:
# number of cases in the data set at this point in the notebook
len(peds)

13653

## Scale variables

In [7]:
# standardize continuous predictors; 0/1 indicator columns are left untouched
peds_scale = peds[predictors + [response] + ['fold5x','fold10x']].copy()
for key in predictors:
    column = peds_scale[key]

    # a column whose distinct values are all 0 or 1 is treated as binary
    if set(column.unique()) <= {0, 1}:
        print(key, " is binary")
        continue

    # center first, then divide by the standard deviation of the centered values
    centered = column - np.mean(column)
    peds_scale[key] = centered / np.std(centered)
    print(key, " is continuous")

sex  is binary
age_5_9  is binary
age_10_14  is binary
age_15_18  is binary
prop_restraint  is binary
any_restraint  is binary
front_row  is binary
dvtotal  is continuous
pdof_rear  is binary
pdof_nearside  is binary
pdof_farside  is binary
rolled  is binary
multicoll  is binary
ejection  is binary
splimit  is continuous
abdeply  is binary
entrapment  is binary


## Cross validation logistic regression function

In [8]:
def log_reg_cv(data, predictors, response, fold_col, num_vars):
    '''
    This function cross-validates a logistic regression fitted without a
    penalty, using the folds specified in a fold column of the data set.
    Each fold is held out once while the model is fitted on the other rows.
    It returns a dataframe with the coefficients and AUC for each fold.
    
    Parameters:
        data - data to analyze (DataFrame)
        predictors - list of columns for predictors
        response - outcome column; the AUC treats 1 as the positive label
        fold_col - folds column
        num_vars - value written to the 'num_vars' column of every row
    Returns:
        coefficient/AUC - DataFrame with columns
            ['num_vars', 'fold'] + predictors + ['AUC'], one row per fold
    '''
    
    # column layout of the results
    columns = ['num_vars', 'fold'] + predictors + ['AUC']
    
    # get folds
    folds = np.sort(data[fold_col].unique())
    
    # set up LR model
    # NOTE(review): penalty='none' is the spelling this notebook was written
    # against; confirm it is still accepted by the installed scikit-learn
    lr_mod = LogisticRegression(random_state=1819, penalty='none',solver='saga',max_iter=1000)
    
    # one entry per fold; the frame is assembled once after the loop instead
    # of being re-copied by an append on every iteration
    fold_rows = []
    
    # loop through folds
    for fold in folds:
        # separate fold train/test data
        train = data[data[fold_col]!=fold]
        test = data[data[fold_col]==fold]

        # fit regression model
        lr_fit = lr_mod.fit(train[predictors], train[response])

        # predict on fold test data
        pred = lr_fit.predict_proba(test[predictors])

        # calc AUC from the predicted probability in the second class column
        fpr, tpr, _ = metrics.roc_curve(test[response], pred[:,1], pos_label=1)
        AUC = metrics.auc(fpr, tpr)

        # gather results: coef_[0] holds one coefficient per predictor, in order
        fold_results = [num_vars, fold] + list(lr_fit.coef_[0]) + [AUC]

        # each row is built as a Series, as before, so a row's values share one dtype
        fold_rows.append(pd.Series(fold_results, index=columns))
    
    # passing columns keeps the layout even when there are no folds
    return pd.DataFrame(fold_rows, columns=columns)

## Run regressions

In [9]:
def helper_reg(comb):
    '''
    Worker for the process pool: cross-validates the model built from one
    combination of predictors, using the 'fold10x' fold column.
    
    Parameters:
        comb - combination of predictor column names
    Returns:
        per-fold coefficient/AUC DataFrame produced by log_reg_cv
    '''
    chosen = list(comb)
    return log_reg_cv(data=peds_scale, predictors=chosen, response=response,
                      fold_col='fold10x', num_vars=len(comb))

In [None]:
%%time

# cap on worker processes; never start more workers than there are CPUs
n_workers = min(14, multiprocessing.cpu_count())

# per-combination result frames, in the order the combinations are generated
cv_frames = []

# one pool serves every model size; the context manager shuts the workers
# down when the search finishes or an error is raised (previously a new pool
# was created for each size and never closed)
with multiprocessing.Pool(processes=n_workers) as pool:
    # loop through number of variables
    for i in range(1, len(predictors) + 1):
        print("Predictors: ", i)
        print(datetime.datetime.now())

        # all combinations with i variables
        params = list(combinations(predictors, i))

        # pool.map returns one result frame per combination, in input order
        cv_frames.extend(pool.map(helper_reg, params))

# stack everything once; sort=True orders the columns alphabetically, which is
# what the repeated append(..., sort=True) calls produced
results = pd.concat(cv_frames, ignore_index=True, sort=True)

# a predictor that is not part of a model has no coefficient -> record it as 0;
# only predictor columns are filled so an undefined AUC is not turned into 0
results[predictors] = results[predictors].fillna(0)

Predictors:  1
2021-05-20 10:07:47.569224
Predictors:  2
2021-05-20 10:07:48.601209
Predictors:  3
2021-05-20 10:07:54.112208
Predictors:  4
2021-05-20 10:08:19.869759
Predictors:  5
2021-05-20 10:09:52.304573
Predictors:  6
2021-05-20 10:17:00.015218


In [None]:
# preview the last rows of the combined results
results.tail(10)

## Store results

In [None]:
# write the combined results to a CSV whose name includes the chosen outcome
results.to_csv("../Results/Model_avg_10x-ext_pred-"+outcome+".csv",index=False)