# **NTDB Feature Selection**
### Used for selecting important features for the downstream logistic regression model

In [None]:
# Importing required packages
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

# Capture the current working directory (it is only read here, not changed).
# Later cells build data-file paths relative to this value.
cwd = os.getcwd()
print(cwd)

## NTDB Bike data

In [None]:
# NTDB filtered trauma dataset
# The path is assembled with os.path.join instead of string concatenation so
# the separators are correct regardless of platform or a trailing slash in cwd.
trauma_path = os.path.join(cwd, "TQP_processed", "trauma_filtered.xlsx")
df = pd.read_excel(trauma_path)
df.head()

In [None]:
# Filtering for relevant features (no BIU)
# Column names that are later used to subset `df` into the predictor matrix X.
# The list is one literal that continues across two physical lines.
features = ['SEX', 'AGEYEARS', 'ASIAN', 'PACIFICISLANDER', 'RACEOTHER', 'AMERICANINDIAN', 'BLACK', 'WHITE', 'RACE_NA', 'RACE_UK', 'ETHNICITY', 'WORKRELATED',  'TRANSPORTMODE', 'TM_GROUNDAMBULANCE', 'TM_HELICOPTERAMBULANCE', 'TM_FIXEDWINGAMBULANCE', 'TM_PRIVPUBVEHWALKIN', 'TM_POLICE', 'TM_OTHER', 'TM_NA', 'TM_UK', 'INTERFACILITYTRANSFER', 'PREHOSPITALCARDIACARREST', 'EDDISCHARGEHRS', 'EDDISCHARGEDAYS', 'SBP', 'PULSERATE', 'TEMPERATURE', 'RESPIRATORYRATE', 'RESPIRATORYASSISTANCE', 'PULSEOXIMETRY', 'SUPPLEMENTALOXYGEN', 'HEIGHT', 'WEIGHT', 'PRIMARYMETHODPAYMENT', 'GCSEYE', 'GCSVERBAL', 'GCSMOTOR', 'TOTALGCS', 'GCSQ_SEDATEDPARALYZED', 'GCSQ_EYEOBSTRUCTION', 'GCSQ_INTUBATED', 'GCSQ_VALID', 'GCSQ_NA', 'GCSQ_UK', 'DRGSCR_AMPHETAMINE', 'DRGSCR_BARBITURATE', 'DRGSCR_BENZODIAZEPINES', 'DRGSCR_COCAINE', 'DRGSCR_METHAMPHETAMINE', 'DRGSCR_ECSTASY', 'DRGSCR_METHADONE', 'DRGSCR_OPIOID', 'DRGSCR_OXYCODONE', 'DRGSCR_PHENCYCLIDINE', 'DRGSCR_TRICYCLICDEPRESS', 'DRGSCR_CANNABINOID', 'DRGSCR_OTHER', 'DRGSCR_NONE', 'DRGSCR_NOTTESTED', 'DRGSCR_UK', 'DRGSCR_NA', 'ALCOHOLSCREEN', 'ALCOHOLSCREENRESULT', 'EDDISCHARGEDISPOSITION', 'HOSPDISCHARGEDISPOSITION', 'TOTALICULOS', 'TOTALVENTDAYS', 'FINALDISCHARGEHRS', 'FINALDISCHARGEDAYS', 'TBIHIGHESTTOTALGCS', 'TBIGCSMOTOR', 'PMGCSQ_SEDATEDPARALYZED', 'PMGCSQ_EYEOBSTRUCTION', 'PMGCSQ_INTUBATED', 'PMGCSQ_VALID', 'PMGCSQ_NA', 'PMGCSQ_UK', 'TBIPUPILLARYRESPONSE', 'TBIMIDLINESHIFT', 'ICPEVDRAIN', 'ICPPARENCH', 'ICPO2MONITOR', 'ICPJVBULB', 'ICPNONE', 'ICP_NA', 'ICP_UK', 'VTEPROPHYLAXISTYPE', 'VTEPROPHYLAXISHRS', 'VTEPROPHYLAXISDAYS', 'BLOOD4HOURS', 'PLASMA4HOURS', 'PLATELETS4HOURS', 'CRYOPRECIPITATE4HOURS', 'ANGIOGRAPHY', 'ESLIVER', 'ESSPLEEN', 'ESKIDNEY', 'ESPELVIS', 'ESRETROPERI', 'ESVASCULAR', 'ESOTHER', 'ES_UK', 'ES_NA', 'ANGIOGRAPHYHRS', 'HMRRHGCTRLSURGTYPE', 'HMRRHGCTRLSURGHRS', 'HMRRHGCTRLSURGDAYS', 'WITHDRAWALLST', 'ISS', 'TEACHINGSTATUS', 'HOSPITALTYPE', 'BEDSIZE', 'VERIFICATIONLEVEL', 'PEDIATRICVERIFICATIONLEVEL', 'STATEDESIGNATION', 
'STATEPEDIATRICDESIGNATION', 'CC_ADHD', 'CC_ADLC', 'CC_ALCOHOLISM', 'CC_ANGINAPECTORIS', 'CC_ANTICOAGULANT', 'CC_BLEEDING', 'CC_CHEMO', 'CC_CIRRHOSIS', 'CC_CONGENITAL', 'CC_COPD', 'CC_CVA', 'CC_DEMENTIA', 'CC_DIABETES', 'CC_DISCANCER', 'CC_FUNCTIONAL', 'CC_CHF', 'CC_HYPERTENSION', 'CC_MI', 'CC_OTHER', 'CC_PAD', 'CC_PREMATURITY', 'CC_MENTALPERSONALITY', 'CC_RENAL', 'CC_SMOKING', 'CC_STEROID', 'CC_SUBSTANCEABUSE', 'CC_UK', 'CC_NA', 'CC_PREGNANCY', 'HC_CLABSI', 'HC_DEEPSSI', 'HC_DVTHROMBOSIS', 'HC_ALCOHOLWITHDRAWAL', 'HC_CARDARREST', 'HC_CAUTI', 'HC_EMBOLISM', 'HC_EXTREMITYCS', 'HC_INTUBATION', 'HC_KIDNEY', 'HC_MI', 'HC_ORGANSPACESSI', 'HC_OSTEOMYELITIS', 'HC_OTHER', 'HC_RESPIRATORY', 'HC_RETURNOR', 'HC_SEPSIS', 'HC_STROKECVA', 'HC_SUPERFICIALINCISIONSSI', 'HC_PRESSUREULCER', 'HC_UNPLANNEDICU', 'HC_VAPNEUMONIA', 'HC_NA', 'HC_UK', 'HC_DELIRIUM', 'sex.factor', 'age.factor', 'race.factor', 'total.drugs', 'alcohol.use']

# 'PROTDEV_NONE', 'PROTDEV_LAP_BELT', 'PROTDEV_PER_FLOAT', 'PROTDEV_PROTECT_GEAR', 'PROTDEV_EYE_PROTECT', 'PROTDEV_CHILD_RESTRAINT', 'PROTDEV_HELMET', 'PROTDEV_AIRBAG_PRESENT', 'PROTDEV_PROTECT_CLOTH', 'PROTDEV_SHOULDER_BELT', 'PROTDEV_OTHER', 'PROTDEV_NA', 'PROTDEV_UK',

# features that need numeric conversions
#TEACHINGSTATUS sex.factor age.factor race.factor alcohol.use 

In [None]:
# Continuous / count columns that get z-scored (categorical and binary columns are left out)
features_to_standardize = [
    # demographics and vitals
    "AGEYEARS", "SBP", "PULSERATE", "TEMPERATURE", "RESPIRATORYRATE", "HEIGHT", "WEIGHT",
    # lengths of stay and time-to-event measures
    "EDDISCHARGEHRS", "EDDISCHARGEDAYS", "TOTALICULOS", "TOTALVENTDAYS",
    "FINALDISCHARGEHRS", "FINALDISCHARGEDAYS",
    # injury severity and procedure timings
    "ISS", "VTEPROPHYLAXISHRS", "VTEPROPHYLAXISDAYS", "ANGIOGRAPHYHRS",
    "HMRRHGCTRLSURGHRS", "HMRRHGCTRLSURGDAYS",
    # derived count
    "total.drugs",
]

# Columns that must be converted to numeric codes before modelling
features_to_numeric = [
    "TEACHINGSTATUS",
    "sex.factor",
    "age.factor",
    "race.factor",
    "alcohol.use",
]

In [None]:
# creating features df
# `.copy()` gives X its own data, so the column assignments below operate on an
# independent frame and do not raise pandas' SettingWithCopyWarning.
X = df[features].copy()

# Standardizing numerical values (not categorical or binary features).
# NOTE: the modelling pipeline defined further down applies its own StandardScaler,
# so this step mainly affects how X looks when it is inspected.
scaler = StandardScaler()
X[features_to_standardize] = scaler.fit_transform(X[features_to_standardize])

# Converting the factor columns to integer codes. One encoder is kept per column so
# that each column's code -> label mapping (encoder.classes_) stays available.
feature_encoders = {}
for f in features_to_numeric:
    feature_encoders[f] = LabelEncoder()
    X[f] = feature_encoders[f].fit_transform(X[f])

# outcomes data - helmet use
y_raw = df["helmet.factor"]
le = LabelEncoder()  # dedicated encoder for the outcome; le.classes_ holds its label order
y = le.fit_transform(y_raw)
X.head()

In [None]:
#X.head(17).to_csv(cwd + "/TQP_Processed/X.csv", index=False)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Creating the pipeline - LASSO regression
#pipeline = Pipeline([
#    ('imputer', SimpleImputer(strategy='mean')),  # Adding an imputer to handle NaN values, replacing them with the column mean
#    ('scaler', StandardScaler()),
#    ('model', Lasso())
#])

In [None]:
# Scaling the hyperparameters in the pipeline and setting number of cross validations - LASSO regression
#search = GridSearchCV(pipeline, 
#                      {'model__alpha': np.arange(0.1, 3, 0.1)},  #hyperparameter tuning
#                      cv = 5,  #cv = cross validation
#                      scoring = 'neg_mean_squared_error',  #metric we want to optimize when selection features
#                      verbose = 3
#                      )

In [None]:
# Creating the pipeline - logistic regression
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # handle NaN values by replacing them with the column mean
    ('scaler', StandardScaler()),  # common scale so coefficient magnitudes are comparable across features
    # max_iter is raised above the library default of 100: with this many features and
    # weak regularization (large C) the solver may otherwise stop before converging,
    # and the coefficients read later for feature selection would be unreliable.
    ('model', LogisticRegression(max_iter=1000))
])

In [None]:
# Grid search for tuning the logistic regression's regularization strength C
search = GridSearchCV(
    estimator=pipeline,
    param_grid=dict(model__C=np.logspace(-4, 4, 20)),  # 20 log-spaced candidates from 1e-4 to 1e4
    scoring='accuracy',  # metric used to rank the candidates
    cv=10,  # 10-fold cross validation
    verbose=1,
)

In [None]:
# Run the cross-validated grid search on the training split
search.fit(X=X_train, y=y_train)

In [None]:
# viewing the best hyperparameters found by the grid search
# (here the tuned parameter is the logistic regression's `model__C`, not a lasso alpha)
search.best_params_

In [None]:
# Coefficients of the winning model.
# The variable is called `lasso_model`, but the "model" step of this pipeline is the
# LogisticRegression estimator defined in the pipeline cell.
best_pipeline = search.best_estimator_
lasso_model = best_pipeline["model"]
# Keep only the first row of coef_, i.e. one coefficient per input feature
coef = lasso_model.coef_[0]
coef

### Features considered by the lasso/logistic regression

In [None]:
# Making a feature selection dataframe based on lasso/logistic regression coefficients
# A feature is labelled "selected" when its odds ratio lies outside [OR_LOWER, OR_UPPER],
# i.e. when it differs from 1 by more than 5%.
OR_LOWER, OR_UPPER = 0.95, 1.05

colnames_coef = X.columns.tolist()
coef_df = pd.DataFrame({
    "Feature": colnames_coef,
    "Coefficient": coef
})
coef_df["Abs_Coefficient"] = coef_df["Coefficient"].abs()
coef_df["Odds_Ratio"] = np.exp(coef_df["Coefficient"])
# Largest effects first
coef_df = coef_df.sort_values("Abs_Coefficient", ascending=False)

# Label all rows in one vectorized step; rows failing both comparisons
# (including any NaN odds ratio) are labelled "discarded_feature".
is_selected = (coef_df["Odds_Ratio"] > OR_UPPER) | (coef_df["Odds_Ratio"] < OR_LOWER)
coef_df["selected_feature"] = np.where(is_selected, "selected_feature", "discarded_feature")
coef_df

# Optional export to excel
#coef_df.to_excel(cwd + "/tables/feature_selection.xlsx", index=False)

In [None]:
# Rows whose odds ratio falls outside the 0.95-1.05 band
coef_df_selected = coef_df.query("Odds_Ratio > 1.05 or Odds_Ratio < 0.95")
print(coef_df_selected)
# Names of the retained features as a NumPy array
selected_features = coef_df_selected["Feature"].to_numpy(copy=True)
selected_features

### Features discarded by the lasso/logistic regression

In [None]:
# Rows whose odds ratio lies inside the closed 0.95-1.05 band (both ends included)
coef_df_discarded = coef_df[coef_df["Odds_Ratio"].between(0.95, 1.05)]
print(coef_df_discarded)
# Names of the dropped features as a NumPy array
discarded_features = coef_df_discarded["Feature"].to_numpy(copy=True)
discarded_features

In [None]:
# Number of selected features, then number of discarded features (one per line)
print(len(selected_features), len(discarded_features), sep="\n")

## Example
### Video used for feature selection with lasso regression: https://www.youtube.com/watch?v=KYHSMcP72A0

In [None]:
# Testing dataset
# NOTE: from here on X, y and features are rebound to the diabetes example data,
# replacing the NTDB objects of the same names created earlier in the notebook.
from sklearn.datasets import load_diabetes
X, y = load_diabetes(return_X_y=True)
features = load_diabetes().feature_names
features

In [None]:
# Load the diabetes example data once and unpack it
diabetes = load_diabetes()

# Predictors as a labelled DataFrame
X = pd.DataFrame(data=diabetes["data"], columns=diabetes["feature_names"])

# Outcome to predict
y = diabetes["target"]

X.head()

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Example pipeline: standardize the predictors, then fit a lasso regression
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", Lasso()),
    ]
)

In [None]:
# Grid search for tuning the lasso penalty alpha
search = GridSearchCV(
    estimator=pipeline,
    param_grid=dict(model__alpha=np.arange(0.1, 3, 0.1)),  # candidate alphas 0.1, 0.2, ... (stop value 3 excluded)
    scoring='neg_mean_squared_error',  # metric used to rank the candidates
    cv=5,  # 5-fold cross validation
    verbose=3,
)

In [None]:
# Run the cross-validated grid search on the training split
search.fit(X=X_train, y=y_train)

In [None]:
# viewing the best hyperparameter found by the grid search (the lasso `model__alpha`)
search.best_params_

In [None]:
# Coefficients of the best pipeline's "model" step (the second step, after the scaler)
coef = search.best_estimator_.named_steps["model"].coef_
coef

### Features considered by the lasso regression

In [None]:
np.array(features)[coef != 0]

### Features discarded by the lasso regression

In [None]:
np.array(features)[coef == 0]