# **NTDB TQP Merger**

### Importing required packages, defining file directories, and defining useful functions

In [None]:
# Importing required packages
import os
import glob
import re
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
from collections import Counter
from tableone import TableOne

# Setting the file directory for this folder and for the raw data (stored in another HD because it's huge)
# cwd is the current working directory of the running kernel; the glob patterns in later cells are built from it.
cwd = os.getcwd()
print(cwd)
# NOTE(review): raw_wd is a machine-specific absolute path and is not referenced by any cell visible here — confirm it is still needed.
raw_wd = "I:/BCM_projects/Bike-Injuries/NTDB"

In [None]:
def stat_printer(df):
    """
    Summarise a dataframe: print its shape and distinct INC_KEY count, then show its head.

    Input: a dataframe that contains an "INC_KEY" column (a KeyError is raised otherwise)
    Outputs: nothing is returned; one print statement with the shape and the number of
             distinct INC_KEY values, followed by a display() of the first rows
    """
    # dropna=False keeps a missing key counted as one distinct value
    n_unique_keys = df["INC_KEY"].nunique(dropna=False)
    summary = "Shape: {0}\nUnique INC KEYS: {1}".format(df.shape, n_unique_keys)
    print(summary)
    display(df.head())

## **Importing all the trauma dfs and merging**

In [None]:
# glob.glob returns paths in arbitrary (filesystem-dependent) order, while later cells
# address trauma_df_list by position (index 0 is treated as 2017, ... index 4 as 2021).
# Sorting makes that positional lookup deterministic.
# NOTE(review): assumes the file names sort chronologically by year — confirm.
trauma_file_list = sorted(glob.glob(cwd+"/TQP_Files/main/*.csv"))

# Old -> new column names that put every yearly file on one shared naming scheme.
# Several *MINS columns are relabelled as *HRS; the rename changes labels only, not values
# (the 2017 and 2018 values are divided by 60 in the unit-conversion cell).
trauma_rename_map = {"LOSDAYS": "FINALDISCHARGEDAYS", "LOSMINS": "FINALDISCHARGEHRS", "ANGIOGRAPHYMINS": "ANGIOGRAPHYHRS", "ISS_05": "ISS",
                     "CEREBRALMONITORDAYS": "TBICEREBRALMONITORDAYS", "CEREBRALMONITORMINS": "TBICEREBRALMONITORHRS", "HMRRHGCTRLSURGMINS": "HMRRHGCTRLSURGHRS", "VTEPROPHYLAXISMINS": "VTEPROPHYLAXISHRS",
                     "EMSSCENEDAYS": "EMSDEPARTUREDAYS", "EMSSCENEMINS": "EMSDEPARTUREHRS", "ADDITIONALECODE1": "ADDITIONALECODEICD10", "EMSRESPONSEDAYS": "EMSARRIVALDAYS", "EMSRESPONSEMINS": "EMSARRIVALHRS",
                     "EDDAYS": "EDDISCHARGEDAYS", "EDMINS": "EDDISCHARGEHRS"}

# Read each yearly trauma file, harmonise its column names and collect the frames in file order
trauma_df_list = []
for file_dir in trauma_file_list:
    df = pd.read_csv(file_dir)
    df = df.rename(columns=trauma_rename_map)
    trauma_df_list.append(df)
    print("Done with {}".format(file_dir))
print(len(trauma_df_list))

In [None]:
# Unit conversions on each df (only 2017 & 2018 need converting, from minutes to hours)
# Yearly frames are picked by their position in trauma_df_list; 2017 and 2018 are copied
# so the frames stored in the list stay unconverted.
df_2017 = trauma_df_list[0]
df_2017_mod = df_2017.copy()
df_2018 = trauma_df_list[1]
df_2018_mod = df_2018.copy()
df_2019 = trauma_df_list[2]
df_2020 = trauma_df_list[3]
df_2021 = trauma_df_list[4]

# Columns that carry an *HRS label but still hold minutes in the 2017 and 2018 frames
minute_valued_cols = [
    "FINALDISCHARGEHRS",
    "ANGIOGRAPHYHRS",
    "TBICEREBRALMONITORHRS",
    "HMRRHGCTRLSURGHRS",
    "VTEPROPHYLAXISHRS",
    "EMSDEPARTUREHRS",
    "EMSARRIVALHRS",
    "EDDISCHARGEHRS",
]

# Divide by 60 column by column: all of 2017 first, then all of 2018
for yearly_mod_df in (df_2017_mod, df_2018_mod):
    for hrs_col in minute_valued_cols:
        yearly_mod_df[hrs_col] = yearly_mod_df[hrs_col] / 60

# To spot-check a converted frame, display its INC_KEY, Year and minute_valued_cols columns

In [None]:
# Getting column names for each trauma year
df1_cols = df_2017_mod.columns
df2_cols = df_2018_mod.columns
df3_cols = df_2019.columns
df4_cols = df_2020.columns
df5_cols = df_2021.columns
trauma_mod_df_list = [df_2017_mod, df_2018_mod, df_2019, df_2020, df_2021]

# Finding the common columns (np.intersect1d returns them as a sorted array)
common_cols = df1_cols
for year_cols in (df2_cols, df3_cols, df4_cols, df5_cols):
    common_cols = np.intersect1d(common_cols, year_cols)

# Concatenating the yearly frames row-wise. join='inner' already keeps only the columns
# shared by every frame, so common_cols must NOT be passed as `keys`: `keys` labels the
# frames being concatenated (one key per frame), is paired with them positionally
# (a length mismatch truncates silently on older pandas and is deprecated on newer ones),
# and is discarded anyway when ignore_index=True.
trauma = pd.concat(trauma_mod_df_list, axis=0, ignore_index=True, join='inner')

# Sanity check: the merged frame carries exactly the columns shared by all years
assert set(trauma.columns) == set(common_cols), "merged trauma columns differ from the common columns"
stat_printer(trauma)

In [None]:
# Number of merged trauma rows whose ALCOHOLSCREENRESULT is above 0.02
# NOTE(review): 0.02 is presumably a blood-alcohol cut-off — confirm units and rationale
alcohol_above_threshold = trauma["ALCOHOLSCREENRESULT"] > 0.02
len(trauma.loc[alcohol_above_threshold])

In [None]:
# Exporting trauma df to csv
#trauma.to_csv(cwd+"/TQP_Processed/trauma_merged.csv", index=False)

## **Importing all the AIS (injury severity) dfs and merging**

In [None]:
# glob.glob returns paths in arbitrary (filesystem-dependent) order; sorting keeps the
# position of each yearly AIS frame in ais_df_list stable from run to run.
# NOTE(review): assumes the file names sort chronologically by year — confirm.
ais_file_list = sorted(glob.glob(cwd+"/TQP_Files/ais/*.csv"))

# Read every AIS file unchanged and collect the frames in file order
ais_df_list = []
for file_dir in ais_file_list:
    df = pd.read_csv(file_dir)
    ais_df_list.append(df)
    print("Done with {}".format(file_dir))
print(len(ais_df_list))

In [None]:
# Print the column index of each loaded AIS frame so the yearly schemas can be compared
for ais_year_df in ais_df_list:
    year_columns = ais_year_df.columns
    print(year_columns)

In [None]:
# Getting column names for each AIS year
df1_cols = ais_df_list[0].columns
df2_cols = ais_df_list[1].columns
df3_cols = ais_df_list[2].columns
df4_cols = ais_df_list[3].columns
df5_cols = ais_df_list[4].columns

# Gathering common shared columns (np.intersect1d returns them as a sorted array)
common_cols = df1_cols
for year_cols in (df2_cols, df3_cols, df4_cols, df5_cols):
    common_cols = np.intersect1d(common_cols, year_cols)

# Concatenating the yearly frames row-wise. join='inner' already keeps only the columns
# shared by every frame, so common_cols must NOT be passed as `keys`: `keys` labels the
# frames being concatenated (one key per frame), is paired with them positionally
# (a length mismatch truncates silently on older pandas and is deprecated on newer ones),
# and is discarded anyway when ignore_index=True.
ais = pd.concat(ais_df_list, axis=0, ignore_index=True, join='inner')

# Sanity check: the merged frame carries exactly the columns shared by all years
assert set(ais.columns) == set(common_cols), "merged AIS columns differ from the common columns"
stat_printer(ais)

In [None]:
# Exporting AIS df to csv
#ais.to_csv(cwd+"/TQP_Processed/ais_merged.csv", index=False)

## Merging Co-morbidities

In [None]:
# getting each df (mostly focussing on 2017 & 2018 because they included co-morbidities in the trauma dfs)
# The *_mod copies are rebuilt from trauma_df_list here, so they do not carry the
# minutes-to-hours conversion applied in the trauma-merge section.
df_2017 = trauma_df_list[0]
df_2017_mod = df_2017.copy()
df_2018 = trauma_df_list[1]
df_2018_mod = df_2018.copy()
df_2019 = trauma_df_list[2]
df_2020 = trauma_df_list[3]
df_2021 = trauma_df_list[4]

# getting all comorbidities files
# glob.glob returns paths in arbitrary (filesystem-dependent) order, while a later cell
# reads comor_df_list[0], [1], [2] as 2019, 2020, 2021; sorting makes that deterministic.
# NOTE(review): assumes the file names sort chronologically by year — confirm.
comor_file_list = sorted(glob.glob(cwd+"/TQP_Files/preexisting/*.csv"))

# Numeric PREEXISTINGCONDITION code -> CC_* column name
comor_dict = {13: "CC_ADLC", 2: "CC_ALCOHOLISM", 32: "CC_ANGINAPECTORIS", 31: "CC_ANTICOAGULANT", 30: "CC_ADHD",
             4: "CC_BLEEDING", 10: "CC_CVA", 23: "CC_COPD", 9: "CC_RENAL", 25: "CC_CIRRHOSIS", 6: "CC_CONGENITAL", 
             7: "CC_CHF", 8: "CC_SMOKING", 5: "CC_CHEMO", 26: "CC_DEMENTIA", 11: "CC_DIABETES", 12: "CC_DISCANCER",
             15: "CC_FUNCTIONAL", 19: "CC_HYPERTENSION", 33: "CC_MENTALPERSONALITY", 34: "CC_MI", 35: "CC_PAD",
             38: "CC_PREGNANCY", 37: "CC_PREMATURITY", 24: "CC_STEROID", 36: "CC_SUBSTANCEABUSE"}

# Reshape each long-format file (one row per incident and condition) into one row per
# (INC_KEY, Year) with one column per condition
comor_df_list = []
for file_dir in comor_file_list:
    df = pd.read_csv(file_dir)
    df["PREEXISTINGCONDITION"] = df["PREEXISTINGCONDITION"].replace(comor_dict)
    # Recode answer 2 as 0; answer 1 stays 1
    df["PREEXISTINGCONDITIONANSWER"] = df["PREEXISTINGCONDITIONANSWER"].replace({2:0, 1:1})
    df = df.pivot(index=["INC_KEY", "Year"], columns=["PREEXISTINGCONDITION"], values="PREEXISTINGCONDITIONANSWER").reset_index()
    comor_df_list.append(df)
    print("Done with {}".format(file_dir))
print(len(comor_df_list))

In [None]:
# List the columns of the 2017 frame whose name begins with "CC_"
col_names = df_2017_mod.columns
is_cc_column = col_names.str.startswith("CC_")
col_names[is_cc_column]

In [None]:
# Identifier columns followed by every CC_* column pulled from the 2017 and 2018 trauma frames
cols = [
    "INC_KEY", "Year",
    'CC_ADHD', 'CC_ADLC', 'CC_ALCOHOLISM', 'CC_ANGINAPECTORIS',
    'CC_ANTICOAGULANT', 'CC_BLEEDING', 'CC_CHEMO', 'CC_CIRRHOSIS',
    'CC_CONGENITAL', 'CC_COPD', 'CC_CVA', 'CC_DEMENTIA', 'CC_DIABETES',
    'CC_DISCANCER', 'CC_FUNCTIONAL', 'CC_CHF', 'CC_HYPERTENSION', 'CC_MI',
    'CC_OTHER', 'CC_PAD', 'CC_PREMATURITY', 'CC_MENTALPERSONALITY',
    'CC_RENAL', 'CC_SMOKING', 'CC_STEROID', 'CC_SUBSTANCEABUSE', 'CC_UK', 'CC_NA',
]

# 2017 and 2018: the comorbidity columns are selected out of the trauma frames
comor_2017 = df_2017_mod.loc[:, cols]
comor_2018 = df_2018_mod.loc[:, cols]

# 2019-2021: the pivoted frames built from the separate pre-existing-condition files
comor_2019 = comor_df_list[0]
comor_2020 = comor_df_list[1]
comor_2021 = comor_df_list[2]

# Stack all years row-wise; the default outer join keeps the union of columns, so a
# column absent from a given year is filled with NaN for that year's rows
comor_list = [comor_2017, comor_2018, comor_2019, comor_2020, comor_2021]
comor = pd.concat(comor_list, ignore_index=True)
comor

In [None]:
# Exporting to csv
#comor.to_csv(cwd+"/TQP_Processed/comorbidities_merged.csv", index=False)

## Merging Adverse Events

In [None]:
# getting each df (mostly focussing on 2017 & 2018 because they included adverse events in the trauma dfs)
# The *_mod copies are rebuilt from trauma_df_list here, so they do not carry the
# minutes-to-hours conversion applied in the trauma-merge section.
df_2017 = trauma_df_list[0]
df_2017_mod = df_2017.copy()
df_2018 = trauma_df_list[1]
df_2018_mod = df_2018.copy()
df_2019 = trauma_df_list[2]
df_2020 = trauma_df_list[3]
df_2021 = trauma_df_list[4]

# getting all adverse events files
# glob.glob returns paths in arbitrary (filesystem-dependent) order, while a later cell
# reads ae_df_list[0], [1], [2] as 2019, 2020, 2021; sorting makes that deterministic.
# NOTE(review): assumes the file names sort chronologically by year — confirm.
ae_file_list = sorted(glob.glob(cwd+"/TQP_Files/events/*.csv"))

# Numeric HOSPITALEVENT code -> HC_* column name
ae_dict = {4: "HC_KIDNEY", 5: "HC_RESPIRATORY", 36: "HC_ALCOHOLWITHDRAWAL", 8: "HC_CARDARREST", 33: "HC_CAUTI",
             34: "HC_CLABSI", 12: "HC_DEEPSSI", 14: "HC_DVTHROMBOSIS", 15: "HC_EXTREMITYCS", 18: "HC_MI", 19: "HC_ORGANSPACESSI", 
             29: "HC_OSTEOMYELITIS", 21: "HC_EMBOLISM", 37: "HC_PRESSUREULCER", 32: "HC_SEPSIS", 22: "HC_STROKECVA", 38: "HC_SUPERFICIALINCISIONSSI",
             31: "HC_UNPLANNEDICU", 25: "HC_INTUBATION", 40: "HC_RETURNOR", 35: "HC_VAPNEUMONIA", 39: "HC_DELIRIUM"}

# Reshape each long-format file (one row per incident and event) into one row per
# (INC_KEY, Year) with one column per event
ae_df_list = []
for file_dir in ae_file_list:
    df = pd.read_csv(file_dir)
    df["HOSPITALEVENT"] = df["HOSPITALEVENT"].replace(ae_dict)
    # Recode answer 2 as 0; answer 1 stays 1
    df["HOSPITALEVENTANSWER"] = df["HOSPITALEVENTANSWER"].replace({2:0, 1:1})
    df = df.pivot(index=["INC_KEY", "Year"], columns=["HOSPITALEVENT"], values="HOSPITALEVENTANSWER").reset_index()
    ae_df_list.append(df)
    print("Done with {}".format(file_dir))
print(len(ae_df_list))

In [None]:
# List the columns of the 2017 frame whose name begins with "HC_"
col_names = df_2017_mod.columns
is_hc_column = col_names.str.startswith("HC_")
col_names[is_hc_column]

In [None]:
# Inspect the columns of the second pivoted adverse-events frame (index 1 of ae_df_list)
ae_df_list[1].columns

In [None]:
# Identifier columns followed by every HC_* adverse-event column pulled from the 2017 and 2018 trauma frames
cols = [
    "INC_KEY", "Year",
    'HC_CLABSI', 'HC_DEEPSSI', 'HC_DVTHROMBOSIS', 'HC_ALCOHOLWITHDRAWAL',
    'HC_CARDARREST', 'HC_CAUTI', 'HC_EMBOLISM', 'HC_EXTREMITYCS',
    'HC_INTUBATION', 'HC_KIDNEY', 'HC_MI', 'HC_ORGANSPACESSI',
    'HC_OSTEOMYELITIS', 'HC_OTHER', 'HC_RESPIRATORY', 'HC_RETURNOR',
    'HC_SEPSIS', 'HC_STROKECVA', 'HC_SUPERFICIALINCISIONSSI',
    'HC_PRESSUREULCER', 'HC_UNPLANNEDICU', 'HC_VAPNEUMONIA', 'HC_NA',
    'HC_UK',
]

# 2017 and 2018: the adverse-event columns are selected out of the trauma frames
ae_2017 = df_2017_mod.loc[:, cols]
ae_2018 = df_2018_mod.loc[:, cols]

# 2019-2021: the pivoted frames built from the separate hospital-event files
ae_2019 = ae_df_list[0]
ae_2020 = ae_df_list[1]
ae_2021 = ae_df_list[2]

# Stack all years row-wise; the default outer join keeps the union of columns, so a
# column absent from a given year is filled with NaN for that year's rows
ae_list = [ae_2017, ae_2018, ae_2019, ae_2020, ae_2021]
ae = pd.concat(ae_list, ignore_index=True)
ae

In [None]:
# Exporting to csv
#ae.to_csv(cwd+"/TQP_Processed/adverse_events_merged.csv", index=False)