# Project: ICD-AIS conversion using Deep Learning utilizing ICD10

This script opens the NTDB data files and creates the files used for training and testing translation methods.  These files include patient information (age, sex, MOI, year), observed ICD codes (long format), and observed AIS codes (long format), divided into train/val/test.

## Setup

In [1]:
import numpy as np
import pandas as pd

In [2]:
validation_perc = 0.1 # fraction (0.1 = 10%), not a percentage, of the 2017-2018 cases held out for validation
test_num = 10_000 # number of 2019 cases randomly sampled for the test set

#### Data files

In [3]:
# Each NTDB release lives in ../Data/NTDB_<year>/ and ships the same four tables,
# so every list below holds one path per year, in chronological order.
_ntdb_years = (2017, 2018, 2019)

# patient info
ntdb_files_pt = [f"../Data/NTDB_{year}/PUF_TRAUMA.csv" for year in _ntdb_years]

# procedures in ICD
ntdb_files_proc = [f"../Data/NTDB_{year}/PUF_ICDPROCEDURE.csv" for year in _ntdb_years]

# diagnoses in ICD
ntdb_files_icd = [f"../Data/NTDB_{year}/PUF_ICDDIAGNOSIS.csv" for year in _ntdb_years]

# diagnoses in AIS
ntdb_files_ais = [f"../Data/NTDB_{year}/PUF_AISDIAGNOSIS.csv" for year in _ntdb_years]

#### Output files

In [4]:
# All split files are written to one directory; each name encodes the split
# (train/valid/test) and the table (demo/proc/icd/ais).
_combine_dir = "../Data/NTDB_combine"

# training
train_demo = _combine_dir + "/ntdb_train_demo.csv"
train_proc = _combine_dir + "/ntdb_train_proc.csv"
train_icd = _combine_dir + "/ntdb_train_icd.csv"
train_ais = _combine_dir + "/ntdb_train_ais.csv"

# validation
valid_demo = _combine_dir + "/ntdb_valid_demo.csv"
valid_proc = _combine_dir + "/ntdb_valid_proc.csv"
valid_icd = _combine_dir + "/ntdb_valid_icd.csv"
valid_ais = _combine_dir + "/ntdb_valid_ais.csv"

# testing
test_demo = _combine_dir + "/ntdb_test_demo.csv"
test_proc = _combine_dir + "/ntdb_test_proc.csv"
test_icd = _combine_dir + "/ntdb_test_icd.csv"
test_ais = _combine_dir + "/ntdb_test_ais.csv"

## Read patient data
Combine patient data from different years and randomly select patients for training, validation, and testing.

In [5]:
# read in 2017 data
pt_dat = pd.read_csv(ntdb_files_pt[0])

# add year
pt_dat['adm_year'] = 2017

# create df for combined data (copy so the SEX recode below does not touch pt_dat)
pt_demo = pt_dat[['inc_key','SEX','AGEYEARS','PRIMARYECODEICD10','adm_year','ISS_05']].copy()

# Convert sex to 1(M)/2(F).
# Only the two known labels are mapped; any other value (including a missing one)
# becomes NaN instead of being silently coded as female, so the "remove rows with
# missing data" step can drop it.  The vectorized map also avoids a Python-level
# call per row.
pt_demo['SEX'] = pt_demo['SEX'].map({"Male": 1.0, "Female": 2.0})

  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
# read in 2018 data
pt_dat = pd.read_csv(ntdb_files_pt[1])

# add year
pt_dat['adm_year'] = 2018

# rename column to match 2017
pt_dat = pt_dat.rename(columns={'AGEYears':'AGEYEARS'})

# append to combined df
# (DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
# and ignore_index=True renumbers the combined index from 0)
pt_demo = pd.concat(
    [pt_demo, pt_dat[['inc_key','SEX','AGEYEARS','PRIMARYECODEICD10','adm_year','ISS_05']]],
    ignore_index=True,
)

In [7]:
# read in 2019 data
pt_dat = pd.read_csv(ntdb_files_pt[2])

# add year
pt_dat['adm_year'] = 2019

# rename column to match 2017
pt_dat = pt_dat.rename(columns={'AGEyears':'AGEYEARS'})

# append to combined df
# (DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
# and ignore_index=True renumbers the combined index from 0)
pt_demo = pd.concat(
    [pt_demo, pt_dat[['inc_key','SEX','AGEYEARS','PRIMARYECODEICD10','adm_year','ISS_05']]],
    ignore_index=True,
)

In [8]:
# remove pt_dat: the needed columns now live in pt_demo, so drop the name bound to
# the last raw table that was read
del pt_dat

In [9]:
# a missing age is recorded as 0 so that the row survives the missing-data filter
pt_demo['AGEYEARS'] = pt_demo['AGEYEARS'].fillna(value=0)

In [10]:
# discard every patient row that still has at least one missing field
pt_demo = pt_demo.dropna(axis=0, how='any')

In [11]:
# remove rows with non-numerics in the inc_key field
# (.copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the previous frame)
pt_demo = pt_demo[pd.to_numeric(pt_demo['inc_key'], errors='coerce').notnull()].copy()

# convert inc_key to integers
# The keys have 12 digits, which does not fit in 32 bits; plain `int` maps to a
# 32-bit integer on some platforms (e.g. Windows with NumPy 1.x), so request
# 64 bits explicitly.
pt_demo['inc_key'] = pt_demo.inc_key.astype('int64')

## Split patient keys for training, validation, and testing

In [12]:
# flag the admissions from 2019
from_2019 = pt_demo['adm_year'] == 2019

# the 2019 rows form the pool the test cases are drawn from
pt_demo_test = pt_demo[from_2019]

# everything else (2017-2018) is kept for training/validation
pt_demo = pt_demo[~from_2019]

In [13]:
# randomize the order of the 2017-2018 keys (fixed seed for reproducibility)
shuffled_keys = pt_demo.inc_key.sample(frac=1, random_state=42)

# the first validation_perc share of the shuffled keys goes to validation, the rest
# to training.  Positional slicing yields the same two pieces as np.split did, without
# np.split's reliance on Series.swapaxes, which pandas has deprecated.
n_valid = int(len(pt_demo)*validation_perc)
key_valid = shuffled_keys.iloc[:n_valid]
key_train = shuffled_keys.iloc[n_valid:]

# randomly sample keys for testing
# (sampling is without replacement, so this raises if fewer than test_num cases exist)
key_test = pt_demo_test.inc_key.sample(n=test_num, random_state=42)

In [14]:
# order every key set by increasing inc_key (sort_values sorts ascending by default)
key_valid, key_train, key_test = (
    keys.sort_values() for keys in (key_valid, key_train, key_test)
)

## Write out patient demographics

In [15]:
# one CSV per split, without the index column: a row is written to a split's file
# when its inc_key is among that split's keys
pt_demo.loc[pt_demo['inc_key'].isin(key_train)].to_csv(train_demo, index=False)
pt_demo.loc[pt_demo['inc_key'].isin(key_valid)].to_csv(valid_demo, index=False)
pt_demo_test.loc[pt_demo_test['inc_key'].isin(key_test)].to_csv(test_demo, index=False)

In [16]:
# remove demographics: both frames have been written to disk above; only the
# key_train / key_valid / key_test series are used by the following sections
del pt_demo, pt_demo_test

## Read/Write patient procedures

In [17]:
# load the 2017 procedure table
pt_dat = pd.read_csv(ntdb_files_proc[0])

# keep the key and the procedure code; the key column is renamed to 'inc_key'
key_rename = {'Inc_Key': 'inc_key'}
pt_proc = pt_dat.loc[:, ['Inc_Key', 'ICDPROCEDURECODE']].rename(columns=key_rename)

In [18]:
# read in 2018 data
pt_dat = pd.read_csv(ntdb_files_proc[1])

# select columns and rename key
pt_dat = pt_dat[['Inc_Key','ICDPROCEDURECODE']].rename(columns={'Inc_Key':'inc_key'})

# append to previous data
# (DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
# and ignore_index=True renumbers the combined index from 0)
pt_proc = pd.concat([pt_proc, pt_dat], ignore_index=True)

In [19]:
# load the 2019 procedure table; it is kept apart because 2019 is the test year
pt_dat = pd.read_csv(ntdb_files_proc[2])

# keep the key and the procedure code; the key column is renamed to 'inc_key'
key_rename = {'Inc_Key': 'inc_key'}
pt_proc_test = pt_dat.loc[:, ['Inc_Key', 'ICDPROCEDURECODE']].rename(columns=key_rename)

In [20]:
# discard procedure rows in which the key or the code is missing
pt_proc = pt_proc.dropna(axis=0, how='any')
pt_proc_test = pt_proc_test.dropna(axis=0, how='any')

In [21]:
# order rows by patient key, then by procedure code (ascending, the sort_values
# default), and renumber the index from 0
proc_sort_cols = ['inc_key', 'ICDPROCEDURECODE']
pt_proc = pt_proc.sort_values(by=proc_sort_cols).reset_index(drop=True)
pt_proc_test = pt_proc_test.sort_values(by=proc_sort_cols).reset_index(drop=True)

In [22]:
# one CSV per split, without the index column: a procedure row is written to a
# split's file when its inc_key is among that split's keys
pt_proc.loc[pt_proc['inc_key'].isin(key_train)].to_csv(train_proc, index=False)
pt_proc.loc[pt_proc['inc_key'].isin(key_valid)].to_csv(valid_proc, index=False)
pt_proc_test.loc[pt_proc_test['inc_key'].isin(key_test)].to_csv(test_proc, index=False)

In [23]:
# remove the procedure frames; they have been written to disk above
del pt_proc, pt_proc_test

## Read/Write patient ICD diagnosis codes

In [24]:
# load the 2017 ICD diagnosis table
pt_dat = pd.read_csv(ntdb_files_icd[0])

# keep the key and the diagnosis code; the key column is renamed to 'inc_key'
key_rename = {'Inc_Key': 'inc_key'}
pt_icd = pt_dat.loc[:, ['Inc_Key', 'ICDDIAGNOSISCODE']].rename(columns=key_rename)

In [25]:
# read in 2018 data
pt_dat = pd.read_csv(ntdb_files_icd[1])

# select columns and rename key
pt_dat = pt_dat[['Inc_Key','ICDDIAGNOSISCODE']].rename(columns={'Inc_Key':'inc_key'})

# append to previous data
# (DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
# and ignore_index=True renumbers the combined index from 0)
pt_icd = pd.concat([pt_icd, pt_dat], ignore_index=True)

In [26]:
# load the 2019 ICD diagnosis table; it is kept apart because 2019 is the test year
pt_dat = pd.read_csv(ntdb_files_icd[2])

# keep the key and the diagnosis code; the key column is renamed to 'inc_key'
key_rename = {'Inc_Key': 'inc_key'}
pt_icd_test = pt_dat.loc[:, ['Inc_Key', 'ICDDIAGNOSISCODE']].rename(columns=key_rename)

In [27]:
# discard diagnosis rows in which the key or the code is missing
pt_icd = pt_icd.dropna(axis=0, how='any')
pt_icd_test = pt_icd_test.dropna(axis=0, how='any')

In [28]:
# order rows by patient key, then by diagnosis code (ascending, the sort_values
# default), and renumber the index from 0
icd_sort_cols = ['inc_key', 'ICDDIAGNOSISCODE']
pt_icd = pt_icd.sort_values(by=icd_sort_cols).reset_index(drop=True)
pt_icd_test = pt_icd_test.sort_values(by=icd_sort_cols).reset_index(drop=True)

In [29]:
# one CSV per split, without the index column: a diagnosis row is written to a
# split's file when its inc_key is among that split's keys
pt_icd.loc[pt_icd['inc_key'].isin(key_train)].to_csv(train_icd, index=False)
pt_icd.loc[pt_icd['inc_key'].isin(key_valid)].to_csv(valid_icd, index=False)
pt_icd_test.loc[pt_icd_test['inc_key'].isin(key_test)].to_csv(test_icd, index=False)

In [30]:
# remove the ICD diagnosis frames; they have been written to disk above
del pt_icd, pt_icd_test

## Read/Write patient AIS diagnosis codes

In [31]:
# load the 2017 AIS diagnosis table
pt_dat = pd.read_csv(ntdb_files_ais[0])

# build one numeric AIS code: the pre-dot code plus severity/10 as the fractional part
pt_dat['AISCODE'] = pt_dat['AISPREDOT'].add(pt_dat['AISSeverity'].div(10))

# keep only the patient key and the combined code
pt_ais = pt_dat.loc[:, ['inc_key', 'AISCODE']]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [32]:
# read in 2018 data
pt_dat = pd.read_csv(ntdb_files_ais[1])

# make AIS code (pre-dot code plus severity/10 as the fractional part)
pt_dat['AISCODE'] = pt_dat.AISPREDOT + (pt_dat.AISSeverity/10)

# select columns
pt_dat = pt_dat[['inc_key','AISCODE']]

# append to previous data
# (DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
# and ignore_index=True renumbers the combined index from 0)
pt_ais = pd.concat([pt_ais, pt_dat], ignore_index=True)

In [33]:
# load the 2019 AIS diagnosis table; it is kept apart because 2019 is the test year
pt_dat = pd.read_csv(ntdb_files_ais[2])

# build one numeric AIS code: the pre-dot code plus severity/10 as the fractional part
# (this file spells the pre-dot column 'AISPreDot')
pt_dat['AISCODE'] = pt_dat['AISPreDot'].add(pt_dat['AISSeverity'].div(10))

# keep only the patient key and the combined code (no rename needed here)
pt_ais_test = pt_dat.loc[:, ['inc_key', 'AISCODE']]

In [34]:
# discard AIS rows in which the key or the code is missing
pt_ais = pt_ais.dropna(axis=0, how='any')
pt_ais_test = pt_ais_test.dropna(axis=0, how='any')

In [35]:
# order rows by patient key, then by AIS code (ascending, the sort_values
# default), and renumber the index from 0
ais_sort_cols = ['inc_key', 'AISCODE']
pt_ais = pt_ais.sort_values(by=ais_sort_cols).reset_index(drop=True)
pt_ais_test = pt_ais_test.sort_values(by=ais_sort_cols).reset_index(drop=True)

In [36]:
# one CSV per split, without the index column: an AIS row is written to a
# split's file when its inc_key is among that split's keys
pt_ais.loc[pt_ais['inc_key'].isin(key_train)].to_csv(train_ais, index=False)
pt_ais.loc[pt_ais['inc_key'].isin(key_valid)].to_csv(valid_ais, index=False)
pt_ais_test.loc[pt_ais_test['inc_key'].isin(key_test)].to_csv(test_ais, index=False)

In [37]:
# remove the last raw table and the AIS frames; all outputs have been written above
del pt_dat, pt_ais, pt_ais_test

In [38]:
# display the sampled test keys as a final check
key_test

2042267    190026915434
2042507    190026952586
2042652    190026952733
2042770    190026952851
2043012    190026953094
               ...     
3138618    190045782557
3138705    190046127851
3138843    190046127989
3138931    190046128077
3139173    190046128321
Name: inc_key, Length: 10000, dtype: int64