# Project: ICD-AIS conversion using Deep Learning using ICD10

This script converts the training, testing, and validation files into a format that OpenNMT can use.  The output format is:
- One patient per line
- ICD: Age first, then ECode, then procedure and diagnostic codes in ascending alphanumeric order
- AIS: Codes in ascending order, no post-dot severity
- Space in between codes

## Setup

In [1]:
import numpy as np
import pandas as pd
import datetime

In [2]:
# parameters
age_cat = True  # place ages in bins
include_age = True  # include age in input
include_sex = True  # include sex in input
include_ecodes = True  # include ecodes in input
include_pcodes = True  # include procedure codes in input

#### Combined NTDB files

In [3]:
# training
train_demo_file = "../Data/NTDB_combine/ntdb_train_demo.csv"
train_proc_file = "../Data/NTDB_combine/ntdb_train_proc.csv"
train_icd_file = "../Data/NTDB_combine/ntdb_train_icd.csv"
train_ais_file = "../Data/NTDB_combine/ntdb_train_ais.csv"

# validation
valid_demo_file = "../Data/NTDB_combine/ntdb_valid_demo.csv"
valid_proc_file = "../Data/NTDB_combine/ntdb_valid_proc.csv"
valid_icd_file = "../Data/NTDB_combine/ntdb_valid_icd.csv"
valid_ais_file = "../Data/NTDB_combine/ntdb_valid_ais.csv"

# testing
test_demo_file = "../Data/NTDB_combine/ntdb_test_demo.csv"
test_proc_file = "../Data/NTDB_combine/ntdb_test_proc.csv"
test_icd_file = "../Data/NTDB_combine/ntdb_test_icd.csv"
test_ais_file = "../Data/NTDB_combine/ntdb_test_ais.csv"

#### Sentence formated files

In [4]:
# training
train_sent_icd_file = "../Data/NTDB_sentence/train_icd10_agecat_proc.csv"
train_sent_ais_file = "../Data/NTDB_sentence/train_ais_no_sev.csv"

# validation
valid_sent_icd_file = "../Data/NTDB_sentence/valid_icd10_agecat_proc.csv"
valid_sent_ais_file = "../Data/NTDB_sentence/valid_ais_no_sev.csv"

# testing
test_sent_icd_file = "../Data/NTDB_sentence/test_icd10_agecat_proc.csv"
test_sent_ais_file = "../Data/NTDB_sentence/test_ais_no_sev.csv"

## Functions to convert data to input and output strings

In [5]:
# This function returns True with variable is NaN
def isNaN(num):
    return num != num

In [6]:
# This function converts values in a dataframe to a string.
#  It first finds all rows that match an id, then adds
#  a prefix to each value, and finally makes the values into
#  a string with a space separating values.

def list_to_sentence_fast(key, df, prefix, start_num, missing_key=False):
    
    #print(key, "==", df.iloc[start_num,:].inc_key, start_num)
    
    # check for any entries
    if(df.iloc[start_num,:].inc_key==key):

        # store end_num
        end_num = start_num

        # find end key
        if((end_num+1)<len(df)):
            
            # while there is more data
            while(key==df.iloc[end_num+1,:].inc_key):
                end_num = end_num + 1

                # check for end of df
                if((end_num+1)>=len(df)):
                    break

        # get entries
        entries = df.iloc[start_num:end_num+1,1]
        
        # update start number
        start_num = end_num+1
        
        #print(entries)
        
    # else no matching entries
    else:
        entries = '0'
        
        if missing_key:
            print("No entry, key:", key)
       
    #print(entries)
    
    # combine entries into string, prefix each code, and add spaces
    codes = prefix + (' ' + prefix).join(entries)
    
    # return codes, next start num
    return codes, start_num

In [7]:
# This function converts data from multiple dataframes
#  to a single sentence for each patient.  It takes
#  demographic data, procedures codes, and icd codes.

def form_input_sentences(demo, proc, icd):     
        
    # list for results
    input_sentences = []
    
    # order list
    demo = demo.sort_values('inc_key').reset_index(drop=True)
    proc = proc.sort_values(['inc_key','ICDPROCEDURECODE']).reset_index(drop=True)
    icd = icd.sort_values(['inc_key','ICDDIAGNOSISCODE']).reset_index(drop=True)
    
    # start numbers for D and P
    d_start = 0 
    p_start = 0
    
    # loop through all patients
    for i,row in demo.iterrows():
    
        #print(row)
    
        # check if in include age
        if include_age: 
            
            # get age 
            age_num = int(row.AGEYEARS)

            # bin ages if necessary
            if age_cat:
                # round to next lower ten years
                age_num = (age_num // 10) * 10

            # make age string
            age = "A" + str(age_num)
        else:
            age = ''
        
        # check if in include sex
        if include_sex:
            # get sex
            sex = 'M' if row.SEX == 1.0 else 'F'
        else:
            sex = ''
            
        # check if in include ecodes
        if include_ecodes:
            # get mechanism of injury code
            ecodes = row.PRIMARYECODEICD10
        else:
            ecodes = ''
        
        # check if in include pcodes
        if include_pcodes:
            # get procedure codes
            pcodes, p_start = list_to_sentence_fast(row.inc_key, proc, 'P', p_start)
        else:
            pcodes = ''
        
        # get diagnosis codes, always include these
        dcodes, d_start = list_to_sentence_fast(row.inc_key, icd, 'D', d_start)
        #dcodes = ''
    
        # form sentence
        pt_sent = ' '.join([age, sex, ecodes, pcodes, dcodes])
        
        # remove periods
        pt_sent = pt_sent.replace('.','')     
              
        # append to list
        input_sentences.append(pt_sent)
        
        #print(pt_sent)
        
        if i%100_000 == 0:
            print("i:",i,"    ", datetime.datetime.now())
        
    input_df = pd.DataFrame(input_sentences, columns=['values'])    
    
    return input_df

In [8]:
# This function converts data from ais dataframe
#  to a single sentence for each patient.  

def form_output_sentences(demo, ais):
    
    # list for results
    output_sentences = []
    
    # order list
    demo = demo.sort_values('inc_key').reset_index(drop=True)
    ais = ais.sort_values(['inc_key','AISCODE']).reset_index(drop=True)
    
    # remove severity code and convert to string
    ais['AISCODE'] = np.floor(ais.AISCODE).astype(int).astype(str)
    
    # start numbers 
    a_start = 0 
    
    # loop through all patients
    for i,row in demo.iterrows():
        
        # get ais codes
        codes, a_start = list_to_sentence_fast(row.inc_key, ais, '', a_start)
                      
        # append to list
        output_sentences.append(codes)
        
        if i%100_000 == 0:
            print("i:",i,"    ", datetime.datetime.now())
    
    output_df = pd.DataFrame(output_sentences, columns=['values'])    

    return output_df

## Convert Test Data

In [9]:
%%time
# read in testing input files
test_demo = pd.read_csv(test_demo_file)
test_proc = pd.read_csv(test_proc_file)
test_icd = pd.read_csv(test_icd_file)

print("Number of test patients:", len(test_demo))

# convert input strings
test_input = form_input_sentences(test_demo, test_proc, test_icd)

# write input to file
test_input.to_csv(test_sent_icd_file, index=False, header=False)

# release memory
del test_demo, test_proc, test_icd, test_input

Number of test patients: 10000
i: 0      2023-02-13 16:28:18.663642
CPU times: user 20 s, sys: 42.9 ms, total: 20.1 s
Wall time: 20.1 s


In [10]:
%%time
# read in output file
test_demo = pd.read_csv(test_demo_file)
test_ais = pd.read_csv(test_ais_file)

print("Number of test patients:", len(test_demo))
print("Number of test injuries:", len(test_ais))

# convert input strings
test_output = form_output_sentences(test_demo,test_ais)

# output files
test_output.to_csv(test_sent_ais_file, index=False, header=False)

#del test_ais, test_output

Number of test patients: 10000
Number of test injuries: 33497
i: 0      2023-02-13 16:28:38.741073
CPU times: user 7.64 s, sys: 20.9 ms, total: 7.66 s
Wall time: 7.68 s


## Convert Validation Data

In [11]:
%%time
# read in validation input files
valid_demo = pd.read_csv(valid_demo_file)
valid_proc = pd.read_csv(valid_proc_file)
valid_icd = pd.read_csv(valid_icd_file)

print("Number of validation patients:", len(valid_demo))

# convert input strings
valid_input = form_input_sentences(valid_demo, valid_proc, valid_icd)

# write input to file
valid_input.to_csv(valid_sent_icd_file, index=False, header=False)

# release memory
del valid_demo, valid_proc, valid_icd, valid_input

Number of validation patients: 203224
i: 0      2023-02-13 16:28:47.930775
i: 100000      2023-02-13 16:32:00.121772
i: 200000      2023-02-13 16:35:15.253924
CPU times: user 6min 35s, sys: 646 ms, total: 6min 35s
Wall time: 6min 35s


In [12]:
%%time
# read in output file
valid_demo = pd.read_csv(valid_demo_file)
valid_ais = pd.read_csv(valid_ais_file)

print("Number of validation patients:", len(valid_demo))
print("Number of validation injuries:", len(valid_ais))

# convert input strings
valid_output = form_output_sentences(valid_demo, valid_ais)

# output files
valid_output.to_csv(valid_sent_ais_file, index=False, header=False)

del valid_ais, valid_output

Number of validation patients: 203224
Number of validation injuries: 696184
i: 0      2023-02-13 16:35:23.426689
i: 100000      2023-02-13 16:36:38.087263
i: 200000      2023-02-13 16:37:52.535931
CPU times: user 2min 32s, sys: 362 ms, total: 2min 32s
Wall time: 2min 32s


## Convert Training Data

In [13]:
%%time
# read in training input files
train_demo = pd.read_csv(train_demo_file)
train_proc = pd.read_csv(train_proc_file)
train_icd = pd.read_csv(train_icd_file)

print("Number of training patients:", len(train_demo))

# convert input strings
train_input = form_input_sentences(train_demo, train_proc, train_icd)

# write input to file
train_input.to_csv(train_sent_icd_file, index=False, header=False)

# release memory
del train_demo, train_proc, train_icd, train_input

Number of training patients: 1829022
i: 0      2023-02-13 16:38:10.471602
i: 100000      2023-02-13 16:41:17.777810
i: 200000      2023-02-13 16:44:37.136904
i: 300000      2023-02-13 16:47:44.149483
i: 400000      2023-02-13 16:50:50.430777
i: 500000      2023-02-13 16:54:03.415548
i: 600000      2023-02-13 16:57:14.442515
i: 700000      2023-02-13 17:00:27.377983
i: 800000      2023-02-13 17:03:36.898338
i: 900000      2023-02-13 17:06:27.663364
i: 1000000      2023-02-13 17:09:36.765606
i: 1100000      2023-02-13 17:12:45.201342
i: 1200000      2023-02-13 17:16:05.259688
i: 1300000      2023-02-13 17:19:19.607486
i: 1400000      2023-02-13 17:22:39.325068
i: 1500000      2023-02-13 17:25:59.341781
i: 1600000      2023-02-13 17:29:14.222792
i: 1700000      2023-02-13 17:32:28.625972
i: 1800000      2023-02-13 17:35:39.272075
CPU times: user 58min 42s, sys: 5.46 s, total: 58min 47s
Wall time: 58min 48s


In [14]:
%%time
# read in output file
train_demo = pd.read_csv(train_demo_file)
train_ais = pd.read_csv(train_ais_file)

print("Number of training patients:", len(train_demo))
print("Number of training injuries:", len(train_ais))

# convert input strings
train_output = form_output_sentences(train_demo, train_ais)

# output files
train_output.to_csv(train_sent_ais_file, index=False, header=False)

del train_ais, train_output

Number of training patients: 1829022
Number of training injuries: 6277251
i: 0      2023-02-13 17:36:53.900298
i: 100000      2023-02-13 17:38:08.328161
i: 200000      2023-02-13 17:39:25.045085
i: 300000      2023-02-13 17:40:39.207668
i: 400000      2023-02-13 17:41:54.241986
i: 500000      2023-02-13 17:43:11.823713
i: 600000      2023-02-13 17:44:26.698609
i: 700000      2023-02-13 17:45:41.338605
i: 800000      2023-02-13 17:46:55.932261
i: 900000      2023-02-13 17:48:05.749752
i: 1000000      2023-02-13 17:49:18.925491
i: 1100000      2023-02-13 17:50:32.353942
i: 1200000      2023-02-13 17:51:47.426179
i: 1300000      2023-02-13 17:53:04.358037
i: 1400000      2023-02-13 17:54:21.273968
i: 1500000      2023-02-13 17:55:39.439697
i: 1600000      2023-02-13 17:56:54.768232
i: 1700000      2023-02-13 17:58:10.620531
i: 1800000      2023-02-13 17:59:24.899777
CPU times: user 23min 1s, sys: 3.78 s, total: 23min 5s
Wall time: 23min 5s


In [15]:
# training + validation patients
1834639 + 203848

2038487

In [16]:
# training + validation injuries
695429 + 6283720

6979149