In [1]:
# preprocess patient data into csv files for convenience
# one set will contain the patient data in the same format
# the other set will hold extracted patient features: race, sex, age at time of diag1, age at diag2
# calculate duration between events and label whether patient was censored
# -- according to HCUP documentation, the number of days between visits is the difference between visit ids

# diagnosis codes correspond to ICD-9
# -- I will only use the general code for diseases (i.e. first three digits: nnn.*)

In [2]:
# Create Positive Sets
# -- the positive patients are patients who have been diagnosed with both AFib and the complication, with the AFib 
#    diagnosis occurring a defined number of hospital visits before the complication was diagnosed
# -- specifying visits between diagnoses is useful for ensuring our patients have not been diagnosed with both
#    diseases in the same visit

afib = 'D_42731'
comps = ['D_425', 'D_428', 'D_434'] # cardiomyopathy, heart failure, stroke
visitsInBetweenList = [0, 2, 3, 4] # can be set to zero


def _visitCodes(visit):
    """Return the code fields of one visit record.

    A visit is a comma-separated string. The first two fields and the last
    field are dropped; per the original notes those are visitlink, age and
    year, which leaves only the diagnosis and procedure codes.
    """
    return visit.split(',')[2:-1]


def _hasCode(visitSlice, target):
    """Return True if `target` is a substring of any code in any visit of `visitSlice`."""
    return any(target in code for visit in visitSlice for code in _visitCodes(visit))


def _isPositivePatient(visits, afib, comp, visitsInBetween):
    """Decide whether a patient belongs to the positive set.

    visits          -- list of visit strings, in the order stored in the dataset
    afib            -- code marking the initial (AFib) diagnosis
    comp            -- code prefix of the complication of interest
    visitsInBetween -- non-negative number of complication-free visits required
                       between the AFib visit and the complication visit

    Codes are matched by substring (`afib in code`, `comp in code`).
    Returns True only when all of the following hold:
      1. the first visit mentioning either code mentions AFib and not the complication,
      2. the next `visitsInBetween` visits do not mention the complication,
      3. at least one visit after that gap mentions the complication.
    """
    # cheap early exit: this patient does not have enough visits
    if len(visits) < visitsInBetween:
        return False

    # phase 1: find the first visit that carries afib or the complication
    nextIndex = None
    for position, visit in enumerate(visits):
        sawAfib = False
        sawComp = False
        for code in _visitCodes(visit):
            if afib in code:
                sawAfib = True
            elif comp in code:
                sawComp = True
                break
        if sawComp:
            # complication shows up before, or in the same visit as, the afib diagnosis
            return False
        if sawAfib:
            nextIndex = position + 1
            break

    if nextIndex is None: # afib was never diagnosed
        return False

    # need `visitsInBetween` gap visits plus at least one later visit for the complication
    if len(visits) - nextIndex - 1 < visitsInBetween:
        return False

    # phase 2: the gap visits must be free of the complication
    gapEnd = nextIndex + visitsInBetween
    if _hasCode(visits[nextIndex:gapEnd], comp):
        return False

    # phase 3: the complication must appear in some visit after the gap
    return _hasCode(visits[gapEnd:], comp)


for comp in comps:
    for visitsInBetween in visitsInBetweenList:
        count = 0
        outputName = 'afib_' + comp[2:] + '_' + str(visitsInBetween) + '.csv'

        # both handles are managed by `with` so they are closed even if a malformed line raises
        with open(outputName, 'w') as newCSV, open('../datasets/FullHCUPDataset.csv') as f:
            for line in f: #for each of our patients
                visits = line.replace('\n','').split('|')[2].split('#') #list of all of this patient's visits

                if _isPositivePatient(visits, afib, comp, visitsInBetween):
                    count += 1
                    newCSV.write(line)

        print(comp, '-', visitsInBetween, ': ', count)

D_425 - 0 :  74582
D_425 - 2 :  32580
D_425 - 3 :  22570
D_425 - 4 :  15901
D_428 - 0 :  148242
D_428 - 2 :  43175
D_428 - 3 :  25429
D_428 - 4 :  15481
D_434 - 0 :  48886
D_434 - 2 :  20624
D_434 - 3 :  14067
D_434 - 4 :  9813


In [3]:
# Create Negative Sets
# -- negative patients are the censored patients
# -- a censored patient is a patient who has not developed the complication of interest after AFib diagnosis
# -- NOTE: the scan below covers every visit of the patient, so a complication recorded
#    before the AFib visit also excludes the patient from the negative set

for comp in comps:
    count = 0

    # both handles are managed by `with` so they are closed even if a malformed line raises
    with open('afib_' + comp[2:] + '_Negative.csv', 'w') as newCSV, \
            open('../datasets/FullHCUPDataset.csv') as f:
        for line in f: #for each of our patients
            visits = line.replace('\n','').split('|')[2].split('#') #list of all of this patient's visits

            afibFound = False #used to make sure patient has initial disease
            compFound = False #used to check if the complication was found and discard this patient

            for visit in visits:
                # split on comma and drop the first two fields and the last one
                # (visitlink, age, and year per the original notes); only diags and procs are left
                for code in visit.split(',')[2:-1]:
                    if afib in code: #primary diag code
                        afibFound = True
                    elif comp in code:
                        compFound = True
                        break

                if compFound: # no need to look at later visits, patient is discarded
                    break

            # keep only patients with AFib and no complication in any visit
            if afibFound and not compFound:
                newCSV.write(line)
                count += 1

    print(comp, ': ', count)

D_425 :  880247
D_428 :  506479
D_434 :  945576


In [4]:
# format patient subset data for survival analysis
# -- duration, observed?, race, sex, age1, age2 
# positive patients

files = []

for file in files:
    comp = file[5:9] # get disease code
    
    newCSV = open(file[:-4] + '_Survival.csv', 'w')
    with open(file) as f:
        for line in f:
            patientInfo = line.replace('\n','').split('|')
            
            demos = patientInfo[0].split(',')
            race = demos[0]
            sex = demos[1]
            
            visits = patientInfo[1].split('#')
            
            
            for i, visit in enumerate(visits):
                
            