In [1]:
# Create Positive Sets
# -- positive patients have been diagnosed with AFib and MAY OR MAY NOT have developed the complication;
#    if they did, the AFib diagnosis occurred a defined number of hospital visits before the
#    complication was diagnosed
# -- requiring visits between the two diagnoses ensures our patients have not been diagnosed with
#    both diseases in the same visit
#
# Input layout, as read by this cell: one patient per line of FullHCUPDataset.csv; the third
#    '|'-separated field holds the patient's visits joined by '#'; each visit is a comma-separated
#    record whose first two fields and last field are dropped before matching codes (the remainder
#    is presumably the diagnosis/procedure codes -- TODO confirm against the dataset layout).
# Codes are matched by substring (`afib in code`), so 'D_425' also matches e.g. 'D_4254'.
#
# Output: one file per (complication, visitsInBetween) pair containing
#    1. every uncensored patient (AFib first, complication diagnosed later), then
#    2. the first N censored patients in file order (AFib, never the complication),
#       where N is 30% of the uncensored count, rounded down.

afib = 'D_42731'
comps = ['D_425', 'D_428', 'D_434'] # cardiomyopathy, heart failure, stroke
visitsInBetweenList = [0, 2, 3, 4] # can be set to zero

for comp in comps:
    for visitsInBetween in visitsInBetweenList:
        count = 0 # number of uncensored patients written for this (comp, visitsInBetween) pair

        # `with` guarantees the output file is flushed and closed even if a line fails to parse
        with open('afib_' + comp[2:] + '_' + str(visitsInBetween) + '_X2.csv', 'w') as newCSV:

            # ---- pass 1: uncensored patients (AFib, then the complication later on) ----
            with open('../datasets/FullHCUPDataset.csv') as f:
                for line in f: #for each of our patients
                    visits = line.replace('\n','').split('|')[2].split('#') #list of all of this patient's visits

                    # this patient does not have enough visits
                    if len(visits) < visitsInBetween:
                        continue

                    afibFound = False #set once the first AFib diagnosis is seen
                    compFound = False #set if the complication shows up too early
                    index = 0

                    # walk forward to the first visit that mentions AFib or the complication
                    while index < len(visits):
                        codes = visits[index].split(',')[2:-1] #drop the first two fields and the last one
                        index += 1

                        for code in codes:
                            if afib in code:
                                afibFound = True # keep scanning: a later code in this visit may be the complication
                            elif comp in code:
                                compFound = True
                                break

                        if afibFound or compFound: # early return
                            break

                    # discard if AFib never appears, or if the complication appears before / in the
                    # same visit as the first AFib diagnosis
                    if not afibFound or compFound:
                        continue

                    numVisits = visitsInBetween # countdown copy; keep original value saved

                    # need numVisits gap visits plus at least one more visit for the complication
                    if len(visits) - index - 1 < numVisits:
                        continue

                    # the gap visits between the AFib diagnosis and the complication diagnosis
                    # must not mention the complication
                    while index < len(visits) and numVisits > 0:
                        codes = visits[index].split(',')[2:-1]
                        index += 1
                        numVisits -= 1

                        if any(comp in code for code in codes): #complication code found too early
                            compFound = True
                            break

                    if compFound:
                        continue

                    # any remaining visit that mentions the complication makes this an uncensored patient
                    while index < len(visits):
                        codes = visits[index].split(',')[2:-1]
                        index += 1

                        if any(comp in code for code in codes): #we found the complication
                            count += 1
                            newCSV.write(line)
                            break

            # ---- pass 2: censored patients (AFib, complication never recorded) ----
            # Integer arithmetic on purpose: the previous float countdown (count * .3, minus 1 per
            # patient, compared with == 0) only stopped when count was a multiple of 10, so in
            # general every eligible censored patient was written instead of 30%.
            censoredNum = count * 3 // 10

            if censoredNum > 0: # nothing to sample when there are too few uncensored patients
                with open('../datasets/FullHCUPDataset.csv') as f:
                    for line in f: #for each of our patients
                        visits = line.replace('\n','').split('|')[2].split('#') #list of all of this patient's visits

                        afibFound = False #used to make sure patient has initial disease
                        compFound = False #used to check if the complication was found and discard this patient

                        for visit in visits:
                            for code in visit.split(',')[2:-1]:
                                if afib in code: #primary diag code
                                    afibFound = True
                                elif comp in code:
                                    compFound = True
                                    break

                            if compFound:
                                break

                        # keep only patients with AFib and no trace of the complication
                        if not afibFound or compFound:
                            continue

                        newCSV.write(line)
                        censoredNum -= 1
                        if censoredNum <= 0:
                            break

        print(comp, '-', visitsInBetween, ': ', count)

D_425 - 0 :  74582
D_425 - 2 :  32580
D_425 - 3 :  22570
D_425 - 4 :  15901
D_428 - 0 :  148242
D_428 - 2 :  43175
D_428 - 3 :  25429
D_428 - 4 :  15481
D_434 - 0 :  48886
D_434 - 2 :  20624
D_434 - 3 :  14067
D_434 - 4 :  9813


In [2]:
# Create Negative Sets
# -- negative patients are patients without afib, but who have been diagnosed with the complication
# -- no censoring here
#
# Relies on `afib` and `comps` defined in the positive-set cell; run that cell first.
# Writes one 'afib_<code>_Negative_X2.csv' file per complication and prints how many patients it holds.

for comp in comps:
    count = 0 # number of negative patients written for this complication

    # both files are context-managed so the output is closed even if a line fails to parse
    with open('afib_' + comp[2:] + '_Negative_X2.csv', 'w') as newCSV, \
         open('../datasets/FullHCUPDataset.csv') as f:
        for line in f: #for each of our patients
            visits = line.replace('\n','').split('|')[2].split('#') #list of all of this patient's visits

            # every code of every visit; the first two fields and the last field of a visit are dropped
            allCodes = [code for visit in visits for code in visit.split(',')[2:-1]]

            # any AFib diagnosis, in any visit, disqualifies the patient
            if any(afib in code for code in allCodes):
                continue

            # keep the patient only if the complication was diagnosed at some point
            if any(comp in code for code in allCodes):
                newCSV.write(line)
                count += 1

    print(comp, ': ', count)

D_425 :  207675
D_428 :  697029
D_434 :  225766


In [3]:
# format patient subset data for survival analysis
# positive patients
# -- one output row per patient: duration, observed, classTarget, race, sex, age
#      duration    : first field of the end visit minus first field of the patient's first visit
#                    (treated as a day count -- TODO confirm the unit against the dataset layout)
#      observed    : '1' if the complication was diagnosed after AFib (uncensored), '0' otherwise (censored)
#      classTarget : '1' when an AFib diagnosis was found, '-9' as a "should not happen" sentinel
#      age         : second field of the end visit
#    the end visit is the complication visit for uncensored patients and the last visit for censored ones
# -- relies on `afib` defined in the positive-set cell; run that cell first

files = ['afib_425_0_X2.csv','afib_425_2_X2.csv','afib_425_3_X2.csv','afib_425_4_X2.csv',\
         'afib_428_0_X2.csv','afib_428_2_X2.csv','afib_428_3_X2.csv','afib_428_4_X2.csv',\
         'afib_434_0_X2.csv','afib_434_2_X2.csv','afib_434_3_X2.csv','afib_434_4_X2.csv']

for file in files:
    comp = 'D_' + file[5:8] # get disease code from filename

    # both files are context-managed so the output is closed even if a row fails to parse
    with open(file[:-4] + '_Survival.csv', 'w') as newCSV, open(file) as f:
        for line in f:
            patientInfo = line.replace('\n','').split('|')

            demos = patientInfo[1].split(',')
            race = demos[0]
            sex = demos[1]

            visits = patientInfo[2].split('#')

            afibDiag, compDiag = [], [] # fields of the first AFib visit / first later complication visit

            startTime = int(visits[0].split(',')[0]) # timestamp of first visit

            for visit in visits:
                fields = visit.split(',')

                for code in fields:
                    if not afibDiag and afib in code: # first time we see afib diagnosed
                        afibDiag = fields

                    if afibDiag and comp in code: # first time comp is diagnosed after afib
                        compDiag = fields
                        break

                if compDiag: # split() never returns an empty list, so this is set iff comp was found
                    break

            # this is part of positive set -- should all result to 1, but just in case...
            classTarget = '1' if afibDiag else '-9'

            if compDiag: # uncensored: measure up to the complication diagnosis
                endVisit = compDiag
                observed = '1'
            else: # censored: measure up to the last recorded visit
                endVisit = visits[-1].split(',')
                observed = '0'

            duration = int(endVisit[0]) - startTime # elapsed time since the first visit
            age = endVisit[1] # age at the end visit

            newCSV.write(str(duration) + ',' + observed + ',' + classTarget + ',' + race + ',' + sex + \
                         ',' + age + '\n')

In [4]:
# format patient subset data for survival analysis
# negative patients
# -- one output row per patient: duration, observed, classTarget, race, sex, age
#      duration    : first field of the end visit minus first field of the patient's first visit
#                    (treated as a day count -- TODO confirm the unit against the dataset layout)
#      observed    : '1' when the complication visit was found (expected for every control patient)
#      classTarget : '0' for the negative set, '-9' as a "should not happen" sentinel
#      age         : second field of the end visit
# -- if the complication is unexpectedly missing, the row is written as censored (observed '0',
#    measured to the last visit, classTarget '-9') instead of failing with an IndexError

files = ['afib_425_Negative_X2.csv',\
         'afib_428_Negative_X2.csv',\
         'afib_434_Negative_X2.csv']

for file in files:
    comp = 'D_' + file[5:8] # get disease code from filename

    # both files are context-managed so the output is closed even if a row fails to parse
    with open(file[:-4] + '_Survival.csv', 'w') as newCSV, open(file) as f:
        for line in f:
            patientInfo = line.replace('\n','').split('|')

            demos = patientInfo[1].split(',')
            race = demos[0]
            sex = demos[1]

            visits = patientInfo[2].split('#')
            firstVisit = visits[0].split(',')

            compDiag = [] # fields of the first visit that mentions the complication
            for visit in visits:
                fields = visit.split(',')
                if any(comp in code for code in fields): # first time we see comp diagnosed
                    compDiag = fields
                    break

            if compDiag:
                classTarget = '0' # this is part of negative set -- should all result to 0
                observed = '1' # control group is uncensored
                endVisit = compDiag
            else:
                classTarget = '-9' # sentinel: this patient should not be in the negative set
                observed = '0' # no event recorded, so treat as censored at the last visit
                endVisit = visits[-1].split(',')

            duration = int(endVisit[0]) - int(firstVisit[0]) # elapsed time since the first visit
            age = endVisit[1] # age at the end visit

            newCSV.write(str(duration) + ',' + observed + ',' + classTarget + ',' + race + ',' + sex + \
                         ',' + age + '\n')