In [1]:
import math
import os

import lifelines as lf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# enable_iterative_imputer must be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

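# Cleaning steps performed by the next cell:
# - drop patients whose duration is not positive
# - recode missing race values (-8, -9) as 9
# - impute missing sex (-9) with IterativeImputer
# - get dummy values for categorical data (REMOVED)
# - impute ages (-99) where they cannot be calculated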

In [2]:
# Cleans every afib_<comp>_<visit>_Survival.csv file: merges it with the
# matching negative set, resolves sentinel-coded missing values, imputes
# missing sex and writes the result to clean/afib_<comp>_<visit>_clean.csv.

comps = ['425', '428', '434']
numVisits = ['0', '2', '3', '4']
columns = ['duration', 'event', 'race', 'sex', 'age1', 'age2']

# Positions of the fields inside each row list (must stay in sync with `columns`).
DURATION, RACE, SEX, AGE1, AGE2 = 0, 2, 3, 4, 5

MISSING_AGE = -99         # sentinel: age could not be calculated
MISSING_RACE = (-8, -9)   # sentinels recoded to RACE_UNKNOWN
RACE_UNKNOWN = 9
MISSING_SEX = -9          # sentinel replaced by NaN so the imputer fills it
DAYS_PER_YEAR = 365       # presumably duration is in days -- TODO confirm
IMPUTER_SEED = 0          # fixed seed so posterior sampling is reproducible
OUT_DIR = 'clean'


def resolve_sentinels(rows):
    """Resolve sentinel-coded values in place and return the average age1.

    For every row (a list ordered like ``columns``):
    * if exactly one of age1/age2 is missing it is derived from the other
      using ``duration // DAYS_PER_YEAR``;
    * race values in ``MISSING_RACE`` become ``RACE_UNKNOWN``;
    * a sex value of ``MISSING_SEX`` becomes ``np.nan``.

    Returns:
        The floor-divided mean of all age1 values that are known after the
        single-missing resolution above.

    Raises:
        ValueError: if no row has a known age1, so no average can be formed.
    """
    age_total = 0
    age_count = 0

    for row in rows:
        elapsed_years = row[DURATION] // DAYS_PER_YEAR

        if row[AGE1] == MISSING_AGE and row[AGE2] != MISSING_AGE:
            row[AGE1] = row[AGE2] - elapsed_years
        elif row[AGE2] == MISSING_AGE and row[AGE1] != MISSING_AGE:
            row[AGE2] = row[AGE1] + elapsed_years

        if row[RACE] in MISSING_RACE:
            row[RACE] = RACE_UNKNOWN

        if row[SEX] == MISSING_SEX:
            row[SEX] = np.nan

        # Only ages that are known (given or just derived) feed the average.
        if row[AGE1] != MISSING_AGE:
            age_total += row[AGE1]
            age_count += 1

    if age_count == 0:
        raise ValueError('cannot impute ages: no row has a known age1')

    return age_total // age_count


def binarize_sex(value):
    """Map a (possibly imputed) sex value onto 0 or 1.

    Values already inside [0, 1] are rounded; anything else is first folded
    into [0, 1] with ``abs(sin(value))`` and then rounded.
    """
    if 0 <= value <= 1:
        return round(value)
    return round(abs(math.sin(value)))


# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(OUT_DIR, exist_ok=True)

for comp in comps:
    # The negative set depends only on `comp`; read it once per comp.
    negative = pd.read_csv('afib_' + comp + '_Negative_Survival.csv')
    negative.columns = columns

    for visitnum in numVisits:
        # load positive set
        positive = pd.read_csv('afib_' + comp + '_' + visitnum + '_Survival.csv')
        positive.columns = columns

        # merge positive and negative (DataFrame.append was removed in pandas 2.0)
        data = pd.concat([positive, negative], ignore_index=True)

        # keep only patients with a positive duration
        data = data[data['duration'] > 0]

        # work on plain row lists so single cells can be rewritten in place
        rows = data.values.tolist()

        # resolve ages for patients missing only one; recode race/sex sentinels
        avgAge = resolve_sentinels(rows)

        # impute ages for patients missing both (age1 is still the sentinel)
        for row in rows:
            if row[AGE1] == MISSING_AGE:
                row[AGE1] = avgAge
                row[AGE2] = row[AGE1] + (row[DURATION] // DAYS_PER_YEAR)

        # impute missing sex; seeded so repeated runs give identical files
        imp = IterativeImputer(sample_posterior=True, random_state=IMPUTER_SEED)
        imputed = imp.fit_transform(rows)

        # binarize imputations
        # NOTE(review): this is applied to every row, not only the imputed
        # ones, so observed sex codes outside [0, 1] would be remapped too --
        # confirm the source coding is 0/1.
        for row in imputed:
            row[SEX] = binarize_sex(row[SEX])

        # transform to df and add columns
        data = pd.DataFrame(data=imputed)
        data.columns = columns

        # save to file
        outpath = OUT_DIR + '/afib_' + comp + '_' + visitnum + '_clean.csv'
        data.to_csv(outpath, index=False)
        print(comp, ':', visitnum)

425 : 0
425 : 2
425 : 3
425 : 4
428 : 0
428 : 2
428 : 3
428 : 4
434 : 0
434 : 2
434 : 3
434 : 4
