In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lifelines as lf

import math
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

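# Cleaning steps applied to each cohort file:
#   1. drop patients whose duration is missing or not positive
#   2. recode missing race values to a dedicated "unknown" category
#   3. impute missing sex
#   4. get dummy values for categorical data
#      (originally marked REMOVED, but the cleaning cell still calls pd.get_dummies - TODO confirm)
#   5. impute ages where they cannot be calculated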

In [2]:
# Build one cleaned, class-balanced survival table per (complication, visit-count) pair.
# For each pair: load the positive cohort, sample an equal number of negatives, recode
# the missing-value sentinels, impute age and sex, one-hot encode the categorical
# columns and write the result to clean2class/.
comps = ['425', '428', '434']
numVisits = ['0', '2', '3', '4']
columns = ['duration', 'event', 'afib', 'race', 'sex', 'age']


def _load_cohort(path):
    """Read one survival CSV, apply the shared column names and keep rows with duration > 0."""
    cohort = pd.read_csv(path)
    cohort.columns = columns
    return cohort[cohort['duration'] > 0]


for comp in comps:
    # The negative cohort depends only on `comp`, so it is read once per complication
    # rather than once per visit count.
    negatives = _load_cohort('afib_' + comp + '_Negative_X2_Survival.csv')

    for visitnum in numVisits:
        # load positive set (patients with non-positive duration already removed)
        positives = _load_cohort('afib_' + comp + '_' + visitnum + '_X2_Survival.csv')

        # sample as many patients from the negative set as there are positives (50/50 split);
        # DataFrame.sample raises ValueError if the negative set is smaller than that
        numPos = positives.shape[0]
        sampled_negatives = negatives.sample(n=numPos, random_state=0)

        # merge positive and negative; pd.concat replaces DataFrame.append,
        # which was removed in pandas 2.0
        data = pd.concat([positives, sampled_negatives], ignore_index=True)

        # recode missing-value sentinels:
        #   age  -99     -> NaN (filled with the column mean below)
        #   race -8 / -9 -> 9   (explicit "missing" category)
        #   sex  -9      -> NaN (filled by the iterative imputer below)
        data['age'] = data['age'].replace(-99, np.nan)
        data['race'] = data['race'].replace([-8, -9], 9)
        data['sex'] = data['sex'].replace(-9, np.nan)

        # impute missing age with the mean of the observed ages
        data['age'] = data['age'].fillna(data['age'].mean())

        # impute missing sex from the remaining columns; the seed makes the
        # posterior sampling reproducible from run to run
        imp = IterativeImputer(sample_posterior=True, random_state=0)
        imputed = imp.fit_transform(data.to_numpy(dtype=float))

        # binarize imputations: clamp to [0, 1] and round, so an out-of-range draw maps
        # to the nearest class (assumes sex is coded 0/1 - TODO confirm against the data)
        sex_idx = columns.index('sex')
        imputed[:, sex_idx] = np.clip(imputed[:, sex_idx], 0, 1).round()

        # transform to df and add columns
        data = pd.DataFrame(data=imputed, columns=columns)

        # get dummy values for categorical features; uint8 keeps the CSV as 0/1
        # (newer pandas would otherwise emit True/False)
        finalData = pd.get_dummies(data, columns=['afib', 'sex', 'race'], dtype=np.uint8)

        # save to file
        outpath = 'clean2class/afib_' + comp + '_' + visitnum + '_clean_X2.csv'
        finalData.to_csv(outpath, index=False)
        print(comp, ':', visitnum)

425 : 0
425 : 2
425 : 3
425 : 4
428 : 0
428 : 2
428 : 3
428 : 4
434 : 0
434 : 2
434 : 3
434 : 4
