In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lifelines as lf

In [71]:
# let's get our data
data = pd.read_csv('afib_434_4_Survival.csv')
columns = ['duration', 'event', 'race', 'sex', 'age1', 'age2']
data.columns = columns

# grab negative set and append
temp = pd.read_csv('afib_434_Negative_Survival.csv')
temp.columns = columns
data = data.append(temp)

data.head()

Unnamed: 0,duration,event,race,sex,age1,age2
0,1807,1,1,1,62,67
1,1071,1,4,1,79,82
2,1686,1,1,1,88,93
3,1814,1,1,1,81,86
4,307,1,3,0,67,67


In [72]:
# remove patients that have 0 duration
data = data[data['duration'] >0]
data = data[data['age1']!=0]
data = data[data['age2']!=0]
data

Unnamed: 0,duration,event,race,sex,age1,age2
0,1807,1,1,1,62,67
1,1071,1,4,1,79,82
2,1686,1,1,1,88,93
3,1814,1,1,1,81,86
4,307,1,3,0,67,67
...,...,...,...,...,...,...
945545,9,1,-9,-9,77,78
945549,64,1,-9,-9,88,88
945557,29,1,-9,-9,67,67
945559,5,1,-9,-9,76,76


In [73]:
temp = data.values.tolist()
temp

[[1807, 1, 1, 1, 62, 67],
 [1071, 1, 4, 1, 79, 82],
 [1686, 1, 1, 1, 88, 93],
 [1814, 1, 1, 1, 81, 86],
 [307, 1, 3, 0, 67, 67],
 [1122, 1, 1, 1, 81, 84],
 [2518, 1, -9, -9, 67, 77],
 [1565, 1, 1, 0, 79, 84],
 [1924, 1, 1, 1, 72, 78],
 [2086, 1, 1, 0, 81, 87],
 [530, 1, 1, 0, 71, 73],
 [1859, 1, 1, 1, 84, 89],
 [2453, 1, 1, 0, 76, 83],
 [1325, 1, 2, 1, 78, 82],
 [916, 1, 6, 1, 81, 82],
 [156, 1, 1, 1, 93, 87],
 [1326, 1, 1, 1, 80, 84],
 [1442, 1, -9, 0, 67, 72],
 [2164, 1, 1, 1, 68, 74],
 [625, 1, 2, 0, 84, 85],
 [2066, 1, 1, 1, 86, 92],
 [2359, 1, 1, 1, 77, 84],
 [1839, 1, 4, 1, 87, 87],
 [816, 1, 2, 1, 70, 72],
 [2115, 1, 1, 1, 82, 88],
 [1182, 1, 1, 0, 89, 92],
 [417, 1, 2, 1, 50, 52],
 [2918, 1, -9, -9, 69, 77],
 [2343, 1, 2, 1, 69, 76],
 [2623, 1, 2, 1, 80, 88],
 [1266, 1, 1, 0, 75, 78],
 [1267, 1, -9, -9, 77, 82],
 [1171, 1, 1, 1, 83, 86],
 [635, 1, 1, 1, 83, 85],
 [621, 1, 1, 0, 86, 88],
 [1574, 1, 1, 0, 81, 85],
 [1856, 1, 2, 0, 63, 67],
 [1029, 1, 1, 0, 63, 65],
 [632, 1, 1, 0

In [74]:
# resolve missing ages from data we have
# keep running avg to later impute values
# convert missing values to np.nan

sumAge = 0
numAges = 0

for row in temp:
    if row[4] == -99 and row[5] != -99:
        row[4] = row[5] - (row[0] // 365)
        
        
    elif row[5] == -99 and row[4] != -99:
        row[5] = row[4] + (row[0] // 365) 
        
    # missing race
    if row[2] in [-8, -9]:
        row[2] = np.nan
        
    # missing sex
    if row[3] == -9:
        row[3] = np.nan
        
    if row[4] != -99:
        sumAge += row[4]
        numAges += 1
    
avgAge = sumAge // numAges

for row in temp:
    if row[4] == -99:
        row[4] = avgAge
        row[5] = row[4] + (row[0] // 365)

data = pd.DataFrame(data=temp)
data = data[~np.isnan(data[3])] #drop missing sex
data

Unnamed: 0,0,1,2,3,4,5
0,1807,1,1.0,1.0,62,67
1,1071,1,4.0,1.0,79,82
2,1686,1,1.0,1.0,88,93
3,1814,1,1.0,1.0,81,86
4,307,1,3.0,0.0,67,67
...,...,...,...,...,...,...
328507,1592,1,1.0,0.0,79,84
328508,1627,1,1.0,0.0,73,78
328509,2218,1,1.0,1.0,36,42
328510,3150,1,1.0,0.0,40,49


In [78]:
data[data[2]==0].count()

0    0
1    0
2    0
3    0
4    0
5    0
dtype: int64

In [63]:
# impute missing sex and race
from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy='most_frequent')
data = pd.DataFrame(data=imp.fit_transform(data))
data

Unnamed: 0,0,1,2,3,4,5
0,1807.0,1.0,1.0,1.0,62.0,67.0
1,1071.0,1.0,4.0,1.0,79.0,82.0
2,1686.0,1.0,1.0,1.0,88.0,93.0
3,1814.0,1.0,1.0,1.0,81.0,86.0
4,307.0,1.0,3.0,0.0,67.0,67.0
...,...,...,...,...,...,...
306659,1592.0,1.0,1.0,0.0,79.0,84.0
306660,1627.0,1.0,1.0,0.0,73.0,78.0
306661,2218.0,1.0,1.0,1.0,36.0,42.0
306662,3150.0,1.0,1.0,0.0,40.0,49.0


1.0    518906
3.0     34362
4.0     21694
2.0     15255
6.0      3191
5.0       178
Name: 2, dtype: int64