In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lifelines as lf
import sklearn as sk

In [2]:
# let's get our data
data = pd.read_csv('clean/afib_425_2_clean.csv')
data.describe()
T, E = data['duration'], data['event']

In [3]:
data.head()

Unnamed: 0,duration,event,age1,sex_0.0,sex_1.0,race_1.0,race_2.0,race_3.0,race_4.0,race_5.0,race_6.0,race_9.0
0,203.0,1.0,88.0,0,1,0,0,0,0,0,1,0
1,1076.0,1.0,67.0,1,0,1,0,0,0,0,0,0
2,1374.0,1.0,87.0,0,1,0,0,0,1,0,0,0
3,1074.0,1.0,87.0,1,0,1,0,0,0,0,0,0
4,1980.0,1.0,73.0,0,1,1,0,0,0,0,0,0


In [4]:
data['event'].value_counts()

0.0    526707
1.0     32541
Name: event, dtype: int64

In [5]:
from lifelines.utils.sklearn_adapter import sklearn_adapter
X = pd.DataFrame(data.drop('duration', axis=1))
Y = pd.DataFrame(data['duration'])

from sklearn import preprocessing
X = pd.DataFrame(preprocessing.scale(X))

In [6]:
from sklearn import model_selection
trainX, valX, trainY, valY = model_selection.train_test_split(X, Y)
trainX, valX, trainY, valY = pd.DataFrame(trainX), pd.DataFrame(valX), pd.DataFrame(trainY), pd.DataFrame(valY)
trainX.columns=['event','age','s1','s2','r1','r2','r3','r4','r5','r6','r9'] 

In [7]:
CoxRegression = sklearn_adapter(lf.CoxPHFitter, event_col='event')

sk_cph = CoxRegression(penalizer=.05)
sk_cph.fit(trainX, trainY)
sk_cph.lifelines_model.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'duration_col'
event col,'event'
penalizer,0.05
l1 ratio,0
baseline estimation,breslow
number of observations,419436
number of events observed,419436
partial log-likelihood,-4981571.87
time fit was run,2020-04-22 03:30:46 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age,0.11,1.12,0.0,0.11,0.12,1.12,1.12,69.08,<0.005,inf
s1,0.01,1.01,0.0,-0.0,0.02,1.0,1.02,1.86,0.06,3.98
s2,-0.01,0.99,0.0,-0.02,0.0,0.98,1.0,-1.86,0.06,3.98
r1,-0.17,0.85,0.0,-0.17,-0.16,0.84,0.85,-37.44,<0.005,1016.49
r2,-0.02,0.98,0.0,-0.03,-0.02,0.97,0.98,-11.51,<0.005,99.39
r3,-0.05,0.95,0.0,-0.06,-0.05,0.94,0.95,-21.73,<0.005,345.53
r4,-0.05,0.95,0.0,-0.05,-0.04,0.95,0.96,-21.41,<0.005,335.4
r5,-0.0,1.0,0.0,-0.01,-0.0,0.99,1.0,-2.68,0.01,7.1
r6,-0.02,0.98,0.0,-0.02,-0.02,0.98,0.98,-11.5,<0.005,99.29
r9,0.22,1.24,0.0,0.21,0.22,1.23,1.25,47.4,<0.005,inf

0,1
Concordance,0.61
Log-likelihood ratio test,58594.84 on 10 df
-log2(p) of ll-ratio test,inf


In [8]:
valX.columns=['event','age','s1','s2','r1','r2','r3','r4','r5','r6','r9'] 
sk_cph.score(valX, valY)

0.6104776720586487

In [9]:
from sklearn.linear_model import LinearRegression

lrg = LinearRegression()
lrg.fit(trainX, trainY)
print(lf.utils.concordance_index(valY, lrg.predict(valX), event_observed=valX['event']))

0.6138784749881531


In [10]:
WeibullAFT = sklearn_adapter(lf.WeibullAFTFitter, event_col='event')

sk_aft = WeibullAFT()
sk_aft.fit(trainX, trainY)
sk_aft.lifelines_model.print_summary()

'''
aft = lf.WeibullAFTFitter()
aft.fit(data, duration_col='duration', event_col='event')
aft.print_summary(3)
'''


It's advisable to not trust the variances reported, and to be suspicious of the fitted parameters too.



0,1
model,lifelines.WeibullAFTFitter
duration col,'duration_col'
event col,'event'
number of observations,419436
number of events observed,419436
log-likelihood,-3176231.37
time fit was run,2020-04-22 03:31:56 UTC

Unnamed: 0,Unnamed: 1,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
lambda_,age,-0.11,0.89,0.0,-0.12,-0.11,0.89,0.9,-57.97,<0.005,inf
lambda_,s1,-0.01,0.99,,,,,,,,
lambda_,s2,0.01,1.01,,,,,,,,
lambda_,r1,0.16,1.18,0.02,0.13,0.2,1.14,1.22,10.02,<0.005,76.1
lambda_,r2,0.02,1.02,0.01,0.01,0.03,1.01,1.03,4.09,<0.005,14.52
lambda_,r3,0.05,1.05,0.01,0.04,0.07,1.04,1.07,6.72,<0.005,35.64
lambda_,r4,0.05,1.05,0.01,0.03,0.06,1.03,1.06,6.79,<0.005,36.4
lambda_,r5,0.0,1.0,0.0,-0.0,0.01,1.0,1.01,1.68,0.09,3.43
lambda_,r6,0.02,1.02,0.0,0.01,0.02,1.01,1.02,5.97,<0.005,28.62
lambda_,r9,-0.21,0.81,0.02,-0.24,-0.18,0.78,0.84,-12.67,<0.005,119.82

0,1
Concordance,0.61
Log-likelihood ratio test,42717.83 on 10 df
-log2(p) of ll-ratio test,inf


"\naft = lf.WeibullAFTFitter()\naft.fit(data, duration_col='duration', event_col='event')\naft.print_summary(3)\n"

In [11]:
sk_aft.score(valX, valY)

0.6103734225733197

In [24]:
aff = lf.AalenAdditiveFitter(coef_penalizer=.05)
aff.fit(data, duration_col='duration', event_col='event')
aff.print_summary()

0,1
model,lifelines.AalenAdditiveFitter
duration col,'duration'
event col,'event'
coef penalizer,0.05
number of subjects,559248
number of events observed,32541
time fit was run,2020-04-22 04:21:00 UTC

Unnamed: 0,slope(coef),se(slope(coef))
age1,0.0,0.0
sex_0.0,0.0,0.0
sex_1.0,0.0,0.0
race_1.0,-0.0,0.0
race_2.0,0.0,0.0
race_3.0,0.0,0.0
race_4.0,0.0,0.0
race_5.0,0.0,0.0
race_6.0,-0.0,0.0
race_9.0,0.0,0.0

0,1
Concordance,0.62
