# Cox Proportional Hazards Analysis using scikit-survival - WHAS500

In [1]:
# Core libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

# Survival analysis and datasets
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sksurv.datasets import load_whas500
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored
from sksurv.preprocessing import OneHotEncoder

# Other analysis
from tableone import TableOne

# Fixed seed, passed as `random_state` to train_test_split so the
# train/test partition is the same on every run.
SEED = 20221228

### Load in dataset 

In [41]:
# load_whas500 returns the feature frame and a structured outcome array with
# two fields: `fstat` (boolean event indicator) and `lenfol` (float follow-up
# time). The dtypes are printed to confirm which columns are categorical.
whas500_X, whas500_y = load_whas500()
print(whas500_X.dtypes)
print(whas500_y.dtype)

# Hold out 25% of the rows for testing. Stratifying on `fstat` keeps the
# share of observed events similar in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    whas500_X,
    whas500_y,
    test_size=0.25,
    random_state=SEED,
    stratify=whas500_y["fstat"],
)

afb       category
age        float64
av3       category
bmi        float64
chf       category
cvd       category
diasbp     float64
gender    category
hr         float64
los        float64
miord     category
mitype    category
sho       category
sysbp      float64
dtype: object
[('fstat', '?'), ('lenfol', '<f8')]


### Train model 

In [None]:
# The `category` columns have to be one-hot encoded before they reach the Cox
# model. Putting the encoder in a pipeline guarantees that the exact encoding
# learned on the training data is re-applied at prediction time.
# `make_pipeline` and `OneHotEncoder` (the scikit-survival encoder, which only
# expands `category` columns and leaves numeric ones untouched) are imported
# in the imports cell at the top of the notebook.
coxPH = make_pipeline(
    OneHotEncoder(),
    CoxPHSurvivalAnalysis(),
)
coxPH.fit(X_train, y_train)

# Predicted risk scores for the held-out patients (higher = higher risk)
predictions = coxPH.predict(X_test)

### Evaluate model

#### Discrimination 

In [None]:
# Harrell's concordance index on the held-out set.
# concordance_index_censored returns a tuple whose first element is the
# C-index itself; the remaining elements are pair counts that are not needed here.
cindex_result = concordance_index_censored(
    event_indicator=y_test["fstat"],
    event_time=y_test["lenfol"],
    estimate=predictions,
)
C_Index = cindex_result[0]
C_Index

#### Calibration