In [56]:
import numpy as np
import tensorflow as tf
import random
import pandas as pd

from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_censored, concordance_index_ipcw
from sksurv.metrics import integrated_brier_score
from utility.survival import convert_to_structured
from utility.training import get_data_loader, scale_data, make_time_event_split
from tools.model_builder import make_cox_model, make_coxnet_model, make_rsf_model
from utility.risk import _make_riskset
from utility.loss import CoxPHLoss
from pathlib import Path
import paths as pt
import joblib
import os
from time import time
from utility.config import load_config
from tools.preprocessor import Preprocessor

# Dataset names (presumably the values accepted by get_data_loader — confirm):
# "SUPPORT", "SEER", "GBSG2", "WHAS500", "FLCHAIN", "METABRIC"

In [57]:
# Obtain the METABRIC loader, load its data, then unpack the feature frame,
# the survival labels, and the numerical / categorical feature groups that
# the loader reports.
loader = get_data_loader("METABRIC")
dl = loader.load_data()
X, y = dl.get_data()
num_features, cat_features = dl.get_features()

In [58]:
# Inspect the survival labels: the display below shows a structured array
# with a boolean 'Event' field and a float 'Time' field.
y

array([(False,  99.33333588), ( True,  95.73332977),
       (False, 140.2333374 ), ..., ( True,  37.86666489),
       (False, 198.43333435), (False, 140.7666626 )],
      dtype=[('Event', '?'), ('Time', '<f8')])

In [59]:
# Inspect the feature frame (the notebook renders a truncated view showing
# the first and last rows).
X

Unnamed: 0,x0,x1,x2,x3,x8,x4,x5,x6,x7
0,5.603834,7.811392,10.797988,5.967607,56.840000,1.0,1.0,0.0,1.0
1,5.284882,9.581043,10.204620,5.664970,85.940002,1.0,0.0,0.0,1.0
2,5.920251,6.776564,12.431715,5.873857,48.439999,0.0,1.0,0.0,1.0
3,6.654017,5.341846,8.646379,5.655888,66.910004,0.0,0.0,0.0,0.0
4,5.456747,5.339741,10.555724,6.008429,67.849998,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
1899,5.946987,5.370492,12.345780,5.741395,76.839996,1.0,1.0,0.0,1.0
1900,5.339228,5.408853,12.176101,5.693043,63.090000,1.0,1.0,0.0,1.0
1901,5.901610,5.272237,14.200950,6.139390,57.770000,0.0,0.0,0.0,1.0
1902,6.818109,5.372744,11.652624,6.077852,58.889999,1.0,0.0,0.0,1.0


In [60]:
# Check the size of the feature frame as (rows, columns).
X.shape

(1904, 9)

In [61]:
# List the feature column names in their stored order.
# NOTE(review): x8 appears between x3 and x4 — presumably numerical columns
# are placed before categorical ones; confirm against dl.get_features().
X.columns

Index(['x0', 'x1', 'x2', 'x3', 'x8', 'x4', 'x5', 'x6', 'x7'], dtype='object')

In [62]:
# Summarise censoring: a record counts as censored when its 'Event' flag is
# False, so the censored count is the total minus the sum of the flags.
obs_arr = np.array(y['Event'])
n_records = obs_arr.shape[0]
n_censored = n_records - obs_arr.sum()
n_observed = n_records - n_censored
censored_pct = round(n_censored / n_records * 100, 2)

# Emit the three summary lines in a single write; the trailing "\n" on the
# last line keeps the blank separator line after the report.
summary_lines = [
    f"Number of samples: {len(y)}",
    f"Number of uncensored/censored: {n_observed}/{n_censored}",
    f"{censored_pct}% of records are censored\n",
]
print("\n".join(summary_lines))

Number of samples: 1904
Number of uncensored/censored: 1103/801
42.07% of records are censored



In [63]:
# Mean of the 'Time' field over the records whose 'Event' flag is True
# (i.e. the uncensored records only).
uncensored = y[y['Event'] == True]
uncensored['Time'].mean()

99.95222141031692

In [64]:
# Configure the project Preprocessor ('mode' for categorical and 'mean' for
# numerical features — presumably imputation strategies; confirm in
# tools.preprocessor), fit it on the full feature frame, and check the shape
# of the transformed data once cast to float32.
preprocessor = Preprocessor(cat_feat_strat='mode', num_feat_strat='mean')
transformer = preprocessor.fit(
    X,
    cat_feats=cat_features,
    num_feats=num_features,
    one_hot=True,
    fill_value=-1,
)
X_transformed = transformer.transform(X)
np.array(X_transformed, dtype=np.float32).shape

(1904, 9)