In [47]:
import numpy as np
import tensorflow as tf
import random
import pandas as pd

from sklearn.model_selection import train_test_split
from sksurv.metrics import concordance_index_censored, concordance_index_ipcw
from sksurv.metrics import integrated_brier_score
from utility.survival import convert_to_structured
from utility.training import get_data_loader, scale_data, make_time_event_split
from tools.model_builder import make_cox_model, make_coxnet_model, make_rsf_model
from utility.risk import _make_riskset
from utility.loss import CoxPHLoss
from pathlib import Path
import paths as pt
import joblib
import os
from time import time
from utility.config import load_config
from tools.preprocessor import Preprocessor

# Candidate dataset names (presumably the values accepted by get_data_loader):
# "SUPPORT", "SEER", "GBSG2", "WHAS", "FLCHAIN", "METABRIC"

In [48]:
# Build the loader for the chosen dataset, load it, then pull out the
# feature matrix, the target and the numerical/categorical column lists.
# NOTE(review): the saved outputs further down (8873 samples, 14 transformed
# columns) look like they came from a run with a different dataset -- the
# public GBSG2 data has only a few hundred records (686 in scikit-survival).
# Re-run the notebook on a fresh kernel to confirm.
loader = get_data_loader("GBSG2")
dl = loader.load_data()
X, y = dl.get_data()
num_features, cat_features = dl.get_features()

In [49]:
# Censoring summary: every record whose 'Event' flag is not set counts as
# censored; report the absolute counts and the censored share in percent.
obs_arr = np.array(y['Event'])
n_total = obs_arr.shape[0]
n_censored = n_total - obs_arr.sum()
n_uncensored = n_total - n_censored
pct_censored = round(n_censored / n_total * 100, 2)

print(f"Number of samples: {len(y)}")
print(f"Number of censored/uncensored: {n_censored}/{n_uncensored}")
print(f"{pct_censored}% of records are censored\n")

Number of samples: 8873
Number of censored/uncensored: 2837/6036
31.97% of records are censored



In [50]:
# Mean of 'Time' over the records whose 'Event' flag is set.
# The explicit comparison yields a boolean mask whether the flag is stored
# as bool or as 0/1, so the row selection works either way.
event_mask = y['Event'] == True
y[event_mask]['Time'].mean()

205.4536116633532

In [51]:

# Fit the project Preprocessor on the full feature matrix and check the shape
# of the transformed data. 'mode'/'mean' are presumably the imputation
# strategies for categorical/numerical columns -- confirm in
# tools.preprocessor. The transformed array is only inspected here.
preprocessor = Preprocessor(cat_feat_strat='mode', num_feat_strat='mean')
transformer = preprocessor.fit(
    X,
    cat_feats=cat_features,
    num_feats=num_features,
    one_hot=True,
    fill_value=-1,
)
X_transformed_preview = np.array(transformer.transform(X), dtype=np.float32)
X_transformed_preview.shape

(8873, 14)