In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from synthcity.utils.serialization import save_to_file, load_from_file
import tabulate

In [2]:
base_path = Path("/code/datasets/cf")

train_static = pd.read_csv(base_path / "cf_static_train_data.csv.gz")
train_temporal_df = load_from_file(base_path / "temporal_train_data.pkl")
train_horizons = []
for tmp in train_temporal_df:
    train_horizons.append(tmp.index.values.tolist())

test_static = pd.read_csv(base_path / "cf_static_test_data.csv.gz")
test_temporal_df = load_from_file(base_path / "temporal_test_data.pkl")
test_horizons = []
for tmp in test_temporal_df:
    test_horizons.append(tmp.index.values.tolist())

drop_cols = ["death", "time_to_event", "id"]

train_T = train_static["time_to_event"].values
train_E = train_static["death"].values
train_static = train_static.drop(columns = drop_cols).fillna(-1)


test_T = test_static["time_to_event"].values
test_E = test_static["death"].values
test_static = test_static.drop(columns = drop_cols).fillna(-1)


train_static

Unnamed: 0,Gender,Smoking Status,Class I Mutation,Class II Mutation,Class III Mutation,Class IV Mutation,Class V Mutation,Class VI Mutation,DF508 Mutation,G551D Mutation
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
4632,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
4633,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
4634,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
4635,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0


In [3]:
from sklearn.preprocessing import MinMaxScaler

features = []
for item in train_temporal_df:
    features.extend(item.columns)
features = np.unique(features)

def prep_temporal(df):
    temporal = []
    scale_cols = {
     "time" : [], 
     "BMI": [], 
     "Weight": [],
     "Best FEV1": [],
    }
    for item in df:
        for col in scale_cols:
            if col in item.columns:
                scale_cols[col].extend(item[col])
        scale_cols["time"].extend(item.index)
        
    scalers = {}
    for col in scale_cols:
        scalers[col] = MinMaxScaler().fit(np.asarray(scale_cols[col]).reshape(-1, 1))
    
    for item in df:
        for col in features:
            if col not in item:
                item[col] = np.nan
        item["time"] = item.index
        item = item.fillna(0)
        
        for col in scalers:
            item[col] = scalers[col].transform(item[col].values.reshape(-1, 1))
        
            
        temporal.append(item[features].astype(float))
    
    return temporal

    
train_temporal = prep_temporal(train_temporal_df)
test_temporal = prep_temporal(test_temporal_df)

In [4]:
import numpy as np
import pandas as pd

def process_temporal(in_static_data, in_temporal_data):
    all_temporal_data = []
    horizons = []
    
    for uid in in_static_data["id"].unique():
        local_temporal_idx = in_temporal_data["id"] == uid
        local_temporal = in_temporal_data[local_temporal_idx]
        columns = sorted(local_temporal["variable"].unique())
        times = sorted(local_temporal["time"].unique())

        temporal_data = pd.DataFrame([], columns = columns)


        for horizon in times:
            local_temporal_horizon_idx = local_temporal["time"] == horizon
            local_temporal_horizon = local_temporal[local_temporal_horizon_idx]

            horizon_data = pd.DataFrame(-1 * np.ones((1, len(columns))), columns = columns)

            proc = local_temporal_horizon[["variable", "value"]]
            proc.index = local_temporal_horizon["variable"]
            proc = proc.drop(columns = ["variable"])
            proc = proc.T.reset_index(drop = True)
            horizon_data[proc.columns] = proc

            temporal_data = pd.concat([temporal_data, horizon_data], ignore_index = True)
        temporal_data.index = times

        for col in columns:
            if col not in temporal_data:
                temporal_data[col] = -1
        horizons.append(temporal_data["time"])
        all_temporal_data.append(temporal_data[columns])
        
    assert len(all_temporal_data) == len(in_static_data)

    return all_temporal_data, horizons

#proc_test_temporal, proc_test_horizons = process_temporal(test_static, test_temporal)
#proc_train_temporal, proc_train_horizons = process_temporal(train_static, train_temporal)

In [5]:
from synthcity.plugins.core.dataloader import (
     TimeSeriesSurvivalDataLoader,
)
horizons = [0.25, 0.5, 0.75]
time_horizons = np.quantile(
             [t_ for t_, e_ in zip(train_T, train_E) if e_ == 1], horizons
).tolist()
    
survival_data_train = TimeSeriesSurvivalDataLoader(
    temporal_data=train_temporal,
    temporal_horizons=train_horizons,
    static_data=train_static,
    T=train_T,
    E=train_E,
    time_horizons=time_horizons,
)


survival_data_test = TimeSeriesSurvivalDataLoader(
    temporal_data=test_temporal,
    temporal_horizons=test_horizons,
    static_data=test_static,
    T=test_T,
    E=test_E,
    time_horizons=time_horizons,
)

<stdin>:1:10: fatal error: cuda.h: No such file or directory
compilation terminated.

<stdin>:1:10: fatal error: cuda.h: No such file or directory
compilation terminated.

<stdin>:1:10: fatal error: cuda.h: No such file or directory
compilation terminated.



In [6]:
from synthcity.plugins.core.models.time_series_survival.benchmarks import (
     evaluate_ts_survival_model,
)


def eval_model(mod, **kwargs):
    n_folds = 3
    model = mod(**kwargs)
    
    train_static, train_temporal, train_horizons, train_T, train_E = survival_data_train.unpack(as_numpy = True)
    model.fit(train_static, train_temporal, train_horizons, train_T, train_E)

    
    test_static, test_temporal, test_horizons, test_T, test_E = survival_data_test.unpack(as_numpy = True)
    
    score = evaluate_ts_survival_model(
        [model] * n_folds, 
        test_static, test_temporal, test_horizons, test_T, test_E, 
        time_horizons, pretrained = True,
        n_folds = n_folds
    )
    return score


In [None]:
from synthcity.plugins.core.models.time_series_survival import (
     DynamicDeephitTimeSeriesSurvival, rnn_modes, output_modes
 )

headers = ["Model", "C-Index", "Brier score"]
results = pd.DataFrame([], columns = headers)

for output_mode in output_modes:
    for base in rnn_modes:
        try:
            score = eval_model(DynamicDeephitTimeSeriesSurvival, rnn_type = base, output_type = output_mode)["str"]
        except BaseException as e:
            print("failed", base, output_mode)
            continue
        local_results = pd.DataFrame([[f"DynDeephit[{base} -> {output_mode}]", score["c_index"], score["brier_score"]]], columns = headers)
        results = pd.concat([results, local_results], ignore_index = True)
    
tabulate.tabulate(results, tablefmt='html')

In [9]:
import tabulate
tabulate.tabulate(results, tablefmt='html')


0,1,2,3
0,DynDeephit[GRU -> MLP],0.9169 +/- 0.0216,0.0612 +/- 0.0063
1,DynDeephit[LSTM -> MLP],0.9169 +/- 0.0133,0.0617 +/- 0.007
2,DynDeephit[RNN -> MLP],0.923 +/- 0.009,0.0618 +/- 0.0057
3,DynDeephit[Transformer -> MLP],0.9175 +/- 0.017,0.0605 +/- 0.005
4,DynDeephit[Wavelet -> MLP],0.9266 +/- 0.0072,0.0596 +/- 0.0047
5,DynDeephit[GRU -> Transformer],0.9204 +/- 0.0086,0.061 +/- 0.0056
6,DynDeephit[LSTM -> Transformer],0.923 +/- 0.0129,0.0608 +/- 0.0053
7,DynDeephit[RNN -> Transformer],0.9339 +/- 0.0095,0.0612 +/- 0.0047
8,DynDeephit[Transformer -> Transformer],0.9166 +/- 0.009,0.0624 +/- 0.005
9,DynDeephit[Wavelet -> Transformer],0.9251 +/- 0.0094,0.0598 +/- 0.0038


In [10]:
from synthcity.plugins.core.models.time_series_survival import (
     CoxTimeSeriesSurvival,
 )

headers = ["Model", "C-Index", "Brier score"]
results = pd.DataFrame([], columns = headers)

for output_mode in output_modes:
    for base in rnn_modes:
        try:
            score = eval_model(CoxTimeSeriesSurvival, emb_rnn_type = base, emb_output_type = output_mode)["str"]
        except BaseException:
            print("failed", base, output_mode)
            continue
        local_results = pd.DataFrame([[f"CoxPH[{base} -> {output_mode}]", score["c_index"], score["brier_score"]]], columns = headers)
        results = pd.concat([results, local_results], ignore_index = True)

tabulate.tabulate(results, tablefmt='html')

failed GRU MiniRocket
failed LSTM MiniRocket
failed RNN MiniRocket
failed Transformer MiniRocket
failed Wavelet MiniRocket
failed GRU mWDNPlus
failed LSTM mWDNPlus
failed RNN mWDNPlus
failed Transformer mWDNPlus
failed Wavelet mWDNPlus
failed GRU XCM
failed LSTM XCM
failed RNN XCM
failed Transformer XCM
failed Wavelet XCM


0,1,2,3
0,CoxPH[GRU -> MLP],0.9437 +/- 0.0035,0.0544 +/- 0.0062
1,CoxPH[LSTM -> MLP],0.9428 +/- 0.003,0.0552 +/- 0.0066
2,CoxPH[RNN -> MLP],0.9448 +/- 0.0051,0.0552 +/- 0.0059
3,CoxPH[Transformer -> MLP],0.9427 +/- 0.0033,0.0564 +/- 0.0063
4,CoxPH[Wavelet -> MLP],0.9321 +/- 0.011,0.0558 +/- 0.0065
5,CoxPH[GRU -> Transformer],0.9461 +/- 0.0034,0.0556 +/- 0.0067
6,CoxPH[LSTM -> Transformer],0.945 +/- 0.0044,0.0561 +/- 0.0072
7,CoxPH[RNN -> Transformer],0.9481 +/- 0.0027,0.0546 +/- 0.0046
8,CoxPH[Transformer -> Transformer],0.9415 +/- 0.0045,0.0562 +/- 0.0059
9,CoxPH[Wavelet -> Transformer],0.9402 +/- 0.0037,0.0556 +/- 0.0041


In [11]:
from synthcity.plugins.core.models.time_series_survival import (
     XGBTimeSeriesSurvival,
 )

headers = ["Model", "C-Index", "Brier score"]
results = pd.DataFrame([], columns = headers)

for output_mode in output_modes:
    for base in rnn_modes:
        try:
            score = eval_model(XGBTimeSeriesSurvival, emb_rnn_type = base, emb_output_type = output_mode)["str"]
        except BaseException:
            print("failed", base, output_mode)
            continue
        local_results = pd.DataFrame([[f"XGB[{base} -> {output_mode}]", score["c_index"], score["brier_score"]]], columns = headers)
        results = pd.concat([results, local_results], ignore_index = True)

tabulate.tabulate(results, tablefmt='html')

failed GRU MiniRocket
failed LSTM MiniRocket
failed RNN MiniRocket
failed Transformer MiniRocket
failed Wavelet MiniRocket
failed GRU mWDNPlus
failed LSTM mWDNPlus
failed RNN mWDNPlus
failed Transformer mWDNPlus
failed Wavelet mWDNPlus
failed GRU XCM
failed LSTM XCM
failed RNN XCM
failed Transformer XCM
failed Wavelet XCM


0,1,2,3
0,XGB[GRU -> MLP],0.9 +/- 0.0101,0.0526 +/- 0.0042
1,XGB[LSTM -> MLP],0.9096 +/- 0.01,0.053 +/- 0.0042
2,XGB[RNN -> MLP],0.9138 +/- 0.0292,0.0536 +/- 0.0058
3,XGB[Transformer -> MLP],0.9185 +/- 0.0257,0.0517 +/- 0.0008
4,XGB[Wavelet -> MLP],0.8731 +/- 0.0092,0.0565 +/- 0.0036
5,XGB[GRU -> Transformer],0.9 +/- 0.041,0.0522 +/- 0.0045
6,XGB[LSTM -> Transformer],0.8935 +/- 0.0293,0.0524 +/- 0.0035
7,XGB[RNN -> Transformer],0.8999 +/- 0.0089,0.051 +/- 0.002
8,XGB[Transformer -> Transformer],0.9077 +/- 0.0272,0.0516 +/- 0.004
9,XGB[Wavelet -> Transformer],0.8797 +/- 0.0127,0.0545 +/- 0.0033
