In [1]:
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import synthcity.logger as log
from sklearn.mixture import GaussianMixture as GMM

from datasets import get_dataset

warnings.filterwarnings("ignore", category=FutureWarning)

warnings.filterwarnings("ignore")
log.add(sink=sys.stderr, level="DEBUG")

In [2]:
from synthcity.plugins.core.models.time_to_event import \
    get_model_template as get_tte_model_template


class TabularGMM:
    def __init__(self, components: int = 100, random_state: int = 0):
        self.model = GMM(100, covariance_type="full", random_state=random_state)
        self.tte_regressor = get_tte_model_template("survival_function_regression")()

    def fit(self, X, T, E):
        self.model.fit(X)
        self.tte_regressor.fit(X, T, E)

        self.E = E
        self.count = len(X)
        self.columns = X.columns

    def generate(self, count: int = None):
        if count is None:
            count = self.count
        sampled, _ = self.model.sample(count)
        sampled = pd.DataFrame(sampled, columns=self.columns)

        E = self.E.reset_index(drop=True).head(count)

        T = pd.Series(
            self.tte_regressor.predict_any(
                sampled,
                E,
            )
        )
        return sampled, T, E

<stdin>:1:10: fatal error: cuda.h: No such file or directory
compilation terminated.

<stdin>:1:10: fatal error: cuda.h: No such file or directory
compilation terminated.

<stdin>:1:10: fatal error: cuda.h: No such file or directory
compilation terminated.



In [4]:
from pathlib import Path

from adjutorium.plugins.prediction.risk_estimation import RiskEstimation
from adjutorium.utils.metrics import generate_score, print_score
from adjutorium.utils.tester import evaluate_survival_estimator
from sklearn.model_selection import train_test_split
from synthcity.utils.serialization import (dataframe_hash, load_from_file,
                                           save_to_file)

out_dir = Path("workspace_rebuttal")

log.remove()
headers = ["dataset", "cindex", "brier score"]

scores = []
for ref_df in ["aids", "maggic", "cutract", "metabric", "seer"]:

    print("=======================")
    print("Evaluate ", ref_df)

    df, duration_col, event_col, time_horizons = get_dataset(ref_df)
    train_df, test_df = train_test_split(df, random_state=0)

    df_hash = dataframe_hash(train_df)

    X = train_df.drop(columns=[duration_col, event_col])
    T = train_df[duration_col]
    E = train_df[event_col]

    Xtest = train_df.drop(columns=[duration_col, event_col])
    Ttest = train_df[duration_col]
    Etest = train_df[event_col]

    cindex = []
    brier = []

    for seed in range(3):
        model_bkp = out_dir / f"{df_hash}_gmm_{seed}.bkp"

        if model_bkp.exists():
            syn_df = load_from_file(model_bkp)
        else:
            generator = TabularGMM(random_state=seed, components=100)

            generator.fit(X, T, E)

            Xsample, Tsample, Esample = generator.generate()

            syn_df = Xsample.copy()
            syn_df[duration_col] = Tsample
            syn_df[event_col] = Esample

            save_to_file(model_bkp, syn_df)

        Xsample = syn_df.drop(columns=[duration_col, event_col])
        Tsample = syn_df[duration_col]
        Esample = syn_df[event_col]

        model = RiskEstimation().get("cox_ph")
        model.fit(Xsample, Tsample, Esample)

        score = evaluate_survival_estimator(
            [model] * 3,
            Xtest,
            Ttest,
            Etest,
            time_horizons=time_horizons,
            pretrained=True,
        )
        cindex.append(score["clf"]["c_index"][0])
        brier.append(score["clf"]["brier_score"][0])

    cidx_str = print_score(generate_score(cindex))
    brier_str = print_score(generate_score(brier))
    print(ref_df, cidx_str, brier_str)
    scores.append((ref_df, cidx_str, brier_str))

Evaluate  aids
aids 0.477 +/- 0.079 0.068 +/- 0.001
Evaluate  maggic
maggic 0.636 +/- 0.001 0.194 +/- 0.001
Evaluate  cutract
cutract 0.764 +/- 0.007 0.087 +/- 0.001
Evaluate  metabric
metabric 0.644 +/- 0.015 0.232 +/- 0.005
Evaluate  seer
seer 0.666 +/- 0.017 0.024 +/- 0.0


In [None]:
import tabulate

tabulate.tabulate(scores, headers=headers, tablefmt="html")

In [5]:
from pathlib import Path

from adjutorium.plugins.prediction.risk_estimation import RiskEstimation
from adjutorium.utils.metrics import generate_score, print_score
from adjutorium.utils.tester import evaluate_survival_estimator
from sklearn.model_selection import train_test_split
from synthcity.utils.serialization import (dataframe_hash, load_from_file,
                                           save_to_file)

out_dir = Path("workspace_rebuttal")

log.remove()
headers = ["dataset", "cindex", "brier score"]

scores = []
for ref_df in ["aids", "maggic", "cutract", "metabric", "seer"]:

    print("=======================")
    print("Evaluate ", ref_df)

    df, duration_col, event_col, time_horizons = get_dataset(ref_df)
    train_df, test_df = train_test_split(df, random_state=0)

    df_hash = dataframe_hash(train_df)

    X = train_df.drop(columns=[duration_col, event_col])
    T = train_df[duration_col]
    E = train_df[event_col]

    Xtest = test_df.drop(columns=[duration_col, event_col])
    Ttest = test_df[duration_col]
    Etest = test_df[event_col]

    cindex = []
    brier = []

    for seed in range(3):
        model_bkp = out_dir / f"{df_hash}_gmm_{seed}.bkp"

        if model_bkp.exists():
            syn_df = load_from_file(model_bkp)
        else:
            generator = TabularGMM(random_state=seed, components=100)

            generator.fit(X, T, E)

            Xsample, Tsample, Esample = generator.generate()

            syn_df = Xsample.copy()
            syn_df[duration_col] = Tsample
            syn_df[event_col] = Esample

            save_to_file(model_bkp, syn_df)

        Xsample = syn_df.drop(columns=[duration_col, event_col])
        Tsample = syn_df[duration_col]
        Esample = syn_df[event_col]

        model = RiskEstimation().get("cox_ph")
        model.fit(Xsample, Tsample, Esample)

        score = evaluate_survival_estimator(
            [model] * 3,
            Xtest,
            Ttest,
            Etest,
            time_horizons=time_horizons,
            pretrained=True,
        )
        cindex.append(score["clf"]["c_index"][0])
        brier.append(score["clf"]["brier_score"][0])

    cidx_str = print_score(generate_score(cindex))
    brier_str = print_score(generate_score(brier))
    print(ref_df, cidx_str, brier_str)
    scores.append((ref_df, cidx_str, brier_str))

Evaluate  aids
aids 0.51 +/- 0.157 0.061 +/- 0.002
Evaluate  maggic
maggic 0.627 +/- 0.002 0.205 +/- 0.001
Evaluate  cutract
cutract 0.78 +/- 0.008 0.089 +/- 0.001
Evaluate  metabric
metabric 0.564 +/- 0.013 0.282 +/- 0.036
Evaluate  seer
seer 0.662 +/- 0.017 0.024 +/- 0.0


In [6]:
import tabulate

tabulate.tabulate(scores, headers=headers, tablefmt="html")

dataset,cindex,brier score
aids,0.51 +/- 0.157,0.061 +/- 0.002
maggic,0.627 +/- 0.002,0.205 +/- 0.001
cutract,0.78 +/- 0.008,0.089 +/- 0.001
metabric,0.564 +/- 0.013,0.282 +/- 0.036
seer,0.662 +/- 0.017,0.024 +/- 0.0


In [3]:
from pathlib import Path

from adjutorium.plugins.prediction.risk_estimation import RiskEstimation
from adjutorium.utils.metrics import generate_score, print_score
from adjutorium.utils.tester import evaluate_survival_estimator
from sklearn.model_selection import train_test_split
from synthcity.utils.serialization import (dataframe_hash, load_from_file,
                                           save_to_file)

out_dir = Path("workspace_rebuttal")

log.remove()
headers = ["dataset", "cindex", "brier score"]

scores = []
for ref_df in [
    "metabric",
]:

    print("=======================")
    print("Evaluate ", ref_df)

    df, duration_col, event_col, time_horizons = get_dataset(ref_df)
    df_hash = dataframe_hash(df)

    X = df.drop(columns=[duration_col, event_col])
    T = df[duration_col]
    E = df[event_col]

    cindex = []
    brier = []

    for seed in range(3):
        model_bkp = out_dir / f"{df_hash}_gmm_{seed}.bkp"

        if model_bkp.exists():
            syn_df = load_from_file(model_bkp)
        else:
            generator = TabularGMM(random_state=seed, components=100)

            generator.fit(X, T, E)

            Xsample, Tsample, Esample = generator.generate()

            syn_df = Xsample.copy()
            syn_df[duration_col] = Tsample
            syn_df[event_col] = Esample

            save_to_file(model_bkp, syn_df)

Evaluate  metabric
