In [1]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from src.configspace import (
    COX_PH_CONFIGSPACE,
    GB_COX_CONFIGSPACE,
    GPR_CONFIGSPACE,
    POLY_RIDGE_CONFIGSPACE,
    RANDOM_FOREST_CONFIGSPACE,
    RANDOM_SURVIVAL_FOREST_CONFIGSPACE,
    RIDGE_CONFIGSPACE,
    SCHMEE_HAHN_QRF_CONFIGSPACE,
    SVR_CONFIGSPACE,
    TOBIT_NN_CONFIGSPACE,
    XGB_AFT_CONFIGSPACE,
    XGB_CONFIGSPACE,
)
from src.constant import (
    HO,
    PROCESSED_DATA_DIR,
    RANDOM_STATE_LIST,
    RESULTS_0_10_DIR,
    SOLVER_NUMBER_LIST,
)
from src.evaluation import evaluate_model_with_cross_validation
from src.hyperparameter_optimization import optimize_hyperparameters
from src.model import (
    SVR,
    CoxPHSurvivalAnalysis,
    GPRWithRBF,
    GradientBoostingSurvivalAnalysis,
    PolynomialRidge,
    RandomForestRegressor,
    RandomSurvivalForest,
    Ridge,
    SchmeeHahnQRF,
    TobitModel,
    XGBRegressor,
    XGBRegressorAFT,
)
from src.results import plot_line, plot_scatter, wilcoxon_df
from src.split import get_n_splits
from src.wrapper import (
    ScikitLearnWrapper,
    SkipCutOffScikitLearnWrapper,
    StandardScaledLogTransformedWrapper,
    SurvivalFunctionWrapper,
    XGBwrapper,
)

In [2]:
# Load the three processed tables that together describe every evaluation.
evaluations_df = pd.read_parquet(PROCESSED_DATA_DIR / "evaluations.parquet")
solvers_df = pd.read_parquet(PROCESSED_DATA_DIR / "solvers.parquet")
instances_df = pd.read_parquet(PROCESSED_DATA_DIR / "instances.parquet")

# Attach the solver columns and then the instance columns to each evaluation row.
# Each lookup table's own "id" key is dropped right after its join so that the
# second join's "id" column does not collide with the first one.
df = (
    evaluations_df
    .merge(solvers_df, left_on="solver_id", right_on="id")
    .drop(columns=["id"])
    .merge(instances_df, left_on="instance_id", right_on="id")
    .drop(columns=["id"])
)
df

Unnamed: 0,solver_id,instance_id,cost,ALGORITHM,CMA_ELITIST,CMA_POPSIZE,CMA_POPSIZE_FACTOR,CMA_RANDOM_INIT,CMA_SCALE,DE_CROSSOVER,...,pca_expl_var_PC1_cor_x,pca_expl_var_PC1_cov_init,pca_expl_var_PC1_cor_init,pca_costs_runtime,ic_h_max,ic_eps_s,ic_eps_max,ic_eps_ratio,ic_m0,ic_costs_runtime
0,410950163550714701,1459556901948702861,0.364175,0.0,1.0,0.955556,0.410492,1.0,0.972919,0.081101,...,0.530700,0.919870,0.522955,0.006267,0.745011,1.106106,2.633336,0.635636,0.392857,0.717704
1,1743092914995070369,1459556901948702861,0.398162,0.0,1.0,0.600000,0.246063,1.0,0.447144,0.449083,...,0.530700,0.919870,0.522955,0.006267,0.745011,1.106106,2.633336,0.635636,0.392857,0.717704
2,2283111023303066572,1459556901948702861,0.055367,1.0,0.0,0.244444,0.426904,1.0,0.406922,0.772266,...,0.530700,0.919870,0.522955,0.006267,0.745011,1.106106,2.633336,0.635636,0.392857,0.717704
3,2253826169615980878,1459556901948702861,0.808347,1.0,1.0,0.111111,0.118728,1.0,0.916723,0.842342,...,0.530700,0.919870,0.522955,0.006267,0.745011,1.106106,2.633336,0.635636,0.392857,0.717704
4,1589992703076004730,1459556901948702861,0.572030,1.0,1.0,0.700000,0.884952,1.0,0.751022,0.878372,...,0.530700,0.919870,0.522955,0.006267,0.745011,1.106106,2.633336,0.635636,0.392857,0.717704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119995,1090844655874379195,1903964392399407975,300.000000,0.0,1.0,0.222222,0.604851,0.0,0.259828,0.415285,...,0.062206,0.990413,0.067438,0.269816,0.868371,1.546547,5.568596,0.995996,0.627756,21.302308
119996,686048789577017900,1903964392399407975,300.000000,0.0,0.0,0.744444,0.570018,0.0,0.793826,0.983013,...,0.062206,0.990413,0.067438,0.269816,0.868371,1.546547,5.568596,0.995996,0.627756,21.302308
119997,243020324673513847,1903964392399407975,300.000000,0.0,1.0,0.644444,0.531420,0.0,0.071196,0.581472,...,0.062206,0.990413,0.067438,0.269816,0.868371,1.546547,5.568596,0.995996,0.627756,21.302308
119998,708619547354767275,1903964392399407975,300.000000,0.0,0.0,0.233333,0.384026,0.0,0.714490,0.829189,...,0.062206,0.990413,0.067438,0.269816,0.868371,1.546547,5.568596,0.995996,0.627756,21.302308


In [3]:
# One-off generation of instance_to_cut_off.json. Kept commented out as a record
# of how the file loaded below was produced.
# from src.instance.TSP_Instance import TSP_from_index_file, set_n22_cut_off_time
# from src.constant import DATA_DIR
# import json
# train_instances = TSP_from_index_file(
#     filepath=DATA_DIR / "TSP" / "TRAIN" / "index.json",
# )
# train_instances = set_n22_cut_off_time(train_instances, reference_cut_off_time=10.0)
# instance_to_cut_off = {}
# for instance in train_instances:
#     key = instance._get_short_filepath()
#     instance_to_cut_off[key] = instance.cut_off_time
# with open("instance_to_cut_off.json", "w") as f:
#     json.dump(instance_to_cut_off, f, indent=4)

# Load the precomputed mapping from instance key to cut-off time.
# `json` is imported in the notebook's import cell. The encoding is given
# explicitly so the read does not depend on the platform's default locale.
# NOTE(review): the path is relative, so the notebook must be run from the
# directory that contains instance_to_cut_off.json.
with open("instance_to_cut_off.json", "r", encoding="utf-8") as f:
    INSTANCE_TO_CUT_OFF = json.load(f)

# Share of evaluations whose cost is strictly below the cut-off looked up for
# their instance_id. Rows whose instance_id has no entry in the mapping compare
# against NaN and are therefore counted as False.
(df["cost"] < df["instance_id"].map(INSTANCE_TO_CUT_OFF)).value_counts(normalize=True)

False    0.806883
True     0.193117
dtype: float64

In [4]:
# Build the splits that every hyperparameter-optimisation call in this notebook
# receives via `splits=SPLITS`. The number of splits, the instance/solver counts
# and the seed all come from the HO settings object imported from src.constant.
SPLITS = get_n_splits(
    df=df,
    n=HO.N,
    instance_number=HO.INSTANCE_NUMBER,
    solver_number=HO.SOLVER_NUMBER,
    random_state=HO.RANDOM_STATE,
)

### Ridge

In [5]:
# Arguments common to both Ridge tuning runs. The two runs differ only in the
# wrapper class and in the file they are associated with.
ridge_ho_kwargs = dict(
    df=df,
    model_cls=Ridge,
    configspace=RIDGE_CONFIGSPACE,
    splits=SPLITS,
    instance_to_cut_off=INSTANCE_TO_CUT_OFF,
    random_state=HO.RANDOM_STATE,
    n_trials=HO.N_TRIALS,
)

# Run with the plain scikit-learn wrapper.
ridge_incumbent = optimize_hyperparameters(
    wrapper_cls=ScikitLearnWrapper,
    filepath=RESULTS_0_10_DIR / "HO" / "ridge_incumbent.pkl",
    **ridge_ho_kwargs,
)

# Run with the skip-cut-off wrapper.
# NOTE(review): presumably this wrapper leaves cut-off runs out of training — confirm in src.wrapper.
ridge_incumbent_skip_cutoff = optimize_hyperparameters(
    wrapper_cls=SkipCutOffScikitLearnWrapper,
    filepath=RESULTS_0_10_DIR / "HO" / "ridge_incumbent_skip_cutoff.pkl",
    **ridge_ho_kwargs,
)

### PolynomialRidge

In [6]:
# Arguments common to both PolynomialRidge tuning runs. The two runs differ only
# in the wrapper class and in the file they are associated with.
poly_ridge_ho_kwargs = dict(
    df=df,
    model_cls=PolynomialRidge,
    configspace=POLY_RIDGE_CONFIGSPACE,
    splits=SPLITS,
    instance_to_cut_off=INSTANCE_TO_CUT_OFF,
    random_state=HO.RANDOM_STATE,
    n_trials=HO.N_TRIALS,
)

# Run with the plain scikit-learn wrapper.
poly_ridge_incumbent = optimize_hyperparameters(
    wrapper_cls=ScikitLearnWrapper,
    filepath=RESULTS_0_10_DIR / "HO" / "poly_ridge_incumbent.pkl",
    **poly_ridge_ho_kwargs,
)

# Run with the skip-cut-off wrapper.
# NOTE(review): presumably this wrapper leaves cut-off runs out of training — confirm in src.wrapper.
poly_ridge_incumbent_skip_cutoff = optimize_hyperparameters(
    wrapper_cls=SkipCutOffScikitLearnWrapper,
    filepath=RESULTS_0_10_DIR / "HO" / "poly_ridge_incumbent_skip_cutoff.pkl",
    **poly_ridge_ho_kwargs,
)

### RandomForestRegressor

In [7]:
# Arguments common to both RandomForestRegressor tuning runs. The two runs differ
# only in the wrapper class and in the file they are associated with.
rf_ho_kwargs = dict(
    df=df,
    model_cls=RandomForestRegressor,
    configspace=RANDOM_FOREST_CONFIGSPACE,
    splits=SPLITS,
    instance_to_cut_off=INSTANCE_TO_CUT_OFF,
    random_state=HO.RANDOM_STATE,
    n_trials=HO.N_TRIALS,
)

# Run with the plain scikit-learn wrapper.
rf_incumbent = optimize_hyperparameters(
    wrapper_cls=ScikitLearnWrapper,
    filepath=RESULTS_0_10_DIR / "HO" / "rf_incumbent.pkl",
    **rf_ho_kwargs,
)

# Run with the skip-cut-off wrapper.
# NOTE(review): presumably this wrapper leaves cut-off runs out of training — confirm in src.wrapper.
rf_incumbent_skip_cutoff = optimize_hyperparameters(
    wrapper_cls=SkipCutOffScikitLearnWrapper,
    filepath=RESULTS_0_10_DIR / "HO" / "rf_incumbent_skip_cutoff.pkl",
    **rf_ho_kwargs,
)

### XGBRegressor

In [8]:
# Arguments common to both XGBRegressor tuning runs. The two runs differ only in
# the wrapper class and in the file they are associated with.
xgb_ho_kwargs = dict(
    df=df,
    model_cls=XGBRegressor,
    configspace=XGB_CONFIGSPACE,
    splits=SPLITS,
    instance_to_cut_off=INSTANCE_TO_CUT_OFF,
    random_state=HO.RANDOM_STATE,
    n_trials=HO.N_TRIALS,
)

# Run with the plain scikit-learn wrapper.
xgb_incumbent = optimize_hyperparameters(
    wrapper_cls=ScikitLearnWrapper,
    filepath=RESULTS_0_10_DIR / "HO" / "xgb_incumbent.pkl",
    **xgb_ho_kwargs,
)

# Run with the skip-cut-off wrapper.
# NOTE(review): presumably this wrapper leaves cut-off runs out of training — confirm in src.wrapper.
xgb_incumbent_skip_cutoff = optimize_hyperparameters(
    wrapper_cls=SkipCutOffScikitLearnWrapper,
    filepath=RESULTS_0_10_DIR / "HO" / "xgb_incumbent_skip_cutoff.pkl",
    **xgb_ho_kwargs,
)

### SVR

In [9]:
# Arguments common to both SVR tuning runs. The two runs differ only in the
# wrapper class and in the file they are associated with.
svr_ho_kwargs = dict(
    df=df,
    model_cls=SVR,
    configspace=SVR_CONFIGSPACE,
    splits=SPLITS,
    instance_to_cut_off=INSTANCE_TO_CUT_OFF,
    random_state=HO.RANDOM_STATE,
    n_trials=HO.N_TRIALS,
)

# Run with the plain scikit-learn wrapper.
svr_incumbent = optimize_hyperparameters(
    wrapper_cls=ScikitLearnWrapper,
    filepath=RESULTS_0_10_DIR / "HO" / "svr_incumbent.pkl",
    **svr_ho_kwargs,
)

# Run with the skip-cut-off wrapper.
# NOTE(review): presumably this wrapper leaves cut-off runs out of training — confirm in src.wrapper.
svr_incumbent_skip_cutoff = optimize_hyperparameters(
    wrapper_cls=SkipCutOffScikitLearnWrapper,
    filepath=RESULTS_0_10_DIR / "HO" / "svr_incumbent_skip_cutoff.pkl",
    **svr_ho_kwargs,
)

### GPRWithRBF

In [10]:
# Arguments common to both GPRWithRBF tuning runs. The two runs differ only in
# the wrapper class and in the file they are associated with.
gpr_ho_kwargs = dict(
    df=df,
    model_cls=GPRWithRBF,
    configspace=GPR_CONFIGSPACE,
    splits=SPLITS,
    instance_to_cut_off=INSTANCE_TO_CUT_OFF,
    random_state=HO.RANDOM_STATE,
    n_trials=HO.N_TRIALS,
)

# Run with the plain scikit-learn wrapper.
gpr_incumbent = optimize_hyperparameters(
    wrapper_cls=ScikitLearnWrapper,
    filepath=RESULTS_0_10_DIR / "HO" / "gpr_incumbent.pkl",
    **gpr_ho_kwargs,
)

# Run with the skip-cut-off wrapper.
# NOTE(review): presumably this wrapper leaves cut-off runs out of training — confirm in src.wrapper.
gpr_incumbent_skip_cutoff = optimize_hyperparameters(
    wrapper_cls=SkipCutOffScikitLearnWrapper,
    filepath=RESULTS_0_10_DIR / "HO" / "gpr_incumbent_skip_cutoff.pkl",
    **gpr_ho_kwargs,
)

### CoxPHSurvivalAnalysis

In [11]:
# Tune CoxPHSurvivalAnalysis inside the survival-function wrapper.
coxph_incumbent = optimize_hyperparameters(
    # what is tuned
    model_cls=CoxPHSurvivalAnalysis,
    wrapper_cls=SurvivalFunctionWrapper,
    configspace=COX_PH_CONFIGSPACE,
    # data it is tuned on
    df=df,
    splits=SPLITS,
    instance_to_cut_off=INSTANCE_TO_CUT_OFF,
    # budget and seed
    n_trials=HO.N_TRIALS,
    random_state=HO.RANDOM_STATE,
    # NOTE(review): presumably where the incumbent is stored — confirm in src.hyperparameter_optimization.
    filepath=RESULTS_0_10_DIR / "HO" / "coxph_incumbent.pkl",
)
# Display the incumbent; the comparison cell unpacks it into SurvivalFunctionWrapper(**...).
coxph_incumbent

{'alpha': 5.9818622218769,
 'risk_function': 'polynomial',
 'ties': 'efron',
 'risk_alpha': 1.6044394313623,
 'model_cls': sksurv.linear_model.coxph.CoxPHSurvivalAnalysis}

### RandomSurvivalForest

In [12]:
# Tune RandomSurvivalForest inside the survival-function wrapper.
rsf_incumbent = optimize_hyperparameters(
    # what is tuned
    model_cls=RandomSurvivalForest,
    wrapper_cls=SurvivalFunctionWrapper,
    configspace=RANDOM_SURVIVAL_FOREST_CONFIGSPACE,
    # data it is tuned on
    df=df,
    splits=SPLITS,
    instance_to_cut_off=INSTANCE_TO_CUT_OFF,
    # budget and seed
    n_trials=HO.N_TRIALS,
    random_state=HO.RANDOM_STATE,
    # NOTE(review): presumably where the incumbent is stored — confirm in src.hyperparameter_optimization.
    filepath=RESULTS_0_10_DIR / "HO" / "rsf_incumbent.pkl",
)
# Display the incumbent; the comparison cell unpacks it into SurvivalFunctionWrapper(**...).
rsf_incumbent

{'max_depth': 7,
 'max_features': 0.9754400755314,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_jobs': -1,
 'random_state': 0,
 'risk_function': 'exponential',
 'risk_alpha': 8.3821990811048,
 'risk_beta': 277.8341209205442,
 'model_cls': sksurv.ensemble.forest.RandomSurvivalForest}

### GradientBoostingSurvivalAnalysis

In [13]:
# Tune GradientBoostingSurvivalAnalysis inside the survival-function wrapper.
gb_cox_incumbent = optimize_hyperparameters(
    # what is tuned
    model_cls=GradientBoostingSurvivalAnalysis,
    wrapper_cls=SurvivalFunctionWrapper,
    configspace=GB_COX_CONFIGSPACE,
    # data it is tuned on
    df=df,
    splits=SPLITS,
    instance_to_cut_off=INSTANCE_TO_CUT_OFF,
    # budget and seed
    n_trials=HO.N_TRIALS,
    random_state=HO.RANDOM_STATE,
    # NOTE(review): presumably where the incumbent is stored — confirm in src.hyperparameter_optimization.
    filepath=RESULTS_0_10_DIR / "HO" / "gb_cox_incumbent.pkl",
)
# Display the incumbent; the comparison cell unpacks it into SurvivalFunctionWrapper(**...).
gb_cox_incumbent

{'ccp_alpha': 0.0010038293216,
 'learning_rate': 0.2926316905238,
 'loss': 'coxph',
 'max_depth': 10,
 'max_features': 0.0571158695451,
 'min_samples_leaf': 32,
 'min_samples_split': 22,
 'n_estimators': 633,
 'random_state': 0,
 'risk_function': 'polynomial',
 'subsample': 0.5000958631118,
 'risk_alpha': 1.4625556046704,
 'model_cls': sksurv.ensemble.boosting.GradientBoostingSurvivalAnalysis}

### XGBRegressorAFT

In [14]:
# Tune XGBRegressorAFT inside its dedicated XGBoost wrapper.
xgb_aft_incumbent = optimize_hyperparameters(
    # what is tuned
    model_cls=XGBRegressorAFT,
    wrapper_cls=XGBwrapper,
    configspace=XGB_AFT_CONFIGSPACE,
    # data it is tuned on
    df=df,
    splits=SPLITS,
    instance_to_cut_off=INSTANCE_TO_CUT_OFF,
    # budget and seed
    n_trials=HO.N_TRIALS,
    random_state=HO.RANDOM_STATE,
    # NOTE(review): presumably where the incumbent is stored — confirm in src.hyperparameter_optimization.
    filepath=RESULTS_0_10_DIR / "HO" / "xgb_aft_incumbent.pkl",
)
# Display the incumbent; the comparison cell unpacks it into XGBwrapper(**...).
xgb_aft_incumbent

{'aft_loss_distribution': 'logistic',
 'colsample_bytree': 0.6378271394952,
 'eval_metric': 'aft-nloglik',
 'gamma': 0.0236635729992,
 'learning_rate': 0.2913384248995,
 'max_depth': 9,
 'min_child_weight': 10,
 'num_boost_round': 23,
 'objective': 'survival:aft',
 'reg_alpha': 3.972157272827,
 'reg_lambda': 0.0075075304279,
 'seed': 0,
 'subsample': 0.6218210811619,
 'aft_loss_distribution_scale': 1.0373486097786,
 'model_cls': src.model.XGBRegressorAFT}

### SchmeeHahnQRF

In [15]:
# Tune SchmeeHahnQRF inside the standard-scaled, log-transformed wrapper.
sh_qrf_incumbent = optimize_hyperparameters(
    # what is tuned
    model_cls=SchmeeHahnQRF,
    wrapper_cls=StandardScaledLogTransformedWrapper,
    configspace=SCHMEE_HAHN_QRF_CONFIGSPACE,
    # data it is tuned on
    df=df,
    splits=SPLITS,
    instance_to_cut_off=INSTANCE_TO_CUT_OFF,
    # budget and seed
    n_trials=HO.N_TRIALS,
    random_state=HO.RANDOM_STATE,
    # NOTE(review): presumably where the incumbent is stored — confirm in src.hyperparameter_optimization.
    filepath=RESULTS_0_10_DIR / "HO" / "sh_qrf_incumbent.pkl",
)
# Display the incumbent; the comparison cell unpacks it into StandardScaledLogTransformedWrapper(**...).
sh_qrf_incumbent

{'ccp_alpha': 1.0612624724709,
 'ccp_alpha_rf': 0.0010084895826,
 'k': 3,
 'max_depth': 2,
 'max_depth_rf': 11,
 'max_features': 0.0524415935931,
 'max_features_rf': 0.6195017572219,
 'min_samples_leaf': 29,
 'min_samples_leaf_rf': 11,
 'min_samples_split': 15,
 'min_samples_split_rf': 8,
 'n_jobs': -1,
 'random_state': 0,
 'model_cls': src.model.SchmeeHahnQRF}

### TobitModel

In [16]:
# Tune TobitModel inside the standard-scaled, log-transformed wrapper.
tobit_incumbent = optimize_hyperparameters(
    # what is tuned
    model_cls=TobitModel,
    wrapper_cls=StandardScaledLogTransformedWrapper,
    configspace=TOBIT_NN_CONFIGSPACE,
    # data it is tuned on
    df=df,
    splits=SPLITS,
    instance_to_cut_off=INSTANCE_TO_CUT_OFF,
    # budget and seed
    n_trials=HO.N_TRIALS,
    random_state=HO.RANDOM_STATE,
    # NOTE(review): presumably where the incumbent is stored — confirm in src.hyperparameter_optimization.
    filepath=RESULTS_0_10_DIR / "HO" / "tobit_incumbent.pkl",
)
# Display the incumbent; the comparison cell unpacks it into StandardScaledLogTransformedWrapper(**...).
tobit_incumbent

{'base_lr': 0.0008288957983,
 'dropout': 0.5681126601849,
 'momentum': 0.8019205974563,
 'n_epochs': 16,
 'scheduler_step_size_up': 416,
 'model_cls': src.model.TobitModel}

## Comparison

In [17]:
# Every model that takes part in the comparison: a wrapper built from its tuned
# incumbent plus the display name used in result tables and plots.
# The order matters: later cells select rows by position and reindex by this order.
model_info_list = [
    # include cut-off
    {
        "wrapper": ScikitLearnWrapper(**ridge_incumbent),
        "name": "Ridge Regression"
    },
    {
        "wrapper": ScikitLearnWrapper(**poly_ridge_incumbent),
        "name": "Polynomial Regression"
    },
    {
        "wrapper": ScikitLearnWrapper(**svr_incumbent),
        "name": "Support Vector Regression"
    },
    {
        "wrapper": ScikitLearnWrapper(**gpr_incumbent),
        "name": "Gaussian Process Regression"
    },
    {
        "wrapper": ScikitLearnWrapper(**rf_incumbent),
        "name": "Random Forest"
    },
    {
        "wrapper": ScikitLearnWrapper(**xgb_incumbent),
        "name": "XGBoost"
    },
    # skip cut-off
    {
        "wrapper": SkipCutOffScikitLearnWrapper(**ridge_incumbent_skip_cutoff),
        "name": "Ridge Regression (skip cut-off)"
    },
    {
        "wrapper": SkipCutOffScikitLearnWrapper(**poly_ridge_incumbent_skip_cutoff),
        "name": "Polynomial Regression (skip cut-off)"
    },
    {
        "wrapper": SkipCutOffScikitLearnWrapper(**svr_incumbent_skip_cutoff),
        "name": "Support Vector Regression (skip cut-off)"
    },
    {
        "wrapper": SkipCutOffScikitLearnWrapper(**gpr_incumbent_skip_cutoff),
        "name": "Gaussian Process Regression (skip cut-off)"
    },
    {
        "wrapper": SkipCutOffScikitLearnWrapper(**rf_incumbent_skip_cutoff),
        "name": "Random Forest (skip cut-off)"
    },
    {
        "wrapper": SkipCutOffScikitLearnWrapper(**xgb_incumbent_skip_cutoff),
        "name": "XGBoost (skip cut-off)"
    },
    # survival models
    {
        "wrapper": SurvivalFunctionWrapper(**coxph_incumbent),
        "name": "Cox PH"
    },
    {
        "wrapper": SurvivalFunctionWrapper(**rsf_incumbent),
        "name": "Random Survival Forest"
    },
    {
        "wrapper": SurvivalFunctionWrapper(**gb_cox_incumbent),
        "name": "Gradient Boosting Cox"
    },
    {
        "wrapper": XGBwrapper(**xgb_aft_incumbent),
        "name": "XGBoost AFT"
    },
    {
        "wrapper": StandardScaledLogTransformedWrapper(**sh_qrf_incumbent),
        "name": "S&H QRF"
    },
    {
        "wrapper": StandardScaledLogTransformedWrapper(**tobit_incumbent),
        "name": "NN Tobit"
    },
]


def _evaluate_all_models(model_infos):
    """Cross-validate every model for every (random_state, solver_number) pair.

    Parameters
    ----------
    model_infos : list of dict
        Entries with a "wrapper" (the model wrapper to evaluate) and a "name"
        (the label stored with its result).

    Returns
    -------
    pandas.DataFrame
        One row per (random_state, solver_number, model): whatever
        ``evaluate_model_with_cross_validation`` returns, extended with the
        "random_state", "solver_number" and "name" keys.
    """
    records = []
    total_iterations = len(RANDOM_STATE_LIST) * len(SOLVER_NUMBER_LIST) * len(model_infos)

    # The context manager closes the progress bar even if an evaluation fails.
    with tqdm(total=total_iterations, desc="Evaluating models") as pbar:
        for random_state in RANDOM_STATE_LIST:
            for solver_number in SOLVER_NUMBER_LIST:
                pbar.set_description(f"RS={random_state}, Solvers={solver_number}")
                # The splits are built once per setting and shared by all models.
                splits = get_n_splits(
                    df,
                    n=5,
                    instance_number=10,
                    solver_number=solver_number,
                    random_state=random_state,
                )
                for model_info in model_infos:
                    pbar.set_postfix(model=model_info["name"])

                    result = evaluate_model_with_cross_validation(
                        df,
                        wrapper=model_info["wrapper"],
                        splits=splits,
                        random_state=random_state,
                        instance_to_cut_off=INSTANCE_TO_CUT_OFF,
                    )
                    result["random_state"] = random_state
                    result["solver_number"] = solver_number
                    result["name"] = model_info["name"]
                    records.append(result)
                    pbar.update(1)

    return pd.DataFrame(records)


# Load the cached evaluation results. If the cache file does not exist yet,
# run the (expensive) evaluation once and store it, so that a fresh checkout
# can still execute the notebook from top to bottom.
# compression="gzip" is passed explicitly because the file name is "results.gzip".
# NOTE: unpickling executes arbitrary code; only load result files you produced yourself.
results_path = RESULTS_0_10_DIR / "results.gzip"
try:
    result_df = pd.read_pickle(results_path, compression="gzip")
except FileNotFoundError:
    result_df = _evaluate_all_models(model_info_list)
    result_df.to_pickle(results_path, compression="gzip")

In [None]:
# Restrict the results to one setting: random_state 1 and 300 solvers.
is_selected_run = (result_df["random_state"] == 1) & (result_df["solver_number"] == 300)

# Within that subset keep rows 0-5 and 12-17 by position; rows 6-11 are left out.
# NOTE(review): if the rows follow the model_info_list order, the rows left out
# are the "(skip cut-off)" models — confirm against the stored results.
row_positions = [*range(0, 6), *range(12, 18)]
plot_df = result_df.loc[is_selected_run].iloc[row_positions].reset_index(drop=True)

fig, axs = plot_scatter(plot_df)
# plt.savefig("fig.png", dpi=200, bbox_inches="tight")
plt.show()

In [None]:
# Results for random_state 1 with 300 solvers, reduced by position to rows 0-5
# and 12-17 of that subset.
run_mask = (result_df["random_state"] == 1) & (result_df["solver_number"] == 300)
plot_df = (
    result_df.loc[run_mask]
    .iloc[[*range(0, 6), *range(12, 18)]]
    .reset_index(drop=True)
)
# .loc slices by label and includes the end point, so this tags the first six rows.
plot_df.loc[:5, "name"] += " (i)"
const_cut_off = None

n_plots = len(plot_df)
n_cols = 3
n_rows = (n_plots + n_cols - 1) // n_cols  # ceiling division
# squeeze=False always returns a 2-D grid of axes, even for a single row.
fig, axs = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(9, 2.9 * n_rows),
    squeeze=False,
)
axs = axs.flatten()

# One panel per result row: actual vs. predicted runtime on log-log axes.
for ax, (_, result) in zip(axs, plot_df.iterrows()):
    ax.scatter(
        result["y_test_not_censored"],
        result["y_pred"],
        alpha=0.5,
        edgecolors="k",
        lw=0.2,
        s=3,
    )
    ax.set_xscale("log")
    ax.set_yscale("log")

    ax.set_xlim(0.01, 320)
    ax.set_ylim(0.01, 320)
    # Dashed diagonal marks where prediction equals actual value.
    ax.plot([0.01, 300], [0.01, 300], "k--", alpha=0.75, zorder=0)
    ax.set_title(f'{result["name"]} (RMSE={result["rmse"]:.2f})', fontsize=10)
    if const_cut_off is not None:
        ax.axhline(y=const_cut_off, color="red", linestyle="--")

# Hide the grid cells that have no result to show.
for unused_ax in axs[n_plots:]:
    unused_ax.set_visible(False)

# Label the y-axis on the first used panel of every row only.
for left_ax in axs[:n_plots:n_cols]:
    left_ax.set_ylabel("Predicted Runtime")

# Label the x-axis on the used panels of the last row only.
bottom_row_start = (n_rows - 1) * n_cols
for bottom_ax in axs[bottom_row_start:n_plots]:
    bottom_ax.set_xlabel("Actual Runtime")

plt.tight_layout(h_pad=2, w_pad=2)
plt.savefig("fig.png", dpi=200, bbox_inches="tight")
plt.show()

In [None]:
# Names of the models shown in the line plot.
line_plot_model_names = [
    "Cox PH",
    "Random Survival Forest",
    "Gradient Boosting Cox",
    "XGBoost AFT",
    "S&H QRF",
    "NN Tobit",
]

# Keep only the result rows of those models and pass them to the plotting helper.
is_line_plot_model = result_df["name"].isin(line_plot_model_names)
fig, ax = plot_line(result_df.loc[is_line_plot_model])

# plt.savefig("fig.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Box plot of RMSE per model, grouped by the number of solvers, for the six
# models listed here.
plot_df = result_df.loc[
    result_df["name"].isin(
        [
            "Cox PH",
            "Random Survival Forest",
            "Gradient Boosting Cox",
            "XGBoost AFT",
            "S&H QRF",
            "NN Tobit",
        ]
    )
]

fig, ax = plt.subplots(figsize=(9, 3))

# Groups on the x-axis (solver numbers) and the boxes inside each group (models).
solver_numbers = sorted(plot_df["solver_number"].unique())
model_names = plot_df["name"].unique()

n_models = len(model_names)
width = 4  # combined width of one group of boxes; each box gets width / n_models

for i, solver_num in enumerate(solver_numbers):
    for j, model_name in enumerate(model_names):
        # Every group spans n_models slots plus one empty slot as a gap.
        pos = i * (n_models + 1) + j

        # RMSE values of this model for this solver number.
        is_cell = (plot_df["solver_number"] == solver_num) & (plot_df["name"] == model_name)
        data = plot_df.loc[is_cell, "rmse"].values

        # Draw on the explicit axes. manage_ticks=False keeps boxplot from
        # rewriting the x ticks, which are set once for all groups below.
        bp = ax.boxplot(
            data,
            positions=[pos],
            widths=width / n_models,
            patch_artist=True,
            manage_ticks=False,
            showfliers=False,
        )

        # One colour per model, matching the legend entries below.
        bp["boxes"][0].set_facecolor(plt.cm.tab10(j))

        # Thin outlines for all box parts that are present.
        for element in ("boxes", "whiskers", "fliers", "caps"):
            for item in bp.get(element, []):
                item.set_linewidth(0.66)

        # Thin black median line so it stays visible on the coloured box.
        bp["medians"][0].set_linewidth(0.66)
        bp["medians"][0].set_color("black")

# One tick per group, centred under its boxes and labelled with the solver number.
x_positions = [i * (n_models + 1) + (n_models - 1) / 2 for i in range(len(solver_numbers))]
ax.set_xticks(x_positions)
ax.set_xticklabels(solver_numbers)

# Legend built from plain coloured rectangles, one per model.
handles = [
    plt.Rectangle((0, 0), 1, 1, facecolor=plt.cm.tab10(i), alpha=1.0)
    for i in range(n_models)
]
ax.legend(handles, model_names, loc="best", frameon=True, fontsize=9)

ax.set_xlabel("Number of algorithm configurations in the training set")
ax.set_ylabel("RMSE (on logarithmized predictions)")
fig.tight_layout()
# ax.set_title("RMSE vs. the number of algorithm configurations in the training set")
fig.savefig("0_10_boxplot.pdf", bbox_inches="tight")
plt.show()

In [18]:
# Build the summary table for the models in model_info_list and display it
# (one row per model name, one column per solver number, as shown below).
# NOTE(review): the helper's name suggests Wilcoxon tests drive the styling — confirm in src.results.
styled_result = wilcoxon_df(result_df, model_info_list)
styled_result

solver_number,5,10,15,20,30,50,70,100,150,200,300,500
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Ridge Regression,3.403,3.533,3.386,3.375,3.479,3.443,3.507,3.432,3.38,3.477,3.441,3.422
Polynomial Regression,3.421,3.532,3.406,3.368,3.485,3.447,3.504,3.425,3.373,3.466,3.434,3.405
Support Vector Regression,3.719,3.765,3.656,3.551,3.562,3.545,3.554,3.479,3.382,3.477,3.447,3.42
Gaussian Process Regression,4.227,4.296,4.153,4.141,4.132,3.611,3.535,3.427,3.375,3.467,3.43,3.401
Random Forest,3.739,3.824,3.477,3.466,3.52,3.473,3.522,3.44,3.385,3.481,3.445,3.427
XGBoost,3.472,3.542,3.401,3.367,3.479,3.444,3.504,3.421,3.366,3.459,3.417,3.394
Ridge Regression (skip cut-off),4.306,4.222,4.118,4.033,4.138,4.005,4.047,4.098,3.806,4.027,3.953,3.919
Polynomial Regression (skip cut-off),4.656,4.139,4.073,4.058,3.93,3.854,3.967,3.82,3.741,3.783,3.792,3.737
Support Vector Regression (skip cut-off),4.126,4.131,3.9,4.012,3.98,4.039,4.183,4.115,3.916,4.328,4.746,4.449
Gaussian Process Regression (skip cut-off),4.221,4.183,3.918,4.049,4.071,3.955,3.959,3.956,3.861,3.922,3.923,3.945


In [19]:
# Export the summary table to an Excel file. The path is relative, so the file
# lands in the current working directory.
# NOTE(review): "tmp.xlsx" looks like a scratch name — consider a descriptive file name.
styled_result.to_excel("tmp.xlsx")

In [None]:
# Row order for the timing tables: same order as model_info_list.
idx = [model_info["name"] for model_info in model_info_list]

# fit time: mean per model and solver number, colour-coded over the whole table
(
    result_df
    .pivot_table(index="name", columns="solver_number", values="fit_time", aggfunc="mean")
    .loc[idx]
    .style
    .background_gradient(cmap="coolwarm", axis=None)
    .format(precision=2)
)

In [None]:
# predict time: mean per model and solver number, colour-coded over the whole table
# (rows are ordered by `idx`, which the fit-time cell defines)
(
    result_df
    .pivot_table(index="name", columns="solver_number", values="predict_time", aggfunc="mean")
    .loc[idx]
    .style
    .background_gradient(cmap="coolwarm", axis=None)
    .format(precision=2)
)