In [43]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ConfigSpace import (
    Configuration,
    ConfigurationSpace,
    Constant,
    Float,
    Integer,
    Categorical,
)
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from smac import HyperparameterOptimizationFacade, Scenario
import xgboost as xgb

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", message=".*algorithm did not converge.*")

plt.rcParams["font.family"] = "Times New Roman"
plt.rcParams["axes.grid"] = True
plt.rcParams["grid.alpha"] = 0.2
plt.rcParams["grid.color"] = "#cccccc"
plt.rcParams["axes.xmargin"] = 0

PHASE1_DIR = Path(".")
PROCESSED_DIR = PHASE1_DIR / "processed"

In [44]:
evaluations_df = pd.read_parquet(PROCESSED_DIR / "evaluations.parquet")
solvers_df = pd.read_parquet(PROCESSED_DIR / "solvers.parquet")
instances_df = pd.read_parquet(PROCESSED_DIR / "instances.parquet")

df = pd.merge(evaluations_df, solvers_df, left_on="solver_id", right_on="id").drop(columns=["id"])
df = pd.merge(df, instances_df, left_on="instance_id", right_on="id").drop(columns=["id"])
df

Unnamed: 0,solver_id,instance_id,generator,cost,ASCENT_CANDIDATES,BACKBONE_TRIALS,BACKTRACKING,CANDIDATE_SET_TYPE,EXTRA_CANDIDATES,EXTRA_CANDIDATE_SET_TYPE,...,mst_dists_span,mst_dists_coef_of_var,mst_dists_sum,nnds_min,nnds_median,nnds_mean,nnds_max,nnds_sd,nnds_span,nnds_coef_of_var
0,1251473931473582278,TSP/TRAIN/cluster_netgen/000.tsp,cluster_netgen,4.74,0.95,1.0,0.0,0.0,0.2,0.0,...,0.488488,1.790135,0.000225,0.000715,0.011143,0.014421,0.094965,0.012746,0.094250,0.883825
1,2289112522627003788,TSP/TRAIN/cluster_netgen/000.tsp,cluster_netgen,0.02,0.15,0.0,1.0,1.0,0.4,0.0,...,0.488488,1.790135,0.000225,0.000715,0.011143,0.014421,0.094965,0.012746,0.094250,0.883825
2,960932965817811220,TSP/TRAIN/cluster_netgen/000.tsp,cluster_netgen,3.72,0.20,0.0,1.0,2.0,0.9,0.0,...,0.488488,1.790135,0.000225,0.000715,0.011143,0.014421,0.094965,0.012746,0.094250,0.883825
3,39012066323493184,TSP/TRAIN/cluster_netgen/000.tsp,cluster_netgen,1.52,0.60,1.0,1.0,2.0,0.7,0.0,...,0.488488,1.790135,0.000225,0.000715,0.011143,0.014421,0.094965,0.012746,0.094250,0.883825
4,494182449327999052,TSP/TRAIN/cluster_netgen/000.tsp,cluster_netgen,84.44,0.90,1.0,1.0,3.0,0.3,0.0,...,0.488488,1.790135,0.000225,0.000715,0.011143,0.014421,0.094965,0.012746,0.094250,0.883825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1286196444435323941,TSP/TRAIN/grid/019.tsp,grid,208.32,0.45,0.0,0.0,0.0,0.2,0.0,...,0.093952,0.462330,0.000294,0.001335,0.029728,0.032519,0.095287,0.017339,0.093952,0.533201
99996,1435531534300921454,TSP/TRAIN/grid/019.tsp,grid,300.00,0.20,1.0,0.0,3.0,0.6,0.0,...,0.093952,0.462330,0.000294,0.001335,0.029728,0.032519,0.095287,0.017339,0.093952,0.533201
99997,27607668447685341,TSP/TRAIN/grid/019.tsp,grid,3.21,0.95,1.0,1.0,3.0,0.9,0.0,...,0.093952,0.462330,0.000294,0.001335,0.029728,0.032519,0.095287,0.017339,0.093952,0.533201
99998,2245205590089179674,TSP/TRAIN/grid/019.tsp,grid,17.23,0.65,0.0,0.0,1.0,0.5,0.0,...,0.093952,0.462330,0.000294,0.001335,0.029728,0.032519,0.095287,0.017339,0.093952,0.533201


In [45]:
from src.split import get_n_splits

train_idx, test_idx = get_n_splits(df, n=2, instance_number=10, solver_number=300, random_state=0)[0]

df_train = df.loc[train_idx]
X_train = df_train.drop(columns=["solver_id", "instance_id", "generator", "cost"])
cut_off_train = np.full(X_train.shape[0], 1.0)
y_train = df_train["cost"].clip(upper=cut_off_train).to_numpy()

df_test = df.loc[test_idx]
X_test = df_test.drop(columns=["solver_id", "instance_id", "generator", "cost"])
cut_off_test = np.full(X_test.shape[0], 1.0)
y_test = df_test["cost"].clip(upper=cut_off_test).to_numpy()
y_test_not_censored = df_test["cost"].to_numpy()

In [46]:
xgb_configspace = ConfigurationSpace(
    seed=0,
    space=[
        Constant(name="objective", value="survival:aft"),
        Constant(name="eval_metric", value="aft-nloglik"),
        Categorical(name="aft_loss_distribution", items=["normal", "logistic", "extreme"], default="normal"),
        Float(name="aft_loss_distribution_scale", bounds=(1.0, 2.0), default=1.20),
        Integer(name="num_boost_round", bounds=(1, 300), default=100, log=True),
        Integer(name="n_estimators", bounds=(10, 1000), default=100),
        Integer(name="max_depth", bounds=(2, 10), default=6),
        Float(name="learning_rate", bounds=(0.01, 0.3), default=0.1, log=True),
        Float(name="subsample", bounds=(0.5, 1.0), default=1.0),
        Float(name="colsample_bytree", bounds=(0.5, 1.0), default=1.0),
        Integer(name="min_child_weight", bounds=(1, 10), default=1),
        Float(name="gamma", bounds=(0, 5), default=0),
        Float(name="reg_lambda", bounds=(1e-3, 10.0), default=1e-3, log=True),
        Float(name="reg_alpha", bounds=(1e-3, 10.0), default=1e-3, log=True),
        Constant(name="seed", value=0),
    ],
)


class XGBRegressorAFT:
    def __init__(
        self,
        objective="survival:aft",
        eval_metric="aft-nloglik",
        aft_loss_distribution="normal",
        aft_loss_distribution_scale=1.20,
        num_boost_round=100,
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        subsample=1.0,
        colsample_bytree=1.0,
        min_child_weight=1,
        gamma=0,
        reg_lambda=1e-3,
        reg_alpha=1e-3,
        seed=0,
    ):
        self.params = {
            "objective": objective,
            "eval_metric": eval_metric,
            "aft_loss_distribution": aft_loss_distribution,
            "aft_loss_distribution_scale": aft_loss_distribution_scale,
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "learning_rate": learning_rate,
            "subsample": subsample,
            "colsample_bytree": colsample_bytree,
            "min_child_weight": min_child_weight,
            "gamma": gamma,
            "reg_lambda": reg_lambda,
            "reg_alpha": reg_alpha,
            "seed": seed,
        }
        self.num_boost_round = num_boost_round

    def fit(self, dtrain):
        self.bst = xgb.train(
            self.params,
            dtrain,
            num_boost_round=self.num_boost_round,
            evals=[(dtrain, "train")],
        )
        return self

    def predict(self, dtest):
        return self.bst.predict(dtest)

In [47]:
from src.wrapper import BaseWrapper, StandardScalerMixin

class XGBwrapper(StandardScalerMixin, BaseWrapper):
    def __init__(
        self,
        model_cls,
        **kwargs,
    ):
        super().__init__(model_cls, **kwargs)

    def _fit(self, X, y, cut_off) -> "XGBwrapper":
        dtrain = xgb.DMatrix(X)
        y_lower_bound = y
        y_upper_bound = np.where(y == cut_off, np.inf, y)
        dtrain.set_float_info('label_lower_bound', y_lower_bound)
        dtrain.set_float_info('label_upper_bound', y_upper_bound)
        self.model.fit(dtrain)
        return self

    def _predict(self, X, cut_off) -> np.ndarray:
        dtest = xgb.DMatrix(X_test)
        return self.model.predict(dtest)

In [48]:
wrapper = XGBwrapper(XGBRegressorAFT, **xgb_configspace.sample_configuration())
wrapper.fit(X_train, y_train, cut_off_train)
wrapper.predict(X_test, cut_off_test)

[0]	train-aft-nloglik:0.87835


[1]	train-aft-nloglik:0.84243
[2]	train-aft-nloglik:0.80981
[3]	train-aft-nloglik:0.77965
[4]	train-aft-nloglik:0.75206
[5]	train-aft-nloglik:0.72641
[6]	train-aft-nloglik:0.70238
[7]	train-aft-nloglik:0.68295
[8]	train-aft-nloglik:0.66143
[9]	train-aft-nloglik:0.64182
[10]	train-aft-nloglik:0.62389
[11]	train-aft-nloglik:0.60864
[12]	train-aft-nloglik:0.59318
[13]	train-aft-nloglik:0.57884
[14]	train-aft-nloglik:0.56561
[15]	train-aft-nloglik:0.55244
[16]	train-aft-nloglik:0.53938
[17]	train-aft-nloglik:0.52750
[18]	train-aft-nloglik:0.51713
[19]	train-aft-nloglik:0.50741
[20]	train-aft-nloglik:0.49827
[21]	train-aft-nloglik:0.48870
[22]	train-aft-nloglik:0.48004
[23]	train-aft-nloglik:0.47102
[24]	train-aft-nloglik:0.46378
[25]	train-aft-nloglik:0.45625
[26]	train-aft-nloglik:0.44914
[27]	train-aft-nloglik:0.44216
[28]	train-aft-nloglik:0.43590
[29]	train-aft-nloglik:0.43044
[30]	train-aft-nloglik:0.42524
[31]	train-aft-nloglik:0.41963
[32]	train-aft-nloglik:0.41382


array([1.649479 , 1.898113 , 1.2851045, ..., 1.4220297, 1.649479 ,
       1.2928429], dtype=float32)