In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from src.constant import PROCESSED_DATA_DIR

# Global matplotlib styling for every figure in this notebook:
# serif font, faint light-grey grid, and no horizontal padding around the data.
plt.rcParams.update(
    {
        "font.family": "Times New Roman",
        "axes.grid": True,
        "grid.alpha": 0.2,
        "grid.color": "#cccccc",
        "axes.xmargin": 0,
    }
)

In [2]:
# Load the three processed tables from the project's processed-data directory.
evaluations_df = pd.read_parquet(PROCESSED_DATA_DIR / "evaluations.parquet")
solvers_df = pd.read_parquet(PROCESSED_DATA_DIR / "solvers.parquet")
instances_df = pd.read_parquet(PROCESSED_DATA_DIR / "instances.parquet")

# Attach solver columns, then instance columns, to every evaluation row.
# Each join matches the foreign key against the other table's "id" (inner join,
# the pandas default); the redundant "id" column is dropped right after each join.
df = (
    evaluations_df
    .merge(solvers_df, left_on="solver_id", right_on="id")
    .drop(columns=["id"])
    .merge(instances_df, left_on="instance_id", right_on="id")
    .drop(columns=["id"])
)
df

Unnamed: 0,solver_id,instance_id,generator,cost,ASCENT_CANDIDATES,BACKBONE_TRIALS,BACKTRACKING,CANDIDATE_SET_TYPE,EXTRA_CANDIDATES,EXTRA_CANDIDATE_SET_TYPE,...,mst_dists_span,mst_dists_coef_of_var,mst_dists_sum,nnds_min,nnds_median,nnds_mean,nnds_max,nnds_sd,nnds_span,nnds_coef_of_var
0,1251473931473582278,TSP/TRAIN/cluster_netgen/000.tsp,cluster_netgen,4.74,0.95,1.0,0.0,0.0,0.2,0.0,...,0.488488,1.790135,0.000225,0.000715,0.011143,0.014421,0.094965,0.012746,0.094250,0.883825
1,2289112522627003788,TSP/TRAIN/cluster_netgen/000.tsp,cluster_netgen,0.02,0.15,0.0,1.0,1.0,0.4,0.0,...,0.488488,1.790135,0.000225,0.000715,0.011143,0.014421,0.094965,0.012746,0.094250,0.883825
2,960932965817811220,TSP/TRAIN/cluster_netgen/000.tsp,cluster_netgen,3.72,0.20,0.0,1.0,2.0,0.9,0.0,...,0.488488,1.790135,0.000225,0.000715,0.011143,0.014421,0.094965,0.012746,0.094250,0.883825
3,39012066323493184,TSP/TRAIN/cluster_netgen/000.tsp,cluster_netgen,1.52,0.60,1.0,1.0,2.0,0.7,0.0,...,0.488488,1.790135,0.000225,0.000715,0.011143,0.014421,0.094965,0.012746,0.094250,0.883825
4,494182449327999052,TSP/TRAIN/cluster_netgen/000.tsp,cluster_netgen,84.44,0.90,1.0,1.0,3.0,0.3,0.0,...,0.488488,1.790135,0.000225,0.000715,0.011143,0.014421,0.094965,0.012746,0.094250,0.883825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1286196444435323941,TSP/TRAIN/grid/019.tsp,grid,208.32,0.45,0.0,0.0,0.0,0.2,0.0,...,0.093952,0.462330,0.000294,0.001335,0.029728,0.032519,0.095287,0.017339,0.093952,0.533201
99996,1435531534300921454,TSP/TRAIN/grid/019.tsp,grid,300.00,0.20,1.0,0.0,3.0,0.6,0.0,...,0.093952,0.462330,0.000294,0.001335,0.029728,0.032519,0.095287,0.017339,0.093952,0.533201
99997,27607668447685341,TSP/TRAIN/grid/019.tsp,grid,3.21,0.95,1.0,1.0,3.0,0.9,0.0,...,0.093952,0.462330,0.000294,0.001335,0.029728,0.032519,0.095287,0.017339,0.093952,0.533201
99998,2245205590089179674,TSP/TRAIN/grid/019.tsp,grid,17.23,0.65,0.0,0.0,1.0,0.5,0.0,...,0.093952,0.462330,0.000294,0.001335,0.029728,0.032519,0.095287,0.017339,0.093952,0.533201


In [3]:
from src.evaluation import evaluate_model_with_cross_validation
from src.split import permutate_df_by_cost_decreasing
from src.split import get_n_splits

In [4]:
# Use the first of the splits returned by the project helper.
train_idx, test_idx = get_n_splits(
    df,
    n=2,
    instance_number=5,
    solver_number=100,
    random_state=0,
)[0]

df_train, df_test = df.loc[train_idx], df.loc[test_idx]

# Identifier columns and the target are excluded from the feature matrices.
not_train_cols = ["solver_id", "instance_id", "generator", "cost"]
X_train, X_test = (part.drop(columns=not_train_cols) for part in (df_train, df_test))

# One constant cut-off per row, for both partitions.
const_cut_off = 1.0
cut_off_train = np.full(len(X_train), const_cut_off)
cut_off_test = np.full(len(X_test), const_cut_off)

# Keep an untouched copy of the raw test costs before they are clipped.
y_test_not_censored = df_test["cost"].to_numpy().copy()

# Targets are the costs clipped into [0, cut-off].
y_train = np.clip(df_train["cost"].to_numpy(), 0, cut_off_train)
y_test = np.clip(df_test["cost"].to_numpy(), 0, cut_off_test)

In [24]:
from scipy.stats import norm

def truncated_normal_mean(mu, sigma, C):
    """Mean of a normal distribution truncated from below at ``C``.

    Computes ``E[X | X > C]`` for ``X ~ N(mu, sigma**2)`` as
    ``mu + sigma * pdf(alpha) / sf(alpha)`` with ``alpha = (C - mu) / sigma``,
    where ``sf = 1 - cdf`` is the survival function of the standard normal.

    The ratio is evaluated in log space. The direct form
    ``pdf(alpha) / (1 - cdf(alpha))`` divides by zero as soon as ``cdf(alpha)``
    rounds to 1.0 (alpha beyond roughly 8) and yields ``inf``; further out the
    pdf underflows as well and the result becomes ``nan``.

    Parameters
    ----------
    mu : float or numpy.ndarray
        Mean of the untruncated normal distribution.
    sigma : float or numpy.ndarray
        Standard deviation of the untruncated distribution. Assumed to be
        positive; this is not validated here.
    C : float or numpy.ndarray
        Lower truncation point.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        The truncated mean, broadcast over the inputs.
    """
    alpha = (C - mu) / sigma
    # pdf / sf computed as exp(log pdf - log sf) to stay finite in the far tail.
    hazard = np.exp(norm.logpdf(alpha) - norm.logsf(alpha))
    return mu + sigma * hazard

# Sanity check: standard normal truncated at 0 -> sqrt(2 / pi) ~= 0.7979.
truncated_normal_mean(0, 1, 0)

0.7978845608028654

In [7]:
# Inspect the training feature matrix (df_train without the columns in not_train_cols).
X_train

Unnamed: 0,ASCENT_CANDIDATES,BACKBONE_TRIALS,BACKTRACKING,CANDIDATE_SET_TYPE,EXTRA_CANDIDATES,EXTRA_CANDIDATE_SET_TYPE,GAIN23,GAIN_CRITERION,INITIAL_STEP_SIZE,INITIAL_TOUR_ALGORITHM,...,mst_dists_span,mst_dists_coef_of_var,mst_dists_sum,nnds_min,nnds_median,nnds_mean,nnds_max,nnds_sd,nnds_span,nnds_coef_of_var
22007,0.90,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.75,3.0,...,0.174616,0.922159,0.000067,0.000010,0.010188,0.014403,0.174626,0.015943,0.174616,1.106919
22015,0.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.50,5.0,...,0.174616,0.922159,0.000067,0.000010,0.010188,0.014403,0.174626,0.015943,0.174616,1.106919
22026,0.95,0.0,0.0,3.0,0.1,0.0,1.0,1.0,0.50,0.0,...,0.174616,0.922159,0.000067,0.000010,0.010188,0.014403,0.174626,0.015943,0.174616,1.106919
22048,0.10,0.0,0.0,1.0,0.8,0.0,1.0,1.0,0.00,3.0,...,0.174616,0.922159,0.000067,0.000010,0.010188,0.014403,0.174626,0.015943,0.174616,1.106919
22071,0.35,0.0,1.0,1.0,0.6,0.0,1.0,0.0,0.50,3.0,...,0.174616,0.922159,0.000067,0.000010,0.010188,0.014403,0.174626,0.015943,0.174616,1.106919
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78913,0.75,1.0,0.0,1.0,0.2,0.0,1.0,0.0,0.50,4.0,...,0.130153,0.791664,0.000086,0.001014,0.013686,0.018296,0.131167,0.016228,0.130153,0.886956
78937,0.10,0.0,0.0,2.0,0.4,0.0,0.0,1.0,0.25,1.0,...,0.130153,0.791664,0.000086,0.001014,0.013686,0.018296,0.131167,0.016228,0.130153,0.886956
78951,0.75,0.0,0.0,3.0,0.4,0.0,0.0,0.0,1.00,4.0,...,0.130153,0.791664,0.000086,0.001014,0.013686,0.018296,0.131167,0.016228,0.130153,0.886956
78970,0.30,1.0,0.0,3.0,0.1,0.0,0.0,0.0,0.00,0.0,...,0.130153,0.791664,0.000086,0.001014,0.013686,0.018296,0.131167,0.016228,0.130153,0.886956
