In [2]:
import time

import numpy as np
import pandas as pd
import psutil

from src.configspace import RANDOM_FOREST_CONFIGSPACE
from src.constant import HO, PROCESSED_DATA_DIR, RESULTS_BASE_DIR
from src.evaluation import evaluate_model_with_cross_validation
from src.hyperparameter_optimization import optimize_hyperparameters
from src.model import RandomForestRegressor
from src.split import get_n_splits
from src.wrapper import ScikitLearnWrapper

In [3]:
# Read the three processed tables from parquet.
evaluations_df = pd.read_parquet(PROCESSED_DATA_DIR / "evaluations.parquet")
solvers_df = pd.read_parquet(PROCESSED_DATA_DIR / "solvers.parquet")
instances_df = pd.read_parquet(PROCESSED_DATA_DIR / "instances.parquet")

# Attach the solver columns and then the instance columns to every evaluation
# row (inner joins). The right-hand "id" key is dropped after each join because
# it duplicates solver_id / instance_id.
df = (
    evaluations_df
    .merge(solvers_df, left_on="solver_id", right_on="id")
    .drop(columns=["id"])
    .merge(instances_df, left_on="instance_id", right_on="id")
    .drop(columns=["id"])
)
df

Unnamed: 0,solver_id,instance_id,generator,cost,ASCENT_CANDIDATES,BACKBONE_TRIALS,BACKTRACKING,CANDIDATE_SET_TYPE,EXTRA_CANDIDATES,EXTRA_CANDIDATE_SET_TYPE,...,mst_dists_span,mst_dists_coef_of_var,mst_dists_sum,nnds_min,nnds_median,nnds_mean,nnds_max,nnds_sd,nnds_span,nnds_coef_of_var
0,1251473931473582278,TSP/TRAIN/cluster_netgen/000.tsp,cluster_netgen,4.74,0.95,1.0,0.0,0.0,0.2,0.0,...,0.488488,1.790135,0.000225,0.000715,0.011143,0.014421,0.094965,0.012746,0.094250,0.883825
1,2289112522627003788,TSP/TRAIN/cluster_netgen/000.tsp,cluster_netgen,0.02,0.15,0.0,1.0,1.0,0.4,0.0,...,0.488488,1.790135,0.000225,0.000715,0.011143,0.014421,0.094965,0.012746,0.094250,0.883825
2,960932965817811220,TSP/TRAIN/cluster_netgen/000.tsp,cluster_netgen,3.72,0.20,0.0,1.0,2.0,0.9,0.0,...,0.488488,1.790135,0.000225,0.000715,0.011143,0.014421,0.094965,0.012746,0.094250,0.883825
3,39012066323493184,TSP/TRAIN/cluster_netgen/000.tsp,cluster_netgen,1.52,0.60,1.0,1.0,2.0,0.7,0.0,...,0.488488,1.790135,0.000225,0.000715,0.011143,0.014421,0.094965,0.012746,0.094250,0.883825
4,494182449327999052,TSP/TRAIN/cluster_netgen/000.tsp,cluster_netgen,84.44,0.90,1.0,1.0,3.0,0.3,0.0,...,0.488488,1.790135,0.000225,0.000715,0.011143,0.014421,0.094965,0.012746,0.094250,0.883825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1286196444435323941,TSP/TRAIN/grid/019.tsp,grid,208.32,0.45,0.0,0.0,0.0,0.2,0.0,...,0.093952,0.462330,0.000294,0.001335,0.029728,0.032519,0.095287,0.017339,0.093952,0.533201
99996,1435531534300921454,TSP/TRAIN/grid/019.tsp,grid,300.00,0.20,1.0,0.0,3.0,0.6,0.0,...,0.093952,0.462330,0.000294,0.001335,0.029728,0.032519,0.095287,0.017339,0.093952,0.533201
99997,27607668447685341,TSP/TRAIN/grid/019.tsp,grid,3.21,0.95,1.0,1.0,3.0,0.9,0.0,...,0.093952,0.462330,0.000294,0.001335,0.029728,0.032519,0.095287,0.017339,0.093952,0.533201
99998,2245205590089179674,TSP/TRAIN/grid/019.tsp,grid,17.23,0.65,0.0,0.0,1.0,0.5,0.0,...,0.093952,0.462330,0.000294,0.001335,0.029728,0.032519,0.095287,0.017339,0.093952,0.533201


In [4]:
# Build the splits that are shared by the hyperparameter search, the
# cross-validated evaluation and the manual timing experiment below.
# NOTE(review): presumably n=3 splits, each sampling 10 instances and 90 solvers
# with a fixed seed -- confirm against src.split.get_n_splits.
SPLITS = get_n_splits(
    df=df,
    n=3,
    instance_number=10,
    solver_number=90,
    random_state=0,
)

### RandomForestRegressor

In [5]:
# Run the hyperparameter search for the random forest over RANDOM_FOREST_CONFIGSPACE,
# scored on the shared SPLITS; seed and trial budget come from the HO constants.
# NOTE(review): `filepath` looks like the location where the incumbent is
# persisted/cached -- confirm in src.hyperparameter_optimization.
rf_incumbent = optimize_hyperparameters(
    df=df,
    model_cls=RandomForestRegressor,
    wrapper_cls=ScikitLearnWrapper,
    configspace=RANDOM_FOREST_CONFIGSPACE,
    splits=SPLITS,
    random_state=HO.RANDOM_STATE,
    n_trials=HO.N_TRIALS,
    filepath=RESULTS_BASE_DIR / "HO" / "rf_incumbent.pkl",
)
# Display the selected configuration.
rf_incumbent

{'ccp_alpha': 0.0010550682433,
 'max_depth': 11,
 'max_features': 0.5492108995022,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_jobs': -1,
 'random_state': 0,
 'model_cls': sklearn.ensemble._forest.RandomForestRegressor}

In [5]:
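# Optional manual overrides of the incumbent -- currently disabled.
# (The incumbent displayed above already has n_jobs=-1.)
# rf_incumbent["n_jobs"] = -1
# rf_incumbent["n_estimators"] = 1000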


In [6]:
# Instantiate the wrapper from the incumbent. The dict is unpacked as keyword
# arguments, so the wrapper receives `model_cls` together with the hyperparameters.
wrapper = ScikitLearnWrapper(**rf_incumbent)

In [7]:
# Evaluate the incumbent configuration on the same SPLITS used for the search.
# NOTE(review): `result` is stored but not displayed in this cell -- inspect it
# in a follow-up cell if the scores are needed.
result = evaluate_model_with_cross_validation(
    df,
    wrapper=wrapper,
    splits=SPLITS,
    random_state=0,
)

In [15]:
# Rebuild the train/test data of the first split by hand.
train_idx, test_idx = SPLITS[0]
df_train, df_test = df.loc[train_idx], df.loc[test_idx]

# Identifier columns and the target itself are excluded from the features.
not_train_cols = ["solver_id", "instance_id", "generator", "cost"]
X_train, X_test = (part.drop(columns=not_train_cols) for part in (df_train, df_test))

y_train = df_train["cost"].to_numpy()
y_test = df_test["cost"].to_numpy()
# Keep an untouched copy of the test targets before clipping is applied.
y_test_not_censored = y_test.copy()

# One cut-off per row, all infinite: the clipping below therefore only
# enforces the lower bound of 0 and never truncates from above.
cut_off_train = np.full(len(X_train), np.inf)
cut_off_test = np.full(len(X_test), np.inf)

y_train = y_train.clip(0, cut_off_train)
y_test = y_test.clip(0, cut_off_test)

In [16]:
# Time a single fit of the wrapper on the first split.
# Three clocks are compared:
#   * psutil process CPU times (user / system, process-wide),
#   * time.process_time() (user + system CPU time of this process),
#   * a monotonic wall clock (time.perf_counter()).
# `psutil` and `time` are imported in the notebook's import cell.
process = psutil.Process()
cpu_times_start = process.cpu_times()
start_process_time = time.process_time()
# perf_counter() is monotonic and high-resolution; time.time() can jump when
# the system clock is adjusted, which would corrupt a short interval.
start = time.perf_counter()

wrapper.fit(X_train, y_train, cut_off_train)

end = time.perf_counter()
end_process_time = time.process_time()
cpu_times_end = process.cpu_times()

cpu_time_used_user = cpu_times_end.user - cpu_times_start.user
cpu_time_used_system = cpu_times_end.system - cpu_times_start.system
cpu_time_used_total = cpu_time_used_user + cpu_time_used_system
# Same quantity as the total; the name is kept in case other cells read it.
cpu_time_used = cpu_time_used_total
process_time = end_process_time - start_process_time
# Elapsed wall-clock seconds. With a parallel fit this can be well below the
# CPU totals, because CPU time accumulates across all workers.
time_time = end - start

print(f"{cpu_time_used_user=:.2f}s")
print(f"{cpu_time_used_system=:.2f}s")
print(f"{cpu_time_used_total=:.2f}s")
print(f"{process_time=:.2f}s")
print(f"{time_time=:.2f}s")

cpu_time_used_user=5.31s
cpu_time_used_system=0.42s
cpu_time_used_total=5.73s
process_time=5.73s
time_time=1.59s
