In [33]:
import numpy as np

from src.constant import DATA_DIR, DATABASE_DIR
from src.database import DB
from src.database.queries import *
from src.instance.TSP_Instance import TSP_from_index_file
from src.solver.TSP_LKH_Solver import TSP_LKH_Solver
from src.aac.SurrogateEstimator import Estimator1

from sklearn.model_selection import KFold
import numpy as np

def create_folds(X, y, n_splits=4, shuffle=True, random_state=42):
    """Partition ``X`` and ``y`` into K-fold train/test splits.

    Parameters
    ----------
    X, y : array-like
        Samples and targets. Both are indexed with the integer index arrays
        produced by ``KFold.split`` (``X[idx]`` / ``y[idx]``), so they must
        support that form of indexing (e.g. NumPy arrays).
    n_splits : int, default 4
        Number of folds, forwarded to ``KFold``.
    shuffle : bool, default True
        Whether ``KFold`` shuffles the samples before splitting.
    random_state : int or None, default 42
        Seed forwarded to ``KFold`` when ``shuffle`` is true. It is ignored
        when ``shuffle`` is false.

    Returns
    -------
    list of tuple
        One ``(X_train, y_train, X_test, y_test)`` tuple per fold, in the
        order the folds are yielded by ``KFold.split``.
    """
    # KFold rejects a non-None random_state together with shuffle=False in
    # current scikit-learn releases, so only forward the seed when shuffling.
    seed = random_state if shuffle else None
    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=seed)
    return [
        (X[train_index], y[train_index], X[test_index], y[test_index])
        for train_index, test_index in kf.split(X)
    ]

In [34]:
# Generator names, kept in their original order.
generators = [
    "cluster_netgen", "compression", "expansion", "explosion", "grid",
    "cluster", "implosion", "linearprojection", "rotation", "uniform_portgen",
]

# Instances described by the TEST index file.
test_index_path = DATA_DIR / "TSP" / "TEST" / "index.json"
test_instances = TSP_from_index_file(filepath=test_index_path)

# instance id -> second-to-last component of the instance file path
# (presumably the generator directory -- confirm against the data layout).
id_to_generator_name = dict(
    (inst.id(), inst.filepath.parts[-2]) for inst in test_instances
)

# instance id -> last component of the instance file path (the file name).
id_to_name = dict(
    (inst.id(), inst.filepath.parts[-1]) for inst in test_instances
)

In [35]:
# One entry per (database, fold) pair; the four lists are filled in lockstep.
acc, rmse = [], []
training_data_shape, non_timeout_training_data_shape = [], []

PREFIX = "run-500-sur-50"
for db_path in DATABASE_DIR.glob(f"{PREFIX}-*.db"):
    db = DB(db_path)
    X, y = get_model_training_data(db)

    for X_train, y_train, X_test, y_test in create_folds(X, y):
        # A fresh estimator is fitted on every fold.
        estimator = Estimator1(max_cost=TSP_LKH_Solver.MAX_COST, estimator_pct=0.5)
        estimator.fit(X_train, y_train)
        acc_, rmse_ = estimator.score(X_test, y_test)

        # Row counts of the data the estimator kept after fitting: all rows,
        # and the rows selected by the estimator's non-timeout mask.
        X_, y_ = estimator._training_data_
        training_data_shape_ = X_.shape[0]
        non_timeout_training_data_shape_ = X_[estimator._mask_non_timeout].shape[0]

        # Append together, after everything has been computed, so a failure
        # in this fold cannot leave the lists with different lengths.
        acc.append(acc_)
        rmse.append(rmse_)
        training_data_shape.append(training_data_shape_)
        non_timeout_training_data_shape.append(non_timeout_training_data_shape_)

In [36]:
# Collapse each per-fold collection into its mean. All four means are
# evaluated before any name is rebound.
acc, rmse, training_data_shape, non_timeout_training_data_shape = (
    np.mean(per_fold_values)
    for per_fold_values in (
        acc,
        rmse,
        training_data_shape,
        non_timeout_training_data_shape,
    )
)
print(f"{acc=}, {rmse=}, {training_data_shape=}, {non_timeout_training_data_shape=}")

acc=0.8280000000000001, rmse=1.9789999999999999, training_data_shape=42045.0, non_timeout_training_data_shape=21425.1
