In [33]:
import numpy as np
import pandas as pd

from src.constant import DATA_DIR, DATABASE_DIR
from src.database import DB
from src.database.queries import *
from src.instance.InstanceList import InstanceList
from src.instance.TSP_Instance import TSP_from_index_file
from src.solver.TSP_LKH_Solver import TSP_LKH_Solver
from src.aac.SurrogateEstimator import Estimator1

from sklearn.model_selection import KFold
import numpy as np

def create_folds(X, y, n_splits=4):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        folds.append((X_train, y_train, X_test, y_test))
    return folds

In [34]:
generators = [
    "cluster_netgen",
    "compression",
    "expansion",
    "explosion",
    "grid",
    "cluster",
    "implosion",
    "linearprojection",
    "rotation",
    "uniform_portgen",
]

test_instances = TSP_from_index_file(filepath=DATA_DIR / "TSP" / "TEST" / "index.json")

id_to_generator_name = {
    instance.id(): instance.filepath.parts[-2] for instance in test_instances
}

id_to_name = {
    instance.id(): instance.filepath.parts[-1] for instance in test_instances
}

In [35]:
acc = []
rmse = []
training_data_shape = []
non_timeout_training_data_shape = []

PREFIX = "run-500-sur-50"
for db_path in (DATABASE_DIR.glob(f"{PREFIX}-*.db")):
    db = DB(db_path)
    X, y = get_model_training_data(db)

    for X_train, y_train, X_test, y_test in create_folds(X, y):
        estimator = Estimator1(
            max_cost=TSP_LKH_Solver.MAX_COST, estimator_pct=0.5
        )
        estimator.fit(X_train, y_train)
        acc_, rmse_ = estimator.score(X_test, y_test)

        X_, y_ = estimator._training_data_
        training_data_shape_ = X_.shape[0]
        non_timeout_training_data_shape_ = X_[estimator._mask_non_timeout].shape[0]

        acc.append(acc_)
        rmse.append(rmse_)
        training_data_shape.append(training_data_shape_)
        non_timeout_training_data_shape.append(non_timeout_training_data_shape_)

In [36]:
acc = np.mean(acc)
rmse = np.mean(rmse)
training_data_shape = np.mean(training_data_shape)
non_timeout_training_data_shape = np.mean(non_timeout_training_data_shape)
print(f"{acc=}, {rmse=}, {training_data_shape=}, {non_timeout_training_data_shape=}")

acc=0.8280000000000001, rmse=1.9789999999999999, training_data_shape=42045.0, non_timeout_training_data_shape=21425.1


In [12]:
# PREFIX = "run-plain-30"


# PREFIX = "run-plain-250"
# PREFIX = "run-250-sur-50"

# PREFIX = "run-plain-500"
# PREFIX = "run-500-sur-50"

rows = []
for idx, db_path in enumerate(DATABASE_DIR.glob(f"{PREFIX}-*.db")):
    db = DB(db_path)
    results = pd.read_sql_query("SELECT * FROM results", db._conn)
    results_config = results.loc[results["prefix"].str.startswith("config"), :]
    config_cutoff_ratio = (results_config["cost"] == TSP_LKH_Solver.MAX_COST).mean()
    config_error_ratio = (results_config["error"] == 1).mean()
    config_surrogate_ratio = (results_config["surrogate"] == 1).mean()

    results_test = results.loc[results["prefix"].str.startswith("test"), :]
    test_cutoff_ratio = (results_test["cost"] == TSP_LKH_Solver.MAX_COST).mean()
    test_error_ratio = (results_test["error"] == 1).mean()

    rows.append({
        "name": db_path.name,
        "config_cutoff_ratio": config_cutoff_ratio,
        "config_error_ratio": config_error_ratio,
        "config_surrogate_ratio": config_surrogate_ratio,
        "test_cutoff_ratio": test_cutoff_ratio,
        "test_error_ratio": test_error_ratio,
    })

pd.DataFrame(rows).style.format(precision=2)

Unnamed: 0,name,config_cutoff_ratio,config_error_ratio,config_surrogate_ratio,test_cutoff_ratio,test_error_ratio
0,run-500-sur-50-951753.db,0.47,0.05,0.25,0.01,0.0
1,run-500-sur-50-951754.db,0.45,0.08,0.25,0.01,0.0
2,run-500-sur-50-951756.db,0.47,0.06,0.25,0.09,0.08
3,run-500-sur-50-951757.db,0.49,0.05,0.25,0.1,0.0
4,run-500-sur-50-951758.db,0.49,0.06,0.25,0.0,0.0
5,run-500-sur-50-951759.db,0.47,0.07,0.25,0.0,0.0
6,run-500-sur-50-951760.db,0.44,0.06,0.25,0.01,0.0
7,run-500-sur-50-951761.db,0.47,0.07,0.25,0.01,0.0
8,run-500-sur-50-951840.db,0.47,0.05,0.25,0.0,0.0
9,run-500-sur-50-951841.db,0.48,0.05,0.25,0.02,0.0


In [5]:
frames = []

def agg_cost(x):
    count = x.shape[0]
    x = x[x < TSP_LKH_Solver.MAX_COST]
    idx = count // 2
    if x.shape[0] <= idx:
        return x.iloc[-1]
    return x.iloc[idx]
    

for idx, db_path in enumerate(DATABASE_DIR.glob(f"{PREFIX}-*.db")):
    db = DB(db_path)
    results = pd.read_sql_query("SELECT * FROM results", db._conn)
    series = (
        results.loc[results["prefix"].str.startswith("test")]
        .groupby(["instance_id", "prefix"])["cost"]
        .min()
        .reset_index()
        .groupby("instance_id")["cost"]
        .agg(agg_cost)
    )
    frames.append(series)
    

df = (
    pd.concat(frames, axis=1)
    .mean(axis=1)
    .round(2)
    .rename(PREFIX)
    .to_frame()
)

df["generator"] = df.index.map(id_to_generator_name)
df["name"] = df.index.map(id_to_name)
df = df.pivot_table(index="generator", columns="name", values=PREFIX).loc[generators, :]
df["mean"] = df.mean(axis=1)

In [19]:
df.to_excel("tmp1.xlsx")

In [6]:
# 
df

name,000.tsp,001.tsp,002.tsp,003.tsp,004.tsp,mean
generator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cluster_netgen,0.16,0.76,0.86,0.49,1.4,0.734
compression,0.42,0.34,1.6,0.31,0.19,0.572
expansion,0.82,0.31,0.72,0.55,0.19,0.518
explosion,0.58,0.24,0.12,0.52,0.42,0.376
grid,0.38,0.39,0.32,0.82,0.19,0.42
cluster,0.51,0.19,0.35,0.24,0.14,0.286
implosion,0.08,0.31,1.61,0.58,0.27,0.57
linearprojection,0.23,0.6,0.33,0.19,0.39,0.348
rotation,0.41,2.33,0.16,0.23,0.27,0.68
uniform_portgen,1.28,1.03,0.16,0.66,0.23,0.672


In [53]:
df.mean().round(2)

run-30-sur-50    0.52
dtype: float64

In [54]:
times = {}

for idx, db_path in enumerate(DATABASE_DIR.glob(f"{PREFIX}-*.db")):
    db = DB(db_path)
    results = pd.read_sql_query("SELECT * FROM results", db._conn)
    times[idx] = results.loc[
        results["prefix"].str.startswith("config")
        & results["cached"].eq(0)
        & results["surrogate"].eq(0),
        "time",
    ].sum()
    
np.round(pd.Series(times).mean(), 2)

20640.65

In [55]:
pd.Series(times).mean() / 3600

5.73351327659766