In [1]:
import json
import sys
from pathlib import Path

import numpy as np
import pandas as pd

current_dir = Path.cwd()
parent_dir = str(current_dir.parent.parent)
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)


from src.constant import DATA_DIR, MAIN_DIR
from src.database.db import DB

PHASE1_DIR = MAIN_DIR / "archive" / "phase1_bbob" 
RAW_DIR = PHASE1_DIR / "raw"
PROCESSED_DIR = PHASE1_DIR / "processed"
FEATURES_PATH = DATA_DIR / "BBOB" / "features.json"

In [2]:
with open(FEATURES_PATH, "r") as f:
    features = json.load(f)

features_df = pd.DataFrame([{"id": k, **v["result"]} for k, v in features.items()])

In [3]:
instances_frames = []
evaluations_frames = []

for db_path in RAW_DIR.glob("*.db"):
    print(db_path)

    db = DB(db_path)
    
    instances_df = db.get_instances()
    instances_frames.append(instances_df)

    evaluations_df = db.get_evaluations().drop(columns=["id"])
    evaluations_df["cost"] = np.where(evaluations_df["cost"] == 3000.0, 300.0, evaluations_df["cost"])
    evaluations_frames.append(evaluations_df)

db.get_solvers().to_parquet(PROCESSED_DIR / "solvers.parquet", index=False)

instances_df = pd.concat(instances_frames, ignore_index=True)
instances_df = pd.merge(instances_df, features_df, on="id", how="left")
instances_df.to_parquet(PROCESSED_DIR / "instances.parquet", index=False)

evaluations_df = pd.concat(evaluations_frames, ignore_index=True)
evaluations_df.to_parquet(PROCESSED_DIR / "evaluations.parquet", index=False)

C:\Users\gzakrzewski\Documents\projects\raw-algorithm-portfolios\archive\phase1_bbob\raw\run-phase1-dataset-baseline--25-1128432.db
C:\Users\gzakrzewski\Documents\projects\raw-algorithm-portfolios\archive\phase1_bbob\raw\run-phase1-dataset-baseline--25-1128459.db
C:\Users\gzakrzewski\Documents\projects\raw-algorithm-portfolios\archive\phase1_bbob\raw\run-phase1-dataset-baseline--25-1128463.db
C:\Users\gzakrzewski\Documents\projects\raw-algorithm-portfolios\archive\phase1_bbob\raw\run-phase1-dataset-baseline--25-1128464.db
C:\Users\gzakrzewski\Documents\projects\raw-algorithm-portfolios\archive\phase1_bbob\raw\run-phase1-dataset-baseline--25-1128465.db
C:\Users\gzakrzewski\Documents\projects\raw-algorithm-portfolios\archive\phase1_bbob\raw\run-phase1-dataset-baseline--25-1128466.db
C:\Users\gzakrzewski\Documents\projects\raw-algorithm-portfolios\archive\phase1_bbob\raw\run-phase1-dataset-baseline--25-1128468.db
C:\Users\gzakrzewski\Documents\projects\raw-algorithm-portfolios\archive\pha

In [4]:
evaluations_df = pd.read_parquet(PROCESSED_DIR / "evaluations.parquet")
solvers_df = pd.read_parquet(PROCESSED_DIR / "solvers.parquet")
instances_df = pd.read_parquet(PROCESSED_DIR / "instances.parquet")

df = pd.merge(evaluations_df, solvers_df, left_on="solver_id", right_on="id").drop(columns=["id"])
df = pd.merge(df, instances_df, left_on="instance_id", right_on="id").drop(columns=["id"])
df

Unnamed: 0,solver_id,instance_id,cost,ALGORITHM,CMA_ELITIST,CMA_POPSIZE,CMA_POPSIZE_FACTOR,CMA_RANDOM_INIT,CMA_SCALE,DE_CROSSOVER,...,pca_expl_var_PC1_cor_x,pca_expl_var_PC1_cov_init,pca_expl_var_PC1_cor_init,pca_costs_runtime,ic_h_max,ic_eps_s,ic_eps_max,ic_eps_ratio,ic_m0,ic_costs_runtime
0,410950163550714701,1459556901948702861,0.364175,0.0,1.0,0.955556,0.410492,1.0,0.972919,0.081101,...,0.530700,0.919870,0.522955,0.006267,0.745011,1.106106,2.633336,0.635636,0.392857,0.717704
1,1743092914995070369,1459556901948702861,0.398162,0.0,1.0,0.600000,0.246063,1.0,0.447144,0.449083,...,0.530700,0.919870,0.522955,0.006267,0.745011,1.106106,2.633336,0.635636,0.392857,0.717704
2,2283111023303066572,1459556901948702861,0.055367,1.0,0.0,0.244444,0.426904,1.0,0.406922,0.772266,...,0.530700,0.919870,0.522955,0.006267,0.745011,1.106106,2.633336,0.635636,0.392857,0.717704
3,2253826169615980878,1459556901948702861,0.808347,1.0,1.0,0.111111,0.118728,1.0,0.916723,0.842342,...,0.530700,0.919870,0.522955,0.006267,0.745011,1.106106,2.633336,0.635636,0.392857,0.717704
4,1589992703076004730,1459556901948702861,0.572030,1.0,1.0,0.700000,0.884952,1.0,0.751022,0.878372,...,0.530700,0.919870,0.522955,0.006267,0.745011,1.106106,2.633336,0.635636,0.392857,0.717704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119995,1090844655874379195,1903964392399407975,300.000000,0.0,1.0,0.222222,0.604851,0.0,0.259828,0.415285,...,0.062206,0.990413,0.067438,0.269816,0.868371,1.546547,5.568596,0.995996,0.627756,21.302308
119996,686048789577017900,1903964392399407975,300.000000,0.0,0.0,0.744444,0.570018,0.0,0.793826,0.983013,...,0.062206,0.990413,0.067438,0.269816,0.868371,1.546547,5.568596,0.995996,0.627756,21.302308
119997,243020324673513847,1903964392399407975,300.000000,0.0,1.0,0.644444,0.531420,0.0,0.071196,0.581472,...,0.062206,0.990413,0.067438,0.269816,0.868371,1.546547,5.568596,0.995996,0.627756,21.302308
119998,708619547354767275,1903964392399407975,300.000000,0.0,0.0,0.233333,0.384026,0.0,0.714490,0.829189,...,0.062206,0.990413,0.067438,0.269816,0.868371,1.546547,5.568596,0.995996,0.627756,21.302308


In [5]:
df.groupby("dimension")["cost"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
dimension,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,24000.0,77.38773,128.857061,0.005787,0.607542,1.56607,300.0,300.0
3,24000.0,135.906009,147.036095,0.006001,1.349006,12.922658,300.0,300.0
5,24000.0,181.805552,144.454758,0.008303,3.195017,300.0,300.0,300.0
10,24000.0,215.480567,132.872267,0.012826,11.629091,300.0,300.0,300.0
20,24000.0,232.103117,122.403844,0.022462,300.0,300.0,300.0,300.0


In [6]:
df.groupby("function_index")["cost"].describe().style.background_gradient(cmap="coolwarm", axis=0).format(precision=2)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
function_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,5000.0,67.47,123.39,0.03,0.56,1.52,8.25,300.0
2,5000.0,83.08,131.26,0.08,1.12,3.41,300.0,300.0
3,5000.0,197.59,139.47,0.07,8.75,300.0,300.0,300.0
4,5000.0,242.54,115.43,0.21,300.0,300.0,300.0,300.0
5,5000.0,26.95,85.46,0.01,0.05,0.1,0.28,300.0
6,5000.0,128.58,145.21,0.1,2.12,12.06,300.0,300.0
7,5000.0,120.01,145.57,0.01,0.47,2.93,300.0,300.0
8,5000.0,143.19,146.32,0.1,2.71,23.3,300.0,300.0
9,5000.0,144.44,146.89,0.08,2.38,22.34,300.0,300.0
10,5000.0,170.92,146.39,0.13,3.05,300.0,300.0,300.0


In [7]:
(df["cost"] == 300).value_counts(normalize=True)

True     0.5541
False    0.4459
Name: cost, dtype: float64

In [21]:
thresholds = np.linspace(0.05, 20.0, 400)
for dimension in df["dimension"].unique():
    dimension_df = df.loc[~df["function_index"].isin([3, 4, 15, 18, 19, 20, 22, 23, 24])].loc[df["dimension"] == dimension]
    for t in thresholds:
        cutoff_ratio = (dimension_df["cost"] > t).mean()
        if cutoff_ratio <= 0.6:
            print(f"Dimension: {dimension}, Threshold: {t:.2f}, Cutoff Ratio: {cutoff_ratio:.4f}")
            break

Dimension: 2, Threshold: 0.75, Cutoff Ratio: 0.5967
Dimension: 3, Threshold: 1.55, Cutoff Ratio: 0.5997
Dimension: 5, Threshold: 4.30, Cutoff Ratio: 0.5993


In [None]:
(3/2) ** 3 * 0.75

2.53125

In [19]:
df.loc[~df["function_index"].isin([3, 4, 15, 18, 19, 20, 22, 23, 24])].assign(is_cutoff=lambda x: x["cost"] == 300).pivot_table(index="function_index", columns="dimension", values="is_cutoff").style.background_gradient(cmap="coolwarm", axis=None).format(precision=2)

dimension,2,3,5,10,20
function_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.05,0.08,0.17,0.36,0.45
2,0.11,0.15,0.23,0.39,0.47
5,0.07,0.08,0.08,0.1,0.12
6,0.26,0.25,0.41,0.53,0.64
7,0.06,0.1,0.45,0.64,0.72
8,0.15,0.32,0.57,0.63,0.65
9,0.13,0.3,0.61,0.65,0.66
10,0.35,0.53,0.62,0.66,0.66
11,0.37,0.55,0.63,0.66,0.66
12,0.17,0.63,0.65,0.66,0.66
