In [16]:
import json
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Put the directory two levels above the working directory at the front of
# sys.path (once) so the `src.*` imports below can be resolved.
# NOTE(review): presumably that directory is the project root — confirm.
current_dir = Path.cwd()
parent_dir = str(current_dir.parent.parent)
already_importable = parent_dir in sys.path
if not already_importable:
    sys.path.insert(0, parent_dir)


from src.constant import DATA_DIR, MAIN_DIR
from src.database.db import DB

# Locations used by this notebook: the phase-1 BBOB archive (raw *.db inputs
# and processed parquet outputs) and the JSON file with instance features.
PHASE1_DIR = MAIN_DIR.joinpath("archive", "phase1_bbob")
RAW_DIR = PHASE1_DIR.joinpath("raw")
PROCESSED_DIR = PHASE1_DIR.joinpath("processed")
FEATURES_PATH = DATA_DIR.joinpath("BBOB", "features.json")

In [22]:
# Load the feature file: a JSON object keyed by instance id whose values each
# carry a "result" mapping.
with FEATURES_PATH.open("r") as f:
    features = json.load(f)

# One row per key: the key becomes the "id" column and the entries of
# "result" become the remaining columns.
feature_records = []
for key, entry in features.items():
    feature_records.append({"id": key, **entry["result"]})
features_df = pd.DataFrame(feature_records)

In [38]:
# Consolidate every raw run database in RAW_DIR into three parquet files
# (solvers, instances, evaluations) under PROCESSED_DIR.

# Evaluation costs stored as exactly 3000.0 are rewritten to 300.0.
# NOTE(review): presumably 3000.0 is a mis-scaled cut-off penalty — confirm.
RAW_PENALTY_COST = 3000.0
PENALTY_COST = 300.0

# Sort the paths: glob order is filesystem-dependent, and the row order of the
# concatenated outputs should be reproducible.
db_paths = sorted(RAW_DIR.glob("*.db"))
if not db_paths:
    # Fail with a clear message instead of a NameError / "No objects to
    # concatenate" further down.
    raise FileNotFoundError(f"No *.db files found in {RAW_DIR}")

PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

instances_frames = []
evaluations_frames = []
solvers_frames = []

for db_path in db_paths:
    print(db_path)

    db = DB(db_path)

    instances_frames.append(db.get_instances())

    evaluations_df = db.get_evaluations().drop(columns=["id"])
    evaluations_df["cost"] = np.where(
        evaluations_df["cost"] == RAW_PENALTY_COST,
        PENALTY_COST,
        evaluations_df["cost"],
    )
    evaluations_frames.append(evaluations_df)

    # Collect solvers from every database rather than only from whichever one
    # the loop happened to visit last.
    solvers_frames.append(db.get_solvers())

# The same solver may be present in several databases; keep one row per id.
solvers_df = (
    pd.concat(solvers_frames, ignore_index=True)
    .drop_duplicates(subset="id")
    .reset_index(drop=True)
)
solvers_df.to_parquet(PROCESSED_DIR / "solvers.parquet", index=False)

# Attach the instance features; the left join keeps instances without features.
instances_df = pd.concat(instances_frames, ignore_index=True)
instances_df = pd.merge(instances_df, features_df, on="id", how="left")
instances_df.to_parquet(PROCESSED_DIR / "instances.parquet", index=False)

evaluations_df = pd.concat(evaluations_frames, ignore_index=True)
evaluations_df.to_parquet(PROCESSED_DIR / "evaluations.parquet", index=False)

C:\Users\zakrz\Documents\DataScience\praca magisterska\raw-algorithm-portfolios\archive\phase1_bbob\raw\run-phase1-dataset-baseline--25-1128432.db
C:\Users\zakrz\Documents\DataScience\praca magisterska\raw-algorithm-portfolios\archive\phase1_bbob\raw\run-phase1-dataset-baseline--25-1128459.db
C:\Users\zakrz\Documents\DataScience\praca magisterska\raw-algorithm-portfolios\archive\phase1_bbob\raw\run-phase1-dataset-baseline--25-1128463.db
C:\Users\zakrz\Documents\DataScience\praca magisterska\raw-algorithm-portfolios\archive\phase1_bbob\raw\run-phase1-dataset-baseline--25-1128465.db
C:\Users\zakrz\Documents\DataScience\praca magisterska\raw-algorithm-portfolios\archive\phase1_bbob\raw\run-phase1-dataset-baseline--25-1128466.db
C:\Users\zakrz\Documents\DataScience\praca magisterska\raw-algorithm-portfolios\archive\phase1_bbob\raw\run-phase1-dataset-baseline--25-1128468.db


In [39]:
# Reload the processed tables from disk so this cell does not depend on the
# in-memory frames built by the consolidation cell.
evaluations_df = pd.read_parquet(PROCESSED_DIR / "evaluations.parquet")
solvers_df = pd.read_parquet(PROCESSED_DIR / "solvers.parquet")
instances_df = pd.read_parquet(PROCESSED_DIR / "instances.parquet")

# One row per evaluation, widened with the columns of its solver and its
# instance; the joined "id" key is dropped after each merge because it
# duplicates solver_id / instance_id.
df = (
    evaluations_df
    .merge(solvers_df, left_on="solver_id", right_on="id")
    .drop(columns=["id"])
    .merge(instances_df, left_on="instance_id", right_on="id")
    .drop(columns=["id"])
)
df

Unnamed: 0,solver_id,instance_id,cost,ALGORITHM,CMA_ELITIST,CMA_POPSIZE,CMA_POPSIZE_FACTOR,CMA_RANDOM_INIT,CMA_SCALE,DE_CROSSOVER,...,pca_expl_var_PC1_cor_x,pca_expl_var_PC1_cov_init,pca_expl_var_PC1_cor_init,pca_costs_runtime,ic_h_max,ic_eps_s,ic_eps_max,ic_eps_ratio,ic_m0,ic_costs_runtime
0,410950163550714701,1459556901948702861,0.364175,0.0,1.0,0.955556,0.410492,1.0,0.972919,0.081101,...,0.530700,0.91987,0.522955,0.006267,0.745011,1.106106,2.633336,0.635636,0.392857,0.717704
1,1743092914995070369,1459556901948702861,0.398162,0.0,1.0,0.600000,0.246063,1.0,0.447144,0.449083,...,0.530700,0.91987,0.522955,0.006267,0.745011,1.106106,2.633336,0.635636,0.392857,0.717704
2,2283111023303066572,1459556901948702861,0.055367,1.0,0.0,0.244444,0.426904,1.0,0.406922,0.772266,...,0.530700,0.91987,0.522955,0.006267,0.745011,1.106106,2.633336,0.635636,0.392857,0.717704
3,2253826169615980878,1459556901948702861,0.808347,1.0,1.0,0.111111,0.118728,1.0,0.916723,0.842342,...,0.530700,0.91987,0.522955,0.006267,0.745011,1.106106,2.633336,0.635636,0.392857,0.717704
4,1589992703076004730,1459556901948702861,0.572030,1.0,1.0,0.700000,0.884952,1.0,0.751022,0.878372,...,0.530700,0.91987,0.522955,0.006267,0.745011,1.106106,2.633336,0.635636,0.392857,0.717704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,1090844655874379195,272533663176857570,300.000000,0.0,1.0,0.222222,0.604851,0.0,0.259828,0.415285,...,0.062206,0.99992,0.091769,0.235030,0.846320,2.527528,48.605642,1.926927,0.574148,23.141772
29996,686048789577017900,272533663176857570,300.000000,0.0,0.0,0.744444,0.570018,0.0,0.793826,0.983013,...,0.062206,0.99992,0.091769,0.235030,0.846320,2.527528,48.605642,1.926927,0.574148,23.141772
29997,243020324673513847,272533663176857570,300.000000,0.0,1.0,0.644444,0.531420,0.0,0.071196,0.581472,...,0.062206,0.99992,0.091769,0.235030,0.846320,2.527528,48.605642,1.926927,0.574148,23.141772
29998,708619547354767275,272533663176857570,300.000000,0.0,0.0,0.233333,0.384026,0.0,0.714490,0.829189,...,0.062206,0.99992,0.091769,0.235030,0.846320,2.527528,48.605642,1.926927,0.574148,23.141772


In [44]:
# Summary statistics of "cost", grouped by the "dimension" column.
df["cost"].groupby(df["dimension"]).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
dimension,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,6000.0,39.686696,100.002204,0.005787,0.235768,0.626595,1.833461,300.0
3,6000.0,53.226619,112.351745,0.006001,0.369284,1.084515,5.376524,300.0
5,6000.0,102.316969,139.471425,0.008303,0.881324,3.401679,300.0,300.0
10,6000.0,151.581513,148.321969,0.012826,2.002552,42.582452,300.0,300.0
20,6000.0,172.926663,145.60793,0.022462,5.005857,300.0,300.0,300.0


In [47]:
# Summary statistics of "cost", grouped by the "function_index" column.
df["cost"].groupby(df["function_index"]).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
function_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,5000.0,67.473944,123.391899,0.028009,0.561519,1.518596,8.248689,300.0
2,5000.0,83.080386,131.259735,0.084861,1.120768,3.405683,300.0,300.0
3,5000.0,197.593135,139.469002,0.067904,8.75477,300.0,300.0,300.0
5,5000.0,26.949779,85.460842,0.005787,0.049803,0.104996,0.283184,300.0
6,5000.0,128.57709,145.2082,0.104814,2.123924,12.060658,300.0,300.0
7,5000.0,120.011817,145.566668,0.014824,0.471457,2.9345,300.0,300.0


In [48]:
# Fraction of evaluations whose cost is exactly 300 versus all others.
df["cost"].eq(300).value_counts(normalize=True)

False    0.660267
True     0.339733
Name: cost, dtype: float64

In [69]:
# For each dimension, report the smallest threshold on the grid for which at
# most MAX_CUTOFF_RATIO of the evaluations have a cost above the threshold.
MAX_CUTOFF_RATIO = 0.6
thresholds = np.linspace(0.05, 20.0, 400)
for dimension in df["dimension"].unique():
    dimension_df = df.loc[df["dimension"] == dimension]
    costs = dimension_df["cost"]  # loop-invariant, looked up once per dimension
    for t in thresholds:
        cutoff_ratio = (costs > t).mean()
        if cutoff_ratio <= MAX_CUTOFF_RATIO:
            print(f"Dimension: {dimension}, Threshold: {t:.2f}, Cutoff Ratio: {cutoff_ratio:.4f}")
            break
    else:
        # No grid value satisfied the target: say so instead of silently
        # omitting the dimension from the output.
        print(
            f"Dimension: {dimension}, no threshold up to {thresholds[-1]:.2f} "
            f"gives a cutoff ratio <= {MAX_CUTOFF_RATIO}"
        )

Dimension: 2, Threshold: 0.45, Cutoff Ratio: 0.5977
Dimension: 3, Threshold: 0.75, Cutoff Ratio: 0.5847
Dimension: 5, Threshold: 1.85, Cutoff Ratio: 0.5980
Dimension: 10, Threshold: 5.40, Cutoff Ratio: 0.5982
Dimension: 20, Threshold: 17.25, Cutoff Ratio: 0.5998
