# Cleaning the data and providing a small and easily parsable database.

The database with the full logs is too large to work with directly. Most of the data is only needed for debugging, not for the evaluation.
Thus, we create a small, clean database for further evaluation.

In [14]:
from _conf import ITERATIONS, ITERATION_TIME_LIMIT, TIME_LIMIT, BASE, RESULT_FOLDER, INPUT_SAMPLE_ARCHIVE, INSTANCE_ARCHIVE

In [15]:
from algbench import read_as_pandas, Benchmark
from _utils import parse_solution_overview, parse_sample


def get_results(input_sample_archive, result_folder, max_vars=1500):
    """
    Load the experiment results and join them with the initial samples.

    The entries in `result_folder` are read with `read_as_pandas` and merged
    with the overview returned by `parse_solution_overview` for
    `input_sample_archive` (``initial_sample_path`` is matched against
    ``Path``). A ``baseline_alg`` column is added, and rows whose
    ``#Variables`` value exceeds `max_vars` are dropped; the number of
    dropped rows is printed.

    Returns the filtered frame as a copy.
    """

    def extract_row(result):
        # Entries without a result or without a solution yield None
        # (presumably `read_as_pandas` skips those -- verify against algbench).
        outcome = result.get("result", None)
        if not outcome or not outcome.get("solution", None):
            return None
        args = result["parameters"]["args"]
        return {
            "parameters": result["parameters"],
            "initial_sample_path": args["initial_sample_path"],
            "instance_name": args["instance_name"],
            "lower_bound": outcome["lower_bound"],
            # The upper bound is the number of entries in the solution.
            "upper_bound": len(outcome["solution"]),
            "iteration_info": outcome["iteration_info"],
            "time_used_by_yasa": outcome["time_used_by_yasa"],
            "timelimit_for_samplns": outcome["timelimit_for_samplns"],
            "samplns_used": outcome["samplns_used"],
            "instance": args["instance_name"],
        }

    # Load the data of the experiment.
    results = read_as_pandas(result_folder, extract_row)

    # Merge the new data with the data of the initial samples.
    initial_samples = parse_solution_overview(input_sample_archive)
    merged = results.merge(
        initial_samples, left_on="initial_sample_path", right_on="Path"
    )

    def label_baseline(row):
        # Build a readable name for the baseline algorithm that includes the
        # value following the last "_m" in the settings, if there is one.
        settings = row["Settings"]
        if "_m" not in settings:
            return row["Algorithm"]
        m_value = settings.rpartition("_m")[2].partition("_")[0]
        return f"{row['Algorithm']}(m={m_value})"

    merged["baseline_alg"] = merged.apply(label_baseline, axis=1)

    # Drop everything above the size limit and report how much was removed.
    total = len(merged)
    within_limit = merged[merged["#Variables"] <= max_vars].copy()
    print(f"Removed {total-len(within_limit)} results because of size constraint.")
    return within_limit

In [25]:
# Debugging aid: print the log messages of the first stored run of one instance.
DEBUG_INSTANCE = "toybox_2020-12-06_00-02-46"

t = read_as_pandas(
        RESULT_FOLDER,
        lambda result: {
            "parameters": result["parameters"],
            "initial_sample_path": result["parameters"]["args"]["initial_sample_path"],
            "instance_name": result["parameters"]["args"]["instance_name"],
            "lower_bound": result["result"]["lower_bound"],
            "upper_bound": len(result["result"]["solution"]),
            "iteration_info": result["result"]["iteration_info"],
            "time_used_by_yasa": result["result"]["time_used_by_yasa"],
            "timelimit_for_samplns": result["result"]["timelimit_for_samplns"],
            "samplns_used": result["result"]["samplns_used"],
            "instance": result["parameters"]["args"]["instance_name"],
            # Unlike `get_results`, keep the full log for inspection.
            "logging": result["logging"],
        } if result.get("result", None) and  result.get("result", dict()).get("solution", None) else None,
    )
t = t[t["instance_name"] == DEBUG_INSTANCE]
if t.empty:
    # Without this guard, `t.iloc[0]` fails with an opaque
    # "single positional indexer is out-of-bounds" IndexError.
    print(f"No results found for instance '{DEBUG_INSTANCE}'.")
else:
    for e in t.iloc[0]["logging"]:
        print(e["message"])

Parsed instance 'toybox_2020-12-06_00-02-46' with 334 features and 92 rules.
Preprocessing instance (Instance[toybox_2020-12-06_00-02-46]<334 features, 92 rules>).
Converting instance to CNF (Instance[toybox_2020-12-06_00-02-46|EQ]<334 features, 92 rules>).
Finished converting instance to CNF (Instance[toybox_2020-12-06_00-02-46|EQ|CNF]<334 features, 92 rules>).
Finnished preprocessing (Instance[toybox_2020-12-06_00-02-46|EQ|CNF]<334 features, 92 rules>).
Setting up random neighborhood selector.
Computing feasible tuples...
Converted sample to list representation.
Instance has 206665 feasible tuples.
Neighborhood selector is ready.
Building transaction graph for toybox_2020-12-06_00-02-46|EQ|CNF with 334 concrete features!
All valid configurations were added to the transaction graph.
Setting up random neighborhood selector.
Computing feasible tuples...
Converted sample to list representation.
Instance has 206665 feasible tuples.
Neighborhood selector is ready.
Beginning optimization wi

In [17]:
import pandas as pd

# Configuration for loading the data of the experiment.
#
# NOTE: the assignments below shadow TIME_LIMIT, BASE, INPUT_SAMPLE_ARCHIVE,
# INSTANCE_ARCHIVE and RESULT_FOLDER as imported from `_conf` in the first
# code cell; from here on the values defined in this cell are the ones used.
# CHECK THAT THESE ARE THE CORRECT PATHS!
TIME_LIMIT = 900  # presumably seconds, judging by BASE -- TODO confirm

BASE = "900_seconds_5_it"
# Archive with the initial (baseline) samples; passed to `get_results`.
INPUT_SAMPLE_ARCHIVE = f"../01_ICSE_2024_0/00_baseline/{BASE}.zip"
# NOTE(review): the path contains a doubled slash -- presumably harmless, confirm.
INSTANCE_ARCHIVE = "../01_ICSE_2024_0//00_benchmark_instances.zip"
# Evaluates to "01_results/900_seconds_5_it_900" with the values above.
RESULT_FOLDER = f"01_results/{BASE}_{TIME_LIMIT}"



def recache_data(output_path="./05_clean_data.json.zip"):
    """
    Rebuild the small, cleaned result file from the full experiment data.

    Loads the results via `get_results` (using the module-level
    `INPUT_SAMPLE_ARCHIVE` and `RESULT_FOLDER`), keeps only the columns
    relevant for the evaluation, renames the sample-size columns, keeps one
    row per ``initial_sample_path`` and writes the result as JSON.

    Parameters
    ----------
    output_path : str
        Destination of the JSON file. Defaults to the path the rest of the
        notebook reads from.
    """
    relevant_columns = [
        "instance",
        "#Variables",
        "#Clauses",
        "iteration_info",
        "baseline_alg",
        "initial_sample_path",
        "SampleSize",
        "lower_bound",
        "upper_bound",
        "time_used_by_yasa",
        "timelimit_for_samplns",
        "samplns_used",
    ]
    # Build the cleaned frame as one chain instead of mutating a
    # column-sliced frame in place (which pandas discourages and which can
    # trigger SettingWithCopyWarning).
    data = (
        get_results(
            input_sample_archive=INPUT_SAMPLE_ARCHIVE, result_folder=RESULT_FOLDER
        )[relevant_columns]
        .rename(
            columns={
                "SampleSize": "initial_sample_size",
                "upper_bound": "optimized_sample_size",
            }
        )
        # The initial sample path serves as identifier: keep the first row each.
        .drop_duplicates(subset=["initial_sample_path"])
    )
    data.to_json(output_path)


recache_data()

Found the data folders: {'2023-03-01_13-51-03/'}
Removed 0 results because of size constraint.


In [18]:
# Read the cleaned data back from the JSON archive written above and
# display it.
data = pd.read_json("./05_clean_data.json.zip")
data

Unnamed: 0,instance,#Variables,#Clauses,iteration_info,baseline_alg,initial_sample_path,initial_sample_size,lower_bound,optimized_sample_size,time_used_by_yasa,timelimit_for_samplns,samplns_used
0,fiasco_2017-09-26_11-30-56,230,1181,"[{'nbrhd_tuples': 249, 'nbrhd_confs': 25, 'ite...",YASA(m=1),2023-03-01_13-51-03/24_1_7_5_sample.csv,246,225,226,0.982,899.018,True
1,busybox_2020-12-16_21-53-05,1050,996,"[{'nbrhd_tuples': 201, 'nbrhd_confs': 2, 'iter...",YASA(m=1),2023-03-01_13-51-03/38_1_7_2_sample.csv,39,15,23,2.233,897.767,True
2,busybox-1_29_2,1018,997,"[{'nbrhd_tuples': 229, 'nbrhd_confs': 5, 'iter...",YASA(m=1),2023-03-01_13-51-03/37_1_7_4_sample.csv,44,18,25,2.297,897.703,True
3,toybox_2020-12-06_00-02-46,334,92,"[{'nbrhd_tuples': 142, 'nbrhd_confs': 2, 'iter...",YASA(m=1),2023-03-01_13-51-03/29_1_7_1_sample.csv,20,7,14,0.462,899.538,True
4,toybox_2020-12-06_00-02-46,334,92,"[{'nbrhd_tuples': 180, 'nbrhd_confs': 2, 'iter...",YASA(m=1),2023-03-01_13-51-03/29_1_7_5_sample.csv,20,7,14,0.474,899.526,True
5,fiasco_2017-09-26_11-30-56,230,1181,"[{'nbrhd_tuples': 242, 'nbrhd_confs': 26, 'ite...",YASA(m=1),2023-03-01_13-51-03/24_1_7_1_sample.csv,246,222,226,0.968,899.032,True
6,fiasco_2020-12-01_14-09-14,258,1542,"[{'nbrhd_tuples': 243, 'nbrhd_confs': 22, 'ite...",YASA(m=1),2023-03-01_13-51-03/25_1_7_5_sample.csv,224,195,198,1.176,898.824,True
7,fiasco_2017-09-26_11-30-56,230,1181,"[{'nbrhd_tuples': 230, 'nbrhd_confs': 16, 'ite...",YASA(m=1),2023-03-01_13-51-03/24_1_7_4_sample.csv,246,225,225,0.99,899.01,True
8,soletta_2015-06-26_18-38-56,129,192,"[{'nbrhd_tuples': 230, 'nbrhd_confs': 13, 'ite...",YASA(m=1),2023-03-01_13-51-03/21_1_7_1_sample.csv,32,24,24,0.286,899.714,True
9,fiasco_2017-09-26_11-30-56,230,1181,"[{'nbrhd_tuples': 230, 'nbrhd_confs': 26, 'ite...",YASA(m=1),2023-03-01_13-51-03/24_1_7_3_sample.csv,246,224,227,0.941,899.059,True


* `instance`: Unique name of the feature model.
* `#Variables`: Number of variables in the model.
* `#Clauses`: Number of clauses in the model.
* `iteration_info`: Information on the individual iterations (needs some additional processing).
* `baseline_alg`: Name of the algorithm that computed the initial sample.
* `initial_sample_path`: Path to the initial sample in the database (for querying, but it also serves as an identifier).
* `initial_sample_size`: Size of the initial sample.
* `lower_bound`: The lower bound computed by SampLNS.
* `optimized_sample_size`: Size of the sample after optimization with SampLNS.