# Cleaning the data and providing a small and easily parsable database.

The database with the full logs is too large to work with conveniently. Most of its data is only needed for debugging, not for the evaluation.
Thus, we create a small, clean database that contains only what the further evaluation needs.

In [16]:
from algbench import read_as_pandas
from _utils import parse_solution_overview, parse_sample
def get_results(input_sample_archive, result_folder, max_vars=1500):
    """
    Load the experiment results and join them with the overview of the initial samples.

    Args:
        input_sample_archive: Handed to `parse_solution_overview` to obtain the
            overview of the initial samples.
        result_folder: Handed to `read_as_pandas` to load the experiment results.
        max_vars: Results whose `#Variables` value is larger than this are dropped.

    Returns:
        A DataFrame with one row per result whose `initial_sample_path` matches a
        `Path` of the overview (inner join), extended by the column
        `baseline_alg`. The number of rows removed by the size constraint is
        printed.
    """

    def extract_fields(result):
        # Keep only the fields that are needed for the evaluation.
        args = result["parameters"]["args"]
        outcome = result["result"]
        return {
            "initial_sample_path": args["initial_sample_path"],
            "instance_name": args["instance_name"],
            "lower_bound": outcome["lower_bound"],
            "upper_bound": outcome["upper_bound"],
            "iteration_info": outcome["iteration_info"],
            # Same value as `instance_name`, additionally stored under a shorter name.
            "instance": args["instance_name"],
        }

    # Loading the data of the experiment.
    data = read_as_pandas(result_folder, extract_fields)

    # Merge the new data with the data of the initial samples.
    data_initial = parse_solution_overview(input_sample_archive)
    data = data.merge(data_initial, left_on="initial_sample_path", right_on="Path")

    # add a good name for 00_baseline algorithms including the settings
    def baseline_alg_name(algorithm, settings):
        # A missing settings entry (e.g. None/NaN) is not a string and carries
        # no `m` parameter, so the plain algorithm name is used for it.
        if isinstance(settings, str) and "_m" in settings:
            m = settings.split("_m")[-1].split("_")[0]
            return f"{algorithm}(m={m})"
        return algorithm

    # Built element-wise instead of via `DataFrame.apply(axis=1)`: on a frame
    # without rows, `apply` hands back a DataFrame instead of a Series, which
    # cannot be assigned to a single column.
    data["baseline_alg"] = [
        baseline_alg_name(algorithm, settings)
        for algorithm, settings in zip(data["Algorithm"], data["Settings"])
    ]

    n = len(data)
    data = data[data["#Variables"] <= max_vars].copy()
    print(f"Removed {n-len(data)} results because of size constraint.")
    return data

In [17]:
import pandas as pd

# Configuration of the experiment whose results are cleaned in this notebook.
# CHECK THAT THESE ARE THE CORRECT PATHS!

# Name of the baseline run; it is part of the archive and result-folder paths.
BASE = "900_seconds_5_it"
# Suffix of the result folder name.
# NOTE(review): presumably the time limit of the optimization in seconds - confirm.
TIME_LIMIT = 90

# Archive with the initial samples (handed to `get_results`).
INPUT_SAMPLE_ARCHIVE = f"../01_ICSE_2024_0/00_baseline/{BASE}.zip"
INSTANCE_ARCHIVE = "../01_ICSE_2024_0//00_benchmark_instances.zip"
# Folder with the results of the experiment (handed to `get_results`).
RESULT_FOLDER = f"01_results/{BASE}_{TIME_LIMIT}"


def recache_data():
    """
    Rebuild the compact evaluation table and store it as `./05_clean_data.json.zip`.

    The results are fetched via `get_results` for the configured
    `INPUT_SAMPLE_ARCHIVE` and `RESULT_FOLDER`. Only the columns needed for the
    evaluation are kept, the two sample-size columns get descriptive names, and
    only the first row per `initial_sample_path` is kept before the table is
    written to disk. Nothing is returned.
    """
    results = get_results(
        input_sample_archive=INPUT_SAMPLE_ARCHIVE, result_folder=RESULT_FOLDER
    )
    kept_columns = [
        "instance",
        "#Variables",
        "#Clauses",
        "iteration_info",
        "baseline_alg",
        "initial_sample_path",
        "SampleSize",
        "lower_bound",
        "upper_bound",
    ]
    new_names = {
        "SampleSize": "initial_sample_size",
        "upper_bound": "optimized_sample_size",
    }
    # Chained, non-mutating steps: select -> rename -> de-duplicate.
    clean = (
        results[kept_columns]
        .rename(columns=new_names)
        .drop_duplicates(subset=["initial_sample_path"])
    )
    clean.to_json("./05_clean_data.json.zip")


# Rebuild the cached file `./05_clean_data.json.zip` from the experiment results.
recache_data()

Found the data folders: {'2023-03-01_13-51-03/'}
Removed 0 results because of size constraint.


In [18]:
# Load the cleaned table that `recache_data` wrote to disk.
data = pd.read_json("./05_clean_data.json.zip")
# Show the table as the output of this cell.
data

Unnamed: 0,instance,#Variables,#Clauses,iteration_info,baseline_alg,initial_sample_path,initial_sample_size,lower_bound,optimized_sample_size
0,calculate,9,15,"[{'nbrhd_tuples': 38, 'nbrhd_confs': 9, 'itera...",FIDE-ICPL,2023-03-01_13-51-03/1_1_0_1_sample.csv,9,5,5
1,calculate,9,15,"[{'nbrhd_tuples': 38, 'nbrhd_confs': 9, 'itera...",FIDE-ICPL,2023-03-01_13-51-03/1_1_0_2_sample.csv,9,5,5
2,calculate,9,15,"[{'nbrhd_tuples': 38, 'nbrhd_confs': 9, 'itera...",FIDE-ICPL,2023-03-01_13-51-03/1_1_0_3_sample.csv,9,5,5
3,calculate,9,15,"[{'nbrhd_tuples': 38, 'nbrhd_confs': 9, 'itera...",FIDE-ICPL,2023-03-01_13-51-03/1_1_0_4_sample.csv,9,5,5
4,calculate,9,15,"[{'nbrhd_tuples': 38, 'nbrhd_confs': 9, 'itera...",FIDE-ICPL,2023-03-01_13-51-03/1_1_0_5_sample.csv,9,5,5
...,...,...,...,...,...,...,...,...,...
95,berkeleyDB2,119,346,"[{'nbrhd_tuples': 204, 'nbrhd_confs': 5, 'iter...",FIDE-ICPL,2023-03-01_13-51-03/20_1_0_1_sample.csv,24,11,14
96,berkeleyDB2,119,346,"[{'nbrhd_tuples': 244, 'nbrhd_confs': 6, 'iter...",FIDE-ICPL,2023-03-01_13-51-03/20_1_0_2_sample.csv,24,11,14
97,berkeleyDB2,119,346,"[{'nbrhd_tuples': 247, 'nbrhd_confs': 4, 'iter...",FIDE-ICPL,2023-03-01_13-51-03/20_1_0_3_sample.csv,24,11,14
98,berkeleyDB2,119,346,"[{'nbrhd_tuples': 98, 'nbrhd_confs': 4, 'itera...",FIDE-ICPL,2023-03-01_13-51-03/20_1_0_4_sample.csv,24,10,15


* `instance`: Unique name of the feature model.
* `#Variables`: Number of variables in the model.
* `#Clauses`: Number of clauses in the model.
* `iteration_info`: Information on the individual iterations (needs some additional processing).
* `baseline_alg`: Name of the algorithm that computed the initial sample.
* `initial_sample_path`: Path to the initial sample in the database (for querying, but it also serves as an identifier).
* `initial_sample_size`: Size of the initial sample.
* `lower_bound`: The lower bound computed by SampLNS.
* `optimized_sample_size`: Size of the sample after optimization with SampLNS.