# Cleaning the data and providing a small and easily parsable database.

The database with the full logs is too large to work with directly. Most of its data is only needed for debugging, not for evaluation.
Thus, we create a clean and small database for the further evaluation.

In [63]:
from _conf import (
    ITERATIONS,
    ITERATION_TIME_LIMIT,
    TIME_LIMIT,
    BASE,
    RESULT_FOLDER,
    INPUT_SAMPLE_ARCHIVE,
    INSTANCE_ARCHIVE,
)

import pandas as pd
from algbench import read_as_pandas, Benchmark
from _utils import parse_solution_overview

In [64]:
# Load the baseline results for comparison. They already carry the instance
# data (num_vars and num_clauses). The generic "sample_size" column is renamed
# so it stays recognisable as the baseline's sample size later on.
baseline_data = pd.read_json("./00_simple_baseline_data.json.zip").rename(
    columns={"sample_size": "baseline_sample_size"}
)
baseline_data

Unnamed: 0,instance_name,num_vars,num_clauses,baseline_alg,baseline_alg_conf,runtime,baseline_sample_size,path_to_baseline_sample
0,calculate,9,15,FIDE-ICPL,t2,1.484,9.0,2023-03-01_13-51-03/1_1_0_1_sample.csv
1,calculate,9,15,FIDE-ICPL,t2,1.280,9.0,2023-03-01_13-51-03/1_1_0_2_sample.csv
2,calculate,9,15,FIDE-ICPL,t2,1.289,9.0,2023-03-01_13-51-03/1_1_0_3_sample.csv
3,calculate,9,15,FIDE-ICPL,t2,1.274,9.0,2023-03-01_13-51-03/1_1_0_4_sample.csv
4,calculate,9,15,FIDE-ICPL,t2,1.277,9.0,2023-03-01_13-51-03/1_1_0_5_sample.csv
...,...,...,...,...,...,...,...,...
2745,freetz,31012,102705,YASA,t2_m10_null,900.008,,
2746,freetz,31012,102705,YASA,t2_m10_null,900.008,,
2747,freetz,31012,102705,YASA,t2_m10_null,900.010,,
2748,freetz,31012,102705,YASA,t2_m10_null,900.009,,


In [65]:
# Get the data from the result folder
def _extract_samplns_row(result):
    """Flatten one benchmark entry into a row dict.

    Returns None for entries that have no result or no (non-empty) solution;
    presumably read_as_pandas skips such entries -- verify against algbench.
    """
    outcome = result.get("result", None)
    if not outcome or not outcome.get("solution", None):
        return None
    parameters = result["parameters"]
    args = parameters["args"]
    return {
        "parameters": parameters,
        "initial_sample_path": args["initial_sample_path"],
        "instance_name": args["instance_name"],
        "lower_bound": outcome["lower_bound"],
        # The upper bound is the number of entries in the returned solution.
        "upper_bound": len(outcome["solution"]),
        "iteration_info": outcome["iteration_info"],
        "time_used_by_yasa": outcome["time_used_by_yasa"],
        "timelimit_for_samplns": outcome["timelimit_for_samplns"],
        "samplns_used": outcome["samplns_used"],
        "runtime": result["runtime"],
        "time_limit": args["time_limit"],
    }


samplns_data = read_as_pandas(RESULT_FOLDER, _extract_samplns_row)
# Keep only the first row for every initial sample.
samplns_data = samplns_data.drop_duplicates(subset=["initial_sample_path"])
samplns_data

Unnamed: 0,parameters,initial_sample_path,instance_name,lower_bound,upper_bound,iteration_info,time_used_by_yasa,timelimit_for_samplns,samplns_used,runtime,time_limit
0,"{'func': 'run_samplns', 'args': {'instance_nam...",2023-03-01_13-51-03/38_1_7_5_sample.csv,busybox_2020-12-16_21-53-05,18.0,23,"[{'nbrhd_tuples': 222, 'nbrhd_confs': 4, 'iter...",2.278,897.722,True,909.288080,900
1,"{'func': 'run_samplns', 'args': {'instance_nam...",2023-03-01_13-51-03/3_1_7_5_sample.csv,email,6.0,6,"[{'nbrhd_tuples': 70, 'nbrhd_confs': 7, 'itera...",0.199,899.801,True,0.172952,900
2,"{'func': 'run_samplns', 'args': {'instance_nam...",2023-03-01_13-51-03/34_1_7_1_sample.csv,WaterlooGenerated,82.0,82,"[{'nbrhd_tuples': 248, 'nbrhd_confs': 36, 'ite...",1.438,898.562,True,637.711078,900
3,"{'func': 'run_samplns', 'args': {'instance_nam...",2023-03-01_13-51-03/7_1_7_5_sample.csv,FeatureIDE,7.0,8,"[{'nbrhd_tuples': 214, 'nbrhd_confs': 8, 'iter...",0.207,899.793,True,905.908822,900
4,"{'func': 'run_samplns', 'args': {'instance_nam...",2023-03-01_13-51-03/7_1_7_4_sample.csv,FeatureIDE,7.0,8,"[{'nbrhd_tuples': 215, 'nbrhd_confs': 9, 'iter...",0.208,899.792,True,905.727147,900
...,...,...,...,...,...,...,...,...,...,...,...
225,"{'func': 'run_samplns', 'args': {'instance_nam...",2023-03-01_13-51-03/19_1_7_1_sample.csv,Violet,14.0,17,"[{'nbrhd_tuples': 178, 'nbrhd_confs': 7, 'iter...",0.288,899.712,True,923.982945,900
226,"{'func': 'run_samplns', 'args': {'instance_nam...",2023-03-01_13-51-03/30_1_7_5_sample.csv,DMIE,14.0,17,"[{'nbrhd_tuples': 241, 'nbrhd_confs': 5, 'iter...",0.597,899.403,True,909.323218,900
227,"{'func': 'run_samplns', 'args': {'instance_nam...",2023-03-01_13-51-03/25_1_7_3_sample.csv,fiasco_2020-12-01_14-09-14,196.0,196,"[{'nbrhd_tuples': 236, 'nbrhd_confs': 19, 'ite...",1.152,898.848,True,686.337217,900
228,"{'func': 'run_samplns', 'args': {'instance_nam...",2023-03-01_13-51-03/14_1_7_3_sample.csv,SortingLine,9.0,9,"[{'nbrhd_tuples': 206, 'nbrhd_confs': 12, 'ite...",0.202,899.798,True,66.414268,900


In [66]:
# Pull the baseline info into the data frame.
# The right join keeps every SampLNS row and attaches the baseline row whose
# sample path matches. Columns present in both frames (instance_name, runtime)
# receive the "_baseline" / "_samplns" suffixes; afterwards the redundant
# baseline instance name and the duplicated join key are dropped, and the
# SampLNS instance name gets its plain name back.
samplns_data_with_baseline_infos = (
    pd.merge(
        baseline_data,
        samplns_data,
        how="right",
        left_on="path_to_baseline_sample",
        right_on="initial_sample_path",
        suffixes=("_baseline", "_samplns"),
    )
    .drop(columns=["instance_name_baseline", "initial_sample_path"])
    .rename(columns={"instance_name_samplns": "instance_name"})
)
samplns_data_with_baseline_infos

Unnamed: 0,num_vars,num_clauses,baseline_alg,baseline_alg_conf,runtime_baseline,baseline_sample_size,path_to_baseline_sample,parameters,instance_name,lower_bound,upper_bound,iteration_info,time_used_by_yasa,timelimit_for_samplns,samplns_used,runtime_samplns,time_limit
0,1050,996,YASA,t2_m1_null,2.278,39.0,2023-03-01_13-51-03/38_1_7_5_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",busybox_2020-12-16_21-53-05,18.0,23,"[{'nbrhd_tuples': 222, 'nbrhd_confs': 4, 'iter...",2.278,897.722,True,909.288080,900
1,10,17,YASA,t2_m1_null,0.199,7.0,2023-03-01_13-51-03/3_1_7_5_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",email,6.0,6,"[{'nbrhd_tuples': 70, 'nbrhd_confs': 7, 'itera...",0.199,899.801,True,0.172952,900
2,580,879,YASA,t2_m1_null,1.438,149.0,2023-03-01_13-51-03/34_1_7_1_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",WaterlooGenerated,82.0,82,"[{'nbrhd_tuples': 248, 'nbrhd_confs': 36, 'ite...",1.438,898.562,True,637.711078,900
3,19,27,YASA,t2_m1_null,0.207,12.0,2023-03-01_13-51-03/7_1_7_5_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",FeatureIDE,7.0,8,"[{'nbrhd_tuples': 214, 'nbrhd_confs': 8, 'iter...",0.207,899.793,True,905.908822,900
4,19,27,YASA,t2_m1_null,0.208,12.0,2023-03-01_13-51-03/7_1_7_4_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",FeatureIDE,7.0,8,"[{'nbrhd_tuples': 215, 'nbrhd_confs': 9, 'iter...",0.208,899.792,True,905.727147,900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,101,203,YASA,t2_m1_null,0.288,27.0,2023-03-01_13-51-03/19_1_7_1_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",Violet,14.0,17,"[{'nbrhd_tuples': 178, 'nbrhd_confs': 7, 'iter...",0.288,899.712,True,923.982945,900
226,366,627,YASA,t2_m1_null,0.597,27.0,2023-03-01_13-51-03/30_1_7_5_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",DMIE,14.0,17,"[{'nbrhd_tuples': 241, 'nbrhd_confs': 5, 'iter...",0.597,899.403,True,909.323218,900
227,258,1542,YASA,t2_m1_null,1.152,224.0,2023-03-01_13-51-03/25_1_7_3_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",fiasco_2020-12-01_14-09-14,196.0,196,"[{'nbrhd_tuples': 236, 'nbrhd_confs': 19, 'ite...",1.152,898.848,True,686.337217,900
228,39,77,YASA,t2_m1_null,0.202,17.0,2023-03-01_13-51-03/14_1_7_3_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",SortingLine,9.0,9,"[{'nbrhd_tuples': 206, 'nbrhd_confs': 12, 'ite...",0.202,899.798,True,66.414268,900


In [67]:
# Check that we have all results: count the rows available for each instance.
runs_per_instance = samplns_data_with_baseline_infos.groupby("instance_name").size()
runs_per_instance

instance_name
APL                            5
APL-Model                      5
BankingSoftware                5
BattleofTanks                  5
ChatClient                     5
DMIE                           5
E-Shop                         5
EMBToolkit                     5
FameDB                         5
FeatureIDE                     5
FreeBSD-8_0_0                  5
PPU                            5
SafeBali                       5
SortingLine                    5
TightVNC                       5
Violet                         5
WaterlooGenerated              5
XSEngine                       5
aaed2000                       5
am31_sim                       5
atlas_mips32_4kc               5
axTLS                          5
berkeleyDB1                    5
berkeleyDB2                    5
busybox-1_18_0                 5
busybox-1_29_2                 5
busybox_2007-01-24_09-14-09    5
busybox_2020-12-16_21-53-05    5
calculate                      5
car                          

In [68]:
# Will stop the notebook if the data is bad: every instance has to appear in
# exactly EXPECTED_RUNS_PER_INSTANCE rows.
EXPECTED_RUNS_PER_INSTANCE = 5
runs_per_instance = samplns_data_with_baseline_infos.groupby(["instance_name"]).size()
# An empty frame has no group that could violate the count, so reject it
# explicitly (the previous min()/max() comparison failed on it as well).
assert not runs_per_instance.empty, "No SampLNS results were loaded."
# Name the offending instances instead of failing with a bare AssertionError.
bad_instances = runs_per_instance[runs_per_instance != EXPECTED_RUNS_PER_INSTANCE]
assert bad_instances.empty, (
    f"Expected {EXPECTED_RUNS_PER_INSTANCE} runs per instance, but got:\n"
    f"{bad_instances.to_string()}"
)

In [69]:
# Persist the cleaned frame as JSON. With pandas' default compression
# inference, the ".zip" suffix should make this a zip-compressed file.
samplns_data_with_baseline_infos.to_json(path_or_buf="./05_clean_data.json.zip")

In [70]:
# This is just for looking into one instance: print the log messages of the
# first stored run of a single instance.
def _extract_row_with_logging(result):
    """Flatten one benchmark entry into a row dict that also carries its log.

    Returns None for entries that have no result or no (non-empty) solution.
    """
    outcome = result.get("result", None)
    if not outcome or not outcome.get("solution", None):
        return None
    parameters = result["parameters"]
    args = parameters["args"]
    return {
        "parameters": parameters,
        "initial_sample_path": args["initial_sample_path"],
        "instance_name": args["instance_name"],
        "lower_bound": outcome["lower_bound"],
        "upper_bound": len(outcome["solution"]),
        "iteration_info": outcome["iteration_info"],
        "time_used_by_yasa": outcome["time_used_by_yasa"],
        "timelimit_for_samplns": outcome["timelimit_for_samplns"],
        "samplns_used": outcome["samplns_used"],
        # Same value as "instance_name"; kept so the columns stay unchanged.
        "instance": args["instance_name"],
        "logging": result["logging"],
    }


t = read_as_pandas(RESULT_FOLDER, _extract_row_with_logging)
t = t[t["instance_name"] == "toybox_2020-12-06_00-02-46"]
first_run_log = t.iloc[0]["logging"]
for log_entry in first_run_log:
    print(log_entry["message"])

Parsed instance 'toybox_2020-12-06_00-02-46' with 334 features and 92 rules.
Preprocessing instance (Instance[toybox_2020-12-06_00-02-46]<334 features, 92 rules>).
Converting instance to CNF (Instance[toybox_2020-12-06_00-02-46|EQ]<334 features, 92 rules>).
Finished converting instance to CNF (Instance[toybox_2020-12-06_00-02-46|EQ|CNF]<334 features, 92 rules>).
Finnished preprocessing (Instance[toybox_2020-12-06_00-02-46|EQ|CNF]<334 features, 92 rules>).
Setting up random neighborhood selector.
Computing feasible tuples...
Converted sample to list representation.
Instance has 206665 feasible tuples.
Neighborhood selector is ready.
Building transaction graph for toybox_2020-12-06_00-02-46|EQ|CNF with 334 concrete features!
All valid configurations were added to the transaction graph.
Setting up random neighborhood selector.
Computing feasible tuples...
Converted sample to list representation.
Instance has 206665 feasible tuples.
Neighborhood selector is ready.
Beginning optimization wi

In [71]:
# Round-trip check: the cleaned file written above must load again.
data = pd.read_json(path_or_buf="./05_clean_data.json.zip")
data

Unnamed: 0,num_vars,num_clauses,baseline_alg,baseline_alg_conf,runtime_baseline,baseline_sample_size,path_to_baseline_sample,parameters,instance_name,lower_bound,upper_bound,iteration_info,time_used_by_yasa,timelimit_for_samplns,samplns_used,runtime_samplns,time_limit
0,1050,996,YASA,t2_m1_null,2.278,39,2023-03-01_13-51-03/38_1_7_5_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",busybox_2020-12-16_21-53-05,18,23,"[{'nbrhd_tuples': 222, 'nbrhd_confs': 4, 'iter...",2.278,897.722,True,909.288080,900
1,10,17,YASA,t2_m1_null,0.199,7,2023-03-01_13-51-03/3_1_7_5_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",email,6,6,"[{'nbrhd_tuples': 70, 'nbrhd_confs': 7, 'itera...",0.199,899.801,True,0.172952,900
2,580,879,YASA,t2_m1_null,1.438,149,2023-03-01_13-51-03/34_1_7_1_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",WaterlooGenerated,82,82,"[{'nbrhd_tuples': 248, 'nbrhd_confs': 36, 'ite...",1.438,898.562,True,637.711078,900
3,19,27,YASA,t2_m1_null,0.207,12,2023-03-01_13-51-03/7_1_7_5_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",FeatureIDE,7,8,"[{'nbrhd_tuples': 214, 'nbrhd_confs': 8, 'iter...",0.207,899.793,True,905.908822,900
4,19,27,YASA,t2_m1_null,0.208,12,2023-03-01_13-51-03/7_1_7_4_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",FeatureIDE,7,8,"[{'nbrhd_tuples': 215, 'nbrhd_confs': 9, 'iter...",0.208,899.792,True,905.727147,900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,101,203,YASA,t2_m1_null,0.288,27,2023-03-01_13-51-03/19_1_7_1_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",Violet,14,17,"[{'nbrhd_tuples': 178, 'nbrhd_confs': 7, 'iter...",0.288,899.712,True,923.982945,900
226,366,627,YASA,t2_m1_null,0.597,27,2023-03-01_13-51-03/30_1_7_5_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",DMIE,14,17,"[{'nbrhd_tuples': 241, 'nbrhd_confs': 5, 'iter...",0.597,899.403,True,909.323218,900
227,258,1542,YASA,t2_m1_null,1.152,224,2023-03-01_13-51-03/25_1_7_3_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",fiasco_2020-12-01_14-09-14,196,196,"[{'nbrhd_tuples': 236, 'nbrhd_confs': 19, 'ite...",1.152,898.848,True,686.337217,900
228,39,77,YASA,t2_m1_null,0.202,17,2023-03-01_13-51-03/14_1_7_3_sample.csv,"{'func': 'run_samplns', 'args': {'instance_nam...",SortingLine,9,9,"[{'nbrhd_tuples': 206, 'nbrhd_confs': 12, 'ite...",0.202,899.798,True,66.414268,900


* `instance_name`: The instance name.
* `num_vars`: The number of variables in the instance.
* `num_clauses`: The number of clauses in the instance.
* `baseline_alg`: The baseline algorithm that created the sample
* `baseline_alg_conf`: The configuration of the baseline algorithm that created the sample.
* `runtime_baseline`: The runtime of the baseline algorithm that created the sample.
* `baseline_sample_size`: The size of the sample created by the baseline algorithm.
* `path_to_baseline_sample`: The path to the sample created by the baseline algorithm.
* `parameters`: The parameters used to run SampLNS.
* `lower_bound`: The lower bound computed by SampLNS.
* `upper_bound`: The upper bound computed by SampLNS.
* `iteration_info`: Information to extract the progress of SampLNS. The times do not include the baseline algorithm.
* `time_used_by_yasa`: The time used by the baseline algorithm.
* `timelimit_for_samplns`: The time limit for SampLNS. See also `parameters` for the full time limit.
* `runtime_samplns`: The runtime of SampLNS without the baseline.
* `time_limit`: The overall time limit.
