# Cleaning the data and providing a small and easily parsable database.

The database containing the full logs is too large to work with directly. Most of its content is only needed for debugging, not for the evaluation.
We therefore create a small, clean database for further evaluation.

In [1]:
import pandas as pd

# Loading the data of the experiment.
from _utils import get_results, parse_sample

# CHECK THAT THESE ARE THE CORRECT PATHS!
# Archive handed to get_results() as `input_sample_archive`.
INPUT_SAMPLE_ARCHIVE = "./00_baseline/900_seconds_5_it.zip"
# Not referenced by the cells shown here; presumably the archive of benchmark
# instances -- TODO confirm where it is used.
INSTANCE_ARCHIVE = "./00_benchmark_instances.zip"
# Folder handed to get_results() as `result_folder`.
RESULT_FOLDER = "./01_results/900_seconds_5_it_900/"


def recache_data():
    """Write a reduced copy of the experiment results to ``./05_clean_data.json.zip``.

    The results are loaded with ``get_results`` using ``INPUT_SAMPLE_ARCHIVE``
    and ``RESULT_FOLDER``. Only the columns listed in ``relevant_columns`` are
    kept, ``SampleSize`` is renamed to ``initial_sample_size`` and
    ``upper_bound`` to ``optimized_sample_size``, and rows sharing the same
    ``initial_sample_path`` are collapsed to a single row (pandas keeps the
    first occurrence by default).

    Returns:
        None. The only effect is the file written to disk.
    """
    relevant_columns = ["instance", "#Variables", "#Clauses", "iteration_info",
                        "baseline_alg",
                        "initial_sample_path", "SampleSize", "lower_bound", "upper_bound"]
    results = get_results(input_sample_archive=INPUT_SAMPLE_ARCHIVE,
                          result_folder=RESULT_FOLDER)
    # Build the cleaned table as a new frame instead of mutating the column
    # selection in place: in-place operations on a selection can trigger
    # pandas' SettingWithCopyWarning and offer no benefit here.
    clean = (
        results[relevant_columns]
        .rename(columns={"SampleSize": "initial_sample_size",
                         "upper_bound": "optimized_sample_size"})
        .drop_duplicates(subset=["initial_sample_path"])
    )
    # The ".zip" suffix makes pandas compress the output (compression is
    # inferred from the file extension by default).
    clean.to_json("./05_clean_data.json.zip")

# Regenerate ./05_clean_data.json.zip from the raw results; the next cell reads it back.
recache_data()

Found the data folders: {'2023-03-01_13-51-03/'}
Removed 0 results because of size constraint.


In [2]:
# Load the cleaned data back from the file written by recache_data().
data = pd.read_json("./05_clean_data.json.zip")
# Bare last expression: shown as the cell's output.
data

Index(['instance', '#Variables', '#Clauses', 'iteration_info', 'baseline_alg',
       'initial_sample_path', 'initial_sample_size', 'lower_bound',
       'optimized_sample_size'],
      dtype='object')

* `instance` Unique name of the feature model.
* `#Variables` Number of variables in the model.
* `#Clauses` Number of clauses in the model.
* `iteration_info` Information on the individual iterations (needs some additional processing).
* `baseline_alg` Name of the algorithm that computed the initial sample.
* `initial_sample_path` Path to the initial sample in the database (used for querying; it also serves as an identifier).
* `initial_sample_size` Size of the initial sample.
* `lower_bound` The lower bound computed by SampLNS.
* `optimized_sample_size` Size of the sample after optimization with SampLNS.