In [None]:
import os

# Pick the project root from the hosting environment. The hosted environments
# (Colab, Paperspace) also point ClearML at the config file stored next to the
# project; a local run leaves CLEARML_CONFIG_FILE untouched.
if "COLAB_GPU" in os.environ:
  from google.colab import drive
  print("Hello, Colab")
  drive.mount("/content/drive")
  ROOT_PATH = "/content/drive/MyDrive/hero"
  os.environ["CLEARML_CONFIG_FILE"] = f"{ROOT_PATH}/clearml.conf"
elif "PAPERSPACE_CLUSTER_ID" in os.environ:
  print("Hello, Paperspace")
  ROOT_PATH = "/notebooks/hero"
  os.environ["CLEARML_CONFIG_FILE"] = f"{ROOT_PATH}/clearml.conf"
else:
  print("Hello, Local PC")
  # Locally the project root is taken to be four directory levels above the
  # directory the notebook is started from.
  ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd()))))

# Derived locations (defined once; the original cell repeated this group twice).
BTCNN_PATH = os.path.join(ROOT_PATH, "btcnn/src/btcnn")
HBO_BENCH_PATH = os.path.join(ROOT_PATH, "hbo_bench/src/hbo_bench")
EXPERIMENT_PATH = os.getcwd()  # models/ and artifacts/ are resolved relative to the working directory
ARTIFACTS_PATH = os.path.join(EXPERIMENT_PATH, "artifacts")

In [None]:
import random
from collections import defaultdict
from matplotlib import pyplot as plt
import seaborn as sns
from json import dump, load

import torch
import pandas as pd
import numpy as np

from hero.hero import Hero
from hero.wrappers import ORACLES_DICT, initialize_oracles, _get_e2e_time, _get_execution_time, _get_planning_time, _get_logical_tree
from hero.neural_network import NN, get_bt_regressor
from hero.train_utils import load_model
from hero.emulation import get_report, emulate_online_learning

from hbo_bench.local_search_settings import *
from hbo_bench.data_config import DEFAULT_DOP, DEFAULT_HINTSET

In [None]:
# Set up the oracles for the two benchmarks used in this notebook, reading from HBO_BENCH_PATH.
# NOTE(review): presumably this is what fills ORACLES_DICT["JOB"] / ORACLES_DICT["sample_queries"],
# which the workload cell below indexes — confirm in hero.wrappers.
initialize_oracles(HBO_BENCH_PATH, ["JOB", "sample_queries"])

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"device is {DEVICE}")

In [None]:
# Query names of each benchmark, as reported by its oracle (JOB first, then sample_queries).
job_workload, sq_workload = (
    ORACLES_DICT[benchmark].get_query_names()
    for benchmark in ("JOB", "sample_queries")
)

# Static Workload

## Ideal Case (all plans are in train data)

### Default Dop

In [None]:
# epochs = 500

# job_ideal_model = NN(
#     fit_settings=ALL_SS, 
#     inference_settings=EMPTY_SS, 
#     model=get_bt_regressor("job_ideal_model", DEVICE),
#     path_to_save=f"{EXPERIMENT_PATH}/models/job_ideal_model.pth"
# )
# job_ideal_model.fit(job_workload, epochs=epochs)
# Load the saved JOB "ideal" checkpoint instead of re-running the fit commented out above.
job_ideal_model = load_model(
    DEVICE,
    f"{EXPERIMENT_PATH}/models/job_ideal_model.pth",
    get_bt_regressor("none", DEVICE),
)

# sq_ideal_model = NN(
#     fit_settings=ALL_SS, 
#     inference_settings=EMPTY_SS, 
#     model=get_bt_regressor("sq_ideal_model", DEVICE),
#     path_to_save=f"{EXPERIMENT_PATH}/models/sq_ideal_model.pth"
# )
# sq_ideal_model.fit(sq_workload, epochs=epochs)
# Load the saved sample_queries "ideal" checkpoint instead of re-running the fit commented out above.
sq_ideal_model = load_model(
    DEVICE,
    f"{EXPERIMENT_PATH}/models/sq_ideal_model.pth",
    get_bt_regressor("none", DEVICE),
)

In [None]:
def extend_df(df):
    """Add relative-boost and failure-rate columns to a table of reports.

    The input frame is left untouched; all derived columns are written to a copy.

    Parameters
    ----------
    df : pd.DataFrame
        One row per report. The columns read here are ``def_ex``, ``custom_ex``,
        ``opt_ex``, ``def_e2e``, ``custom_e2e``, ``opt_e2e``, ``n_timeouts``,
        ``predictions`` (an iterable of ``(query_name, hintset, dop)`` triples),
        plus the pass-through columns ``model``, ``searching_settings``,
        ``workload``, ``custom_inference`` and ``only_def_dop``.

    Returns
    -------
    pd.DataFrame
        A new frame restricted to the reporting columns, rounded to one decimal.
        Percentages are relative to the default time (``e2e boost (%)``), to the
        default-to-optimum gap (``... (% of opt)``), or to the number of
        predictions (``n_timeouts (%)``, ``n_real_degradations (%)``).

    Notes
    -----
    No guard is applied to the divisions: a row whose default time equals the
    optimum, or whose ``predictions`` is empty, yields inf/NaN in the
    corresponding column.
    """
    columns = [
        "model",
        "searching_settings",
        "workload",
        "e2e boost (%)",
        "e2e boost (% of opt)",
        "ex boost (% of opt)",
        "n_timeouts (%)",
        "n_real_degradations (%)",
        "custom_e2e",
        "custom_ex",
        "custom_inference",
        "only_def_dop",
    ]

    def count_real_degradations(predictions):
        # A prediction is a "real degradation" when its e2e time exceeds the
        # default configuration's e2e time by more than 10%.
        return sum(
            _get_e2e_time(q_n, hs, dop, False) > 1.1 * _get_e2e_time(q_n, DEFAULT_HINTSET, DEFAULT_DOP, False)
            for q_n, hs, dop in predictions
        )

    # Work on a copy so that the caller's frame does not silently gain columns.
    extended = df.copy()

    ex_gain = extended["def_ex"] - extended["custom_ex"]
    e2e_gain = extended["def_e2e"] - extended["custom_e2e"]
    extended["ex boost (% of opt)"] = 100 * ex_gain / (extended["def_ex"] - extended["opt_ex"])
    extended["e2e boost (% of opt)"] = 100 * e2e_gain / (extended["def_e2e"] - extended["opt_e2e"])
    extended["e2e boost (%)"] = 100 * e2e_gain / extended["def_e2e"]

    sizes = extended["predictions"].apply(len)
    extended["n_timeouts (%)"] = 100 * extended["n_timeouts"].apply(int) / sizes
    extended["n_real_degradations (%)"] = 100 * extended["predictions"].apply(count_real_degradations) / sizes

    return extended[columns].round(1)

In [None]:
# ss_and_descrs = [
#     (GREEDY_DEF_DOP_SS, "greedy"), 
#     (PRUNED_GREEDY_DEF_DOP_SS, "pruned greedy"), 
#     (LOCAL_DEF_DOP_SS, "local"),
#     (PRUNED_LOCAL_DEF_DOP_SS, "pruned local"),
#     (ALL_DEF_DOP_SS, "exhaustive")
# ]

# def_dop_ideal_reports = []
# for ss, ss_descr in ss_and_descrs:
#     ideal_job_nn_model = NN(fit_settings=EMPTY_SS, inference_settings=ss, model=get_bt_regressor("ideal_job", DEVICE))
#     ideal_job_nn_model.model = load_model(device=DEVICE, path=f"{EXPERIMENT_PATH}/models/job_ideal_model.pth", model=ideal_job_nn_model.model)
#     def_dop_ideal_reports.append(get_report(ideal_job_nn_model, "NN", job_workload, "JOB", ss, ss_descr, only_def_dop=True))

#     ideal_job_hero_model = Hero(fit_settings=ss)
#     ideal_job_hero_model.fit(job_workload)
#     def_dop_ideal_reports.append(get_report(ideal_job_hero_model, "Hero", job_workload, "JOB", ss, ss_descr, only_def_dop=True))
    
#     ideal_sq_nn_model = NN(fit_settings=EMPTY_SS, inference_settings=ss, model=get_bt_regressor("ideal_sq", DEVICE))
#     ideal_sq_nn_model.model = load_model(device=DEVICE, path=f"{EXPERIMENT_PATH}/models/sq_ideal_model.pth", model=ideal_sq_nn_model.model)
#     def_dop_ideal_reports.append(get_report(ideal_sq_nn_model, "NN", sq_workload, "SQ", ss, ss_descr, only_def_dop=True))

#     ideal_sq_hero_model = Hero(fit_settings=ss)
#     ideal_sq_hero_model.fit(sq_workload)
#     def_dop_ideal_reports.append(get_report(ideal_sq_hero_model, "Hero", sq_workload, "SQ", ss, ss_descr, only_def_dop=True))

# with open(f"{ARTIFACTS_PATH}/def_dop_ideal_reports.json", "w") as f:
#     dump(def_dop_ideal_reports, f)

In [None]:
# Read the cached default-DOP reports and derive the comparison columns.
with open(f"{ARTIFACTS_PATH}/def_dop_ideal_reports.json", "r") as f:
    def_dop_df = pd.DataFrame(load(f)).pipe(extend_df)

In [None]:
# JOB rows, ranked by the share of the default-to-optimum e2e gap that was recovered (best first).
(
    def_dop_df
    .loc[def_dop_df["workload"] == "JOB"]
    .sort_values(by="e2e boost (% of opt)", ascending=False)
)

In [None]:
# SQ rows, ranked by the share of the default-to-optimum e2e gap that was recovered (best first).
(
    def_dop_df
    .loc[def_dop_df["workload"] == "SQ"]
    .sort_values(by="e2e boost (% of opt)", ascending=False)
)

### All Dop's

In [None]:
# ss_and_descrs = [
#     (GREEDY_SS, "greedy"), 
#     (PRUNED_GREEDY_SS, "pruned greedy"), 
#     (LOCAL_SS, "local"),
#     (PRUNED_LOCAL_SS, "pruned local"),
#     (ALL_SS, "exhaustive")
# ]

# all_dops_ideal_reports = []
# for ss, ss_descr in ss_and_descrs:
#     ideal_job_nn_model = NN(fit_settings=EMPTY_SS, inference_settings=ss, model=get_bt_regressor("ideal_job", DEVICE))
#     ideal_job_nn_model.model = load_model(device=DEVICE, path=f"{EXPERIMENT_PATH}/models/job_ideal_model.pth", model=ideal_job_nn_model.model)
#     all_dops_ideal_reports.append(get_report(ideal_job_nn_model, "NN", job_workload, "JOB", ss, ss_descr, only_def_dop=False))

#     ideal_job_hero_model = Hero(fit_settings=ss)
#     ideal_job_hero_model.fit(job_workload)
#     all_dops_ideal_reports.append(get_report(ideal_job_hero_model, "Hero", job_workload, "JOB", ss, ss_descr, only_def_dop=False))
    
#     ideal_sq_nn_model = NN(fit_settings=EMPTY_SS, inference_settings=ss, model=get_bt_regressor("ideal_sq", DEVICE))
#     ideal_sq_nn_model.model = load_model(device=DEVICE, path=f"{EXPERIMENT_PATH}/models/sq_ideal_model.pth", model=ideal_sq_nn_model.model)
#     all_dops_ideal_reports.append(get_report(ideal_sq_nn_model, "NN", sq_workload, "SQ", ss, ss_descr, only_def_dop=False))

#     ideal_sq_hero_model = Hero(fit_settings=ss)
#     ideal_sq_hero_model.fit(sq_workload)
#     all_dops_ideal_reports.append(get_report(ideal_sq_hero_model, "Hero", sq_workload, "SQ", ss, ss_descr, only_def_dop=False))

# with open(f"{ARTIFACTS_PATH}/all_dops_ideal_reports.json", "w") as f:
#     dump(all_dops_ideal_reports, f)

In [None]:
# Read the cached all-DOP reports and derive the comparison columns.
with open(f"{ARTIFACTS_PATH}/all_dops_ideal_reports.json", "r") as f:
    all_dops_df = pd.DataFrame(load(f)).pipe(extend_df)

In [None]:
# JOB rows, ranked by the share of the default-to-optimum e2e gap that was recovered (best first).
(
    all_dops_df
    .loc[all_dops_df["workload"] == "JOB"]
    .sort_values(by="e2e boost (% of opt)", ascending=False)
)

In [None]:
# SQ rows, ranked by the share of the default-to-optimum e2e gap that was recovered (best first).
(
    all_dops_df
    .loc[all_dops_df["workload"] == "SQ"]
    .sort_values(by="e2e boost (% of opt)", ascending=False)
)

We can see that even in the ideal scenario the NN loses a bit, mainly due to its longer inference time. Moreover, it sometimes leads to degradations and even to `T/O` (this probably happens on small queries).

The advantage of a `PRUNED LOCAL` strategy is also evident.

## Online Scenario

Collecting train data during online optimisation.

In [None]:
# for workload, workload_name in [(job_workload, "JOB"), (sq_workload, "SQ")]:
#     def_dop_list_online_reports = []
#     all_dops_list_online_reports = []

#     epochs, iterations = 300, 25
#     for ss, ss_descr in [
#         (GREEDY_DEF_DOP_SS, "GREEDY"),
#         (PRUNED_GREEDY_DEF_DOP_SS, "PRUNED GREEDY"),
#         (LOCAL_DEF_DOP_SS, "LOCAL"),
#         (PRUNED_LOCAL_DEF_DOP_SS, "PRUNED LOCAL"),
#         (ALL_DEF_DOP_SS, "EXHAUSTIVE"),
#     ]:
#         def_dop_list_online_reports.append(emulate_online_learning("NN", workload, workload_name, ss, ss_descr, True, True, epochs, iterations, None, DEVICE))

#     for ss, ss_descr in [
#         (GREEDY_SS, "GREEDY"),
#         (PRUNED_GREEDY_SS, "PRUNED GREEDY"),
#         (LOCAL_SS, "LOCAL"),
#         (PRUNED_LOCAL_SS, "PRUNED LOCAL"),
#         (ALL_SS, "EXHAUSTIVE"),
#     ]:          
#         all_dops_list_online_reports.append(emulate_online_learning("NN", workload, workload_name, ss, ss_descr, False, True, epochs, iterations, None, DEVICE))
    
#     with open(f"{ARTIFACTS_PATH}/{workload_name}_def_dop_list_online_reports.json", "w") as f:
#         dump(def_dop_list_online_reports, f)
#     with open(f"{ARTIFACTS_PATH}/{workload_name}_all_dops_list_online_reports.json", "w") as f:
#         dump(all_dops_list_online_reports, f)

In [None]:
def visualise(
    list_reports,
    figsize=(12, 6),
    label_fontsize=10, 
    linewidth_small=1, 
    linewidth_big=2, 
    markersize=5, 
    markeredgewidth=5,
    linestyle_small='--',
    linestyle_big='-', 
    alpha_small=0.4, 
    alpha_big=1.0, 
    xlabel="iteration", 
    ylabel="Time (sec)", 
    tick_label_fontsize=12, 
    legend_fontsize=10, 
    save_name=None    
):
    """Plot per-iteration execution and end-to-end times for several search strategies.

    Parameters
    ----------
    list_reports : sequence of sequences of dict
        One inner sequence per strategy, one dict per iteration. Each dict is
        read for ``searching_settings``, ``custom_ex`` and ``custom_e2e``; the
        very first dict additionally supplies the ``opt_ex``/``opt_e2e`` and
        ``def_ex``/``def_e2e`` baselines drawn as horizontal lines.
    figsize, label_fontsize, tick_label_fontsize, legend_fontsize
        Figure size and font sizes for axis labels, tick labels and legend.
    linewidth_small, linestyle_small, alpha_small
        Styling of the execution-time ("EX") curves and baselines
        (``alpha_small`` is applied to the baselines only).
    linewidth_big, linestyle_big, alpha_big
        Styling of the end-to-end ("E2E") curves and baselines
        (``alpha_big`` is applied to the baselines only).
    markersize, markeredgewidth
        Marker styling shared by all strategy curves.
    xlabel, ylabel : str
        Axis labels.
    save_name : str or None
        When truthy, the figure is also written as SVG to
        ``{ARTIFACTS_PATH}/{save_name}``.

    Raises
    ------
    ValueError
        If ``list_reports`` is empty or its first report has no iterations.
    """
    if not list_reports or not list_reports[0]:
        raise ValueError("visualise() needs at least one non-empty report")

    first_report = list_reports[0]
    n_iterations = len(first_report)

    sns.set_style("ticks")
    sns.set_palette("deep")
    # One colour per strategy plus two for the "opt"/"def" baselines. Sizing the
    # palette by the number of strategies (not iterations) keeps colors[i] valid
    # even when there are more strategies than iterations.
    colors = sns.color_palette("deep", len(list_reports) + 2)

    fig, ax = plt.subplots(figsize=figsize)

    for color, report in zip(colors, list_reports):
        settings_name = report[0]["searching_settings"]
        # "GREEDY" is tested first, so "PRUNED GREEDY" shares its marker; likewise for "LOCAL".
        if "GREEDY" in settings_name:
            marker = "^"
        elif "LOCAL" in settings_name:
            marker = "x"
        else:
            marker = "o"

        x_values = np.arange(len(report))
        for key, suffix, linewidth, linestyle in (
            ("custom_ex", "EX", linewidth_small, linestyle_small),
            ("custom_e2e", "E2E", linewidth_big, linestyle_big),
        ):
            ax.plot(
                x_values,
                [el[key] for el in report],
                marker=marker,
                markersize=markersize,
                markeredgewidth=markeredgewidth,
                linewidth=linewidth,
                linestyle=linestyle,
                color=color,
                label=f'{settings_name} {suffix}'
            )

    # Horizontal baselines taken from the first record; they use the last two palette colours.
    for i, metric in enumerate(["opt", "def"], start=1):
        for key, linewidth, linestyle, alpha in zip(
            ["ex", "e2e"], 
            [linewidth_small, linewidth_big], 
            [linestyle_small, linestyle_big], 
            [alpha_small, alpha_big]
        ):
            label = "Optimum" if metric == "opt" else "Default"
            label += " Ex" if key == "ex" else " E2E"
            value = first_report[0][f"{metric}_{key}"]
            ax.plot(
                [0, n_iterations - 1],
                [value, value],
                linewidth=linewidth,
                color=colors[-i],
                alpha=alpha,
                linestyle=linestyle,
                label=label
            )

    # Collapse duplicate labels so every series appears once in the legend.
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    ax.legend(by_label.values(), by_label.keys(), fontsize=legend_fontsize, loc="upper right")

    for side in ("top", "right", "left", "bottom"):
        ax.spines[side].set_visible(False)
    ax.set_xticks(range(n_iterations))
    # Leave a 10-unit margin below the optimal execution time.
    ax.set_ylim(bottom=first_report[0]["opt_ex"] - 10)

    ax.set_xlabel(xlabel, fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)
    # The parameter was previously accepted but never applied.
    ax.tick_params(axis="both", labelsize=tick_label_fontsize)

    ax.grid(True, which='both', linestyle='--', linewidth=0.2)

    if save_name:
        fig.savefig(f"{ARTIFACTS_PATH}/{save_name}", format='svg', dpi=300)

    plt.show()
    # Release the figure so repeated calls in a loop do not accumulate open figures.
    plt.close(fig)

In [None]:
# For each benchmark, plot the cached online-learning curves: first the default-DOP
# search spaces, then the all-DOP ones. Each figure is also saved as SVG.
for workload, workload_name in ((job_workload, "JOB"), (sq_workload, "SQ")):
    with open(f"{ARTIFACTS_PATH}/{workload_name}_def_dop_list_online_reports.json", "r") as f:
        def_dop_list_online_reports = load(f)
    visualise(
        def_dop_list_online_reports,
        figsize=(16, 8),
        save_name=f"{workload_name}_def_dop_online.svg",
    )

    with open(f"{ARTIFACTS_PATH}/{workload_name}_all_dops_list_online_reports.json", "r") as f:
        all_dops_list_online_reports = load(f)
    visualise(
        all_dops_list_online_reports,
        figsize=(16, 8),
        save_name=f"{workload_name}_all_dops_online.svg",
    )

We see that, as the search space expands, the exhaustive algorithms stop working (at least on `JOB`). The superiority of the `Local Search` algorithm and the pruning procedure is also evident.

Moreover, it did not always converge to the optimum even after 25 iterations - this tells us that it makes sense to take the learning procedure offline (as is done in `Hero`).

# Dynamic Scenario

## Split by Time

In [None]:
# slow_reports, fast_reports = [], []

# epochs = 300
# for workload, workload_name in [(job_workload, "JOB"), (sq_workload, "SQ")]:
#     workload = [q_n for q_n in workload if _get_e2e_time(q_n, DEFAULT_HINTSET, DEFAULT_DOP) > 1]

#     slow_time_treshold = np.quantile([_get_e2e_time(q_n, DEFAULT_HINTSET, DEFAULT_DOP) for q_n in workload], .5)
#     slow_train = [q_n for q_n in workload if _get_e2e_time(q_n, DEFAULT_HINTSET, DEFAULT_DOP) > slow_time_treshold]
#     slow_test = [q_n for q_n in workload if _get_e2e_time(q_n, DEFAULT_HINTSET, DEFAULT_DOP) <= slow_time_treshold]
#     assert len(slow_train) + len(slow_test) == len(workload)
#     slow_nnmodel = NN(fit_settings=ALL_SS, inference_settings=EMPTY_SS, model=get_bt_regressor("dummy", DEVICE))
#     slow_nnmodel.fit(slow_train, epochs)

#     fast_time_treshold = np.quantile([_get_e2e_time(q_n, DEFAULT_HINTSET, DEFAULT_DOP) for q_n in workload], .5)
#     fast_train = [q_n for q_n in workload if _get_e2e_time(q_n, DEFAULT_HINTSET, DEFAULT_DOP) < fast_time_treshold]
#     fast_test = [q_n for q_n in workload if _get_e2e_time(q_n, DEFAULT_HINTSET, DEFAULT_DOP) >= fast_time_treshold]
#     assert len(fast_train) + len(fast_test) == len(workload)
#     fast_nnmodel = NN(fit_settings=ALL_SS, inference_settings=EMPTY_SS, model=get_bt_regressor("dummy", DEVICE))
#     fast_nnmodel.fit(fast_train, epochs)

#     for ss, ss_descr in [
#         (GREEDY_SS, "GREEDY"),
#         (PRUNED_GREEDY_SS, "PRUNED GREEDY"),
#         (LOCAL_SS, "LOCAL"),
#         (PRUNED_LOCAL_SS, "PRUNED LOCAL"),
#         (ALL_SS, "EXHAUSTIVE"),
#     ]:
#         slow_reports.append(get_report(slow_nnmodel, "NN", slow_train, f"{workload_name}[train]", ss, ss_descr, only_def_dop=False))
#         slow_reports.append(get_report(slow_nnmodel, "NN", slow_test, f"{workload_name}[test]", ss, ss_descr, only_def_dop=False))
#         slow_heromodel = Hero(ss)
#         slow_heromodel.fit(slow_train)
#         slow_reports.append(get_report(slow_heromodel, "Hero", slow_train, f"{workload_name}[train]", ss, ss_descr, only_def_dop=False))
#         slow_reports.append(get_report(slow_heromodel, "Hero", slow_test, f"{workload_name}[test]", ss, ss_descr, only_def_dop=False))

#         fast_reports.append(get_report(fast_nnmodel, "NN", fast_train, f"{workload_name}[train]", ss, ss_descr, only_def_dop=False))
#         fast_reports.append(get_report(fast_nnmodel, "NN", fast_test, f"{workload_name}[test]", ss, ss_descr, only_def_dop=False))
#         fast_heromodel = Hero(ss)
#         fast_heromodel.fit(fast_train)
#         fast_reports.append(get_report(fast_heromodel, "Hero", fast_train, f"{workload_name}[train]", ss, ss_descr, only_def_dop=False))
#         fast_reports.append(get_report(fast_heromodel, "Hero", fast_test, f"{workload_name}[test]", ss, ss_descr, only_def_dop=False))

#         with open(f"{ARTIFACTS_PATH}/{workload_name}_slow_reports.json", "w") as f:
#             dump(slow_reports, f)
#         with open(f"{ARTIFACTS_PATH}/{workload_name}_fast_reports.json", "w") as f:
#             dump(fast_reports, f)

In [None]:
def _load_extended_report(file_name):
    """Read one cached report list from ARTIFACTS_PATH and pass it through extend_df."""
    with open(f"{ARTIFACTS_PATH}/{file_name}", "r") as f:
        return extend_df(pd.DataFrame(load(f)))


# "slow" / "fast" name which half of the workload the NN was trained on.
job_slow_df = _load_extended_report("JOB_slow_reports.json")
job_fast_df = _load_extended_report("JOB_fast_reports.json")
sq_slow_df = _load_extended_report("SQ_slow_reports.json")
sq_fast_df = _load_extended_report("SQ_fast_reports.json")

### slow $\rightarrow$ fast

In [None]:
# JOB, trained on slow queries: results on the train half, largest e2e boost first.
(
    job_slow_df
    .loc[job_slow_df["workload"] == "JOB[train]"]
    .sort_values(by="e2e boost (%)", ascending=False)
)

In [None]:
# JOB, trained on slow queries: results on the test half, largest e2e boost first.
(
    job_slow_df
    .loc[job_slow_df["workload"] == "JOB[test]"]
    .sort_values(by="e2e boost (%)", ascending=False)
)

In [None]:
# SQ, trained on slow queries: results on the train half, largest e2e boost first.
(
    sq_slow_df
    .loc[sq_slow_df["workload"] == "SQ[train]"]
    .sort_values(by="e2e boost (%)", ascending=False)
)

In [None]:
# SQ, trained on slow queries: results on the test half, largest e2e boost first.
(
    sq_slow_df
    .loc[sq_slow_df["workload"] == "SQ[test]"]
    .sort_values(by="e2e boost (%)", ascending=False)
)

We see that `Hero` is always better on `train` and safer on `test` (`SQ`), but it sometimes misses a possible boost (`JOB`).

### fast $\rightarrow$ slow

In [None]:
# JOB, trained on fast queries: results on the train half, largest e2e boost first.
(
    job_fast_df
    .loc[job_fast_df["workload"] == "JOB[train]"]
    .sort_values(by="e2e boost (%)", ascending=False)
)

In [None]:
# JOB, trained on fast queries: results on the test half, largest e2e boost first.
(
    job_fast_df
    .loc[job_fast_df["workload"] == "JOB[test]"]
    .sort_values(by="e2e boost (%)", ascending=False)
)

In [None]:
# SQ, trained on fast queries: results on the train half, largest e2e boost first.
(
    sq_fast_df
    .loc[sq_fast_df["workload"] == "SQ[train]"]
    .sort_values(by="e2e boost (%)", ascending=False)
)

In [None]:
# SQ, trained on fast queries: results on the test half, largest e2e boost first.
(
    sq_fast_df
    .loc[sq_fast_df["workload"] == "SQ[test]"]
    .sort_values(by="e2e boost (%)", ascending=False)
)

**Conclusions.**\
We can clearly see the signs of **overfitting** in `Hero` (which is what we wanted) - almost perfect performance on train and safe, rare predictions on test. On new data, we stop predicting, so we don't get degradations.

The NN approach, on the other hand, has the advantage of **being able to generalise knowledge** to new queries. Thus, we can see that generalisation from fast queries on `SQ` is quite effective - we can speed up their execution by 2 times (even taking into account that we get degradations in 17% of cases). However, prediction on new queries can also bring regression, which is observed from generalising over slow queries.

P.S. It is probably easier to generalise knowledge from short queries to long queries because parts of efficient long-query plans are quite fast to execute and have most likely already been encountered in fast-query plans.

## Split by structure

In [None]:
def get_traintest_split(groups, ratio, seed=42, debug=False):
    """Split every group into a train and a test part and concatenate the parts.

    Each group is shuffled with a generator freshly seeded with ``seed`` and cut
    at ``int(len(group) * ratio)``: the head goes to train, the tail to test.

    The shuffle is done on a copy with a private ``random.Random`` instance, so
    neither the caller's lists nor the global ``random`` state are modified.
    This makes the result a function of ``(groups, ratio, seed)`` only, instead
    of depending on how often the same lists were split before.

    Parameters
    ----------
    groups : iterable of sequences
        Collections of items that must each be represented on both sides.
    ratio : float
        Fraction of every group assigned to train (rounded down per group).
    seed : int
        Seed used for shuffling every group.
    debug : bool
        When true, print the shuffled group and its two parts.

    Returns
    -------
    tuple[list, list]
        ``(train, test)`` with the per-group parts in group order.
    """
    train, test = [], []
    for group in groups:
        shuffled = list(group)
        # Same permutation as `random.seed(seed); random.shuffle(...)`, without
        # touching the module-level generator.
        random.Random(seed).shuffle(shuffled)
        pivot = int(len(shuffled) * ratio)
        train.extend(shuffled[:pivot])
        test.extend(shuffled[pivot:])
        if debug:
            print(f"{shuffled} -> {shuffled[:pivot]}, {shuffled[pivot:]}")
    return train, test

In [None]:
# epochs = 300
# for workload, workload_name in [(job_workload, "JOB")]:
    
#     logical_trees_to_queries = defaultdict(list)
#     for q_n in workload:
#         logical_trees_to_queries[_get_logical_tree(q_n, DEFAULT_HINTSET, DEFAULT_DOP)].append(q_n)

#     structure_reports = []
#     for seed in range(10):
#         train, test = get_traintest_split([v for v in logical_trees_to_queries.values() if len(v) > 1], ratio=0.5, seed=seed)
#         nnmodel = NN(fit_settings=ALL_SS, inference_settings=EMPTY_SS, model=get_bt_regressor("dummy", DEVICE))
#         nnmodel.fit(train, epochs)

#         for ss, ss_descr in [
#             (GREEDY_SS, "GREEDY"),
#             (PRUNED_GREEDY_SS, "PRUNED GREEDY"),
#             (LOCAL_SS, "LOCAL"),
#             (PRUNED_LOCAL_SS, "PRUNED LOCAL"),
#             (ALL_SS, "EXHAUSTIVE"),
#         ]:
#             structure_reports.append(get_report(nnmodel, "NN", train, f"{workload_name}[train]", ss, ss_descr, only_def_dop=False))
#             structure_reports.append(get_report(nnmodel, "NN", test, f"{workload_name}[test]", ss, ss_descr, only_def_dop=False))

#             heromodel = Hero(ss)
#             heromodel.fit(train)
#             structure_reports.append(get_report(heromodel, "Hero", train, f"{workload_name}[train]", ss, ss_descr, only_def_dop=False))
#             structure_reports.append(get_report(heromodel, "Hero", test, f"{workload_name}[test]", ss, ss_descr, only_def_dop=False))
        
#     with open(f"{ARTIFACTS_PATH}/{workload_name}_structure_reports.json", "w") as f:
#         dump(structure_reports, f)

In [None]:
def aggregate_results(reports):
    """Average repeated reports per (model, workload, searching_settings).

    Parameters
    ----------
    reports : list of dict
        One record per run. The keys read here are ``def_ex``, ``custom_ex``,
        ``opt_ex``, ``def_e2e``, ``custom_e2e``, ``opt_e2e``, ``n_timeouts``,
        ``predictions`` (an iterable of ``(query_name, hintset, dop)`` triples),
        ``custom_inference`` and the three grouping keys.

    Returns
    -------
    pd.DataFrame
        One row per group with the grouping columns followed by a
        ``"mean <metric>"`` column for every metric and then a
        ``"std <metric>"`` column for every metric, rounded to one decimal.
        A group with a single record has NaN standard deviations.
    """
    def count_real_degradations(predictions):
        # A prediction is a "real degradation" when its e2e time exceeds the
        # default configuration's e2e time by more than 10%.
        # NOTE(review): extend_df in this notebook passes an extra positional
        # `False` to _get_e2e_time; confirm whether the difference is intended.
        return sum(
            _get_e2e_time(q_n, hs, dop) > 1.1 * _get_e2e_time(q_n, DEFAULT_HINTSET, DEFAULT_DOP)
            for q_n, hs, dop in predictions
        )

    experiment_cols = ["model", "workload", "searching_settings"]
    value_cols = [
        "e2e boost (%)",
        "e2e boost (% of opt)", 
        "ex boost (% of opt)", 
        "n_timeouts (%)", 
        "n_real_degradations (%)",
        "custom_e2e", 
        "custom_ex", 
        "custom_inference",    
    ]

    df = pd.DataFrame(reports)
    df["ex boost (% of opt)"] = 100 * (df["def_ex"] - df["custom_ex"]) / (df["def_ex"] - df["opt_ex"])
    df["e2e boost (% of opt)"] = 100 * (df["def_e2e"] - df["custom_e2e"]) / (df["def_e2e"] - df["opt_e2e"])
    df["e2e boost (%)"] = 100 * (df["def_e2e"] - df["custom_e2e"]) / df["def_e2e"]

    sizes = df["predictions"].apply(len)
    df["n_timeouts (%)"] = 100 * df["n_timeouts"].apply(int) / sizes
    df["n_real_degradations (%)"] = 100 * df["predictions"].apply(count_real_degradations) / sizes

    # Aggregate each metric directly into flat "mean ..." / "std ..." columns.
    # The earlier version also built "mean ± std" strings that were dropped
    # before returning; that dead step is gone and the output is unchanged.
    grouped = df.groupby(experiment_cols)[value_cols]
    means = grouped.mean().add_prefix("mean ")
    stds = grouped.std().add_prefix("std ")
    return means.join(stds).reset_index().round(1)

In [None]:
# Read the cached structure-split reports for JOB and aggregate them per experiment.
with open(f"{ARTIFACTS_PATH}/JOB_structure_reports.json", "r") as f:
    job_structure_reports = load(f)
job_structure_df = aggregate_results(job_structure_reports)

In [None]:
# Columns shown in the aggregated tables: the identifying columns, the mean of
# every rate/boost metric, and the spread of the two e2e boost metrics.
interesting_cols = (
    ["model", "searching_settings"]
    + [
        f"mean {metric}"
        for metric in (
            "e2e boost (%)",
            "e2e boost (% of opt)",
            "ex boost (% of opt)",
            "n_timeouts (%)",
            "n_real_degradations (%)",
        )
    ]
    + [f"std {metric}" for metric in ("e2e boost (%)", "e2e boost (% of opt)")]
)

In [None]:
# JOB train half of the structure split, ranked by mean share of the optimal e2e gain (best first).
(
    job_structure_df
    .loc[job_structure_df["workload"] == "JOB[train]"]
    .sort_values(by="mean e2e boost (% of opt)", ascending=False)
    [interesting_cols]
)

In [None]:
# JOB test half of the structure split, ranked by mean share of the optimal e2e gain (best first).
(
    job_structure_df
    .loc[job_structure_df["workload"] == "JOB[test]"]
    .sort_values(by="mean e2e boost (% of opt)", ascending=False)
    [interesting_cols]
)

We see that generalisation in the presence of structure is efficient (a boost of up to 36% on `JOB` and 70% on `SQ`), but even though the structure of the logical plans was repeated, about 20% of the predictions either slowed the query down or led to a `T/O`.

## Random Split

In [None]:
# epochs = 300
# for workload, workload_name in [(job_workload, "JOB"), (sq_workload, "SQ")]:
#     random_split_reports = []
#     for seed in range(10):
#         train, test = get_traintest_split([workload], ratio=0.5, seed=seed)
#         nnmodel = NN(fit_settings=ALL_SS, inference_settings=EMPTY_SS, model=get_bt_regressor("dummy", DEVICE))
#         nnmodel.fit(train, epochs)

#         for ss, ss_descr in [
#             (GREEDY_SS, "GREEDY"),
#             (PRUNED_GREEDY_SS, "PRUNED GREEDY"),
#             (LOCAL_SS, "LOCAL"),
#             (PRUNED_LOCAL_SS, "PRUNED LOCAL"),
#             (ALL_SS, "EXHAUSTIVE"),
#         ]:
#             random_split_reports.append(get_report(nnmodel, "NN", train, f"{workload_name}[train]", ss, ss_descr, only_def_dop=False))
#             random_split_reports.append(get_report(nnmodel, "NN", test, f"{workload_name}[test]", ss, ss_descr, only_def_dop=False))

#             heromodel = Hero(ss)
#             heromodel.fit(train)
#             random_split_reports.append(get_report(heromodel, "Hero", train, f"{workload_name}[train]", ss, ss_descr, only_def_dop=False))
#             random_split_reports.append(get_report(heromodel, "Hero", test, f"{workload_name}[test]", ss, ss_descr, only_def_dop=False))
        
#     with open(f"{ARTIFACTS_PATH}/{workload_name}_random_split_reports.json", "w") as f:
#         dump(random_split_reports, f)

In [None]:
# Read the cached random-split reports for both benchmarks and aggregate them per experiment.
with open(f"{ARTIFACTS_PATH}/JOB_random_split_reports.json", "r") as f:
    job_random_split_reports = load(f)
job_random_split_df = aggregate_results(job_random_split_reports)

with open(f"{ARTIFACTS_PATH}/SQ_random_split_reports.json", "r") as f:
    sq_random_split_reports = load(f)
sq_random_split_df = aggregate_results(sq_random_split_reports)

In [None]:
# SQ test half of the random split, largest mean e2e boost first.
(
    sq_random_split_df
    .loc[sq_random_split_df["workload"] == "SQ[test]"]
    .sort_values(by="mean e2e boost (%)", ascending=False)
    [interesting_cols]
)

In [None]:
# JOB test half of the random split, largest mean e2e boost first.
(
    job_random_split_df
    .loc[job_random_split_df["workload"] == "JOB[test]"]
    .sort_values(by="mean e2e boost (%)", ascending=False)
    [interesting_cols]
)

**Conclusions.**\
We can clearly see that with random partitioning the "power" of generalisation drops significantly. We can also see that the more plans are evaluated by the NN, the greater the probability of observing degradations and regressions (up to 30% `T/O`). But even the versions with pruned search slow down queries in about 20% of cases.