In [1]:
import os
import sys

# ROOT_PATH is the grandparent of the current working directory; the two
# package directories below it are put on sys.path so they can be imported.
ROOT_PATH = os.path.dirname(os.path.dirname(os.getcwd()))
BTCNN_PATH = os.path.join(ROOT_PATH, "btcnn")
HBO_BENCH_PATH = os.path.join(ROOT_PATH, "hbo_bench")

sys.path.insert(0, ROOT_PATH)
sys.path.insert(0, BTCNN_PATH)
sys.path.insert(0, HBO_BENCH_PATH)

EXPERIMENT_PATH = os.getcwd()
ARTIFACTS_PATH = os.path.join(EXPERIMENT_PATH, "artifacts")
# The CSV reports are written into this directory; create it up front so that
# a fresh checkout does not fail on the first `to_csv` call.
os.makedirs(ARTIFACTS_PATH, exist_ok=True)

In [2]:
from json import load, dumps, dump
from tqdm import tqdm
import numpy as np
import pandas as pd
from hbo_bench.oracle import Oracle, OracleRequest, TIMEOUT
from hbo_bench.data_config import HINTSETS, DOPS, HINTS, DEFAULT_HINTSET, DEFAULT_DOP
from hbo_bench.utils import get_logical_tree, get_full_plan
from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# JOB is used because other benchmarks don't have more than 1 query inside one template.
oracle = Oracle(f"{HBO_BENCH_PATH}/data/processed/JOB")

# Is the template or plan enough to safely estimate the usefulness of a hintset?

Here we check how much the execution time deviates within a group of queries under different hintsets.
The grouping functions considered are:

- query $\rightarrow$ logical plan, and
- query $\rightarrow$ full plan (i.e. logical plan with estimated cardinalities)

For simplicity, we restrict ourselves to groups within which a) a hintset causes some queries to speed up and some to time out (T/O), and b) all queries are sufficiently long (> 1 sec).

In [19]:
from utils import get_full_plan, get_logical_tree, get_selectivities

# Bucket every query by its default plan, once per representation:
# the logical tree, and the full plan (logical plan with estimated cardinalities).
logical_plan_to_queries = defaultdict(list)
full_plan_to_queries = defaultdict(list)

for query_name in oracle.get_query_names():
    plan = oracle.get_explain_plan(
        OracleRequest(query_name=query_name, hintset=DEFAULT_HINTSET, dop=DEFAULT_DOP)
    )
    logical_plan, full_plan = get_logical_tree(plan), get_full_plan(plan)

    for plan_key, plan_to_queries in (
        (logical_plan, logical_plan_to_queries),
        (full_plan, full_plan_to_queries),
    ):
        plan_to_queries[plan_key].append(query_name)

In [20]:
# Magnitude of the sentinel boost: a query whose custom execution time reaches TIMEOUT
# is recorded with boost == -TIMEOUT_REL instead of a measured ratio.
TIMEOUT_REL = 2

In [21]:
def make_group_summary(query_groups, only_interesting=True):
    """
    For each group of queries, picks the single most illustrative hintset and
    collects the per-query timings observed under it.

    A hintset is a candidate only if it boosts at least one query of the group
    by more than 1.01x. Among candidates, the one causing the most timeouts wins;
    ties are broken in favour of the larger maximal boost (later hintsets win
    exact ties).

    Args:
        query_groups: iterable of collections of query names. Groups with fewer
            than two queries are never reported and are skipped up front.
        only_interesting: if True, groups whose longest default execution time is
            below 1000 (1 sec, given millisecond timings) are skipped.

    Returns:
        List of tuples
        `(sorted query names, hintset, speedups, default times, custom times)`,
        where the three lists are aligned with the sorted query names and
        `speedups` holds `default time - custom time`.
    """
    group_summaries = []
    for query_group in query_groups:
        query_names = sorted(query_group)
        # Groups of fewer than two queries can never be reported, so do not
        # spend oracle calls on them (this also protects `max` from empty input).
        if len(query_names) < 2:
            continue

        # Default timings do not depend on the hintset: fetch them once per group.
        def_times = [
            oracle.get_execution_time(OracleRequest(query_name=q_n, hintset=DEFAULT_HINTSET, dop=DEFAULT_DOP))
            for q_n in query_names
        ]
        if only_interesting and max(def_times) < 1000:
            continue

        max_timeouts = 0
        hs_to_show, speedups_to_show, cust_times_to_show, boosts_to_show = None, None, None, None

        for hs in HINTSETS:
            cust_times, speedups, boosts = [], [], []
            for q_n, def_time in zip(query_names, def_times):
                cust_time = oracle.get_execution_time(OracleRequest(query_name=q_n, hintset=hs, dop=DEFAULT_DOP))

                cust_times.append(cust_time)
                speedups.append(def_time - cust_time)

                if cust_time >= TIMEOUT:
                    # timeouts get a fixed sentinel instead of a measured ratio
                    boosts.append(-TIMEOUT_REL)
                elif cust_time > def_time:
                    boosts.append(-cust_time / def_time)
                else:
                    boosts.append(def_time / cust_time)

            n_timeouts = sum([boost == -TIMEOUT_REL for boost in boosts])
            new_n_timeouts_record = (n_timeouts > max_timeouts)
            same_n_timeouts = (n_timeouts == max_timeouts)
            new_boost_record = (hs_to_show is None or max(boosts_to_show) <= max(boosts))
            boost_is_large_enough = (max(boosts) > 1.01)

            if (new_n_timeouts_record or (same_n_timeouts and new_boost_record)) and boost_is_large_enough:
                max_timeouts = n_timeouts
                hs_to_show = hs
                cust_times_to_show = cust_times
                boosts_to_show = boosts
                speedups_to_show = speedups

        if hs_to_show is not None:
            # copy the shared default timings so that every summary owns its lists
            summary = (query_names, hs_to_show, speedups_to_show, list(def_times), cust_times_to_show)
            group_summaries.append(summary)

    return group_summaries

## logical tree $\rightarrow t^{ex}$

In [22]:
logical_plan_summary = make_group_summary(query_groups=logical_plan_to_queries.values())
# Groups with the widest spread of absolute speedups come first.
logical_plan_summary = sorted(logical_plan_summary, key=lambda el: max(el[2]) - min(el[2]), reverse=True)

data = []
for (query_group, hs, speedups, def_times, cust_times) in logical_plan_summary:
    boosts = []
    for cust_time, def_time in zip(cust_times, def_times):
        if cust_time >= TIMEOUT:
            boosts.append("`NaN`")
        elif cust_time > def_time:
            boosts.append(round(-cust_time / def_time, 2))
        else:
            # equal times are reported as a boost of 1.0, the same convention
            # as in `make_group_summary` (previously they showed up as -1.0)
            boosts.append(round(def_time / cust_time, 2))

    # raw timings are divided by 1000 for the "(sec)" columns; timeouts are shown as a marker
    cust_times = [round(v/1000, 2) if v < TIMEOUT else "`T/O`" for v in cust_times]

    def_times = [round(v/1000, 2) for v in def_times]

    data.append({
        "Query Group": query_group,
        "# hintset": hs,
        "Default Ex. Time (sec)": def_times,
        "Custom Ex. Time (sec)": cust_times,
        "Boosts": boosts,
    }
    )
pd.DataFrame(data).to_csv(f"{ARTIFACTS_PATH}/logical_plans.csv", index=False)

## full plan $\rightarrow t^{ex}$

In [23]:
full_plan_summary = make_group_summary(query_groups=full_plan_to_queries.values())
# Groups with the widest spread of absolute speedups come first.
full_plan_summary = sorted(full_plan_summary, key=lambda el: max(el[2]) - min(el[2]), reverse=True)

data = []
for (query_group, hs, speedups, def_times, cust_times) in full_plan_summary:
    boosts = []
    for cust_time, def_time in zip(cust_times, def_times):
        if cust_time >= TIMEOUT:
            boosts.append("`NaN`")
        elif cust_time > def_time:
            boosts.append(round(-cust_time / def_time, 2))
        else:
            # equal times are reported as a boost of 1.0, the same convention
            # as in `make_group_summary` (previously they showed up as -1.0)
            boosts.append(round(def_time / cust_time, 2))

    # raw timings are divided by 1000 for the "(sec)" columns; timeouts are shown as a marker
    cust_times = [round(v/1000, 2) if v < TIMEOUT else "`T/O`" for v in cust_times]

    def_times = [round(v/1000, 2) for v in def_times]

    data.append({
        "Query Group": query_group,
        "# hintset": hs,
        "Default Ex. Time (sec)": def_times,
        "Custom Ex. Time (sec)": cust_times,
        "Boosts": boosts,
    }
    )
pd.DataFrame(data).to_csv(f"{ARTIFACTS_PATH}/full_plans.csv", index=False)

**Conclusion.** It can be seen that this representation is not sufficient for accurate estimation of even the execution time of plans. As a consequence, we cannot talk about predicting hintsets.

# Is the pair $\langle plan_{from}, plan_{to} \rangle$ enough to safely estimate the usefulness of a hintset?

Since the boosts collected during query exploration are actually an attribute of the **transition**

$$\tau: Plan_{default} \rightarrow \text{*hints applying*} \rightarrow Plan_{custom}$$

it would be logical to consider the resulting plan when making decisions. If suddenly we get some new plan when applying the hintset, our prediction is no longer reliable and it is better to avoid it.

Note: we consider only hintsets that are useful for at least 1 query.

In [24]:
def get_transitions_and_query_map(def_plan_extractor, custom_plan_extractor, oracle):
    """Walks over every (query, hintset) pair known to `oracle` and builds two mappings
    keyed by the transition string
    `f"{def_plan_extractor(def_plan)}->{custom_plan_extractor(custom_plan)}"`:
        a) transition -> list of `(custom_time, default_time)` pairs, and
        b) transition -> list of query names that produced that transition.
    Both lists are aligned: the i-th pair of (a) belongs to the i-th name of (b).
    """
    def request_for(query, hintset):
        # every oracle lookup gets its own request object
        return OracleRequest(query_name=query, hintset=hintset, dop=DEFAULT_DOP)

    times_by_transition = defaultdict(list)
    queries_by_transition = defaultdict(list)

    for query in oracle.get_query_names():
        source_plan = def_plan_extractor(oracle.get_explain_plan(request_for(query, DEFAULT_HINTSET)))
        default_time = oracle.get_execution_time(request_for(query, DEFAULT_HINTSET))

        for hintset in HINTSETS:
            target_plan = custom_plan_extractor(oracle.get_explain_plan(request_for(query, hintset)))
            hinted_time = oracle.get_execution_time(request_for(query, hintset))

            key = f"{source_plan}->{target_plan}"
            times_by_transition[key].append((hinted_time, default_time))
            queries_by_transition[key].append(query)

    return times_by_transition, queries_by_transition

In [25]:
def make_transitions_summary(transitions, only_interesting=True):
    """Summarises every transition as the tuple
    `(worst boost, speedups, custom times, default times, transition)`.

    `transitions` maps a transition key to a list of `(custom_time, default_time)`
    pairs. Speedups are `default - custom`; boosts are signed ratios
    (`-TIMEOUT_REL` for a timeout, negative when the custom run is slower).

    With `only_interesting=True` a transition is kept only if it is diverse enough:
    some run reaches 1000, at least one query is boosted and at least one is
    deboosted, and there is a big deboost (a timeout or a slowdown of more than 500).
    """
    def signed_boost(cust_time, def_time):
        if cust_time >= TIMEOUT:
            return -TIMEOUT_REL
        if cust_time > def_time:
            return -cust_time / def_time
        return def_time / cust_time

    summaries = []
    for transition, measurements in transitions.items():
        custom_times = [cust_time for cust_time, _ in measurements]
        def_times = [def_time for _, def_time in measurements]
        speedups = [def_time - cust_time for cust_time, def_time in measurements]
        boosts = [signed_boost(cust_time, def_time) for cust_time, def_time in measurements]

        if only_interesting:
            everything_is_short = max(def_times) < 1000 and max(custom_times) < 1000
            nobody_got_slower = min(boosts) >= -1.0
            nobody_got_faster = max(boosts) <= 1.0
            if everything_is_short or nobody_got_slower or nobody_got_faster:
                continue

            worst_slowdown = max(c_t - d_t for c_t, d_t in zip(custom_times, def_times))
            if not (-TIMEOUT_REL in boosts or worst_slowdown > 500):
                continue

        summaries.append((min(boosts), speedups, custom_times, def_times, transition))

    return summaries

## logical transition

In [26]:
logical_transitions, logical_transition_to_queries = get_transitions_and_query_map(
    oracle=oracle,
    def_plan_extractor=get_logical_tree,
    custom_plan_extractor=get_logical_tree,
)

# Transitions with the widest spread of absolute speedups come first.
transition_summary = sorted(
    make_transitions_summary(transitions=logical_transitions),
    key=lambda el: max(el[1]) - min(el[1]),
    reverse=True,
)

data = []
for max_degradation, speedups, cust_times, def_times, transition_key in transition_summary:
    queries = logical_transition_to_queries[transition_key]

    # One query may appear several times in a transition (once per hintset
    # leading to it), so gather all of its measurements first.
    q_to_def_times, q_to_cust_times = defaultdict(list), defaultdict(list)
    for q_n, def_time, cust_time in sorted(zip(queries, def_times, cust_times)):
        q_to_def_times[q_n].append(def_time)
        q_to_cust_times[q_n].append(cust_time)

    query_group = sorted(set(queries))
    def_times, cust_times, boosts = [], [], []
    for q_n in query_group:
        def_time = np.mean(q_to_def_times[q_n])
        # a single timeout marks the whole query as timed out
        if max(q_to_cust_times[q_n]) >= TIMEOUT:
            cust_time = TIMEOUT
        else:
            cust_time = np.mean(q_to_cust_times[q_n])

        def_times.append(round(def_time / 1000, 2))

        if cust_time >= TIMEOUT:
            cust_times.append("`T\\O`")
            boosts.append("`NaN`")
            continue

        cust_times.append(round(cust_time / 1000, 2))
        ratio = -cust_time / def_time if cust_time > def_time else def_time / cust_time
        boosts.append(round(ratio, 2))

    data.append({
        "Query Group": query_group,
        "Default Ex. Time (sec)": def_times,
        "Custom Ex. Time (sec)": cust_times,
        "Boosts": boosts,
    })
pd.DataFrame(data).to_csv(f"{ARTIFACTS_PATH}/logical_transitions.csv", index=False)

**Conclusion.** A transition between logical trees alone does not describe what is going on well enough.

In [27]:
full_transitions, full_transition_to_queries = get_transitions_and_query_map(
    oracle=oracle,
    def_plan_extractor=get_full_plan,
    custom_plan_extractor=get_full_plan,
)

# No filtering here: every transition is reported, widest speedup spread first.
transition_summary = sorted(
    make_transitions_summary(transitions=full_transitions, only_interesting=False),
    key=lambda el: max(el[1]) - min(el[1]),
    reverse=True,
)

data = []
for max_degradation, speedups, cust_times, def_times, transition_key in transition_summary:
    queries = full_transition_to_queries[transition_key]

    # One query may appear several times in a transition (once per hintset
    # leading to it), so gather all of its measurements first.
    q_to_def_times, q_to_cust_times = defaultdict(list), defaultdict(list)
    for q_n, def_time, cust_time in sorted(zip(queries, def_times, cust_times)):
        q_to_def_times[q_n].append(def_time)
        q_to_cust_times[q_n].append(cust_time)

    query_group = sorted(set(queries))
    def_times, cust_times, boosts = [], [], []
    for q_n in query_group:
        def_time = np.mean(q_to_def_times[q_n])
        # a single timeout marks the whole query as timed out
        if max(q_to_cust_times[q_n]) >= TIMEOUT:
            cust_time = TIMEOUT
        else:
            cust_time = np.mean(q_to_cust_times[q_n])

        def_times.append(round(def_time / 1000, 2))

        if cust_time >= TIMEOUT:
            cust_times.append("`T\\O`")
            boosts.append("`NaN`")
            continue

        cust_times.append(round(cust_time / 1000, 2))
        ratio = -cust_time / def_time if cust_time > def_time else def_time / cust_time
        boosts.append(round(ratio, 2))

    data.append({
        "Query Group": query_group,
        "Default Ex. Time (sec)": def_times,
        "Custom Ex. Time (sec)": cust_times,
        "Boosts": boosts,
    })
pd.DataFrame(data).to_csv(f"{ARTIFACTS_PATH}/full_transitions.csv", index=False)

**Conclusion.** When using transitions between full plans, there is no longer a situation where we speed up one query and slow down the other one.

## fuzzy transition

The only difference with full transitions is that we do not distinguish between full plans if the distance between them is not large enough.

In [28]:
def get_distance(sels1, sels2):
    """Calculates the maximum among the ratio values of all corresponding coordinates.

    For every pair of coordinates the larger of `a / b` and `b / a` is taken,
    and the distance is the maximum over all pairs.

    Returns `float("inf")` whenever the two vectors cannot be compared:
    one of them is `None`, their lengths differ, they are empty, or a
    coordinate cannot be divided (e.g. it is zero or not a number).
    """
    if sels1 is None or sels2 is None:
        return float("inf")
    try:
        # explicit check instead of `assert`: asserts are stripped under `python -O`,
        # after which `zip` would silently truncate the longer vector
        if len(sels1) != len(sels2):
            return float("inf")
        return max(max(sel1 / sel2, sel2 / sel1) for sel1, sel2 in zip(sels1, sels2))
    except (TypeError, ValueError, ArithmeticError):
        # TypeError: unsized or non-numeric input; ValueError: `max` of an empty
        # sequence; ArithmeticError: division by zero or overflow
        return float("inf")


# inline sanity checks: equal vectors, differing vectors, length mismatch, missing vector
sels1 = [(1, 12), (42, 666), (1, ), (1, 2), (1, 2, 3)]
sels2 = [(2,  3), (42, 666), (2, ), (1, ) , None]
expected_distances = [4, 1, 2, float("inf"), float("inf")]

for sel1, sel2, expected_distance in zip(sels1, sels2, expected_distances):
    assert get_distance(sel1, sel2) == expected_distance

In [34]:
def get_fuzzy_transitions_and_query_map(oracle):
    """For each query in oracle constructs 2 mappings as in `get_transitions_and_query_map`:
        a) transition -> list of `(custom_time, default_time)` pairs, and
        b) transition -> list of query names.
    The only difference is that instead of creating a separate item for each transition
    'default plan -> custom plan' it tries to squeeze together items with the same logical
    transition 'default logical plan -> custom logical plan' and close enough selectivities.

    Selectivities are "close enough" when `get_distance` between them is strictly below the
    global `DISTANCE_THRESHOLD`, which is read at call time. The first selectivity vector seen
    for a logical tree becomes a representative; later vectors that are close to a
    representative are replaced by it, so the result depends on the order of
    `oracle.get_query_names()` and of `HINTSETS`.

    P.S. it has a side effect -- squeezed transitions are appended to the global
    'SQUEEZED_TRANSITIONS' list as `(transition_key, (def_sels, custom_sels))` entries.
    """
    transitions = defaultdict(list)
    transition_to_queries = defaultdict(list)

    # default logical tree -> representative default selectivity vectors seen so far
    def_logical_trees_to_def_sels = defaultdict(list)
    # "default tree|default sels->custom tree" -> representative custom selectivity vectors
    semi_transition_to_sels = defaultdict(list)

    for q_n in oracle.get_query_names():
        def_logical_tree = get_logical_tree(oracle.get_explain_plan(OracleRequest(query_name=q_n, hintset=DEFAULT_HINTSET, dop=DEFAULT_DOP)))
        def_time = oracle.get_execution_time(OracleRequest(query_name=q_n, hintset=DEFAULT_HINTSET, dop=DEFAULT_DOP))
        def_sels = get_selectivities(oracle.get_explain_plan(OracleRequest(query_name=q_n, hintset=DEFAULT_HINTSET, dop=DEFAULT_DOP)))

        # `find_closest_sels` returns None when there is no usable representative;
        # `get_distance(..., None)` is then infinite and the reuse test fails.
        closest_def_sels = find_closest_sels(def_logical_trees_to_def_sels, def_logical_tree, def_sels)
        can_reuse_def_sels = (get_distance(def_sels, closest_def_sels) < DISTANCE_THRESHOLD)
        if can_reuse_def_sels:
            # from here on `def_sels` holds the representative, not this query's own values
            def_sels = closest_def_sels
        else:
            # this query's selectivities become a new representative for its logical tree
            def_logical_trees_to_def_sels[def_logical_tree].append(def_sels)

        for hs in HINTSETS:
            custom_logical_tree = get_logical_tree(oracle.get_explain_plan(OracleRequest(query_name=q_n, hintset=hs, dop=DEFAULT_DOP)))
            custom_sels = get_selectivities(oracle.get_explain_plan(OracleRequest(query_name=q_n, hintset=hs, dop=DEFAULT_DOP)))
            custom_time = oracle.get_execution_time(OracleRequest(query_name=q_n, hintset=hs, dop=DEFAULT_DOP))

            # NOTE(review): disabled alternative that would replace a timed-out custom time
            # with a multiple of the default time; kept as-is.
            #if custom_time >= TIMEOUT:
            #    custom_time = TIMEOUT_REL * def_time

            # everything of the transition except the custom selectivities
            semi_transition_key = f"{def_logical_tree}|{def_sels}->{custom_logical_tree}"

            squeezed = False
            closest_custom_sels = find_closest_sels(semi_transition_to_sels, semi_transition_key, custom_sels)

            can_reuse_custom_sels = (get_distance(custom_sels, closest_custom_sels) < DISTANCE_THRESHOLD)
            if can_reuse_custom_sels:
                # "squeezed" means: both sides were matched to representatives and the custom
                # selectivities actually differ from the representative they were merged into
                if (can_reuse_def_sels and closest_custom_sels is not None and closest_custom_sels != custom_sels):
                    squeezed = True
                custom_sels = closest_custom_sels
            else:
                semi_transition_to_sels[semi_transition_key].append(custom_sels)

            transition_key = f"{def_logical_tree}|{def_sels}->{custom_logical_tree}|{custom_sels}"

            if squeezed:
                # `def_sels` / `custom_sels` were overwritten by representatives above,
                # so the query's own selectivities are fetched again
                real_def_sels = get_selectivities(oracle.get_explain_plan(OracleRequest(query_name=q_n, hintset=DEFAULT_HINTSET, dop=DEFAULT_DOP)))
                real_custom_sels = get_selectivities(oracle.get_explain_plan(OracleRequest(query_name=q_n, hintset=hs, dop=DEFAULT_DOP)))
                # re-checks the condition under which `squeezed` was set
                assert closest_custom_sels != real_custom_sels

                # two records per squeeze: the query's real pair and the representative pair
                SQUEEZED_TRANSITIONS.append((transition_key, (tuple(real_def_sels), tuple(real_custom_sels))))
                SQUEEZED_TRANSITIONS.append((transition_key, (tuple(closest_def_sels), tuple(closest_custom_sels))))

            transitions[transition_key].append((custom_time, def_time))
            transition_to_queries[transition_key].append(q_n)

    return transitions, transition_to_queries

In [35]:
def find_closest_sels(logical_trees_to_sels, target_logical_tree, target_sels):
    """Returns the selectivity vector stored under `target_logical_tree` that has the
    smallest `get_distance` to `target_sels` (the earliest one on ties).

    Returns `None` when nothing is stored under that key or when every stored
    vector is infinitely far away.
    """
    best_sels, best_distance = None, float("inf")

    candidates = logical_trees_to_sels[target_logical_tree]
    for candidate in candidates:
        candidate_distance = get_distance(candidate, target_sels)
        # strict comparison: an infinitely distant candidate is never selected
        if candidate_distance < best_distance:
            best_sels, best_distance = candidate, candidate_distance

    return best_sels

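**Note:** the value of the `DISTANCE_THRESHOLD` parameter is crucial. By changing it we can reproduce both extremes: a threshold of 1 (nothing is merged) gives the full transitions, while an infinite threshold gives the logical transitions.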

In [36]:
# Exclusive upper bound on `get_distance`: selectivity vectors closer than this are merged.
DISTANCE_THRESHOLD = 1.5
# Filled as a side effect of `get_fuzzy_transitions_and_query_map` with
# `(transition_key, (def_sels, custom_sels))` entries.
SQUEEZED_TRANSITIONS = []

In [37]:
def test_fuzzy():
    """Checks the two extreme settings of `DISTANCE_THRESHOLD`:
    an infinite threshold must yield as many fuzzy transitions as there are logical
    transitions, and a threshold of 1 as many as there are full transitions.

    The globals `DISTANCE_THRESHOLD` and `SQUEEZED_TRANSITIONS` are overridden for
    the duration of the check and restored afterwards, even if the check fails.
    """
    global DISTANCE_THRESHOLD
    global SQUEEZED_TRANSITIONS
    saved_threshold, saved_squeezed = DISTANCE_THRESHOLD, SQUEEZED_TRANSITIONS
    # scratch list, so that the checks do not pollute the real record of squeezed transitions
    SQUEEZED_TRANSITIONS = []

    try:
        DISTANCE_THRESHOLD = float("inf")
        _, logical_transition_to_queries = get_transitions_and_query_map(
            oracle=oracle,
            def_plan_extractor=get_logical_tree,
            custom_plan_extractor=get_logical_tree
        )
        _, fuzzy_transition_to_queries = get_fuzzy_transitions_and_query_map(oracle=oracle)
        assert len(fuzzy_transition_to_queries) == len(logical_transition_to_queries)

        # NOTE: in double precision this evaluates to exactly 1.0; the strict `<`
        # comparison then rejects every candidate whose distance is >= 1
        DISTANCE_THRESHOLD = 1.0 + 10 ** (-42)
        _, full_transition_to_queries = get_transitions_and_query_map(
            oracle=oracle,
            def_plan_extractor=get_full_plan,
            custom_plan_extractor=get_full_plan
        )
        _, fuzzy_transition_to_queries = get_fuzzy_transitions_and_query_map(oracle=oracle)
        assert len(fuzzy_transition_to_queries) == len(full_transition_to_queries)
    finally:
        # a failed check must not leave the notebook with a modified threshold
        DISTANCE_THRESHOLD = saved_threshold
        SQUEEZED_TRANSITIONS = saved_squeezed

test_fuzzy()

In [38]:
# start from a clean record: the call below appends to SQUEEZED_TRANSITIONS
SQUEEZED_TRANSITIONS = []
fuzzy_transitions, fuzzy_transition_to_queries = get_fuzzy_transitions_and_query_map(
    oracle=oracle
)

# lets analyze only squeezed
squeezed_transition_to_real_sels_pairs = defaultdict(set)
for (transition, sels) in SQUEEZED_TRANSITIONS:
    squeezed_transition_to_real_sels_pairs[transition].add(sels)

# No filtering by the summary itself; widest speedup spread first.
transition_summary = sorted(
    make_transitions_summary(transitions=fuzzy_transitions, only_interesting=False),
    key=lambda el: max(el[1]) - min(el[1]),
    reverse=True,
)

data = []
for max_degradation, speedups, cust_times, def_times, transition_key in transition_summary:
    # keep only transitions that were actually squeezed ...
    if transition_key not in squeezed_transition_to_real_sels_pairs:
        continue

    # ... and drop those where every run timed out
    if all(c_t >= TIMEOUT for c_t in cust_times):
        continue

    queries = fuzzy_transition_to_queries[transition_key]

    # One query may appear several times in a transition (once per hintset
    # leading to it), so gather all of its measurements first.
    q_to_def_times, q_to_cust_times = defaultdict(list), defaultdict(list)
    for q_n, def_time, cust_time in sorted(zip(queries, def_times, cust_times)):
        q_to_def_times[q_n].append(def_time)
        q_to_cust_times[q_n].append(cust_time)

    query_group = sorted(set(queries))
    def_times, cust_times, boosts = [], [], []
    for q_n in query_group:
        def_time = np.mean(q_to_def_times[q_n])
        # a single timeout marks the whole query as timed out
        if max(q_to_cust_times[q_n]) >= TIMEOUT:
            cust_time = TIMEOUT
        else:
            cust_time = np.mean(q_to_cust_times[q_n])

        def_times.append(round(def_time / 1000, 2))

        if cust_time >= TIMEOUT:
            cust_times.append("`T\\O`")
            boosts.append("`NaN`")
            continue

        cust_times.append(round(cust_time / 1000, 2))
        ratio = -cust_time / def_time if cust_time > def_time else def_time / cust_time
        boosts.append(round(ratio, 2))

    data.append({
        "Query Group": query_group,
        "Default Ex. Time (sec)": def_times,
        "Custom Ex. Time (sec)": cust_times,
        "Boosts": boosts,
    })

pd.DataFrame(data).to_csv(f"{ARTIFACTS_PATH}/fuzzy_transitions.csv", index=False)

**Conclusion.** Even fuzzy transitions are a good enough way to describe queries.