In [1]:
import os
import sys

# Repository root: two directory levels above the current working directory.
ROOT_PATH = os.path.split(os.path.split(os.getcwd())[0])[0]
# Directory of this experiment inside the repository.
EXPERIMENT_PATH = ROOT_PATH + "/experiments/prediction-modes"
# Put the repository root first on the import path so that `src.*` can be imported.
sys.path.insert(0, ROOT_PATH)

In [2]:
from collections import defaultdict
from src.datasets.data_config import DOPS, HINTSETS, DEFAULT_HINTSET
from src.datasets.oracle import Oracle, OracleRequest, TIMEOUT
from src.utils import get_template_id, get_full_plan, get_logical_plan, get_selectivities

In [3]:
# Only the JOB data is loaded here: per the author's note, other benchmarks
# don't have more than 1 query inside one template.
oracle = Oracle(f"{ROOT_PATH}/data/processed/JOB")

In [4]:
# the default value in our settings; it is passed as `dop` to the plan
# extractors and to every OracleRequest (DOP presumably stands for degree
# of parallelism -- TODO confirm)
DEFAULT_DOP = 64
# magnitude of the (negative) boost recorded for a custom run that reached TIMEOUT
TIMEOUT_REL = 2

# Definition of transition

Since the boosts collected during query exploration are actually *an attribute of the transition* 

$$\tau: Plan_{default} \rightarrow \textit{/* hintset applying */} \rightarrow Plan_{custom}$$

it would be logical to consider the resulting plan when making decisions. If suddenly we get some new plan when applying the hintset, our prediction is *no longer reliable* and it is better to **avoid** it.

**Note**: we consider only hintsets that are useful for at least 1 query.

Unfortunately, in the worst case, even the transition on full plans cannot be estimated accurately enough (`8c`, `8d` and `DOP=1`)

# Transition estimations 

In [5]:
def get_transitions_and_query_map(def_plan_extractor, custom_plan_extractor, oracle, dop=None):
    """Group the measured times of every (query, hintset) pair by transition.

    A transition is represented as the string
    `f"{def_plan_extractor(...)}->{custom_plan_extractor(...)}"`, i.e. the
    extracted default plan and the extracted plan under a hintset joined by `->`.

    Args:
        def_plan_extractor: callable invoked with the keyword arguments
            `query_name`, `oracle`, `hintset` and `dop`; applied to the default hintset.
        custom_plan_extractor: callable with the same keyword interface; applied
            to every hintset from `HINTSETS`.
        oracle: object providing `get_query_names()` and `get_execution_time(request)`.
        dop: value forwarded as `dop` to the extractors and the oracle requests.
            When `None` (default) the module-level `DEFAULT_DOP` is read at call
            time, which keeps the previous behaviour.

    Returns:
        Two `defaultdict(list)` objects with index-aligned values:
            a) transition -> list of `(custom_time, default_time)` tuples, and
            b) transition -> list of query names that have that transition with
               some hintset (a name repeats once per matching hintset).
    """
    if dop is None:
        # resolved lazily so that callers that still reassign the global keep working
        dop = DEFAULT_DOP

    transitions = defaultdict(list)
    transition_to_queries = defaultdict(list)

    for q_n in oracle.get_query_names():
        # the default plan and time are shared by all hintsets of the query
        def_plan = def_plan_extractor(query_name=q_n, oracle=oracle, hintset=DEFAULT_HINTSET, dop=dop)
        def_time = oracle.get_execution_time(OracleRequest(query_name=q_n, hintset=DEFAULT_HINTSET, dop=dop))

        for hs in HINTSETS:
            custom_plan = custom_plan_extractor(query_name=q_n, oracle=oracle, hintset=hs, dop=dop)
            custom_time = oracle.get_execution_time(OracleRequest(query_name=q_n, hintset=hs, dop=dop))
            transition_key = f"{def_plan}->{custom_plan}"
            transitions[transition_key].append((custom_time, def_time))
            transition_to_queries[transition_key].append(q_n)

    return transitions, transition_to_queries

In [6]:
def make_transitions_summary(transitions, only_interesting=True):
    """Summarise the timing measurements collected for every transition.

    Args:
        transitions: mapping transition -> list of `(custom_time, default_time)`
            pairs (in ms).
        only_interesting: when True, keep only transitions that have sufficiently
            diverse performance (e.g. T/O for one query and significant boost for
            another); when False, every transition with measurements is kept.

    Returns:
        A list with one tuple per kept transition:
        `(max_degradation, speedups, custom_times, def_times, transition)` where
        - max_degradation is the smallest boost observed for the transition,
        - speedups are the `default_time - custom_time` differences (in ms),
        - custom_times and def_times are the raw times (in ms),
        - transition is the key taken from `transitions`.

    Boost convention: `default / custom` when the custom run is not slower,
    `-custom / default` when it is slower, and `-TIMEOUT_REL` when the custom
    run reached `TIMEOUT`.
    """
    transition_summaries = []
    for transition, transition_info in transitions.items():
        if not transition_info:
            # no measurements: min()/max() below would fail and there is nothing to report
            continue

        boosts, speedups, custom_times, def_times = [], [], [], []

        for cust_time, def_time in transition_info:
            custom_times.append(cust_time)
            def_times.append(def_time)

            speedups.append(def_time - cust_time)

            if cust_time >= TIMEOUT:
                boosts.append(-TIMEOUT_REL)
            elif cust_time > def_time:
                boosts.append(-cust_time / def_time)
            else:
                boosts.append(def_time / cust_time)

        if only_interesting:
            are_all_queries_small = (max(def_times) < 1000) and (max(custom_times) < 1000)
            are_all_queries_boosted = (min(boosts) >= -1.0)
            are_all_queries_deboosted = (max(boosts) <= 1.0)
            if are_all_queries_small or are_all_queries_boosted or are_all_queries_deboosted:
                continue

            # Timeouts are detected from the raw times rather than by looking for the
            # `-TIMEOUT_REL` sentinel among the boosts: a genuine slowdown of exactly
            # TIMEOUT_REL times produces the same boost value without being a timeout.
            has_timeout = any(c_t >= TIMEOUT for c_t in custom_times)
            max_slowdown = max(c_t - d_t for c_t, d_t in zip(custom_times, def_times))
            is_there_big_deboost = has_timeout or max_slowdown > 500
            if not is_there_big_deboost:
                continue

        summary = (min(boosts), speedups, custom_times, def_times, transition)
        transition_summaries.append(summary)

    return transition_summaries

In [7]:
def pretty_print_summary(query_group, def_times, cust_times):
    """Print one markdown table row describing the queries of a transition.

    The three arguments are index-aligned: the i-th query name goes with the
    i-th default time and the i-th custom time (in ms). Times of a query that
    occurs several times are averaged; if any of its custom times equals
    TIMEOUT the query is reported as timed out (`T\\O`, boost `NaN`).
    Queries are listed in sorted order and times are printed in seconds.
    """
    import numpy as np

    default_by_query = defaultdict(list)
    custom_by_query = defaultdict(list)
    for name, t_default, t_custom in sorted(zip(query_group, def_times, cust_times)):
        default_by_query[name].append(t_default)
        custom_by_query[name].append(t_custom)

    name_cells, default_cells, custom_cells, boost_cells = [], [], [], []
    for name in sorted(set(query_group)):
        mean_default = np.mean(default_by_query[name])
        observed_custom = custom_by_query[name]
        if TIMEOUT in observed_custom:
            mean_custom = TIMEOUT
        else:
            mean_custom = np.mean(observed_custom)

        name_cells.append(f"{name}")
        default_cells.append(f"{mean_default / 1000:0.2f}s")

        if mean_custom >= TIMEOUT:
            # a timed out run has neither a meaningful time nor a meaningful boost
            custom_cells.append("`T\\O`")
            boost_cells.append("`NaN`")
            continue

        custom_cells.append(f"{mean_custom / 1000:0.2f}s")
        if mean_custom > mean_default:
            # slowdowns are shown as a negative ratio
            boost_cells.append(f"{-mean_custom / mean_default:0.2f}")
        else:
            boost_cells.append(f"{mean_default / mean_custom:0.2f}")

    group_label = "{" + ", ".join(name_cells) + "}"
    print(f"| {group_label} |{', '.join(default_cells)} | {', '.join(custom_cells)}  | {', '.join(boost_cells)} |")

## Logical

In [8]:
# Logical transitions: both the default and the custom plan are reduced to
# what `get_logical_plan` extracts.
logical_transitions, logical_transition_to_queries = get_transitions_and_query_map(
    def_plan_extractor=get_logical_plan,
    custom_plan_extractor=get_logical_plan,
    oracle=oracle,
)

# Keep only the "interesting" transitions and rank them by the spread of their
# absolute speedups, largest spread first.
transition_summary = make_transitions_summary(transitions=logical_transitions)
transition_summary.sort(key=lambda el: max(el[1]) - min(el[1]), reverse=True)

# Show the 7 transitions with the largest spread.
for max_degradation, speedups, cust_times, def_times, transition_key in transition_summary[:7]:
    query_group = logical_transition_to_queries[transition_key]
    speedup_variation = (max(speedups) - min(speedups)) / 1000
    print(f"TRANSITION WITH {set(query_group)} AND SPEEDUP VARIATION {speedup_variation:0.1f}s")
    pretty_print_summary(query_group, def_times, cust_times)
    print("-" * 60)

TRANSITION WITH {'6c', '6b'} AND SPEEDUP VARIATION 4398046520.9s
| {6b, 6c} |14.53s, 0.05s | 4.90s, `T\O`  | 2.97, `NaN` |
------------------------------------------------------------
TRANSITION WITH {'6e', '6d'} AND SPEEDUP VARIATION 4398046520.4s
| {6d, 6e} |14.36s, 0.05s | 5.02s, `T\O`  | 2.86, `NaN` |
------------------------------------------------------------
TRANSITION WITH {'6c', '6b'} AND SPEEDUP VARIATION 4398046515.6s
| {6b, 6c} |14.53s, 0.05s | 9.99s, `T\O`  | 1.45, `NaN` |
------------------------------------------------------------
TRANSITION WITH {'6c', '6b'} AND SPEEDUP VARIATION 4398046513.2s
| {6b, 6c} |14.53s, 0.05s | 12.40s, `T\O`  | 1.17, `NaN` |
------------------------------------------------------------
TRANSITION WITH {'6e', '6d'} AND SPEEDUP VARIATION 4398046512.8s
| {6d, 6e} |14.36s, 0.05s | 12.65s, `T\O`  | 1.14, `NaN` |
------------------------------------------------------------
TRANSITION WITH {'6c', '6b'} AND SPEEDUP VARIATION 4398046512.4s
| {6b, 6c} |1

| Query Group| Default Time| Custom Time       | Relative Boost|
|------------|-------------|-------------------|---------------|
| {6b, 6c} |14.53s, 0.05s | 4.90s, `T\O`  | 2.97, `NaN` |
| {6d, 6e} |14.36s, 0.05s | 5.02s, `T\O`  | 2.86, `NaN` |
| {17b, 17c, 17d, 17f} |10.47s, 10.08s, 9.98s, 15.92s | 12.73s, 12.30s, 12.47s, 13.75s  | -1.22, -1.22, -1.25, **1.16** |

We see that if we also take into account the structure of the custom plan, the queries of group `{6a 6b 6c 6d 6e}` already become **distinguishable**. However, logical plans without statistics are still insufficient to guarantee the absence of regressions, i.e. the logical transition **doesn't** help us to address the degradation problem.

## Full

In [9]:
# `get_transitions_and_query_map` reads the module-level DEFAULT_DOP, so it is
# switched to 1 only for the duration of this cell. The previous value is put
# back in `finally`, so a failure midway cannot leave the rest of the notebook
# silently running with DOP=1.
_previous_default_dop = DEFAULT_DOP
DEFAULT_DOP = 1
try:
    full_transitions, full_transition_to_queries = get_transitions_and_query_map(
        oracle=oracle,
        def_plan_extractor=get_full_plan,
        custom_plan_extractor=get_full_plan
    )

    # only_interesting=False: every transition is ranked by the spread of its speedups
    transition_summary = make_transitions_summary(transitions=full_transitions, only_interesting=False)
    transition_summary = sorted(transition_summary, key=lambda el: max(el[1]) - min(el[1]), reverse=True)
    for max_degradation, speedups, cust_times, def_times, transition_key in transition_summary[:5]:
        query_group = full_transition_to_queries[transition_key]
        print(f"TRANSITION WITH {set(query_group)} AND SPEEDUP VARIATION {(max(speedups) - min(speedups))/1000:0.1f}s")
        pretty_print_summary(query_group, def_times, cust_times)
        print("-" * 60)
finally:
    DEFAULT_DOP = _previous_default_dop

TRANSITION WITH {'6c', '6a', '6e'} AND SPEEDUP VARIATION 4398046511.0s
| {6a, 6c, 6e} |0.05s, 0.06s, 0.05s | `T\O`, 0.12s, `T\O`  | `NaN`, -2.20, `NaN` |
------------------------------------------------------------
TRANSITION WITH {'6c', '6a', '6e'} AND SPEEDUP VARIATION 4398046511.0s
| {6a, 6c, 6e} |0.05s, 0.06s, 0.05s | `T\O`, 0.12s, `T\O`  | `NaN`, -2.21, `NaN` |
------------------------------------------------------------
TRANSITION WITH {'8d', '8c'} AND SPEEDUP VARIATION 4398046497.7s
| {8c, 8d} |14.09s, 8.58s | `T\O`, 7.85s  | `NaN`, 1.09 |
------------------------------------------------------------
TRANSITION WITH {'8d', '8c'} AND SPEEDUP VARIATION 4398046497.5s
| {8c, 8d} |14.09s, 8.58s | `T\O`, 8.13s  | `NaN`, 1.05 |
------------------------------------------------------------
TRANSITION WITH {'8d', '8c'} AND SPEEDUP VARIATION 4398046497.4s
| {8c, 8d} |14.09s, 8.58s | `T\O`, 8.16s  | `NaN`, 1.05 |
------------------------------------------------------------


For `full plan - full plan` transitions **almost all degradation problems are solved**. There is only one extreme case with a pair of queries `{8c, 8d}` at `DOP` 1 and 16. These queries are specific because they have exactly the same plans with all hintsets, that is, the planner *sees no difference in them*. 

However, the situation is not as bad as it could be - this is not a case where one hintset is very good for one query and at the same time super bad for another, because (as the output above shows) we only have a `~9%` speedup for `8d` and `T/O` for `8c`. It means that even by refusing hint advising in such a situation, we will not lose a significant boost.

## Fuzzy

There is a desire to reuse the information about `full plan-to-full plan` transitions in situations where a full match has not occurred (in practice, this is unlikely to happen as often as we would like). In this case, a natural solution is to use the ability to measure the distance between plans with the same logical tree. In this case, the experience with the transition will be reused if a) the logical trees match and b) the estimated statistics are close enough to the corresponding transition statistics.

We will call a transition that is matched with a fuzzy transition a *squeezed* transition.

In [10]:
def get_distance(sels1, sels2):
    """Calculates the maximum among the ratio values of all corresponding coordinates.

    For every pair of corresponding coordinates the larger of `a / b` and `b / a`
    is taken, and the maximum over all pairs is returned, so identical inputs
    have distance 1.

    Returns `float("inf")` when the inputs cannot be compared: different lengths,
    a missing (`None`) or non-numeric input, a zero coordinate, or empty sequences.
    """
    try:
        # explicit check instead of `assert`: assertions are removed under `python -O`,
        # after which `zip` would silently truncate inputs of different lengths
        if len(sels1) != len(sels2):
            return float("inf")
        return max(max(sel1 / sel2, sel2 / sel1) for sel1, sel2 in zip(sels1, sels2))
    except (TypeError, ValueError, ArithmeticError):
        # TypeError: None / non-numeric input; ValueError: max() of an empty sequence;
        # ArithmeticError: division by a zero coordinate (or overflow)
        return float("inf")


# sanity checks: regular ratios, identical inputs, length mismatch and a missing input
sels1 = [(1, 12), (42, 666), (1, ), (1, 2), (1, 2, 3)]
sels2 = [(2,  3), (42, 666), (2, ), (1, ) , None]
expected_distances = [4, 1, 2, float("inf"), float("inf")]

for sel1, sel2, expected_distance in zip(sels1, sels2, expected_distances):
    assert get_distance(sel1, sel2) == expected_distance

In [11]:
def get_fuzzy_transitions_and_query_map(oracle):
    """For each query in oracle constructs 2 mappings as in `get_transitions_and_query_map`.
    The only difference is that instead of creating a separate item for each transition
    'default plan -> custom plan' it tries to squeeze items with the same logical transition
    'default logical plan -> custom logical plan' and close enough selectivities.

    Selectivities are "close enough" when `get_distance` between them is strictly
    below the module-level `DISTANCE_THRESHOLD`; in that case the selectivities seen
    first (the representative) are used in the transition key instead of the query's own.

    Returns `(transitions, transition_to_queries)`, both `defaultdict(list)`:
    transition key -> list of `(custom_time, default_time)` and
    transition key -> list of query names.

    P.S. it has a side effect -- squeezed transitions are written to the module-level
    'SQUEEZED_TRANSITIONS' list. For every squeezed (query, hintset) pair two entries
    `(transition_key, (default_sels, custom_sels))` are appended: one with the query's
    own selectivities and one with the representative selectivities.

    Reads the globals `DEFAULT_DOP`, `DISTANCE_THRESHOLD` and `SQUEEZED_TRANSITIONS`
    at call time, so they must be defined before the function is called.
    """
    transitions = defaultdict(list)
    transition_to_queries = defaultdict(list)

    # default logical tree -> representative default selectivities seen so far
    def_logical_trees_to_def_sels = defaultdict(list)
    # "default tree|default sels->custom tree" -> representative custom selectivities seen so far
    semi_transition_to_sels = defaultdict(list)

    for q_n in oracle.get_query_names():
        def_logical_tree = get_logical_plan(query_name=q_n, oracle=oracle, hintset=DEFAULT_HINTSET, dop=DEFAULT_DOP)
        def_time = oracle.get_execution_time(OracleRequest(query_name=q_n, hintset=DEFAULT_HINTSET, dop=DEFAULT_DOP))
        def_sels = get_selectivities(query_name=q_n, oracle=oracle, hintset=DEFAULT_HINTSET, dop=DEFAULT_DOP)

        # Either adopt the closest representative for this default tree (when it is
        # within the threshold) or register the query's own selectivities as a new one.
        # `closest_def_sels` is None when nothing is stored yet; `get_distance` then
        # returns infinity, so nothing is reused.
        closest_def_sels = find_closest_sels(def_logical_trees_to_def_sels, def_logical_tree, def_sels)
        can_reuse_def_sels = (get_distance(def_sels, closest_def_sels) < DISTANCE_THRESHOLD)
        if can_reuse_def_sels:
            def_sels = closest_def_sels
        else:
            def_logical_trees_to_def_sels[def_logical_tree].append(def_sels)

        for hs in HINTSETS:
            custom_logical_tree = get_logical_plan(query_name=q_n, oracle=oracle, hintset=hs, dop=DEFAULT_DOP)
            custom_sels = get_selectivities(query_name=q_n, oracle=oracle, hintset=hs, dop=DEFAULT_DOP)
            custom_time = oracle.get_execution_time(OracleRequest(query_name=q_n, hintset=hs, dop=DEFAULT_DOP))
            # disabled alternative: clip timed out runs to a multiple of the default time
            #if custom_time >= TIMEOUT:
            #    custom_time = TIMEOUT_REL * def_time

            # key without the custom selectivities; `def_sels` may already be the representative
            semi_transition_key = f"{def_logical_tree}|{def_sels}->{custom_logical_tree}"

            squeezed = False
            closest_custom_sels = find_closest_sels(semi_transition_to_sels, semi_transition_key, custom_sels)

            can_reuse_custom_sels = (get_distance(custom_sels, closest_custom_sels) < DISTANCE_THRESHOLD)
            if can_reuse_custom_sels:
                # The pair counts as squeezed only when the default selectivities were
                # reusable too and the adopted custom selectivities differ from the own ones.
                # NOTE(review): a query whose default selectivities were replaced by a
                # different representative but whose custom selectivities are identical
                # is not recorded as squeezed -- confirm this is intended.
                if (can_reuse_def_sels and closest_custom_sels is not None and closest_custom_sels != custom_sels):
                    squeezed = True
                custom_sels = closest_custom_sels
            else:
                semi_transition_to_sels[semi_transition_key].append(custom_sels)

            transition_key = f"{def_logical_tree}|{def_sels}->{custom_logical_tree}|{custom_sels}"

            if squeezed:
                # `def_sels` / `custom_sels` may have been overwritten by representatives
                # above, so the query's own selectivities are fetched again
                real_def_sels = get_selectivities(query_name=q_n, oracle=oracle, hintset=DEFAULT_HINTSET, dop=DEFAULT_DOP)
                real_custom_sels = get_selectivities(query_name=q_n, oracle=oracle, hintset=hs, dop=DEFAULT_DOP)
                assert closest_custom_sels != real_custom_sels

                # record both the query's own selectivities and the representative ones
                SQUEEZED_TRANSITIONS.append((transition_key, (tuple(real_def_sels), tuple(real_custom_sels))))
                SQUEEZED_TRANSITIONS.append((transition_key, (tuple(closest_def_sels), tuple(closest_custom_sels))))

            transitions[transition_key].append((custom_time, def_time))
            transition_to_queries[transition_key].append(q_n)

    return transitions, transition_to_queries

In [12]:
def find_closest_sels(logical_trees_to_sels, target_logical_tree, target_sels):
    """Return the stored selectivities that are closest to `target_sels`.

    Args:
        logical_trees_to_sels: mapping key -> list of selectivities stored for that key.
        target_logical_tree: key whose stored selectivities are searched.
        target_sels: selectivities to compare against, using `get_distance`.

    Returns:
        The stored selectivities with the smallest finite distance to `target_sels`
        (the first one wins on ties), or `None` when the key has no stored
        selectivities or none of them is at a finite distance.

    The mapping is never modified: a missing key is treated as "nothing stored".
    """
    min_distance = float("inf")
    argmin_sels = None

    # `.get` instead of indexing: indexing a defaultdict would insert an empty list for
    # every missing key as a side effect of a pure lookup, and a plain dict would raise
    for sels in logical_trees_to_sels.get(target_logical_tree, ()):
        distance = get_distance(sels, target_sels)
        if distance < min_distance:
            min_distance, argmin_sels = distance, sels

    return argmin_sels

In [13]:
# the value of that parameter is super important;
# we can achieve both full and logical transitions by changing it
# (selectivities are reused only when their distance is strictly below it)
DISTANCE_THRESHOLD = 1.5

In [14]:
# filled as a side effect of `get_fuzzy_transitions_and_query_map` with
# (transition_key, (default_sels, custom_sels)) entries for squeezed transitions
SQUEEZED_TRANSITIONS = []

In [15]:
def test_fuzzy():
    """Check the two extremes of `DISTANCE_THRESHOLD` for fuzzy transitions.

    - With an infinite threshold the number of fuzzy transitions must equal the
      number of logical transitions.
    - With a threshold just above 1 (only identical selectivities are reused) it
      must equal the number of full-plan transitions.

    The globals `DISTANCE_THRESHOLD` and `SQUEEZED_TRANSITIONS` are temporarily
    replaced and always restored, even when an assertion fails.
    """
    global DISTANCE_THRESHOLD
    global SQUEEZED_TRANSITIONS
    saved_threshold, saved_squeezed = DISTANCE_THRESHOLD, SQUEEZED_TRANSITIONS
    SQUEEZED_TRANSITIONS = []

    try:
        DISTANCE_THRESHOLD = float("inf")
        logical_transitions, logical_transition_to_queries = get_transitions_and_query_map(
            oracle=oracle,
            def_plan_extractor=get_logical_plan,
            custom_plan_extractor=get_logical_plan
        )
        fuzzy_transitions, fuzzy_transition_to_queries = get_fuzzy_transitions_and_query_map(oracle=oracle)
        assert len(fuzzy_transition_to_queries) == len(logical_transition_to_queries)

        # Smallest float above 1.0. The former `1.0 + 10 ** (-42)` is exactly 1.0 in
        # floating point, so the strict `distance < threshold` comparison could never
        # hold and the reuse path was not exercised at all.
        DISTANCE_THRESHOLD = 1.0 + sys.float_info.epsilon
        full_transitions, full_transition_to_queries = get_transitions_and_query_map(
            oracle=oracle,
            def_plan_extractor=get_full_plan,
            custom_plan_extractor=get_full_plan
        )
        fuzzy_transitions, fuzzy_transition_to_queries = get_fuzzy_transitions_and_query_map(oracle=oracle)
        assert len(fuzzy_transition_to_queries) == len(full_transition_to_queries)
    finally:
        # without this a failed assertion would leave the notebook with a test threshold
        DISTANCE_THRESHOLD = saved_threshold
        SQUEEZED_TRANSITIONS = saved_squeezed


test_fuzzy()

In [16]:
# threshold used for the squeezed-transition analysis below
# (same value as the earlier definition of DISTANCE_THRESHOLD)
DISTANCE_THRESHOLD = 1.5

In [20]:
# Start from a known state: default DOP and an empty log of squeezed transitions
# (the log is filled as a side effect of the call below).
DEFAULT_DOP = 64
SQUEEZED_TRANSITIONS = []
fuzzy_transitions, fuzzy_transition_to_queries = get_fuzzy_transitions_and_query_map(oracle=oracle)

# lets analyze only squeezed: transition key -> set of selectivity pairs recorded for it
squeezed_transition_to_real_sels_pairs = defaultdict(set)
for transition, sels in SQUEEZED_TRANSITIONS:
    squeezed_transition_to_real_sels_pairs[transition].add(sels)

# Rank all transitions by the spread of their absolute speedups, largest first.
transition_summary = make_transitions_summary(transitions=fuzzy_transitions, only_interesting=False)
transition_summary.sort(key=lambda el: max(el[1]) - min(el[1]), reverse=True)

for max_degradation, speedups, cust_times, def_times, transition_key in transition_summary:
    # skip transitions that were not squeezed and those where every custom run timed out
    is_squeezed = transition_key in squeezed_transition_to_real_sels_pairs
    if not is_squeezed or all(c_t >= TIMEOUT for c_t in cust_times):
        continue

    query_group = fuzzy_transition_to_queries[transition_key]
    speedup_variation = (max(speedups) - min(speedups)) / 1000
    print(f"TRANSITION WITH {set(query_group)} AND SPEEDUP VARIATION {speedup_variation:0.1f}s")
    pretty_print_summary(query_group, def_times, cust_times)
    print("-" * 60)

TRANSITION WITH {'16c', '16d'} AND SPEEDUP VARIATION 0.7s
| {16c, 16d} |5.38s, 4.17s | 8.04s, 6.08s  | -1.49, -1.46 |
------------------------------------------------------------
TRANSITION WITH {'16c', '16d'} AND SPEEDUP VARIATION 0.6s
| {16c, 16d} |5.38s, 4.17s | 7.97s, 6.12s  | -1.48, -1.47 |
------------------------------------------------------------
TRANSITION WITH {'16c', '16d'} AND SPEEDUP VARIATION 0.6s
| {16c, 16d} |5.38s, 4.17s | 7.78s, 5.96s  | -1.44, -1.43 |
------------------------------------------------------------
TRANSITION WITH {'16c', '16d'} AND SPEEDUP VARIATION 0.6s
| {16c, 16d} |5.38s, 4.17s | 7.75s, 5.96s  | -1.44, -1.43 |
------------------------------------------------------------
TRANSITION WITH {'16c', '16d'} AND SPEEDUP VARIATION 0.6s
| {16c, 16d} |5.38s, 4.17s | 7.78s, 6.00s  | -1.45, -1.44 |
------------------------------------------------------------
TRANSITION WITH {'16c', '16d'} AND SPEEDUP VARIATION 0.5s
| {16c, 16d} |5.38s, 4.17s | 7.72s, 5.99s  | -1

### Table with times for fuzzied transitions with biggest custom time diff (`distance`=1.5) 

| Query Group| Default Time| Custom Time       | Relative Boost |
|------------|-------------|-------------------|----------------|
| {16c, 16d} | 5.382s, 4.166s | 8.041s, 6.076s  | -1.49, -1.46 |
| {1a, 1c} | 1.885s, 1.442s | 0.698s, 0.671s  | 2.70, 2.15 |
| {21a, 21b} | 0.129s, 0.100s | 1.121s, 1.031s  | -8.69, -10.31 |

It can be seen that with careful selection of the `DISTANCE_THRESHOLD` we can squeeze transitions relatively safely, so that all queries from them behave similarly. It should be noted that, depending on the selection of the `DISTANCE_THRESHOLD`, the compression can turn into logical transitions or full transitions, i.e., the problem of regressions is not finally solved.